#!/usr/bin/env python3
import argparse
import datetime
import os.path
import random
import time
import urllib.parse

import Agunua
import feedparser

# monkey-patch Gemini support in urllib.parse
# see https://github.com/python/cpython/blob/master/Lib/urllib/parse.py
urllib.parse.uses_relative.append("gemini")
urllib.parse.uses_netloc.append("gemini")
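
# feeds.txt format (an illustrative sketch, inferred from load_feed_urls() below;
# the URL is a placeholder, not taken from the real CAPCOM feed list): one feed
# URL per line, with blank lines and "#" comment lines ignored, e.g.
#
#   # my favourite gemlog
#   gemini://example.org/atom.xml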

def load_feed_urls(filename="feeds.txt"):
    feeds = []
    with open(filename, "r") as fp:
        for line in fp:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            feeds.append(line)
    return feeds

def update_feed_urls(perm_redirects, filename="feeds.txt"):
    if not perm_redirects:
        return
    with open(filename, "r") as fp:
        original_lines = fp.readlines()
    newlines = []
    for line in original_lines:
        if line.strip() in perm_redirects:
            url = line.strip()
            # Follow chains of permanent redirects to their final target
            while True:
                url = perm_redirects[url]
                if url not in perm_redirects:
                    break
            newlines.append(url+"\n")
        else:
            newlines.append(line)
    with open(filename, "w") as fp:
        for line in newlines:
            fp.write(line)

def items_from_feed_string(feed_str):
    items = []
    feed_obj = feedparser.parse(feed_str)
    feed = feed_obj.feed
    for entry in feed_obj.entries:
        # Only add gemini:// URLs
        if not entry.link.startswith("gemini://"):
            for link in entry.links:
                if link.rel == "alternate" and link.href.startswith("gemini://"):
                    entry.link = link.href
                    break
            else:
                continue
        # Use published timestamp instead of updated timestamp, if it exists.
        # Skip entries with no timestamp at all.
        timestamp = entry.get("updated_parsed", None)
        timestamp = entry.get("published_parsed", timestamp)
        if timestamp:
            timestamp = datetime.datetime.fromtimestamp(time.mktime(timestamp))
            items.append((timestamp, entry.link, entry.title, feed.title))
    return items
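
# Gemsub ("Gemini subscription") feeds are plain Gemtext pages. A sketch of the
# input items_from_gemsub_string() below expects (illustrative lines, not from
# any real capsule): a "# " heading giving the feed title, then link lines whose
# label starts with an ISO date, e.g.
#
#   # Example Gemlog
#   => post1.gmi 2021-03-14 - My first post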

def items_from_gemsub_string(feed_url, gemsub_str):
    items = []
    feed_title = ""
    for line in gemsub_str.splitlines():
        if not feed_title and (line.startswith("# ") or line.startswith("#\t")):
            feed_title = line[2:].strip()
        elif line.startswith("=>"):
            bits = line[2:].split(maxsplit=1)
            if len(bits) == 1:
                continue
            entry_link, descr = bits
            entry_link = urllib.parse.urljoin(feed_url, entry_link)
            descr = descr.strip()
            try:
                timestamp = datetime.datetime.fromisoformat(descr[0:10])
            except ValueError:
                # Label doesn't start with an ISO date, so it's not a feed entry
                continue
            entry_title = descr[10:].strip()
            while entry_title and entry_title[0] in ("-", ":"):
                entry_title = entry_title[1:].strip()
            items.append((timestamp, entry_link, entry_title, feed_title))
    return items
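
# format_aggregated() writes the Gemtext index: a header, a feed count, entries
# grouped under one "## YYYY-MM-DD" heading per day, and an optional footer.
# Roughly (illustrative output, pieced together from the format strings below):
#
#   ## 2021-03-14
#
#   => gemini://example.org/post1.gmi Example Gemlog - My first post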

def format_aggregated(items, filename, n_feeds):
    with open(filename, "w") as fp:
        # Add header
        if os.path.exists("header.gmi"):
            with open("header.gmi", "r") as fp2:
                fp.write(fp2.read())
        else:
            fp.write("# CAPCOM Gemini feed aggregator\n\n")
        # Feed count
        fp.write("Aggregating {} Atom feeds from Geminispace.\n".format(n_feeds))
        # List feed entries
        current_day = (0, 0)
        for updated, link, entry_title, feed_title in items:
            timetup = updated.timetuple()
            item_day = (timetup.tm_year, timetup.tm_yday)
            if item_day != current_day:
                current_day = item_day
                fp.write("\n## " + updated.strftime("%Y-%m-%d") + "\n\n")
            fp.write("=> {} {} - {}\n".format(link, feed_title, entry_title))
        fp.write("\n")
        # Add footer
        if os.path.exists("footer.gmi"):
            with open("footer.gmi", "r") as fp2:
                fp.write(fp2.read())
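
# Gemini status codes checked in aggregate() below: "20" is success, "30" a
# temporary redirect and "31" a permanent redirect (per the Gemini protocol
# specification); this script compares them as two-character strings on the
# Agunua response object.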

def aggregate(feed_file="feeds.txt", output_file="index.gmi", max_posts=64, max_posts_per_feed=0, max_days=0):
    # Load feed URLs to query
    feed_urls = load_feed_urls(feed_file)
    N = len(feed_urls)
    # Prepare to extract feed items
    last_accessed = {}
    perm_redirects = {}
    skips = 0
    items = []
    while feed_urls:
        # Get a feed URL to fetch
        feed_url = feed_urls.pop()
        # Don't hammer servers
        netloc = urllib.parse.urlsplit(feed_url).netloc
        last = last_accessed.get(netloc, 0)
        now = time.time()
        interval = int(now - last)
        if interval < 5:
            print("Declining to hit {} again after only {} seconds".format(netloc, interval))
            feed_urls.insert(0, feed_url)
            skips += 1
            if skips == len(feed_urls):
                # We've hammered every server in the queue! Sleep a bit...
                print("Sleeping to give all servers a rest!")
                time.sleep(5)
            continue
        skips = 0
        # Good to go
        print("Fetching ", feed_url)
        redirect_count = 0
        try:
            # Redirect following loop
            while True:
                resp = u = Agunua.GeminiUri(feed_url, insecure=True, get_content=True)
                if u.status_code not in ("30", "31"):
                    break
                old_url = feed_url
                feed_url = urllib.parse.urljoin(old_url, u.meta)
                if u.status_code == "31":
                    print("Recording permanent redirect from {} to {}".format(old_url, feed_url))
                    perm_redirects[old_url] = feed_url
                redirect_count += 1
                if redirect_count > 3:
                    break
            # Parse Atom or Gemsub
            if resp.status_code == "20":
                last_accessed[netloc] = time.time()
                if resp.mediatype == "text/gemini":
                    feed_items = items_from_gemsub_string(feed_url, resp.payload)
                else:
                    feed_items = items_from_feed_string(resp.payload)
                if max_posts_per_feed and len(feed_items) > max_posts_per_feed:
                    feed_items.sort(reverse=True)
                    feed_items = feed_items[0:max_posts_per_feed]
                items.extend(feed_items)
        except Exception as e:
            print("Error on {}, skipping...".format(feed_url))
            continue
    # Discard posts which are too old
    if max_days:
        now = datetime.datetime.now()
        items = [i for i in items if (now - i[0]).days <= max_days]
    # Remove duplicate items
    ## Do this randomly, maybe not ideal?
    random.shuffle(items)
    seen_urls = []
    new_items = []
    for item in items:
        if item[1] not in seen_urls:
            seen_urls.append(item[1])
            new_items.append(item)
    items = new_items
    # Keep only the max_posts most recent items
    items.sort(reverse=True)
    items = items[0:max_posts]
    # Format output
    format_aggregated(items, output_file, N)
    # Update feed file with permanent redirects
    update_feed_urls(perm_redirects, feed_file)

def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='Aggregate Atom feeds to a Gemini doc.')
    parser.add_argument('-a', '--age', dest='max_age', type=str,
                        default="", help='maximum age of posts in output')
    parser.add_argument('-f', '--feeds', dest='feed_file', type=str,
                        default="feeds.txt", help="file to read feed URLs from")
    parser.add_argument('-o', '--output', dest='output', type=str,
                        default="index.gmi", help='output filename')
    parser.add_argument('-p', '--posts', dest='max_posts', type=int,
                        default=64, help='maximum number of posts in output')
    parser.add_argument('-P', '--perfeed', dest='max_posts_per_feed', type=int,
                        default=0, help='maximum number of posts per feed in output')
    args = parser.parse_args()
    # Parse max age, e.g. "30d", "4w", "6m" or "1y"
    max_days = 0
    if args.max_age:
        try:
            unit = args.max_age[-1].lower()
            value = int(args.max_age[0:-1])
            if unit == "d":
                max_days = value
            elif unit == "w":
                max_days = 7*value
            elif unit == "m":
                max_days = 30*value
            elif unit == "y":
                max_days = 365*value
            else:
                raise ValueError
        except ValueError:
            print("Ignoring invalid maximum age.")
    # Aggregate feeds
    aggregate(args.feed_file, args.output, args.max_posts, args.max_posts_per_feed, max_days)

if __name__ == "__main__":
    main()
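
# Example invocation (a sketch; assumes a feeds.txt in the working directory,
# and the flag values shown are arbitrary):
#
#   ./capcom.py --age 30d --posts 64 --perfeed 5 --output index.gmi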