#!/usr/bin/env python3
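"""CAPCOM Gemini feed aggregator.

Fetches a list of Atom and Gemsub feeds over the Gemini protocol and writes
their most recent entries to a single text/gemini document.

Illustrative invocation ("capcom.py" is a placeholder for whatever this file
is saved as; the options are defined in main() below):

    python3 capcom.py --feeds feeds.txt --output index.gmi --age 30d
"""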

import argparse
import datetime
import os.path
import random
import time
import urllib.parse

import Agunua
import feedparser

# monkey-patch Gemini support in urllib.parse
# see https://github.com/python/cpython/blob/master/Lib/urllib/parse.py
urllib.parse.uses_relative.append("gemini")
urllib.parse.uses_netloc.append("gemini")


def load_feed_urls(filename="feeds.txt"):
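    """Read feed URLs from `filename`, one URL per line.

    Blank lines and lines starting with "#" are skipped.
    """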
    feeds = []
    with open(filename, "r") as fp:
        for line in fp:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            feeds.append(line)
    return feeds


def update_feed_urls(perm_redirects, filename="feeds.txt"):
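    """Rewrite `filename` so that any feed URL which received a permanent
    redirect is replaced by the final URL of its redirect chain in
    `perm_redirects` (a mapping of old URL to new URL)."""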
    if not perm_redirects:
        return
    with open(filename, "r") as fp:
        original_lines = fp.readlines()
    newlines = []
    for line in original_lines:
        if line.strip() in perm_redirects:
            url = line.strip()
            while True:
                url = perm_redirects[url]
                if url not in perm_redirects:
                    break
            newlines.append(url + "\n")
        else:
            newlines.append(line)
    with open(filename, "w") as fp:
        for line in newlines:
            fp.write(line)


def items_from_feed_string(feed_str):
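    """Parse an Atom/RSS feed with feedparser and return a list of
    (timestamp, entry link, entry title, feed title) tuples for entries
    which have a gemini:// link and a timestamp."""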
    items = []
    feed_obj = feedparser.parse(feed_str)
    feed = feed_obj.feed
    for entry in feed_obj.entries:
        # Only add gemini:// URLs
        if not entry.link.startswith("gemini://"):
            for link in entry.links:
                if link.rel == "alternate" and link.href.startswith("gemini://"):
                    entry.link = link.href
                    break
            else:
                continue
        # Use published timestamp instead of updated timestamp, if it exists.
        # Skip entries with no timestamp at all.
        timestamp = entry.get("updated_parsed", None)
        timestamp = entry.get("published_parsed", timestamp)
        if timestamp:
            timestamp = datetime.datetime.fromtimestamp(time.mktime(timestamp))
            items.append((timestamp, entry.link, entry.title, feed.title))
    return items


def items_from_gemsub_string(feed_url, gemsub_str):
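    """Parse a Gemsub (text/gemini) feed and return a list of
    (timestamp, entry link, entry title, feed title) tuples.

    The feed title is taken from the first top-level heading.  Entries are
    link lines whose description begins with an ISO date, for example
    (an illustrative line, not taken from any real feed):

        => posts/hello.gmi 2021-03-01 - Hello Geminispace

    Relative links are resolved against `feed_url`.
    """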
    items = []
    feed_title = ""
    for line in gemsub_str.splitlines():
        if not feed_title and (line.startswith("# ") or line.startswith("#\t")):
            feed_title = line[2:].strip()
        elif line.startswith("=>"):
            bits = line[2:].split(maxsplit=1)
            if len(bits) == 1:
                continue
            entry_link, descr = bits
            entry_link = urllib.parse.urljoin(feed_url, entry_link)
            descr = descr.strip()
            try:
                timestamp = datetime.datetime.fromisoformat(descr[0:10])
            except ValueError:
                continue
            entry_title = descr[10:].strip()
            while entry_title and entry_title[0] in ("-", ":"):
                entry_title = entry_title[1:].strip()
            items.append((timestamp, entry_link, entry_title, feed_title))
    return items


def format_aggregated(items, filename, n_feeds):
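    """Write `items` to `filename` as a text/gemini document, grouped by
    day.  If header.gmi or footer.gmi exist in the working directory they
    are included verbatim at the top and bottom of the output."""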
    with open(filename, "w") as fp:
        # Add header
        if os.path.exists("header.gmi"):
            with open("header.gmi", "r") as fp2:
                fp.write(fp2.read())
        else:
            fp.write("# CAPCOM Gemini feed aggregator\n\n")
        # Feed count
        fp.write("Aggregating {} Atom and Gemsub feeds from Geminispace.\n".format(n_feeds))
        # List feed entries, starting a new heading each time the day changes
        current_day = (0, 0)
        for updated, link, entry_title, feed_title in items:
            timetup = updated.timetuple()
            item_day = (timetup.tm_year, timetup.tm_yday)
            if item_day != current_day:
                current_day = item_day
                fp.write("\n## " + updated.strftime("%Y-%m-%d") + "\n\n")
            fp.write("=> {} {} - {}\n".format(link, feed_title, entry_title))
        fp.write("\n")
        # Add footer
        if os.path.exists("footer.gmi"):
            with open("footer.gmi", "r") as fp2:
                fp.write(fp2.read())


def aggregate(feed_file="feeds.txt", output_file="index.gmi", max_posts=64, max_posts_per_feed=0, max_days=0):
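    """Fetch every feed listed in `feed_file` and write the most recent
    entries to `output_file`, subject to the max_posts, max_posts_per_feed
    and max_days limits.  Permanent redirects encountered along the way
    are recorded and written back to `feed_file` afterwards."""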
    # Load feed URLs to query
    feed_urls = load_feed_urls(feed_file)
    N = len(feed_urls)

    # Prepare to extract feed items
    last_accessed = {}
    perm_redirects = {}
    skips = 0
    items = []
    while feed_urls:
        # Get a feed URL to fetch
        feed_url = feed_urls.pop()

        # Don't hammer servers
        netloc = urllib.parse.urlsplit(feed_url).netloc
        last = last_accessed.get(netloc, 0)
        now = time.time()
        interval = int(now - last)
        if interval < 5:
            print("Declining to hit {} again after only {} seconds".format(netloc, interval))
            feed_urls.insert(0, feed_url)
            skips += 1
            if skips == len(feed_urls):
                # We've hammered every server in the queue! Sleep a bit...
                print("Sleeping to give all servers a rest!")
                time.sleep(5)
            continue
        skips = 0

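        # Gemini status codes are two digits: "20" is success, "30" a
        # temporary redirect and "31" a permanent redirect (per the Gemini
        # protocol spec).  The fetch loop below follows a few redirects
        # before giving up.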
        # Good to go
        print("Fetching", feed_url)
        redirect_count = 0
        try:
            # Redirect following loop
            while True:
                resp = Agunua.GeminiUri(feed_url, insecure=True, get_content=True)
                if resp.status_code not in ("30", "31"):
                    break
                old_url = feed_url
                feed_url = urllib.parse.urljoin(old_url, resp.meta)
                if resp.status_code == "31":
                    print("Recording permanent redirect from {} to {}".format(old_url, feed_url))
                    perm_redirects[old_url] = feed_url
                redirect_count += 1
                if redirect_count > 3:
                    break

            # Parse Atom or Gemsub
            if resp.status_code == "20":
                last_accessed[netloc] = time.time()
                if resp.mediatype == "text/gemini":
                    feed_items = items_from_gemsub_string(feed_url, resp.payload)
                else:
                    feed_items = items_from_feed_string(resp.payload)
                if max_posts_per_feed and len(feed_items) > max_posts_per_feed:
                    feed_items.sort(reverse=True)
                    feed_items = feed_items[0:max_posts_per_feed]
                items.extend(feed_items)
        except Exception as e:
            print("Error on {}, skipping: {}".format(feed_url, e))
            continue

    # Discard posts which are too old
    if max_days:
        now = datetime.datetime.now()
        items = [i for i in items if (now - i[0]).days <= max_days]

    # Remove duplicate items.  Shuffling first means the surviving copy of
    # any duplicated URL is chosen at random, which may not be ideal.
    random.shuffle(items)
    seen_urls = []
    new_items = []
    for item in items:
        if item[1] not in seen_urls:
            seen_urls.append(item[1])
            new_items.append(item)
    items = new_items

    # Keep only the most recent max_posts items
    items.sort(reverse=True)
    items = items[0:max_posts]

    # Format output
    format_aggregated(items, output_file, N)

    # Update feed file with permanent redirects
    update_feed_urls(perm_redirects, feed_file)


def main():
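    """Parse command-line arguments and run the aggregator."""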
    # Parse arguments
    parser = argparse.ArgumentParser(description='Aggregate Atom and Gemsub feeds to a Gemini doc.')
    parser.add_argument('-a', '--age', dest='max_age', type=str,
                        default="", help='maximum age of posts in output (e.g. 30d, 4w, 6m, 1y)')
    parser.add_argument('-f', '--feeds', dest='feed_file', type=str,
                        default="feeds.txt", help='file to read feed URLs from')
    parser.add_argument('-o', '--output', dest='output', type=str,
                        default="index.gmi", help='output filename')
    parser.add_argument('-p', '--posts', dest='max_posts', type=int,
                        default=64, help='maximum number of posts in output')
    parser.add_argument('-P', '--perfeed', dest='max_posts_per_feed', type=int,
                        default=0, help='maximum number of posts per feed in output')
    args = parser.parse_args()

    # Parse max age: an integer followed by d(ays), w(eeks), m(onths) or y(ears)
    max_days = 0
    if args.max_age:
        try:
            unit = args.max_age[-1].lower()
            value = int(args.max_age[0:-1])
            if unit == "d":
                max_days = value
            elif unit == "w":
                max_days = 7*value
            elif unit == "m":
                max_days = 30*value
            elif unit == "y":
                max_days = 365*value
            else:
                raise ValueError("unknown age unit: " + unit)
        except ValueError:
            print("Ignoring invalid maximum age.")

    # Aggregate feeds
    aggregate(args.feed_file, args.output, args.max_posts, args.max_posts_per_feed, max_days)


if __name__ == "__main__":
    main()