2020-03-18 20:32:52 +00:00
|
|
|
#!/usr/bin/env python3
|
2020-03-16 18:45:04 +00:00
|
|
|
import argparse
|
2020-03-16 17:42:58 +00:00
|
|
|
import datetime
|
|
|
|
import glob
|
2020-03-16 21:18:37 +00:00
|
|
|
import os
|
2020-03-16 17:42:58 +00:00
|
|
|
import os.path
|
2020-05-20 17:16:45 +00:00
|
|
|
import re
|
2020-03-16 21:44:33 +00:00
|
|
|
import stat
|
2020-03-16 19:29:52 +00:00
|
|
|
import urllib.parse
|
2020-03-16 17:42:58 +00:00
|
|
|
|
|
|
|
from feedgen.feed import FeedGenerator
|
|
|
|
|
2020-03-17 12:09:51 +00:00
|
|
|
def is_world_readable(filename):
    """
    Return True if the named file is world readable, otherwise return False.
    """
    st = os.stat(filename)
    # stat.S_IROTH is the "others may read" permission bit.  The bitwise
    # AND yields a plain int (0 or the bit value), so coerce to bool to
    # honour the documented True/False contract.
    return bool(st.st_mode & stat.S_IROTH)
2020-03-17 14:45:49 +00:00
|
|
|
def extract_first_heading(filename, default=""):
    """
    Open a file which is presumed to contain text/gemini content and return
    the contents of the first heading line (regardless of heading level).

    If no heading lines are found, return the specified default.
    """
    with open(filename) as fp:
        for line in fp:
            if line.startswith("#"):
                # Drop all leading '#' characters in one step.  The previous
                # character-by-character loop (`while line[0] == "#"`) raised
                # IndexError on a line consisting solely of '#' characters
                # with no trailing newline (possible on a file's last line).
                return line.lstrip("#").strip()
    return default
2020-03-17 18:05:13 +00:00
|
|
|
def get_feed_title(directory):
    """
    If an index.gmi or index.gemini file exists and is worldreadable, return
    the content of the first heading line in the file, otherwise return a
    default feed title.
    """
    # Fall back on the deepest directory name.  os.path.split is used rather
    # than os.path.basename alone because the latter returns an empty string
    # when `directory` carries a trailing slash.
    parent, fallback = os.path.split(directory)
    fallback = fallback or os.path.basename(parent)
    # A world-readable index file overrides the directory-derived title.
    for name in ("index.gmi", "index.gemini"):
        candidate = os.path.join(directory, name)
        if os.path.exists(candidate) and is_world_readable(candidate):
            return extract_first_heading(candidate, fallback)
    return fallback
2020-07-07 12:00:18 +00:00
|
|
|
def find_files(directory, time_func, n=10):
    """
    Return the n most recently created world readable files with extensions of
    .gmi or .gemini, as a list sorted from most to least recent.
    """
    candidates = []
    for ext in ("gmi", "gemini"):
        matches = glob.glob(os.path.join(directory, "*." + ext))
        # The index page is the feed's landing page, not an entry, so it
        # is excluded from the candidate list.
        index_page = os.path.join(directory, "index." + ext)
        candidates.extend(m for m in matches if m != index_page)
    readable = [f for f in candidates if is_world_readable(f)]
    return sorted(readable, key=time_func, reverse=True)[:n]
2020-03-17 14:45:49 +00:00
|
|
|
def urljoin(base, url):
    """
    Return an absolute URL formed by combining the provided base and relative
    URLs.

    This is necessary because the joining functions in Python's urllib do not
    behave as expected for unrecognised URL schemes, and gemini:// is one
    such scheme.  The workaround: temporarily relabel the base URL as https,
    perform the join, then restore the gemini scheme on the result.
    """
    https_base = urllib.parse.urlunsplit(
        urllib.parse.urlsplit(base)._replace(scheme="https"))
    combined = urllib.parse.urlsplit(urllib.parse.urljoin(https_base, url))
    return urllib.parse.urlunsplit(combined._replace(scheme="gemini"))
2020-07-07 12:00:18 +00:00
|
|
|
def populate_entry_from_file(filename, base_url, entry, time_func):
    """
    Set the id, title, updated and link attributes of the provided
    FeedGenerator entry object according the contents of the named
    Gemini file and the base URL.
    """
    # The entry's GUID and alternate link both point at the file's
    # public gemini URL.
    entry_url = urljoin(base_url, os.path.basename(filename))
    entry.guid(entry_url)
    entry.link(href=entry_url, rel="alternate")
    entry.updated(get_update_time(filename, time_func))
    # Prefer the file's first heading as the title; fall back on the
    # bare filename without its extension.
    stem = os.path.splitext(os.path.basename(filename))[0]
    entry.title(extract_first_heading(filename, stem))
2020-07-07 12:00:18 +00:00
|
|
|
def get_update_time(filename, time_func):
    """
    Return an update time for a Gemini file.

    If the filename begins with an ISO8601 date stamp, that date
    (with a time of midnight) will be used.  Otherwise, the file
    "creation time" (which in unix is actually the time of last
    metadata update) will be used instead as a best estimate.
    """
    basename = os.path.basename(filename)
    # Does the filename start with YYYY-MM-DD?
    if re.match(r"[0-9]{4}-[01][0-9]-[0-3][0-9]", basename):
        # Interpret the stamp as midnight UTC ("Z" suffix for %z).
        return datetime.datetime.strptime(basename[:10] + " Z", "%Y-%m-%d %z")
    timestamp = time_func(filename)
    return datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)
2020-07-07 12:00:18 +00:00
|
|
|
def build_feed(directory, time_func, base_url, output="atom.xml", n=10,
               title="", subtitle="", author="", email="", verbose=False):
    """
    Build an Atom feed for all world readable Gemini files in the current
    directory, and write it to atom.xml.

    `time_func` maps a filename to a POSIX timestamp (e.g. os.path.getctime
    or os.path.getmtime) and controls how entries are dated and ordered.
    """
    # If a title hasn't been provided, try to get one from an index page
    if not title:
        title = get_feed_title(directory)

    # Let user know feed title and URL
    feed_url = urljoin(base_url, output)
    if verbose:
        print('Generating feed "{}", which should be served from {}'.format(title, feed_url))

    # Setup feed
    feed = FeedGenerator()
    feed.id(base_url)
    feed.title(title)
    if subtitle:
        feed.subtitle(subtitle)
    author_details = {}
    if author:
        author_details["name"] = author
    if email:
        author_details["email"] = email
    if author_details:
        feed.author(author_details)
    feed.link(href=feed_url, rel='self')
    feed.link(href=base_url, rel='alternate')

    # Add one entry per .gmi file
    files = find_files(directory, time_func, n)
    if not files:
        if verbose:
            print("No world-readable Gemini content found! :(")
        return
    # Loop variable renamed from `n` to `index`: the old name shadowed the
    # `n` parameter (harmless only because `n` was already consumed above).
    for index, filename in enumerate(files):
        entry = feed.add_entry()
        populate_entry_from_file(filename, base_url, entry, time_func)
        if index == 0:
            # `files` is sorted most recent first, so the first entry's
            # timestamp is also the feed-level update time.
            feed.updated(entry.updated())
        if verbose:
            print("Adding {} with title '{}'...".format(os.path.basename(filename),
                                                        entry.title()))

    # Write file
    output = os.path.join(directory, output)
    feed.atom_file(output, pretty=True)
    if verbose:
        print("Wrote Atom feed to {}.".format(output))
|
|
|
def main():
    """
    Parse command line arguments, do some minor processing, and then invoke
    the build_feed command with the provided settings.
    """
    # Parse arguments
    parser = argparse.ArgumentParser(description='Generate an Atom feed for Gemini content.')
    parser.add_argument('-a', '--author', dest='author', type=str,
                        help="feed author's name")
    parser.add_argument('-b', '--base', dest='base_url', type=str,
                        required=True, help='base URL for feed and entries')
    # Current working directory is the natural default content location.
    parser.add_argument('-d', '--directory', dest='directory', type=str,
                        default=os.getcwd(),
                        help='directory to find content and save feed to')
    parser.add_argument('-e', '--email', dest='email', type=str,
                        help="feed author's email address")
    parser.add_argument('-n', dest='n', type=int, default=10,
                        help='include N most recently created files in feed (default 10)')
    parser.add_argument('-o', '--output', dest='output', type=str,
                        default="atom.xml", help='output filename')
    parser.add_argument('-q', '--quiet', dest='verbose', action="store_false",
                        help='Write nothing to stdout under non-error conditions')
    parser.add_argument('-s', '--subtitle', dest='subtitle', type=str,
                        help='feed subtitle')
    parser.add_argument('-t', '--title', dest='title', type=str,
                        help='feed title')
    parser.add_argument('--mtime', action="store_true",
                        help='Use file modification time, not file update time, in feeds')
    args = parser.parse_args()

    # Normalise base URL
    split_url = urllib.parse.urlsplit(args.base_url)
    if not split_url.netloc and split_url.path:
        # Handle a naked domain, which urlsplit interprets as a local path
        split_url = split_url._replace(netloc=split_url.path, path="")
    split_url = split_url._replace(scheme="gemini")
    args.base_url = urllib.parse.urlunsplit(split_url)
    if not args.base_url.endswith("/"):
        args.base_url += "/"

    # Build the feed
    time_function = os.path.getmtime if args.mtime else os.path.getctime
    build_feed(args.directory, time_function, args.base_url, args.output,
               args.n, args.title, args.subtitle, args.author, args.email,
               args.verbose)
|
|
|
# Only run when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()