2020-03-18 20:32:52 +00:00
|
|
|
#!/usr/bin/env python3
|
2020-03-16 18:45:04 +00:00
|
|
|
import argparse
|
2020-03-16 17:42:58 +00:00
|
|
|
import datetime
|
|
|
|
import glob
|
2020-03-16 21:18:37 +00:00
|
|
|
import os
|
2020-03-16 17:42:58 +00:00
|
|
|
import os.path
|
2020-05-20 17:16:45 +00:00
|
|
|
import re
|
2020-03-16 21:44:33 +00:00
|
|
|
import stat
|
2020-03-16 19:29:52 +00:00
|
|
|
import urllib.parse
|
2020-03-16 17:42:58 +00:00
|
|
|
|
|
|
|
from feedgen.feed import FeedGenerator
|
|
|
|
|
2020-03-17 12:09:51 +00:00
|
|
|
def is_world_readable(filename):
    """
    Return True if the named file is world readable, otherwise return False.
    """
    st = os.stat(filename)
    # stat.S_IROTH is the "others may read" permission bit.  The bitwise
    # AND yields a plain int (0 or the bit value), so coerce to bool to
    # honour the documented True/False contract.
    return bool(st.st_mode & stat.S_IROTH)
2020-03-17 14:45:49 +00:00
|
|
|
def extract_first_heading(filename, default=""):
    """
    Open a file which is presumed to contain text/gemini content and return
    the contents of the first heading line (regardless of heading level).

    If no heading lines are found, return the specified default.
    """
    with open(filename) as fp:
        for line in fp:
            if line.startswith("#"):
                # Drop all leading '#' characters in one step.  The previous
                # character-by-character loop (`while line[0] == "#"`) raised
                # IndexError on a line consisting solely of '#' characters
                # with no trailing newline (possible on a file's last line).
                return line.lstrip("#").strip()
    return default
2020-03-17 18:05:13 +00:00
|
|
|
def get_feed_title(directory):
    """
    If an index.gmi or index.gemini file exists and is worldreadable, return
    the content of the first heading line in the file, otherwise return a
    default feed title.
    """
    # Fall back on the deepest directory name.  os.path.split is used rather
    # than os.path.basename alone because the latter returns an empty string
    # when `directory` carries a trailing slash.
    parent, fallback = os.path.split(directory)
    fallback = fallback or os.path.basename(parent)
    # A world-readable index file overrides the directory-derived title.
    for name in ("index.gmi", "index.gemini"):
        candidate = os.path.join(directory, name)
        if os.path.exists(candidate) and is_world_readable(candidate):
            return extract_first_heading(candidate, fallback)
    return fallback
2020-07-07 12:00:18 +00:00
|
|
|
def find_files(directory, time_func, n=10):
    """
    Return the n most recently created world readable files with extensions of
    .gmi or .gemini, as a list sorted from most to least recent.
    """
    candidates = []
    for ext in ("gmi", "gemini"):
        matches = glob.glob(os.path.join(directory, "*." + ext))
        # The index page is the feed's landing page, not an entry, so it
        # is excluded from the candidate list.
        index_page = os.path.join(directory, "index." + ext)
        candidates.extend(m for m in matches if m != index_page)
    readable = [f for f in candidates if is_world_readable(f)]
    return sorted(readable, key=time_func, reverse=True)[:n]
2020-03-17 14:45:49 +00:00
|
|
|
def urljoin(base, url):
    """
    Return an absolute URL formed by combining the provided base and relative
    URLs.

    This is necessary because the joining functions in Python's urllib do not
    behave as expected for unrecognised URL schemes, and gemini:// is one
    such scheme.  The workaround: temporarily relabel the base URL as https,
    perform the join, then restore the gemini scheme on the result.
    """
    https_base = urllib.parse.urlunsplit(
        urllib.parse.urlsplit(base)._replace(scheme="https"))
    combined = urllib.parse.urlsplit(urllib.parse.urljoin(https_base, url))
    return urllib.parse.urlunsplit(combined._replace(scheme="gemini"))
2020-07-07 12:00:18 +00:00
|
|
|
def populate_entry_from_file(filename, base_url, entry, time_func):
    """
    Set the id, title, updated and link attributes of the provided
    FeedGenerator entry object according the contents of the named
    Gemini file and the base URL.
    """
    # The entry's GUID and alternate link both point at the file's
    # public gemini URL.
    entry_url = urljoin(base_url, os.path.basename(filename))
    entry.guid(entry_url)
    entry.link(href=entry_url, rel="alternate")
    entry.updated(get_update_time(filename, time_func))
    # Prefer the file's first heading as the title; fall back on the
    # bare filename without its extension.
    stem = os.path.splitext(os.path.basename(filename))[0]
    entry.title(extract_first_heading(filename, stem))
2020-07-07 12:00:18 +00:00
|
|
|
def get_update_time(filename, time_func):
    """
    Return an update time for a Gemini file.

    If the filename begins with an ISO8601 date stamp, that date
    (with a time of midnight) will be used.  Otherwise, the file
    "creation time" (which in unix is actually the time of last
    metadata update) will be used instead as a best estimate.
    """
    basename = os.path.basename(filename)
    # Does the filename start with YYYY-MM-DD?
    if re.match(r"[0-9]{4}-[01][0-9]-[0-3][0-9]", basename):
        # Interpret the stamp as midnight UTC ("Z" suffix for %z).
        return datetime.datetime.strptime(basename[:10] + " Z", "%Y-%m-%d %z")
    timestamp = time_func(filename)
    return datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)
2020-07-07 12:00:18 +00:00
|
|
|
def build_feed(directory, time_func, base_url, output="atom.xml", n=10,
               title="", subtitle="", author="", email="", verbose=False):
    """
    Build an Atom feed for all world readable Gemini files in the current
    directory, and write it to atom.xml.

    `time_func` maps a filename to a POSIX timestamp (e.g. os.path.getctime
    or os.path.getmtime) and controls how entries are dated and ordered.
    """
    # If a title hasn't been provided, try to get one from an index page
    if not title:
        title = get_feed_title(directory)

    # Let user know feed title and URL
    feed_url = urljoin(base_url, output)
    if verbose:
        print('Generating feed "{}", which should be served from {}'.format(title, feed_url))

    # Setup feed
    feed = FeedGenerator()
    feed.id(base_url)
    feed.title(title)
    if subtitle:
        feed.subtitle(subtitle)
    author_details = {}
    if author:
        author_details["name"] = author
    if email:
        author_details["email"] = email
    if author_details:
        feed.author(author_details)
    feed.link(href=feed_url, rel='self')
    feed.link(href=base_url, rel='alternate')

    # Add one entry per .gmi file
    files = find_files(directory, time_func, n)
    if not files:
        if verbose:
            print("No world-readable Gemini content found! :(")
        return
    # Loop variable renamed from `n` to `index`: the old name shadowed the
    # `n` parameter (harmless only because `n` was already consumed above).
    for index, filename in enumerate(files):
        entry = feed.add_entry()
        populate_entry_from_file(filename, base_url, entry, time_func)
        if index == 0:
            # `files` is sorted most recent first, so the first entry's
            # timestamp is also the feed-level update time.
            feed.updated(entry.updated())
        if verbose:
            print("Adding {} with title '{}'...".format(os.path.basename(filename),
                                                        entry.title()))

    # Write file
    output = os.path.join(directory, output)
    feed.atom_file(output, pretty=True)
    if verbose:
        print("Wrote Atom feed to {}.".format(output))
|
|
|
def main():
    """
    Parse command line arguments, do some minor processing, and then invoke
    the build_feed command with the provided settings.
    """
    # Parse arguments
    parser = argparse.ArgumentParser(description='Generate an Atom feed for Gemini content.')
    parser.add_argument('-a', '--author', dest='author', type=str,
                        help="feed author's name")
    parser.add_argument('-b', '--base', dest='base_url', type=str,
                        required=True, help='base URL for feed and entries')
    # Current working directory is the natural default content location.
    parser.add_argument('-d', '--directory', dest='directory', type=str,
                        default=os.getcwd(),
                        help='directory to find content and save feed to')
    parser.add_argument('-e', '--email', dest='email', type=str,
                        help="feed author's email address")
    parser.add_argument('-n', dest='n', type=int, default=10,
                        help='include N most recently created files in feed (default 10)')
    parser.add_argument('-o', '--output', dest='output', type=str,
                        default="atom.xml", help='output filename')
    parser.add_argument('-q', '--quiet', dest='verbose', action="store_false",
                        help='Write nothing to stdout under non-error conditions')
    parser.add_argument('-s', '--subtitle', dest='subtitle', type=str,
                        help='feed subtitle')
    parser.add_argument('-t', '--title', dest='title', type=str,
                        help='feed title')
    parser.add_argument('--mtime', action="store_true",
                        help='Use file modification time, not file update time, in feeds')
    args = parser.parse_args()

    # Normalise base URL
    split_url = urllib.parse.urlsplit(args.base_url)
    if not split_url.netloc and split_url.path:
        # Handle a naked domain, which urlsplit interprets as a local path
        split_url = split_url._replace(netloc=split_url.path, path="")
    split_url = split_url._replace(scheme="gemini")
    args.base_url = urllib.parse.urlunsplit(split_url)
    if not args.base_url.endswith("/"):
        args.base_url += "/"

    # Build the feed
    time_function = os.path.getmtime if args.mtime else os.path.getctime
    build_feed(args.directory, time_function, args.base_url, args.output,
               args.n, args.title, args.subtitle, args.author, args.email,
               args.verbose)
|
|
|
# Only run when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()