gemfeed/gemfeed.py

import argparse
import datetime
import glob
import os
import os.path
import stat
import urllib.parse

from feedgen.feed import FeedGenerator

def is_world_readable(filename):
    """
    Return True if the named file is world readable, otherwise return False.
    """
    st = os.stat(filename)
    return st.st_mode & stat.S_IROTH

def extract_first_heading(filename, default=""):
    """
    Open a file which is presumed to contain text/gemini content and return
    the contents of the first heading line (regardless of heading level).
    If no heading lines are found, return the specified default.
    """
    with open(filename) as fp:
        for line in fp:
            if line.startswith("#"):
                while line[0] == "#":
                    line = line[1:]
                return line.strip()
    return default

def get_feed_title():
    """
    If an index.gmi or index.gemini file exists and is worldreadable, return
    the content of the first heading line in the file, otherwise return a
    default feed title.
    """
    default = "Just another Gemini feed"
    for index_file in ("index.gmi", "index.gemini"):
        if os.path.exists(index_file) and is_world_readable(index_file):
            return extract_first_heading(index_file, default)
    return default

def find_files(n=10):
    """
    Return the n most recently created world readable files with extensions of
    .gmi or .gemini, as a list sorted from most to least recent.
    """
    files = []
    for extension in ("gmi", "gemini"):
        files.extend(glob.glob("*.{}".format(extension)))
        index = "index.{}".format(extension)
        if index in files:
            files.remove(index)
    files = [f for f in files if is_world_readable(f)]
    files.sort(key=os.path.getctime, reverse=True)
    return files[0:n]

def urljoin(base, url):
    """
    Return an absolute URL formed by combining the provided base and relative
    URLs.

    This is necessary because the various functions in Python's urllib to do
    this do not function as expected if the URL scheme is not recognised,
    which of course gemini:// is not.  Thus, we need to do a little dance
    where we transform gemini URLs to https URLs, join them, and then undo
    the transformation.
    """
    base = urllib.parse.urlsplit(base)
    base = base._replace(scheme="https")
    base = urllib.parse.urlunsplit(base)
    joined = urllib.parse.urljoin(base, url)
    joined = urllib.parse.urlsplit(joined)
    joined = joined._replace(scheme="gemini")
    return urllib.parse.urlunsplit(joined)

def populate_entry_from_file(filename, base_url, entry):
    """
    Set the id, title, updated and link attributes of the provided
    FeedGenerator entry object according the contents of the named
    Gemini file and the base URL.
    """
    url = urljoin(base_url, filename)
    entry.guid(url)
    entry.link(href=url, rel="alternate")
    updated = os.path.getctime(filename)
    updated = datetime.datetime.fromtimestamp(updated, tz=datetime.timezone.utc)
    entry.updated(updated)
    title = extract_first_heading(filename, filename)
    entry.title(title)

def main():

    # Get default title from index page, if there is one
    feed_title = get_feed_title()

    # Parse arguments
    parser = argparse.ArgumentParser(description='Generate an Atom feed for Gemini content.')
    parser.add_argument('-a', '--author', dest='author', type=str,
            help="feed author's name")
    parser.add_argument('-b', '--base', dest='base_url', type=str,
            required=True, help='base URL for feed and entries')
    parser.add_argument('-e', '--email', dest='email', type=str,
            help="feed author's email address")
    parser.add_argument('-n', dest='n', type=int, default=10,
            help='include N most recently created files in feed (default 10)')
    parser.add_argument('-o', '--output', dest='output', type=str,
            default="atom.xml", help='output filename')
    parser.add_argument('-s', '--subtitle', dest='subtitle', type=str,
            help='feed subtitle')
    parser.add_argument('-t', '--title', dest='title', type=str,
            default=feed_title, help='feed title')
    args = parser.parse_args()

    # Normalise base URL
    base_url = urllib.parse.urlsplit(args.base_url)
    if not base_url.netloc and base_url.path:
        # Handle a naked domain, which urlsplit will interpet at a local path
        base_url = base_url._replace(netloc=base_url.path, path="")
    base_url = base_url._replace(scheme="gemini")
    args.base_url = urllib.parse.urlunsplit(base_url)

    # Setup feed
    feed = FeedGenerator()
    feed.id(args.base_url)
    feed.title(args.title)
    if args.subtitle:
        feed.subtitle(args.subtitle)
    author = {}
    if args.author:
        author["name"] = args.author
    if args.email:
        author["email"] = args.email
    if author:
        feed.author(author)
    feed.link(href=args.base_url, rel='alternate')
    feed.link(href=urljoin(args.base_url, args.output), rel='self')

    # Add one entry per .gmi file
    files = find_files(args.n)
    if not files:
        print("No world-readable Gemini content found! :(")
        return

    for n, filename in enumerate(files):
        entry = feed.add_entry()
        populate_entry_from_file(filename, args.base_url, entry)
        print("Adding {} with title '{}'...".format(filename, entry.title()))
        if n == 0:
            feed.updated(entry.updated())

    # Write file
    feed.atom_file(args.output, pretty=True)
    print("Wrote Atom feed to {}.".format(args.output))

if __name__ == "__main__":
    main()
Add command line options for base URL and feed title. 2020-03-16 18:45:04 +00:00			`import argparse`
Initial proof-of-concept implementation. 2020-03-16 17:42:58 +00:00			`import datetime`
			`import glob`
Include only world readable files. 2020-03-16 21:18:37 +00:00			`import os`
Initial proof-of-concept implementation. 2020-03-16 17:42:58 +00:00			`import os.path`
Fix stat nonsense, git can die in a fire. 2020-03-16 21:44:33 +00:00			`import stat`
Better URL handling. 2020-03-16 19:29:52 +00:00			`import urllib.parse`
Initial proof-of-concept implementation. 2020-03-16 17:42:58 +00:00
			`from feedgen.feed import FeedGenerator`

Extract feed title from index.gmi if it exists and is world readable. 2020-03-17 12:09:51 +00:00			`def is_world_readable(filename):`
Add docstrings for all functions except main(). 2020-03-17 14:45:49 +00:00			`"""`
			`Return True if the named file is world readable, otherwise return False.`
			`"""`
Extract feed title from index.gmi if it exists and is world readable. 2020-03-17 12:09:51 +00:00			`st = os.stat(filename)`
			`return st.st_mode & stat.S_IROTH`

Add docstrings for all functions except main(). 2020-03-17 14:45:49 +00:00			`def extract_first_heading(filename, default=""):`
			`"""`
			`Open a file which is presumed to contain text/gemini content and return`
			`the contents of the first heading line (regardless of heading level).`
			`If no heading lines are found, return the specified default.`
			`"""`
			`with open(filename) as fp:`
			`for line in fp:`
			`if line.startswith("#"):`
			`while line[0] == "#":`
			`line = line[1:]`
			`return line.strip()`
			`return default`

Extract feed title from index.gmi if it exists and is world readable. 2020-03-17 12:09:51 +00:00			`def get_feed_title():`
Add docstrings for all functions except main(). 2020-03-17 14:45:49 +00:00			`"""`
			`If an index.gmi or index.gemini file exists and is worldreadable, return`
			`the content of the first heading line in the file, otherwise return a`
			`default feed title.`
			`"""`
Extract feed title from index.gmi if it exists and is world readable. 2020-03-17 12:09:51 +00:00			`default = "Just another Gemini feed"`
Remove support for .gem extension, as the Best Practices document mentions only .gmi and .gemini. 2020-03-17 14:12:11 +00:00			`for index_file in ("index.gmi", "index.gemini"):`
			`if os.path.exists(index_file) and is_world_readable(index_file):`
			`return extract_first_heading(index_file, default)`
Extract feed title from index.gmi if it exists and is world readable. 2020-03-17 12:09:51 +00:00			`return default`

Include only 10 (by default) most recent files in the feed. 2020-03-17 12:26:32 +00:00			`def find_files(n=10):`
Add docstrings for all functions except main(). 2020-03-17 14:45:49 +00:00			`"""`
			`Return the n most recently created world readable files with extensions of`
			`.gmi or .gemini, as a list sorted from most to least recent.`
			`"""`
Break into functions. 2020-03-16 17:54:49 +00:00			`files = []`
Remove support for .gem extension, as the Best Practices document mentions only .gmi and .gemini. 2020-03-17 14:12:11 +00:00			`for extension in ("gmi", "gemini"):`
Break into functions. 2020-03-16 17:54:49 +00:00			`files.extend(glob.glob("*.{}".format(extension)))`
			`index = "index.{}".format(extension)`
			`if index in files:`
			`files.remove(index)`
Include only 10 (by default) most recent files in the feed. 2020-03-17 12:26:32 +00:00			`files = [f for f in files if is_world_readable(f)]`
			`files.sort(key=os.path.getctime, reverse=True)`
			`return files[0:n]`
Extract feed title from index.gmi if it exists and is world readable. 2020-03-17 12:09:51 +00:00
Add docstrings for all functions except main(). 2020-03-17 14:45:49 +00:00			`def urljoin(base, url):`
			`"""`
			`Return an absolute URL formed by combining the provided base and relative`
			`URLs.`

			`This is necessary because the various functions in Python's urllib to do`
			`this do not function as expected if the URL scheme is not recognised,`
			`which of course gemini:// is not. Thus, we need to do a little dance`
			`where we transform gemini URLs to https URLs, join them, and then undo`
			`the transformation.`
			`"""`
			`base = urllib.parse.urlsplit(base)`
			`base = base._replace(scheme="https")`
			`base = urllib.parse.urlunsplit(base)`
			`joined = urllib.parse.urljoin(base, url)`
			`joined = urllib.parse.urlsplit(joined)`
			`joined = joined._replace(scheme="gemini")`
			`return urllib.parse.urlunsplit(joined)`
Initial proof-of-concept implementation. 2020-03-16 17:42:58 +00:00
Break into functions. 2020-03-16 17:54:49 +00:00			`def populate_entry_from_file(filename, base_url, entry):`
Add docstrings for all functions except main(). 2020-03-17 14:45:49 +00:00			`"""`
			`Set the id, title, updated and link attributes of the provided`
			`FeedGenerator entry object according the contents of the named`
			`Gemini file and the base URL.`
			`"""`
Better URL handling. 2020-03-16 19:29:52 +00:00			`url = urljoin(base_url, filename)`
Initial proof-of-concept implementation. 2020-03-16 17:42:58 +00:00			`entry.guid(url)`
Add rel attribute for entry links. 2020-03-16 19:01:55 +00:00			`entry.link(href=url, rel="alternate")`
Initial proof-of-concept implementation. 2020-03-16 17:42:58 +00:00			`updated = os.path.getctime(filename)`
			`updated = datetime.datetime.fromtimestamp(updated, tz=datetime.timezone.utc)`
			`entry.updated(updated)`
Extract feed title from index.gmi if it exists and is world readable. 2020-03-17 12:09:51 +00:00			`title = extract_first_heading(filename, filename)`
Initial proof-of-concept implementation. 2020-03-16 17:42:58 +00:00			`entry.title(title)`

Add command line options for base URL and feed title. 2020-03-16 18:45:04 +00:00			`def main():`

Extract feed title from index.gmi if it exists and is world readable. 2020-03-17 12:09:51 +00:00			`# Get default title from index page, if there is one`
			`feed_title = get_feed_title()`

Add command line options for base URL and feed title. 2020-03-16 18:45:04 +00:00			`# Parse arguments`
			`parser = argparse.ArgumentParser(description='Generate an Atom feed for Gemini content.')`
Add command line options for feed subtitle and author details. 2020-03-16 19:09:17 +00:00			`parser.add_argument('-a', '--author', dest='author', type=str,`
			`help="feed author's name")`
Add command line options for base URL and feed title. 2020-03-16 18:45:04 +00:00			`parser.add_argument('-b', '--base', dest='base_url', type=str,`
			`required=True, help='base URL for feed and entries')`
Add command line options for feed subtitle and author details. 2020-03-16 19:09:17 +00:00			`parser.add_argument('-e', '--email', dest='email', type=str,`
			`help="feed author's email address")`
Include only 10 (by default) most recent files in the feed. 2020-03-17 12:26:32 +00:00			`parser.add_argument('-n', dest='n', type=int, default=10,`
			`help='include N most recently created files in feed (default 10)')`
Add command line options for base URL and feed title. 2020-03-16 18:45:04 +00:00			`parser.add_argument('-o', '--output', dest='output', type=str,`
			`default="atom.xml", help='output filename')`
Add command line options for feed subtitle and author details. 2020-03-16 19:09:17 +00:00			`parser.add_argument('-s', '--subtitle', dest='subtitle', type=str,`
			`help='feed subtitle')`
Add command line options for base URL and feed title. 2020-03-16 18:45:04 +00:00			`parser.add_argument('-t', '--title', dest='title', type=str,`
Extract feed title from index.gmi if it exists and is world readable. 2020-03-17 12:09:51 +00:00			`default=feed_title, help='feed title')`
Add command line options for base URL and feed title. 2020-03-16 18:45:04 +00:00			`args = parser.parse_args()`

Better URL handling. 2020-03-16 19:29:52 +00:00			`# Normalise base URL`
			`base_url = urllib.parse.urlsplit(args.base_url)`
			`if not base_url.netloc and base_url.path:`
			`# Handle a naked domain, which urlsplit will interpet at a local path`
			`base_url = base_url._replace(netloc=base_url.path, path="")`
			`base_url = base_url._replace(scheme="gemini")`
			`args.base_url = urllib.parse.urlunsplit(base_url)`
Break into functions. 2020-03-16 17:54:49 +00:00
			`# Setup feed`
			`feed = FeedGenerator()`
Add command line options for base URL and feed title. 2020-03-16 18:45:04 +00:00			`feed.id(args.base_url)`
			`feed.title(args.title)`
Add command line options for feed subtitle and author details. 2020-03-16 19:09:17 +00:00			`if args.subtitle:`
			`feed.subtitle(args.subtitle)`
			`author = {}`
			`if args.author:`
			`author["name"] = args.author`
			`if args.email:`
			`author["email"] = args.email`
			`if author:`
			`feed.author(author)`
Add command line options for base URL and feed title. 2020-03-16 18:45:04 +00:00			`feed.link(href=args.base_url, rel='alternate')`
Better URL handling. 2020-03-16 19:29:52 +00:00			`feed.link(href=urljoin(args.base_url, args.output), rel='self')`
Break into functions. 2020-03-16 17:54:49 +00:00
			`# Add one entry per .gmi file`
Include only 10 (by default) most recent files in the feed. 2020-03-17 12:26:32 +00:00			`files = find_files(args.n)`
Don't produce empty feeds. 2020-03-17 12:35:37 +00:00			`if not files:`
			`print("No world-readable Gemini content found! :(")`
			`return`

Take advantage of the fact that find_files now sorts files by timestamp. 2020-03-17 12:33:38 +00:00			`for n, filename in enumerate(files):`
Break into functions. 2020-03-16 17:54:49 +00:00			`entry = feed.add_entry()`
Add command line options for base URL and feed title. 2020-03-16 18:45:04 +00:00			`populate_entry_from_file(filename, args.base_url, entry)`
Break into functions. 2020-03-16 17:54:49 +00:00			`print("Adding {} with title '{}'...".format(filename, entry.title()))`
Take advantage of the fact that find_files now sorts files by timestamp. 2020-03-17 12:33:38 +00:00			`if n == 0:`
			`feed.updated(entry.updated())`
Break into functions. 2020-03-16 17:54:49 +00:00
			`# Write file`
Add command line options for base URL and feed title. 2020-03-16 18:45:04 +00:00			`feed.atom_file(args.output, pretty=True)`
			`print("Wrote Atom feed to {}.".format(args.output))`
Break into functions. 2020-03-16 17:54:49 +00:00
			`if __name__ == "__main__":`
			`main()`