#! /usr/bin/env python3

"""
mw2html - Mediawiki to static HTML

I use this to create a personal website from a local
mediawiki installation.  No search functionality.  Hacks the
Monobook skin and the produced HTML.

Connelly Barnes 2005.  Public domain.

Reworked by Andre Pinto 2009.
Improved performance.
Improved filtering.
Improved usability.
Customized for Audacity's manual wiki.

Minor tweaks (for Audacity) by James Crook, Nov 2009.
Moved to Python3 by Jack Thomson, May 2020
...
"""

__version__ = '0.1.0.3'

import re
import sys
import getopt
import random
import urllib.request, urllib.parse, urllib.error
import textwrap
import os, os.path
import errno
import hashlib
import http.client
from time import strftime
from shutil import copyfile

try:
    import htmldata
except ImportError:
    print('Requires Python3 htmldata module:')
    print('  https://github.com/audacity/audacity/blob/master/scripts/mw2html_audacity/htmldata.py')
    sys.exit()

config = None

MOVE_HREF = 'movehref'
MADE_BY_COMMENT = '<!-- Content generated by MediaWiki and mw2html -->'
INDEX_HTML = 'index.html'
QHELP_HTML = 'quick_help.html'

url_filename_cache = {}
redir_cache = {}
wrote_file_set = set()

sidebar_html = ''
footer_text = ''
counter = 0
errors = 0

conn = None
headers = {"User-Agent": "mw2html.py/Audacity"}
domain = ''

MONOBOOK_SKIN = 'monobook'  # Constant identifier for Monobook.


class Config:
    """
    Instances contain all options passed at the command line.
    """
    def __init__(self, rooturl, outdir,
                 flatten=True, index=None, clean=True,
                 sidebar=None, hack_skin=True,
                 made_by=True, overwrite=False, footer=None,
                 skin=MONOBOOK_SKIN, move_href=True,
                 remove_png=True, remove_history=True,
                 limit_parent=False, special_mode=False,
                 debug=False, no_images=False):
        self.rooturl = rooturl
        self.outdir = os.path.abspath(outdir)
        self.flatten = flatten
        self.index = index
        self.clean = clean
        self.sidebar = sidebar
        self.hack_skin = hack_skin
        self.made_by = made_by
        self.overwrite = overwrite
        self.footer = footer
        self.skin = skin
        self.move_href = move_href
        if self.sidebar is not None:
            self.sidebar = os.path.abspath(self.sidebar)
        if self.footer is not None:
            self.footer = os.path.abspath(self.footer)
        self.remove_png = remove_png
        self.remove_history = remove_history
        self.limit_parent = limit_parent
        self.special_mode = special_mode
        self.debug = debug
        self.no_images = no_images
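
# A minimal usage sketch (illustration only, not part of the original script);
# the URL and output directory below are hypothetical values:
#
#     config = Config('https://wiki.example.org/', './manual-out', special_mode=True)
#     config.outdir    # absolute path derived from './manual-out'
#     config.sidebar   # None unless a sidebar file was given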
""" url = normalize_url(u) #ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='') L = list(urllib.parse.urlparse(url)) return L[1] def normalize_url(url, lower=True): # url normalization - only for local comparison operations, use original url for online requests url = split_section(url)[0] if lower: url = url.lower() #if url.startswith('http://'): # url = url[len('http://'):] if url.startswith('https://'): url = url[len('https://'):] if url.startswith('www.'): url = url[len('www.'):] url = url.strip('/') url = 'https://' + url urllib.parse.urljoin(config.rooturl, url) return url def find_tag_limits(doc, filter_string, end_tag, start_tag, start_point=0): # find tag limits - start_string must be an unique identifier within doc i1 = doc.find(filter_string, start_point) if i1 == -1: return (-1, -1) aux = doc.rfind(start_tag, start_point, i1 + len(filter_string)) # we've found the filter_string but it has not the start_tag, so we return a different value # telling the script to keep searching starting on the end of the filter_string found if aux == -1: return (-2, i1 + len(filter_string)) i1 = aux sdiv = i1 ediv = i1 + len(start_tag) while(sdiv < ediv and sdiv != -1): sdiv = doc.find(start_tag, sdiv + len(start_tag)) ediv = doc.find(end_tag , ediv + len(end_tag)) return (i1, ediv) def clean_tag(doc, filter_string, end_tag, start_tag): #clean tagged text function start_point = 0 while True: (start1, start2) = find_tag_limits(doc, filter_string, end_tag, start_tag, start_point) if start1 == -1 or start2 == -1: return doc if start1 == -2: start_point = start2 continue end1 = doc.find('>', start1) + 1; end2 = start2 + len(end_tag); doc = doc[:start1] + doc[end1:start2] + doc[end2:] def remove_tag(doc, start_string, end_tag, start_tag): #remove tagged text function while True: (i1, i2) = find_tag_limits(doc, start_string, end_tag, start_tag) if i1 == -1 or i2 == -1: return doc doc = doc[:i1] + doc[i2 + len(end_tag):] def monobook_fix_html(doc, page_url): """ Sets sidebar for Mediawiki 1.4beta6 Monobook HTML output. """ global config if config.made_by: doc = doc.replace('', '', '', '', '', '', '', '', '', '', '', '', '', '', r'', doc) doc = re.sub(r'', r'', doc) # Remove print footer doc = re.sub(r'

def monobook_fix_html(doc, page_url):
    """
    Sets sidebar for Mediawiki 1.4beta6 Monobook HTML output.
    """
    global config

    if config.made_by:
        doc = doc.replace('<head>', '<head>\n' + MADE_BY_COMMENT + '\n')

    # Replace the navigation portlet with our own sidebar HTML.
    doc = re.sub(r'<div class="portlet" id="p-navigation">[\s\S]+?</div>', sidebar_html, doc)

    # Strip interactive Monobook chrome that is useless in a static snapshot:
    # page tabs, personal tools, the search box and the toolbox.
    doc = re.sub(r'<div class="portlet" id="p-cactions">[\s\S]+?</div>', r'', doc)
    doc = re.sub(r'<div class="portlet" id="p-personal">[\s\S]+?</div>', r'', doc)
    doc = re.sub(r'<div class="portlet" id="p-search">[\s\S]+?</div>', r'', doc)
    doc = re.sub(r'<div class="portlet" id="p-tb">[\s\S]+?</div>', r'', doc)

    # Remove the section edit links, which point back at the live wiki.
    doc = re.sub(r'<span class="editsection">[\s\S]+?</span>', r'', doc)

    # Remove print footer
    doc = re.sub(r'<div class="printfooter">[\s\S]+?</div>', r'', doc)

    # Remove noexport
    doc = remove_tag(doc, '<div class="noexport">', '</div>', '<div')

    # Remove the "Image links" section of image description pages.
    doc = re.sub(r'<h2>Image links</h2>[\s\S]+?</ul>', r'', doc)

    return doc
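
# Illustration only (not part of the original script): the portlet regexes in
# monobook_fix_html() collapse a whole chrome block at once. The non-greedy
# [\s\S]+? stops at the first closing </div>, so those patterns suit flat
# portlet markup; nested divs (e.g. class="noexport") go through remove_tag()
# instead, which balances nested start/end tags.
#
#     _s = '<div class="portlet" id="p-cactions"><ul><li>edit</li></ul></div>'
#     re.sub(r'<div class="portlet" id="p-cactions">[\s\S]+?</div>', r'', _s)
#     # -> ''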

def html_remove_translation_links(doc):
    """
    Remove translation links (the international flags).
    We identify them by the pattern for a 2 or 3 letter
    language code, /[a-z]{2,3}[/"] in the URL.
    The second version deals with links like /pt_PT and /zh_CN
    We are case sensitive, so as not to treat FAQ as a language code.
    """
    doc = re.sub(r'<a href="[^"]*/[a-z]{2,3}[/"][^>]*>[\s\S]*?</a>', r'', doc)
    doc = re.sub(r'<a href="[^"]*/[a-z]{2}_[A-Z]{2}[/"][^>]*>[\s\S]*?</a>', r'', doc)
    return doc
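
# Illustration only (not part of the original script): what the language-code
# patterns in html_remove_translation_links() are meant to catch.
#
#     '<a href="/man/de/">Deutsch</a>'        # /de/  matches the 2-3 letter form
#     '<a href="/man/pt_PT/">Português</a>'   # pt_PT matches the locale form
#     '<a href="/man/FAQ/">FAQ</a>'           # kept: uppercase, not a language code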

def monobook_hack_skin_html(doc):
    """
    Hacks Monobook HTML output: use CSS ids for hacked skin.

    See monobook_hack_skin_css.
    """
    doc = doc.replace('<div id="globalWrapper">', '<div id="globalWrapper-hacked">')
    doc = doc.replace('