diff --git a/archive-fandom-wiki b/archive-fandom-wiki
new file mode 100755
index 0000000..79ff87f
--- /dev/null
+++ b/archive-fandom-wiki
@@ -0,0 +1,324 @@
+#!/usr/bin/env python
+
+# This program archives the content of fandom wikis.
+#
+# It's pretty much feature-complete. I still have to add detailed comments to
+# describe what each significant piece of code is doing.
+#
+# This program doesn't scrape from the fandom.com wiki sites directly; rather,
+# it uses my BreezeWiki instance to avoid downloading unnecessary ads, images,
+# and other junk.
+#
+# Each resulting archive is self-contained, meaning one can extract the
+# contents and browse the wiki snapshot locally (offline). The URLs for CSS,
+# images, and links in each page are replaced by the file:/// URLs for their
+# corresponding pages on the local filesystem.
+#
+# This file is formatted with `black -l 79' to comply with PEP 8 standards.
+
+import concurrent.futures
+import shutil
+import sys
+import tarfile
+from datetime import datetime
+from pathlib import Path
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+from rich.console import Console
+from rich.progress import Progress
+from rich.tree import Tree
+
+console = Console()
+
+
+class FandomWiki:
+    def __init__(self, name: str):
+        # Derive the canonical and BreezeWiki URLs from the wiki's name and
+        # create the local output directories if they don't already exist.
+        self.name = name
+        self.canonical_url = "https://{}.fandom.com".format(name)
+        self.breezewiki_url = "https://wiki.hyperreal.coffee/{}".format(name)
+        self.site_dir = Path.cwd().joinpath("{}.fandom.com".format(name))
+        self.images_dir = self.site_dir.joinpath("images")
+
+        if not self.site_dir.exists():
+            self.site_dir.mkdir()
+
+        if not self.images_dir.exists():
+            self.images_dir.mkdir()
+
+    def get_hop0_urls(self) -> list:
+        # Walk the wiki's Local Sitemap on the canonical fandom.com site,
+        # following each "Next page" link and collecting the URL of every
+        # sitemap page along the way.
+        starting_url = "{}/wiki/Local_Sitemap".format(self.canonical_url)
+        hop0_urls = list()
+
+        while True:
+            response = requests.get(starting_url)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, "html.parser")
+            mw_allpages_nav = soup.find_all(
+                "div", {"class": "mw-allpages-nav"}
+            )[0]
+
+            if (
+                len(mw_allpages_nav.find_all("a")) < 2
+                and "Next page"
+                not in mw_allpages_nav.find_all("a")[0].get_text()
+            ):
+                break
+            else:
+                if len(mw_allpages_nav.find_all("a")) < 2:
+                    starting_url = "{}{}".format(
+                        self.canonical_url,
+                        mw_allpages_nav.find_all("a")[0].get("href"),
+                    )
+                else:
+                    starting_url = "{}{}".format(
+                        self.canonical_url,
+                        mw_allpages_nav.find_all("a")[1].get("href"),
+                    )
+
+            hop0_urls.append(starting_url)
+            console.print("[[bold]HOP 0[/bold]] {}".format(starting_url))
+
+        return hop0_urls
+
+    def get_hop1_urls(self, hop0_urls: list) -> list:
+        # Collect the BreezeWiki URL of every article linked from the sitemap
+        # pages, skipping sitemap and Special: links.
+        hop1_urls = [self.breezewiki_url]
+
+        for url in hop0_urls:
+            response = requests.get(url)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            for item in soup.find_all("a"):
+                if item.get("href") and item.get("href").startswith("/wiki"):
+                    if "Local_Sitemap" not in item.get(
+                        "href"
+                    ) and "Special:" not in item.get("href"):
+                        new_url = "{}{}".format(
+                            self.breezewiki_url, item.get("href")
+                        )
+                        hop1_urls.append(new_url)
+                        console.print(
+                            "[[bold]HOP 1[/bold]] {}".format(new_url)
+                        )
+
+        return hop1_urls
+
+    def save_css(self):
+        # Download every stylesheet linked from the BreezeWiki front page and
+        # save each one as proxy<N>.css in the site directory, where <N> is
+        # the stylesheet's position in the page's list of <link> tags.
+        response = requests.get(self.breezewiki_url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
+        css_pages = list()
+
+        for css in soup.find_all("link"):
+            if css.attrs.get("href"):
+                css_url = urljoin(self.breezewiki_url, css.attrs.get("href"))
+                css_pages.append(css_url)
+
+        for page in css_pages:
+            response = requests.get(page)
+            response.raise_for_status()
+
+            css_filename = self.site_dir.joinpath(
+                "proxy{}.css".format(css_pages.index(page))
+            )
+            with open(css_filename, "wb") as outfile:
+                outfile.write(response.content)
+
+            console.print(
+                "[[bold green]CSS[/bold green]] {}".format(css_filename)
+            )
+
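+    # save_img downloads a single image unless a copy already exists on
+    # disk. Fandom image URLs carry a "/revision/..." suffix, so the local
+    # file name is taken from the part of the URL before that suffix, and
+    # the response body is streamed to disk in 8 KiB chunks.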
+    def save_img(self, img_url: str):
+        filename = self.images_dir.joinpath(
+            Path(img_url.split("/revision")[0]).name
+        )
+        if not filename.exists():
+            response = requests.get(img_url, stream=True)
+            response.raise_for_status()
+
+            with open(filename, "wb") as outfile:
+                for chunk in response.iter_content(chunk_size=8192):
+                    outfile.write(chunk)
+
+            console.print("[[bold green]IMG[/bold green]] {}".format(filename))
+        else:
+            console.print(
+                "[[bold yellow]IMG (EXISTS)[/bold yellow]] {}".format(filename)
+            )
+
+    def fetch_all_images(self, page_url: str):
+        # Collect the source URL of every image on the page, ignoring
+        # BreezeWiki's own assets and the wiki wordmark, and download each
+        # one into the images directory.
+        response = requests.get(page_url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        img_tags = soup.find_all("img")
+        img_urls = [img["src"] for img in img_tags if "src" in img.attrs]
+        clean_img_urls = [
+            x
+            for x in img_urls
+            if "breezewiki" not in x and "Wordmark" not in x
+        ]
+
+        for img_url in clean_img_urls:
+            self.save_img(img_url)
+
+    def save_page(self, url: str):
+        # Download a single article, replace its stylesheets with the local
+        # proxy<N>.css copies, download its images, strip the BreezeWiki top
+        # banner, and rewrite article links so they point at the saved .html
+        # files on the local filesystem.
+        filename = self.site_dir.joinpath("{}.html".format(url.split("/")[-1]))
+        if not filename.exists():
+            response = requests.get(url)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            stylesheet_count = 0
+            for link in soup.find_all("link", {"rel": "stylesheet"}):
+                stylesheet_count += 1
+                link.decompose()
+
+            for i in range(stylesheet_count):
+                if soup.head:
+                    soup.head.append(
+                        soup.new_tag(
+                            "link",
+                            rel="stylesheet",
+                            type="text/css",
+                            href="proxy{}.css".format(i),
+                        )
+                    )
+
+            self.fetch_all_images(url)
+
+            top_banner = soup.find("div", {"class": "bw-top-banner"})
+            if top_banner:
+                top_banner.extract()
+
+            for link in soup.find_all("a"):
+                if link.get("href") and link.get("href").startswith(
+                    "/{}/wiki".format(self.name)
+                ):
+                    link_basename = link.get("href").partition("/wiki/")[2]
+                    link["href"] = "{}/{}.html".format(
+                        self.site_dir, link_basename
+                    )
+
+            with open(filename, "w") as outfile:
+                outfile.write(soup.prettify())
+
+            console.print(
+                "[[bold green]HTML[/bold green]] {}".format(filename)
+            )
+        else:
+            console.print(
+                "[[bold yellow]HTML (EXISTS)[/bold yellow]] {}".format(
+                    filename
+                )
+            )
+
+    def fetch_all_pages(self, hop1_urls: list):
+        # Save the stylesheets once, then fetch every article concurrently.
+        self.save_css()
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            executor.map(self.save_page, hop1_urls)
+
+    def archive(self):
+        # Pack the scraped content into timestamped tarballs and remove the
+        # working directories once each archive has been written. The images
+        # go into an xz-compressed tarball of their own.
+        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+
+        img_files = [
+            f
+            for f in self.images_dir.iterdir()
+            if self.images_dir.joinpath(f).is_file()
+        ]
+
+        img_archive_filename = "{}-{}.tar.xz".format(
+            self.images_dir, timestamp
+        )
+
+        with Progress() as progress:
+            task = progress.add_task(
+                "[cyan]Archiving images...", total=len(img_files)
+            )
+
+            with tarfile.open(img_archive_filename, "w:xz") as tar:
+                for img_file in img_files:
+                    if progress.finished:
+                        break
+                    full_file_path = self.images_dir.joinpath(img_file)
+                    tar.add(full_file_path, arcname=img_file.name)
+                    progress.update(task, advance=1)
+
+            progress.stop()
+
+        shutil.rmtree(self.images_dir, ignore_errors=True)
+
+        web_files = [
+            f
+            for f in self.site_dir.iterdir()
+            if self.site_dir.joinpath(f).is_file()
+            or self.site_dir.joinpath(f).is_dir()
+        ]
+
+        web_archive_filename = "{}-{}.tar.gz".format(self.site_dir, timestamp)
+
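+        # The downloaded HTML and CSS files are packed into a separate
+        # gzip-compressed tarball, with a progress bar tracking how many
+        # files have been added.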
+        with Progress() as progress:
+            task = progress.add_task(
+                "[cyan]Archiving web files...", total=len(web_files)
+            )
+
+            with tarfile.open(web_archive_filename, "w:gz") as tar:
+                for web_file in web_files:
+                    if progress.finished:
+                        break
+                    full_file_path = self.site_dir.joinpath(web_file)
+                    tar.add(full_file_path, arcname=web_file.name)
+                    progress.update(task, advance=1)
+
+            progress.stop()
+
+        shutil.rmtree(self.site_dir, ignore_errors=True)
+
+        console.print("\nTotal web files scraped: {}".format(len(web_files)))
+        console.print("Total images scraped: {}".format(len(img_files)))
+
+
+def archive_site(name: str):
+    # Scrape the named wiki end to end: sitemap pages, articles, images, and
+    # stylesheets, then pack everything into tarballs.
+    site = FandomWiki(name)
+    site.fetch_all_pages(site.get_hop1_urls(site.get_hop0_urls()))
+    site.archive()
+
+
+def usage_message():
+    # Print usage information along with the list of supported wikis.
+    supported_wikis = [
+        "cyberpunk",
+        "dishonored",
+        "dragonage",
+        "forgottenrealms",
+        "masseffect",
+        "residentevil",
+    ]
+    wiki_tree = Tree("[green]Fandom Wikis")
+    for wiki in supported_wikis:
+        wiki_tree.add(wiki)
+
+    console.print("Usage:\n\tarchive-fandom-wiki [[italic]name[/italic]]\n")
+    console.print("Example:\n\tarchive-fandom-wiki dishonored\n")
+    console.print(wiki_tree)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        match sys.argv[1]:
+            case "cyberpunk":
+                archive_site("cyberpunk")
+            case "dishonored":
+                archive_site("dishonored")
+            case "dragonage":
+                archive_site("dragonage")
+            case "forgottenrealms":
+                archive_site("forgottenrealms")
+            case "masseffect":
+                archive_site("masseffect")
+            case "residentevil":
+                archive_site("residentevil")
+            case _:
+                usage_message()
+    else:
+        usage_message()