Get all URLs in fandom page

Jeffrey Serio 2023-09-08 07:33:56 -05:00
parent 39c7c68be7
commit 8c1e1e3afc
2 changed files with 40 additions and 10 deletions

archive-to-megasync (new executable file)

@@ -0,0 +1,15 @@
#!/usr/bin/env bash

SYNC_DIR="${HOME}/sync"

# Directories that may need re-archiving.
archive_maybe=(
    "${HOME}/sync/org"
    "${HOME}/sync/org-roam"
    "${HOME}/sync/sites"
)

# Re-archive a directory only if any file in it changed within the last 24 hours.
for dir in "${archive_maybe[@]}"; do
    if [ "$(find "$dir" -type f -mtime -1 | wc -l)" -gt 0 ]; then
        create-archive "$dir"
        mv -v "$dir-$(date '+%Y%m%d').tar.gz" "${SYNC_DIR}/archived/"
    fi
done
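The script leans on a create-archive helper that is not part of this commit. The mv line implies its contract: after create-archive "$dir", a $dir-YYYYMMDD.tar.gz tarball sits next to the directory. A minimal Python sketch of a stand-in honoring that assumed contract (hypothetical, not the author's helper):

#!/usr/bin/env python3
"""Hypothetical stand-in for create-archive (assumed contract, not the real helper)."""
import sys
import tarfile
from datetime import date
from pathlib import Path


def create_archive(directory: str) -> Path:
    src = Path(directory)
    # Produce <dir>-YYYYMMDD.tar.gz as a sibling of the directory,
    # the name the mv in archive-to-megasync expects.
    dest = src.parent / f"{src.name}-{date.today():%Y%m%d}.tar.gz"
    with tarfile.open(dest, "w:gz") as tar:
        tar.add(src, arcname=src.name)
    return dest


if __name__ == "__main__":
    print(create_archive(sys.argv[1]))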


@@ -50,9 +50,9 @@ import requests
 from bs4 import BeautifulSoup
 
 
-def get_urls(fandom: str) -> list():
+def get_hop0_urls(fandom: str) -> list():
     starting_url = "https://" + fandom + ".fandom.com/wiki/Local_Sitemap"
-    urls = [starting_url]
+    hop0_urls = [starting_url]
 
     while True:
         reqs = requests.get(starting_url)
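Most of the while True body is hidden between this hunk and the next; from the mw_allpages_nav.find_all("a")[1] line below, the loop evidently follows the sitemap's pagination links to collect every Local_Sitemap page. A self-contained sketch of that crawl, where the nav lookup and the stop condition are assumptions rather than the commit's exact code:

import requests
from bs4 import BeautifulSoup


def get_hop0_urls_sketch(fandom: str) -> list:
    """Collect every Local_Sitemap pagination page of a fandom wiki (assumed logic)."""
    base = f"https://{fandom}.fandom.com"
    starting_url = base + "/wiki/Local_Sitemap"
    pages = [starting_url]
    while True:
        soup = BeautifulSoup(requests.get(starting_url).text, "html.parser")
        nav = soup.find("div", class_="mw-allpages-nav")
        # MediaWiki labels the pagination anchors "Previous page (...)" and
        # "Next page (...)"; stop when no "Next page" link exists (assumption).
        next_link = None
        if nav:
            next_link = next(
                (a for a in nav.find_all("a") if a.text.startswith("Next")), None
            )
        if next_link is None:
            break
        starting_url = base + next_link.get("href")
        pages.append(starting_url)
    return pages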
@@ -80,9 +80,24 @@ def get_urls(fandom: str) -> list():
                 + mw_allpages_nav.find_all("a")[1].get("href")
             )
-            urls.append(starting_url)
+            hop0_urls.append(starting_url)
-    return urls
+    return hop0_urls
+
+
+def get_hop1_urls(hop0_urls: list) -> list():
+    hop1_urls = list()
+    for url in hop0_urls:
+        reqs = requests.get(url)
+        soup = BeautifulSoup(reqs.text, "html.parser")
+        fandom = url.split(sep="/wiki")[0]
+        for item in soup.find_all("a"):
+            if item.get("href") and item.get("href").startswith("/wiki"):
+                hop1_urls.append(fandom + item.get("href"))
+    return hop1_urls
+
+
 def help_message():
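One note on the new function: every wiki page carries site-wide navigation links under /wiki/, so hop1_urls will almost certainly hold repeats across the hop-0 pages. If the downstream consumer does not deduplicate, an order-preserving pass is cheap (a sketch, not part of the commit):

# dict keys preserve insertion order (Python 3.7+), so this
# deduplicates hop1_urls without reshuffling them.
unique_urls = list(dict.fromkeys(hop1_urls))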
@@ -98,17 +113,17 @@ if __name__ == "__main__":
     if len(sys.argv) > 1:
         match sys.argv[1]:
             case "cyberpunk":
-                urls = get_urls("cyberpunk")
+                urls = get_hop1_urls(get_hop0_urls("cyberpunk"))
             case "dishonored":
-                urls = get_urls("dishonored")
+                urls = get_hop1_urls(get_hop0_urls("dishonored"))
             case "dragonage":
-                urls = get_urls("dragonage")
+                urls = get_hop1_urls(get_hop0_urls("dragonage"))
             case "forgottenrealms":
-                urls = get_urls("forgottenrealms")
+                urls = get_hop1_urls(get_hop0_urls("forgottenrealms"))
             case "masseffect":
-                urls = get_urls("masseffect")
+                urls = get_hop1_urls(get_hop0_urls("masseffect"))
             case "residentevil":
-                urls = get_urls("residentevil")
+                urls = get_hop1_urls(get_hop0_urls("residentevil"))
             case _:
                 help_message()
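Since every case arm applies the same get_hop1_urls(get_hop0_urls(...)) chain, the dispatch could be collapsed to a membership test over the supported wikis; a sketch of that equivalent (names copied from the arms above, not part of the commit):

# Equivalent, non-repetitive dispatch over the wikis the script supports.
SUPPORTED_FANDOMS = {
    "cyberpunk", "dishonored", "dragonage",
    "forgottenrealms", "masseffect", "residentevil",
}

if len(sys.argv) > 1 and sys.argv[1] in SUPPORTED_FANDOMS:
    urls = get_hop1_urls(get_hop0_urls(sys.argv[1]))
else:
    help_message()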