Upload bs-google-podcasts-extractor.py

2023-09-26 22:50:20 +00:00 · 2023-09-26 22:50:20 +00:00 · fcb7328371
parent 7c43ba296d
commit fcb7328371
1 changed files with 48 additions and 0 deletions
--- a/bs-google-podcasts-extractor.py
+++ b/bs-google-podcasts-extractor.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+
+"""
+Quick, dirty Google Podcasts list extractor.
+Uses Beautiful Soup 4 on a saved version of your podcasts subscription page.
+For example, a text-only copy of "https://podcasts.google.com/subscriptions" saved from your browser to disk.
+
+By Tildebeast, 2023.
+"""
+
+from bs4 import BeautifulSoup
+from collections import OrderedDict
+
+# Input and output filenames. Edit the names/paths as required.
+input_filename = "Google Podcasts - Subscriptions.html"
+output_filename = "google-podcasts-names.txt"
+
+# When True, each entry is also printed as the script runs; otherwise output goes only to the file.
+SHOW_CONTENTS = True 
+
+# CSS class names of the <div> elements to extract. They appear to be constant, but that is not guaranteed.
+class_search_list = ["eWeGpe","yFWEIe"]
+
+# Read explicitly as UTF-8 so non-ASCII names decode the same way on every platform.
+with open(input_filename, 'r', encoding='utf-8') as f:
+    html_data = f.read()
+
+soup = BeautifulSoup(html_data, 'html.parser')
+# Every <div> whose class attribute matches one of the configured class names.
+data = soup.find_all("div", attrs={"class": class_search_list})
+
+# Matches are assumed to alternate between two fields; zip() drops a trailing unpaired item.
+text_only = [x.get_text().strip() for x in data]
+pairs = dict(zip(text_only[0::2], text_only[1::2]))
+
+# One "first<TAB> second" line per pair, sorted case-insensitively by the first field.
+with open(output_filename, 'w', encoding='utf-8') as f:
+    for k in sorted(pairs, key=str.lower):
+        line = f"{k}\t {pairs[k]}"
+        if SHOW_CONTENTS:
+            print(line)
+        f.write(line + "\n")
+
+print(f"\nOutput written to file: {output_filename}\n")
+print("\nFinished.\n")
+        
+    
+