Upload bs-google-podcasts-extractor.py

This commit is contained in:
Tildebeast 2023-09-26 22:50:20 +00:00
parent 7c43ba296d
commit fcb7328371
1 changed files with 48 additions and 0 deletions

View File

@ -0,0 +1,48 @@
#!/usr/bin/env python3
"""
Quick, dirty Google Podcasts list extractor.
Uses Beautiful Soup 4 on a saved version of your podcasts subscription page.
For example, a text-only copy of "https://podcasts.google.com/subscriptions" saved from your browser to disk.
By Tildebeast, 2023.
"""
from bs4 import BeautifulSoup
from collections import OrderedDict
# Your input and output filenames. Edit names/paths as required.
input_filename = "Google Podcasts - Subscriptions.html"
output_filename = "google-podcasts-names.txt"
# Flag to print the contents as the script runs. Otherwise output only goes to a file.
SHOW_CONTENTS = True
# CSS class names of the <div> elements to extract.
# These appear to be constant, but that is not guaranteed.
class_search_list = ["eWeGpe", "yFWEIe"]


def extract_texts(html_data, class_names):
    """Return the stripped text of each matching <div>, in document order.

    A <div> matches when it carries any of the CSS classes in *class_names*.
    """
    soup = BeautifulSoup(html_data, 'html.parser')
    matches = soup.find_all("div", attrs={"class": class_names})
    return [div.get_text().strip() for div in matches]


def pair_texts(texts):
    """Pair consecutive entries: texts[0] -> texts[1], texts[2] -> texts[3], ...

    Returns a dict keyed by the even-indexed entries. A trailing entry with
    no partner is dropped, and a repeated key keeps the last value seen.
    """
    # NOTE(review): presumably each pair is (podcast title, publisher), one
    # per search class -- confirm against a real saved page.
    return dict(zip(texts[0::2], texts[1::2]))


def format_lines(pairs):
    """Return one "key<TAB> value" line per pair, sorted by key ignoring case."""
    return [f"{key}\t {pairs[key]}" for key in sorted(pairs, key=str.lower)]


def main():
    """Read the saved page, extract the pairs and write them to the output file."""
    # Explicit UTF-8 so non-ASCII names do not depend on the platform's
    # default encoding (e.g. cp1252 on Windows). Assumes the saved page is
    # UTF-8 -- adjust here if your browser saved it differently.
    with open(input_filename, 'r', encoding='utf-8') as f:
        html_data = f.read()

    texts = extract_texts(html_data, class_search_list)
    lines = format_lines(pair_texts(texts))
    if not lines:
        # The class names above are not guaranteed to stay the same, so make
        # an empty result visible instead of silently writing an empty file.
        print("Warning: no matching entries found; "
              "check class_search_list against the saved page.")

    with open(output_filename, 'w', encoding='utf-8') as f:
        for line in lines:
            if SHOW_CONTENTS:
                print(line)
            f.write(line + "\n")
    print(f"\nOutput written to file: {output_filename}\n")
    print("\nFinished.\n")


if __name__ == "__main__":
    main()