GooglePodcastSubs/bs-google-podcasts-extracto...

#!/usr/bin/env python3

"""
Quick, dirty Google Podcasts list extractor.
Uses Beautiful Soup 4 on a saved version of your podcasts subscription page.
For example, a text-only copy of "https://podcasts.google.com/subscriptions" saved from your browser to disk.

By Tildebeast, 2023.
"""

from bs4 import BeautifulSoup
from collections import OrderedDict

# Your input and output filenames. Edit names/paths as required.
input_filename = "Google Podcasts - Subscriptions.html"
output_filename = "google-podcasts-names.txt"

# Flag to print the contents as the script runs. Otherwise output only goes to a file.
SHOW_CONTENTS = True

# I think these CSS selectors are constant but can't guarantee it.
class_search_list = ["eWeGpe", "yFWEIe"]


def extract_texts(html_data, class_names):
    """Return the stripped text of every <div> carrying one of *class_names*.

    *html_data* may be ``str`` or ``bytes``. Passing bytes lets Beautiful Soup
    work out the document encoding itself instead of relying on the platform
    default used by a text-mode ``open()``.
    Results are returned in document order.
    """
    soup = BeautifulSoup(html_data, 'html.parser')
    # A list value for "class" matches a div that has ANY of the listed classes.
    divs = soup.find_all("div", attrs={"class": class_names})
    return [div.get_text().strip() for div in divs]


def pair_entries(texts):
    """Pair up alternating strings and return them sorted by the first item.

    *texts* is read as ``[first, second, first, second, ...]``; the script
    treats the first of each pair as the podcast name and the second as the
    text that accompanies it (presumably the publisher -- the page markup is
    not documented, so this relies on the two divs strictly alternating).

    Returns a list of ``(first, second)`` tuples sorted case-insensitively by
    the first item. Exact duplicate pairs are collapsed, but entries that share
    a name with different second values are all kept (a plain dict keyed by
    name would silently drop all but the last). A trailing item without a
    partner is ignored.
    """
    texts = list(texts)
    pairs = zip(texts[0::2], texts[1::2])
    # dict.fromkeys de-duplicates while preserving first-seen order; sorted()
    # is stable, so entries with equal names keep their document order.
    return sorted(dict.fromkeys(pairs), key=lambda pair: pair[0].lower())


def format_entry(name, detail):
    """Format one output line: name, a tab and a space, then the detail."""
    return f"{name}\t {detail}"


def main():
    """Read the saved page, extract the name pairs, and write them out sorted."""
    # Binary mode: leave decoding to Beautiful Soup rather than the locale.
    with open(input_filename, 'rb') as f:
        html_data = f.read()

    entries = pair_entries(extract_texts(html_data, class_search_list))
    lines = [format_entry(name, detail) for name, detail in entries]

    if not lines:
        # Most likely the class names above no longer match the saved page.
        print(f"No entries found in {input_filename} for classes {class_search_list}.")

    # Write the whole file before echoing, so that a console which cannot
    # display some character does not leave the output file half-written.
    with open(output_filename, 'w', encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)

    if SHOW_CONTENTS:
        for line in lines:
            print(line)

    print(f"\nOutput written to file: {output_filename}\n")
    print("\nFinished.\n")


if __name__ == "__main__":
    main()