Add YouTube subscription to OPML tooling
parent b10a7442c9
commit 1fb35e8048
5 changed files with 121 additions and 0 deletions
.gitignore (vendored) +1

@@ -1,2 +1,3 @@
 /html
 /json
+/subscriptions.*
Makefile (new file) +20

@@ -0,0 +1,20 @@
+JAQ?=jaq
+
+subscriptions.txt: subscriptions.json
+	$(JAQ) --raw-output '.[]' subscriptions.json > $@
+
+subscriptions.curl: subscriptions.json
+	$(JAQ) --raw-output '.[] | (split("/") | last) as $$name | . | "url \(.)\noutput \($$name).html"' subscriptions.json > $@
+
+fetch: subscriptions.curl
+	curl --location --output-dir html --create-dirs --rate 1/s --config subscriptions.curl
+
+channel-json: subscriptions.txt
+	# nproc is not portable :-/
+	xargs -n1 --max-procs=$$(nproc) --arg-file subscriptions.txt --verbose ./generate-json-opml
+
+# turn all the channel json files into an OPML file
+subscriptions.opml:
+	./generate-opml > $@
+
+.PHONY: channel-json subscriptions.opml
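The `subscriptions.curl` rule above uses a jaq filter to turn each channel URL into a pair of curl config lines (`url …` / `output <name>.html`), where the name is the last path segment. A minimal Python sketch of that same transformation, using a made-up channel URL:

```python
# Sketch of what the jaq filter in the subscriptions.curl rule emits:
# for each channel URL, a "url ..." line and an "output <name>.html" line.
# The example URL below is hypothetical.

def curl_config_lines(urls):
    lines = []
    for url in urls:
        name = url.split("/")[-1]  # jaq: (split("/") | last) as $name
        lines.append(f"url {url}")
        lines.append(f"output {name}.html")
    return "\n".join(lines)

print(curl_config_lines(["https://www.youtube.com/@ExampleChannel"]))
```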
README.md (new file) +39

@@ -0,0 +1,39 @@
+# YouTube Subscriptions to OPML
+
+This repo contains a small collection of scripts that I used to turn my YouTube subscriptions into an OPML file for import into [Feedbin].
+
+## Dependencies
+
+The scripts have only been run on a Linux system using GNU coreutils. They will
+probably need some tweaking to run on other UNIX-like systems.
+
+- [Scraper](https://lib.rs/crates/scraper)
+- [jaq](https://github.com/01mf02/jaq)
+- curl
+- Python
+- awk
+- GNU make (I haven't tested non-GNU make)
+
+## Usage
+
+1. Visit your [subscriptions page](https://www.youtube.com/feed/channels)
+2. Repeatedly scroll to the end of the page to make them all load
+3. Run the following in the JavaScript console to copy the list of subscriptions to your clipboard as a JSON array:
+
+```javascript
+copy(JSON.stringify(Array.from(new Set(Array.prototype.map.call(document.querySelectorAll('a.channel-link'), (link) => link.href))).filter((x) => !x.includes('/channel/')), null, 2))
+```
+
+**Note:** I only tested the above on Firefox.
+
+Why do this instead of processing the subscriptions.csv from Google Takeout?
+
+1. Takeout generates multiple gigabytes of archives I have to download to get the CSV file.
+2. It's slow to generate. This process can be done whenever you want.
+
+4. Paste the list of subscriptions into `subscriptions.json`.
+5. Run `make fetch` to fetch the channel pages of all the subscriptions. This only needs to be run once.
+6. Run `make channel-json` to extract info from each channel page.
+7. Run `make subscriptions.opml` to generate the OPML file.
+
+[Feedbin]: https://feedbin.com/
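The README's JavaScript one-liner dedupes the scraped channel links and drops bare `/channel/` URLs before copying them. The same clean-up step, sketched in Python with a hypothetical input list:

```python
# Mirror of the README's JS console snippet: unique hrefs, excluding
# /channel/ URLs. The input links below are made up for illustration.
def clean_subscriptions(hrefs):
    # dict.fromkeys dedupes while preserving insertion order,
    # matching Array.from(new Set(...)) in the JS version.
    unique = list(dict.fromkeys(hrefs))
    return [h for h in unique if "/channel/" not in h]

links = [
    "https://www.youtube.com/@a",
    "https://www.youtube.com/@a",               # duplicate, removed
    "https://www.youtube.com/channel/UCxxxx",   # filtered out
    "https://www.youtube.com/@b",
]
print(clean_subscriptions(links))
```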
generate-json-opml (new executable file) +20

@@ -0,0 +1,20 @@
+#!/bin/sh
+
+set -eu
+
+URL="$1"
+NAME=$(echo "$URL" | awk -F / '{ print $NF }')
+HTML="html/${NAME}.html"
+CHANNEL_ID=$(scraper -a content 'meta[property="og:url"]' < "$HTML" | awk -F / '{ print $NF }')
+TITLE=$(scraper -a content 'meta[property="og:title"]' < "$HTML")
+XML_URL="https://www.youtube.com/feeds/videos.xml?channel_id=${CHANNEL_ID}"
+
+json_escape() {
+	echo "$1" | jaq --raw-input .
+}
+
+JSON_TITLE=$(json_escape "$TITLE")
+JSON_XML_URL=$(json_escape "$XML_URL")
+JSON_URL=$(json_escape "$URL")
+
+printf '{"title": %s, "xmlUrl": %s, "htmlUrl": %s}\n' "$JSON_TITLE" "$JSON_XML_URL" "$JSON_URL" > json/"$NAME".json
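`generate-json-opml` leans on `jaq --raw-input .` to JSON-escape each shell variable before assembling the record with `printf`. In Python the equivalent escaping is `json.dumps`; a sketch with a hypothetical title and URLs:

```python
import json

def json_escape(s):
    # Equivalent of the script's: echo "$s" | jaq --raw-input .
    # (produces a quoted, escaped JSON string)
    return json.dumps(s)

# Hypothetical values standing in for the scraped TITLE/XML_URL/URL.
title = 'A "quoted" title'
record = '{"title": %s, "xmlUrl": %s, "htmlUrl": %s}' % (
    json_escape(title),
    json_escape("https://www.youtube.com/feeds/videos.xml?channel_id=UCexample"),
    json_escape("https://www.youtube.com/@example"),
)
print(record)
```

Escaping each value this way keeps quotes and backslashes in channel titles from corrupting the per-channel JSON file.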
generate-opml (new executable file) +41

@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+import email.utils
+import glob
+import json
+import xml.etree.ElementTree as ET
+
+# This is what we're aiming to generate:
+#
+# <?xml version="1.0" encoding="UTF-8"?>
+# <opml version="1.0">
+#   <head>
+#     <title>RSS subscriptions for wes@wezm.net</title>
+#     <dateCreated>Sun, 05 May 2024 02:54:31 +0000</dateCreated>
+#     <ownerEmail>wes@wezm.net</ownerEmail>
+#   </head>
+#   <body>
+#     <outline text="3D Printing" title="3D Printing">
+#       <outline text="CadHub Blog" title="CadHub Blog" type="rss" xmlUrl="https://learn.cadhub.xyz/blog/rss.xml" htmlUrl="https://learn.cadhub.xyz/blog"/>
+#     </outline>
+#   </body>
+# </opml>
+
+opml = ET.Element("opml", {"version": "1.0"})
+
+head = ET.SubElement(opml, "head")
+title = ET.SubElement(head, "title")
+title.text = "YouTube Subscription"
+dateCreated = ET.SubElement(head, "dateCreated")
+dateCreated.text = email.utils.formatdate(timeval=None, localtime=True)
+
+body = ET.SubElement(opml, "body")
+youtube = ET.SubElement(body, "outline", {"title": "YouTube", "text": "YouTube"})
+
+for path in glob.glob("json/*.json"):
+    with open(path) as f:
+        info = json.load(f)
+    ET.SubElement(youtube, "outline", info, type="rss", text=info["title"])
+
+ET.indent(opml)
+print(ET.tostring(opml, encoding="unicode", xml_declaration=True))
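The OPML that `generate-opml` prints can be sanity-checked by parsing it back with the same `ElementTree` module and pulling out the feed URLs. A sketch using an inline sample shaped like the script's output (the channel and URLs are made up, not the script's actual output):

```python
import xml.etree.ElementTree as ET

# Hypothetical OPML snippet with the same shape as generate-opml's output.
sample = """<opml version="1.0">
  <head><title>YouTube Subscription</title></head>
  <body>
    <outline title="YouTube" text="YouTube">
      <outline title="Example" text="Example" type="rss"
               xmlUrl="https://www.youtube.com/feeds/videos.xml?channel_id=UCexample"
               htmlUrl="https://www.youtube.com/@example"/>
    </outline>
  </body>
</opml>"""

root = ET.fromstring(sample)
# Only leaf outlines carry type="rss"; the grouping outline does not.
feeds = [o.get("xmlUrl") for o in root.iter("outline") if o.get("type") == "rss"]
print(feeds)
```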