Add YouTube subscription to OPML tooling

Wesley Moore 2024-05-06 10:33:43 +10:00
parent b10a7442c9
commit 1fb35e8048
5 changed files with 121 additions and 0 deletions

.gitignore vendored

@@ -1,2 +1,3 @@
 /html
 /json
+/subscriptions.*

Makefile Normal file

@@ -0,0 +1,20 @@
JAQ?=jaq

subscriptions.txt: subscriptions.json
	$(JAQ) --raw-output '.[]' subscriptions.json > $@

subscriptions.curl: subscriptions.json
	$(JAQ) --raw-output '.[] | (split("/") | last) as $$name | . | "url \(.)\noutput \($$name).html"' subscriptions.json > $@

fetch: subscriptions.curl
	curl --location --output-dir html --create-dirs --rate 1/s --config subscriptions.curl

channel-json: subscriptions.txt
	# nproc is not portable :-/
	xargs -n1 --max-procs=$$(nproc) --arg-file subscriptions.txt --verbose ./generate-json-opml

# turn all the channel json files into an OPML file
subscriptions.opml:
	./generate-opml > $@

.PHONY: channel-json subscriptions.opml
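The jaq expression in the `subscriptions.curl` rule maps each channel URL to a curl config stanza, using the last path segment as the output file name. A sketch of that transformation in Python, with a hypothetical sample URL:

```python
def curl_config_lines(url: str) -> str:
    # Mirror the jaq expression: take the last path segment as the
    # output name and emit a "url"/"output" pair for curl --config.
    name = url.split("/")[-1]
    return f"url {url}\noutput {name}.html"

print(curl_config_lines("https://www.youtube.com/@example"))
# prints:
# url https://www.youtube.com/@example
# output @example.html
```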

README.md Normal file

@@ -0,0 +1,39 @@
# YouTube Subscriptions to OPML

This repo contains a small collection of scripts that I used to turn my YouTube subscriptions into an OPML file for import into [Feedbin].

## Dependencies

The scripts have only been run on a Linux system using GNU coreutils. They will
probably need some tweaking to run on other UNIX-like systems.

- [Scraper](https://lib.rs/crates/scraper)
- [jaq](https://github.com/01mf02/jaq)
- curl
- Python
- awk
- GNU make (I haven't tested non-GNU make)

## Usage

1. Visit your [subscriptions page](https://www.youtube.com/feed/channels).
2. Repeatedly scroll to the end of the page to make all your subscriptions load.
3. Run the following in the JavaScript console to copy the list of subscriptions to your clipboard as a JSON array:

   ```javascript
   copy(JSON.stringify(Array.from(new Set(Array.prototype.map.call(document.querySelectorAll('a.channel-link'), (link) => link.href))).filter((x) => !x.includes('/channel/')), null, 2))
   ```

   **Note:** I have only tested the above in Firefox.

   Why do this instead of processing the `subscriptions.csv` from Google Takeout?

   1. Takeout generates multiple gigabytes of archives that I have to download just to get the CSV file.
   2. Takeout exports are slow to generate; this process can be done whenever you want.

4. Paste the list of subscriptions into `subscriptions.json`.
5. Run `make fetch` to fetch the channel pages of all the subscriptions. This only needs to be run once.
6. Run `make channel-json` to extract info from each channel page.
7. Run `make subscriptions.opml` to generate the OPML file.

[Feedbin]: https://feedbin.com/
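The dedupe-and-filter step that the console snippet performs (a `Set` to drop duplicates, then discarding `/channel/` URLs) can be sketched in Python; the hrefs below are hypothetical:

```python
def channel_handles(hrefs):
    # Mirror the console snippet: dedupe while preserving order
    # (dict.fromkeys, like the JS Set), then drop /channel/ URLs so
    # only handle-style links (e.g. /@name) remain.
    return [h for h in dict.fromkeys(hrefs) if "/channel/" not in h]

hrefs = [
    "https://www.youtube.com/@example",
    "https://www.youtube.com/@example",  # duplicate link on the page
    "https://www.youtube.com/channel/UCxxxx",
]
print(channel_handles(hrefs))  # → ['https://www.youtube.com/@example']
```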

generate-json-opml Executable file

@@ -0,0 +1,20 @@
#!/bin/sh

set -eu

URL="$1"
NAME=$(echo "$URL" | awk -F / '{ print $NF }')
HTML="html/${NAME}.html"

CHANNEL_ID=$(scraper -a content 'meta[property="og:url"]' < "$HTML" | awk -F / '{ print $NF }')
TITLE=$(scraper -a content 'meta[property="og:title"]' < "$HTML")
XML_URL="https://www.youtube.com/feeds/videos.xml?channel_id=${CHANNEL_ID}"

json_escape() {
	echo "$1" | jaq --raw-input .
}

JSON_TITLE=$(json_escape "$TITLE")
JSON_XML_URL=$(json_escape "$XML_URL")
JSON_URL=$(json_escape "$URL")
printf '{"title": %s, "xmlUrl": %s, "htmlUrl": %s}\n' "$JSON_TITLE" "$JSON_XML_URL" "$JSON_URL" > json/"$NAME".json
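The `json_escape` helper shells out to jaq to turn a raw string into a quoted, escaped JSON string literal. In Python the same escaping is a one-liner with `json.dumps`, shown here as a sketch:

```python
import json

def json_escape(s: str) -> str:
    # Equivalent of piping a raw string through `jaq --raw-input .`:
    # returns the string as a quoted, escaped JSON string literal.
    return json.dumps(s)

print(json_escape('Title with "quotes"'))  # → "Title with \"quotes\""
```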

generate-opml Executable file

@@ -0,0 +1,41 @@
#!/usr/bin/env python

import email.utils
import glob
import json
import xml.etree.ElementTree as ET

# This is what we're aiming to generate:
#
# <?xml version="1.0" encoding="UTF-8"?>
# <opml version="1.0">
#   <head>
#     <title>RSS subscriptions for wes@wezm.net</title>
#     <dateCreated>Sun, 05 May 2024 02:54:31 +0000</dateCreated>
#     <ownerEmail>wes@wezm.net</ownerEmail>
#   </head>
#   <body>
#     <outline text="3D Printing" title="3D Printing">
#       <outline text="CadHub Blog" title="CadHub Blog" type="rss" xmlUrl="https://learn.cadhub.xyz/blog/rss.xml" htmlUrl="https://learn.cadhub.xyz/blog"/>
#     </outline>
#   </body>
# </opml>

opml = ET.Element("opml")
head = ET.SubElement(opml, "head")
title = ET.SubElement(head, "title")
title.text = "YouTube Subscription"

dateCreated = ET.SubElement(head, "dateCreated")
dateCreated.text = email.utils.formatdate(timeval=None, localtime=True)

body = ET.SubElement(opml, "body")
youtube = ET.SubElement(body, "outline", {"title": "YouTube", "text": "YouTube"})

for path in glob.glob("json/*.json"):
    with open(path) as f:
        info = json.load(f)
    ET.SubElement(youtube, "outline", info, type="rss", text=info["title"])

ET.indent(opml)
print(ET.tostring(opml, encoding="unicode", xml_declaration=True))
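One way to sanity-check the generated document is to parse it back and look for the feed outlines nested under the YouTube outline. A sketch using a minimal inline sample in the same shape generate-opml produces (all values are placeholders):

```python
import xml.etree.ElementTree as ET

# Minimal stand-in for the generated OPML (single placeholder feed).
sample = """<opml><head><title>YouTube Subscription</title></head>
<body><outline title="YouTube" text="YouTube">
<outline title="Example" text="Example" type="rss"
  xmlUrl="https://www.youtube.com/feeds/videos.xml?channel_id=UC123"
  htmlUrl="https://www.youtube.com/@example"/>
</outline></body></opml>"""

root = ET.fromstring(sample)
# Feeds are the second-level outlines carrying type="rss".
feeds = root.findall("./body/outline/outline[@type='rss']")
print(len(feeds))  # → 1
```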