diff --git a/.gitignore b/.gitignore
index 0694fa3..ad12bf9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 /html
 /json
+/subscriptions.*
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7606f42
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+JAQ?=jaq
+
+subscriptions.txt: subscriptions.json
+	$(JAQ) --raw-output '.[]' subscriptions.json > $@
+
+subscriptions.curl: subscriptions.json
+	$(JAQ) --raw-output '.[] | (split("/") | last) as $$name | . | "url \(.)\noutput \($$name).html"' subscriptions.json > $@
+
+fetch: subscriptions.curl
+	curl --location --output-dir html --create-dirs --rate 1/s --config subscriptions.curl
+
+channel-json: subscriptions.txt
+	# nproc is not portable :-/
+	xargs -n1 --max-procs=$$(nproc) --arg-file subscriptions.txt --verbose ./generate-json-opml
+
+# turn all the channel json files into an OPML file
+subscriptions.opml:
+	./generate-opml > $@
+
+.PHONY: channel-json subscriptions.opml
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..99c4986
--- /dev/null
+++ b/README.md
@@ -0,0 +1,39 @@
+# YouTube Subscriptions to OPML
+
+This repo contains a small collection of scripts that I used to turn my YouTube subscriptions into an OPML file for import into [Feedbin].
+
+## Dependencies
+
+The scripts have only been run on a Linux system using GNU coreutils. They will
+probably need some tweaking to run on other UNIX-like systems.
+
+- [Scraper](https://lib.rs/crates/scraper)
+- [jaq](https://github.com/01mf02/jaq)
+- curl
+- Python
+- awk
+- GNU make (I haven't tested non-GNU make)
+
+## Usage
+
+1. Visit your [subscriptions page](https://www.youtube.com/feed/channels)
+2. Repeatedly scroll to the end of the page to make them all load
+3. Run the following in the JavaScript console to copy the list of subscriptions to your clipboard as a JSON array:
+
+   ```javascript
+   copy(JSON.stringify(Array.from(new Set(Array.prototype.map.call(document.querySelectorAll('a.channel-link'), (link) => link.href))).filter((x) => !x.includes('/channel/')), null, 2))
+   ```
+
+   **Note:** I only tested the above on Firefox.
+
+   Also why do this instead of processing the subscriptions.csv from Google Takeout?
+
+   1. Takeout generates multiple gigabytes of archives I have to download to get the CSV file.
+   2. It's slow to generate. This process can be done whenever you want.
+
+4. Paste the list of subscriptions into `subscriptions.json`.
+5. Run `make fetch` to fetch the channel pages of all the subscriptions. This only needs to be run once.
+6. Run `make channel-json` to extract info from each channel page.
+7. Run `make subscriptions.opml` to generate the OPML file.
+
+[Feedbin]: https://feedbin.com/
diff --git a/generate-json-opml b/generate-json-opml
new file mode 100755
index 0000000..fa8893c
--- /dev/null
+++ b/generate-json-opml
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+set -eu
+
+URL="$1"
+NAME=$(echo "$URL" | awk -F / '{ print $NF }')
+HTML="html/${NAME}.html"
+CHANNEL_ID=$(scraper -a content 'meta[property="og:url"]' < "$HTML" | awk -F / '{ print $NF }')
+TITLE=$(scraper -a content 'meta[property="og:title"]' < "$HTML")
+XML_URL="https://www.youtube.com/feeds/videos.xml?channel_id=${CHANNEL_ID}"
+
+json_escape() {
+	echo "$1" | jaq --raw-input .
+}
+
+JSON_TITLE=$(json_escape "$TITLE")
+JSON_XML_URL=$(json_escape "$XML_URL")
+JSON_URL=$(json_escape "$URL")
+
+printf '{"title": %s, "xmlUrl": %s, "htmlUrl": %s}\n' "$JSON_TITLE" "$JSON_XML_URL" "$JSON_URL" > json/"$NAME".json
diff --git a/generate-opml b/generate-opml
new file mode 100755
index 0000000..4cff1bf
--- /dev/null
+++ b/generate-opml
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+import email.utils
+import glob
+import json
+import xml.etree.ElementTree as ET
+
+# This is what we're aiming to generate:
+#
+# <?xml version="1.0" encoding="UTF-8"?>
+# <opml version="1.0">
+#   <head>
+#     <title>RSS subscriptions for wes@wezm.net</title>
+#     <dateCreated>Sun, 05 May 2024 02:54:31 +0000</dateCreated>
+#     <ownerEmail>wes@wezm.net</ownerEmail>
+#   </head>
+#   <body>
+#     <outline text="YouTube" title="YouTube">
+#       <outline type="rss" text="..." title="..." xmlUrl="..." htmlUrl="..."/>
+#     </outline>
+#   </body>
+# </opml>
+
+opml = ET.Element("opml")
+
+head = ET.SubElement(opml, "head")
+title = ET.SubElement(head, "title")
+title.text = "YouTube Subscription"
+dateCreated = ET.SubElement(head, "dateCreated")
+dateCreated.text = email.utils.formatdate(timeval=None, localtime=True)
+
+body = ET.SubElement(opml, "body")
+youtube = ET.SubElement(body, "outline", {"title": "YouTube", "text": "YouTube"})
+
+for path in glob.glob("json/*.json"):
+    with open(path) as f:
+        info = json.load(f)
+    ET.SubElement(youtube, "outline", info, type="rss", text=info["title"])
+
+ET.indent(opml)
+print(ET.tostring(opml, encoding="unicode", xml_declaration=True))