File: podfeed.py
   1 #!/usr/bin/python3
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright © 2024 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the “Software”), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 # podfeed [options...] [filepaths/URIs...]
  27 #
  28 # PODcast FEED fetches all episodes from the feeds given as URIs, either as
  29 # arguments, or as lines in the plain-text files given.
  30 #
  31 # The result is self-contained HTML which links to all episodes, and adds
  32 # many little extras, such as tooltips showing date of publication and play
  33 # duration.
  34 #
  35 # Podcast thumbnails are also included inline as data-URIs. Doing so can make
  36 # output size considerably bigger: podcast logos already tend to come as big
  37 # pictures, and their base-64 encoding further adds to their size, but using
  38 # external URIs for them would no longer make the output self-contained.
  39 #
  40 # All (optional) leading options start with either single or double-dash,
  41 # and most of them change the style/color used. Some of the options are,
  42 # shown in their single-dash form:
  43 #
  44 #     -h          show this help message
  45 #     -help       show this help message
  46 #
  47 #     -title      use the next argument as the title in the HTML output
  48 
  49 
  50 from datetime import datetime
  51 from html import escape
  52 from multiprocessing import Pool
  53 from sys import argv, exit, stderr, stdin, stdout
  54 from typing import Dict, List
  55 from urllib.parse import urlparse, urlunparse
  56 from urllib.request import urlopen
  57 from xml.dom.minidom import parse
  58 
  59 
# info is the help message shown (on stderr) when the sole cmd-line argument
# is one of the standard help options; its text mirrors the comment at the
# top of this script, and it's stripped of leading/trailing whitespace
info = '''
podfeed [options...] [filepaths/URIs...]

PODcast FEED fetches all episodes from the feeds given as URIs, either as
arguments, or as lines in the plain-text files given.

The result is self-contained HTML which links to all episodes, and adds
many little extras, such as tooltips showing date of publication and play
duration.

Podcast thumbnails are also included inline as data-URIs. Doing so can make
output size considerably bigger: podcast logos already tend to come as big
pictures, and their base-64 encoding further adds to their size, but using
external URIs for them would no longer make the output self-contained.

All (optional) leading options start with either single or double-dash,
and most of them change the style/color used. Some of the options are,
shown in their single-dash form:

    -h          show this help message
    -help       show this help message

    -title      use the next argument as the title in the HTML output
'''.strip()
  86 
  87 # a leading help-option arg means show the help message and quit
  88 if len(argv) == 2 and argv[1].lower() in ('-h', '--h', '-help', '--help'):
  89     print(info, file=stderr)
  90     exit(0)
  91 
  92 
  93 def fail(msg, code: int = 1) -> None:
  94     '''Show the error message given, and quit the app right away.'''
  95     print(f'\x1b[31m{msg}\x1b[0m', file=stderr)
  96     exit(code)
  97 
  98 
  99 # handle leading cmd-line options
 100 title = ''
 101 start_args = 1
 102 while start_args < len(argv) and argv[start_args].startswith('-'):
 103     l = argv[start_args].lstrip('-').lower()
 104     if l in ('title'):
 105         if start_args + 1 >= len(argv):
 106             fail('missing actual title in cmd-line arguments', 1)
 107         title = argv[start_args + 1]
 108         start_args += 2
 109         continue
 110     break
 111 args = argv[start_args:]
 112 
 113 # use a default web-page title if one wasn't given
 114 if title == '':
 115     now = datetime.now()
 116     ymd = f'{now.year}-{now.month:02}-{now.day:02}'
 117     hms = f'{now.hour}:{now.minute:02}:{now.second:02}'
 118     title = f'Latest Podcast Episodes as of {ymd} {hms}'
 119 
 120 
 121 def parse_feed(uri: str) -> Dict:
 122     '''
 123     Turn an XML feed into more convenient-to-use dictionaries,
 124     given the feed's URI.
 125     '''
 126 
 127     res = {'rss': []}
 128     with urlopen(uri) as inp:
 129         feed = parse(inp)
 130         for rss in feed.getElementsByTagName('rss'):
 131             channels = rss.getElementsByTagName('channel')
 132             channels = [parse_channel(chan) for chan in channels]
 133             res['rss'].append({'channels': channels})
 134     return res
 135 
 136 
 137 def parse_channel(chan) -> Dict:
 138     '''Help func parse_feed do its job'''
 139 
 140     title = get_str(chan, 'title')
 141     link = get_str(chan, 'link')
 142     descr = get_str(chan, 'description')
 143     # no channel thumbnail for now
 144 
 145     episodes = chan.getElementsByTagName('item')
 146     episodes = [parse_episode(ep) for ep in episodes]
 147 
 148     return {
 149         'title': title,
 150         'link': link,
 151         'description': descr,
 152         'episodes': episodes,
 153     }
 154 
 155 
 156 def parse_episode(episode) -> Dict:
 157     '''Help func parse_channel do its job'''
 158 
 159     title = get_str(episode, 'title')
 160     link = get_str(episode, 'link')
 161     description = get_str(episode, 'description')
 162     pub_date = get_str(episode, 'pubDate')
 163     duration = get_str(episode, 'itunes:duration')
 164     for enc in episode.getElementsByTagName('enclosure'):
 165         link = enc.getAttribute('url')
 166 
 167     return {
 168         'title': title,
 169         'link': link,
 170         'description': description,
 171         'pub_date': pub_date,
 172         'duration': duration,
 173     }
 174 
 175 
 176 def render_feed(feed) -> None:
 177     '''Handle a single parsed RSS feed.'''
 178 
 179     indent = 12 * ' '
 180     print('        <article>')
 181 
 182     for rss in feed['rss']:
 183         for chan in rss['channels']:
 184             href = urlunparse(urlparse(chan['link']))
 185             title = escape(chan['title'])
 186             descr = escape(chan['description'])
 187             a = make_anchor(href, title)
 188             s = f'{indent}<h1><summary title="{descr}">{a}</summary></h1>'
 189             print(s)
 190             # no channel thumbnail for now
 191 
 192             for episode in chan['episodes']:
 193                 render_episode(episode)
 194 
 195     print('        </article>')
 196 
 197 
 198 def render_episode(episode) -> None:
 199     '''Help func render_feed do its job.'''
 200 
 201     title = escape(episode['title'])
 202     href = urlunparse(urlparse(episode['link']))
 203     description = escape(episode['description'])
 204     pub_date = escape(episode['pub_date'])
 205     duration = escape(episode['duration'])
 206     tt = make_tooltip(pub_date, duration)
 207     a = make_anchor(href, title)
 208 
 209     print('            <section>')
 210     print('                <details>')
 211     print(f'                    <summary title="{tt}">{a}</summary>')
 212     print(f'                    <p>{description}</p>')
 213     print('                </details>')
 214     print('            </section>')
 215 
 216 
 217 def make_anchor(href: str, title: str) -> str:
 218     '''Standardize how hyperlinks are handled in this script.'''
 219     return f'<a target="_blank" rel="noreferrer" href="{href}">{title}</a>'
 220 
 221 
 222 def make_tooltip(pub_date: str, duration: str) -> str:
 223     try:
 224         # because datetime's supposedly-idiomatic solutions are so ugly
 225         s = int(duration)
 226         h = int(s / 3600)
 227         m = int(s / 60) % 3600
 228         s %= 60
 229         hms = f'{h:02}:{m:02}:{s:02}'.lstrip('00:')
 230     except:
 231         hms = duration
 232     return f'published: {pub_date} | duration: {hms}'
 233 
 234 
 235 def get_str(src, tag: str) -> str:
 236     '''Simplify the control-flow of various feed-parsing funcs.'''
 237 
 238     try:
 239         res = src.getElementsByTagName(tag)
 240         if len(res) == 0:
 241             return ''
 242         for e in res[0].childNodes:
 243             if e.nodeType in (e.TEXT_NODE, e.CDATA_SECTION_NODE):
 244                 return e.data.strip()
 245         return ''
 246     except:
 247         return ''
 248 
 249 
 250 def get_uris(src) -> List[str]:
 251     '''This func helps func load_feed_uris load all URIs from a file.'''
 252 
 253     uris = []
 254     for line in src:
 255         line = line.rstrip('\r\n').rstrip('\n').strip()
 256         if line == '' or line.startswith('#'):
 257             continue
 258         uris.append(line)
 259     return uris
 260 
 261 
 262 def load_feed_uris(args: List[str]) -> List[str]:
 263     '''Turn a mix of URIs and filepaths into a list of URIs to load.'''
 264 
 265     if len(args) == 0:
 266         return get_uris(stdin)
 267 
 268     uris = []
 269     if args.count('-') > 1:
 270         msg = 'reading from `-` (standard input) more than once not allowed'
 271         raise ValueError(msg)
 272 
 273     for path in args:
 274         if path.startswith('https://') or path.startswith('http://'):
 275             uris.append(path)
 276             continue
 277 
 278         if path == '-':
 279             uris.extend(get_uris(stdin))
 280 
 281         with open(path) as inp:
 282             uris.extend(get_uris(inp))
 283 
 284     return uris
 285 
 286 
# style is the `inner` CSS emitted between the style tags of the output;
# it's indented to line up with the surrounding HTML, and only has its
# leading/trailing newlines stripped, so the first line keeps its indentation
style = '''
        body {
            font-size: 0.9rem;
            margin: 0 0 2rem 0;
            font-family: system-ui, -apple-system, sans-serif;
        }

        main {
            margin: auto;
            display: flex;
            width: fit-content;
        }

        h1 {
            top: 0;
            position: sticky;
            font-size: 0.9rem;
            text-align: center;
            background-color: white;
        }

        img {
            margin: auto;
            margin-bottom: 1rem;
            display: block;
            max-width: 15ch;
        }

        section {
            width: 48ch;
            padding: 0.3rem;
            margin: 0 0.1rem;
        }

        section:nth-child(2n+1) {
            background-color: #eee;
        }

        a {
            color: steelblue;
            text-decoration: none;
        }

        details p {
            line-height: 1.3rem;
        }
'''.strip('\n')
 336 
 337 
 338 try:
 339     stdout.reconfigure(newline='\n', encoding='utf-8')
 340     feeds = load_feed_uris(args)
 341 
 342     print('<!DOCTYPE html>')
 343     print('<html lang="en">')
 344     print('<head>')
 345     print('    <meta charset="UTF-8">')
 346     print('    <link rel="icon" href="data:,">')
 347     cattr = 'content="width=device-width, initial-scale=1.0"'
 348     print(f'    <meta name="viewport" {cattr}>')
 349     if title != '':
 350         print(f'    <title>{escape(title)}</title>')
 351     print('    <style>')
 352     print(style)
 353     print('    </style>')
 354     print('</head>')
 355     print('<body>')
 356     print('    <main>')
 357 
 358     # significantly speed-up script by loading/parsing feeds concurrently
 359     with Pool(processes=min(4, len(feeds))) as pool:
 360         feeds = pool.map(parse_feed, feeds)
 361 
 362     # render parsed feeds sequentially
 363     for feed in feeds:
 364         render_feed(feed)
 365 
 366     print('    </main>')
 367     print('</body>')
 368     print('</html>')
 369 except Exception as e:
 370     fail(e, 1)