#!/usr/bin/python3

# The MIT License (MIT)
#
# Copyright © 2020-2025 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from html import escape
from io import StringIO, TextIOWrapper
from re import compile
from sys import argv, exit, stderr, stdin
from urllib.parse import urlparse, urlunparse


# info is the help message, which is shown with its surrounding empty lines
# trimmed away
info = '''
htmlify [options...] [filepaths/URIs...]


Render plain-text prose into self-contained HTML. Lines which are just a
valid data-URI are turned into pictures, audio, or even video elements.

All HTTP(s) URIs are autodetected and rendered as hyperlinks, even when
lines have multiple URIs in them.

If a title isn't given from the cmd-line options, the first line is used
as the title.

All (optional) leading options start with either single or double-dash,
and most of them change the style/color used. Some of the options are,
shown in their single-dash form:

    -h          show this help message
    -help       show this help message

    -title      use the next argument as the title in the HTML output
'''

# the standard help options only count when given as the one and only
# cmd-line argument: in that case show the help message and quit right away
only_arg = argv[1] if len(argv) == 2 else None
if only_arg in ('-h', '--h', '-help', '--help'):
    print(info.strip())
    exit(0)

# links is used in func handle_normal_text_line to find the URIs to turn
# into hyperlinks; the pattern is a raw string with a plain `:`, because
# `\:` is an invalid escape sequence in normal string literals, which
# makes recent python versions emit a warning
links = compile(r'(https?|ftps?)://[a-zA-Z0-9_%.,?/&=#-]+')

# style is the `inner` CSS used inside the style tag, and handles all
# visual styles for all supported input types; its lines are indented to
# sit inside the style tag emitted by func start_page, and only the
# line-feeds at both ends of the literal are stripped away
style = '''
        body {
            margin: 1rem auto 2rem auto;
            padding: 0.25rem;
            font-size: 1.1rem;
            line-height: 1.8rem;
            font-family: Arial, Helvetica, sans-serif;

            max-width: 95vw;
            /* width: max-content; */
            width: fit-content;

            box-sizing: border-box;
            display: block;
        }

        a {
            color: steelblue;
            text-decoration: none;
        }

        p {
            display: block;
            margin: auto;
            width: 80ch;
        }

        audio {
            width: 60ch;
        }

        table {
            margin: 2rem auto;
            border-collapse: collapse;
        }

        thead>* {
            position: sticky;
            top: 0;
            background-color: white;
        }

        tfoot th {
            user-select: none;
        }

        th, td {
            padding: 0.1rem 1ch;
            min-width: 4ch;
            border-bottom: solid thin transparent;
        }

        tr:nth-child(5n) td {
            border-bottom: solid thin #ccc;
        }

        .monospace {
            font-family: monospace;
        }
'''.strip('\n')


def fail(msg, code: int = 1) -> None:
    '''
    Report the error given on stderr, colored red using ANSI codes, then
    quit the app right away with the exit code given.
    '''

    stderr.write(f'\x1b[31m{msg}\x1b[0m\n')
    # raising this is exactly what func exit does
    raise SystemExit(code)


def parse_leading_options(cmd_args) -> tuple:
    '''
    Handle the leading options among the cmd-line arguments given, whose
    first item is the app's name. The result is a pair with the title given
    (empty when no title option was used), and the index of the first
    argument following all leading options handled.

    The title is deliberately not HTML-escaped here, since func start_page
    already escapes it when emitting the title tag: escaping it here too
    would show things like `&amp;amp;` in the page title.
    '''

    title = ''
    start = 1
    while start < len(cmd_args) and cmd_args[start].startswith('-'):
        if cmd_args[start] not in ('-title', '--title'):
            # anything else, including `-` (standard input), ends the
            # leading options, and is handled as an input later
            break
        if start + 1 >= len(cmd_args):
            fail('missing actual title in cmd-line arguments', 1)
        title = cmd_args[start + 1]
        start += 2
    return title, start


# title is the (unescaped) title from the cmd-line options, if given; args
# has all arguments after the leading options, which are the inputs to render
title, start_args = parse_leading_options(argv)
args = argv[start_args:]


def is_base64(n: int) -> bool:
    '''
    Check whether the byte-value given stands for one of the symbols allowed
    in base64 payloads, the padding symbol included: this func helps build
    base64-byte-checker lookup tables.
    '''

    # each pair of bytes unpacks into the first/last values of a range
    alphanumerics = (b'09', b'AZ', b'az')
    if any(first <= n <= last for first, last in alphanumerics):
        return True
    return n in tuple(b'+/=')


# valid_base64 is a lookup table with an entry for each possible byte value,
# and helps func seems_supported_data_uri do its job quickly
valid_base64 = tuple(map(is_base64, range(256)))


def start_page(title: str) -> None:
    '''
    Emit the start of the HTML output, all the way to the opening body tag:
    the title tag is only emitted when the title given isn't empty, and the
    title is HTML-escaped along the way.
    '''

    viewport = 'content="width=device-width, initial-scale=1.0"'
    head = [
        '<!DOCTYPE html>',
        '<html lang="en">',
        '<head>',
        '    <meta charset="UTF-8">',
        '    <link rel="icon" href="data:,">',
        f'    <meta name="viewport" {viewport}>',
    ]
    if title:
        head.append(f'    <title>{escape(title)}</title>')
    head.extend(('    <style>', style, '    </style>', '</head>', '<body>'))

    for item in head:
        print(item)


def shorten(s: str, maxchars: int) -> str:
    'Limit the string given to its first few symbols, when it has too many.'

    if len(s) > maxchars:
        return s[:maxchars]
    return s


def handle_text(src, title: str, first: bool) -> None:
    '''
    Render plain-text prose from the line-iterable given. Runs of non-empty
    lines become paragraphs, and runs of empty(ish) lines are handled as if
    they were a single empty line.

    When handling the first input, the page start is emitted as well, using
    the title given: when that's empty, the first non-empty line is used
    as the title instead, and isn't rendered as part of the prose.
    '''

    # prev is the previous line handled, and is empty outside paragraphs
    prev = ''
    # buf is a reusable string-buffer for func handle_normal_text_line
    buf = StringIO()
    num_lines = 0

    for line in src:
        # ignore all trailing whitespace, line-feeds included
        line = line.rstrip()
        if not (prev or line):
            # keep skipping empty(ish) lines in runs of such lines
            continue

        num_lines += 1
        if first and num_lines == 1:
            if title:
                start_page(shorten(title, 100))
            else:
                start_page(shorten(line, 100))
                continue

        if (not line) and prev:
            print('</p>')
        if not prev:
            print('<p>')
        prev = line

        if seems_supported_data_uri(line):
            handle_data_uri(line)
            print('<br>')
        else:
            handle_normal_text_line(line, buf)

    # don't forget to start the page, even when the first input has no
    # lines to show
    if first and num_lines == 0:
        start_page(shorten(title, 100))

    # don't forget to close the last paragraph, if still open: checking the
    # last line instead would fail on empty inputs, and would wrongly close
    # a paragraph which was never opened, when the last line was the title
    if prev:
        print('</p>')


def handle_normal_text_line(line: str, buf: StringIO) -> None:
    '''
    Handle prose lines for func handle_text: the line is emitted with all
    its text HTML-escaped, all its detected hyperlinks turned into anchor
    tags, and a line-break tag at the end.

    The buffer given is emptied and reused to build the output line.
    '''

    # get rid of previous buffer content
    buf.truncate(0)
    buf.seek(0)

    # end keeps track of the end of the latest hyperlink detected, and is
    # used after the regex-match loop to handle trailing parts in lines
    end = 0

    for m in links.finditer(line):
        # don't forget the part before the match
        buf.write(escape(line[end:m.start()]))
        # replace the matched hyperlink with an html anchor tag for it:
        # hyperlinks can have ampersands in them, so they need escaping
        # both as attribute values and as text
        href = escape(m.group(0))
        buf.write(f'<a href="{href}">{href}</a>')
        end = m.end()

    # don't forget the last part of the line, which is the whole line
    # when no hyperlinks were detected
    buf.write(escape(line[end:]))
    buf.write('<br>')
    print(buf.getvalue())


# data_uri_starts has the starts of all supported data-URIs as its keys:
# each value is the start of the HTML tag used for that kind of data-URI,
# up to the opening quote of its src attribute; it's used by funcs
# seems_supported_data_uri and handle_data_uri
data_uri_starts = {
    'data:image/': '<img src="',
    'data:audio/': '<audio controls src="',
    'data:video/': '<video controls src="',
}


def seems_supported_data_uri(s: str) -> bool:
    '''
    Check whether the string given seems a base64-encoded data-URI of one
    of the supported kinds, which are the keys of data_uri_starts. Only the
    payload, which is everything after the first `;base64,`, is checked
    symbol by symbol.
    '''

    if not s.startswith(tuple(data_uri_starts)):
        return False

    marker = ';base64,'
    base64_index = s.find(marker)
    if base64_index < 0:
        return False

    # check all payload symbols: the ones beyond the lookup table, such as
    # most non-english letters, can't be valid base64, and must not be used
    # to index the table, as that would raise an exception
    payload = s[base64_index + len(marker):]
    return all(
        ord(c) < len(valid_base64) and valid_base64[ord(c)] for c in payload
    )


def handle_data_uri(s: str) -> bool:
    '''
    Emit the HTML element for the data-URI given, without a line-feed after
    it. The result is whether the data-URI was of a supported kind: nothing
    is emitted when that's not the case.
    '''

    for kind, start in data_uri_starts.items():
        if not s.startswith(kind):
            continue

        # the tag name is what's between the leading `<` and the first space
        tag = start[1:start.index(' ')]

        print(start, end='')
        # the part before the payload isn't checked by func
        # seems_supported_data_uri, so escaping prevents it from ending the
        # attribute early; valid data-URIs aren't changed by escaping
        print(escape(s), end='')
        print('">', end='')
        if tag != 'img':
            # unlike pictures, audio/video elements need closing, or all
            # content following them would become their fallback content,
            # which browsers don't show
            print(f'</{tag}>', end='')
        return True
    return False


def anchorize(href: str) -> str:
    '''
    Make an HTML anchor tag for the URI given, which is used both as the
    link's destination, after a parse/unparse round-trip, and as the link's
    text. Both uses are HTML-escaped, so quotes and ampersands in the URI
    can't break out of the href attribute, or be mistaken for entities.
    '''

    rel = 'rel="noopener noreferrer"'
    target = escape(urlunparse(urlparse(href)))
    return f'<a {rel} href="{target}">{escape(href)}</a>'


def seems_url(s: str) -> bool:
    'Check whether the string given starts like any of the URIs supported.'

    return s.startswith(('https://', 'http://', 'file://', 'ftp://', 'data:'))


# standard input can't be rewound, so it can only be used as an input once
if args.count('-') > 1:
    fail('reading from `-` (standard input) more than once not allowed', 1)

# only load the URI-opening func when any of the inputs needs it
if any(seems_url(e) for e in args):
    from urllib.request import urlopen

try:
    # the first input is the one which also emits the start of the page
    for i, path in enumerate(args):
        if path == '-':
            handle_text(stdin, title, i == 0)
            continue

        if seems_url(path):
            with urlopen(path) as inp:
                # use the response encoding, if given: all kinds of
                # responses have headers, which also know how to pick the
                # charset out of the content-type, ignoring any quotes and
                # extra parameters around it; note how no variable here
                # reuses the name of the loop index, which is used below
                enc = inp.headers.get_content_charset('utf-8')

                with TextIOWrapper(inp, encoding=enc) as txt:
                    handle_text(txt, title, i == 0)
            continue

        with open(path, encoding='utf-8') as inp:
            handle_text(inp, title, i == 0)

    # use standard input when no inputs are given
    if not args:
        handle_text(stdin, title, True)

    print('</body>')
    print('</html>')
except BrokenPipeError:
    # quit quietly, instead of showing a confusing error message
    stderr.close()
    exit(0)
except KeyboardInterrupt:
    exit(2)
except Exception as e:
    fail(e, 1)