File: utfate.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright (c) 2026 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the "Software"), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 To compile a smaller-sized command-line app, you can use the `go` command as
  27 follows:
  28 
  29 go build -ldflags "-s -w" -trimpath utfate.go
  30 */
  31 
  32 package main
  33 
  34 import (
  35     "bufio"
  36     "bytes"
  37     "encoding/binary"
  38     "errors"
  39     "io"
  40     "os"
  41     "unicode"
  42     "unicode/utf16"
  43 )
  44 
// Note: this code avoids the fmt package on purpose, which saves hundreds of
// kilobytes in the resulting executable: a noticeable difference.
  47 
// info is the help message: func main shows it, minus its leading newline,
// when the first command-line argument is any of -h, --h, -help, or --help
const info = `
utfate [options...] [file...]

This app turns plain-text input into UTF-8. Supported input formats are

    - ASCII
    - UTF-8
    - UTF-8 with a leading BOM
    - UTF-16 BE
    - UTF-16 LE
    - UTF-32 BE
    - UTF-32 LE

All (optional) leading options start with either single or double-dash:

    -h          show this help message
    -help       show this help message
`
  66 
  67 func main() {
  68     if len(os.Args) > 1 {
  69         switch os.Args[1] {
  70         case `-h`, `--h`, `-help`, `--help`:
  71             os.Stdout.WriteString(info[1:])
  72             return
  73         }
  74     }
  75 
  76     if err := run(os.Stdout, os.Args[1:]); err != nil && err != io.EOF {
  77         os.Stderr.WriteString(err.Error())
  78         os.Stderr.WriteString("\n")
  79         os.Exit(1)
  80     }
  81 }
  82 
  83 func run(w io.Writer, args []string) error {
  84     bw := bufio.NewWriter(w)
  85     defer bw.Flush()
  86 
  87     for _, path := range args {
  88         if err := handleFile(bw, path); err != nil {
  89             return err
  90         }
  91     }
  92 
  93     if len(args) == 0 {
  94         return utfate(bw, os.Stdin)
  95     }
  96     return nil
  97 }
  98 
  99 func handleFile(w *bufio.Writer, name string) error {
 100     if name == `-` {
 101         return utfate(w, os.Stdin)
 102     }
 103 
 104     f, err := os.Open(name)
 105     if err != nil {
 106         return errors.New(`can't read from file named "` + name + `"`)
 107     }
 108     defer f.Close()
 109 
 110     return utfate(w, f)
 111 }
 112 
 113 func utfate(w io.Writer, r io.Reader) error {
 114     br := bufio.NewReader(r)
 115     bw := bufio.NewWriter(w)
 116     defer bw.Flush()
 117 
 118     lead, err := br.Peek(4)
 119     if err != nil && err != io.EOF {
 120         return err
 121     }
 122 
 123     if bytes.HasPrefix(lead, []byte{'\x00', '\x00', '\xfe', '\xff'}) {
 124         br.Discard(4)
 125         return utf32toUTF8(bw, br, binary.BigEndian)
 126     }
 127 
 128     if bytes.HasPrefix(lead, []byte{'\xff', '\xfe', '\x00', '\x00'}) {
 129         br.Discard(4)
 130         return utf32toUTF8(bw, br, binary.LittleEndian)
 131     }
 132 
 133     if bytes.HasPrefix(lead, []byte{'\xfe', '\xff'}) {
 134         br.Discard(2)
 135         return utf16toUTF8(bw, br, readBytePairBE)
 136     }
 137 
 138     if bytes.HasPrefix(lead, []byte{'\xff', '\xfe'}) {
 139         br.Discard(2)
 140         return utf16toUTF8(bw, br, readBytePairLE)
 141     }
 142 
 143     if bytes.HasPrefix(lead, []byte{'\xef', '\xbb', '\xbf'}) {
 144         br.Discard(3)
 145         return handleUTF8(bw, br)
 146     }
 147 
 148     return handleUTF8(bw, br)
 149 }
 150 
 151 func handleUTF8(w *bufio.Writer, r *bufio.Reader) error {
 152     for {
 153         c, _, err := r.ReadRune()
 154         if c == unicode.ReplacementChar {
 155             return errors.New(`invalid UTF-8 stream`)
 156         }
 157         if err == io.EOF {
 158             return nil
 159         }
 160         if err != nil {
 161             return err
 162         }
 163 
 164         if _, err := w.WriteRune(c); err != nil {
 165             return io.EOF
 166         }
 167     }
 168 }
 169 
 170 // fancyHandleUTF8 is kept only for reference, as its attempts at being clever
 171 // don't seem to speed things up much when given ASCII input
 172 func fancyHandleUTF8(w *bufio.Writer, r *bufio.Reader) error {
 173     lookahead := 1
 174     maxAhead := r.Size() / 2
 175 
 176     for {
 177         // look ahead to check for ASCII runs
 178         ahead, err := r.Peek(lookahead)
 179         if err == io.EOF {
 180             return nil
 181         }
 182         if err != nil {
 183             return err
 184         }
 185 
 186         // copy leading ASCII runs
 187         n := leadASCII(ahead)
 188         if n > 0 {
 189             w.Write(ahead[:n])
 190             r.Discard(n)
 191         }
 192 
 193         // adapt lookahead size
 194         if n == len(ahead) && lookahead < maxAhead {
 195             lookahead *= 2
 196         } else if lookahead > 1 {
 197             lookahead /= 2
 198         }
 199 
 200         if n == len(ahead) {
 201             continue
 202         }
 203 
 204         c, _, err := r.ReadRune()
 205         if c == unicode.ReplacementChar {
 206             return errors.New(`invalid UTF-8 stream`)
 207         }
 208         if err == io.EOF {
 209             return nil
 210         }
 211         if err != nil {
 212             return err
 213         }
 214 
 215         if _, err := w.WriteRune(c); err != nil {
 216             return io.EOF
 217         }
 218     }
 219 }
 220 
 221 // leadASCII is used by func fancyHandleUTF8
 222 func leadASCII(buf []byte) int {
 223     for i, b := range buf {
 224         if b >= 128 {
 225             return i
 226         }
 227     }
 228     return len(buf)
 229 }
 230 
 231 // readPairFunc narrows source-code lines below
 232 type readPairFunc func(*bufio.Reader) (byte, byte, error)
 233 
 234 // utf16toUTF8 handles UTF-16 inputs for func utfate
 235 func utf16toUTF8(w *bufio.Writer, r *bufio.Reader, read2 readPairFunc) error {
 236     for {
 237         a, b, err := read2(r)
 238         if err == io.EOF {
 239             return nil
 240         }
 241         if err != nil {
 242             return err
 243         }
 244 
 245         c := rune(256*int(a) + int(b))
 246         if utf16.IsSurrogate(c) {
 247             a, b, err := read2(r)
 248             if err == io.EOF {
 249                 return nil
 250             }
 251             if err != nil {
 252                 return err
 253             }
 254 
 255             next := rune(256*int(a) + int(b))
 256             c = utf16.DecodeRune(c, next)
 257         }
 258 
 259         if _, err := w.WriteRune(c); err != nil {
 260             return io.EOF
 261         }
 262     }
 263 }
 264 
 265 // readBytePairBE gets you a pair of bytes in big-endian (original) order
 266 func readBytePairBE(br *bufio.Reader) (byte, byte, error) {
 267     a, err := br.ReadByte()
 268     if err != nil {
 269         return a, 0, err
 270     }
 271 
 272     b, err := br.ReadByte()
 273     return a, b, err
 274 }
 275 
 276 // readBytePairLE gets you a pair of bytes in little-endian order
 277 func readBytePairLE(br *bufio.Reader) (byte, byte, error) {
 278     a, b, err := readBytePairBE(br)
 279     return b, a, err
 280 }
 281 
 282 // utf32toUTF8 handles UTF-32 inputs for func utfate
 283 func utf32toUTF8(w *bufio.Writer, r *bufio.Reader, o binary.ByteOrder) error {
 284     var n uint32
 285     for {
 286         err := binary.Read(r, o, &n)
 287         if err == io.EOF {
 288             return nil
 289         }
 290         if err != nil {
 291             return err
 292         }
 293 
 294         if _, err := w.WriteRune(rune(n)); err != nil {
 295             return io.EOF
 296         }
 297     }
 298 }