File: utfate.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright (c) 2026 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the "Software"), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 To compile a smaller-sized command-line app, you can use the `go` command as
  27 follows:
  28 
  29 go build -ldflags "-s -w" -trimpath utfate.go
  30 */
  31 
  32 package main
  33 
  34 import (
  35     "bufio"
  36     "bytes"
  37     "encoding/binary"
  38     "errors"
  39     "io"
  40     "os"
  41     "unicode"
  42     "unicode/utf16"
  43 )
  44 
// info is the help message shown when the first command-line argument is one
// of the help options; func main emits it starting from its second byte, thus
// skipping the newline right after the opening backtick
const info = `
utfate [options...] [file...]

This app turns plain-text input into UTF-8. Supported input formats are

    - ASCII
    - UTF-8
    - UTF-8 with a leading BOM
    - UTF-16 BE
    - UTF-16 LE
    - UTF-32 BE
    - UTF-32 LE

All (optional) leading options start with either single or double-dash:

    -h, -help    show this help message
`
  62 
  63 func main() {
  64     if len(os.Args) > 1 {
  65         switch os.Args[1] {
  66         case `-h`, `--h`, `-help`, `--help`:
  67             os.Stdout.WriteString(info[1:])
  68             return
  69         }
  70     }
  71 
  72     if err := run(os.Stdout, os.Args[1:]); err != nil && err != io.EOF {
  73         os.Stderr.WriteString(err.Error())
  74         os.Stderr.WriteString("\n")
  75         os.Exit(1)
  76     }
  77 }
  78 
  79 func run(w io.Writer, args []string) error {
  80     bw := bufio.NewWriter(w)
  81     defer bw.Flush()
  82 
  83     for _, path := range args {
  84         if err := handleFile(bw, path); err != nil {
  85             return err
  86         }
  87     }
  88 
  89     if len(args) == 0 {
  90         return utfate(bw, os.Stdin)
  91     }
  92     return nil
  93 }
  94 
  95 func handleFile(w *bufio.Writer, name string) error {
  96     if name == `-` {
  97         return utfate(w, os.Stdin)
  98     }
  99 
 100     f, err := os.Open(name)
 101     if err != nil {
 102         return errors.New(`can't read from file named "` + name + `"`)
 103     }
 104     defer f.Close()
 105 
 106     return utfate(w, f)
 107 }
 108 
 109 func utfate(w io.Writer, r io.Reader) error {
 110     br := bufio.NewReader(r)
 111     bw := bufio.NewWriter(w)
 112     defer bw.Flush()
 113 
 114     lead, err := br.Peek(4)
 115     if err != nil && err != io.EOF {
 116         return err
 117     }
 118 
 119     if bytes.HasPrefix(lead, []byte{'\x00', '\x00', '\xfe', '\xff'}) {
 120         br.Discard(4)
 121         return utf32toUTF8(bw, br, binary.BigEndian)
 122     }
 123 
 124     if bytes.HasPrefix(lead, []byte{'\xff', '\xfe', '\x00', '\x00'}) {
 125         br.Discard(4)
 126         return utf32toUTF8(bw, br, binary.LittleEndian)
 127     }
 128 
 129     if bytes.HasPrefix(lead, []byte{'\xfe', '\xff'}) {
 130         br.Discard(2)
 131         return utf16toUTF8(bw, br, readBytePairBE)
 132     }
 133 
 134     if bytes.HasPrefix(lead, []byte{'\xff', '\xfe'}) {
 135         br.Discard(2)
 136         return utf16toUTF8(bw, br, readBytePairLE)
 137     }
 138 
 139     if bytes.HasPrefix(lead, []byte{'\xef', '\xbb', '\xbf'}) {
 140         br.Discard(3)
 141         return handleUTF8(bw, br)
 142     }
 143 
 144     return handleUTF8(bw, br)
 145 }
 146 
 147 func handleUTF8(w *bufio.Writer, r *bufio.Reader) error {
 148     for {
 149         c, _, err := r.ReadRune()
 150         if c == unicode.ReplacementChar {
 151             return errors.New(`invalid UTF-8 stream`)
 152         }
 153         if err == io.EOF {
 154             return nil
 155         }
 156         if err != nil {
 157             return err
 158         }
 159 
 160         if _, err := w.WriteRune(c); err != nil {
 161             return io.EOF
 162         }
 163     }
 164 }
 165 
 166 // fancyHandleUTF8 is kept only for reference, as its attempts at being clever
 167 // don't seem to speed things up much when given ASCII input
 168 func fancyHandleUTF8(w *bufio.Writer, r *bufio.Reader) error {
 169     lookahead := 1
 170     maxAhead := r.Size() / 2
 171 
 172     for {
 173         // look ahead to check for ASCII runs
 174         ahead, err := r.Peek(lookahead)
 175         if err == io.EOF {
 176             return nil
 177         }
 178         if err != nil {
 179             return err
 180         }
 181 
 182         // copy leading ASCII runs
 183         n := leadASCII(ahead)
 184         if n > 0 {
 185             w.Write(ahead[:n])
 186             r.Discard(n)
 187         }
 188 
 189         // adapt lookahead size
 190         if n == len(ahead) && lookahead < maxAhead {
 191             lookahead *= 2
 192         } else if lookahead > 1 {
 193             lookahead /= 2
 194         }
 195 
 196         if n == len(ahead) {
 197             continue
 198         }
 199 
 200         c, _, err := r.ReadRune()
 201         if c == unicode.ReplacementChar {
 202             return errors.New(`invalid UTF-8 stream`)
 203         }
 204         if err == io.EOF {
 205             return nil
 206         }
 207         if err != nil {
 208             return err
 209         }
 210 
 211         if _, err := w.WriteRune(c); err != nil {
 212             return io.EOF
 213         }
 214     }
 215 }
 216 
 217 // leadASCII is used by func fancyHandleUTF8
 218 func leadASCII(buf []byte) int {
 219     for i, b := range buf {
 220         if b >= 128 {
 221             return i
 222         }
 223     }
 224     return len(buf)
 225 }
 226 
// readPairFunc is the type of funcs which read the next 2 bytes from a
// buffered reader, giving them back as a pair, along with any error; having
// a name for it narrows source-code lines below
type readPairFunc func(*bufio.Reader) (byte, byte, error)
 229 
 230 // utf16toUTF8 handles UTF-16 inputs for func utfate
 231 func utf16toUTF8(w *bufio.Writer, r *bufio.Reader, read2 readPairFunc) error {
 232     for {
 233         a, b, err := read2(r)
 234         if err == io.EOF {
 235             return nil
 236         }
 237         if err != nil {
 238             return err
 239         }
 240 
 241         c := rune(256*int(a) + int(b))
 242         if utf16.IsSurrogate(c) {
 243             a, b, err := read2(r)
 244             if err == io.EOF {
 245                 return nil
 246             }
 247             if err != nil {
 248                 return err
 249             }
 250 
 251             next := rune(256*int(a) + int(b))
 252             c = utf16.DecodeRune(c, next)
 253         }
 254 
 255         if _, err := w.WriteRune(c); err != nil {
 256             return io.EOF
 257         }
 258     }
 259 }
 260 
 261 // readBytePairBE gets you a pair of bytes in big-endian (original) order
 262 func readBytePairBE(br *bufio.Reader) (byte, byte, error) {
 263     a, err := br.ReadByte()
 264     if err != nil {
 265         return a, 0, err
 266     }
 267 
 268     b, err := br.ReadByte()
 269     return a, b, err
 270 }
 271 
 272 // readBytePairLE gets you a pair of bytes in little-endian order
 273 func readBytePairLE(br *bufio.Reader) (byte, byte, error) {
 274     a, b, err := readBytePairBE(br)
 275     return b, a, err
 276 }
 277 
 278 // utf32toUTF8 handles UTF-32 inputs for func utfate
 279 func utf32toUTF8(w *bufio.Writer, r *bufio.Reader, o binary.ByteOrder) error {
 280     var n uint32
 281     for {
 282         err := binary.Read(r, o, &n)
 283         if err == io.EOF {
 284             return nil
 285         }
 286         if err != nil {
 287             return err
 288         }
 289 
 290         if _, err := w.WriteRune(rune(n)); err != nil {
 291             return io.EOF
 292         }
 293     }
 294 }