File: utfate.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright (c) 2026 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the "Software"), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 To compile a smaller-sized command-line app, you can use the `go` command as
  27 follows:
  28 
  29 go build -ldflags "-s -w" -trimpath utfate.go
  30 */
  31 
  32 package main
  33 
  34 import (
  35     "bufio"
  36     "bytes"
  37     "encoding/binary"
  38     "errors"
  39     "io"
  40     "os"
  41     "unicode"
  42     "unicode/utf16"
  43 )
  44 
// info is the help message shown for the help options; the raw string starts
// with a line-feed, which func main skips by printing info[1:]
const info = `
utfate [options...] [file...]

This app turns plain-text input into UTF-8. Supported input formats are

    - ASCII
    - UTF-8
    - UTF-8 with a leading BOM
    - UTF-16 BE
    - UTF-16 LE
    - UTF-32 BE
    - UTF-32 LE

All (optional) leading options start with either single or double-dash:

    -h, -help    show this help message
`
  62 
  63 func main() {
  64     if len(os.Args) > 1 {
  65         switch os.Args[1] {
  66         case `-h`, `--h`, `-help`, `--help`:
  67             os.Stdout.WriteString(info[1:])
  68             return
  69         }
  70     }
  71 
  72     if err := run(os.Stdout, os.Args[1:]); err != nil && err != io.EOF {
  73         os.Stderr.WriteString(err.Error())
  74         os.Stderr.WriteString("\n")
  75         os.Exit(1)
  76         return
  77     }
  78 }
  79 
  80 func run(w io.Writer, args []string) error {
  81     bw := bufio.NewWriter(w)
  82     defer bw.Flush()
  83 
  84     for _, path := range args {
  85         if err := handleFile(bw, path); err != nil {
  86             return err
  87         }
  88     }
  89 
  90     if len(args) == 0 {
  91         return utfate(bw, os.Stdin)
  92     }
  93     return nil
  94 }
  95 
  96 func handleFile(w *bufio.Writer, name string) error {
  97     if name == `-` {
  98         return utfate(w, os.Stdin)
  99     }
 100 
 101     f, err := os.Open(name)
 102     if err != nil {
 103         return errors.New(`can't read from file named "` + name + `"`)
 104     }
 105     defer f.Close()
 106 
 107     return utfate(w, f)
 108 }
 109 
 110 func utfate(w io.Writer, r io.Reader) error {
 111     br := bufio.NewReader(r)
 112     bw := bufio.NewWriter(w)
 113     defer bw.Flush()
 114 
 115     lead, err := br.Peek(4)
 116     if err != nil && err != io.EOF {
 117         return err
 118     }
 119 
 120     if bytes.HasPrefix(lead, []byte{'\x00', '\x00', '\xfe', '\xff'}) {
 121         br.Discard(4)
 122         return utf32toUTF8(bw, br, binary.BigEndian)
 123     }
 124 
 125     if bytes.HasPrefix(lead, []byte{'\xff', '\xfe', '\x00', '\x00'}) {
 126         br.Discard(4)
 127         return utf32toUTF8(bw, br, binary.LittleEndian)
 128     }
 129 
 130     if bytes.HasPrefix(lead, []byte{'\xfe', '\xff'}) {
 131         br.Discard(2)
 132         return utf16toUTF8(bw, br, readBytePairBE)
 133     }
 134 
 135     if bytes.HasPrefix(lead, []byte{'\xff', '\xfe'}) {
 136         br.Discard(2)
 137         return utf16toUTF8(bw, br, readBytePairLE)
 138     }
 139 
 140     if bytes.HasPrefix(lead, []byte{'\xef', '\xbb', '\xbf'}) {
 141         br.Discard(3)
 142         return handleUTF8(bw, br)
 143     }
 144 
 145     return handleUTF8(bw, br)
 146 }
 147 
 148 func handleUTF8(w *bufio.Writer, r *bufio.Reader) error {
 149     for {
 150         c, _, err := r.ReadRune()
 151         if c == unicode.ReplacementChar {
 152             return errors.New(`invalid UTF-8 stream`)
 153         }
 154         if err == io.EOF {
 155             return nil
 156         }
 157         if err != nil {
 158             return err
 159         }
 160 
 161         if _, err := w.WriteRune(c); err != nil {
 162             return io.EOF
 163         }
 164     }
 165 }
 166 
 167 // fancyHandleUTF8 is kept only for reference, as its attempts at being clever
 168 // don't seem to speed things up much when given ASCII input
 169 func fancyHandleUTF8(w *bufio.Writer, r *bufio.Reader) error {
 170     lookahead := 1
 171     maxAhead := r.Size() / 2
 172 
 173     for {
 174         // look ahead to check for ASCII runs
 175         ahead, err := r.Peek(lookahead)
 176         if err == io.EOF {
 177             return nil
 178         }
 179         if err != nil {
 180             return err
 181         }
 182 
 183         // copy leading ASCII runs
 184         n := leadASCII(ahead)
 185         if n > 0 {
 186             w.Write(ahead[:n])
 187             r.Discard(n)
 188         }
 189 
 190         // adapt lookahead size
 191         if n == len(ahead) && lookahead < maxAhead {
 192             lookahead *= 2
 193         } else if lookahead > 1 {
 194             lookahead /= 2
 195         }
 196 
 197         if n == len(ahead) {
 198             continue
 199         }
 200 
 201         c, _, err := r.ReadRune()
 202         if c == unicode.ReplacementChar {
 203             return errors.New(`invalid UTF-8 stream`)
 204         }
 205         if err == io.EOF {
 206             return nil
 207         }
 208         if err != nil {
 209             return err
 210         }
 211 
 212         if _, err := w.WriteRune(c); err != nil {
 213             return io.EOF
 214         }
 215     }
 216 }
 217 
 218 // leadASCII is used by func fancyHandleUTF8
 219 func leadASCII(buf []byte) int {
 220     for i, b := range buf {
 221         if b >= 128 {
 222             return i
 223         }
 224     }
 225     return len(buf)
 226 }
 227 
 228 // readPairFunc narrows source-code lines below
 229 type readPairFunc func(*bufio.Reader) (byte, byte, error)
 230 
 231 // utf16toUTF8 handles UTF-16 inputs for func utfate
 232 func utf16toUTF8(w *bufio.Writer, r *bufio.Reader, read2 readPairFunc) error {
 233     for {
 234         a, b, err := read2(r)
 235         if err == io.EOF {
 236             return nil
 237         }
 238         if err != nil {
 239             return err
 240         }
 241 
 242         c := rune(256*int(a) + int(b))
 243         if utf16.IsSurrogate(c) {
 244             a, b, err := read2(r)
 245             if err == io.EOF {
 246                 return nil
 247             }
 248             if err != nil {
 249                 return err
 250             }
 251 
 252             next := rune(256*int(a) + int(b))
 253             c = utf16.DecodeRune(c, next)
 254         }
 255 
 256         if _, err := w.WriteRune(c); err != nil {
 257             return io.EOF
 258         }
 259     }
 260 }
 261 
 262 // readBytePairBE gets you a pair of bytes in big-endian (original) order
 263 func readBytePairBE(br *bufio.Reader) (byte, byte, error) {
 264     a, err := br.ReadByte()
 265     if err != nil {
 266         return a, 0, err
 267     }
 268 
 269     b, err := br.ReadByte()
 270     return a, b, err
 271 }
 272 
 273 // readBytePairLE gets you a pair of bytes in little-endian order
 274 func readBytePairLE(br *bufio.Reader) (byte, byte, error) {
 275     a, b, err := readBytePairBE(br)
 276     return b, a, err
 277 }
 278 
 279 // utf32toUTF8 handles UTF-32 inputs for func utfate
 280 func utf32toUTF8(w *bufio.Writer, r *bufio.Reader, o binary.ByteOrder) error {
 281     var n uint32
 282     for {
 283         err := binary.Read(r, o, &n)
 284         if err == io.EOF {
 285             return nil
 286         }
 287         if err != nil {
 288             return err
 289         }
 290 
 291         if _, err := w.WriteRune(rune(n)); err != nil {
 292             return io.EOF
 293         }
 294     }
 295 }