File: utfate.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 To compile a smaller-sized command-line app, you can use the `go` command as
  27 follows:
  28 
  29 go build -ldflags "-s -w" -trimpath utfate.go
  30 */
  31 
  32 package main
  33 
  34 import (
  35     "bufio"
  36     "bytes"
  37     "encoding/binary"
  38     "errors"
  39     "io"
  40     "os"
  41     "unicode"
  42     "unicode/utf16"
  43 )
  44 
// Note: this code avoids the fmt package on purpose, which saves hundreds of
// kilobytes in the resulting executable: that's a noticeable difference.
  47 
// info is the help message shown when the app is run with one of the help
// options; the leading newline only keeps the raw-string literal tidy, and
// is skipped when the message is shown
const info = `
utfate [options...] [file...]

This app turns plain-text input into UTF-8. Supported input formats are

    - ASCII
    - UTF-8
    - UTF-8 with a leading BOM
    - UTF-16 BE
    - UTF-16 LE
    - UTF-32 BE
    - UTF-32 LE

All (optional) leading options start with either single or double-dash:

    -h          show this help message
    -help       show this help message
`
  66 
// errNoMoreOutput is a dummy error whose message is never shown: the
// converter funcs return it when writing to the output fails, since there's
// no point in handling more input after that; func isActualError filters it
// out, which lets the app quit right away without reporting a failure
var errNoMoreOutput = errors.New(`no more output`)
  70 
  71 func main() {
  72     if len(os.Args) > 1 {
  73         switch os.Args[1] {
  74         case `-h`, `--h`, `-help`, `--help`:
  75             os.Stderr.WriteString(info[1:])
  76             return
  77         }
  78     }
  79 
  80     if err := run(os.Stdout, os.Args[1:]); isActualError(err) {
  81         os.Stderr.WriteString(err.Error())
  82         os.Stderr.WriteString("\n")
  83         os.Exit(1)
  84     }
  85 }
  86 
  87 func run(w io.Writer, args []string) error {
  88     bw := bufio.NewWriter(w)
  89     defer bw.Flush()
  90 
  91     for _, path := range args {
  92         if err := handleFile(bw, path); err != nil {
  93             return err
  94         }
  95     }
  96 
  97     if len(args) == 0 {
  98         return utfate(bw, os.Stdin)
  99     }
 100     return nil
 101 }
 102 
 103 func handleFile(w *bufio.Writer, name string) error {
 104     if name == `-` {
 105         return utfate(w, os.Stdin)
 106     }
 107 
 108     f, err := os.Open(name)
 109     if err != nil {
 110         return errors.New(`can't read from file named "` + name + `"`)
 111     }
 112     defer f.Close()
 113 
 114     return utfate(w, f)
 115 }
 116 
 117 // isActualError is to figure out whether not to ignore an error, and thus
 118 // show it as an error message
 119 func isActualError(err error) bool {
 120     return err != nil && err != io.EOF && err != errNoMoreOutput
 121 }
 122 
 123 func utfate(w io.Writer, r io.Reader) error {
 124     br := bufio.NewReader(r)
 125     bw := bufio.NewWriter(w)
 126     defer bw.Flush()
 127 
 128     lead, err := br.Peek(4)
 129     if err != nil && err != io.EOF {
 130         return err
 131     }
 132 
 133     if bytes.HasPrefix(lead, []byte{'\x00', '\x00', '\xfe', '\xff'}) {
 134         br.Discard(4)
 135         return utf32toUTF8(bw, br, binary.BigEndian)
 136     }
 137 
 138     if bytes.HasPrefix(lead, []byte{'\xff', '\xfe', '\x00', '\x00'}) {
 139         br.Discard(4)
 140         return utf32toUTF8(bw, br, binary.LittleEndian)
 141     }
 142 
 143     if bytes.HasPrefix(lead, []byte{'\xfe', '\xff'}) {
 144         br.Discard(2)
 145         return utf16toUTF8(bw, br, readBytePairBE)
 146     }
 147 
 148     if bytes.HasPrefix(lead, []byte{'\xff', '\xfe'}) {
 149         br.Discard(2)
 150         return utf16toUTF8(bw, br, readBytePairLE)
 151     }
 152 
 153     if bytes.HasPrefix(lead, []byte{'\xef', '\xbb', '\xbf'}) {
 154         br.Discard(3)
 155         return handleUTF8(bw, br)
 156     }
 157 
 158     return handleUTF8(bw, br)
 159 }
 160 
 161 func handleUTF8(w *bufio.Writer, r *bufio.Reader) error {
 162     for {
 163         c, _, err := r.ReadRune()
 164         if c == unicode.ReplacementChar {
 165             return errors.New(`invalid UTF-8 stream`)
 166         }
 167         if err == io.EOF {
 168             return nil
 169         }
 170         if err != nil {
 171             return err
 172         }
 173 
 174         if _, err := w.WriteRune(c); err != nil {
 175             return errNoMoreOutput
 176         }
 177     }
 178 }
 179 
 180 // fancyHandleUTF8 is kept only for reference, as its attempts at being clever
 181 // don't seem to speed things up much when given ASCII input
 182 func fancyHandleUTF8(w *bufio.Writer, r *bufio.Reader) error {
 183     lookahead := 1
 184     maxAhead := r.Size() / 2
 185 
 186     for {
 187         // look ahead to check for ASCII runs
 188         ahead, err := r.Peek(lookahead)
 189         if err == io.EOF {
 190             return nil
 191         }
 192         if err != nil {
 193             return err
 194         }
 195 
 196         // copy leading ASCII runs
 197         n := leadASCII(ahead)
 198         if n > 0 {
 199             w.Write(ahead[:n])
 200             r.Discard(n)
 201         }
 202 
 203         // adapt lookahead size
 204         if n == len(ahead) && lookahead < maxAhead {
 205             lookahead *= 2
 206         } else if lookahead > 1 {
 207             lookahead /= 2
 208         }
 209 
 210         if n == len(ahead) {
 211             continue
 212         }
 213 
 214         c, _, err := r.ReadRune()
 215         if c == unicode.ReplacementChar {
 216             return errors.New(`invalid UTF-8 stream`)
 217         }
 218         if err == io.EOF {
 219             return nil
 220         }
 221         if err != nil {
 222             return err
 223         }
 224 
 225         if _, err := w.WriteRune(c); err != nil {
 226             return errNoMoreOutput
 227         }
 228     }
 229 }
 230 
 231 // leadASCII is used by func fancyHandleUTF8
 232 func leadASCII(buf []byte) int {
 233     for i, b := range buf {
 234         if b >= 128 {
 235             return i
 236         }
 237     }
 238     return len(buf)
 239 }
 240 
// readPairFunc is the type of the funcs which read the next pair of bytes
// from a buffered reader: giving it a name keeps the source-code lines below
// narrower
type readPairFunc func(*bufio.Reader) (byte, byte, error)
 243 
 244 // utf16toUTF8 handles UTF-16 inputs for func utfate
 245 func utf16toUTF8(w *bufio.Writer, r *bufio.Reader, read2 readPairFunc) error {
 246     for {
 247         a, b, err := read2(r)
 248         if err == io.EOF {
 249             return nil
 250         }
 251         if err != nil {
 252             return err
 253         }
 254 
 255         c := rune(256*int(a) + int(b))
 256         if utf16.IsSurrogate(c) {
 257             a, b, err := read2(r)
 258             if err == io.EOF {
 259                 return nil
 260             }
 261             if err != nil {
 262                 return err
 263             }
 264 
 265             next := rune(256*int(a) + int(b))
 266             c = utf16.DecodeRune(c, next)
 267         }
 268 
 269         if _, err := w.WriteRune(c); err != nil {
 270             return errNoMoreOutput
 271         }
 272     }
 273 }
 274 
 275 // readBytePairBE gets you a pair of bytes in big-endian (original) order
 276 func readBytePairBE(br *bufio.Reader) (byte, byte, error) {
 277     a, err := br.ReadByte()
 278     if err != nil {
 279         return a, 0, err
 280     }
 281 
 282     b, err := br.ReadByte()
 283     return a, b, err
 284 }
 285 
 286 // readBytePairLE gets you a pair of bytes in little-endian order
 287 func readBytePairLE(br *bufio.Reader) (byte, byte, error) {
 288     a, b, err := readBytePairBE(br)
 289     return b, a, err
 290 }
 291 
 292 // utf32toUTF8 handles UTF-32 inputs for func utfate
 293 func utf32toUTF8(w *bufio.Writer, r *bufio.Reader, o binary.ByteOrder) error {
 294     var n uint32
 295     for {
 296         err := binary.Read(r, o, &n)
 297         if err == io.EOF {
 298             return nil
 299         }
 300         if err != nil {
 301             return err
 302         }
 303 
 304         if _, err := w.WriteRune(rune(n)); err != nil {
 305             return errNoMoreOutput
 306         }
 307     }
 308 }