File: utfate.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 Single-file source-code for utfate.
  27 
  28 To compile a smaller-sized command-line app, you can use the `go` command as
  29 follows:
  30 
  31 go build -ldflags "-s -w" -trimpath utfate.go
  32 */
  33 
  34 package main
  35 
  36 import (
  37     "bufio"
  38     "bytes"
  39     "encoding/binary"
  40     "errors"
  41     "io"
  42     "os"
  43     "unicode"
  44     "unicode/utf16"
  45 )
  46 
// Note: the code avoids using the fmt package, which saves hundreds of
// kilobytes on the resulting executable, a noticeable difference.
  49 
// info is the help message shown when the first command-line argument is one
// of the help options; the text starts with a newline, so its first line can
// sit on its own source line, and func main skips that byte via info[1:]
const info = `
utfate [options...] [file...]

This app turns plain-text input into UTF-8. Supported input formats are

    - ASCII
    - UTF-8
    - UTF-8 with a leading BOM
    - UTF-16 BE
    - UTF-16 LE
    - UTF-32 BE
    - UTF-32 LE

All (optional) leading options start with either single or double-dash:

    -h          show this help message
    -help       show this help message
`
  68 
// errNoMoreOutput is a dummy error, whose message is ignored, and which
// causes the app to quit immediately and successfully: the converter funcs
// return it when writing to the output fails, and func isActualError filters
// it out, so that func main never reports it
var errNoMoreOutput = errors.New(`no more output`)

// errorStyle is the ANSI escape sequence func main writes to stderr right
// before an error message, to show it in red on ANSI-compatible terminals;
// main follows the message with the style-reset sequence "\x1b[0m"
const errorStyle = "\x1b[31m"
  74 
  75 func main() {
  76     if len(os.Args) > 1 {
  77         switch os.Args[1] {
  78         case `-h`, `--h`, `-help`, `--help`:
  79             os.Stderr.WriteString(info[1:])
  80             return
  81         }
  82     }
  83 
  84     if err := run(os.Stdout, os.Args[1:]); isActualError(err) {
  85         os.Stderr.WriteString(errorStyle)
  86         os.Stderr.WriteString(err.Error())
  87         os.Stderr.WriteString("\x1b[0m\n")
  88         os.Exit(1)
  89     }
  90 }
  91 
  92 func run(w io.Writer, args []string) error {
  93     bw := bufio.NewWriter(w)
  94     defer bw.Flush()
  95 
  96     for _, path := range args {
  97         if err := handleFile(bw, path); err != nil {
  98             return err
  99         }
 100     }
 101 
 102     if len(args) == 0 {
 103         return utfate(bw, os.Stdin)
 104     }
 105     return nil
 106 }
 107 
 108 func handleFile(w *bufio.Writer, name string) error {
 109     if name == `-` {
 110         return utfate(w, os.Stdin)
 111     }
 112 
 113     f, err := os.Open(name)
 114     if err != nil {
 115         return errors.New(`can't read from file named "` + name + `"`)
 116     }
 117     defer f.Close()
 118 
 119     return utfate(w, f)
 120 }
 121 
 122 // isActualError is to figure out whether not to ignore an error, and thus
 123 // show it as an error message
 124 func isActualError(err error) bool {
 125     return err != nil && err != io.EOF && err != errNoMoreOutput
 126 }
 127 
 128 func utfate(w io.Writer, r io.Reader) error {
 129     br := bufio.NewReader(r)
 130     bw := bufio.NewWriter(w)
 131     defer bw.Flush()
 132 
 133     lead, err := br.Peek(4)
 134     if err != nil {
 135         return err
 136     }
 137 
 138     if bytes.HasPrefix(lead, []byte{'\x00', '\x00', '\xfe', '\xff'}) {
 139         br.Discard(4)
 140         return utf32toUTF8(bw, br, binary.BigEndian)
 141     }
 142 
 143     if bytes.HasPrefix(lead, []byte{'\xff', '\xfe', '\x00', '\x00'}) {
 144         br.Discard(4)
 145         return utf32toUTF8(bw, br, binary.LittleEndian)
 146     }
 147 
 148     if bytes.HasPrefix(lead, []byte{'\xfe', '\xff'}) {
 149         br.Discard(2)
 150         return utf16toUTF8(bw, br, readBytePairBE)
 151     }
 152 
 153     if bytes.HasPrefix(lead, []byte{'\xff', '\xfe'}) {
 154         br.Discard(2)
 155         return utf16toUTF8(bw, br, readBytePairLE)
 156     }
 157 
 158     if bytes.HasPrefix(lead, []byte{'\xef', '\xbb', '\xbf'}) {
 159         br.Discard(3)
 160         return handleUTF8(bw, br)
 161     }
 162 
 163     return handleUTF8(bw, br)
 164 }
 165 
 166 func leadASCII(buf []byte) int {
 167     for i, b := range buf {
 168         if b < 128 {
 169             continue
 170         }
 171         return i
 172     }
 173 
 174     return len(buf)
 175 }
 176 
 177 func handleUTF8(w *bufio.Writer, r *bufio.Reader) error {
 178     for {
 179         c, _, err := r.ReadRune()
 180         if c == unicode.ReplacementChar {
 181             return errors.New(`invalid UTF-8 stream`)
 182         }
 183 
 184         if err != nil {
 185             if err == io.EOF {
 186                 return nil
 187             }
 188             return err
 189         }
 190 
 191         _, err = w.WriteRune(c)
 192         if err != nil {
 193             return errNoMoreOutput
 194         }
 195     }
 196 }
 197 
 198 func fancyHandleUTF8(w *bufio.Writer, r *bufio.Reader) error {
 199     lookahead := 1
 200     maxAhead := r.Size() / 2
 201 
 202     for {
 203         // look ahead to check for ASCII runs
 204         ahead, err := r.Peek(lookahead)
 205         if err == io.EOF {
 206             return nil
 207         }
 208         if err != nil {
 209             return err
 210         }
 211 
 212         // copy leading ASCII runs
 213         n := leadASCII(ahead)
 214         if n > 0 {
 215             w.Write(ahead[:n])
 216             r.Discard(n)
 217         }
 218 
 219         // adapt lookahead size
 220         if n == len(ahead) && lookahead < maxAhead {
 221             lookahead *= 2
 222         } else if lookahead > 1 {
 223             lookahead /= 2
 224         }
 225 
 226         if n == len(ahead) {
 227             continue
 228         }
 229 
 230         c, _, err := r.ReadRune()
 231         if c == unicode.ReplacementChar {
 232             return errors.New(`invalid UTF-8 stream`)
 233         }
 234         if err == io.EOF {
 235             return nil
 236         }
 237         if err != nil {
 238             return err
 239         }
 240 
 241         _, err = w.WriteRune(c)
 242         if err != nil {
 243             return errNoMoreOutput
 244         }
 245     }
 246 }
 247 
// readPairFunc is the type of funcs which read the next pair of bytes from a
// buffered reader, such as readBytePairBE and readBytePairLE: func utf16toUTF8
// uses the first byte returned as the high-order byte of a 16-bit value, and
// the second one as its low-order byte; naming this type also narrows
// source-code lines below
type readPairFunc func(*bufio.Reader) (byte, byte, error)
 250 
 251 // utf16toUTF8 handles UTF-16 inputs for func utfate
 252 func utf16toUTF8(w *bufio.Writer, r *bufio.Reader, read2 readPairFunc) error {
 253     for {
 254         a, b, err := read2(r)
 255         if err == io.EOF {
 256             return nil
 257         }
 258         if err != nil {
 259             return err
 260         }
 261 
 262         c := rune(256*int(a) + int(b))
 263         if utf16.IsSurrogate(c) {
 264             a, b, err := read2(r)
 265             if err == io.EOF {
 266                 return nil
 267             }
 268             if err != nil {
 269                 return err
 270             }
 271 
 272             next := rune(256*int(a) + int(b))
 273             c = utf16.DecodeRune(c, next)
 274         }
 275 
 276         _, err = w.WriteRune(c)
 277         if err != nil {
 278             return errNoMoreOutput
 279         }
 280     }
 281 }
 282 
 283 // readBytePairBE gets you a pair of bytes in big-endian (original) order
 284 func readBytePairBE(br *bufio.Reader) (byte, byte, error) {
 285     a, err := br.ReadByte()
 286     if err != nil {
 287         return a, 0, err
 288     }
 289 
 290     b, err := br.ReadByte()
 291     return a, b, err
 292 }
 293 
 294 // readBytePairLE gets you a pair of bytes in little-endian order
 295 func readBytePairLE(br *bufio.Reader) (byte, byte, error) {
 296     a, b, err := readBytePairBE(br)
 297     return b, a, err
 298 }
 299 
 300 // utf32toUTF8 handles UTF-32 inputs for func utfate
 301 func utf32toUTF8(w *bufio.Writer, r *bufio.Reader, o binary.ByteOrder) error {
 302     var n uint32
 303     for {
 304         err := binary.Read(r, o, &n)
 305         if err == io.EOF {
 306             return nil
 307         }
 308         if err != nil {
 309             return err
 310         }
 311 
 312         _, err = w.WriteRune(rune(n))
 313         if err != nil {
 314             return errNoMoreOutput
 315         }
 316     }
 317 }