File: utfate.go
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 Single-file source-code for utfate.
  27 
  28 To compile a smaller-sized command-line app, you can use the `go` command as
  29 follows:
  30 
  31 go build -ldflags "-s -w" -trimpath utfate.go
  32 */
  33 
  34 package main
  35 
  36 import (
  37     "bufio"
  38     "bytes"
  39     "encoding/binary"
  40     "errors"
  41     "io"
  42     "os"
  43     "unicode"
  44     "unicode/utf16"
  45 )
  46 
  47 // Note: the code is avoiding using the fmt package to save hundreds of
  48 // kilobytes on the resulting executable, which is a noticeable difference.
  49 
// info is the help message func main shows for the help options; it starts
// with a newline (to keep the raw-string literal readable), which func main
// skips by emitting info[1:]
const info = `
utfate [options...] [file...]

This app turns plain-text input into UTF-8. Supported input formats are

    - ASCII
    - UTF-8
    - UTF-8 with a leading BOM
    - UTF-16 BE
    - UTF-16 LE
    - UTF-32 BE
    - UTF-32 LE

All (optional) leading options start with either single or double-dash:

    -h          show this help message
    -help       show this help message
`
  68 
// errNoMoreOutput is a sentinel error the converters return when writing
// their output fails; its message is never shown, since func isActualError
// ignores it, which lets the app quit right away without reporting a failure
var errNoMoreOutput = errors.New(`no more output`)

// errorStyle is the ANSI escape sequence (SGR 31, red foreground) func main
// emits right before an error message; func main resets styles after it
const errorStyle = "\x1b[31m"
  74 
  75 func main() {
  76     if len(os.Args) > 1 {
  77         switch os.Args[1] {
  78         case `-h`, `--h`, `-help`, `--help`:
  79             os.Stderr.WriteString(info[1:])
  80             return
  81         }
  82     }
  83 
  84     if err := run(os.Stdout, os.Args[1:]); isActualError(err) {
  85         os.Stderr.WriteString(errorStyle)
  86         os.Stderr.WriteString(err.Error())
  87         os.Stderr.WriteString("\x1b[0m\n")
  88         os.Exit(1)
  89     }
  90 }
  91 
  92 func run(w io.Writer, args []string) error {
  93     bw := bufio.NewWriter(w)
  94     defer bw.Flush()
  95 
  96     for _, path := range args {
  97         if err := handleFile(bw, path); err != nil {
  98             return err
  99         }
 100     }
 101 
 102     if len(args) == 0 {
 103         return utfate(bw, os.Stdin)
 104     }
 105     return nil
 106 }
 107 
 108 func handleFile(w *bufio.Writer, name string) error {
 109     if name == `-` {
 110         return utfate(w, os.Stdin)
 111     }
 112 
 113     f, err := os.Open(name)
 114     if err != nil {
 115         return errors.New(`can't read from file named "` + name + `"`)
 116     }
 117     defer f.Close()
 118 
 119     return utfate(w, f)
 120 }
 121 
 122 // isActualError is to figure out whether not to ignore an error, and thus
 123 // show it as an error message
 124 func isActualError(err error) bool {
 125     return err != nil && err != io.EOF && err != errNoMoreOutput
 126 }
 127 
 128 func utfate(w io.Writer, r io.Reader) error {
 129     br := bufio.NewReader(r)
 130     bw := bufio.NewWriter(w)
 131     defer bw.Flush()
 132 
 133     lead, err := br.Peek(4)
 134     if err != nil && err != io.EOF {
 135         return err
 136     }
 137 
 138     if bytes.HasPrefix(lead, []byte{'\x00', '\x00', '\xfe', '\xff'}) {
 139         br.Discard(4)
 140         return utf32toUTF8(bw, br, binary.BigEndian)
 141     }
 142 
 143     if bytes.HasPrefix(lead, []byte{'\xff', '\xfe', '\x00', '\x00'}) {
 144         br.Discard(4)
 145         return utf32toUTF8(bw, br, binary.LittleEndian)
 146     }
 147 
 148     if bytes.HasPrefix(lead, []byte{'\xfe', '\xff'}) {
 149         br.Discard(2)
 150         return utf16toUTF8(bw, br, readBytePairBE)
 151     }
 152 
 153     if bytes.HasPrefix(lead, []byte{'\xff', '\xfe'}) {
 154         br.Discard(2)
 155         return utf16toUTF8(bw, br, readBytePairLE)
 156     }
 157 
 158     if bytes.HasPrefix(lead, []byte{'\xef', '\xbb', '\xbf'}) {
 159         br.Discard(3)
 160         return handleUTF8(bw, br)
 161     }
 162 
 163     return handleUTF8(bw, br)
 164 }
 165 
 166 func handleUTF8(w *bufio.Writer, r *bufio.Reader) error {
 167     for {
 168         c, _, err := r.ReadRune()
 169         if c == unicode.ReplacementChar {
 170             return errors.New(`invalid UTF-8 stream`)
 171         }
 172         if err == io.EOF {
 173             return nil
 174         }
 175         if err != nil {
 176             return err
 177         }
 178 
 179         if _, err := w.WriteRune(c); err != nil {
 180             return errNoMoreOutput
 181         }
 182     }
 183 }
 184 
 185 // fancyHandleUTF8 is kept only for reference, as its attempts at being clever
 186 // don't seem to speed things up much when given ASCII input
 187 func fancyHandleUTF8(w *bufio.Writer, r *bufio.Reader) error {
 188     lookahead := 1
 189     maxAhead := r.Size() / 2
 190 
 191     for {
 192         // look ahead to check for ASCII runs
 193         ahead, err := r.Peek(lookahead)
 194         if err == io.EOF {
 195             return nil
 196         }
 197         if err != nil {
 198             return err
 199         }
 200 
 201         // copy leading ASCII runs
 202         n := leadASCII(ahead)
 203         if n > 0 {
 204             w.Write(ahead[:n])
 205             r.Discard(n)
 206         }
 207 
 208         // adapt lookahead size
 209         if n == len(ahead) && lookahead < maxAhead {
 210             lookahead *= 2
 211         } else if lookahead > 1 {
 212             lookahead /= 2
 213         }
 214 
 215         if n == len(ahead) {
 216             continue
 217         }
 218 
 219         c, _, err := r.ReadRune()
 220         if c == unicode.ReplacementChar {
 221             return errors.New(`invalid UTF-8 stream`)
 222         }
 223         if err == io.EOF {
 224             return nil
 225         }
 226         if err != nil {
 227             return err
 228         }
 229 
 230         if _, err := w.WriteRune(c); err != nil {
 231             return errNoMoreOutput
 232         }
 233     }
 234 }
 235 
 236 // leadASCII is used by func fancyHandleUTF8
 237 func leadASCII(buf []byte) int {
 238     for i, b := range buf {
 239         if b >= 128 {
 240             return i
 241         }
 242     }
 243     return len(buf)
 244 }
 245 
// readPairFunc is the type of the byte-pair readers (funcs readBytePairBE and
// readBytePairLE) given to func utf16toUTF8: naming it narrows source-code
// lines below
type readPairFunc func(*bufio.Reader) (byte, byte, error)
 248 
 249 // utf16toUTF8 handles UTF-16 inputs for func utfate
 250 func utf16toUTF8(w *bufio.Writer, r *bufio.Reader, read2 readPairFunc) error {
 251     for {
 252         a, b, err := read2(r)
 253         if err == io.EOF {
 254             return nil
 255         }
 256         if err != nil {
 257             return err
 258         }
 259 
 260         c := rune(256*int(a) + int(b))
 261         if utf16.IsSurrogate(c) {
 262             a, b, err := read2(r)
 263             if err == io.EOF {
 264                 return nil
 265             }
 266             if err != nil {
 267                 return err
 268             }
 269 
 270             next := rune(256*int(a) + int(b))
 271             c = utf16.DecodeRune(c, next)
 272         }
 273 
 274         if _, err := w.WriteRune(c); err != nil {
 275             return errNoMoreOutput
 276         }
 277     }
 278 }
 279 
 280 // readBytePairBE gets you a pair of bytes in big-endian (original) order
 281 func readBytePairBE(br *bufio.Reader) (byte, byte, error) {
 282     a, err := br.ReadByte()
 283     if err != nil {
 284         return a, 0, err
 285     }
 286 
 287     b, err := br.ReadByte()
 288     return a, b, err
 289 }
 290 
 291 // readBytePairLE gets you a pair of bytes in little-endian order
 292 func readBytePairLE(br *bufio.Reader) (byte, byte, error) {
 293     a, b, err := readBytePairBE(br)
 294     return b, a, err
 295 }
 296 
 297 // utf32toUTF8 handles UTF-32 inputs for func utfate
 298 func utf32toUTF8(w *bufio.Writer, r *bufio.Reader, o binary.ByteOrder) error {
 299     var n uint32
 300     for {
 301         err := binary.Read(r, o, &n)
 302         if err == io.EOF {
 303             return nil
 304         }
 305         if err != nil {
 306             return err
 307         }
 308 
 309         if _, err := w.WriteRune(rune(n)); err != nil {
 310             return errNoMoreOutput
 311         }
 312     }
 313 }