// File: utfate.go

/*
The MIT License (MIT)

Copyright (c) 2026 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
To compile a smaller-sized command-line app, you can use the `go` command as
follows:

go build -ldflags "-s -w" -trimpath utfate.go
*/

package main

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"errors"
	"io"
	"os"
	"unicode"
	"unicode/utf16"
)

// Note: the code is avoiding using the fmt package to save hundreds of
// kilobytes on the resulting executable, which is a noticeable difference.

// info is the help message: its leading line-feed is skipped when shown
const info = `
utfate [options...] [file...]

This app turns plain-text input into UTF-8. Supported input formats are

    - ASCII
    - UTF-8
    - UTF-8 with a leading BOM
    - UTF-16 BE
    - UTF-16 LE
    - UTF-32 BE
    - UTF-32 LE

All (optional) leading options start with either single or double-dash:

    -h       show this help message
    -help    show this help message
`

// errInvalidUTF8 is the error given when input bytes aren't valid UTF-8
var errInvalidUTF8 = errors.New(`invalid UTF-8 stream`)

// main shows the help message when the first argument asks for it; otherwise
// it converts all inputs named in the arguments, quitting with exit-code 1
// after showing the error message on stderr, when something goes wrong
func main() {
	if len(os.Args) > 1 {
		switch os.Args[1] {
		case `-h`, `--h`, `-help`, `--help`:
			os.Stdout.WriteString(info[1:])
			return
		}
	}

	// io.EOF is what the converters return when the output can't be written
	// to anymore: that case is deliberately not reported as a failure
	if err := run(os.Stdout, os.Args[1:]); err != nil && err != io.EOF {
		os.Stderr.WriteString(err.Error())
		os.Stderr.WriteString("\n")
		os.Exit(1)
	}
}

// run converts all the files named into UTF-8, in the order given, stopping
// at the first error; when no names are given, standard input is used instead
func run(w io.Writer, args []string) error {
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	if len(args) == 0 {
		return utfate(bw, os.Stdin)
	}

	for _, path := range args {
		if err := handleFile(bw, path); err != nil {
			return err
		}
	}
	return nil
}

// handleFile converts the file named into UTF-8: a single dash stands for
// the standard input
func handleFile(w *bufio.Writer, name string) error {
	if name == `-` {
		return utfate(w, os.Stdin)
	}

	f, err := os.Open(name)
	if err != nil {
		return errors.New(`can't read from file named "` + name + `"`)
	}
	defer f.Close()

	return utfate(w, f)
}

// utfate converts a whole input stream into UTF-8, picking the decoder from
// the byte-order mark (BOM) leading the input: inputs without a BOM are
// handled as UTF-8, which also covers plain ASCII
func utfate(w io.Writer, r io.Reader) error {
	br := bufio.NewReader(r)
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	// inputs shorter than 4 bytes give io.EOF along with the bytes available
	lead, err := br.Peek(4)
	if err != nil && err != io.EOF {
		return err
	}

	// the order of the checks matters: the UTF-32 LE BOM starts with the
	// UTF-16 LE one, so it must be checked first; the discards can't fail,
	// since all bytes being skipped were successfully peeked
	switch {
	case bytes.HasPrefix(lead, []byte{'\x00', '\x00', '\xfe', '\xff'}):
		br.Discard(4)
		return utf32toUTF8(bw, br, binary.BigEndian)

	case bytes.HasPrefix(lead, []byte{'\xff', '\xfe', '\x00', '\x00'}):
		br.Discard(4)
		return utf32toUTF8(bw, br, binary.LittleEndian)

	case bytes.HasPrefix(lead, []byte{'\xfe', '\xff'}):
		br.Discard(2)
		return utf16toUTF8(bw, br, readBytePairBE)

	case bytes.HasPrefix(lead, []byte{'\xff', '\xfe'}):
		br.Discard(2)
		return utf16toUTF8(bw, br, readBytePairLE)

	case bytes.HasPrefix(lead, []byte{'\xef', '\xbb', '\xbf'}):
		// the UTF-8 BOM isn't part of the output
		br.Discard(3)
		return handleUTF8(bw, br)

	default:
		return handleUTF8(bw, br)
	}
}

// handleUTF8 copies UTF-8 input rune by rune, failing on the first invalid
// byte sequence found; write errors are turned into io.EOF, which func main
// doesn't report as a failure
func handleUTF8(w *bufio.Writer, r *bufio.Reader) error {
	for {
		c, size, err := r.ReadRune()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		// invalid bytes decode as a 1-byte replacement rune, while a validly
		// encoded U+FFFD takes 3 bytes, and is copied like any other rune
		if c == unicode.ReplacementChar && size == 1 {
			return errInvalidUTF8
		}

		if _, err := w.WriteRune(c); err != nil {
			return io.EOF
		}
	}
}

// fancyHandleUTF8 is kept only for reference, as its attempts at being clever
// don't seem to speed things up much when given ASCII input
func fancyHandleUTF8(w *bufio.Writer, r *bufio.Reader) error {
	lookahead := 1
	maxAhead := r.Size() / 2

	for {
		// look ahead to check for ASCII runs: near the end of the input,
		// fewer bytes than asked for can come along with io.EOF, and those
		// still need handling
		ahead, err := r.Peek(lookahead)
		if err != nil && err != io.EOF {
			return err
		}
		if len(ahead) == 0 {
			return nil
		}

		// copy leading ASCII runs
		n := leadASCII(ahead)
		if n > 0 {
			if _, err := w.Write(ahead[:n]); err != nil {
				return io.EOF
			}
			r.Discard(n)
		}

		// adapt lookahead size
		allASCII := n == len(ahead)
		if allASCII && lookahead < maxAhead {
			lookahead *= 2
		} else if lookahead > 1 {
			lookahead /= 2
		}

		if allASCII {
			continue
		}

		// the next byte isn't ASCII, so decode a whole rune
		c, size, err := r.ReadRune()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		// only 1-byte replacement runes stand for invalid input bytes
		if c == unicode.ReplacementChar && size == 1 {
			return errInvalidUTF8
		}

		if _, err := w.WriteRune(c); err != nil {
			return io.EOF
		}
	}
}

// leadASCII is used by func fancyHandleUTF8: it counts how many ASCII bytes
// start the slice given
func leadASCII(buf []byte) int {
	for i, b := range buf {
		if b >= 128 {
			return i
		}
	}
	return len(buf)
}

// readPairFunc narrows source-code lines below
type readPairFunc func(*bufio.Reader) (byte, byte, error)

// readUnit16 gets the next 16-bit code unit, using the byte-pair reader
// given, which is expected to return the most-significant byte first
func readUnit16(r *bufio.Reader, read2 readPairFunc) (rune, error) {
	hi, lo, err := read2(r)
	if err != nil {
		return 0, err
	}
	return rune(hi)<<8 | rune(lo), nil
}

// utf16toUTF8 handles UTF-16 inputs for func utfate: unpaired surrogates
// become replacement runes (U+FFFD), without losing the code unit which
// follows them; write errors are turned into io.EOF, which func main
// doesn't report as a failure
func utf16toUTF8(w *bufio.Writer, r *bufio.Reader, read2 readPairFunc) error {
	var (
		pending rune // code unit read ahead, when checking a surrogate pair
		held    bool // whether pending has a code unit still to be handled
	)

	for {
		c := pending
		if held {
			held = false
		} else {
			unit, err := readUnit16(r, read2)
			if err == io.EOF {
				return nil
			}
			if err != nil {
				return err
			}
			c = unit
		}

		if utf16.IsSurrogate(c) {
			first := c
			// stays a replacement rune unless a valid pair is found
			c = unicode.ReplacementChar

			next, err := readUnit16(r, read2)
			if err != nil && err != io.EOF {
				return err
			}

			// on io.EOF the surrogate is dangling: the replacement rune is
			// emitted, and the next read attempt ends the loop
			if err == nil {
				pair := utf16.DecodeRune(first, next)
				if pair != unicode.ReplacementChar {
					c = pair
				} else {
					// not a valid pair: the second unit is handled on its
					// own by the next iteration, instead of being dropped
					pending, held = next, true
				}
			}
		}

		if _, err := w.WriteRune(c); err != nil {
			return io.EOF
		}
	}
}

// readBytePairBE gets you a pair of bytes in big-endian (original) order
func readBytePairBE(br *bufio.Reader) (byte, byte, error) {
	first, err := br.ReadByte()
	if err != nil {
		return first, 0, err
	}

	second, err := br.ReadByte()
	return first, second, err
}

// readBytePairLE gets you a pair of bytes in little-endian order
func readBytePairLE(br *bufio.Reader) (byte, byte, error) {
	lo, hi, err := readBytePairBE(br)
	return hi, lo, err
}

// utf32toUTF8 handles UTF-32 inputs for func utfate: inputs ending in the
// middle of a 4-byte value result in io.ErrUnexpectedEOF, while write errors
// are turned into io.EOF, which func main doesn't report as a failure
func utf32toUTF8(w *bufio.Writer, r *bufio.Reader, o binary.ByteOrder) error {
	// reusing the same buffer avoids allocating on each value read
	var quad [4]byte

	for {
		_, err := io.ReadFull(r, quad[:])
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		if _, err := w.WriteRune(rune(o.Uint32(quad[:]))); err != nil {
			return io.EOF
		}
	}
}