File: utfate.go 1 /* 2 The MIT License (MIT) 3 4 Copyright © 2025 pacman64 5 6 Permission is hereby granted, free of charge, to any person obtaining a copy of 7 this software and associated documentation files (the “Software”), to deal 8 in the Software without restriction, including without limitation the rights to 9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 of the Software, and to permit persons to whom the Software is furnished to do 11 so, subject to the following conditions: 12 13 The above copyright notice and this permission notice shall be included in all 14 copies or substantial portions of the Software. 15 16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 SOFTWARE. 23 */ 24 25 /* 26 To compile a smaller-sized command-line app, you can use the `go` command as 27 follows: 28 29 go build -ldflags "-s -w" -trimpath utfate.go 30 */ 31 32 package main 33 34 import ( 35 "bufio" 36 "bytes" 37 "encoding/binary" 38 "errors" 39 "io" 40 "os" 41 "unicode" 42 "unicode/utf16" 43 ) 44 45 // Note: the code is avoiding using the fmt package to save hundreds of 46 // kilobytes on the resulting executable, which is a noticeable difference. 47 48 const info = ` 49 utfate [options...] [file...] 50 51 This app turns plain-text input into UTF-8. Supported input formats are 52 53 - ASCII 54 - UTF-8 55 - UTF-8 with a leading BOM 56 - UTF-16 BE 57 - UTF-16 LE 58 - UTF-32 BE 59 - UTF-32 LE 60 61 All (optional) leading options start with either single or double-dash: 62 63 -h show this help message 64 -help show this help message 65 ` 66 67 // errNoMoreOutput is a dummy error whose message is ignored, and which 68 // causes the app to quit immediately and successfully 69 var errNoMoreOutput = errors.New(`no more output`) 70 71 func main() { 72 if len(os.Args) > 1 { 73 switch os.Args[1] { 74 case `-h`, `--h`, `-help`, `--help`: 75 os.Stderr.WriteString(info[1:]) 76 return 77 } 78 } 79 80 if err := run(os.Stdout, os.Args[1:]); isActualError(err) { 81 os.Stderr.WriteString(err.Error()) 82 os.Stderr.WriteString("\n") 83 os.Exit(1) 84 } 85 } 86 87 func run(w io.Writer, args []string) error { 88 bw := bufio.NewWriter(w) 89 defer bw.Flush() 90 91 for _, path := range args { 92 if err := handleFile(bw, path); err != nil { 93 return err 94 } 95 } 96 97 if len(args) == 0 { 98 return utfate(bw, os.Stdin) 99 } 100 return nil 101 } 102 103 func handleFile(w *bufio.Writer, name string) error { 104 if name == `-` { 105 return utfate(w, os.Stdin) 106 } 107 108 f, err := os.Open(name) 109 if err != nil { 110 return errors.New(`can't read from file named "` + name + `"`) 111 } 112 defer f.Close() 113 114 return utfate(w, f) 115 } 116 117 // isActualError is to figure out whether not to ignore an error, and thus 118 // show it as an error message 119 func isActualError(err error) bool { 120 return err != nil && err != io.EOF && err != errNoMoreOutput 121 } 122 123 func utfate(w io.Writer, r io.Reader) error { 124 br := bufio.NewReader(r) 125 bw := bufio.NewWriter(w) 126 defer bw.Flush() 127 128 lead, err := br.Peek(4) 129 if err != nil && err != io.EOF { 130 return err 131 } 132 133 if bytes.HasPrefix(lead, []byte{'\x00', '\x00', '\xfe', '\xff'}) { 134 br.Discard(4) 135 return utf32toUTF8(bw, br, binary.BigEndian) 136 } 137 138 if bytes.HasPrefix(lead, []byte{'\xff', '\xfe', '\x00', '\x00'}) { 139 br.Discard(4) 140 return utf32toUTF8(bw, br, binary.LittleEndian) 141 } 142 143 if bytes.HasPrefix(lead, []byte{'\xfe', '\xff'}) { 144 br.Discard(2) 145 return utf16toUTF8(bw, br, readBytePairBE) 146 } 147 148 if bytes.HasPrefix(lead, []byte{'\xff', '\xfe'}) { 149 br.Discard(2) 150 return utf16toUTF8(bw, br, readBytePairLE) 151 } 152 153 if bytes.HasPrefix(lead, []byte{'\xef', '\xbb', '\xbf'}) { 154 br.Discard(3) 155 return handleUTF8(bw, br) 156 } 157 158 return handleUTF8(bw, br) 159 } 160 161 func handleUTF8(w *bufio.Writer, r *bufio.Reader) error { 162 for { 163 c, _, err := r.ReadRune() 164 if c == unicode.ReplacementChar { 165 return errors.New(`invalid UTF-8 stream`) 166 } 167 if err == io.EOF { 168 return nil 169 } 170 if err != nil { 171 return err 172 } 173 174 if _, err := w.WriteRune(c); err != nil { 175 return errNoMoreOutput 176 } 177 } 178 } 179 180 // fancyHandleUTF8 is kept only for reference, as its attempts at being clever 181 // don't seem to speed things up much when given ASCII input 182 func fancyHandleUTF8(w *bufio.Writer, r *bufio.Reader) error { 183 lookahead := 1 184 maxAhead := r.Size() / 2 185 186 for { 187 // look ahead to check for ASCII runs 188 ahead, err := r.Peek(lookahead) 189 if err == io.EOF { 190 return nil 191 } 192 if err != nil { 193 return err 194 } 195 196 // copy leading ASCII runs 197 n := leadASCII(ahead) 198 if n > 0 { 199 w.Write(ahead[:n]) 200 r.Discard(n) 201 } 202 203 // adapt lookahead size 204 if n == len(ahead) && lookahead < maxAhead { 205 lookahead *= 2 206 } else if lookahead > 1 { 207 lookahead /= 2 208 } 209 210 if n == len(ahead) { 211 continue 212 } 213 214 c, _, err := r.ReadRune() 215 if c == unicode.ReplacementChar { 216 return errors.New(`invalid UTF-8 stream`) 217 } 218 if err == io.EOF { 219 return nil 220 } 221 if err != nil { 222 return err 223 } 224 225 if _, err := w.WriteRune(c); err != nil { 226 return errNoMoreOutput 227 } 228 } 229 } 230 231 // leadASCII is used by func fancyHandleUTF8 232 func leadASCII(buf []byte) int { 233 for i, b := range buf { 234 if b >= 128 { 235 return i 236 } 237 } 238 return len(buf) 239 } 240 241 // readPairFunc narrows source-code lines below 242 type readPairFunc func(*bufio.Reader) (byte, byte, error) 243 244 // utf16toUTF8 handles UTF-16 inputs for func utfate 245 func utf16toUTF8(w *bufio.Writer, r *bufio.Reader, read2 readPairFunc) error { 246 for { 247 a, b, err := read2(r) 248 if err == io.EOF { 249 return nil 250 } 251 if err != nil { 252 return err 253 } 254 255 c := rune(256*int(a) + int(b)) 256 if utf16.IsSurrogate(c) { 257 a, b, err := read2(r) 258 if err == io.EOF { 259 return nil 260 } 261 if err != nil { 262 return err 263 } 264 265 next := rune(256*int(a) + int(b)) 266 c = utf16.DecodeRune(c, next) 267 } 268 269 if _, err := w.WriteRune(c); err != nil { 270 return errNoMoreOutput 271 } 272 } 273 } 274 275 // readBytePairBE gets you a pair of bytes in big-endian (original) order 276 func readBytePairBE(br *bufio.Reader) (byte, byte, error) { 277 a, err := br.ReadByte() 278 if err != nil { 279 return a, 0, err 280 } 281 282 b, err := br.ReadByte() 283 return a, b, err 284 } 285 286 // readBytePairLE gets you a pair of bytes in little-endian order 287 func readBytePairLE(br *bufio.Reader) (byte, byte, error) { 288 a, b, err := readBytePairBE(br) 289 return b, a, err 290 } 291 292 // utf32toUTF8 handles UTF-32 inputs for func utfate 293 func utf32toUTF8(w *bufio.Writer, r *bufio.Reader, o binary.ByteOrder) error { 294 var n uint32 295 for { 296 err := binary.Read(r, o, &n) 297 if err == io.EOF { 298 return nil 299 } 300 if err != nil { 301 return err 302 } 303 304 if _, err := w.WriteRune(rune(n)); err != nil { 305 return errNoMoreOutput 306 } 307 } 308 }