// File: utfate.go

/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
Single-file source-code for utfate.

To compile a smaller-sized command-line app, you can use the `go` command as
follows:

go build -ldflags "-s -w" -trimpath utfate.go
*/

package main

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"errors"
	"io"
	"os"
	"unicode"
	"unicode/utf16"
)

// Note: the code is avoiding using the fmt package to save hundreds of
// kilobytes on the resulting executable, which is a noticeable difference.

const info = `
utfate [options...] [file...]

This app turns plain-text input into UTF-8. Supported input formats are

    - ASCII
    - UTF-8
    - UTF-8 with a leading BOM
    - UTF-16 BE
    - UTF-16 LE
    - UTF-32 BE
    - UTF-32 LE

All (optional) leading options start with either single or double-dash:

    -h          show this help message
    -help       show this help message
`

// errNoMoreOutput is a dummy error, whose message is ignored, and which
// causes the app to quit immediately and successfully
var errNoMoreOutput = errors.New(`no more output`)

// errInvalidUTF8 is returned when the input has bytes which aren't part of
// a valid UTF-8 sequence
var errInvalidUTF8 = errors.New(`invalid UTF-8 stream`)

// errorStyle is the ANSI escape sequence which starts red-colored text
const errorStyle = "\x1b[31m"

// byte-order marks recognized at the start of inputs: func utfate checks
// the 4-byte ones first, since the UTF-32 LE mark starts with the UTF-16 LE
// one
var (
	bomUTF32BE = []byte{0x00, 0x00, 0xfe, 0xff}
	bomUTF32LE = []byte{0xff, 0xfe, 0x00, 0x00}
	bomUTF16BE = []byte{0xfe, 0xff}
	bomUTF16LE = []byte{0xff, 0xfe}
	bomUTF8    = []byte{0xef, 0xbb, 0xbf}
)

// main shows the help message when the first argument asks for it;
// otherwise it converts all inputs, showing any actual error on stderr and
// quitting with exit code 1
func main() {
	if len(os.Args) > 1 {
		switch os.Args[1] {
		case `-h`, `--h`, `-help`, `--help`:
			// skip the leading line-feed in the help message
			os.Stderr.WriteString(info[1:])
			return
		}
	}

	if err := run(os.Stdout, os.Args[1:]); isActualError(err) {
		os.Stderr.WriteString(errorStyle)
		os.Stderr.WriteString(err.Error())
		os.Stderr.WriteString("\x1b[0m\n")
		os.Exit(1)
	}
}

// run converts all the files named into UTF-8, in order, stopping at the
// first error; when no names are given, it converts standard input instead
func run(w io.Writer, args []string) error {
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	for _, path := range args {
		if err := handleFile(bw, path); err != nil {
			return err
		}
	}

	if len(args) == 0 {
		return utfate(bw, os.Stdin)
	}
	return nil
}

// handleFile converts the file named into UTF-8: a single dash as the name
// stands for standard input
func handleFile(w *bufio.Writer, name string) error {
	if name == `-` {
		return utfate(w, os.Stdin)
	}

	f, err := os.Open(name)
	if err != nil {
		return errors.New(`can't read from file named "` + name + `"`)
	}
	defer f.Close()

	return utfate(w, f)
}

// isActualError figures out whether an error should be shown as an error
// message, instead of being ignored
func isActualError(err error) bool {
	if err == nil {
		return false
	}
	return !errors.Is(err, io.EOF) && !errors.Is(err, errNoMoreOutput)
}

// utfate converts a whole input into UTF-8, picking the input encoding from
// its leading byte-order mark, if any: inputs without one are handled as
// UTF-8, which also covers plain ASCII
func utfate(w io.Writer, r io.Reader) error {
	br := bufio.NewReader(r)
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	// inputs shorter than 4 bytes make Peek return io.EOF along with the
	// bytes available: those bytes must still be checked and converted,
	// instead of being dropped along with the error
	lead, err := br.Peek(4)
	if err != nil && !errors.Is(err, io.EOF) {
		return err
	}

	switch {
	case bytes.HasPrefix(lead, bomUTF32BE):
		br.Discard(len(bomUTF32BE))
		return utf32toUTF8(bw, br, binary.BigEndian)

	case bytes.HasPrefix(lead, bomUTF32LE):
		br.Discard(len(bomUTF32LE))
		return utf32toUTF8(bw, br, binary.LittleEndian)

	case bytes.HasPrefix(lead, bomUTF16BE):
		br.Discard(len(bomUTF16BE))
		return utf16toUTF8(bw, br, readBytePairBE)

	case bytes.HasPrefix(lead, bomUTF16LE):
		br.Discard(len(bomUTF16LE))
		return utf16toUTF8(bw, br, readBytePairLE)

	case bytes.HasPrefix(lead, bomUTF8):
		br.Discard(len(bomUTF8))
		return handleUTF8(bw, br)

	default:
		return handleUTF8(bw, br)
	}
}

// leadASCII counts how many bytes at the start of the slice given are ASCII
func leadASCII(buf []byte) int {
	for i, b := range buf {
		if b >= 128 {
			return i
		}
	}
	return len(buf)
}

// handleUTF8 copies UTF-8 input rune by rune, failing on the first invalid
// byte sequence: failed writes result in errNoMoreOutput
func handleUTF8(w *bufio.Writer, r *bufio.Reader) error {
	for {
		c, size, err := r.ReadRune()
		if err != nil {
			if errors.Is(err, io.EOF) {
				return nil
			}
			return err
		}

		// only a replacement char of size 1 means the bytes were invalid:
		// a properly-encoded U+FFFD is 3 bytes long, and is valid input
		if c == unicode.ReplacementChar && size == 1 {
			return errInvalidUTF8
		}

		if _, err := w.WriteRune(c); err != nil {
			return errNoMoreOutput
		}
	}
}

// fancyHandleUTF8 is like handleUTF8, except it copies runs of ASCII bytes
// in bulk, using a lookahead size which grows while the input stays ASCII,
// and which shrinks otherwise
func fancyHandleUTF8(w *bufio.Writer, r *bufio.Reader) error {
	lookahead := 1
	maxAhead := r.Size() / 2

	for {
		// look ahead to check for ASCII runs: a short peek still gives the
		// bytes available, which must be handled before giving up
		ahead, err := r.Peek(lookahead)
		if len(ahead) == 0 {
			if err == nil || errors.Is(err, io.EOF) {
				return nil
			}
			return err
		}

		// copy leading ASCII runs
		n := leadASCII(ahead)
		allASCII := n == len(ahead)
		if n > 0 {
			if _, err := w.Write(ahead[:n]); err != nil {
				return errNoMoreOutput
			}
			r.Discard(n)
		}

		// adapt lookahead size
		if allASCII && lookahead < maxAhead {
			lookahead *= 2
		} else if lookahead > 1 {
			lookahead /= 2
		}

		if allASCII {
			continue
		}

		// the next byte isn't ASCII, so decode a whole rune
		c, size, err := r.ReadRune()
		if err != nil {
			if errors.Is(err, io.EOF) {
				return nil
			}
			return err
		}

		// only a replacement char of size 1 means the bytes were invalid
		if c == unicode.ReplacementChar && size == 1 {
			return errInvalidUTF8
		}

		if _, err := w.WriteRune(c); err != nil {
			return errNoMoreOutput
		}
	}
}

// readPairFunc narrows source-code lines below
type readPairFunc func(*bufio.Reader) (byte, byte, error)

// utf16toUTF8 handles UTF-16 inputs for func utfate: the func given reads
// the next pair of bytes, most-significant byte first; the input ending,
// even in the middle of a surrogate pair, isn't reported as an error
func utf16toUTF8(w *bufio.Writer, r *bufio.Reader, read2 readPairFunc) error {
	for {
		hi, lo, err := read2(r)
		if err != nil {
			if errors.Is(err, io.EOF) {
				return nil
			}
			return err
		}

		c := rune(hi)<<8 | rune(lo)
		if utf16.IsSurrogate(c) {
			hi, lo, err := read2(r)
			if err != nil {
				if errors.Is(err, io.EOF) {
					return nil
				}
				return err
			}

			// invalid surrogate pairs decode into U+FFFD
			c = utf16.DecodeRune(c, rune(hi)<<8|rune(lo))
		}

		if _, err := w.WriteRune(c); err != nil {
			return errNoMoreOutput
		}
	}
}

// readBytePairBE gets you a pair of bytes in big-endian (original) order
func readBytePairBE(br *bufio.Reader) (byte, byte, error) {
	a, err := br.ReadByte()
	if err != nil {
		return a, 0, err
	}

	b, err := br.ReadByte()
	return a, b, err
}

// readBytePairLE gets you a pair of bytes in little-endian order
func readBytePairLE(br *bufio.Reader) (byte, byte, error) {
	a, b, err := readBytePairBE(br)
	return b, a, err
}

// utf32toUTF8 handles UTF-32 inputs for func utfate: inputs ending in the
// middle of a 4-byte unit result in io.ErrUnexpectedEOF
func utf32toUTF8(w *bufio.Writer, r *bufio.Reader, o binary.ByteOrder) error {
	var unit [4]byte

	for {
		// io.ReadFull gives io.EOF only when no bytes at all were read
		if _, err := io.ReadFull(r, unit[:]); err != nil {
			if errors.Is(err, io.EOF) {
				return nil
			}
			return err
		}

		if _, err := w.WriteRune(rune(o.Uint32(unit[:]))); err != nil {
			return errNoMoreOutput
		}
	}
}