/*
The MIT License (MIT)

Copyright © 2020-2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
Single-file source-code for utfate.

To compile a smaller-sized command-line app, you can use the `go` command as
follows:

go build -ldflags "-s -w" -trimpath utfate.go
*/

package main

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"errors"
	"io"
	"os"
	"unicode"
	"unicode/utf16"
)

// Note: the code is avoiding using the fmt package to save hundreds of
// kilobytes on the resulting executable, which is a noticeable difference.

// info is the help message; its leading newline is skipped when shown
const info = `
utfate [options...] [file...]

This app turns plain-text input into UTF-8. Supported input formats are

- ASCII
- UTF-8
- UTF-8 with a leading BOM
- UTF-16 BE
- UTF-16 LE
- UTF-32 BE
- UTF-32 LE

All (optional) leading options start with either single or double-dash:

-h show this help message
-help show this help message
`

// errNoMoreOutput is a dummy error whose message is ignored, and which
// causes the app to quit immediately and successfully
var errNoMoreOutput = errors.New(`no more output`)

// errInvalidUTF8 is the error given when UTF-8 input has invalid bytes
var errInvalidUTF8 = errors.New(`invalid UTF-8 stream`)

// errorStyle is the ANSI-code used to start showing error messages
const errorStyle = "\x1b[31m"

// byte-order marks, used by func utfate to detect the input encoding
var (
	bomUTF32BE = []byte{0x00, 0x00, 0xfe, 0xff}
	bomUTF32LE = []byte{0xff, 0xfe, 0x00, 0x00}
	bomUTF16BE = []byte{0xfe, 0xff}
	bomUTF16LE = []byte{0xff, 0xfe}
	bomUTF8    = []byte{0xef, 0xbb, 0xbf}
)

func main() {
	if len(os.Args) > 1 {
		switch os.Args[1] {
		case `-h`, `--h`, `-help`, `--help`:
			os.Stderr.WriteString(info[1:])
			return
		}
	}

	if err := run(os.Stdout, os.Args[1:]); isActualError(err) {
		os.Stderr.WriteString(errorStyle)
		os.Stderr.WriteString(err.Error())
		os.Stderr.WriteString("\x1b[0m\n")
		os.Exit(1)
	}
}

// run converts all the named inputs given, in order, stopping at the first
// error; standard input is used when no names are given
func run(w io.Writer, args []string) error {
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	for _, path := range args {
		if err := handleFile(bw, path); err != nil {
			return err
		}
	}

	if len(args) == 0 {
		return utfate(bw, os.Stdin)
	}
	return nil
}

// handleFile converts the file whose name is given: a single dash means
// standard input
func handleFile(w *bufio.Writer, name string) error {
	if name == `-` {
		return utfate(w, os.Stdin)
	}

	f, err := os.Open(name)
	if err != nil {
		return errors.New(`can't read from file named "` + name + `"`)
	}
	defer f.Close()

	return utfate(w, f)
}

// isActualError is to figure out whether not to ignore an error, and thus
// show it as an error message
func isActualError(err error) bool {
	return err != nil && err != io.EOF && err != errNoMoreOutput
}

// utfate turns a whole input stream into UTF-8, using any leading byte-order
// mark to detect the input encoding: input without such a mark is handled as
// UTF-8, which also covers plain ASCII
func utfate(w io.Writer, r io.Reader) error {
	br := bufio.NewReader(r)
	// bufio.NewWriter gives back its argument when that's already a buffered
	// writer of the default size or more, so the writer from func run isn't
	// wrapped a second time
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	// inputs shorter than 4 bytes result in a shorter slice along with an
	// end-of-input error, which is safe to ignore
	lead, err := br.Peek(4)
	if err != nil && err != io.EOF {
		return err
	}

	// the UTF-32 checks must come before the UTF-16 ones, since the UTF-32 LE
	// mark starts with the UTF-16 LE one; results from calling Discard are
	// ignored, since all bytes being skipped were successfully peeked
	switch {
	case bytes.HasPrefix(lead, bomUTF32BE):
		br.Discard(len(bomUTF32BE))
		return utf32toUTF8(bw, br, binary.BigEndian)

	case bytes.HasPrefix(lead, bomUTF32LE):
		br.Discard(len(bomUTF32LE))
		return utf32toUTF8(bw, br, binary.LittleEndian)

	case bytes.HasPrefix(lead, bomUTF16BE):
		br.Discard(len(bomUTF16BE))
		return utf16toUTF8(bw, br, readBytePairBE)

	case bytes.HasPrefix(lead, bomUTF16LE):
		br.Discard(len(bomUTF16LE))
		return utf16toUTF8(bw, br, readBytePairLE)

	case bytes.HasPrefix(lead, bomUTF8):
		br.Discard(len(bomUTF8))
		return handleUTF8(bw, br)

	default:
		return handleUTF8(bw, br)
	}
}

// handleUTF8 copies UTF-8 input rune by rune, failing on the first invalid
// byte sequence found
func handleUTF8(w *bufio.Writer, r *bufio.Reader) error {
	for {
		c, size, err := r.ReadRune()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		// decoding failures result in a replacement char of size 1, while a
		// valid replacement char from the input is 3 bytes long, and must be
		// let through
		if c == unicode.ReplacementChar && size == 1 {
			return errInvalidUTF8
		}

		if _, err := w.WriteRune(c); err != nil {
			return errNoMoreOutput
		}
	}
}

// fancyHandleUTF8 is kept only for reference, as its attempts at being clever
// don't seem to speed things up much when given ASCII input
func fancyHandleUTF8(w *bufio.Writer, r *bufio.Reader) error {
	lookahead := 1
	maxAhead := r.Size() / 2

	for {
		// look ahead to check for ASCII runs: getting fewer bytes than asked
		// for, along with an end-of-input error, still means there's data to
		// handle, so only an empty result ends the loop
		ahead, err := r.Peek(lookahead)
		if err != nil && err != io.EOF {
			return err
		}
		if len(ahead) == 0 {
			return nil
		}

		// copy leading ASCII runs
		n := leadASCII(ahead)
		if n > 0 {
			if _, err := w.Write(ahead[:n]); err != nil {
				return errNoMoreOutput
			}
			r.Discard(n)
		}

		// adapt lookahead size
		if n == len(ahead) && lookahead < maxAhead {
			lookahead *= 2
		} else if lookahead > 1 {
			lookahead /= 2
		}

		if n == len(ahead) {
			continue
		}

		// the next byte isn't ASCII, so decode a whole rune
		c, size, err := r.ReadRune()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		// only replacement chars of size 1 are decoding failures
		if c == unicode.ReplacementChar && size == 1 {
			return errInvalidUTF8
		}

		if _, err := w.WriteRune(c); err != nil {
			return errNoMoreOutput
		}
	}
}

// leadASCII is used by func fancyHandleUTF8: it counts how many leading
// bytes are ASCII
func leadASCII(buf []byte) int {
	for i, b := range buf {
		if b >= 128 {
			return i
		}
	}
	return len(buf)
}

// readPairFunc narrows source-code lines below
type readPairFunc func(*bufio.Reader) (byte, byte, error)

// utf16toUTF8 handles UTF-16 inputs for func utfate: func read2 gives byte
// pairs with the most-significant byte first, whatever the input byte-order.
// Unpaired surrogates turn into replacement chars, without losing the
// code-unit following them; trailing incomplete byte-pairs are ignored.
func utf16toUTF8(w *bufio.Writer, r *bufio.Reader, read2 readPairFunc) error {
	// pending is a code-unit already read, but not yet handled: this happens
	// when it follows a surrogate it couldn't be paired with
	var pending rune
	havePending := false

	for {
		var c rune
		if havePending {
			c, havePending = pending, false
		} else {
			a, b, err := read2(r)
			if err == io.EOF {
				return nil
			}
			if err != nil {
				return err
			}
			c = rune(a)<<8 | rune(b)
		}

		if utf16.IsSurrogate(c) {
			a, b, err := read2(r)
			if err == io.EOF {
				// input ends with half a surrogate pair
				if _, err := w.WriteRune(unicode.ReplacementChar); err != nil {
					return errNoMoreOutput
				}
				return nil
			}
			if err != nil {
				return err
			}

			next := rune(a)<<8 | rune(b)
			if pair := utf16.DecodeRune(c, next); pair != unicode.ReplacementChar {
				c = pair
			} else {
				// not a valid pair: replace the stray surrogate, and handle
				// the code-unit after it on the next iteration
				c = unicode.ReplacementChar
				pending, havePending = next, true
			}
		}

		if _, err := w.WriteRune(c); err != nil {
			return errNoMoreOutput
		}
	}
}

// readBytePairBE gets you a pair of bytes in big-endian (original) order
func readBytePairBE(br *bufio.Reader) (byte, byte, error) {
	a, err := br.ReadByte()
	if err != nil {
		return a, 0, err
	}

	b, err := br.ReadByte()
	return a, b, err
}

// readBytePairLE gets you a pair of bytes in little-endian order
func readBytePairLE(br *bufio.Reader) (byte, byte, error) {
	a, b, err := readBytePairBE(br)
	return b, a, err
}

// utf32toUTF8 handles UTF-32 inputs for func utfate: input ending in the
// middle of a 4-byte value results in an unexpected end-of-input error
func utf32toUTF8(w *bufio.Writer, r *bufio.Reader, o binary.ByteOrder) error {
	var buf [4]byte

	for {
		// io.ReadFull gives io.EOF only when no bytes at all were read
		_, err := io.ReadFull(r, buf[:])
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		// values which aren't valid code-points are written as replacement
		// chars by WriteRune
		if _, err := w.WriteRune(rune(o.Uint32(buf[:]))); err != nil {
			return errNoMoreOutput
		}
	}
}