// File: utfate.go

/*
The MIT License (MIT)

Copyright (c) 2026 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
To compile a smaller-sized command-line app, you can use the `go` command as
follows:

	go build -ldflags "-s -w" -trimpath utfate.go
*/

package main

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"errors"
	"io"
	"os"
	"unicode"
	"unicode/utf16"
)

const info = `
utfate [options...] [file...]

This app turns plain-text input into UTF-8. Supported input formats are

    - ASCII
    - UTF-8
    - UTF-8 with a leading BOM
    - UTF-16 BE
    - UTF-16 LE
    - UTF-32 BE
    - UTF-32 LE

All (optional) leading options start with either single or double-dash:

    -h, -help    show this help message
`

// errInvalidUTF8 is the error returned when UTF-8 input is malformed
var errInvalidUTF8 = errors.New(`invalid UTF-8 stream`)

// main shows the help message when the first argument asks for it; otherwise
// it converts all inputs named by the arguments, exiting with status 1 after
// showing the error message on stderr, when something goes wrong
func main() {
	if len(os.Args) > 1 {
		switch os.Args[1] {
		case `-h`, `--h`, `-help`, `--help`:
			// skip the leading line-feed in the help text
			os.Stdout.WriteString(info[1:])
			return
		}
	}

	// io.EOF is what the converters return when writing output fails, so
	// that case quits quietly, instead of showing an error message
	if err := run(os.Stdout, os.Args[1:]); err != nil && err != io.EOF {
		os.Stderr.WriteString(err.Error())
		os.Stderr.WriteString("\n")
		os.Exit(1)
	}
}

// run converts each named input in order, stopping at the first error; when
// no names are given, it converts standard input instead
func run(w io.Writer, args []string) error {
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	for _, path := range args {
		if err := handleFile(bw, path); err != nil {
			return err
		}
	}

	if len(args) == 0 {
		return utfate(bw, os.Stdin)
	}
	return nil
}

// handleFile converts the file given by name, where a single dash stands
// for standard input
func handleFile(w *bufio.Writer, name string) error {
	if name == `-` {
		return utfate(w, os.Stdin)
	}

	f, err := os.Open(name)
	if err != nil {
		return errors.New(`can't read from file named "` + name + `"`)
	}
	defer f.Close()

	return utfate(w, f)
}

// utfate writes the text from r to w as UTF-8, picking the input decoder by
// checking for a leading byte-order mark (BOM): inputs without one are
// handled as UTF-8. Failing to write results in error io.EOF.
func utfate(w io.Writer, r io.Reader) error {
	br := bufio.NewReader(r)
	// bufio.NewWriter hands back its argument as is, when that's already a
	// large-enough *bufio.Writer, so there's no double-buffering here
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	// inputs shorter than 4 bytes result in a shorter slice along with io.EOF
	lead, err := br.Peek(4)
	if err != nil && err != io.EOF {
		return err
	}

	// the UTF-32 checks must come first, since the UTF-32 LE BOM starts
	// with the same 2 bytes as the UTF-16 LE one
	switch {
	case bytes.HasPrefix(lead, []byte{'\x00', '\x00', '\xfe', '\xff'}):
		br.Discard(4)
		return utf32toUTF8(bw, br, binary.BigEndian)

	case bytes.HasPrefix(lead, []byte{'\xff', '\xfe', '\x00', '\x00'}):
		br.Discard(4)
		return utf32toUTF8(bw, br, binary.LittleEndian)

	case bytes.HasPrefix(lead, []byte{'\xfe', '\xff'}):
		br.Discard(2)
		return utf16toUTF8(bw, br, readBytePairBE)

	case bytes.HasPrefix(lead, []byte{'\xff', '\xfe'}):
		br.Discard(2)
		return utf16toUTF8(bw, br, readBytePairLE)

	case bytes.HasPrefix(lead, []byte{'\xef', '\xbb', '\xbf'}):
		br.Discard(3)
		return handleUTF8(bw, br)

	default:
		return handleUTF8(bw, br)
	}
}

// handleUTF8 copies UTF-8 input rune by rune, failing on the first malformed
// byte sequence; failing to write results in error io.EOF
func handleUTF8(w *bufio.Writer, r *bufio.Reader) error {
	for {
		c, size, err := r.ReadRune()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		// malformed input shows up as U+FFFD with a size of 1, while a
		// U+FFFD which is really part of the input takes 3 bytes, and
		// is valid text which must be let through
		if c == unicode.ReplacementChar && size == 1 {
			return errInvalidUTF8
		}

		if _, err := w.WriteRune(c); err != nil {
			return io.EOF
		}
	}
}

// fancyHandleUTF8 is kept only for reference, as its attempts at being clever
// don't seem to speed things up much when given ASCII input
func fancyHandleUTF8(w *bufio.Writer, r *bufio.Reader) error {
	lookahead := 1
	maxAhead := r.Size() / 2

	for {
		// look ahead to check for ASCII runs: near the end of the input,
		// this gives fewer bytes than asked for, along with io.EOF, and
		// those bytes still need handling
		ahead, err := r.Peek(lookahead)
		if err != nil && err != io.EOF {
			return err
		}
		if len(ahead) == 0 {
			return nil
		}

		// copy leading ASCII runs
		n := leadASCII(ahead)
		if n > 0 {
			if _, err := w.Write(ahead[:n]); err != nil {
				return io.EOF
			}
			r.Discard(n)
		}

		// adapt lookahead size: grow it while chunks are all-ASCII
		if n == len(ahead) {
			if lookahead < maxAhead {
				lookahead *= 2
			}
			continue
		}
		if lookahead > 1 {
			lookahead /= 2
		}

		// the next byte isn't ASCII, so decode a whole rune
		c, size, err := r.ReadRune()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
		if c == unicode.ReplacementChar && size == 1 {
			return errInvalidUTF8
		}

		if _, err := w.WriteRune(c); err != nil {
			return io.EOF
		}
	}
}

// leadASCII is used by func fancyHandleUTF8: it counts how many bytes there
// are before the first non-ASCII one
func leadASCII(buf []byte) int {
	for i, b := range buf {
		if b >= 128 {
			return i
		}
	}
	return len(buf)
}

// readPairFunc narrows source-code lines below
type readPairFunc func(*bufio.Reader) (byte, byte, error)

// utf16toUTF8 handles UTF-16 inputs for func utfate: read2 must give the
// bytes of each 16-bit code unit with the most-significant byte first.
// Unpaired surrogates turn into U+FFFD, without losing the code unit which
// follows them; failing to write results in error io.EOF.
func utf16toUTF8(w *bufio.Writer, r *bufio.Reader, read2 readPairFunc) error {
	// pending holds a code unit which was read while looking for the second
	// half of a surrogate pair, but which turned out not to complete one
	var pending rune
	havePending := false

	for {
		c := pending
		if havePending {
			havePending = false
		} else {
			a, b, err := read2(r)
			if err == io.EOF {
				return nil
			}
			if err != nil {
				return err
			}
			c = rune(a)<<8 | rune(b)
		}

		if utf16.IsSurrogate(c) {
			a, b, err := read2(r)
			if err != nil && err != io.EOF {
				return err
			}
			if err == io.EOF {
				// the input ends with an unpaired surrogate
				if _, err := w.WriteRune(unicode.ReplacementChar); err != nil {
					return io.EOF
				}
				return nil
			}

			// valid pairs never decode into U+FFFD, so getting that means
			// the unit just read must be handled again on its own
			next := rune(a)<<8 | rune(b)
			c = utf16.DecodeRune(c, next)
			if c == unicode.ReplacementChar {
				pending, havePending = next, true
			}
		}

		if _, err := w.WriteRune(c); err != nil {
			return io.EOF
		}
	}
}

// readBytePairBE gets you a pair of bytes in big-endian (original) order:
// the error is io.EOF only when no bytes are left at all, while an input
// ending after the first byte of a pair results in io.ErrUnexpectedEOF
func readBytePairBE(br *bufio.Reader) (byte, byte, error) {
	a, err := br.ReadByte()
	if err != nil {
		return 0, 0, err
	}

	b, err := br.ReadByte()
	if err == io.EOF {
		return a, 0, io.ErrUnexpectedEOF
	}
	return a, b, err
}

// readBytePairLE gets you a pair of bytes in little-endian order
func readBytePairLE(br *bufio.Reader) (byte, byte, error) {
	a, b, err := readBytePairBE(br)
	return b, a, err
}

// utf32toUTF8 handles UTF-32 inputs for func utfate: values which aren't
// valid code points turn into U+FFFD, while an input ending in the middle of
// a 4-byte value results in io.ErrUnexpectedEOF; failing to write results in
// error io.EOF
func utf32toUTF8(w *bufio.Writer, r *bufio.Reader, o binary.ByteOrder) error {
	var buf [4]byte

	for {
		_, err := io.ReadFull(r, buf[:])
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		// check the range before converting, since values past the 31st
		// bit would turn into negative runes
		c := unicode.ReplacementChar
		if n := o.Uint32(buf[:]); n <= unicode.MaxRune {
			c = rune(n)
		}

		if _, err := w.WriteRune(c); err != nil {
			return io.EOF
		}
	}
}