/*
The MIT License (MIT)

Copyright (c) 2026 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
To compile a smaller-sized command-line app, you can use the `go` command as
follows:

    go build -ldflags "-s -w" -trimpath utfate.go
*/

package main

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"errors"
	"io"
	"os"
	"unicode"
	"unicode/utf16"
)

// info is the help message; its leading newline is skipped when shown
const info = `
utfate [options...] [file...]

This app turns plain-text input into UTF-8. Supported input formats are

    - ASCII
    - UTF-8
    - UTF-8 with a leading BOM
    - UTF-16 BE
    - UTF-16 LE
    - UTF-32 BE
    - UTF-32 LE

All (optional) leading options start with either single or double-dash:

    -h, -help    show this help message
`

// main shows the help message when the first argument asks for it; otherwise
// it converts all inputs named in the arguments, exiting with status 1 after
// showing the error message on stderr when something fails
func main() {
	if len(os.Args) > 1 {
		switch os.Args[1] {
		case `-h`, `--h`, `-help`, `--help`:
			os.Stdout.WriteString(info[1:])
			return
		}
	}

	// io.EOF is what the converters return when writing the output fails,
	// so that case quits quietly and successfully
	if err := run(os.Stdout, os.Args[1:]); err != nil && err != io.EOF {
		os.Stderr.WriteString(err.Error())
		os.Stderr.WriteString("\n")
		os.Exit(1)
	}
}

// run converts each named input in order, writing all results to w; standard
// input is used when no names are given; the first error stops everything
func run(w io.Writer, args []string) error {
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	for _, path := range args {
		if err := handleFile(bw, path); err != nil {
			return err
		}
	}

	if len(args) == 0 {
		return utfate(bw, os.Stdin)
	}
	return nil
}

// handleFile converts the file given by name, where a single dash stands for
// standard input
func handleFile(w *bufio.Writer, name string) error {
	if name == `-` {
		return utfate(w, os.Stdin)
	}

	f, err := os.Open(name)
	if err != nil {
		return errors.New(`can't read from file named "` + name + `"`)
	}
	defer f.Close()

	return utfate(w, f)
}

// utfate converts everything read from r into UTF-8 written to w, picking the
// input encoding from the leading byte-order mark (BOM), if any; input without
// a BOM is handled as UTF-8; the BOM itself is never part of the output
func utfate(w io.Writer, r io.Reader) error {
	br := bufio.NewReader(r)
	// bufio.NewWriter hands back w itself when that's already a buffered
	// writer of at least the default size, so no double-buffering happens
	// for the calls made by funcs run and handleFile
	bw := bufio.NewWriter(w)
	defer bw.Flush()

	// inputs shorter than 4 bytes give a shorter slice along with io.EOF
	lead, err := br.Peek(4)
	if err != nil && err != io.EOF {
		return err
	}

	// the UTF-32 checks must come first, since the UTF-32 LE mark starts
	// with the UTF-16 LE one; the discards can't fail, as the bytes they
	// skip were just peeked
	switch {
	case bytes.HasPrefix(lead, []byte{'\x00', '\x00', '\xfe', '\xff'}):
		br.Discard(4)
		return utf32toUTF8(bw, br, binary.BigEndian)

	case bytes.HasPrefix(lead, []byte{'\xff', '\xfe', '\x00', '\x00'}):
		br.Discard(4)
		return utf32toUTF8(bw, br, binary.LittleEndian)

	case bytes.HasPrefix(lead, []byte{'\xfe', '\xff'}):
		br.Discard(2)
		return utf16toUTF8(bw, br, readBytePairBE)

	case bytes.HasPrefix(lead, []byte{'\xff', '\xfe'}):
		br.Discard(2)
		return utf16toUTF8(bw, br, readBytePairLE)

	case bytes.HasPrefix(lead, []byte{'\xef', '\xbb', '\xbf'}):
		br.Discard(3)
		return handleUTF8(bw, br)

	default:
		return handleUTF8(bw, br)
	}
}

// handleUTF8 copies UTF-8 input rune by rune, failing on the first invalid
// byte sequence; a failed write results in io.EOF, which func main ignores
func handleUTF8(w *bufio.Writer, r *bufio.Reader) error {
	for {
		c, size, err := r.ReadRune()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		// decoding failures give the replacement char with a size of 1,
		// while a properly-encoded U+FFFD from the input takes 3 bytes,
		// and is perfectly valid
		if c == unicode.ReplacementChar && size == 1 {
			return errors.New(`invalid UTF-8 stream`)
		}

		if _, err := w.WriteRune(c); err != nil {
			return io.EOF
		}
	}
}

// fancyHandleUTF8 is kept only for reference, as its attempts at being clever
// don't seem to speed things up much when given ASCII input
func fancyHandleUTF8(w *bufio.Writer, r *bufio.Reader) error {
	lookahead := 1
	maxAhead := r.Size() / 2

	for {
		// look ahead to check for ASCII runs: near the end of the input,
		// this can give fewer bytes than asked for, along with io.EOF,
		// and those bytes still need handling
		ahead, err := r.Peek(lookahead)
		if err != nil && err != io.EOF {
			return err
		}
		if len(ahead) == 0 {
			return nil
		}

		// copy leading ASCII runs
		n := leadASCII(ahead)
		if n > 0 {
			if _, err := w.Write(ahead[:n]); err != nil {
				return io.EOF
			}
			// this can't fail, as the bytes skipped were just peeked
			r.Discard(n)
		}

		// adapt lookahead size
		if n == len(ahead) && lookahead < maxAhead {
			lookahead *= 2
		} else if lookahead > 1 {
			lookahead /= 2
		}

		if n == len(ahead) {
			continue
		}

		// what's next isn't ASCII, so decode it as a full rune
		c, size, err := r.ReadRune()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		// only a size of 1 means decoding failed: see func handleUTF8
		if c == unicode.ReplacementChar && size == 1 {
			return errors.New(`invalid UTF-8 stream`)
		}

		if _, err := w.WriteRune(c); err != nil {
			return io.EOF
		}
	}
}

// leadASCII is used by func fancyHandleUTF8: it counts how many bytes come
// before the first non-ASCII byte, if any
func leadASCII(buf []byte) int {
	for i, b := range buf {
		if b >= 128 {
			return i
		}
	}
	return len(buf)
}

// readPairFunc narrows source-code lines below
type readPairFunc func(*bufio.Reader) (byte, byte, error)

// utf16toUTF8 handles UTF-16 inputs for func utfate: func read2 is expected
// to give each pair of bytes with the most-significant one first; surrogates
// are combined with the 16-bit unit right after them; input ending partway
// through a unit or a surrogate pair ends the conversion without an error
func utf16toUTF8(w *bufio.Writer, r *bufio.Reader, read2 readPairFunc) error {
	for {
		a, b, err := read2(r)
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		c := rune(256*int(a) + int(b))
		if utf16.IsSurrogate(c) {
			a, b, err := read2(r)
			if err == io.EOF {
				return nil
			}
			if err != nil {
				return err
			}

			// invalid pairs result in the replacement char
			next := rune(256*int(a) + int(b))
			c = utf16.DecodeRune(c, next)
		}

		if _, err := w.WriteRune(c); err != nil {
			return io.EOF
		}
	}
}

// readBytePairBE gets you a pair of bytes in big-endian (original) order
func readBytePairBE(br *bufio.Reader) (byte, byte, error) {
	a, err := br.ReadByte()
	if err != nil {
		return a, 0, err
	}

	b, err := br.ReadByte()
	return a, b, err
}

// readBytePairLE gets you a pair of bytes in little-endian order
func readBytePairLE(br *bufio.Reader) (byte, byte, error) {
	a, b, err := readBytePairBE(br)
	return b, a, err
}

// utf32toUTF8 handles UTF-32 inputs for func utfate, decoding each 4-byte
// value using the byte-order given
func utf32toUTF8(w *bufio.Writer, r *bufio.Reader, o binary.ByteOrder) error {
	var n uint32
	for {
		// io.EOF only happens on clean 4-byte boundaries, as binary.Read
		// gives io.ErrUnexpectedEOF when the input ends partway
		err := binary.Read(r, o, &n)
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		if _, err := w.WriteRune(rune(n)); err != nil {
			return io.EOF
		}
	}
}