File: plain.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O2 -o ./plain ./plain.c
  29 */
  30 
#include <ctype.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  38 
  39 #ifdef _WIN32
  40 #include <windows.h>
  41 #endif
  42 
  43 const char* info = ""
  44 "plain [options...] [filepaths...]\n"
  45 "\n"
  46 "\n"
  47 "Ignore all ANSI codes, leaving just the plain-text.\n"
  48 "\n"
  49 "All input is assumed to be UTF-8. When not given any filepaths, input is read\n"
  50 "from the standard input.\n"
  51 "\n"
  52 "\n"
  53 "Options, all of which can start with either 1 or 2 dashes:\n"
  54 "\n"
  55 "\n"
  56 "  -h          show this help message\n"
  57 "  -help       show this help message\n"
  58 "";
  59 
  60 const char* no_line_memory_msg = "can't get enough memory to read lines";
  61 
  62 // slice is a growable region of bytes in memory
  63 typedef struct slice {
  64     // ptr is the starting place of the region
  65     unsigned char* ptr;
  66 
  67     // len is how many bytes are currently being used
  68     size_t len;
  69 
  70     // cap is how many bytes the memory region has available
  71     size_t cap;
  72 } slice;
  73 
  74 // advance updates a slice so it starts after the number of bytes given
  75 inline void advance(slice* src, size_t n) {
  76     src->ptr += n;
  77     src->len -= n;
  78 }
  79 
  80 // find_esc_pair tries to find the starting index of 2-byte substrings
  81 // "\x1b[" or "\x1b]", whichever comes first, if at all
  82 int64_t find_esc_pair(slice line, size_t start) {
  83     bool esc = false;
  84 
  85     for (size_t i = start; i < line.len; i++) {
  86         unsigned char cur = line.ptr[i];
  87 
  88         if (cur == '\x1b') {
  89             esc = true;
  90             continue;
  91         }
  92 
  93         if (esc && (cur == '[' || cur == ']')) {
  94             return i - 1;
  95         }
  96 
  97         esc = false;
  98     }
  99 
 100     return -1;
 101 }
 102 
 103 // find_alpha tries to find the position of the first letter in a string
 104 int64_t find_alpha(slice line) {
 105     for (size_t i = 0; i < line.len; i++) {
 106         if (isalpha(line.ptr[i])) {
 107             return i;
 108         }
 109     }
 110     return -1;
 111 }
 112 
 113 // find_byte tries to find the first position of the value given in a string
 114 int64_t find_byte(slice line, unsigned char what) {
 115     for (size_t i = 0; i < line.len; i++) {
 116         if (line.ptr[i] == what) {
 117             return i;
 118         }
 119     }
 120     return -1;
 121 }
 122 
 123 // find_osc_end tries to find the first position after the end of OSC bytes
 124 int64_t find_osc_end(slice line) {
 125     size_t prev = 0;
 126     for (size_t i = 0; i < line.len; i++) {
 127         if (line.ptr[i] == '\a') {
 128             return i;
 129         }
 130         if (prev == '\x1b' && line.ptr[i] == '\\') {
 131             return i;
 132         }
 133         prev = line.ptr[i];
 134     }
 135     return -1;
 136 }
 137 
 138 inline void write_bytes(FILE* w, const unsigned char* src, size_t len) {
 139     fwrite(src, len, 1, w);
 140 }
 141 
 142 typedef struct skip_state {
 143     bool skip_alpha;
 144     bool skip_osc;
 145 } skip_state;
 146 
 147 // destyle_line renders the line given, omitting ANSI-styles
 148 void destyle_line(FILE* w, slice line, skip_state* state) {
 149     if (state->skip_alpha) {
 150         int64_t j = find_alpha(line);
 151         if (j < 0) {
 152             return;
 153         }
 154         state->skip_alpha = false;
 155         advance(&line, j + 1);
 156     }
 157 
 158     if (state->skip_osc) {
 159         // int64_t j = find_byte(line, '\a');
 160         int64_t j = find_osc_end(line);
 161         if (j < 0) {
 162             return;
 163         }
 164         state->skip_osc = false;
 165         advance(&line, j + 1);
 166     }
 167 
 168     while (line.len > 0) {
 169         int64_t j = find_esc_pair(line, 0);
 170         if (j < 0) {
 171             write_bytes(w, line.ptr, line.len);
 172             return;
 173         }
 174 
 175         write_bytes(w, line.ptr, j);
 176         advance(&line, j);
 177 
 178         switch (line.ptr[1]) {
 179             case '[':
 180                 j = find_alpha(line);
 181                 if (j < 0) {
 182                     state->skip_alpha = true;
 183                     return;
 184                 }
 185                 advance(&line, j + 1);
 186                 continue;
 187 
 188             case ']':
 189                 // j = find_byte(line, '\a');
 190                 j = find_osc_end(line);
 191                 if (j < 0) {
 192                     state->skip_osc = true;
 193                     return;
 194                 }
 195                 advance(&line, j + 1);
 196                 continue;
 197         }
 198     }
 199 }
 200 
 201 bool starts_with_bom(slice s) {
 202     const unsigned char* p = s.ptr;
 203     return s.len >= 3 && p[0] == 0xef && p[1] == 0xbb && p[2] == 0xbf;
 204 }
 205 
 206 // handle_lines loops over input lines, restyling all digit-runs as more
 207 // readable `nice numbers`, fulfilling the app's purpose
 208 bool handle_lines(FILE* w, slice* line, FILE* src) {
 209     slice trimmed;
 210     skip_state state;
 211 
 212     trimmed.cap = 0;
 213     state.skip_alpha = false;
 214     state.skip_osc = false;
 215 
 216     for (size_t i = 0; !feof(stdout); i++) {
 217         int len = getline((char**)&line->ptr, &line->cap, src);
 218         if (len < 0) {
 219             break;
 220         }
 221 
 222         if (line->ptr == NULL) {
 223             putc('\n', w);
 224             fflush(w);
 225 
 226             fprintf(stderr, "\x1b[31m%s\x1b[0m\n", no_line_memory_msg);
 227             exit(1);
 228         }
 229 
 230         line->len = len;
 231         trimmed.ptr = line->ptr;
 232         trimmed.len = line->len;
 233 
 234         // get rid of leading UTF-8 BOM (byte-order mark) if 1st line has it
 235         if (i == 0 && starts_with_bom(trimmed)) {
 236             trimmed.ptr += 3;
 237             trimmed.len -= 3;
 238             len = trimmed.len;
 239         }
 240 
 241         const unsigned char* p = trimmed.ptr;
 242         // get rid of trailing line-feeds and CRLF end-of-line byte-pairs
 243         if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
 244             trimmed.len -= 2;
 245         } else if (len >= 1 && p[len - 1] == '\n') {
 246             trimmed.len--;
 247         }
 248 
 249         destyle_line(w, trimmed, &state);
 250         if (!state.skip_alpha && !state.skip_osc) {
 251             putc('\n', w);
 252             fflush(w);
 253         }
 254     }
 255 
 256     if (state.skip_alpha || state.skip_osc) {
 257         putc('\n', w);
 258     }
 259     fflush(w);
 260     return true;
 261 }
 262 
 263 // handle_file handles data from the filename given; returns false only when
 264 // the file can't be opened
 265 bool handle_file(FILE* w, slice* line, char* path) {
 266     FILE* f = fopen(path, "rb");
 267     if (f == NULL) {
 268         fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", path);
 269         return false;
 270     }
 271 
 272     const bool ok = handle_lines(w, line, f);
 273     fclose(f);
 274     return ok;
 275 }
 276 
 277 // run returns the number of errors
 278 int run(int argc, char** argv, FILE* w) {
 279     size_t errors = 0;
 280 
 281     slice line;
 282     line.len = 0;
 283     line.cap = 32 * 1024;
 284     line.ptr = malloc(line.cap);
 285 
 286     if (line.ptr == NULL) {
 287         fprintf(stderr, "\x1b[31m%s\x1b[0m\n", no_line_memory_msg);
 288         return 1;
 289     }
 290 
 291     // use stdin when not given any filepaths
 292     if (argc < 2) {
 293         if (!handle_lines(w, &line, stdin)) {
 294             errors++;
 295         }
 296         return errors;
 297     }
 298 
 299     for (size_t i = 1; i < (size_t)argc && !feof(w); i++) {
 300         if (argv[i][0] == '-' && argv[i][1] == 0) {
 301             // `-` means standard input
 302             if (!handle_lines(w, &line, stdin)) {
 303                 errors++;
 304             }
 305             continue;
 306         }
 307 
 308         if (!handle_file(w, &line, argv[i])) {
 309             errors++;
 310         }
 311     }
 312 
 313     free(line.ptr);
 314     return errors;
 315 }
 316 
 317 // is_help_option simplifies control-flow for func main
 318 bool is_help_option(char* s) {
 319     return (s[0] == '-') && (
 320         strcmp(s, "-h") == 0 || strcmp(s, "-help") == 0 ||
 321         strcmp(s, "--h") == 0 || strcmp(s, "--help") == 0
 322     );
 323 }
 324 
 325 int main(int argc, char** argv) {
 326 #ifdef _WIN32
 327     setmode(fileno(stdin), O_BINARY);
 328     // ensure output lines end in LF instead of CRLF on windows
 329     setmode(fileno(stdout), O_BINARY);
 330     setmode(fileno(stderr), O_BINARY);
 331 #endif
 332 
 333     // handle any of the help options, if given
 334     if (argc > 1 && is_help_option(argv[1])) {
 335         puts(info);
 336         return 0;
 337     }
 338 
 339     return run(argc, argv, stdout) == 0 ? 0 : 1;
 340 }