File: dedup.cpp
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 c++ -Wall -s -O2 -march=native -mtune=native -flto -o ./dedup ./dedup.cpp
  29 */
  30 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <string>
#include <unordered_set>
  37 
  38 #ifdef RED_ERRORS
  39 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  40 #ifdef __APPLE__
  41 #define ERROR_STYLE "\x1b[31m"
  42 #endif
  43 #define RESET_STYLE "\x1b[0m"
  44 #else
  45 #define ERROR_STYLE ""
  46 #define RESET_STYLE ""
  47 #endif
  48 
  49 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  50 
  51 #define BAD_ALLOC 2
  52 
  53 using namespace std;
  54 
  55 const char* info = ""
  56 "dedup [files...]\n"
  57 "\n"
  58 "DEDUPlicate lines, emitting each distinct line once, when it first appears.\n"
  59 "Unlike `uniq`, this app doesn't require pre-sorted lines to work correctly,\n"
  60 "and keeps lines in their original order.\n"
  61 "";
  62 
  63 typedef unordered_set<string> string_set;
  64 // typedef unordered_set<char*> string_set;
  65 
  66 typedef struct slice {
  67     void* ptr;
  68     size_t cap;
  69 } slice;
  70 
  71 void dedup(FILE* r, slice* line, string_set& seen) {
  72     while (!feof(stdout)) {
  73         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
  74         if (line->ptr == NULL) {
  75             fprintf(stderr, "\n");
  76             fprintf(stderr, ERROR_LINE("out of memory"));
  77             exit(BAD_ALLOC);
  78         }
  79 
  80         if (len < 0) {
  81             break;
  82         }
  83 
  84         char* p = (char*)line->ptr;
  85 
  86         if (len >= 2 && p[len - 2] == '\r' && p[len - 1] == '\n') {
  87             p[len - 2] = 0;
  88             len -= 2;
  89         } else if (len >= 1 && (p[len - 1] == '\n')) {
  90             p[len - 1] = 0;
  91             len--;
  92         }
  93 
  94         string l(string_view(p, len));
  95         if (seen.find(l) == seen.end()) {
  96             fwrite(p, 1, len, stdout);
  97             fputc('\n', stdout);
  98             seen.insert(p);
  99         }
 100     }
 101 }
 102 
 103 bool handleFile(const char* path, slice* line, string_set& seen) {
 104     FILE* f = fopen(path, "rb");
 105     if (f == NULL) {
 106         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 107         return false;
 108     }
 109 
 110     dedup(f, line, seen);
 111     fclose(f);
 112     return true;
 113 }
 114 
 115 bool run(size_t argc, char** argv) {
 116     size_t start_args = 1;
 117 
 118     if (argc > start_args) {
 119         if (
 120             strcmp(argv[start_args], "-h") == 0 ||
 121             strcmp(argv[start_args], "-help") == 0 ||
 122             strcmp(argv[start_args], "--h") == 0 ||
 123             strcmp(argv[start_args], "--help") == 0
 124         ) {
 125             fprintf(stdout, "%s", info);
 126             return 0;
 127         }
 128 
 129         if (strcmp(argv[start_args], "--") == 0) {
 130             start_args++;
 131         }
 132     }
 133 
 134     size_t errors = 0;
 135     const bool live_lines = lseek(STDOUT_FILENO, 0, SEEK_CUR) != 0;
 136     if (live_lines) {
 137         setvbuf(stdout, NULL, _IOLBF, 0);
 138     } else {
 139         setvbuf(stdout, NULL, _IOFBF, 0);
 140     }
 141 
 142     slice line;
 143     line.cap = 32 * 1024;
 144     line.ptr = malloc(line.cap);
 145     if (line.ptr == NULL) {
 146         fprintf(stderr, ERROR_LINE("out of memory"));
 147         exit(BAD_ALLOC);
 148     }
 149 
 150     string_set seen;
 151     unordered_set<string> files;
 152 
 153     for (size_t i = start_args; i < argc && !feof(stdout); i++) {
 154         string arg(argv[i]);
 155 
 156         // avoid handling any file more than once
 157         if (files.find(arg) != files.end()) {
 158             continue;
 159         }
 160         files.insert(arg);
 161 
 162         if (arg == "-") {
 163             dedup(stdin, &line, seen);
 164             continue;
 165         }
 166 
 167         if (!handleFile(argv[i], &line, seen)) {
 168             errors++;
 169         }
 170     }
 171 
 172     if (argc - start_args == 0) {
 173         dedup(stdin, &line, seen);
 174     }
 175 
 176     if (!live_lines) {
 177         fflush(stdout);
 178     }
 179 
 180     if (line.ptr != NULL) {
 181         free(line.ptr);
 182     }
 183     return errors == 0;
 184 }
 185 
 186 int main(int argc, char** argv) {
 187     try {
 188         return run(argc, argv) ? 0 : 1;
 189     } catch (const bad_alloc& e) {
 190         fprintf(stderr, ERROR_LINE("out of memory"));
 191         return BAD_ALLOC;
 192     }
 193 }