File: dedup.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -flto -o ./dedup ./dedup.c
  29 */
  30 
  31 #include <stdbool.h>
  32 #include <stdint.h>
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 
  37 #ifdef _WIN32
  38 #include <fcntl.h>
  39 #include <windows.h>
  40 #endif
  41 
  42 #ifdef RED_ERRORS
  43 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  44 #ifdef __APPLE__
  45 #define ERROR_STYLE "\x1b[31m"
  46 #endif
  47 #define RESET_STYLE "\x1b[0m"
  48 #else
  49 #define ERROR_STYLE
  50 #define RESET_STYLE
  51 #endif
  52 
  53 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  54 
  55 #define SIMPLE_HASHING
  56 
  57 const char* info = ""
  58 "dedup [filepaths...]\n"
  59 "\n"
  60 "Deduplicate lines, ensuring each unique input line appears only once on the\n"
  61 "output. Unlike the standard cmd-line app `uniq`, which only works correctly\n"
  62 "when given sorted lines, this tool works correctly without any scrambling.\n"
  63 "";
  64 
  65 const uint64_t line_starting_capacity = 32 * 1024;
  66 
  67 // prime numbers right under various binary-kb values:
  68 // 16381, 32749, 65521, 131071
  69 
  70 // bucket_count is how many buckets the set/hash-table allocates top-level:
  71 // it's a prime number to minimize hashing collisions
  72 const uint64_t bucket_count = 32749;
  73 
  74 // slice is a growable region of bytes in memory
  75 typedef struct slice {
  76     // ptr is the starting place of the region
  77     unsigned char* ptr;
  78 
  79     // len is how many bytes are currently being used
  80     size_t len;
  81 
  82     // cap is how many bytes the memory region has available
  83     size_t cap;
  84 } slice;
  85 
  86 size_t capacity_for(size_t len) {
  87     size_t cap = 16;
  88     while (cap < len) {
  89         cap *= 2;
  90     }
  91     return cap;
  92 }
  93 
  94 // string_bucket is any of the buckets used in struct string_set
  95 typedef struct string_bucket {
  96     // ptr is where the string starts
  97     unsigned char* ptr;
  98 
  99     // len is how many bytes are in the string
 100     size_t len;
 101 
 102     // next is the next in the bucket-chain
 103     struct string_bucket* next;
 104 } string_bucket;
 105 
 106 // string_set is a hash-table with no values, thus acting as a set
 107 typedef struct string_set {
 108     string_bucket** chains;
 109 
 110     // cap is how many bucket-chains there are
 111     size_t cap;
 112 } string_set;
 113 
 114 void init_string_set(string_set* set, size_t cap) {
 115     set->chains = malloc(sizeof(string_bucket*) * cap);
 116     if (set->chains == NULL) {
 117         set->cap = 0;
 118         return;
 119     }
 120     set->cap = bucket_count;
 121 
 122     for (size_t i = 0; i < cap; i++) {
 123         set->chains[i] = NULL;
 124     }
 125 }
 126 
 127 void deallocate(string_set* set) {
 128     for (size_t i = 0; i < set->cap; i++) {
 129         string_bucket* next = NULL;
 130         for (string_bucket* b = set->chains[i]; b != NULL; b = next) {
 131             next = b->next;
 132             free(b->ptr);
 133             free(b);
 134         }
 135     }
 136 
 137     free(set->chains);
 138 }
 139 
 140 #ifdef SIMPLE_HASHING
 141 size_t index_chain(const string_set* set, const slice* s) {
 142     size_t hash = 0;
 143     const unsigned char* p = s->ptr;
 144     for (size_t i = 0; i < s->len; i++) {
 145         hash *= 31;
 146         hash += p[i];
 147     }
 148     return hash % set->cap;
 149 }
 150 #else
 151 size_t index_chain(const string_set* set, const slice* s) {
 152     uint64_t k = 1;
 153     size_t hash = 0;
 154     const unsigned char* p = s->ptr;
 155     // const uint64_t r = 15485863;
 156     const uint64_t r = 86028121;
 157     for (size_t i = 0; i < s->len; i++) {
 158         hash += k * p[i];
 159         hash %= r;
 160         k *= 31;
 161         k %= r;
 162     }
 163     return hash % set->cap;
 164 }
 165 #endif
 166 
 167 bool check(const string_bucket* b, const slice* s) {
 168     if (s->len != b->len) {
 169         return false;
 170     }
 171 
 172     size_t len = s->len;
 173     for (size_t i = 0; i < len; i++) {
 174         if (s->ptr[i] != b->ptr[i]) {
 175             return false;
 176         }
 177     }
 178     return true;
 179 }
 180 
 181 // add returns 0 only when an allocation failed
 182 string_bucket* add(string_set* set, const slice* s) {
 183     size_t i = index_chain(set, s);
 184     string_bucket* last = NULL;
 185 
 186     for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) {
 187         if (check(b, s)) {
 188             return b;
 189         }
 190         last = b;
 191     }
 192 
 193     unsigned char* copy = malloc(s->len);
 194     if (copy == NULL) {
 195         return NULL;
 196     }
 197     memcpy(copy, s->ptr, s->len);
 198 
 199     string_bucket* next = malloc(sizeof(string_bucket));
 200     if (next == NULL) {
 201         free(copy);
 202         return NULL;
 203     }
 204     next->ptr = copy;
 205     next->len = s->len;
 206     next->next = NULL;
 207 
 208     if (last == NULL) {
 209         set->chains[i] = next;
 210     } else {
 211         last->next = next;
 212     }
 213     return next;
 214 }
 215 
 216 bool has(const string_set* set, const slice* s) {
 217     size_t i = index_chain(set, s);
 218 
 219     for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) {
 220         if (check(b, s)) {
 221             return b;
 222         }
 223     }
 224     return false;
 225 }
 226 
 227 bool handle_reader(FILE* w, FILE* r, slice* line, string_set* seen) {
 228     if (line->ptr == NULL || seen->chains == NULL) {
 229         return false;
 230     }
 231 
 232     for (size_t i = 0; !feof(w); i++) {
 233         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 234         if (line->ptr == NULL) {
 235             fprintf(stderr, "\n");
 236             fprintf(stderr, ERROR_LINE("out of memory"));
 237             exit(BAD_ALLOC);
 238         }
 239 
 240         if (len < 0) {
 241             break;
 242         }
 243 
 244         // ignore trailing line-feeds: all output lines end with one anyway
 245         const unsigned char* p = line->ptr;
 246         if (len > 0 && p[len - 1] == '\n') {
 247             len--;
 248         }
 249         line->len = len;
 250 
 251         // ignore already-seen lines
 252         if (has(seen, line)) {
 253             continue;
 254         }
 255         if (add(seen, line) == 0) {
 256             fprintf(stderr, ERROR_LINE("out of memory"));
 257             exit(1);
 258             return false;
 259         }
 260 
 261         // emit each distinct line only once, the first time it's seen
 262         fwrite(line->ptr, 1, line->len, w);
 263         fputc('\n', w);
 264     }
 265 
 266     return true;
 267 }
 268 
 269 // handle_file handles data from the filename given; returns false only when
 270 // the file can't be opened
 271 bool handle_file(FILE* w, const char* path, slice* line, string_set* seen) {
 272     FILE* f = fopen(path, "rb");
 273     if (f == NULL) {
 274         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 275         return false;
 276     }
 277 
 278     const bool ok = handle_reader(w, f, line, seen);
 279     fclose(f);
 280     return ok;
 281 }
 282 
 283 // run returns the number of errors
 284 int run(int argc, char** argv, FILE* w) {
 285     slice line;
 286     line.len = 0;
 287     line.cap = line_starting_capacity;
 288     line.ptr = malloc(line.cap);
 289     if (line.ptr == NULL) {
 290         fprintf(stderr, ERROR_LINE("out of memory"));
 291         exit(BAD_ALLOC);
 292     }
 293 
 294     string_set seen;
 295     init_string_set(&seen, bucket_count);
 296     if (seen.chains == NULL) {
 297         fprintf(stderr, ERROR_LINE("out of memory"));
 298         free(line.ptr);
 299         return 1;
 300     }
 301 
 302     size_t dashes = 0;
 303     size_t errors = 0;
 304     for (int i = 1; i < argc && !feof(w); i++) {
 305         if (strcmp(argv[i], "-") == 0) {
 306             dashes++;
 307             if (dashes > 1) {
 308                 continue;
 309             }
 310 
 311             if (!handle_reader(w, stdin, &line, &seen)) {
 312                 errors++;
 313             }
 314             continue;
 315         }
 316 
 317         if (!handle_file(w, argv[i], &line, &seen)) {
 318             errors++;
 319         }
 320     }
 321 
 322     // use stdin when not given any filepaths
 323     if (argc <= 1) {
 324         handle_reader(w, stdin, &line, &seen);
 325     }
 326 
 327     deallocate(&seen);
 328     free(line.ptr);
 329     return errors;
 330 }
 331 
 332 int main(int argc, char** argv) {
 333 #ifdef _WIN32
 334     setmode(fileno(stdin), O_BINARY);
 335     // ensure output lines end in LF instead of CRLF on windows
 336     setmode(fileno(stdout), O_BINARY);
 337     setmode(fileno(stderr), O_BINARY);
 338 #endif
 339 
 340     if (argc > 1) {
 341         if (
 342             strcmp(argv[1], "-h") == 0 ||
 343             strcmp(argv[1], "-help") == 0 ||
 344             strcmp(argv[1], "--h") == 0 ||
 345             strcmp(argv[1], "--help") == 0
 346         ) {
 347             fprintf(stdout, "%s", info);
 348             return 0;
 349         }
 350     }
 351 
 352     return run(argc, argv, stdout) == 0 ? 0 : 1;
 353 }