File: dedup.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./dedup ./dedup.c
  29 */
  30 
  31 #include <stdbool.h>
  32 #include <stdint.h>
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #include <unistd.h>
  37 
  38 #ifdef _WIN32
  39 #include <fcntl.h>
  40 #include <windows.h>
  41 #endif
  42 
  43 #ifdef RED_ERRORS
  44 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  45 #ifdef __APPLE__
  46 #define ERROR_STYLE "\x1b[31m"
  47 #endif
  48 #define RESET_STYLE "\x1b[0m"
  49 #else
  50 #define ERROR_STYLE
  51 #define RESET_STYLE
  52 #endif
  53 
  54 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  55 
  56 #define SIMPLE_HASHING
  57 
  58 #define BAD_ALLOC 2
  59 
  60 const char* info = ""
  61 "dedup [filepaths...]\n"
  62 "\n"
  63 "Deduplicate lines, ensuring each unique input line appears only once on the\n"
  64 "output. Unlike the standard cmd-line app `uniq`, which only works correctly\n"
  65 "when given sorted lines, this tool works correctly without any scrambling.\n"
  66 "";
  67 
  68 // set_growth is the growth factor to use when rehashing a set
  69 const size_t set_growth = 2;
  70 
  71 // prime numbers right under various binary-kb values:
  72 // 16381, 32749, 65521, 131071
  73 
  74 // slice is a growable region of bytes in memory
  75 typedef struct slice {
  76     // ptr is the starting place of the region
  77     unsigned char* ptr;
  78 
  79     // len is how many bytes are currently being used
  80     size_t len;
  81 
  82     // cap is how many bytes the memory region has available
  83     size_t cap;
  84 } slice;
  85 
  86 // span is a slice without a capacity
  87 typedef struct span {
  88     unsigned char* ptr;
  89     size_t len;
  90 } span;
  91 
  92 // string_bucket is any of the buckets used in struct string_set; these
  93 // are supposed to be overallocated to fit the null-terminated string
  94 // right after the official fields
  95 typedef struct string_bucket {
  96     // next is the next in the bucket-chain
  97     struct string_bucket* next;
  98 
  99     // len is how many bytes are in the string
 100     size_t len;
 101 
 102     // the null-terminated string follows the fields like a ghost,
 103     // as c89 doesn't support trailing fake-field arrays in a struct
 104 } string_bucket;
 105 
 106 // string_set is a hash-table with no associated values, thus acting as a set
 107 typedef struct string_set {
 108     // chains are the starting points of linked-lists where bucket values
 109     // all hash the same, the hash value being the index used here
 110     string_bucket** chains;
 111 
 112     // count is how many items there are
 113     size_t count;
 114 
 115     // cap is how many bucket-chains there are
 116     size_t cap;
 117 } string_set;
 118 
 119 void init_string_set(string_set* set, size_t cap) {
 120     set->count = 0;
 121     set->chains = malloc(sizeof(string_bucket*) * cap);
 122     if (set->chains == NULL) {
 123         set->cap = 0;
 124         return;
 125     }
 126     set->cap = cap;
 127 
 128     for (size_t i = 0; i < cap; i++) {
 129         set->chains[i] = NULL;
 130     }
 131 }
 132 
 133 void deallocate(string_set* set) {
 134     for (size_t i = 0; i < set->cap; i++) {
 135         string_bucket* next = NULL;
 136         for (string_bucket* b = set->chains[i]; b != NULL; b = next) {
 137             next = b->next;
 138             free(b);
 139         }
 140     }
 141 
 142     free(set->chains);
 143 }
 144 
 145 // djb2(33) http://www.cse.yorku.ca/~oz/hash.html
 146 // size_t hash(const unsigned char* s) {
 147 //     size_t hash = 5381;
 148 //     for (size_t i = 0; s[i] != 0; i++) {
 149 //         hash = (33 * hash) ^ s[i];
 150 //     }
 151 //     return hash;
 152 // }
 153 
 154 // sdbm(65599) http://www.cse.yorku.ca/~oz/hash.html
 155 size_t hash(const unsigned char* s) {
 156     size_t hash = 0;
 157     for (size_t i = 0; s[i] != 0; i++) {
 158         hash = (65599 * hash) + s[i];
 159     }
 160     return hash;
 161 }
 162 
 163 size_t index_chain(const string_set* set, const unsigned char* s) {
 164     return hash(s) % set->cap;
 165 }
 166 
 167 unsigned char* get_string(const string_bucket* b) {
 168     return (unsigned char*)((size_t)b + sizeof(*b));
 169 }
 170 
 171 bool check(const string_bucket* b, const span s) {
 172     if (s.len != b->len) {
 173         return false;
 174     }
 175 
 176     size_t len = s.len;
 177     const unsigned char* x = s.ptr;
 178     const unsigned char* y = get_string(b);
 179     for (size_t i = 0; i < len; i++) {
 180         if (x[i] != y[i]) {
 181             return false;
 182         }
 183     }
 184     return true;
 185 }
 186 
 187 bool rehash(string_set* set) {
 188     string_set larger;
 189     init_string_set(&larger, set_growth * set->cap);
 190     if (larger.chains == NULL) {
 191         return false;
 192     }
 193 
 194     for (size_t i = 0; i < set->cap; i++) {
 195         string_bucket* next = NULL;
 196 
 197         for (string_bucket* b = set->chains[i]; b != NULL; b = next) {
 198             size_t j = index_chain(&larger, get_string(b));
 199 
 200             next = b->next;
 201             b->next = NULL;
 202 
 203             string_bucket* dest = larger.chains[j];
 204             if (dest == NULL) {
 205                 larger.chains[j] = b;
 206                 continue;
 207             }
 208 
 209             while (true) {
 210                 if (dest->next == NULL) {
 211                     dest->next = b;
 212                     break;
 213                 }
 214                 dest = dest->next;
 215             }
 216         }
 217     }
 218 
 219     free(set->chains);
 220     set->chains = larger.chains;
 221     set->cap = larger.cap;
 222     return true;
 223 }
 224 
 225 // add returns null only when an allocation failed
 226 string_bucket* add(string_set* set, const span s, bool* already_there) {
 227     size_t i = index_chain(set, s.ptr);
 228     string_bucket* last = NULL;
 229 
 230     *already_there = false;
 231     for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) {
 232         if (check(b, s)) {
 233             *already_there = true;
 234             return b;
 235         }
 236         last = b;
 237     }
 238 
 239     string_bucket* next = malloc(sizeof(string_bucket) + s.len + 1);
 240     if (next == NULL) {
 241         return NULL;
 242     }
 243 
 244     memcpy(get_string(next), s.ptr, s.len);
 245     next->len = s.len;
 246     next->next = NULL;
 247     set->count++;
 248 
 249     if (last == NULL) {
 250         set->chains[i] = next;
 251     } else {
 252         last->next = next;
 253     }
 254 
 255     if (set->count > set_growth * set->cap) {
 256         if (!rehash(set)) {
 257             free(next);
 258             return NULL;
 259         }
 260     }
 261     return next;
 262 }
 263 
 264 bool has(const string_set* set, const span s) {
 265     size_t i = index_chain(set, s.ptr);
 266     for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) {
 267         if (check(b, s)) {
 268             return true;
 269         }
 270     }
 271     return false;
 272 }
 273 
 274 bool handle_reader(FILE* w, FILE* r, slice* line, string_set* seen) {
 275     if (line->ptr == NULL || seen->chains == NULL) {
 276         return false;
 277     }
 278 
 279     for (size_t i = 0; !feof(w); i++) {
 280         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 281         if (line->ptr == NULL) {
 282             fprintf(stderr, "\n");
 283             fprintf(stderr, ERROR_LINE("out of memory"));
 284             exit(BAD_ALLOC);
 285             return false;
 286         }
 287 
 288         if (len < 0) {
 289             break;
 290         }
 291 
 292         // ignore trailing line-feeds: all output lines end with one anyway
 293         const unsigned char* p = line->ptr;
 294         if (len > 0 && p[len - 1] == '\n') {
 295             len--;
 296         }
 297         // line->len = len;
 298 
 299         span s;
 300         s.ptr = line->ptr;
 301         s.len = len;
 302         bool already_there = false;
 303 
 304         if (add(seen, s, &already_there) == NULL) {
 305             fprintf(stderr, "\n");
 306             fprintf(stderr, ERROR_LINE("out of memory"));
 307             exit(BAD_ALLOC);
 308             return false;
 309         }
 310 
 311         // ignore already-seen lines
 312         if (already_there) {
 313             continue;
 314         }
 315 
 316         // emit each distinct line only once, the first time it's seen
 317         line->ptr[len++] = '\n';
 318         fwrite(line->ptr, 1, len, w);
 319     }
 320 
 321     return true;
 322 }
 323 
 324 // handle_file handles data from the filename given; returns false only when
 325 // the file can't be opened
 326 bool handle_file(FILE* w, const char* path, slice* line, string_set* seen) {
 327     FILE* f = fopen(path, "rb");
 328     if (f == NULL) {
 329         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 330         return false;
 331     }
 332 
 333     const bool ok = handle_reader(w, f, line, seen);
 334     fclose(f);
 335     return ok;
 336 }
 337 
 338 // run returns the number of errors
 339 int run(int argc, char** argv, FILE* w) {
 340     size_t start = 1;
 341     if (start < argc && strcmp(argv[start], "--") == 0) {
 342         start++;
 343     }
 344 
 345     slice line;
 346     line.len = 0;
 347     line.cap = 32 * 1024;
 348     line.ptr = malloc(line.cap);
 349     if (line.ptr == NULL) {
 350         fprintf(stderr, ERROR_LINE("out of memory"));
 351         exit(BAD_ALLOC);
 352     }
 353 
 354     string_set files;
 355     init_string_set(&files, argc - start);
 356     if (files.chains == NULL) {
 357         fprintf(stderr, ERROR_LINE("out of memory"));
 358         exit(BAD_ALLOC);
 359     }
 360 
 361     string_set seen;
 362     init_string_set(&seen, 64 * 1024);
 363     if (seen.chains == NULL) {
 364         fprintf(stderr, ERROR_LINE("out of memory"));
 365         exit(BAD_ALLOC);
 366     }
 367 
 368     size_t errors = 0;
 369     bool already_opened = false;
 370 
 371     for (int i = start; i < argc && !feof(w); i++) {
 372         span s;
 373         s.ptr = (unsigned char*)argv[i];
 374         s.len = strlen(argv[i]);
 375 
 376         if (add(&files, s, &already_opened) == NULL) {
 377             fprintf(stderr, ERROR_LINE("out of memory"));
 378             exit(BAD_ALLOC);
 379         }
 380 
 381         // avoid reopening the same files, as all lines will be ignored anyway
 382         if (already_opened) {
 383             continue;
 384         }
 385 
 386         if (strcmp(argv[i], "-") == 0) {
 387             if (!handle_reader(w, stdin, &line, &seen)) {
 388                 errors++;
 389             }
 390             continue;
 391         }
 392 
 393         if (!handle_file(w, argv[i], &line, &seen)) {
 394             errors++;
 395         }
 396     }
 397 
 398     // use stdin when not given any filepaths
 399     if (argc <= 1) {
 400         handle_reader(w, stdin, &line, &seen);
 401     }
 402 
 403     // don't bother deallocating things, since the app is done
 404     exit(errors == 0 ? 0 : 1);
 405     return errors;
 406 }
 407 
 408 int main(int argc, char** argv) {
 409 #ifdef _WIN32
 410     setmode(fileno(stdin), O_BINARY);
 411     // ensure output lines end in LF instead of CRLF on windows
 412     setmode(fileno(stdout), O_BINARY);
 413     setmode(fileno(stderr), O_BINARY);
 414 #endif
 415 
 416     if (argc > 1) {
 417         if (
 418             strcmp(argv[1], "-h") == 0 ||
 419             strcmp(argv[1], "-help") == 0 ||
 420             strcmp(argv[1], "--h") == 0 ||
 421             strcmp(argv[1], "--help") == 0
 422         ) {
 423             fprintf(stdout, "%s", info);
 424             return 0;
 425         }
 426     }
 427 
 428     const bool live_lines = lseek(fileno(stdout), 0, SEEK_CUR) != 0;
 429     if (live_lines) {
 430         setvbuf(stdout, NULL, _IOLBF, 0);
 431     } else {
 432         setvbuf(stdout, NULL, _IOFBF, 0);
 433     }
 434     return run(argc, argv, stdout) == 0 ? 0 : 1;
 435 }