File: dedup.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O3 -flto -o ./dedup ./dedup.c
  29 */
  30 
  31 #include <stdbool.h>
  32 #include <stdint.h>
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 
  37 #ifdef _WIN32
  38 #include <fcntl.h>
  39 #include <windows.h>
  40 #endif
  41 
  42 #ifdef RED_ERRORS
  43 #define ERROR_STYLE "\x1b[38;2;204;0;0m"
  44 #ifdef __APPLE__
  45 #define ERROR_STYLE "\x1b[31m"
  46 #endif
  47 #define RESET_STYLE "\x1b[0m"
  48 #else
  49 #define ERROR_STYLE
  50 #define RESET_STYLE
  51 #endif
  52 
  53 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")
  54 
  55 #define SIMPLE_HASHING
  56 
  57 const char* info = ""
  58 "dedup [filepaths...]\n"
  59 "\n"
  60 "Deduplicate lines, ensuring each unique input line appears only once on the\n"
  61 "output. Unlike the standard cmd-line app `uniq`, which only works correctly\n"
  62 "when given sorted lines, this tool works correctly without any scrambling.\n"
  63 "";
  64 
  65 const uint64_t line_starting_capacity = 32 * 1024;
  66 
  67 // prime numbers right under various binary-kb values:
  68 // 16381, 32749, 65521, 131071
  69 
  70 // bucket_count is how many buckets the set/hash-table allocates top-level:
  71 // it's a prime number to minimize hashing collisions
  72 const uint64_t bucket_count = 32749;
  73 
  74 // slice is a growable region of bytes in memory
  75 typedef struct slice {
  76     // ptr is the starting place of the region
  77     unsigned char* ptr;
  78 
  79     // len is how many bytes are currently being used
  80     size_t len;
  81 
  82     // cap is how many bytes the memory region has available
  83     size_t cap;
  84 } slice;
  85 
  86 size_t capacity_for(size_t len) {
  87     size_t cap = 16;
  88     while (cap < len) {
  89         cap *= 2;
  90     }
  91     return cap;
  92 }
  93 
  94 // string_bucket is any of the buckets used in struct string_set
  95 typedef struct string_bucket {
  96     // ptr is where the string starts
  97     unsigned char* ptr;
  98 
  99     // len is how many bytes are in the string
 100     size_t len;
 101 
 102     // next is the next in the bucket-chain
 103     struct string_bucket* next;
 104 } string_bucket;
 105 
 106 // string_set is a hash-table with no values, thus acting as a set
 107 typedef struct string_set {
 108     string_bucket** chains;
 109 
 110     // cap is how many bucket-chains there are
 111     size_t cap;
 112 } string_set;
 113 
 114 void init_string_set(string_set* set, size_t cap) {
 115     set->chains = malloc(sizeof(string_bucket*) * cap);
 116     if (set->chains == NULL) {
 117         set->cap = 0;
 118         return;
 119     }
 120     set->cap = bucket_count;
 121 
 122     for (size_t i = 0; i < cap; i++) {
 123         set->chains[i] = NULL;
 124     }
 125 }
 126 
 127 void deallocate(string_set* set) {
 128     for (size_t i = 0; i < set->cap; i++) {
 129         string_bucket* next = NULL;
 130         for (string_bucket* b = set->chains[i]; b != NULL; b = next) {
 131             next = b->next;
 132             free(b->ptr);
 133             free(b);
 134         }
 135     }
 136 
 137     free(set->chains);
 138 }
 139 
 140 #ifdef SIMPLE_HASHING
 141 size_t index_chain(const string_set* set, const slice* s) {
 142     size_t hash = 0;
 143     const unsigned char* p = s->ptr;
 144     for (size_t i = 0; i < s->len; i++) {
 145         hash *= 31;
 146         hash += p[i];
 147     }
 148     return hash % set->cap;
 149 }
 150 #else
 151 size_t index_chain(const string_set* set, const slice* s) {
 152     uint64_t k = 1;
 153     size_t hash = 0;
 154     const unsigned char* p = s->ptr;
 155     // const uint64_t r = 15485863;
 156     const uint64_t r = 86028121;
 157     for (size_t i = 0; i < s->len; i++) {
 158         hash += k * p[i];
 159         hash %= r;
 160         k *= 31;
 161         k %= r;
 162     }
 163     return hash % set->cap;
 164 }
 165 #endif
 166 
 167 bool check(const string_bucket* b, const slice* s) {
 168     if (s->len != b->len) {
 169         return false;
 170     }
 171 
 172     size_t len = s->len;
 173     for (size_t i = 0; i < len; i++) {
 174         if (s->ptr[i] != b->ptr[i]) {
 175             return false;
 176         }
 177     }
 178     return true;
 179 }
 180 
 181 // add returns 0 only when an allocation failed
 182 string_bucket* add(string_set* set, const slice* s) {
 183     size_t i = index_chain(set, s);
 184     string_bucket* last = NULL;
 185 
 186     for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) {
 187         if (check(b, s)) {
 188             return b;
 189         }
 190         last = b;
 191     }
 192 
 193     unsigned char* copy = malloc(s->len);
 194     if (copy == NULL) {
 195         return NULL;
 196     }
 197     memcpy(copy, s->ptr, s->len);
 198 
 199     string_bucket* next = malloc(sizeof(string_bucket));
 200     if (next == NULL) {
 201         free(copy);
 202         return NULL;
 203     }
 204     next->ptr = copy;
 205     next->len = s->len;
 206     next->next = NULL;
 207 
 208     if (last == NULL) {
 209         set->chains[i] = next;
 210     } else {
 211         last->next = next;
 212     }
 213     return next;
 214 }
 215 
 216 bool has(const string_set* set, const slice* s) {
 217     size_t i = index_chain(set, s);
 218 
 219     for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) {
 220         if (check(b, s)) {
 221             return b;
 222         }
 223     }
 224     return false;
 225 }
 226 
 227 bool handle_reader(FILE* w, FILE* r, slice* line, string_set* seen) {
 228     if (line->ptr == NULL || seen->chains == NULL) {
 229         return false;
 230     }
 231 
 232     for (size_t i = 0; !feof(w); i++) {
 233         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 234         if (line->ptr == NULL) {
 235             fprintf(stderr, ERROR_LINE("out of memory"));
 236             return false;
 237         }
 238 
 239         if (len < 0) {
 240             break;
 241         }
 242 
 243         // ignore trailing line-feeds: all output lines end with one anyway
 244         const unsigned char* p = line->ptr;
 245         if (len > 0 && p[len - 1] == '\n') {
 246             len--;
 247         }
 248         line->len = len;
 249 
 250         // ignore already-seen lines
 251         if (has(seen, line)) {
 252             continue;
 253         }
 254         if (add(seen, line) == 0) {
 255             fprintf(stderr, ERROR_LINE("out of memory"));
 256             exit(1);
 257             return false;
 258         }
 259 
 260         // emit each distinct line only once, the first time it's seen
 261         fwrite(line->ptr, line->len, 1, w);
 262         fputc('\n', w);
 263     }
 264 
 265     return true;
 266 }
 267 
 268 // handle_file handles data from the filename given; returns false only when
 269 // the file can't be opened
 270 bool handle_file(FILE* w, const char* path, slice* line, string_set* seen) {
 271     FILE* f = fopen(path, "rb");
 272     if (f == NULL) {
 273         fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
 274         return false;
 275     }
 276 
 277     const bool ok = handle_reader(w, f, line, seen);
 278     fclose(f);
 279     return ok;
 280 }
 281 
 282 // run returns the number of errors
 283 int run(int argc, char** argv, FILE* w) {
 284     slice line;
 285     line.len = 0;
 286     line.cap = line_starting_capacity;
 287     line.ptr = malloc(line.cap);
 288     if (line.ptr == NULL) {
 289         fprintf(stderr, ERROR_LINE("out of memory"));
 290         return 1;
 291     }
 292 
 293     string_set seen;
 294     init_string_set(&seen, bucket_count);
 295     if (seen.chains == NULL) {
 296         fprintf(stderr, ERROR_LINE("out of memory"));
 297         free(line.ptr);
 298         return 1;
 299     }
 300 
 301     size_t dashes = 0;
 302     size_t errors = 0;
 303     for (int i = 1; i < argc && !feof(w); i++) {
 304         if (argv[i][0] == '-' && argv[i][1] == 0) {
 305             dashes++;
 306             if (dashes > 1) {
 307                 continue;
 308             }
 309 
 310             if (!handle_reader(w, stdin, &line, &seen)) {
 311                 errors++;
 312             }
 313             continue;
 314         }
 315 
 316         if (!handle_file(w, argv[i], &line, &seen)) {
 317             errors++;
 318         }
 319     }
 320 
 321     // use stdin when not given any filepaths
 322     if (argc <= 1) {
 323         handle_reader(w, stdin, &line, &seen);
 324     }
 325 
 326     deallocate(&seen);
 327     free(line.ptr);
 328     return errors;
 329 }
 330 
 331 int main(int argc, char** argv) {
 332 #ifdef _WIN32
 333     setmode(fileno(stdin), O_BINARY);
 334     // ensure output lines end in LF instead of CRLF on windows
 335     setmode(fileno(stdout), O_BINARY);
 336     setmode(fileno(stderr), O_BINARY);
 337 #endif
 338 
 339     if (argc > 1) {
 340         if (
 341             strcmp(argv[1], "-h") == 0 ||
 342             strcmp(argv[1], "-help") == 0 ||
 343             strcmp(argv[1], "--h") == 0 ||
 344             strcmp(argv[1], "--help") == 0
 345         ) {
 346             fprintf(stdout, "%s", info);
 347             return 0;
 348         }
 349     }
 350 
 351     return run(argc, argv, stdout) == 0 ? 0 : 1;
 352 }