File: dedup.c
   1 /*
   2 The MIT License (MIT)
   3 
   4 Copyright © 2020-2025 pacman64
   5 
   6 Permission is hereby granted, free of charge, to any person obtaining a copy of
   7 this software and associated documentation files (the “Software”), to deal
   8 in the Software without restriction, including without limitation the rights to
   9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  10 of the Software, and to permit persons to whom the Software is furnished to do
  11 so, subject to the following conditions:
  12 
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15 
  16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22 SOFTWARE.
  23 */
  24 
  25 /*
  26 You can build this command-line app by running
  27 
  28 cc -Wall -s -O2 -o ./dedup ./dedup.c
  29 */
  30 
  31 #include <stdbool.h>
  32 #include <stdint.h>
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 
  37 #ifdef _WIN32
  38 #include <fcntl.h>
  39 #include <windows.h>
  40 #endif
  41 
  42 #define SIMPLE_HASHING
  43 
  44 const char* info = ""
  45 "dedup [filepaths...]\n"
  46 "\n"
  47 "Deduplicate lines, ensuring each unique input line appears only once on the\n"
  48 "output. Unlike the standard cmd-line app `uniq`, which only works correctly\n"
  49 "when given sorted lines, this tool works correctly without any scrambling.\n"
  50 "";
  51 
  52 const char* no_line_memory_msg = "can't get enough memory to read lines";
  53 
  54 const char* no_set_memory_msg = "can't get enough memory to tell lines apart";
  55 
  56 const uint64_t line_starting_capacity = 32 * 1024;
  57 
  58 // prime numbers right under various binary-kb values:
  59 // 16381, 32749, 65521, 131071
  60 
  61 // bucket_count is how many buckets the set/hash-table allocates top-level:
  62 // it's a prime number to minimize hashing collisions
  63 const uint64_t bucket_count = 32749;
  64 
  65 // slice is a growable region of bytes in memory
  66 typedef struct slice {
  67     // ptr is the starting place of the region
  68     unsigned char* ptr;
  69 
  70     // len is how many bytes are currently being used
  71     size_t len;
  72 
  73     // cap is how many bytes the memory region has available
  74     size_t cap;
  75 } slice;
  76 
  77 size_t capacity_for(size_t len) {
  78     size_t cap = 16;
  79     while (cap < len) {
  80         cap *= 2;
  81     }
  82     return cap;
  83 }
  84 
  85 // string_bucket is any of the buckets used in struct string_set
  86 typedef struct string_bucket {
  87     // ptr is where the string starts
  88     unsigned char* ptr;
  89 
  90     // len is how many bytes are in the string
  91     size_t len;
  92 
  93     // next is the next in the bucket-chain
  94     struct string_bucket* next;
  95 } string_bucket;
  96 
  97 // string_set is a hash-table with no values, thus acting as a set
  98 typedef struct string_set {
  99     string_bucket** chains;
 100 
 101     // cap is how many bucket-chains there are
 102     size_t cap;
 103 } string_set;
 104 
 105 void init_string_set(string_set* set, size_t cap) {
 106     set->chains = malloc(sizeof(string_bucket*) * cap);
 107     if (set->chains == NULL) {
 108         set->cap = 0;
 109         return;
 110     }
 111     set->cap = bucket_count;
 112 
 113     for (size_t i = 0; i < cap; i++) {
 114         set->chains[i] = NULL;
 115     }
 116 }
 117 
 118 void deallocate(string_set* set) {
 119     for (size_t i = 0; i < set->cap; i++) {
 120         string_bucket* next = NULL;
 121         for (string_bucket* b = set->chains[i]; b != NULL; b = next) {
 122             next = b->next;
 123             free(b->ptr);
 124             free(b);
 125         }
 126     }
 127 
 128     free(set->chains);
 129 }
 130 
 131 #ifdef SIMPLE_HASHING
 132 size_t index_chain(const string_set* set, const slice* s) {
 133     size_t hash = 0;
 134     const unsigned char* p = s->ptr;
 135     for (size_t i = 0; i < s->len; i++) {
 136         hash *= 31;
 137         hash += p[i];
 138     }
 139     return hash % set->cap;
 140 }
 141 #else
 142 size_t index_chain(const string_set* set, const slice* s) {
 143     uint64_t k = 1;
 144     size_t hash = 0;
 145     const unsigned char* p = s->ptr;
 146     // const uint64_t r = 15485863;
 147     const uint64_t r = 86028121;
 148     for (size_t i = 0; i < s->len; i++) {
 149         hash += k * p[i];
 150         hash %= r;
 151         k *= 31;
 152         k %= r;
 153     }
 154     return hash % set->cap;
 155 }
 156 #endif
 157 
 158 bool check(const string_bucket* b, const slice* s) {
 159     if (s->len != b->len) {
 160         return false;
 161     }
 162 
 163     size_t len = s->len;
 164     for (size_t i = 0; i < len; i++) {
 165         if (s->ptr[i] != b->ptr[i]) {
 166             return false;
 167         }
 168     }
 169     return true;
 170 }
 171 
 172 // add returns 0 only when an allocation failed
 173 string_bucket* add(string_set* set, const slice* s) {
 174     size_t i = index_chain(set, s);
 175     string_bucket* last = NULL;
 176 
 177     for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) {
 178         if (check(b, s)) {
 179             return b;
 180         }
 181         last = b;
 182     }
 183 
 184     unsigned char* copy = malloc(s->len);
 185     if (copy == NULL) {
 186         return NULL;
 187     }
 188     memcpy(copy, s->ptr, s->len);
 189 
 190     string_bucket* next = malloc(sizeof(string_bucket));
 191     if (next == NULL) {
 192         free(copy);
 193         return NULL;
 194     }
 195     next->ptr = copy;
 196     next->len = s->len;
 197     next->next = NULL;
 198 
 199     if (last == NULL) {
 200         set->chains[i] = next;
 201     } else {
 202         last->next = next;
 203     }
 204     return next;
 205 }
 206 
 207 bool has(const string_set* set, const slice* s) {
 208     size_t i = index_chain(set, s);
 209 
 210     for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) {
 211         if (check(b, s)) {
 212             return b;
 213         }
 214     }
 215     return false;
 216 }
 217 
 218 bool handle_reader(FILE* w, FILE* r, slice* line, string_set* seen) {
 219     if (line->ptr == NULL || seen->chains == NULL) {
 220         return false;
 221     }
 222 
 223     for (size_t i = 0; !feof(w); i++) {
 224         ssize_t len = getline((char**)&line->ptr, &line->cap, r);
 225         if (len < 0) {
 226             break;
 227         }
 228 
 229         if (line->ptr == NULL) {
 230             putc('\n', w);
 231             fprintf(stderr, "\x1b[31m%s\x1b[0m\n", no_line_memory_msg);
 232             return false;
 233         }
 234 
 235         // ignore trailing line-feeds: all output lines end with one anyway
 236         const unsigned char* p = line->ptr;
 237         if (len > 0 && p[len - 1] == '\n') {
 238             len--;
 239         }
 240         line->len = len;
 241 
 242         // ignore already-seen lines
 243         if (has(seen, line)) {
 244             continue;
 245         }
 246         if (add(seen, line) == 0) {
 247             fprintf(stderr, "\x1b[31m%s\x1b[0m\n", no_set_memory_msg);
 248             exit(1);
 249             return false;
 250         }
 251 
 252         // emit each distinct line only once, the first time it's seen
 253         fwrite(line->ptr, line->len, 1, w);
 254         putc('\n', w);
 255         fflush(w);
 256     }
 257 
 258     return true;
 259 }
 260 
 261 // handle_file handles data from the filename given; returns false only when
 262 // the file can't be opened
 263 bool handle_file(FILE* w, const char* fname, slice* line, string_set* seen) {
 264     FILE* f = fopen(fname, "rb");
 265     if (f == NULL) {
 266         fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", fname);
 267         return false;
 268     }
 269 
 270     const bool ok = handle_reader(w, f, line, seen);
 271     fclose(f);
 272     return ok;
 273 }
 274 
 275 // run returns the number of errors
 276 int run(int argc, char** argv, FILE* w) {
 277     slice line;
 278     line.len = 0;
 279     line.cap = line_starting_capacity;
 280     line.ptr = malloc(line.cap);
 281     if (line.ptr == NULL) {
 282         fprintf(stderr, "\x1b[31m%s\x1b[0m\n", no_line_memory_msg);
 283         return 1;
 284     }
 285 
 286     string_set seen;
 287     init_string_set(&seen, bucket_count);
 288     if (seen.chains == NULL) {
 289         fprintf(stderr, "\x1b[31m%s\x1b[0m\n", no_set_memory_msg);
 290         free(line.ptr);
 291         return 1;
 292     }
 293 
 294     size_t errors = 0;
 295     for (int i = 1; i < argc && !feof(stdout); i++) {
 296         if (argv[i][0] == '-' && argv[i][1] == 0) {
 297             if (!handle_reader(w, stdin, &line, &seen)) {
 298                 errors++;
 299             }
 300             continue;
 301         }
 302 
 303         if (!handle_file(w, argv[i], &line, &seen)) {
 304             errors++;
 305         }
 306     }
 307 
 308     // use stdin when not given any filepaths
 309     if (argc <= 1) {
 310         handle_reader(w, stdin, &line, &seen);
 311     }
 312 
 313     deallocate(&seen);
 314     free(line.ptr);
 315     return errors;
 316 }
 317 
 318 int main(int argc, char** argv) {
 319 #ifdef _WIN32
 320     setmode(fileno(stdin), O_BINARY);
 321     // ensure output lines end in LF instead of CRLF on windows
 322     setmode(fileno(stdout), O_BINARY);
 323     setmode(fileno(stderr), O_BINARY);
 324 #endif
 325 
 326     if (argc > 1) {
 327         if (
 328             strcmp(argv[1], "-h") == 0 ||
 329             strcmp(argv[1], "-help") == 0 ||
 330             strcmp(argv[1], "--h") == 0 ||
 331             strcmp(argv[1], "--help") == 0
 332         ) {
 333             fprintf(stdout, "%s", info);
 334             return 0;
 335         }
 336     }
 337 
 338     size_t dashes = 0;
 339     for (int i = 1; i < argc; i++) {
 340         if (argv[i][0] == '-' && argv[i][1] == 0) {
 341             dashes++;
 342         }
 343     }
 344 
 345     if (dashes > 1) {
 346         const char* msg = "can't use the standard input (dash) more than once";
 347         fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg);
 348         return 1;
 349     }
 350 
 351     return run(argc, argv, stdout) == 0 ? 0 : 1;
 352 }