File: dedup.c 1 /* 2 The MIT License (MIT) 3 4 Copyright © 2020-2025 pacman64 5 6 Permission is hereby granted, free of charge, to any person obtaining a copy of 7 this software and associated documentation files (the “Software”), to deal 8 in the Software without restriction, including without limitation the rights to 9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 of the Software, and to permit persons to whom the Software is furnished to do 11 so, subject to the following conditions: 12 13 The above copyright notice and this permission notice shall be included in all 14 copies or substantial portions of the Software. 15 16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 SOFTWARE. 23 */ 24 25 /* 26 You can build this command-line app by running 27 28 cc -Wall -s -O3 -march=native -mtune=native -flto -o ./dedup ./dedup.c 29 */ 30 31 #include <stdbool.h> 32 #include <stdint.h> 33 #include <stdio.h> 34 #include <stdlib.h> 35 #include <string.h> 36 #include <unistd.h> 37 38 #ifdef _WIN32 39 #include <fcntl.h> 40 #include <windows.h> 41 #endif 42 43 #ifdef RED_ERRORS 44 #define ERROR_STYLE "\x1b[38;2;204;0;0m" 45 #ifdef __APPLE__ 46 #define ERROR_STYLE "\x1b[31m" 47 #endif 48 #define RESET_STYLE "\x1b[0m" 49 #else 50 #define ERROR_STYLE 51 #define RESET_STYLE 52 #endif 53 54 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n") 55 56 #define SIMPLE_HASHING 57 58 #define BAD_ALLOC 2 59 60 const char* info = "" 61 "dedup [filepaths...]\n" 62 "\n" 63 "Deduplicate lines, ensuring each unique input line appears only once on the\n" 64 "output. Unlike the standard cmd-line app `uniq`, which only works correctly\n" 65 "when given sorted lines, this tool works correctly without any scrambling.\n" 66 ""; 67 68 // set_growth is the growth factor to use when rehashing a set 69 const size_t set_growth = 2; 70 71 // prime numbers right under various binary-kb values: 72 // 16381, 32749, 65521, 131071 73 74 // slice is a growable region of bytes in memory 75 typedef struct slice { 76 // ptr is the starting place of the region 77 unsigned char* ptr; 78 79 // len is how many bytes are currently being used 80 size_t len; 81 82 // cap is how many bytes the memory region has available 83 size_t cap; 84 } slice; 85 86 // span is a slice without a capacity 87 typedef struct span { 88 unsigned char* ptr; 89 size_t len; 90 } span; 91 92 // string_bucket is any of the buckets used in struct string_set; these 93 // are supposed to be overallocated to fit the null-terminated string 94 // right after the official fields 95 typedef struct string_bucket { 96 // next is the next in the bucket-chain 97 struct string_bucket* next; 98 99 // len is how many bytes are in the string 100 size_t len; 101 102 // the null-terminated string follows the fields like a ghost, 103 // as c89 doesn't support trailing fake-field arrays in a struct 104 } string_bucket; 105 106 // string_set is a hash-table with no associated values, thus acting as a set 107 typedef struct string_set { 108 // chains are the starting points of linked-lists where bucket values 109 // all hash the same, the hash value being the index used here 110 string_bucket** chains; 111 112 // count is how many items there are 113 size_t count; 114 115 // cap is how many bucket-chains there are 116 size_t cap; 117 } string_set; 118 119 void init_string_set(string_set* set, size_t cap) { 120 set->count = 0; 121 set->chains = malloc(sizeof(string_bucket*) * cap); 122 if (set->chains == NULL) { 123 set->cap = 0; 124 return; 125 } 126 set->cap = cap; 127 128 for (size_t i = 0; i < cap; i++) { 129 set->chains[i] = NULL; 130 } 131 } 132 133 void deallocate(string_set* set) { 134 for (size_t i = 0; i < set->cap; i++) { 135 string_bucket* next = NULL; 136 for (string_bucket* b = set->chains[i]; b != NULL; b = next) { 137 next = b->next; 138 free(b); 139 } 140 } 141 142 free(set->chains); 143 } 144 145 // djb2(33) http://www.cse.yorku.ca/~oz/hash.html 146 // size_t hash(const unsigned char* s) { 147 // size_t hash = 5381; 148 // for (size_t i = 0; s[i] != 0; i++) { 149 // hash = (33 * hash) ^ s[i]; 150 // } 151 // return hash; 152 // } 153 154 // sdbm(65599) http://www.cse.yorku.ca/~oz/hash.html 155 size_t hash(const unsigned char* s) { 156 size_t hash = 0; 157 for (size_t i = 0; s[i] != 0; i++) { 158 hash = (65599 * hash) + s[i]; 159 } 160 return hash; 161 } 162 163 size_t index_chain(const string_set* set, const unsigned char* s) { 164 return hash(s) % set->cap; 165 } 166 167 unsigned char* get_string(const string_bucket* b) { 168 return (unsigned char*)((size_t)b + sizeof(*b)); 169 } 170 171 bool check(const string_bucket* b, const span s) { 172 if (s.len != b->len) { 173 return false; 174 } 175 176 size_t len = s.len; 177 const unsigned char* x = s.ptr; 178 const unsigned char* y = get_string(b); 179 for (size_t i = 0; i < len; i++) { 180 if (x[i] != y[i]) { 181 return false; 182 } 183 } 184 return true; 185 } 186 187 bool rehash(string_set* set) { 188 string_set larger; 189 init_string_set(&larger, set_growth * set->cap); 190 if (larger.chains == NULL) { 191 return false; 192 } 193 194 for (size_t i = 0; i < set->cap; i++) { 195 string_bucket* next = NULL; 196 197 for (string_bucket* b = set->chains[i]; b != NULL; b = next) { 198 size_t j = index_chain(&larger, get_string(b)); 199 200 next = b->next; 201 b->next = NULL; 202 203 string_bucket* dest = larger.chains[j]; 204 if (dest == NULL) { 205 larger.chains[j] = b; 206 continue; 207 } 208 209 while (true) { 210 if (dest->next == NULL) { 211 dest->next = b; 212 break; 213 } 214 dest = dest->next; 215 } 216 } 217 } 218 219 free(set->chains); 220 set->chains = larger.chains; 221 set->cap = larger.cap; 222 return true; 223 } 224 225 // add returns null only when an allocation failed 226 string_bucket* add(string_set* set, const span s, bool* already_there) { 227 size_t i = index_chain(set, s.ptr); 228 string_bucket* last = NULL; 229 230 *already_there = false; 231 for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) { 232 if (check(b, s)) { 233 *already_there = true; 234 return b; 235 } 236 last = b; 237 } 238 239 string_bucket* next = malloc(sizeof(string_bucket) + s.len + 1); 240 if (next == NULL) { 241 return NULL; 242 } 243 244 memcpy(get_string(next), s.ptr, s.len); 245 next->len = s.len; 246 next->next = NULL; 247 set->count++; 248 249 if (last == NULL) { 250 set->chains[i] = next; 251 } else { 252 last->next = next; 253 } 254 255 if (set->count > set_growth * set->cap) { 256 if (!rehash(set)) { 257 free(next); 258 return NULL; 259 } 260 } 261 return next; 262 } 263 264 bool has(const string_set* set, const span s) { 265 size_t i = index_chain(set, s.ptr); 266 for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) { 267 if (check(b, s)) { 268 return true; 269 } 270 } 271 return false; 272 } 273 274 bool handle_reader(FILE* w, FILE* r, slice* line, string_set* seen) { 275 if (line->ptr == NULL || seen->chains == NULL) { 276 return false; 277 } 278 279 for (size_t i = 0; !feof(w); i++) { 280 ssize_t len = getline((char**)&line->ptr, &line->cap, r); 281 if (line->ptr == NULL) { 282 fprintf(stderr, "\n"); 283 fprintf(stderr, ERROR_LINE("out of memory")); 284 exit(BAD_ALLOC); 285 return false; 286 } 287 288 if (len < 0) { 289 break; 290 } 291 292 // ignore trailing line-feeds: all output lines end with one anyway 293 const unsigned char* p = line->ptr; 294 if (len > 0 && p[len - 1] == '\n') { 295 len--; 296 } 297 // line->len = len; 298 299 span s; 300 s.ptr = line->ptr; 301 s.len = len; 302 bool already_there = false; 303 304 if (add(seen, s, &already_there) == NULL) { 305 fprintf(stderr, "\n"); 306 fprintf(stderr, ERROR_LINE("out of memory")); 307 exit(BAD_ALLOC); 308 return false; 309 } 310 311 // ignore already-seen lines 312 if (already_there) { 313 continue; 314 } 315 316 // emit each distinct line only once, the first time it's seen 317 line->ptr[len++] = '\n'; 318 fwrite(line->ptr, 1, len, w); 319 } 320 321 return true; 322 } 323 324 // handle_file handles data from the filename given; returns false only when 325 // the file can't be opened 326 bool handle_file(FILE* w, const char* path, slice* line, string_set* seen) { 327 FILE* f = fopen(path, "rb"); 328 if (f == NULL) { 329 fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path); 330 return false; 331 } 332 333 const bool ok = handle_reader(w, f, line, seen); 334 fclose(f); 335 return ok; 336 } 337 338 // run returns the number of errors 339 int run(int argc, char** argv, FILE* w) { 340 size_t start = 1; 341 if (start < argc && strcmp(argv[start], "--") == 0) { 342 start++; 343 } 344 345 slice line; 346 line.len = 0; 347 line.cap = 32 * 1024; 348 line.ptr = malloc(line.cap); 349 if (line.ptr == NULL) { 350 fprintf(stderr, ERROR_LINE("out of memory")); 351 exit(BAD_ALLOC); 352 } 353 354 string_set files; 355 init_string_set(&files, argc - start); 356 if (files.chains == NULL) { 357 fprintf(stderr, ERROR_LINE("out of memory")); 358 exit(BAD_ALLOC); 359 } 360 361 string_set seen; 362 init_string_set(&seen, 64 * 1024); 363 if (seen.chains == NULL) { 364 fprintf(stderr, ERROR_LINE("out of memory")); 365 exit(BAD_ALLOC); 366 } 367 368 size_t errors = 0; 369 bool already_opened = false; 370 371 for (int i = start; i < argc && !feof(w); i++) { 372 span s; 373 s.ptr = (unsigned char*)argv[i]; 374 s.len = strlen(argv[i]); 375 376 if (add(&files, s, &already_opened) == NULL) { 377 fprintf(stderr, ERROR_LINE("out of memory")); 378 exit(BAD_ALLOC); 379 } 380 381 // avoid reopening the same files, as all lines will be ignored anyway 382 if (already_opened) { 383 continue; 384 } 385 386 if (strcmp(argv[i], "-") == 0) { 387 if (!handle_reader(w, stdin, &line, &seen)) { 388 errors++; 389 } 390 continue; 391 } 392 393 if (!handle_file(w, argv[i], &line, &seen)) { 394 errors++; 395 } 396 } 397 398 // use stdin when not given any filepaths 399 if (argc <= 1) { 400 handle_reader(w, stdin, &line, &seen); 401 } 402 403 // don't bother deallocating things, since the app is done 404 exit(errors == 0 ? 0 : 1); 405 return errors; 406 } 407 408 int main(int argc, char** argv) { 409 #ifdef _WIN32 410 setmode(fileno(stdin), O_BINARY); 411 // ensure output lines end in LF instead of CRLF on windows 412 setmode(fileno(stdout), O_BINARY); 413 setmode(fileno(stderr), O_BINARY); 414 #endif 415 416 if (argc > 1) { 417 if ( 418 strcmp(argv[1], "-h") == 0 || 419 strcmp(argv[1], "-help") == 0 || 420 strcmp(argv[1], "--h") == 0 || 421 strcmp(argv[1], "--help") == 0 422 ) { 423 fprintf(stdout, "%s", info); 424 return 0; 425 } 426 } 427 428 const bool live_lines = lseek(fileno(stdout), 0, SEEK_CUR) != 0; 429 if (live_lines) { 430 setvbuf(stdout, NULL, _IOLBF, 0); 431 } else { 432 setvbuf(stdout, NULL, _IOFBF, 0); 433 } 434 return run(argc, argv, stdout) == 0 ? 0 : 1; 435 }