File: dedup.c 1 /* 2 The MIT License (MIT) 3 4 Copyright © 2020-2025 pacman64 5 6 Permission is hereby granted, free of charge, to any person obtaining a copy of 7 this software and associated documentation files (the “Software”), to deal 8 in the Software without restriction, including without limitation the rights to 9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 of the Software, and to permit persons to whom the Software is furnished to do 11 so, subject to the following conditions: 12 13 The above copyright notice and this permission notice shall be included in all 14 copies or substantial portions of the Software. 15 16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 SOFTWARE. 23 */ 24 25 /* 26 You can build this command-line app by running 27 28 cc -Wall -s -O3 -flto -o ./dedup ./dedup.c 29 */ 30 31 #include <stdbool.h> 32 #include <stdint.h> 33 #include <stdio.h> 34 #include <stdlib.h> 35 #include <string.h> 36 37 #ifdef _WIN32 38 #include <fcntl.h> 39 #include <windows.h> 40 #endif 41 42 #ifdef RED_ERRORS 43 #define ERROR_STYLE "\x1b[38;2;204;0;0m" 44 #ifdef __APPLE__ 45 #define ERROR_STYLE "\x1b[31m" 46 #endif 47 #define RESET_STYLE "\x1b[0m" 48 #else 49 #define ERROR_STYLE 50 #define RESET_STYLE 51 #endif 52 53 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n") 54 55 #define SIMPLE_HASHING 56 57 const char* info = "" 58 "dedup [filepaths...]\n" 59 "\n" 60 "Deduplicate lines, ensuring each unique input line appears only once on the\n" 61 "output. Unlike the standard cmd-line app `uniq`, which only works correctly\n" 62 "when given sorted lines, this tool works correctly without any scrambling.\n" 63 ""; 64 65 const uint64_t line_starting_capacity = 32 * 1024; 66 67 // prime numbers right under various binary-kb values: 68 // 16381, 32749, 65521, 131071 69 70 // bucket_count is how many buckets the set/hash-table allocates top-level: 71 // it's a prime number to minimize hashing collisions 72 const uint64_t bucket_count = 32749; 73 74 // slice is a growable region of bytes in memory 75 typedef struct slice { 76 // ptr is the starting place of the region 77 unsigned char* ptr; 78 79 // len is how many bytes are currently being used 80 size_t len; 81 82 // cap is how many bytes the memory region has available 83 size_t cap; 84 } slice; 85 86 size_t capacity_for(size_t len) { 87 size_t cap = 16; 88 while (cap < len) { 89 cap *= 2; 90 } 91 return cap; 92 } 93 94 // string_bucket is any of the buckets used in struct string_set 95 typedef struct string_bucket { 96 // ptr is where the string starts 97 unsigned char* ptr; 98 99 // len is how many bytes are in the string 100 size_t len; 101 102 // next is the next in the bucket-chain 103 struct string_bucket* next; 104 } string_bucket; 105 106 // string_set is a hash-table with no values, thus acting as a set 107 typedef struct string_set { 108 string_bucket** chains; 109 110 // cap is how many bucket-chains there are 111 size_t cap; 112 } string_set; 113 114 void init_string_set(string_set* set, size_t cap) { 115 set->chains = malloc(sizeof(string_bucket*) * cap); 116 if (set->chains == NULL) { 117 set->cap = 0; 118 return; 119 } 120 set->cap = bucket_count; 121 122 for (size_t i = 0; i < cap; i++) { 123 set->chains[i] = NULL; 124 } 125 } 126 127 void deallocate(string_set* set) { 128 for (size_t i = 0; i < set->cap; i++) { 129 string_bucket* next = NULL; 130 for (string_bucket* b = set->chains[i]; b != NULL; b = next) { 131 next = b->next; 132 free(b->ptr); 133 free(b); 134 } 135 } 136 137 free(set->chains); 138 } 139 140 #ifdef SIMPLE_HASHING 141 size_t index_chain(const string_set* set, const slice* s) { 142 size_t hash = 0; 143 const unsigned char* p = s->ptr; 144 for (size_t i = 0; i < s->len; i++) { 145 hash *= 31; 146 hash += p[i]; 147 } 148 return hash % set->cap; 149 } 150 #else 151 size_t index_chain(const string_set* set, const slice* s) { 152 uint64_t k = 1; 153 size_t hash = 0; 154 const unsigned char* p = s->ptr; 155 // const uint64_t r = 15485863; 156 const uint64_t r = 86028121; 157 for (size_t i = 0; i < s->len; i++) { 158 hash += k * p[i]; 159 hash %= r; 160 k *= 31; 161 k %= r; 162 } 163 return hash % set->cap; 164 } 165 #endif 166 167 bool check(const string_bucket* b, const slice* s) { 168 if (s->len != b->len) { 169 return false; 170 } 171 172 size_t len = s->len; 173 for (size_t i = 0; i < len; i++) { 174 if (s->ptr[i] != b->ptr[i]) { 175 return false; 176 } 177 } 178 return true; 179 } 180 181 // add returns 0 only when an allocation failed 182 string_bucket* add(string_set* set, const slice* s) { 183 size_t i = index_chain(set, s); 184 string_bucket* last = NULL; 185 186 for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) { 187 if (check(b, s)) { 188 return b; 189 } 190 last = b; 191 } 192 193 unsigned char* copy = malloc(s->len); 194 if (copy == NULL) { 195 return NULL; 196 } 197 memcpy(copy, s->ptr, s->len); 198 199 string_bucket* next = malloc(sizeof(string_bucket)); 200 if (next == NULL) { 201 free(copy); 202 return NULL; 203 } 204 next->ptr = copy; 205 next->len = s->len; 206 next->next = NULL; 207 208 if (last == NULL) { 209 set->chains[i] = next; 210 } else { 211 last->next = next; 212 } 213 return next; 214 } 215 216 bool has(const string_set* set, const slice* s) { 217 size_t i = index_chain(set, s); 218 219 for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) { 220 if (check(b, s)) { 221 return b; 222 } 223 } 224 return false; 225 } 226 227 bool handle_reader(FILE* w, FILE* r, slice* line, string_set* seen) { 228 if (line->ptr == NULL || seen->chains == NULL) { 229 return false; 230 } 231 232 for (size_t i = 0; !feof(w); i++) { 233 ssize_t len = getline((char**)&line->ptr, &line->cap, r); 234 if (line->ptr == NULL) { 235 fprintf(stderr, "\n"); 236 fprintf(stderr, ERROR_LINE("out of memory")); 237 exit(BAD_ALLOC); 238 } 239 240 if (len < 0) { 241 break; 242 } 243 244 // ignore trailing line-feeds: all output lines end with one anyway 245 const unsigned char* p = line->ptr; 246 if (len > 0 && p[len - 1] == '\n') { 247 len--; 248 } 249 line->len = len; 250 251 // ignore already-seen lines 252 if (has(seen, line)) { 253 continue; 254 } 255 if (add(seen, line) == 0) { 256 fprintf(stderr, ERROR_LINE("out of memory")); 257 exit(1); 258 return false; 259 } 260 261 // emit each distinct line only once, the first time it's seen 262 fwrite(line->ptr, 1, line->len, w); 263 fputc('\n', w); 264 } 265 266 return true; 267 } 268 269 // handle_file handles data from the filename given; returns false only when 270 // the file can't be opened 271 bool handle_file(FILE* w, const char* path, slice* line, string_set* seen) { 272 FILE* f = fopen(path, "rb"); 273 if (f == NULL) { 274 fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path); 275 return false; 276 } 277 278 const bool ok = handle_reader(w, f, line, seen); 279 fclose(f); 280 return ok; 281 } 282 283 // run returns the number of errors 284 int run(int argc, char** argv, FILE* w) { 285 slice line; 286 line.len = 0; 287 line.cap = line_starting_capacity; 288 line.ptr = malloc(line.cap); 289 if (line.ptr == NULL) { 290 fprintf(stderr, ERROR_LINE("out of memory")); 291 exit(BAD_ALLOC); 292 } 293 294 string_set seen; 295 init_string_set(&seen, bucket_count); 296 if (seen.chains == NULL) { 297 fprintf(stderr, ERROR_LINE("out of memory")); 298 free(line.ptr); 299 return 1; 300 } 301 302 size_t dashes = 0; 303 size_t errors = 0; 304 for (int i = 1; i < argc && !feof(w); i++) { 305 if (strcmp(argv[i], "-") == 0) { 306 dashes++; 307 if (dashes > 1) { 308 continue; 309 } 310 311 if (!handle_reader(w, stdin, &line, &seen)) { 312 errors++; 313 } 314 continue; 315 } 316 317 if (!handle_file(w, argv[i], &line, &seen)) { 318 errors++; 319 } 320 } 321 322 // use stdin when not given any filepaths 323 if (argc <= 1) { 324 handle_reader(w, stdin, &line, &seen); 325 } 326 327 deallocate(&seen); 328 free(line.ptr); 329 return errors; 330 } 331 332 int main(int argc, char** argv) { 333 #ifdef _WIN32 334 setmode(fileno(stdin), O_BINARY); 335 // ensure output lines end in LF instead of CRLF on windows 336 setmode(fileno(stdout), O_BINARY); 337 setmode(fileno(stderr), O_BINARY); 338 #endif 339 340 if (argc > 1) { 341 if ( 342 strcmp(argv[1], "-h") == 0 || 343 strcmp(argv[1], "-help") == 0 || 344 strcmp(argv[1], "--h") == 0 || 345 strcmp(argv[1], "--help") == 0 346 ) { 347 fprintf(stdout, "%s", info); 348 return 0; 349 } 350 } 351 352 return run(argc, argv, stdout) == 0 ? 0 : 1; 353 }