File: dedup.c 1 /* 2 The MIT License (MIT) 3 4 Copyright © 2020-2025 pacman64 5 6 Permission is hereby granted, free of charge, to any person obtaining a copy of 7 this software and associated documentation files (the “Software”), to deal 8 in the Software without restriction, including without limitation the rights to 9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 of the Software, and to permit persons to whom the Software is furnished to do 11 so, subject to the following conditions: 12 13 The above copyright notice and this permission notice shall be included in all 14 copies or substantial portions of the Software. 15 16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 SOFTWARE. 23 */ 24 25 /* 26 You can build this command-line app by running 27 28 cc -Wall -s -O3 -flto -o ./dedup ./dedup.c 29 */ 30 31 #include <stdbool.h> 32 #include <stdint.h> 33 #include <stdio.h> 34 #include <stdlib.h> 35 #include <string.h> 36 37 #ifdef _WIN32 38 #include <fcntl.h> 39 #include <windows.h> 40 #endif 41 42 #ifdef RED_ERRORS 43 #define ERROR_STYLE "\x1b[38;2;204;0;0m" 44 #ifdef __APPLE__ 45 #define ERROR_STYLE "\x1b[31m" 46 #endif 47 #define RESET_STYLE "\x1b[0m" 48 #else 49 #define ERROR_STYLE 50 #define RESET_STYLE 51 #endif 52 53 #define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n") 54 55 #define SIMPLE_HASHING 56 57 const char* info = "" 58 "dedup [filepaths...]\n" 59 "\n" 60 "Deduplicate lines, ensuring each unique input line appears only once on the\n" 61 "output. Unlike the standard cmd-line app `uniq`, which only works correctly\n" 62 "when given sorted lines, this tool works correctly without any scrambling.\n" 63 ""; 64 65 const uint64_t line_starting_capacity = 32 * 1024; 66 67 // prime numbers right under various binary-kb values: 68 // 16381, 32749, 65521, 131071 69 70 // bucket_count is how many buckets the set/hash-table allocates top-level: 71 // it's a prime number to minimize hashing collisions 72 const uint64_t bucket_count = 32749; 73 74 // slice is a growable region of bytes in memory 75 typedef struct slice { 76 // ptr is the starting place of the region 77 unsigned char* ptr; 78 79 // len is how many bytes are currently being used 80 size_t len; 81 82 // cap is how many bytes the memory region has available 83 size_t cap; 84 } slice; 85 86 size_t capacity_for(size_t len) { 87 size_t cap = 16; 88 while (cap < len) { 89 cap *= 2; 90 } 91 return cap; 92 } 93 94 // string_bucket is any of the buckets used in struct string_set 95 typedef struct string_bucket { 96 // ptr is where the string starts 97 unsigned char* ptr; 98 99 // len is how many bytes are in the string 100 size_t len; 101 102 // next is the next in the bucket-chain 103 struct string_bucket* next; 104 } string_bucket; 105 106 // string_set is a hash-table with no values, thus acting as a set 107 typedef struct string_set { 108 string_bucket** chains; 109 110 // cap is how many bucket-chains there are 111 size_t cap; 112 } string_set; 113 114 void init_string_set(string_set* set, size_t cap) { 115 set->chains = malloc(sizeof(string_bucket*) * cap); 116 if (set->chains == NULL) { 117 set->cap = 0; 118 return; 119 } 120 set->cap = bucket_count; 121 122 for (size_t i = 0; i < cap; i++) { 123 set->chains[i] = NULL; 124 } 125 } 126 127 void deallocate(string_set* set) { 128 for (size_t i = 0; i < set->cap; i++) { 129 string_bucket* next = NULL; 130 for (string_bucket* b = set->chains[i]; b != NULL; b = next) { 131 next = b->next; 132 free(b->ptr); 133 free(b); 134 } 135 } 136 137 free(set->chains); 138 } 139 140 #ifdef SIMPLE_HASHING 141 size_t index_chain(const string_set* set, const slice* s) { 142 size_t hash = 0; 143 const unsigned char* p = s->ptr; 144 for (size_t i = 0; i < s->len; i++) { 145 hash *= 31; 146 hash += p[i]; 147 } 148 return hash % set->cap; 149 } 150 #else 151 size_t index_chain(const string_set* set, const slice* s) { 152 uint64_t k = 1; 153 size_t hash = 0; 154 const unsigned char* p = s->ptr; 155 // const uint64_t r = 15485863; 156 const uint64_t r = 86028121; 157 for (size_t i = 0; i < s->len; i++) { 158 hash += k * p[i]; 159 hash %= r; 160 k *= 31; 161 k %= r; 162 } 163 return hash % set->cap; 164 } 165 #endif 166 167 bool check(const string_bucket* b, const slice* s) { 168 if (s->len != b->len) { 169 return false; 170 } 171 172 size_t len = s->len; 173 for (size_t i = 0; i < len; i++) { 174 if (s->ptr[i] != b->ptr[i]) { 175 return false; 176 } 177 } 178 return true; 179 } 180 181 // add returns 0 only when an allocation failed 182 string_bucket* add(string_set* set, const slice* s) { 183 size_t i = index_chain(set, s); 184 string_bucket* last = NULL; 185 186 for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) { 187 if (check(b, s)) { 188 return b; 189 } 190 last = b; 191 } 192 193 unsigned char* copy = malloc(s->len); 194 if (copy == NULL) { 195 return NULL; 196 } 197 memcpy(copy, s->ptr, s->len); 198 199 string_bucket* next = malloc(sizeof(string_bucket)); 200 if (next == NULL) { 201 free(copy); 202 return NULL; 203 } 204 next->ptr = copy; 205 next->len = s->len; 206 next->next = NULL; 207 208 if (last == NULL) { 209 set->chains[i] = next; 210 } else { 211 last->next = next; 212 } 213 return next; 214 } 215 216 bool has(const string_set* set, const slice* s) { 217 size_t i = index_chain(set, s); 218 219 for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) { 220 if (check(b, s)) { 221 return b; 222 } 223 } 224 return false; 225 } 226 227 bool handle_reader(FILE* w, FILE* r, slice* line, string_set* seen) { 228 if (line->ptr == NULL || seen->chains == NULL) { 229 return false; 230 } 231 232 for (size_t i = 0; !feof(w); i++) { 233 ssize_t len = getline((char**)&line->ptr, &line->cap, r); 234 if (line->ptr == NULL) { 235 fprintf(stderr, ERROR_LINE("out of memory")); 236 return false; 237 } 238 239 if (len < 0) { 240 break; 241 } 242 243 // ignore trailing line-feeds: all output lines end with one anyway 244 const unsigned char* p = line->ptr; 245 if (len > 0 && p[len - 1] == '\n') { 246 len--; 247 } 248 line->len = len; 249 250 // ignore already-seen lines 251 if (has(seen, line)) { 252 continue; 253 } 254 if (add(seen, line) == 0) { 255 fprintf(stderr, ERROR_LINE("out of memory")); 256 exit(1); 257 return false; 258 } 259 260 // emit each distinct line only once, the first time it's seen 261 fwrite(line->ptr, line->len, 1, w); 262 fputc('\n', w); 263 } 264 265 return true; 266 } 267 268 // handle_file handles data from the filename given; returns false only when 269 // the file can't be opened 270 bool handle_file(FILE* w, const char* path, slice* line, string_set* seen) { 271 FILE* f = fopen(path, "rb"); 272 if (f == NULL) { 273 fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path); 274 return false; 275 } 276 277 const bool ok = handle_reader(w, f, line, seen); 278 fclose(f); 279 return ok; 280 } 281 282 // run returns the number of errors 283 int run(int argc, char** argv, FILE* w) { 284 slice line; 285 line.len = 0; 286 line.cap = line_starting_capacity; 287 line.ptr = malloc(line.cap); 288 if (line.ptr == NULL) { 289 fprintf(stderr, ERROR_LINE("out of memory")); 290 return 1; 291 } 292 293 string_set seen; 294 init_string_set(&seen, bucket_count); 295 if (seen.chains == NULL) { 296 fprintf(stderr, ERROR_LINE("out of memory")); 297 free(line.ptr); 298 return 1; 299 } 300 301 size_t dashes = 0; 302 size_t errors = 0; 303 for (int i = 1; i < argc && !feof(w); i++) { 304 if (argv[i][0] == '-' && argv[i][1] == 0) { 305 dashes++; 306 if (dashes > 1) { 307 continue; 308 } 309 310 if (!handle_reader(w, stdin, &line, &seen)) { 311 errors++; 312 } 313 continue; 314 } 315 316 if (!handle_file(w, argv[i], &line, &seen)) { 317 errors++; 318 } 319 } 320 321 // use stdin when not given any filepaths 322 if (argc <= 1) { 323 handle_reader(w, stdin, &line, &seen); 324 } 325 326 deallocate(&seen); 327 free(line.ptr); 328 return errors; 329 } 330 331 int main(int argc, char** argv) { 332 #ifdef _WIN32 333 setmode(fileno(stdin), O_BINARY); 334 // ensure output lines end in LF instead of CRLF on windows 335 setmode(fileno(stdout), O_BINARY); 336 setmode(fileno(stderr), O_BINARY); 337 #endif 338 339 if (argc > 1) { 340 if ( 341 strcmp(argv[1], "-h") == 0 || 342 strcmp(argv[1], "-help") == 0 || 343 strcmp(argv[1], "--h") == 0 || 344 strcmp(argv[1], "--help") == 0 345 ) { 346 fprintf(stdout, "%s", info); 347 return 0; 348 } 349 } 350 351 return run(argc, argv, stdout) == 0 ? 0 : 1; 352 }