File: dedup.c 1 /* 2 The MIT License (MIT) 3 4 Copyright © 2020-2025 pacman64 5 6 Permission is hereby granted, free of charge, to any person obtaining a copy of 7 this software and associated documentation files (the “Software”), to deal 8 in the Software without restriction, including without limitation the rights to 9 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 10 of the Software, and to permit persons to whom the Software is furnished to do 11 so, subject to the following conditions: 12 13 The above copyright notice and this permission notice shall be included in all 14 copies or substantial portions of the Software. 15 16 THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 SOFTWARE. 23 */ 24 25 /* 26 You can build this command-line app by running 27 28 cc -Wall -s -O2 -o ./dedup ./dedup.c 29 */ 30 31 #include <stdbool.h> 32 #include <stdint.h> 33 #include <stdio.h> 34 #include <stdlib.h> 35 #include <string.h> 36 37 #ifdef _WIN32 38 #include <fcntl.h> 39 #include <windows.h> 40 #endif 41 42 #define SIMPLE_HASHING 43 44 const char* info = "" 45 "dedup [filepaths...]\n" 46 "\n" 47 "Deduplicate lines, ensuring each unique input line appears only once on the\n" 48 "output. Unlike the standard cmd-line app `uniq`, which only works correctly\n" 49 "when given sorted lines, this tool works correctly without any scrambling.\n" 50 ""; 51 52 const char* no_line_memory_msg = "can't get enough memory to read lines"; 53 54 const char* no_set_memory_msg = "can't get enough memory to tell lines apart"; 55 56 const uint64_t line_starting_capacity = 32 * 1024; 57 58 // prime numbers right under various binary-kb values: 59 // 16381, 32749, 65521, 131071 60 61 // bucket_count is how many buckets the set/hash-table allocates top-level: 62 // it's a prime number to minimize hashing collisions 63 const uint64_t bucket_count = 32749; 64 65 // slice is a growable region of bytes in memory 66 typedef struct slice { 67 // ptr is the starting place of the region 68 unsigned char* ptr; 69 70 // len is how many bytes are currently being used 71 size_t len; 72 73 // cap is how many bytes the memory region has available 74 size_t cap; 75 } slice; 76 77 size_t capacity_for(size_t len) { 78 size_t cap = 16; 79 while (cap < len) { 80 cap *= 2; 81 } 82 return cap; 83 } 84 85 // string_bucket is any of the buckets used in struct string_set 86 typedef struct string_bucket { 87 // ptr is where the string starts 88 unsigned char* ptr; 89 90 // len is how many bytes are in the string 91 size_t len; 92 93 // next is the next in the bucket-chain 94 struct string_bucket* next; 95 } string_bucket; 96 97 // string_set is a hash-table with no values, thus acting as a set 98 typedef struct string_set { 99 string_bucket** chains; 100 101 // cap is how many bucket-chains there are 102 size_t cap; 103 } string_set; 104 105 void init_string_set(string_set* set, size_t cap) { 106 set->chains = malloc(sizeof(string_bucket*) * cap); 107 if (set->chains == NULL) { 108 set->cap = 0; 109 return; 110 } 111 set->cap = bucket_count; 112 113 for (size_t i = 0; i < cap; i++) { 114 set->chains[i] = NULL; 115 } 116 } 117 118 void deallocate(string_set* set) { 119 for (size_t i = 0; i < set->cap; i++) { 120 string_bucket* next = NULL; 121 for (string_bucket* b = set->chains[i]; b != NULL; b = next) { 122 next = b->next; 123 free(b->ptr); 124 free(b); 125 } 126 } 127 128 free(set->chains); 129 } 130 131 #ifdef SIMPLE_HASHING 132 size_t index_chain(const string_set* set, const slice* s) { 133 size_t hash = 0; 134 const unsigned char* p = s->ptr; 135 for (size_t i = 0; i < s->len; i++) { 136 hash *= 31; 137 hash += p[i]; 138 } 139 return hash % set->cap; 140 } 141 #else 142 size_t index_chain(const string_set* set, const slice* s) { 143 uint64_t k = 1; 144 size_t hash = 0; 145 const unsigned char* p = s->ptr; 146 // const uint64_t r = 15485863; 147 const uint64_t r = 86028121; 148 for (size_t i = 0; i < s->len; i++) { 149 hash += k * p[i]; 150 hash %= r; 151 k *= 31; 152 k %= r; 153 } 154 return hash % set->cap; 155 } 156 #endif 157 158 bool check(const string_bucket* b, const slice* s) { 159 if (s->len != b->len) { 160 return false; 161 } 162 163 size_t len = s->len; 164 for (size_t i = 0; i < len; i++) { 165 if (s->ptr[i] != b->ptr[i]) { 166 return false; 167 } 168 } 169 return true; 170 } 171 172 // add returns 0 only when an allocation failed 173 string_bucket* add(string_set* set, const slice* s) { 174 size_t i = index_chain(set, s); 175 string_bucket* last = NULL; 176 177 for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) { 178 if (check(b, s)) { 179 return b; 180 } 181 last = b; 182 } 183 184 unsigned char* copy = malloc(s->len); 185 if (copy == NULL) { 186 return NULL; 187 } 188 memcpy(copy, s->ptr, s->len); 189 190 string_bucket* next = malloc(sizeof(string_bucket)); 191 if (next == NULL) { 192 free(copy); 193 return NULL; 194 } 195 next->ptr = copy; 196 next->len = s->len; 197 next->next = NULL; 198 199 if (last == NULL) { 200 set->chains[i] = next; 201 } else { 202 last->next = next; 203 } 204 return next; 205 } 206 207 bool has(const string_set* set, const slice* s) { 208 size_t i = index_chain(set, s); 209 210 for (string_bucket* b = set->chains[i]; b != NULL; b = b->next) { 211 if (check(b, s)) { 212 return b; 213 } 214 } 215 return false; 216 } 217 218 bool handle_reader(FILE* w, FILE* r, slice* line, string_set* seen) { 219 if (line->ptr == NULL || seen->chains == NULL) { 220 return false; 221 } 222 223 for (size_t i = 0; !feof(w); i++) { 224 ssize_t len = getline((char**)&line->ptr, &line->cap, r); 225 if (len < 0) { 226 break; 227 } 228 229 if (line->ptr == NULL) { 230 putc('\n', w); 231 fprintf(stderr, "\x1b[31m%s\x1b[0m\n", no_line_memory_msg); 232 return false; 233 } 234 235 // ignore trailing line-feeds: all output lines end with one anyway 236 const unsigned char* p = line->ptr; 237 if (len > 0 && p[len - 1] == '\n') { 238 len--; 239 } 240 line->len = len; 241 242 // ignore already-seen lines 243 if (has(seen, line)) { 244 continue; 245 } 246 if (add(seen, line) == 0) { 247 fprintf(stderr, "\x1b[31m%s\x1b[0m\n", no_set_memory_msg); 248 exit(1); 249 return false; 250 } 251 252 // emit each distinct line only once, the first time it's seen 253 fwrite(line->ptr, line->len, 1, w); 254 putc('\n', w); 255 fflush(w); 256 } 257 258 return true; 259 } 260 261 // handle_file handles data from the filename given; returns false only when 262 // the file can't be opened 263 bool handle_file(FILE* w, const char* fname, slice* line, string_set* seen) { 264 FILE* f = fopen(fname, "rb"); 265 if (f == NULL) { 266 fprintf(stderr, "\x1b[31mcan't open file named %s\x1b[0m\n", fname); 267 return false; 268 } 269 270 const bool ok = handle_reader(w, f, line, seen); 271 fclose(f); 272 return ok; 273 } 274 275 // run returns the number of errors 276 int run(int argc, char** argv, FILE* w) { 277 slice line; 278 line.len = 0; 279 line.cap = line_starting_capacity; 280 line.ptr = malloc(line.cap); 281 if (line.ptr == NULL) { 282 fprintf(stderr, "\x1b[31m%s\x1b[0m\n", no_line_memory_msg); 283 return 1; 284 } 285 286 string_set seen; 287 init_string_set(&seen, bucket_count); 288 if (seen.chains == NULL) { 289 fprintf(stderr, "\x1b[31m%s\x1b[0m\n", no_set_memory_msg); 290 free(line.ptr); 291 return 1; 292 } 293 294 size_t errors = 0; 295 for (int i = 1; i < argc && !feof(stdout); i++) { 296 if (argv[i][0] == '-' && argv[i][1] == 0) { 297 if (!handle_reader(w, stdin, &line, &seen)) { 298 errors++; 299 } 300 continue; 301 } 302 303 if (!handle_file(w, argv[i], &line, &seen)) { 304 errors++; 305 } 306 } 307 308 // use stdin when not given any filepaths 309 if (argc <= 1) { 310 handle_reader(w, stdin, &line, &seen); 311 } 312 313 deallocate(&seen); 314 free(line.ptr); 315 return errors; 316 } 317 318 int main(int argc, char** argv) { 319 #ifdef _WIN32 320 setmode(fileno(stdin), O_BINARY); 321 // ensure output lines end in LF instead of CRLF on windows 322 setmode(fileno(stdout), O_BINARY); 323 setmode(fileno(stderr), O_BINARY); 324 #endif 325 326 if (argc > 1) { 327 if ( 328 strcmp(argv[1], "-h") == 0 || 329 strcmp(argv[1], "-help") == 0 || 330 strcmp(argv[1], "--h") == 0 || 331 strcmp(argv[1], "--help") == 0 332 ) { 333 fprintf(stdout, "%s", info); 334 return 0; 335 } 336 } 337 338 size_t dashes = 0; 339 for (int i = 1; i < argc; i++) { 340 if (argv[i][0] == '-' && argv[i][1] == 0) { 341 dashes++; 342 } 343 } 344 345 if (dashes > 1) { 346 const char* msg = "can't use the standard input (dash) more than once"; 347 fprintf(stderr, "\x1b[31m%s\x1b[0m\n", msg); 348 return 1; 349 } 350 351 return run(argc, argv, stdout) == 0 ? 0 : 1; 352 }