// File: dedup.cpp
/*
The MIT License (MIT)

Copyright © 2025 pacman64

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
You can build this command-line app by running

c++ -Wall -s -O2 -march=native -mtune=native -flto -o ./dedup ./dedup.cpp
*/

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <new>
#include <string>
#include <string_view>
#include <unordered_set>
#include <utility>

// ERROR_STYLE/RESET_STYLE wrap error messages in ANSI color codes when the
// app is built with RED_ERRORS defined; otherwise both are empty strings.
// Apple builds use the basic 8-color red instead of the 24-bit one. The two
// alternatives are mutually exclusive so the macro is never defined twice.
#ifdef RED_ERRORS
#ifdef __APPLE__
#define ERROR_STYLE "\x1b[31m"
#else
#define ERROR_STYLE "\x1b[38;2;204;0;0m"
#endif
#define RESET_STYLE "\x1b[0m"
#else
#define ERROR_STYLE ""
#define RESET_STYLE ""
#endif

// ERROR_LINE builds a (possibly styled) printf format string ending in "\n"
#define ERROR_LINE(MSG) (ERROR_STYLE MSG RESET_STYLE "\n")

// BAD_ALLOC is the exit code used when the app runs out of memory
#define BAD_ALLOC 2

using namespace std;

// info is the help message shown by the help options
const char* info = ""
    "dedup [files...]\n"
    "\n"
    "DEDUPlicate lines, emitting each distinct line once, when it first appears.\n"
    "Unlike `uniq`, this app doesn't require pre-sorted lines to work correctly,\n"
    "and keeps lines in their original order.\n"
    "";

typedef unordered_set<string> string_set;
// typedef unordered_set<char*> string_set;

// slice is a reusable buffer for getline: `ptr` must be NULL or come from
// malloc/realloc, and `cap` is the number of bytes it can currently hold.
typedef struct slice {
    void* ptr;
    size_t cap;
} slice;

// report_out_of_memory shows the out-of-memory error and quits the app
static void report_out_of_memory(void) {
    fprintf(stderr, "\n");
    fprintf(stderr, ERROR_LINE("out of memory"));
    exit(BAD_ALLOC);
}

// dedup reads all lines from `r`, writing to stdout each line not already in
// `seen`, and remembering it there. Trailing "\r\n" and "\n" line-endings are
// ignored when comparing lines, and each emitted line always ends with "\n".
// The buffer in `line` is reused for all lines, and grown by getline when
// needed. Reading stops at the end of input, on a read error, or as soon as
// stdout is in an error state. Running out of memory quits the app with exit
// code BAD_ALLOC.
void dedup(FILE* r, slice* line, string_set& seen) {
    // the end-of-file indicator is never set for output streams, so the
    // error indicator is what tells when it's pointless to keep going
    while (!ferror(stdout)) {
        // getline wants a char**, so go through a correctly-typed local
        // instead of reinterpreting the void* field in place
        char* buf = static_cast<char*>(line->ptr);
        errno = 0;
        const ssize_t len = ::getline(&buf, &line->cap, r);
        line->ptr = buf;

        // a failed (re)allocation leaves the old buffer untouched, so it
        // can only be told apart from the end of input via errno
        if (buf == NULL || (len < 0 && errno == ENOMEM)) {
            report_out_of_memory();
        }

        if (len < 0) {
            break;
        }

        // ignore the line-ending, if present
        size_t n = static_cast<size_t>(len);
        if (n >= 2 && buf[n - 2] == '\r' && buf[n - 1] == '\n') {
            n -= 2;
        } else if (n >= 1 && buf[n - 1] == '\n') {
            n -= 1;
        }

        // the key keeps the line's full length, so lines with null bytes in
        // them are remembered whole, instead of cut at their first null byte
        string key(string_view(buf, n));
        if (seen.insert(std::move(key)).second) {
            fwrite(buf, 1, n, stdout);
            fputc('\n', stdout);
        }
    }
}

// handleFile dedups all lines from the file at `path`, using the line-buffer
// and the set of lines seen so far. It returns false, after showing an error
// message, when the file can't be opened, and true otherwise.
bool handleFile(const char* path, slice* line, string_set& seen) {
    FILE* f = fopen(path, "rb");
    if (f == NULL) {
        fprintf(stderr, ERROR_LINE("can't open file named '%s'"), path);
        return false;
    }

    dedup(f, line, seen);
    fclose(f);
    return true;
}

// is_help_option checks if the argument given is one of the help options
static bool is_help_option(const char* arg) {
    static const char* const options[] = {"-h", "-help", "--h", "--help"};
    for (const char* opt : options) {
        if (strcmp(arg, opt) == 0) {
            return true;
        }
    }
    return false;
}

// run handles the command-line arguments given: a leading help option shows
// the help message, and a leading "--" is skipped. All other arguments are
// paths of files to dedup, in order, where "-" stands for standard input;
// paths given more than once are only handled the first time. Standard input
// is used when no paths are given. It returns true on success, which includes
// showing the help message, and false if any file couldn't be opened.
bool run(size_t argc, char** argv) {
    size_t start_args = 1;

    if (argc > start_args) {
        if (is_help_option(argv[start_args])) {
            fprintf(stdout, "%s", info);
            // showing the help asked for isn't a failure
            return true;
        }

        if (strcmp(argv[start_args], "--") == 0) {
            start_args++;
        }
    }

    size_t errors = 0;

    // pipes and terminals aren't seekable: emit lines as soon as they're
    // ready in that case, and use full buffering for regular files, no
    // matter what their current offset is
    const bool live_lines = lseek(STDOUT_FILENO, 0, SEEK_CUR) == (off_t)-1;
    setvbuf(stdout, NULL, live_lines ? _IOLBF : _IOFBF, 0);

    slice line;
    line.cap = 32 * 1024;
    line.ptr = malloc(line.cap);
    if (line.ptr == NULL) {
        fprintf(stderr, ERROR_LINE("out of memory"));
        exit(BAD_ALLOC);
    }

    string_set seen;
    unordered_set<string> files;

    for (size_t i = start_args; i < argc && !ferror(stdout); i++) {
        // avoid handling any file more than once
        if (!files.insert(string(argv[i])).second) {
            continue;
        }

        if (strcmp(argv[i], "-") == 0) {
            dedup(stdin, &line, seen);
            continue;
        }

        if (!handleFile(argv[i], &line, seen)) {
            errors++;
        }
    }

    // use standard input when no paths are given; comparing, instead of
    // subtracting, can't wrap around even if argc is 0
    if (argc <= start_args) {
        dedup(stdin, &line, seen);
    }

    if (!live_lines) {
        fflush(stdout);
    }

    free(line.ptr);
    return errors == 0;
}

// main exits with code 0 on success, 1 if any input file couldn't be opened,
// and BAD_ALLOC when the app runs out of memory
int main(int argc, char** argv) {
    try {
        return run(static_cast<size_t>(argc), argv) ? 0 : 1;
    } catch (const bad_alloc& e) {
        fprintf(stderr, ERROR_LINE("out of memory"));
        return BAD_ALLOC;
    }
}