#!/bin/sh

# The MIT License (MIT)
#
# Copyright (c) 2026 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


# gsawk [options...] [awk expression...] [files...]
#
#
# Grouped Summaries via AWK expression calculates/aggregates some numeric
# statistics for each group determined by the AWK expression given. The
# output is a JSON object whose top-level keys are the expression results,
# whose values are objects of objects, numerically summarizing all columns
# of the rows in its top-level-key group.
#
# The handy case-insensitive shortcut options may cause this tool to fail,
# if the main AWK tool installed doesn't support the special IGNORECASE
# variable.
#
# The AWK options available only in single-dash versions are
#
#   -f fs, -F fs, -Ffs, -F=fs    make `fs` the field separator
#
# The other options are, available both in single and double-dash versions
#
#   -h, -help    show this help message
#   -i, -ins     match regexes case-insensitively; may fail the default `awk`
#   -sort        sort calculated top-level keys
#   -tsv         split fields using tabs, same as using -F "\t"


# the help message is the comment block above: it starts at the usage line
# and ends at the first empty line after it
case "$1" in
    -h|--h|-help|--help)
        awk '/^# +gsawk /, /^$/ { gsub(/^# ?/, ""); print }' "$0"
        exit 0
    ;;
esac

tsv=0
sort_keys=''
case_insensitive=0
# the field separator is kept in its own variable, instead of being glued
# into an unquoted command string, so separators with spaces, glob characters
# or no characters at all reach `awk` as exactly one argument
fs=''
has_fs=0

# handle leading options: the first argument which is not a known option ends
# the loop, and so does an explicit `--`
while [ $# -gt 0 ]; do
    if [ "$1" = "--" ]; then
        shift
        break
    fi

    case "$1" in
        -f|-F)
            shift
            if [ $# -eq 0 ]; then
                printf "expected value after -F option\n" >&2
                exit 1
            fi
            fs="$1"
            has_fs=1
            shift
            continue
        ;;

        -F=?*)
            # documented as `-F=fs`: the equals sign is not part of `fs`
            fs="${1#-F=}"
            has_fs=1
            shift
            continue
        ;;

        -F?*)
            # covers `-Ffs`, and also a lone `-F=`, which means separator `=`
            fs="${1#-F}"
            has_fs=1
            shift
            continue
        ;;

        -i|--i|-ins|--ins|-insensitive|--insensitive)
            case_insensitive=1
            shift
            continue
        ;;

        -sort|--sort|-sorted|--sorted)
            # this text is pasted verbatim at the start of the END rule
            sort_keys='asort(ordkeys)'
            shift
            continue
        ;;

        -tsv|--tsv)
            tsv=1
            shift
            continue
        ;;
    esac

    break
done

# the grouping expression defaults to the whole input line
code="${1:-\$0}"
[ $# -gt 0 ] && shift

# show all non-existing files given
failed=0
for arg in "$@"; do
    if [ "${arg}" = "-" ]; then
        continue
    fi
    if [ ! -e "${arg}" ]; then
        printf "no file named \"%s\"\n" "${arg}" >&2
        failed=1
    fi
done

if [ "${failed}" -gt 0 ]; then
    exit 2
fi

# optional AWK prologue which turns on case-insensitive matching, or fails
# when the IGNORECASE variable compares equal to an empty string; the flag
# it sets lets the END rule skip its output, since `exit` inside BEGIN still
# runs END rules
ci='
BEGIN {
    if (IGNORECASE == "") {
        m = "your `awk` command lacks case-insensitive regex-matching"
        print(m) > "/dev/stderr"
        unsupported = 1
        exit 125
    }
    IGNORECASE = 1
}
'
if [ "${case_insensitive}" -eq 0 ]; then
    ci=''
fi

# the AWK program: the grouping expression and the optional sorting call are
# spliced into the source text; no single quotes may appear inside it
src="${ci}"'
BEGIN {
    if (SUBSEP == "") SUBSEP = "\034"
    inf = "+inf" + 0
}

# stringify turns a string into a double-quoted JSON string
function stringify(s) {
    gsub(/\\/, "\\\\", s)
    gsub(/"/, "\\\"", s)
    # raw tabs, line-feeds and carriage-returns are invalid in JSON strings
    gsub(/\t/, "\\t", s)
    gsub(/\n/, "\\n", s)
    gsub(/\r/, "\\r", s)
    return sprintf("\"%s\"", s)
}

# init_group sets up all the stats for the group/column combo given
function init_group(key) {
    summaries[key SUBSEP "numeric"] = 0
    summaries[key SUBSEP "integer"] = 0
    summaries[key SUBSEP "positive"] = 0
    summaries[key SUBSEP "zero"] = 0
    summaries[key SUBSEP "negative"] = 0
    summaries[key SUBSEP "min"] = inf
    summaries[key SUBSEP "max"] = -inf
    summaries[key SUBSEP "sum"] = 0
    summaries[key SUBSEP "mean"] = 0
    summaries[key SUBSEP "product"] = 1

    summaries[key SUBSEP "_ln_sum"] = 0
    summaries[key SUBSEP "_d1"] = 0
    summaries[key SUBSEP "_d2"] = 0
    summaries[key SUBSEP "_mean_square"] = 0
}

# update_group folds the number v into the stats for the group/column combo
# given; n is a local scratch variable
function update_group(key, v, n) {
    summaries[key SUBSEP "numeric"]++
    summaries[key SUBSEP "integer"] += (v % 1 == 0)
    if (v > 0) summaries[key SUBSEP "positive"]++
    else if (v < 0) summaries[key SUBSEP "negative"]++
    else if (v == 0) summaries[key SUBSEP "zero"]++

    n = summaries[key SUBSEP "min"]
    summaries[key SUBSEP "min"] = n < v ? n : v
    n = summaries[key SUBSEP "max"]
    summaries[key SUBSEP "max"] = n > v ? n : v
    summaries[key SUBSEP "sum"] += v
    summaries[key SUBSEP "product"] *= v
    summaries[key SUBSEP "_ln_sum"] += (v <= 0) ? -inf : log(v)

    # advance welford`s algorithm
    n = summaries[key SUBSEP "numeric"]
    summaries[key SUBSEP "_d1"] = v - summaries[key SUBSEP "mean"]
    summaries[key SUBSEP "mean"] += summaries[key SUBSEP "_d1"] / n
    summaries[key SUBSEP "_d2"] = v - summaries[key SUBSEP "mean"]
    n = summaries[key SUBSEP "_mean_square"]
    n += summaries[key SUBSEP "_d1"] * summaries[key SUBSEP "_d2"]
    summaries[key SUBSEP "_mean_square"] = n
}

# ignore leading UTF-8 BOMs on the first line of each input, when present
FNR == 1 { gsub(/^\xef\xbb\xbf/, "") }

# ignore trailing carriage-returns
{ gsub(/\r$/, "") }

# the first line of each input is a header line: only the first header seen
# names the columns, so extra inputs cannot duplicate the keys in the output
FNR == 1 {
    if (numprops == 0) for (i = 1; i <= NF; i++) props[++numprops] = $i
    next
}

{
    k = ('"${code}"')

    # groups are remembered in order of first appearance; all header columns
    # are set up right away, so later rows with more fields than the first
    # row of their group never update stats which were not initialized
    if (tally[k]++ == 0) {
        for (i = 1; i <= numprops; i++) init_group(k SUBSEP props[i])
        ordkeys[++numkeys] = k
    }

    # fields beyond the header columns have no name to be reported under
    for (i = 1; i <= NF && i <= numprops; i++) {
        # only plain decimal numbers count, with an optional leading sign
        if ($i !~ /^ *[-+]?([0-9]+|[0-9]*\.[0-9]+) *$/) continue
        update_group(k SUBSEP props[i], $i + 0)
    }
}

# emit prints the JSON object with the stats for the group/column combo
# given; all names after the first parameter are local variables
function emit(key,    nums, ints, pos, zero, neg, min, max, sum, mean, lsum, msq, gm) {
    nums = summaries[key SUBSEP "numeric"]
    ints = summaries[key SUBSEP "integer"]
    pos = summaries[key SUBSEP "positive"]
    zero = summaries[key SUBSEP "zero"]
    neg = summaries[key SUBSEP "negative"]
    min = summaries[key SUBSEP "min"]
    max = summaries[key SUBSEP "max"]
    mean = summaries[key SUBSEP "mean"]
    lsum = summaries[key SUBSEP "_ln_sum"]
    msq = summaries[key SUBSEP "_mean_square"]
    # the sum shown is derived from the running mean
    sum = mean * nums
    if (nums == 0) lsum = -inf

    printf "{\"numeric\": %d, ", nums
    if (nums > 0) {
        emit_pair("min", min)
        emit_pair("max", max)
        emit_pair("sum", sum)
        emit_pair("mean", mean)
        # the geometric mean is only given when all numbers are positive
        gm = "null"
        if (zero == 0 && neg == 0) gm = sprintf("%f", exp(lsum / nums))
        printf "\"geomean\": %s, ", gm
        emit_pair("sd", sqrt(msq / nums))
        # emit_pair("product", product)
    } else {
        printf "\"min\": null, "
        printf "\"max\": null, "
        printf "\"sum\": null, "
        printf "\"mean\": null, "
        printf "\"geomean\": null, "
        printf "\"sd\": null, "
        # printf "\"product\": null, "
    }
    printf "\"integer\": %d, ", ints
    printf "\"positive\": %d, ", pos
    printf "\"zero\": %d, ", zero
    printf "\"negative\": %d}", neg
}

# emit_pair prints a JSON key/number pair, followed by a comma: numbers with
# no fractional part are shown as integers
function emit_pair(key, num) {
    printf ((num % 1 == 0) ? "\"%s\": %d, " : "\"%s\": %f, "), key, num
}

END {
    # an `exit` in a BEGIN rule still runs END rules: avoid emitting an
    # empty result after failing to enable case-insensitive matching
    if (unsupported) exit 125

    '"${sort_keys}"'
    printf "{\n"

    for (i = 1; i <= numkeys; i++) {
        k1 = ordkeys[i]
        if (i > 1) printf ",\n"
        printf "  %s: {\n", stringify(k1)

        for (j = 1; j <= numprops; j++) {
            k2 = props[j]
            if (j > 1) printf ",\n"
            printf "    %s: ", stringify(k2)
            emit(k1 SUBSEP k2)
        }

        if (j > 1) print ""
        printf "  }"
    }

    if (i > 1) print ""
    printf "}\n"
}
'

# the tab-separated shortcut overrides any field separator given explicitly
if [ "${tsv}" -eq 1 ]; then
    awk -F "\t" "${src}" "$@"
elif [ "${has_fs}" -eq 1 ]; then
    awk -F "${fs}" "${src}" "$@"
else
    awk "${src}" "$@"
fi