File: gsawk.sh
   1 #!/bin/sh
   2 
   3 # The MIT License (MIT)
   4 #
   5 # Copyright (c) 2026 pacman64
   6 #
   7 # Permission is hereby granted, free of charge, to any person obtaining a copy
   8 # of this software and associated documentation files (the "Software"), to deal
   9 # in the Software without restriction, including without limitation the rights
  10 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 # copies of the Software, and to permit persons to whom the Software is
  12 # furnished to do so, subject to the following conditions:
  13 #
  14 # The above copyright notice and this permission notice shall be included in
  15 # all copies or substantial portions of the Software.
  16 #
  17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23 # SOFTWARE.
  24 
  25 
  26 # gsawk [options...] [awk expression...] [files...]
  27 #
  28 #
  29 # Grouped Summaries via AWK expression calculates/aggregates some numeric
  30 # statistics for each group determined by the AWK expression given. The
  31 # output is a JSON object whose top-level keys are the expression results,
  32 # whose values are objects of objects, numerically summarizing all columns
  33 # of the rows in its top-level-key group.
  34 #
  35 # The handy case-insensitive shortcut options may cause this tool to fail,
  36 # if the main AWK tool installed doesn't support the special IGNORECASE
  37 # variable.
  38 #
  39 # The AWK options available only in single-dash versions are
  40 #
  41 #   -f fs, -F fs, -Ffs, -F=fs    make `fs` the field separator
  42 #
   43 # The other options, available both in single and double-dash versions, are
  44 #
  45 #   -h, -help    show this help message
   46 #   -i, -ins     match regexes case-insensitively; may fail with the default `awk`
  47 #   -sort        sort calculated top-level keys
  48 #   -tsv         split fields using tabs, same as using -F "\t"
  49 
  50 
  51 case "$1" in
  52     -h|--h|-help|--help)
  53         awk '/^# +gsawk /, /^$/ { gsub(/^# ?/, ""); print }' "$0"
  54         exit 0
  55     ;;
  56 esac
  57 
  58 tsv=0
  59 sort_keys=''
  60 case_insensitive=0
  61 command='awk'
  62 
  63 while [ $# -gt 0 ]; do
  64     if [ "$1" = "--" ]; then
  65         shift
  66         break
  67     fi
  68 
  69     case "$1" in
  70         -f|-F)
  71             shift
  72             if [ $# -eq 0 ]; then
  73                 printf "expected value after -F option\n" >&2
  74                 exit 1
  75             fi
  76             command="${command} -F $1"
  77             shift
  78             continue
  79         ;;
  80 
  81         -F*)
  82             command="${command} $1"
  83             shift
  84             continue
  85         ;;
  86 
  87         -i|--i|-ins|--ins|-insensitive|--insensitive)
  88             case_insensitive=1
  89             shift
  90             continue
  91         ;;
  92 
  93         -sort|--sort|-sorted|--sorted)
  94             sort_keys='asort(ordkeys)'
  95             shift
  96             continue
  97         ;;
  98 
  99         -tsv|--tsv)
 100             tsv=1
 101             shift
 102             continue
 103         ;;
 104     esac
 105 
 106     break
 107 done
 108 
 109 code="${1:-\$0}"
 110 [ $# -gt 0 ] && shift
 111 
 112 # show all non-existing files given
 113 failed=0
 114 for arg in "$@"; do
 115     if [ "${arg}" = "-" ]; then
 116         continue
 117     fi
 118     if [ ! -e "${arg}" ]; then
 119         printf "no file named \"%s\"\n" "${arg}" >&2
 120         failed=1
 121     fi
 122 done
 123 
 124 if [ "${failed}" -gt 0 ]; then
 125     exit 2
 126 fi
 127 
 128 ci='
 129     BEGIN {
 130         if (IGNORECASE == "") {
 131             m = "your `awk` command lacks case-insensitive regex-matching"
 132             print(m) > "/dev/stderr"
 133             exit 125
 134         }
 135         IGNORECASE = 1
 136     }
 137 '
 138 if [ "${case_insensitive}" -eq 0 ]; then
 139     ci=''
 140 fi
 141 
 142 src="${ci}"'
 143     BEGIN {
 144         if (SUBSEP == "") SUBSEP = "\034"
 145         inf = "+inf" + 0
 146     }
 147 
 148     function stringify(s) {
 149         gsub(/\\/, "\\\\", s)
 150         gsub(/"/, "\\\"", s)
 151         return sprintf("\"%s\"", s)
 152     }
 153 
 154     function init_group(key) {
 155         summaries[key SUBSEP "numeric"] = 0
 156         summaries[key SUBSEP "integer"] = 0
 157         summaries[key SUBSEP "positive"] = 0
 158         summaries[key SUBSEP "zero"] = 0
 159         summaries[key SUBSEP "negative"] = 0
 160         summaries[key SUBSEP "min"] = inf
 161         summaries[key SUBSEP "max"] = -inf
 162         summaries[key SUBSEP "sum"] = 0
 163         summaries[key SUBSEP "mean"] = 0
 164         summaries[key SUBSEP "product"] = 1
 165 
 166         summaries[key SUBSEP "_ln_sum"] = 0
 167         summaries[key SUBSEP "_d1"] = 0
 168         summaries[key SUBSEP "_d2"] = 0
 169         summaries[key SUBSEP "_mean_square"] = 0
 170     }
 171 
 172     function update_group(key, v, n) {
 173         summaries[key SUBSEP "numeric"]++
 174         summaries[key SUBSEP "integer"] += v % 1 == 0
 175         if (v > 0) summaries[key SUBSEP "positive"]++
 176         else if (v < 0) summaries[key SUBSEP "negative"]++
 177         else if (v == 0) summaries[key SUBSEP "zero"]++
 178 
 179         n = summaries[key SUBSEP "min"]
 180         summaries[key SUBSEP "min"] = n < v ? n : v
 181         n = summaries[key SUBSEP "max"]
 182         summaries[key SUBSEP "max"] = n > v ? n : v
 183         summaries[key SUBSEP "sum"] += v
 184         summaries[key SUBSEP "product"] *= v
 185         summaries[key SUBSEP "_ln_sum"] += v <= 0 ? -inf : log(v)
 186 
 187         # advance welford`s algorithm
 188         n = summaries[key SUBSEP "numeric"]
 189         summaries[key SUBSEP "_d1"] = v - summaries[key SUBSEP "mean"]
 190         summaries[key SUBSEP "mean"] += summaries[key SUBSEP "_d1"] / n
 191         summaries[key SUBSEP "_d2"] = v - summaries[key SUBSEP "mean"]
 192         n = summaries[key SUBSEP "_mean_square"]
 193         n += summaries[key SUBSEP "_d1"] * summaries[key SUBSEP "_d2"]
 194         summaries[key SUBSEP "_mean_square"] = n
 195     }
 196 
 197     # ignore leading UTF-8 BOMs on the first line of each input, when present
 198     FNR == 1 { gsub(/^\xef\xbb\xbf/, "") }
 199 
 200     # ignore trailing carriage-returns
 201     { gsub(/\r$/, "") }
 202 
 203     FNR == 1 {
 204         for (i = 1; i <= NF; i++) props[++numprops] = $i
 205         next
 206     }
 207 
 208     {
 209         k = ('"${code}"')
 210 
 211         if (tally[k]++ == 0) {
 212             for (i = 1; i <= NF; i++) init_group(k SUBSEP props[i])
 213             ordkeys[++numkeys] = k
 214         }
 215 
 216         for (i = 1; i <= NF; i++) {
 217             if ($i !~ /^ *(0|[0-9]+|[0-9]*\.[0-9]+) *$/) continue
 218             update_group(k SUBSEP props[i], $i + 0)
 219         }
 220     }
 221 
 222     function emit(key) {
 223         nums = summaries[key SUBSEP "numeric"]
 224         ints = summaries[key SUBSEP "integer"]
 225         pos = summaries[key SUBSEP "positive"]
 226         zero = summaries[key SUBSEP "zero"]
 227         neg = summaries[key SUBSEP "negative"]
 228         min = summaries[key SUBSEP "min"]
 229         max = summaries[key SUBSEP "max"]
 230         sum = summaries[key SUBSEP "sum"]
 231         mean = summaries[key SUBSEP "mean"]
 232         product = summaries[key SUBSEP "product"]
 233         lsum = summaries[key SUBSEP "_ln_sum"]
 234         d1 = summaries[key SUBSEP "_d1"]
 235         d2 = summaries[key SUBSEP "_d2"]
 236         msq = summaries[key SUBSEP "_mean_square"]
 237         sum = mean * nums
 238         if (nums == 0) lsum = -inf
 239 
 240         printf "{\"numeric\": %d, ", nums
 241         if (nums > 0) {
 242             emit_pair("min", min)
 243             emit_pair("max", max)
 244             emit_pair("sum", sum)
 245             emit_pair("mean", mean)
 246             gm = "null"
 247             if (zero == 0 && neg == 0) gm = sprintf("%f", exp(lsum / nums))
 248             printf "\"geomean\": %s, ", gm
 249             emit_pair("sd", sqrt(msq / nums))
 250             # emit_pair("product", product)
 251         } else {
 252             printf "\"min\": null, "
 253             printf "\"max\": null, "
 254             printf "\"sum\": null, "
 255             printf "\"mean\": null, "
 256             printf "\"geomean\": null, "
 257             printf "\"sd\": null, "
 258             # printf "\"product\": null, "
 259         }
 260         printf "\"integer\": %d, ", ints
 261         printf "\"positive\": %d, ", pos
 262         printf "\"zero\": %d, ", zero
 263         printf "\"negative\": %d}", neg
 264     }
 265 
 266     function emit_pair(key, num) {
 267         printf ((num % 1 == 0) ? "\"%s\": %d, " : "\"%s\": %f, "), key, num
 268     }
 269 
 270     END {
 271         '"${sort_keys}"'
 272         printf "{\n"
 273 
 274         for (i = 1; i <= numkeys; i++) {
 275             k1 = ordkeys[i]
 276             if (i > 1) printf ",\n"
 277             printf "  %s: {\n", stringify(k1)
 278 
 279             for (j = 1; j <= numprops; j++) {
 280                 k2 = props[j]
 281                 if (j > 1) printf ",\n"
 282                 printf "    %s: ", stringify(k2)
 283                 emit(k1 SUBSEP k2)
 284             }
 285 
 286             if (j > 1) print ""
 287             printf "  }"
 288         }
 289 
 290         if (i > 1) print ""
 291         printf "}\n"
 292     }
 293 '
 294 
 295 if [ "${tsv}" -eq 1 ]; then
 296     ${command} -F "\t" "${src}" "$@"
 297 else
 298     ${command} "${src}" "$@"
 299 fi