#!/bin/sh

# The MIT License (MIT)
#
# Copyright (c) 2026 pacman64
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


# gsawk [options...] [awk expression...] [files...]
#
#
# Grouped Summaries via AWK expression calculates/aggregates some numeric
# statistics for each group determined by the AWK expression given. The
# output is a JSON object whose top-level keys are the expression results,
# whose values are objects of objects, numerically summarizing all columns
# of the rows in its top-level-key group.
#
# The handy case-insensitive shortcut options may cause this tool to fail,
# if the main AWK tool installed doesn't support the special IGNORECASE
# variable.
#
# The AWK options available only in single-dash versions are
#
#   -f fs, -F fs, -Ffs, -F=fs    make `fs` the field separator
#
# The other options are, available both in single and double-dash versions
#
#   -h, -help    show this help message
#   -i, -ins     match regexes case-insensitively; may fail the default `awk`
#   -sort        sort calculated top-level keys
#   -tsv         split fields using tabs, same as using -F "\t"

# Implementation notes
#
# - the first line of the first input names the columns; the first line of
#   every later input is skipped as a repeated header
# - the grouping expression defaults to $0 and is spliced verbatim into the
#   AWK program, so it is trusted code, never data
# - exit status: 1 for a bad option, 2 for missing files, 125 when -i is
#   asked of an awk lacking IGNORECASE, otherwise whatever awk returns

case "$1" in
    -h|--h|-help|--help)
        # the help message is the usage comment above, minus its `# ` prefix;
        # the range ends at the first empty line after it
        awk '/^# +gsawk /, /^$/ { gsub(/^# ?/, ""); print }' "$0"
        exit 0
    ;;
esac

tsv=0
# AWK statement spliced at the start of the END rule; asort is a gawk
# extension, so -sort only works when `awk` is gawk
sort_keys=''
case_insensitive=0
# the field separator lives in its own variable and reaches awk as a single
# quoted argument, so separators with spaces or glob characters stay intact
fs=''
has_fs=0

while [ $# -gt 0 ]; do
    if [ "$1" = "--" ]; then
        shift
        break
    fi

    case "$1" in
        -f|-F)
            shift
            if [ $# -eq 0 ]; then
                printf "expected value after -F option\n" >&2
                exit 1
            fi
            fs="$1"
            has_fs=1
            shift
            continue
        ;;

        -F=?*)
            # documented -F=fs form: the separator is what follows the `=`
            fs="${1#-F=}"
            has_fs=1
            shift
            continue
        ;;

        -F*)
            # -Ffs form; a lone -F= lands here and means the separator `=`
            fs="${1#-F}"
            has_fs=1
            shift
            continue
        ;;

        -i|--i|-ins|--ins|-insensitive|--insensitive)
            case_insensitive=1
            shift
            continue
        ;;

        -sort|--sort|-sorted|--sorted)
            sort_keys='asort(ordkeys)'
            shift
            continue
        ;;

        -tsv|--tsv)
            tsv=1
            shift
            continue
        ;;
    esac

    break
done

# -tsv overrides any separator given explicitly; awk turns \t into a tab
if [ "${tsv}" -eq 1 ]; then
    fs='\t'
    has_fs=1
fi

# the first leftover argument is the grouping expression, the rest are files
code="${1:-\$0}"
[ $# -gt 0 ] && shift

# show all non-existing files given
failed=0
for arg in "$@"; do
    if [ "${arg}" = "-" ]; then
        continue
    fi
    if [ ! -e "${arg}" ]; then
        printf "no file named \"%s\"\n" "${arg}" >&2
        failed=1
    fi
done

if [ "${failed}" -gt 0 ]; then
    exit 2
fi

# optional AWK prelude turning on case-insensitive matching; ci_failed lets
# the END rule stay silent, since many awk versions still run END after an
# exit from BEGIN
ci='
BEGIN {
    if (IGNORECASE == "") {
        m = "your `awk` command lacks case-insensitive regex-matching"
        print(m) > "/dev/stderr"
        ci_failed = 1
        exit 125
    }
    IGNORECASE = 1
}
'
if [ "${case_insensitive}" -eq 0 ]; then
    ci=''
fi

# NOTE: the AWK program below sits inside shell single quotes, so it must
# never contain an apostrophe, not even in its comments
src="${ci}"'
BEGIN {
    if (SUBSEP == "") SUBSEP = "\034"
}

# stringify turns any value into a double-quoted JSON string; it goes one
# character at a time, so the result does not depend on how each awk treats
# backslashes in gsub replacements
function stringify(s,    i, n, c, out) {
    n = length(s)
    out = ""
    for (i = 1; i <= n; i++) {
        c = substr(s, i, 1)
        if (c == "\\") out = out "\\\\"
        else if (c == "\"") out = out "\\\""
        else if (c == "\t") out = out "\\t"
        else if (c == "\n") out = out "\\n"
        else if (c == "\r") out = out "\\r"
        else if (c == "\b") out = out "\\b"
        else if (c == "\f") out = out "\\f"
        else out = out c
    }
    return "\"" out "\""
}

# init_group zeroes all the stats for a group/column pair; min and max are
# placeholders until the first numeric value shows up
function init_group(key) {
    summaries[key SUBSEP "numeric"] = 0
    summaries[key SUBSEP "integer"] = 0
    summaries[key SUBSEP "positive"] = 0
    summaries[key SUBSEP "zero"] = 0
    summaries[key SUBSEP "negative"] = 0
    summaries[key SUBSEP "min"] = 0
    summaries[key SUBSEP "max"] = 0
    summaries[key SUBSEP "sum"] = 0
    summaries[key SUBSEP "mean"] = 0
    summaries[key SUBSEP "product"] = 1
    summaries[key SUBSEP "_ln_sum"] = 0
    summaries[key SUBSEP "_mean_square"] = 0
}

# update_group folds the number v into the stats of a group/column pair
function update_group(key, v,    n, d1, d2) {
    summaries[key SUBSEP "numeric"]++
    n = summaries[key SUBSEP "numeric"]
    summaries[key SUBSEP "integer"] += (v % 1 == 0)

    if (v > 0) summaries[key SUBSEP "positive"]++
    else if (v < 0) summaries[key SUBSEP "negative"]++
    else if (v == 0) summaries[key SUBSEP "zero"]++

    # the first value seeds both extremes, so no infinities are needed
    if (n == 1 || v < summaries[key SUBSEP "min"]) {
        summaries[key SUBSEP "min"] = v
    }
    if (n == 1 || v > summaries[key SUBSEP "max"]) {
        summaries[key SUBSEP "max"] = v
    }

    summaries[key SUBSEP "sum"] += v
    summaries[key SUBSEP "product"] *= v
    # the log-sum only matters when all values are positive, since the
    # geometric mean is reported as null otherwise
    if (v > 0) summaries[key SUBSEP "_ln_sum"] += log(v)

    # advance the Welford online algorithm for the mean and the variance
    d1 = v - summaries[key SUBSEP "mean"]
    summaries[key SUBSEP "mean"] += d1 / n
    d2 = v - summaries[key SUBSEP "mean"]
    summaries[key SUBSEP "_mean_square"] += d1 * d2
}

# ignore leading UTF-8 BOMs on the first line of each input, when present;
# octal escapes are used because hex ones are not part of POSIX awk
FNR == 1 { sub(/^\357\273\277/, "") }

# ignore trailing carriage-returns
{ sub(/\r$/, "") }

# only the first header line names the columns; header lines from later
# inputs are skipped, instead of being added as extra columns
FNR == 1 {
    if (!got_header) {
        for (i = 1; i <= NF; i++) props[++numprops] = $i
        got_header = 1
    }
    next
}

{
    k = ('"${code}"')

    if (tally[k]++ == 0) {
        # set up every header column, even when this row has fewer fields
        for (i = 1; i <= numprops; i++) init_group(k SUBSEP props[i])
        ordkeys[++numkeys] = k
    }

    for (i = 1; i <= NF && i <= numprops; i++) {
        # only plain decimal numbers count, optionally signed and padded
        # with spaces
        if ($i !~ /^ *[-+]?([0-9]+|[0-9]*\.[0-9]+) *$/) continue
        update_group(k SUBSEP props[i], $i + 0)
    }
}

# emit shows the stats for a group/column pair as a single-line JSON object
function emit(key,    nums, zero, neg, gm) {
    nums = summaries[key SUBSEP "numeric"]
    zero = summaries[key SUBSEP "zero"]
    neg = summaries[key SUBSEP "negative"]

    printf "{\"numeric\": %d, ", nums
    if (nums > 0) {
        emit_pair("min", summaries[key SUBSEP "min"])
        emit_pair("max", summaries[key SUBSEP "max"])
        emit_pair("sum", summaries[key SUBSEP "sum"])
        emit_pair("mean", summaries[key SUBSEP "mean"])
        gm = "null"
        if (zero == 0 && neg == 0) {
            gm = sprintf("%f", exp(summaries[key SUBSEP "_ln_sum"] / nums))
        }
        printf "\"geomean\": %s, ", gm
        # population standard deviation
        emit_pair("sd", sqrt(summaries[key SUBSEP "_mean_square"] / nums))
        # emit_pair("product", summaries[key SUBSEP "product"])
    } else {
        printf "\"min\": null, "
        printf "\"max\": null, "
        printf "\"sum\": null, "
        printf "\"mean\": null, "
        printf "\"geomean\": null, "
        printf "\"sd\": null, "
        # printf "\"product\": null, "
    }
    printf "\"integer\": %d, ", summaries[key SUBSEP "integer"]
    printf "\"positive\": %d, ", summaries[key SUBSEP "positive"]
    printf "\"zero\": %d, ", zero
    printf "\"negative\": %d}", neg
}

# emit_pair shows a JSON key/number pair with a trailing comma; integers
# are shown without decimals
function emit_pair(key, num) {
    if (num % 1 == 0) printf "\"%s\": %d, ", key, num
    else printf "\"%s\": %f, ", key, num
}

END {
    # avoid emitting an empty result after the case-insensitivity failure
    if (ci_failed) exit 125

    '"${sort_keys}"'

    printf "{\n"
    for (i = 1; i <= numkeys; i++) {
        k1 = ordkeys[i]
        if (i > 1) printf ",\n"
        printf " %s: {\n", stringify(k1)
        for (j = 1; j <= numprops; j++) {
            k2 = props[j]
            if (j > 1) printf ",\n"
            printf " %s: ", stringify(k2)
            emit(k1 SUBSEP k2)
        }
        if (j > 1) print ""
        printf " }"
    }
    if (i > 1) print ""
    printf "}\n"
}
'

if [ "${has_fs}" -eq 1 ]; then
    awk -F "${fs}" "${src}" "$@"
else
    awk "${src}" "$@"
fi