#!/bin/sh
#
# shell pattern matching benchmark, by avih. License: MIT.


echo()  { printf %s\\n "$*"; }
echon() { printf %s    "$*"; }

SELF=${0##*[\\/]}

HELP="\
Usage: $SELF [OPTIONS]
Measure duration of shell pattern matches with 5 text lines.
Lines are ~ 80 chars each. There are ~ 50 builtin patterns.

Matched-lines are printed as e.g. '(M:1-34-)' (lines 1,3,4).
The builtin patterns know which of the default lines should match,
and print 'OK', or e.g. 'E:--3-5' (expected: 3,5), or 'unknown'
for other combos. Exit code is 0 if there are no 'E:.....' prints.

The builtin patterns have increasing number of '*'. This will
blow with exponential implementations. Just abort it manuallly.

  -s      Use short input lines (3-10 chars each) instead of ~ 80 chars.
  -i      Use the first 5 lines from stdin instead of builtin lines.
  -p PAT  Use [newline-separated] PAT instead of the builtin patterns.
  -n N    Iterate the 5-lines N x 1000 times. Default: 10 (50K matches).
  -d      Don't benchmark. Report matched lines, and errors if possible.
  -o      Print matched lines after each pettern result.
  -m MODE Measurement mode:
          'c': Use print-ms cmd. Tries \$NOWCMD, date +%s%3N, few more.
          't': Use shell (user) 'times' - typically 10 ms resolution.
          'n': Don't measure internally and reduce overhead (print only
               progress). Overall duration can be measured externally.
          Default: try 'c', and if no usable command is found, use 't'.
"


# input lines

DEFLINES="\
Lorem	ipsum dolor sit amet, consectetur adipiscing elit, [] do eiusmod tempor
incididunt ut labore_et dolore magna aliqua. Ut enim ad minim veniam, xyzquis
nostrud exercitation ullamco laboris nisi [!] aliquip ex commodoxyz] consequat.
Duis aute irure dolor - reprehenderit in voluptate velit esse cillum dolore
eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt
"

SHORTLINES="\
Lorem
ut.
cupidatat.
aute
esse 
"

deflines()   { printf %s "$DEFLINES"; }
shortlines() { printf %s "$SHORTLINES"; }
stdinlines() { head -5; }


# builtin patterns

# a/b, [c/]d/e, and f/g are equivalent, made same-length to reduce length bias

a='[0-9A-Za-z]'
b='[[:alnum:]]'  # 1st in char-classes (lexicographic) list - best-case

c='[a-z]'        # same as next patterns, but shorter to parse
d='[a-bc-de-z]'
e='[[:lower:]]'  # 6th in char-classes list

f='[0-9A-Fa-ef]'
g='[[:xdigit:]]'  # 12th in char-classes list - worst-case

# the test patterns starts at the 7th char of the line. char 6 is space.
# chars 1-5 are either space if this pattern is not tested for correctness,
# else the expected matched line numbers with the long builtin lines,
# e.g. all lines: 12345, lines 2 and 5: -2--5, no line: -----, etc.
PATTERNS="\
----- 
12345 *
----- @
1---- Lorem*
-2--- *quis
--3-- *nisi*
--3-- *[fobar]xyz*
--3-- *[foobarfoobarfoobarfoobar]xyz*
-2--- *[!foobarfoobarfoobarfoobar]xyz*
12345 *[hello world, this is a test]*
12345 *[!hello world, this is a test]*
1-3-- *[*
1---- *[]*
--3-- *[!]*
-23-5 *[!]].*
1---- *[![:print:]]*
-23-- *z*
---4- *-*
---4- *[-]*
-234- *[-z]*
-234- *[z-]*
-234- *[-z-]*
1-3-- *]*
1-3-- *[]]*
1-34- *[]-]*
123-- *[]-_]*
1234- *[]-_-]*
12-45 *$a$a
12-45 *$b$b
12-45 *$c$c
12-45 *$d$d
12-45 *$e$e
----- *$f$f
----- *$g$g
12-45 *$a*$a
12-45 *$b*$b
12-45 *$c*$c
12-45 *$d*$d
12-45 *$e*$e
---4- *$f*$f
---4- *$g*$g
12-45 *$a*$a*$a
12-45 *$b*$b*$b
12-45 *$c*$c*$c
12-45 *$d*$d*$d
12-45 *$e*$e*$e
---4- *$f*$f*$f
---4- *$g*$g*$g
12-45 *$a*$a*$a*$a
12-45 *$b*$b*$b*$b
12-45 *$c*$c*$c*$c
12-45 *$d*$d*$d*$d
12-45 *$e*$e*$e*$e
---4- *$f*$f*$f*$f
---4- *$g*$g*$g*$g
"


# measurement. we have xtime_mscmd and xtime_times. both run "$@" and
# set $dt to the duration, as "N ms" or shell-user-time str from "times".
# unfortunately none is ideal. the former is non-posix, and the latter
# is typically 10ms resolution. 'time sh -c ...' is also usually 10ms res,
# and would have required even more setup work (generate bench script per
# pattern, figure out what is the current shell so that we can invoke it).

# no standard way to print timestamp in ms, so try few options.
# xtime below tries to compensate latency. also serves as warmup for xtime.
init_measure() {
    while IFS= read -r cmd_ms; do
        { x=$(eval "$cmd_ms"); } 2>/dev/null
        case $x in [1-9]*[0-9])
            [ "${#x}" -gt 11 ] && break  # 10 digits is seconds-timestamp
        esac
        cmd_ms=
    done <<- CMDS
	${NOWCMD-}
	date  +%s%3N
	gdate +%s%3N
	perl -MPOSIX=strftime -MTime::HiRes=time -le "print int(time*1000)"
	python  -c "from time import time; print (int(time()*1000))"
	python3 -c "from time import time; print (int(time()*1000))"
	nodejs -e "console.log(Date.now())"
	node   -e "console.log(Date.now())"
	CMDS

    if ! [ "$cmd_ms" ]; then
        >&2 echo "$SELF: error - cannot find timestamp-ms command. (try -h)"
        >&2 echo "(tried gnu date, and perl/python/nodejs)"
        exit 1
    fi
}
now_ms() {
    eval "$cmd_ms"
}
xtime_mscmd() {
    t0=$(now_ms)
    "$@"
    t1=$(now_ms)
    t2=$(now_ms)
    dt=$((t1 - t0 - (t2 - t1)))  # latency-compensated-ish
    [ $dt -ge 0 ] || dt=0
    dt="$dt ms"
}


# theoretically ideal, but sometimes only 10ms res, and sometimes doesn't
# reset despite executing in a subshell (ksh93). also, runs "$@" in subshell.
xtime_times() {
    if [ "${KSH_VERSION+x}" ]; then
        # ksh93 has no-fork optimization which breaks times. force fork.
        # ksh clones have this var too - openbsd/mksh, but it still works.
        dt=$(ulimit -c 0; "$@"; times)
    else
        dt=$("$@"; times)
    fi
    dt=${dt%% *}  # 1st element - "shell user time"
}


# print matched lines for pattern $1, and benchmark pattern $1

print_matches() {
    while IFS= read -r line; do
        case $line in $1) echo "$line"; esac
    done
}

# positional parameters are usually quicker to access than named vars.
# the action on match is intentionally empty to remove matches duration
# bias due to additional command. no known shell optimizes out such lines.
benchmark() {
    set -- "$1" "$L1" "$L2" "$L3" "$L4" "$L5"
    n=$((1+N*1000))
    until [ $((n=n-1)) = 0 ]; do
        case $2 in $1) ;; esac
        case $3 in $1) ;; esac
        case $4 in $1) ;; esac
        case $5 in $1) ;; esac
        case $6 in $1) ;; esac
    done
}


# main

N=10 D= O= measure= input_lines=deflines cpat=
opts=hidm:n:sp:o
while getopts $opts o; do
    case $o in
    s)  input_lines=shortlines ;;
    i)  input_lines=stdinlines ;;
    n)  N=$OPTARG ;;
    o)  O=yes ;;
    d)  D=yes ;;
    p)  PATTERNS=$OPTARG cpat=yes ;;  # custom patterns
    m)  case $OPTARG in
        c) measure=xtime_mscmd ;;
        t) measure=xtime_times ;;
        n) measure=xtime_none  ;;
        *) >&2 echo "$SELF: invalid -m MODE value    (try -h)"
           exit 1
        esac
        ;;
    h)  echon "$HELP"
        exit
        ;;
    *)  >&2 echo "Usage: $SELF [-$opts]    (try -h)"
        exit 1
    esac
done

if [ "$measure" ]; then
    [ "$measure" = xtime_mscmd ] && init_measure
elif (init_measure 2>/dev/null); then
    measure=xtime_mscmd
    init_measure
else
    measure=xtime_times
fi


# read the input lines to vars L1-L5 to benchmark them without IO
{
    IFS= read -r L1
    IFS= read -r L2
    IFS= read -r L3
    IFS= read -r L4
    IFS= read -r L5
} << EOF
$($input_lines)
EOF

PIN() { printf %s\\n "$L1" "$L2" "$L3" "$L4" "$L5"; }

# prints matched lines as e.g. "1--45"
pmatches() {
    o=
    case $L1 in $1) o=${o}1;; *) o=$o-; esac
    case $L2 in $1) o=${o}2;; *) o=$o-; esac
    case $L3 in $1) o=${o}3;; *) o=$o-; esac
    case $L4 in $1) o=${o}4;; *) o=$o-; esac
    case $L5 in $1) o=${o}5;; *) o=$o-; esac
    echo "$o"
}

# let's go

npat=0 err=
printf %s "$PATTERNS" |
while IFS= read -r x || [ "$x" ]; do
    npat=$((npat+1))
    [ "$cpat" ] && p=$x || p=${x#??????}

    if [ "$measure" = xtime_none ]; then
        # remove overheads. the user will measure overall time on their own.
        benchmark "$p"
        echo $npat
        continue
    fi

    res=$(pmatches "$p")
    if [ "$input_lines" = deflines ] && ! [ "$cpat" ]; then
        expect=${x%" $p"}
        case x$expect in
            x"     ") r="unknown" ;;
             x"$res") r="OK     " ;;
                  x*) r=E:$expect err=yes ;;
        esac
    else
        r=unknown
    fi


    if [ "$D" ]; then
        printf "[%2s] %s (M:%s)  %s\n" $npat "$r" "$res" "'$p'"
    else
        $measure benchmark "$p"
        printf "[%2s] %9s  %s (M:%s)  %s\n" $npat "$dt" "$r" "$res" "'$p'"
    fi


    if [ "$O" ]; then
        PIN | print_matches "$p"
        echo
    fi

    [ -z "$err" ]
done