From 5d35de5e46fcc8c17552e1eb1a689c851e1e30fb Mon Sep 17 00:00:00 2001
From: Jonathan Rockway <2367+jrockway@users.noreply.github.com>
Date: Sat, 18 Jun 2022 18:12:03 -0400
Subject: [PATCH] WIP: regexp match/no match

---
 cmd/jlog/main.go                  |  36 +++++--
 integration-tests/loggers_test.go |   3 +-
 pkg/parse/filter.go               | 159 ++++++++++++++++++++++++++++++
 pkg/parse/parse.go                |  92 ++---------------
 pkg/parse/parse_fuzz_test.go      |   8 +-
 pkg/parse/parse_test.go           |  27 ++---
 6 files changed, 221 insertions(+), 104 deletions(-)
 create mode 100644 pkg/parse/filter.go

diff --git a/cmd/jlog/main.go b/cmd/jlog/main.go
index 734cbef..1b17075 100644
--- a/cmd/jlog/main.go
+++ b/cmd/jlog/main.go
@@ -6,6 +6,7 @@ import (
 	"io"
 	"os"
 	"os/signal"
+	"regexp"
 	"runtime/debug"
 	"runtime/pprof"
 	"strings"
@@ -43,7 +44,9 @@ type output struct {
 }
 
 type general struct {
-	JQ           string `short:"e" description:"A jq program to run on each record in the processed input; use this to ignore certain lines, add fields, etc.  Hint: 'select(condition)' will remove lines that don't match 'condition'."`
+	MatchRegex   string `short:"g" long:"regex" description:"A regular expression that removes lines from the output that don't match, like grep."`
+	NoMatchRegex string `short:"G" long:"no-regex" description:"A regular expression that removes lines from the output that DO match, like 'grep -v'."`
+	JQ           string `short:"e" long:"jq" description:"A jq program to run on each record in the processed input; use this to ignore certain lines, add fields, etc.  Hint: 'select(condition)' will remove lines that don't match 'condition'."`
 	NoColor      bool   `short:"M" long:"no-color" description:"Disable the use of color." env:"JLOG_FORCE_MONOCHROME"`
 	NoMonochrome bool   `short:"c" long:"no-monochrome" description:"Force the use of color." ENV:"JLOG_FORCE_COLOR"`
 	Profile      string `long:"profile" description:"If set, collect a CPU profile and write it to this file."`
@@ -59,8 +62,8 @@ type input struct {
 	NoTimestampKey bool     `long:"notimekey" description:"If set, don't look for a time, and don't display times." env:"JLOG_NO_TIMESTAMP_KEY"`
 	MessageKey     string   `long:"messagekey" description:"JSON key that holds the log message." env:"JLOG_MESSAGE_KEY"`
 	NoMessageKey   bool     `long:"nomessagekey" description:"If set, don't look for a message, and don't display messages (time/level + fields only)." env:"JLOG_NO_MESSAGE_KEY"`
-	DeleteKeys     []string `long:"delete" description:"JSON keys to be deleted before JQ processing and output; repeatable." env:"JLOG_DELETE_KEYS"`
-	UpgradeKeys    []string `long:"upgrade" description:"JSON key (of type object) whose fields should be merged with any other fields; good for loggers that always put structed data in a separate key; repeatable.\n--upgrade b would transform as follows: {a:'a', b:{'c':'c'}} -> {a:'a', c:'c'}" env:"JLOG_UPGRADE_KEYS"`
+	DeleteKeys     []string `long:"delete" description:"JSON keys to be deleted before JQ processing and output; repeatable." env:"JLOG_DELETE_KEYS" env-delim:","`
+	UpgradeKeys    []string `long:"upgrade" description:"JSON key (of type object) whose fields should be merged with any other fields; good for loggers that always put structed data in a separate key; repeatable.\n--upgrade b would transform as follows: {a:'a', b:{'c':'c'}} -> {a:'a', c:'c'}" env:"JLOG_UPGRADE_KEYS" env-delim:","`
 }
 
 func printVersion(w io.Writer) {
@@ -164,8 +167,29 @@ func main() {
 		subsecondFormt = ""
 	}
 
-	jq, err := parse.CompileJQ(gen.JQ)
-	if err != nil {
+	fsch := new(parse.FilterScheme)
+	if gen.MatchRegex != "" && gen.NoMatchRegex != "" {
+		fmt.Fprintf(os.Stderr, "cannot have both a non-empty MatchRegex and a non-empty NoMatchRegex\n")
+		os.Exit(1)
+	}
+	if rx := gen.MatchRegex; rx != "" {
+		regex, err := regexp.Compile(rx)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "problem compiling MatchRegex: %v\n", err)
+			os.Exit(1)
+		}
+		fsch.MatchRegex = regex
+	}
+	if rx := gen.NoMatchRegex; rx != "" {
+		regex, err := regexp.Compile(rx)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "problem compiling NoMatchRegex: %v\n", err)
+			os.Exit(1)
+		}
+		fsch.NoMatchRegex = regex
+	}
+
+	if err := fsch.AddJQ(gen.JQ); err != nil {
 		fmt.Fprintf(os.Stderr, "problem %v\n", err)
 		os.Exit(1)
 	}
@@ -247,7 +271,7 @@ func main() {
 		signal.Stop(sigCh)
 	}()
 
-	summary, err := parse.ReadLog(os.Stdin, colorable.NewColorableStdout(), ins, outs, jq)
+	summary, err := parse.ReadLog(os.Stdin, colorable.NewColorableStdout(), ins, outs, fsch)
 	if err != nil {
 		if signals := atomic.LoadInt32(&nSignals); signals < 1 || !strings.Contains(err.Error(), "file already closed") {
 			outs.EmitError(err.Error())
diff --git a/integration-tests/loggers_test.go b/integration-tests/loggers_test.go
index 20c3634..bc6b014 100644
--- a/integration-tests/loggers_test.go
+++ b/integration-tests/loggers_test.go
@@ -217,6 +217,7 @@ func TestLoggers(t *testing.T) {
 		PriorityFields: []string{"error", "string", "int", "object", "source"},
 		Formatter:      f,
 	}
+	fs := &parse.FilterScheme{}
 	golden := `
 INFO  1 ok line 1
 INFO  2 ok line 2 string:value int:42 object:{"foo":"bar"}
@@ -235,7 +236,7 @@ ERROR 3 ok line 3 error:whoa
 				output := new(bytes.Buffer)
 				test.f(input)
 				inputCopy := *input
-				if _, err := parse.ReadLog(input, output, ins, outs, nil); err != nil {
+				if _, err := parse.ReadLog(input, output, ins, outs, fs); err != nil {
 					t.Fatalf("readlog: %v", err)
 				}
 				want := golden
diff --git a/pkg/parse/filter.go b/pkg/parse/filter.go
new file mode 100644
index 0000000..97f9e22
--- /dev/null
+++ b/pkg/parse/filter.go
@@ -0,0 +1,159 @@
+package parse
+
+import (
+	"errors"
+	"fmt"
+	"regexp"
+
+	"github.com/itchyny/gojq"
+)
+
+// FilterScheme controls how lines are filtered.
+type FilterScheme struct {
+	JQ           *gojq.Code
+	MatchRegex   *regexp.Regexp
+	NoMatchRegex *regexp.Regexp
+}
+
+// DefaultVariables are variables available to JQ programs.
+var DefaultVariables = []string{
+	"$TS",
+	"$RAW", "$MSG",
+	"$LVL", "$UNKNOWN", "$TRACE", "$DEBUG", "$INFO", "$WARN", "$ERROR", "$PANIC", "$DPANIC", "$FATAL",
+}
+
+// prepareVariable extracts the variables above from a line.
+func prepareVariables(l *line) []interface{} {
+	return []interface{}{
+		float64(l.time.UnixNano()) / 1e9, // $TS
+		string(l.raw), l.msg,
+		uint8(l.lvl), uint8(LevelUnknown), uint8(LevelTrace), uint8(LevelDebug), uint8(LevelInfo), uint8(LevelWarn), uint8(LevelError), uint8(LevelPanic), uint8(LevelDPanic), uint8(LevelFatal),
+	}
+}
+
+// highlightKey is a special key that controls highlighting.
+const highlightKey = "__highlight"
+
+func compileJQ(p string) (*gojq.Code, error) {
+	if p == "" {
+		return nil, nil
+	}
+	p = "def highlight($cond): . + {__highlight: $cond};\n" + p
+	q, err := gojq.Parse(p)
+	if err != nil {
+		return nil, fmt.Errorf("parsing jq program %q: %v", p, err)
+	}
+	jq, err := gojq.Compile(q, gojq.WithVariables(DefaultVariables))
+	if err != nil {
+		return nil, fmt.Errorf("compiling jq program %q: %v", p, err)
+	}
+	return jq, nil
+}
+
+// AddJQ compiles the provided jq program and adds it to the filter.
+func (f *FilterScheme) AddJQ(p string) error {
+	if f.JQ != nil {
+		return errors.New("jq program already added")
+	}
+	jq, err := compileJQ(p)
+	if err != nil {
+		return err // already has decent annotation
+	}
+	f.JQ = jq
+	return nil
+}
+
+// runJQ runs the provided jq program on the provided line.  It returns true if the result is empty
+// (i.e., the line should be filtered out), and an error if the output type is invalid or another
+// error occurred.
+func (f *FilterScheme) runJQ(l *line) (bool, error) {
+	if f.JQ == nil {
+		return false, nil
+	}
+	var filtered bool
+	iter := f.JQ.Run(l.fields, prepareVariables(l)...)
+	if result, ok := iter.Next(); ok {
+		switch x := result.(type) {
+		case map[string]interface{}:
+			if raw, ok := x[highlightKey]; ok {
+				delete(x, highlightKey)
+				if hi, ok := raw.(bool); ok {
+					l.highlight = hi
+				}
+			}
+			l.fields = x
+		case nil:
+			return false, errors.New("unexpected nil result; yield an empty map ('{}') to delete all fields")
+		case error:
+			return false, fmt.Errorf("error: %w", x)
+		case bool:
+			return false, errors.New("unexpected boolean output; did you mean to use 'select(...)'?")
+		default:
+			return false, fmt.Errorf("unexpected result type %T(%#v)", result, result)
+		}
+		if _, ok = iter.Next(); ok {
+			// We only use the first line that is output.  This can be revisited in the
+			// future.
+			return false, errors.New("unexpectedly produced more than 1 output")
+		}
+	} else {
+		filtered = true
+		l.fields = make(map[string]interface{})
+	}
+	return filtered, nil
+}
+
+// regexpScope determines what fields a regexp should run against.  Not implemented yet.
+type regexpScope int
+
+const (
+	regexpScopeUnknown regexpScope = iota
+	regexpScopeMessage
+)
+
+// runRegexp runs the regexp, returning whether or not it matched.
+func runRegexp(rx *regexp.Regexp, l *line, scope regexpScope) bool {
+	var input string
+	switch scope {
+	case regexpScopeUnknown:
+		panic("unknown regexp scope")
+	case regexpScopeMessage:
+		input = l.msg
+	}
+	fields := rx.FindStringSubmatch(input)
+	if len(fields) == 0 {
+		return false
+	}
+	for i, name := range rx.SubexpNames() {
+		if i == 0 {
+			continue
+		}
+		if name == "" {
+			name = fmt.Sprintf("$%v", i)
+		}
+		l.fields[name] = fields[i]
+	}
+	return true
+}
+
+// Run runs all the filters defined in this FilterScheme against the provided line.  The return
+// value is true if the line should be removed from the output ("filtered").
+func (f *FilterScheme) Run(l *line) (bool, error) {
+	if rx := f.NoMatchRegex; rx != nil {
+		found := runRegexp(rx, l, regexpScopeMessage)
+		if found {
+			return true, nil
+		}
+	}
+	if rx := f.MatchRegex; rx != nil {
+		found := runRegexp(rx, l, regexpScopeMessage)
+		if !found {
+			return true, nil
+		}
+	}
+	filtered, err := f.runJQ(l)
+	if err != nil {
+		return false, fmt.Errorf("jq: %w", err)
+	}
+	return filtered, nil
+}
diff --git a/pkg/parse/parse.go b/pkg/parse/parse.go
index a9b8e77..800f4c4 100644
--- a/pkg/parse/parse.go
+++ b/pkg/parse/parse.go
@@ -11,7 +11,6 @@ import (
 	"runtime"
 	"time"
 
-	"github.com/itchyny/gojq"
 	"github.com/logrusorgru/aurora/v3"
 )
 
@@ -130,84 +129,11 @@ type Summary struct {
 	Filtered int
 }
 
-var DefaultVariables = []string{
-	"$TS",
-	"$RAW", "$MSG",
-	"$LVL", "$UNKNOWN", "$TRACE", "$DEBUG", "$INFO", "$WARN", "$ERROR", "$PANIC", "$DPANIC", "$FATAL",
-}
-
-func prepareVariables(l *line) []interface{} {
-	return []interface{}{
-		float64(l.time.UnixNano()) / 1e9, // $TS
-		string(l.raw), l.msg,
-		uint8(l.lvl), uint8(LevelUnknown), uint8(LevelTrace), uint8(LevelDebug), uint8(LevelInfo), uint8(LevelWarn), uint8(LevelError), uint8(LevelPanic), uint8(LevelDPanic), uint8(LevelFatal),
-	}
-}
-
-const highlightKey = "__highlight"
-
-// CompileJQ compiles the provided jq program.
-func CompileJQ(p string) (*gojq.Code, error) {
-	if p == "" {
-		return nil, nil
-	}
-	p = "def highlight($cond): . + {__highlight: $cond};\n" + p
-	q, err := gojq.Parse(p)
-	if err != nil {
-		return nil, fmt.Errorf("parsing jq program %q: %v", p, err)
-	}
-	jq, err := gojq.Compile(q, gojq.WithVariables(DefaultVariables))
-	if err != nil {
-		return nil, fmt.Errorf("compiling jq program %q: %v", p, err)
-	}
-	return jq, nil
-}
-
-// runJQ runs the provided jq program on the provided line.  It returns true if the result is empty
-// (i.e., the line should be filtered out), and an error if the output type is invalid or another
-// error occurred.
-func runJQ(jq *gojq.Code, l *line) (bool, error) {
-	if jq == nil {
-		return false, nil
-	}
-	var filtered bool
-	iter := jq.Run(l.fields, prepareVariables(l)...)
-	if result, ok := iter.Next(); ok {
-		switch x := result.(type) {
-		case map[string]interface{}:
-			if raw, ok := x[highlightKey]; ok {
-				delete(x, highlightKey)
-				if hi, ok := raw.(bool); ok {
-					l.highlight = hi
-				}
-			}
-			l.fields = x
-		case nil:
-			return false, errors.New("unexpected nil result; yield an empty map ('{}') to delete all fields")
-		case error:
-			return false, fmt.Errorf("error: %w", x)
-		case bool:
-			return false, errors.New("unexpected boolean output; did you mean to use 'select(...)'?")
-		default:
-			return false, fmt.Errorf("unexpected result type %T(%#v)", result, result)
-		}
-		if _, ok = iter.Next(); ok {
-			// We only use the first line that is output.  This can be revisited in the
-			// future.
-			return false, errors.New("unexpectedly produced more than 1 output")
-		}
-	} else {
-		filtered = true
-		l.fields = make(map[string]interface{})
-	}
-	return filtered, nil
-}
-
 // ReadLog reads a stream of JSON-formatted log lines from the provided reader according to the
 // input schema, reformatting it and writing to the provided writer according to the output schema.
 // Parse errors are handled according to the input schema.  Any other errors, not including io.EOF
 // on the reader, are returned.
-func ReadLog(r io.Reader, w io.Writer, ins *InputSchema, outs *OutputSchema, jq *gojq.Code) (Summary, error) {
+func ReadLog(r io.Reader, w io.Writer, ins *InputSchema, outs *OutputSchema, filter *FilterScheme) (Summary, error) {
 	s := bufio.NewScanner(r)
 	s.Buffer(make([]byte, 0, LineBufferSize), LineBufferSize)
 	var l line
@@ -309,18 +235,18 @@ func ReadLog(r io.Reader, w io.Writer, ins *InputSchema, outs *OutputSchema, jq
 
 			// Filter.
 			var err error
-			filtered, err = runJQ(jq, &l)
+			filtered, err = filter.Run(&l)
 			if err != nil {
 				addError = true
 				writeRawLine = true
 				recoverable = false
-				// It is questionable as to whether or not jq breaking means that we
-				// should stop processing the log entirely.  It's probably a bug in
-				// the jq program that affects every line, so the sooner we return
-				// the error, the sooner the user can fix their program.  But on the
-				// other hand, is it worth it to spend the time debugging a jq
-				// program that's only broken on one line out of a billion?
-				return fmt.Errorf("jq: %w", err)
+				// It is questionable as to whether or not a filter breaking means
+				// that we should stop processing the log entirely.  It's probably a
+				// bug in the filter that affects every line, so the sooner we
+				// return the error, the sooner the user can fix their filter.  But
+				// on the other hand, is it worth it to spend the time debugging a
+				// jq program that's only broken on one line out of a billion?
+				return fmt.Errorf("filter: %w", err)
 			}
 			if filtered {
 				sum.Filtered++
diff --git a/pkg/parse/parse_fuzz_test.go b/pkg/parse/parse_fuzz_test.go
index 4d66783..b693da5 100644
--- a/pkg/parse/parse_fuzz_test.go
+++ b/pkg/parse/parse_fuzz_test.go
@@ -11,15 +11,17 @@ import (
 	"testing"
 	"time"
 
-	"github.com/itchyny/gojq"
 	"github.com/jrockway/json-logs/pkg/parse/internal/fuzzsupport"
 	"github.com/logrusorgru/aurora/v3"
 )
 
 // runReadLog runs ReadLog against some input, and asserts that certain expectations are met.  It's
 // used to implement FuzzReadLogs and FuzzReadLogsWithJSON.
-func runReadLog(t *testing.T, jq *gojq.Code, in []byte, expectedLines int) {
+func runReadLog(t *testing.T, fs *FilterScheme, in []byte, expectedLines int) {
 	t.Helper()
+	if fs == nil {
+		fs = new(FilterScheme)
+	}
 	inbuf := bytes.NewReader(in)
 	ins := &InputSchema{
 		Strict: false,
@@ -37,7 +39,7 @@ func runReadLog(t *testing.T, jq *gojq.Code, in []byte, expectedLines int) {
 		},
 	}
 	outbuf := new(bytes.Buffer)
-	summary, err := ReadLog(inbuf, outbuf, ins, outs, jq)
+	summary, err := ReadLog(inbuf, outbuf, ins, outs, fs)
 	if err != nil {
 		if errors.Is(err, bufio.ErrTooLong) {
 			// This is a known limit and the fuzzer likes to produce very long
diff --git a/pkg/parse/parse_test.go b/pkg/parse/parse_test.go
index c32b7b4..b983fa5 100644
--- a/pkg/parse/parse_test.go
+++ b/pkg/parse/parse_test.go
@@ -69,13 +69,6 @@ func comperror(x, y error) bool {
 	return x.Error() == y.Error()
 }
 
-func mustJQ(prog string) *gojq.Code {
-	jq, err := CompileJQ(prog)
-	if err != nil {
-		panic(err)
-	}
-	return jq
-}
 func TestRead(t *testing.T) {
 	tests := []struct {
 		name  string
@@ -959,7 +952,10 @@ func TestReadLog(t *testing.T) {
 		}
 
 		t.Run(test.name, func(t *testing.T) {
-			summary, err := ReadLog(test.r, test.w, test.is, os, test.jq)
+			fs := &FilterScheme{
+				JQ: test.jq,
+			}
+			summary, err := ReadLog(test.r, test.w, test.is, os, fs)
 			if diff := cmp.Diff(test.w.String(), test.wantOutput); diff != "" {
 				t.Errorf("output: %v", diff)
 			}
@@ -976,13 +972,21 @@ func TestReadLog(t *testing.T) {
 	}
 }
 
+func mustJQ(prog string) *gojq.Code {
+	jq, err := compileJQ(prog)
+	if err != nil {
+		panic(err)
+	}
+	return jq
+}
+
 func TestReadLogWithNullFormatter(t *testing.T) {
 	r := strings.NewReader(`{"level":"info","ts":12345,"msg":"foo"}` + "\n")
 	w := io.Discard
 	is := &InputSchema{Strict: false}
 	os := &OutputSchema{}
-	jq := mustJQ(".")
-	if _, err := ReadLog(r, w, is, os, jq); err != nil {
+	fs := &FilterScheme{JQ: mustJQ(".")}
+	if _, err := ReadLog(r, w, is, os, fs); err != nil {
 		t.Fatal(err)
 	}
 }
@@ -1061,7 +1065,8 @@ func TestJQ(t *testing.T) {
 		},
 	}
 	for _, test := range testData {
-		gotFiltered, gotErr := runJQ(test.jq, test.l)
+		fs := &FilterScheme{JQ: test.jq}
+		gotFiltered, gotErr := fs.runJQ(test.l)
 		if diff := cmp.Diff(test.l, test.wantLine, cmp.AllowUnexported(line{}), cmpopts.EquateEmpty()); diff != "" {
 			t.Errorf("line: %s", diff)
 		}