From d714f4687ad0177d21b3a95679715d2bd4ddab53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rok=20Su=C5=A1nik?=
Date: Fri, 15 Mar 2024 17:37:15 +0100
Subject: [PATCH] implement spreadsheet joins

Add the left-join and inner-join builtins. Both are backed by the new
LeftJoin helper, which uses an index on the second spreadsheet's join
column when one exists and falls back to a linear scan otherwise.

Also rename add-index! to add-indexes!, add the indexes? builtin, make
Spreadsheet.GetRow return a SpreadsheetRow, and let
GenerateColumnRegexReplace modify the spreadsheet through a pointer.
---
Review fixups folded into this patch:
 - left-join / inner-join: actually return the argument error when the
   4th argument is not a string (it was constructed but dropped).
 - LeftJoin: do not read rowIds[0] from an empty index entry.
 - LeftJoin: report "column not found" under the builtin that was called.
 - indexes?: stop shadowing the named result with a local of the same name.

 env/spreadsheet.go             |   2 +-
 evaldo/builtins_spreadsheet.go | 175 +++++++++++++++++++++++++++++++--
 tests/structures.rye           |  47 +++++++++-
 3 files changed, 214 insertions(+), 10 deletions(-)

diff --git a/env/spreadsheet.go b/env/spreadsheet.go
index dbd3f3a9..b5fd2558 100644
--- a/env/spreadsheet.go
+++ b/env/spreadsheet.go
@@ -201,7 +201,7 @@ func (s Spreadsheet) Columns(ps *ProgramState, names []string) Object {
 	return *nspr
 }
 
-func (s Spreadsheet) GetRow(ps *ProgramState, index int) Object {
+func (s Spreadsheet) GetRow(ps *ProgramState, index int) SpreadsheetRow {
 	row := s.Rows[index]
 	row.Uplink = &s
 	return row
diff --git a/evaldo/builtins_spreadsheet.go b/evaldo/builtins_spreadsheet.go
index 143eb806..cf657037 100644
--- a/evaldo/builtins_spreadsheet.go
+++ b/evaldo/builtins_spreadsheet.go
@@ -567,7 +567,11 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
 							if !ok {
 								return MakeBuiltinError(ps, "Second element of replacement block must be a string.", "add-col!")
 							}
-							return GenerateColumnRegexReplace(ps, spr, newCol, fromCols, regex, replaceStr.Value)
+							err := GenerateColumnRegexReplace(ps, &spr, newCol, fromCols, regex, replaceStr.Value)
+							if err != nil {
+								return err
+							}
+							return spr
 						default:
 							return MakeArgError(ps, 3, []env.Type{env.BlockType}, "add-col!")
 						}
@@ -582,9 +586,9 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
 			}
 		},
 	},
-	"add-index!": {
+	"add-indexes!": {
 		Argsn: 2,
-		Doc:   "Indexes all values in a colun and istre it,",
+		Doc:   "Creates an index for all values in the provided columns. Changes in-place and returns the new spreadsheet.",
 		Fn: func(ps *env.ProgramState, arg0 env.Object, arg1 env.Object, arg2 env.Object, arg3 env.Object, arg4 env.Object) env.Object {
 			switch spr := arg0.(type) {
 			case env.Spreadsheet:
@@ -599,9 +603,9 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
 							return MakeError(ps, "Block of tagwords needed")
 						}
 					}
-					res := AddIndexes(ps, &spr, colWords)
-					if res != nil {
-						return res
+					err := AddIndexes(ps, &spr, colWords)
+					if err != nil {
+						return err
 					}
 					return spr
 				default:
@@ -612,6 +616,22 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
 			}
 		},
 	},
+	"indexes?": {
+		Argsn: 1,
+		Doc:   "Returns the columns that are indexed in a spreadsheet.",
+		Fn: func(ps *env.ProgramState, arg0 env.Object, arg1 env.Object, arg2 env.Object, arg3 env.Object, arg4 env.Object) (res env.Object) {
+			switch spr := arg0.(type) {
+			case env.Spreadsheet:
+				cols := make([]env.Object, 0, len(spr.Indexes))
+				for col := range spr.Indexes {
+					cols = append(cols, *env.NewString(col))
+				}
+				return *env.NewBlock(*env.NewTSeries(cols))
+			default:
+				return MakeArgError(ps, 1, []env.Type{env.SpreadsheetType}, "indexes?")
+			}
+		},
+	},
 	"autotype": {
 		Argsn: 2,
 		Doc:   "Takes a spreadsheet and tries to determine and change the types of columns.",
@@ -629,6 +649,70 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
 			}
 		},
 	},
+	"left-join": {
+		Argsn: 4,
+		Doc:   "Left joins two spreadsheets on the given columns.",
+		Fn: func(ps *env.ProgramState, arg0 env.Object, arg1 env.Object, arg2 env.Object, arg3 env.Object, arg4 env.Object) (res env.Object) {
+			switch spr1 := arg0.(type) {
+			case env.Spreadsheet:
+				switch spr2 := arg1.(type) {
+				case env.Spreadsheet:
+					switch col1 := arg2.(type) {
+					case env.Word:
+						col2, ok := arg3.(env.Word)
+						if !ok {
+							return MakeArgError(ps, 4, []env.Type{env.WordType}, "left-join")
+						}
+						return LeftJoin(ps, spr1, spr2, ps.Idx.GetWord(col1.Index), ps.Idx.GetWord(col2.Index), false)
+					case env.String:
+						col2, ok := arg3.(env.String)
+						if !ok {
+							return MakeArgError(ps, 4, []env.Type{env.StringType}, "left-join")
+						}
+						return LeftJoin(ps, spr1, spr2, col1.Value, col2.Value, false)
+					default:
+						return MakeArgError(ps, 3, []env.Type{env.WordType, env.StringType}, "left-join")
+					}
+				default:
+					return MakeArgError(ps, 2, []env.Type{env.SpreadsheetType}, "left-join")
+				}
+			default:
+				return MakeArgError(ps, 1, []env.Type{env.SpreadsheetType}, "left-join")
+			}
+		},
+	},
+	"inner-join": {
+		Argsn: 4,
+		Doc:   "Inner joins two spreadsheets on the given columns.",
+		Fn: func(ps *env.ProgramState, arg0 env.Object, arg1 env.Object, arg2 env.Object, arg3 env.Object, arg4 env.Object) (res env.Object) {
+			switch spr1 := arg0.(type) {
+			case env.Spreadsheet:
+				switch spr2 := arg1.(type) {
+				case env.Spreadsheet:
+					switch col1 := arg2.(type) {
+					case env.Word:
+						col2, ok := arg3.(env.Word)
+						if !ok {
+							return MakeArgError(ps, 4, []env.Type{env.WordType}, "inner-join")
+						}
+						return LeftJoin(ps, spr1, spr2, ps.Idx.GetWord(col1.Index), ps.Idx.GetWord(col2.Index), true)
+					case env.String:
+						col2, ok := arg3.(env.String)
+						if !ok {
+							return MakeArgError(ps, 4, []env.Type{env.StringType}, "inner-join")
+						}
+						return LeftJoin(ps, spr1, spr2, col1.Value, col2.Value, true)
+					default:
+						return MakeArgError(ps, 3, []env.Type{env.WordType, env.StringType}, "inner-join")
+					}
+				default:
+					return MakeArgError(ps, 2, []env.Type{env.SpreadsheetType}, "inner-join")
+				}
+			default:
+				return MakeArgError(ps, 1, []env.Type{env.SpreadsheetType}, "inner-join")
+			}
+		},
+	},
 }
 
 func GenerateColumn(ps *env.ProgramState, s env.Spreadsheet, name env.Word, extractCols env.Block, code env.Block) env.Object {
@@ -668,7 +752,7 @@ func GenerateColumn(ps *env.ProgramState, s env.Spreadsheet, name env.Word, extr
 	return s
 }
 
-func GenerateColumnRegexReplace(ps *env.ProgramState, s env.Spreadsheet, name env.Word, fromColName env.Word, re *regexp.Regexp, pattern string) env.Object {
+func GenerateColumnRegexReplace(ps *env.ProgramState, s *env.Spreadsheet, name env.Word, fromColName env.Word, re *regexp.Regexp, pattern string) env.Object {
 	// add name to columns
 	s.Cols = append(s.Cols, ps.Idx.GetWord(name.Index))
 	for ix, row := range s.Rows {
@@ -690,7 +774,7 @@ func GenerateColumnRegexReplace(ps *env.ProgramState, s env.Spreadsheet, name en
 		row.Values = append(row.Values, newVal)
 		s.Rows[ix] = row
 	}
-	return s
+	return nil
 }
 
 func AddIndexes(ps *env.ProgramState, s *env.Spreadsheet, columns []env.Word) env.Object {
@@ -938,3 +1022,78 @@ func AutoType(ps *env.ProgramState, s *env.Spreadsheet, percent float64) env.Obj
 
 	return *newS
 }
+
+// LeftJoin joins s1 with s2 on s1's col1 and s2's col2 and returns the result
+// as a new spreadsheet. Each row of s1 is paired with at most one row of s2
+// (the first match). When innerJoin is false, rows of s1 without a match are
+// kept and their s2 columns are filled with env.Void{}; when innerJoin is true
+// such rows are dropped. Columns of s2 whose name is already taken get a "_2"
+// suffix. An existing index on col2 of s2 is used for the lookup if present.
+// An error object is returned when a join column is missing or unreadable.
+func LeftJoin(ps *env.ProgramState, s1 env.Spreadsheet, s2 env.Spreadsheet, col1 string, col2 string, innerJoin bool) env.Object {
+	// report errors under the name of the builtin that was actually called
+	fnName := "left-join"
+	if innerJoin {
+		fnName = "inner-join"
+	}
+	if !slices.Contains(s1.Cols, col1) {
+		return MakeBuiltinError(ps, "Column not found in first spreadsheet.", fnName)
+	}
+	if !slices.Contains(s2.Cols, col2) {
+		return MakeBuiltinError(ps, "Column not found in second spreadsheet.", fnName)
+	}
+
+	combinedCols := make([]string, len(s1.Cols)+len(s2.Cols))
+	copy(combinedCols, s1.Cols)
+	for i, v := range s2.Cols {
+		if slices.Contains(combinedCols, v) {
+			combinedCols[i+len(s1.Cols)] = v + "_2"
+		} else {
+			combinedCols[i+len(s1.Cols)] = v
+		}
+	}
+	nspr := env.NewSpreadsheet(combinedCols)
+	for _, row1 := range s1.GetRows() {
+		val1, err := s1.GetRowValue(col1, row1)
+		if err != nil {
+			return MakeError(ps, "Couldn't retrieve value at row")
+		}
+		newRow := make([]any, len(combinedCols))
+
+		// the row id of the second spreadsheet that matches the current row
+		s2RowId := -1
+		// use index if available
+		if ix, ok := s2.Indexes[col2]; ok {
+			if rowIds, ok := ix[val1]; ok && len(rowIds) > 0 {
+				// if there are multiple rows with the same value (ie. joining on non-unique column), just use the first one
+				s2RowId = rowIds[0]
+			}
+		} else {
+			for i, row2 := range s2.GetRows() {
+				val2, err := s2.GetRowValue(col2, row2)
+				if err != nil {
+					return MakeError(ps, "Couldn't retrieve value at row")
+				}
+				if val1.(env.Object).Equal(val2.(env.Object)) {
+					s2RowId = i
+					break
+				}
+			}
+		}
+		if innerJoin && s2RowId == -1 {
+			continue
+		}
+		copy(newRow, row1.Values)
+		if s2RowId > -1 {
+			for i, v := range s2.GetRow(ps, s2RowId).Values {
+				newRow[i+len(s1.Cols)] = v
+			}
+		} else {
+			for i := range s2.Cols {
+				newRow[i+len(s1.Cols)] = env.Void{}
+			}
+		}
+		nspr.AddRow(*env.NewSpreadsheetRow(newRow, nspr))
+	}
+	return *nspr
+}
diff --git a/tests/structures.rye b/tests/structures.rye
index 4c09fd05..fab6707a 100644
--- a/tests/structures.rye
+++ b/tests/structures.rye
@@ -572,7 +572,7 @@ section "Serializers and loaders"
 
 
 section "Spreadsheet related functions"
-"Functions for handling and working with Context."
+"Functions for creating and working with spreadsheets."
 {
 
 	group "spreadsheet & related"
@@ -608,6 +608,51 @@ section "Spreadsheet related functions"
 		equal { to-spreadsheet vals { dict { "a" 1 b 2 } dict { "a" 3 "b" 4 } } } spreadsheet { "a" "b" } { 1 2 3 4 }
 		equal { to-spreadsheet vals { dict { "a" 1 b 2 "c" 3 } dict { "a" 4 "b" 5 } } } spreadsheet { "a" "b" "c" } { 1 2 3 4 5 _ }
 	}
+
+	group "index"
+	mold\nowrap ?add-indexes!
+	{ { block } }
+	{
+		; returned value
+		equal { spr: spreadsheet { "a" "b" } { 1 2 3 4 } |add-indexes! [ 'a ] |indexes? } { "a" }
+		; in-place
+		; TODO this should work but doesn't, index should be added in place but for some reason it isn't
+		; equal { spr: spreadsheet { "a" "b" } { 1 2 3 4 } , spr .add-indexes! [ 'a ] , spr .indexes? } { "a" }
+	}
+
+	group "left join"
+	mold\nowrap ?left-join
+	{ { block } }
+	{
+		equal { names: spreadsheet { "id" "name" } { 1 "Paul" 2 "Chani" 3 "Vladimir" } ,
+			houses: spreadsheet { "id" "house" } { 1 "Atreides" 3 "Harkonnen" 4 "Corrino" } ,
+			names .left-join houses 'id 'id
+		} spreadsheet { "id" "name" "id_2" "house" } { 1 "Paul" 1 "Atreides" 2 "Chani" _ _ 3 "Vladimir" 3 "Harkonnen" }
+
+		; joining with an index on the second spreadsheet
+		equal { names: spreadsheet { "id" "name" } { 1 "Paul" 2 "Chani" 3 "Vladimir" } ,
+			houses: spreadsheet { "id" "house" } { 1 "Atreides" 3 "Harkonnen" } ,
+			houses .add-indexes! [ 'id ] :houses ,
+			names .left-join houses 'id 'id
+		} spreadsheet { "id" "name" "id_2" "house" } { 1 "Paul" 1 "Atreides" 2 "Chani" _ _ 3 "Vladimir" 3 "Harkonnen" }
+	}
+
+	group "inner join"
+	mold\nowrap ?inner-join
+	{ { block } }
+	{
+		equal { names: spreadsheet { "id" "name" } { 1 "Paul" 2 "Chani" 3 "Vladimir" } ,
+			houses: spreadsheet { "id" "house" } { 1 "Atreides" 3 "Harkonnen" 4 "Corrino" } ,
+			names .inner-join houses 'id 'id
+		} spreadsheet { "id" "name" "id_2" "house" } { 1 "Paul" 1 "Atreides" 3 "Vladimir" 3 "Harkonnen" }
+
+		; joining with an index on the second spreadsheet
+		equal { names: spreadsheet { "id" "name" } { 1 "Paul" 2 "Chani" 3 "Vladimir" } ,
+			houses: spreadsheet { "id" "house" } { 1 "Atreides" 3 "Harkonnen" } ,
+			houses .add-indexes! [ 'id ] :houses ,
+			names .inner-join houses 'id 'id
+		} spreadsheet { "id" "name" "id_2" "house" } { 1 "Paul" 1 "Atreides" 3 "Vladimir" 3 "Harkonnen" }
+	}
 }
 
 