Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement spreadsheet joins #157

Merged
merged 1 commit into from
Mar 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion env/spreadsheet.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ func (s Spreadsheet) Columns(ps *ProgramState, names []string) Object {
return *nspr
}

func (s Spreadsheet) GetRow(ps *ProgramState, index int) Object {
func (s Spreadsheet) GetRow(ps *ProgramState, index int) SpreadsheetRow {
row := s.Rows[index]
row.Uplink = &s
return row
Expand Down
163 changes: 155 additions & 8 deletions evaldo/builtins_spreadsheet.go
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,11 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
if !ok {
return MakeBuiltinError(ps, "Second element of replacement block must be a string.", "add-col!")
}
return GenerateColumnRegexReplace(ps, spr, newCol, fromCols, regex, replaceStr.Value)
err := GenerateColumnRegexReplace(ps, &spr, newCol, fromCols, regex, replaceStr.Value)
if err != nil {
return err
}
return spr
default:
return MakeArgError(ps, 3, []env.Type{env.BlockType}, "add-col!")
}
Expand All @@ -582,9 +586,9 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
}
},
},
"add-index!": {
"add-indexes!": {
Argsn: 2,
Doc: "Indexes all values in a colun and istre it,",
Doc: "Creates an index for all values in the provided columns. Changes in-place and returns the new spreadsheet.",
Fn: func(ps *env.ProgramState, arg0 env.Object, arg1 env.Object, arg2 env.Object, arg3 env.Object, arg4 env.Object) env.Object {
switch spr := arg0.(type) {
case env.Spreadsheet:
Expand All @@ -599,9 +603,9 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
return MakeError(ps, "Block of tagwords needed")
}
}
res := AddIndexes(ps, &spr, colWords)
if res != nil {
return res
err := AddIndexes(ps, &spr, colWords)
if err != nil {
return err
}
return spr
default:
Expand All @@ -612,6 +616,22 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
}
},
},
"indexes?": {
Argsn: 1,
Doc: "Returns the columns that are indexed in a spreadsheet.",
Fn: func(ps *env.ProgramState, arg0 env.Object, arg1 env.Object, arg2 env.Object, arg3 env.Object, arg4 env.Object) (res env.Object) {
switch spr := arg0.(type) {
case env.Spreadsheet:
res := make([]env.Object, 0)
for col := range spr.Indexes {
res = append(res, *env.NewString(col))
}
return *env.NewBlock(*env.NewTSeries(res))
default:
return MakeArgError(ps, 1, []env.Type{env.SpreadsheetType}, "indexes?")
}
},
},
"autotype": {
Argsn: 2,
Doc: "Takes a spreadsheet and tries to determine and change the types of columns.",
Expand All @@ -629,6 +649,70 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
}
},
},
// "left-join" takes two spreadsheets and two column names and delegates to
// LeftJoin with innerJoin=false. The column names must be given either both
// as words or both as strings; any other combination yields an argument error
// naming the offending argument position.
"left-join": {
	Argsn: 4,
	Doc:   "Left joins two spreadsheets on the given columns.",
	Fn: func(ps *env.ProgramState, arg0 env.Object, arg1 env.Object, arg2 env.Object, arg3 env.Object, arg4 env.Object) (res env.Object) {
		spr1, ok := arg0.(env.Spreadsheet)
		if !ok {
			return MakeArgError(ps, 1, []env.Type{env.SpreadsheetType}, "left-join")
		}
		spr2, ok := arg1.(env.Spreadsheet)
		if !ok {
			return MakeArgError(ps, 2, []env.Type{env.SpreadsheetType}, "left-join")
		}
		switch col1 := arg2.(type) {
		case env.Word:
			col2, ok := arg3.(env.Word)
			if !ok {
				return MakeArgError(ps, 4, []env.Type{env.WordType}, "left-join")
			}
			// Words are resolved to their string names through the program's word index.
			return LeftJoin(ps, spr1, spr2, ps.Idx.GetWord(col1.Index), ps.Idx.GetWord(col2.Index), false)
		case env.String:
			col2, ok := arg3.(env.String)
			if !ok {
				// The error must be returned; otherwise the join would run
				// with the zero-value (empty) column name.
				return MakeArgError(ps, 4, []env.Type{env.StringType}, "left-join")
			}
			return LeftJoin(ps, spr1, spr2, col1.Value, col2.Value, false)
		default:
			return MakeArgError(ps, 3, []env.Type{env.WordType, env.StringType}, "left-join")
		}
	},
},
// "inner-join" takes two spreadsheets and two column names and delegates to
// LeftJoin with innerJoin=true, which drops rows of the first spreadsheet that
// have no match in the second. The column names must be given either both as
// words or both as strings; any other combination yields an argument error
// naming the offending argument position.
"inner-join": {
	Argsn: 4,
	Doc:   "Inner joins two spreadsheets on the given columns.",
	Fn: func(ps *env.ProgramState, arg0 env.Object, arg1 env.Object, arg2 env.Object, arg3 env.Object, arg4 env.Object) (res env.Object) {
		spr1, ok := arg0.(env.Spreadsheet)
		if !ok {
			return MakeArgError(ps, 1, []env.Type{env.SpreadsheetType}, "inner-join")
		}
		spr2, ok := arg1.(env.Spreadsheet)
		if !ok {
			return MakeArgError(ps, 2, []env.Type{env.SpreadsheetType}, "inner-join")
		}
		switch col1 := arg2.(type) {
		case env.Word:
			col2, ok := arg3.(env.Word)
			if !ok {
				return MakeArgError(ps, 4, []env.Type{env.WordType}, "inner-join")
			}
			// Words are resolved to their string names through the program's word index.
			return LeftJoin(ps, spr1, spr2, ps.Idx.GetWord(col1.Index), ps.Idx.GetWord(col2.Index), true)
		case env.String:
			col2, ok := arg3.(env.String)
			if !ok {
				// The error must be returned; otherwise the join would run
				// with the zero-value (empty) column name.
				return MakeArgError(ps, 4, []env.Type{env.StringType}, "inner-join")
			}
			return LeftJoin(ps, spr1, spr2, col1.Value, col2.Value, true)
		default:
			return MakeArgError(ps, 3, []env.Type{env.WordType, env.StringType}, "inner-join")
		}
	},
},
}

func GenerateColumn(ps *env.ProgramState, s env.Spreadsheet, name env.Word, extractCols env.Block, code env.Block) env.Object {
Expand Down Expand Up @@ -668,7 +752,7 @@ func GenerateColumn(ps *env.ProgramState, s env.Spreadsheet, name env.Word, extr
return s
}

func GenerateColumnRegexReplace(ps *env.ProgramState, s env.Spreadsheet, name env.Word, fromColName env.Word, re *regexp.Regexp, pattern string) env.Object {
func GenerateColumnRegexReplace(ps *env.ProgramState, s *env.Spreadsheet, name env.Word, fromColName env.Word, re *regexp.Regexp, pattern string) env.Object {
// add name to columns
s.Cols = append(s.Cols, ps.Idx.GetWord(name.Index))
for ix, row := range s.Rows {
Expand All @@ -690,7 +774,7 @@ func GenerateColumnRegexReplace(ps *env.ProgramState, s env.Spreadsheet, name en
row.Values = append(row.Values, newVal)
s.Rows[ix] = row
}
return s
return nil
}

func AddIndexes(ps *env.ProgramState, s *env.Spreadsheet, columns []env.Word) env.Object {
Expand Down Expand Up @@ -938,3 +1022,66 @@ func AutoType(ps *env.ProgramState, s *env.Spreadsheet, percent float64) env.Obj

return *newS
}

// LeftJoin joins s1 with s2 by matching the value in column col1 of each s1
// row against column col2 of s2, and returns a new spreadsheet.
//
// The result has all columns of s1 followed by all columns of s2; an s2 column
// whose name is already taken gets the suffix "_2". Each s1 row is paired with
// at most one s2 row: the first match found (when s2 has an index on col2 the
// first row id stored in the index is used, otherwise s2 is scanned in order).
//
// When innerJoin is false, s1 rows without a match are kept and their s2
// columns are filled with env.Void{}. When innerJoin is true such rows are
// dropped.
//
// An error object is returned when either column does not exist, when a join
// value cannot be retrieved, or when a scanned join value is not an env.Object
// and therefore cannot be compared.
func LeftJoin(ps *env.ProgramState, s1 env.Spreadsheet, s2 env.Spreadsheet, col1 string, col2 string, innerJoin bool) env.Object {
	// Report errors under the name of the builtin that was actually called.
	fnName := "left-join"
	if innerJoin {
		fnName = "inner-join"
	}
	if !slices.Contains(s1.Cols, col1) {
		return MakeBuiltinError(ps, "Column not found in first spreadsheet.", fnName)
	}
	if !slices.Contains(s2.Cols, col2) {
		return MakeBuiltinError(ps, "Column not found in second spreadsheet.", fnName)
	}

	// Build the combined header by appending, so the duplicate check only
	// sees names that were really assigned (no empty placeholders).
	width1 := len(s1.Cols)
	combinedCols := make([]string, width1, width1+len(s2.Cols))
	copy(combinedCols, s1.Cols)
	for _, name := range s2.Cols {
		if slices.Contains(combinedCols, name) {
			name += "_2"
		}
		combinedCols = append(combinedCols, name)
	}

	nspr := env.NewSpreadsheet(combinedCols)

	// Loop-invariant lookups: the optional index on col2 and the rows of s2.
	index, hasIndex := s2.Indexes[col2]
	rows2 := s2.GetRows()

	for _, row1 := range s1.GetRows() {
		val1, err := s1.GetRowValue(col1, row1)
		if err != nil {
			return MakeError(ps, "Couldn't retrieve value at row")
		}

		// the row id of the second spreadsheet that matches the current row
		s2RowId := -1
		if hasIndex {
			// if there are multiple rows with the same value (ie. joining on
			// non-unique column), just use the first one
			if rowIds, ok := index[val1]; ok && len(rowIds) > 0 {
				s2RowId = rowIds[0]
			}
		} else {
			for i, row2 := range rows2 {
				val2, err := s2.GetRowValue(col2, row2)
				if err != nil {
					return MakeError(ps, "Couldn't retrieve value at row")
				}
				// Checked assertions: values that are not Rye objects cannot
				// be compared with Equal and would otherwise panic.
				obj1, ok1 := val1.(env.Object)
				obj2, ok2 := val2.(env.Object)
				if !ok1 || !ok2 {
					return MakeBuiltinError(ps, "Join column values must be Rye values.", fnName)
				}
				if obj1.Equal(obj2) {
					s2RowId = i
					break
				}
			}
		}

		if innerJoin && s2RowId < 0 {
			continue
		}

		newRow := make([]any, len(combinedCols))
		// Bounded copies keep each side inside its own column range even if
		// a row carries more values than its spreadsheet has columns.
		copy(newRow[:width1], row1.Values)
		if s2RowId >= 0 {
			copy(newRow[width1:], s2.GetRow(ps, s2RowId).Values)
		} else {
			for i := width1; i < len(newRow); i++ {
				newRow[i] = env.Void{}
			}
		}
		nspr.AddRow(*env.NewSpreadsheetRow(newRow, nspr))
	}
	return *nspr
}
47 changes: 46 additions & 1 deletion tests/structures.rye
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ section "Serializers and loaders"


section "Spreadsheet related functions"
"Functions for handling and working with Context."
"Functions for creating and working with spreadsheets."
{

group "spreadsheet & related"
Expand Down Expand Up @@ -608,6 +608,51 @@ section "Spreadsheet related functions"
equal { to-spreadsheet vals { dict { "a" 1 b 2 } dict { "a" 3 "b" 4 } } } spreadsheet { "a" "b" } { 1 2 3 4 }
equal { to-spreadsheet vals { dict { "a" 1 b 2 "c" 3 } dict { "a" 4 "b" 5 } } } spreadsheet { "a" "b" "c" } { 1 2 3 4 5 _ }
}

group "index"
mold\nowrap ?add-indexes!
{ { block } }
{
; returned value: add-indexes! returns the spreadsheet, so piping it into indexes? lists the indexed column "a"
equal { spr: spreadsheet { "a" "b" } { 1 2 3 4 } |add-indexes! [ 'a ] |indexes? } { "a" }
; in-place: the same check against the original variable instead of the returned value
; TODO: this should work but doesn't; the index should be added in place, but for some reason it isn't
; equal { spr: spreadsheet { "a" "b" } { 1 2 3 4 } , spr .add-indexes! [ 'a ] , spr .indexes? } { "a" }
}

group "left join"
mold\nowrap ?left-join
{ { block } }
{
; joining without an index: every row of names is kept; "Chani" (id 2) has no match in houses,
; so its joined columns are empty ( _ ), and the clashing second "id" column is named "id_2"
equal { names: spreadsheet { "id" "name" } { 1 "Paul" 2 "Chani" 3 "Vladimir" } ,
houses: spreadsheet { "id" "house" } { 1 "Atreides" 3 "Harkonnen" 4 "Corrino" } ,
names .left-join houses 'id 'id
} spreadsheet { "id" "name" "id_2" "house" } { 1 "Paul" 1 "Atreides" 2 "Chani" _ _ 3 "Vladimir" 3 "Harkonnen" }

; joining with an index on the second spreadsheet
; (the result of add-indexes! is assigned back to houses before joining)
equal { names: spreadsheet { "id" "name" } { 1 "Paul" 2 "Chani" 3 "Vladimir" } ,
houses: spreadsheet { "id" "house" } { 1 "Atreides" 3 "Harkonnen" } ,
houses .add-indexes! [ 'id ] :houses ,
names .left-join houses 'id 'id
} spreadsheet { "id" "name" "id_2" "house" } { 1 "Paul" 1 "Atreides" 2 "Chani" _ _ 3 "Vladimir" 3 "Harkonnen" }
}

group "inner join"
mold\nowrap ?inner-join
{ { block } }
{
; joining without an index: only rows of names with a match in houses are kept,
; so "Chani" (id 2) is absent from the result
equal { names: spreadsheet { "id" "name" } { 1 "Paul" 2 "Chani" 3 "Vladimir" } ,
houses: spreadsheet { "id" "house" } { 1 "Atreides" 3 "Harkonnen" 4 "Corrino" } ,
names .inner-join houses 'id 'id
} spreadsheet { "id" "name" "id_2" "house" } { 1 "Paul" 1 "Atreides" 3 "Vladimir" 3 "Harkonnen" }

; joining with an index on the second spreadsheet
; (the result of add-indexes! is assigned back to houses before joining)
equal { names: spreadsheet { "id" "name" } { 1 "Paul" 2 "Chani" 3 "Vladimir" } ,
houses: spreadsheet { "id" "house" } { 1 "Atreides" 3 "Harkonnen" } ,
houses .add-indexes! [ 'id ] :houses ,
names .inner-join houses 'id 'id
} spreadsheet { "id" "name" "id_2" "house" } { 1 "Paul" 1 "Atreides" 3 "Vladimir" 3 "Harkonnen" }
}
}


Expand Down