diff --git a/api/api.go b/api/api.go index e2ac706c4..d1198cb44 100644 --- a/api/api.go +++ b/api/api.go @@ -167,6 +167,7 @@ func NewServerRoutes(s Server) *http.ServeMux { m.Handle("/get/", s.middleware(dsh.GetHandler)) m.Handle("/rename", s.middleware(dsh.RenameHandler)) m.Handle("/diff", s.middleware(dsh.DiffHandler)) + m.Handle("/changes", s.middleware(dsh.ChangesHandler)) // Deprecated, use /get/username/name?component=body or /get/username/name/body.csv m.Handle("/body/", s.middleware(dsh.BodyHandler)) m.Handle("/stats/", s.middleware(dsh.StatsHandler)) diff --git a/api/datasets.go b/api/datasets.go index d5eb71d84..730a38740 100644 --- a/api/datasets.go +++ b/api/datasets.go @@ -97,6 +97,20 @@ func (h *DatasetHandlers) DiffHandler(w http.ResponseWriter, r *http.Request) { } } +// ChangesHandler is a dataset single endpoint +func (h *DatasetHandlers) ChangesHandler(w http.ResponseWriter, r *http.Request) { + switch r.Method { + case http.MethodPost, http.MethodGet: + if h.ReadOnly { + readOnlyResponse(w, "/changereport") + return + } + h.changesHandler(w, r) + default: + util.NotFoundHandler(w, r) + } +} + // PeerListHandler is a dataset list endpoint func (h *DatasetHandlers) PeerListHandler(w http.ResponseWriter, r *http.Request) { switch r.Method { @@ -340,6 +354,30 @@ func (h *DatasetHandlers) diffHandler(w http.ResponseWriter, r *http.Request) { util.WritePageResponse(w, res, r, util.Page{}) } +func (h *DatasetHandlers) changesHandler(w http.ResponseWriter, r *http.Request) { + req := &lib.ChangeReportParams{} + switch r.Header.Get("Content-Type") { + case "application/json": + if err := json.NewDecoder(r.Body).Decode(req); err != nil { + util.WriteErrResponse(w, http.StatusBadRequest, fmt.Errorf("error decoding body into params: %s", err.Error())) + return + } + default: + req = &lib.ChangeReportParams{ + LeftRefstr: r.FormValue("left_path"), + RightRefstr: r.FormValue("right_path"), + } + } + + res := &lib.ChangeReport{} + if err := h.ChangeReport(req, res); err != nil { + util.WriteErrResponse(w, http.StatusInternalServerError, fmt.Errorf("error generating change report: %s", err.Error())) + return + } + + util.WritePageResponse(w, res, r, util.Page{}) +} + func (h *DatasetHandlers) peerListHandler(w http.ResponseWriter, r *http.Request) { log.Info(r.URL.Path) p := lib.ListParamsFromRequest(r) diff --git a/changes/changes.go b/changes/changes.go new file mode 100644 index 000000000..27c4a3441 --- /dev/null +++ b/changes/changes.go @@ -0,0 +1,568 @@ +package changes + +import ( + "context" + "encoding/json" + "errors" + "fmt" + + golog "github.com/ipfs/go-log" + "github.com/qri-io/dataset" + "github.com/qri-io/dataset/tabular" + "github.com/qri-io/qri/dsref" + qerr "github.com/qri-io/qri/errors" + "github.com/qri-io/qri/fsi" + "github.com/qri-io/qri/stats" +) + +var ( + log = golog.Logger("changes") +) + +// ChangeReportComponent is a generic component used to populate the change report +type ChangeReportComponent struct { + Left interface{} `json:"left"` + Right interface{} `json:"right"` + About map[string]interface{} `json:"about,omitempty"` +} + +// ChangeReportDeltaComponent is a subcomponent that can hold +// delta information between left and right +type ChangeReportDeltaComponent struct { + ChangeReportComponent + Title string `json:"title,omitempty"` + Delta interface{} `json:"delta"` +} + +// StatsChangeComponent represents the stats change report +type StatsChangeComponent struct { + Summary *ChangeReportDeltaComponent `json:"summary"` + Columns []*ChangeReportDeltaComponent `json:"columns"` +} + +// ChangeReportResponse is the result of a call to changereport +type ChangeReportResponse struct { + VersionInfo *ChangeReportComponent `json:"version_info,omitempty"` + Commit *ChangeReportComponent `json:"commit,omitempty"` + Meta *ChangeReportComponent `json:"meta,omitempty"` + Readme *ChangeReportComponent `json:"readme,omitempty"` + Structure *ChangeReportComponent `json:"structure,omitempty"` + Transform *ChangeReportComponent `json:"transform,omitempty"` + Stats *StatsChangeComponent `json:"stats,omitempty"` +} + +// StatsChangeSummaryFields represents the stats summary +type StatsChangeSummaryFields struct { + Entries int `json:"entries"` + Columns int `json:"columns"` + // NullValues int `json:"nullValues"` + TotalSize int `json:"totalSize"` +} + +// EmptyObject is used mostly as a placeholder in palces where it is required +// that a key is present in the response even if empty and not be nil +type EmptyObject map[string]interface{} + +// Service can generate a change report between two datasets +type Service struct { + loader dsref.Loader + stats *stats.Service +} + +// New allocates a Change service +func New(loader dsref.Loader, stats *stats.Service) *Service { + return &Service{ + loader: loader, + stats: stats, + } +} + +func (svc *Service) parseColumns(colItems *tabular.Columns, data *dataset.Dataset) (interface{}, error) { + var sErr error + if data.Structure != nil { + *colItems, _, sErr = tabular.ColumnsFromJSONSchema(data.Structure.Schema) + if sErr != nil { + return nil, sErr + } + return StatsChangeSummaryFields{ + Entries: data.Structure.Entries, + Columns: len(*colItems), + TotalSize: data.Structure.Length, + }, nil + } + return EmptyObject{}, nil +} + +// maybeLoadStats attempts to load stats if not alredy present +// errors out if it fails as stats are required and some datasets might not yet have +// a stats component attached to it +func (svc *Service) maybeLoadStats(ctx context.Context, ds *dataset.Dataset) error { + if ds.Stats != nil { + return nil + } + var statsErr error + ds.Stats, statsErr = svc.stats.Stats(ctx, ds) + if statsErr != nil { + return qerr.New(statsErr, "missing stats components") + } + return nil +} + +// parseStats uses json serializing > deserializing to easily parse the stats +// interface as we have little type safety in the dataset.stats component right now +func (svc *Service) parseStats(ds *dataset.Dataset) ([]EmptyObject, error) { + statsStr, err := json.Marshal(ds.Stats.Stats) + if err != nil { + log.Debugf("failed to load stats: %s", err.Error()) + return nil, qerr.New(err, "failed to load stats") + } + stats := []EmptyObject{} + err = json.Unmarshal(statsStr, &stats) + if err != nil { + log.Debugf("failed to parse stats: %s", err.Error()) + return nil, qerr.New(err, "failed to parse stats") + } + + return stats, nil +} + +func (svc *Service) statsDiff(ctx context.Context, leftDs *dataset.Dataset, rightDs *dataset.Dataset) (*StatsChangeComponent, error) { + res := &StatsChangeComponent{} + + res.Summary = &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{}, + } + + var leftColItems, rightColItems tabular.Columns + var sErr error + res.Summary.Left, sErr = svc.parseColumns(&leftColItems, leftDs) + if sErr != nil { + return &StatsChangeComponent{}, sErr + } + leftColCount := len(leftColItems) + + res.Summary.Right, sErr = svc.parseColumns(&rightColItems, rightDs) + if sErr != nil { + return &StatsChangeComponent{}, sErr + } + rightColCount := len(rightColItems) + + if leftDs.Structure != nil && rightDs.Structure != nil { + res.Summary.Delta = StatsChangeSummaryFields{ + Entries: res.Summary.Right.(StatsChangeSummaryFields).Entries - res.Summary.Left.(StatsChangeSummaryFields).Entries, + Columns: rightColCount - leftColCount, + TotalSize: res.Summary.Right.(StatsChangeSummaryFields).TotalSize - res.Summary.Left.(StatsChangeSummaryFields).TotalSize, + } + } else if leftDs.Structure != nil { + res.Summary.Delta = StatsChangeSummaryFields{ + Entries: -res.Summary.Left.(StatsChangeSummaryFields).Entries, + Columns: rightColCount - leftColCount, + TotalSize: -res.Summary.Left.(StatsChangeSummaryFields).TotalSize, + } + } else if rightDs.Structure != nil { + res.Summary.Delta = StatsChangeSummaryFields{ + Entries: res.Summary.Right.(StatsChangeSummaryFields).Entries, + Columns: rightColCount - leftColCount, + TotalSize: res.Summary.Right.(StatsChangeSummaryFields).TotalSize, + } + } else { + res.Summary.Delta = StatsChangeSummaryFields{ + Entries: 0, + Columns: 0, + TotalSize: 0, + } + } + + err := svc.maybeLoadStats(ctx, leftDs) + if err != nil { + return nil, err + } + err = svc.maybeLoadStats(ctx, rightDs) + if err != nil { + return nil, err + } + + leftStats, err := svc.parseStats(leftDs) + if err != nil { + return nil, err + } + rightStats, err := svc.parseStats(rightDs) + if err != nil { + return nil, err + } + + res.Columns, err = svc.matchColumns(leftColCount, rightColCount, leftColItems, rightColItems, leftStats, rightStats) + if err != nil { + log.Debugf("failed to calculate stats change report: %s", err.Error()) + return nil, qerr.New(err, "failed to calculate stats change report") + } + + return res, nil +} + +// matchColumns attempts to match up columns from the left and right side based on the column name +// this is not ideal as datasets without a header have generic column names and in case of adding a column +// before the end might shift the alignment and break comparison due to type differences of columns which +// are not properly handled yet +func (svc *Service) matchColumns(leftColCount, rightColCount int, leftColItems, rightColItems tabular.Columns, leftStats, rightStats []EmptyObject) ([]*ChangeReportDeltaComponent, error) { + maxColCount := leftColCount + if rightColCount > maxColCount { + maxColCount = rightColCount + } + + columns := make([]*ChangeReportDeltaComponent, maxColCount) + + type cIndex struct { + LPos int + RPos int + } + + colIndex := map[string]*cIndex{} + for i := 0; i < maxColCount; i++ { + if i < leftColCount { + if c, ok := colIndex[leftColItems[i].Title]; ok && c != nil { + colIndex[leftColItems[i].Title].LPos = i + } else { + colIndex[leftColItems[i].Title] = &cIndex{ + LPos: i, + RPos: -1, + } + } + } + if i < rightColCount { + if c, ok := colIndex[rightColItems[i].Title]; ok && c != nil { + colIndex[rightColItems[i].Title].RPos = i + } else { + colIndex[rightColItems[i].Title] = &cIndex{ + LPos: -1, + RPos: i, + } + } + } + } + + i := 0 + for k, v := range colIndex { + columns[i] = &ChangeReportDeltaComponent{ + Title: k, + } + var lCol, rCol *tabular.Column + if v.LPos >= 0 { + columns[i].Left = leftStats[v.LPos] + lCol = &leftColItems[v.LPos] + } else { + columns[i].Left = EmptyObject{} + } + if v.RPos >= 0 { + columns[i].Right = rightStats[v.RPos] + rCol = &rightColItems[v.RPos] + } else { + columns[i].Right = EmptyObject{} + } + deltaCol, aboutCol, err := svc.columnStatsDelta(columns[i].Left, columns[i].Right, lCol, rCol, v.LPos >= 0, v.RPos >= 0) + if err != nil { + log.Debugf("error calculating stats delta: %s", err.Error()) + return nil, qerr.New(err, fmt.Sprintf("failed to calculate stats column delta for %q", columns[i].Title)) + } + columns[i].Delta = deltaCol + columns[i].About = aboutCol + i++ + } + + return columns, nil +} + +func parseStatsMap(stats interface{}) (map[string]interface{}, error) { + statsMap := map[string]interface{}{} + serialized, err := json.Marshal(stats) + if err != nil { + log.Debugf("error serializing stats") + return nil, err + } + err = json.Unmarshal(serialized, &statsMap) + if err != nil { + log.Debugf("error deserializing stats") + return nil, err + } + return statsMap, nil +} + +func (svc *Service) columnStatsDelta(left, right interface{}, lCol, rCol *tabular.Column, hasLeft, hasRight bool) (map[string]interface{}, map[string]interface{}, error) { + var deltaCol map[string]interface{} + aboutCol := map[string]interface{}{} + var leftStatsMap, rightStatsMap map[string]interface{} + var err error + if hasLeft { + leftStatsMap, err = parseStatsMap(left) + if err != nil { + log.Debugf("error parsing stats map") + return nil, nil, err + } + } + if hasRight { + rightStatsMap, err = parseStatsMap(right) + if err != nil { + log.Debugf("error parsing stats map") + return nil, nil, err + } + } + + //determine shape + if (!hasRight || (hasRight && (rCol.Type.HasType("number") || rCol.Type.HasType("integer")))) && + (!hasLeft || (hasLeft && (lCol.Type.HasType("number") || lCol.Type.HasType("integer")))) { + deltaCol = map[string]interface{}{ + "count": float64(0), + "max": float64(0), + "min": float64(0), + "median": float64(0), + "mean": float64(0), + } + } else if (!hasRight || (hasRight && rCol.Type.HasType("string"))) && (!hasLeft || (hasLeft && lCol.Type.HasType("string"))) { + deltaCol = map[string]interface{}{ + "count": float64(0), + "maxLength": float64(0), + "minLength": float64(0), + "unique": float64(0), + } + } else if (!hasRight || (hasRight && rCol.Type.HasType("boolean"))) && (!hasLeft || (hasLeft && lCol.Type.HasType("boolean"))) { + deltaCol = map[string]interface{}{ + "count": float64(0), + "trueCount": float64(0), + "falseCount": float64(0), + } + } else { + log.Debugf("incompatible column types: %+v / %+v", rCol.Type, lCol.Type) + // TODO(arqu): improve handling of columns with different types + return nil, nil, errors.New("incompatible column types") + } + + // assign values + for k := range deltaCol { + if hasLeft { + if leftStatsMap[k] == nil { + log.Debugf("%s was nil", k) + } else { + deltaCol[k] = deltaCol[k].(float64) - leftStatsMap[k].(float64) + } + } + if hasRight { + if rightStatsMap[k] == nil { + log.Debugf("%s was nil", k) + } else { + deltaCol[k] = deltaCol[k].(float64) + rightStatsMap[k].(float64) + } + } + } + + if hasLeft && !hasRight { + aboutCol["status"] = fsi.STRemoved + } else if !hasLeft && hasRight { + aboutCol["status"] = fsi.STAdd + } else if hasLeft && hasRight { + sum := float64(0) + for k := range deltaCol { + sum += deltaCol[k].(float64) + } + if sum == 0 { + aboutCol["status"] = fsi.STUnmodified + } else { + aboutCol["status"] = fsi.STChange + } + } else { + aboutCol["status"] = fsi.STMissing + } + + return deltaCol, aboutCol, nil +} + +// Report computes the change report of two sources +// This takes some assumptions - we work only with tabular data, with header rows and functional structure.json +func (svc *Service) Report(ctx context.Context, leftRef, rightRef dsref.Ref, loadSource string) (*ChangeReportResponse, error) { + leftDs, err := svc.loader.LoadDataset(ctx, leftRef, loadSource) + if err != nil { + return nil, err + } + if rightRef.Path == "" { + rightRef.Path = leftDs.PreviousPath + } + rightDs, err := svc.loader.LoadDataset(ctx, rightRef, loadSource) + if err != nil { + return nil, err + } + + res := &ChangeReportResponse{} + + leftVi := dsref.ConvertDatasetToVersionInfo(leftDs) + rightVi := dsref.ConvertDatasetToVersionInfo(rightDs) + + res.VersionInfo = &ChangeReportComponent{} + res.VersionInfo.Left = leftVi + res.VersionInfo.Right = rightVi + res.VersionInfo.About = EmptyObject{} + + if leftVi.Path == rightVi.Path { + res.VersionInfo.About["status"] = fsi.STUnmodified + } else { + res.VersionInfo.About["status"] = fsi.STChange + } + + if leftDs.Commit != nil || rightDs.Commit != nil { + res.Commit = &ChangeReportComponent{} + if leftDs.Commit != nil { + res.Commit.Left = leftDs.Commit + } else { + res.Commit.Left = EmptyObject{} + } + if rightDs.Commit != nil { + res.Commit.Right = rightDs.Commit + } else { + res.Commit.Right = EmptyObject{} + } + res.Commit.About = EmptyObject{} + + if leftDs.Commit != nil && rightDs.Commit == nil { + res.Commit.About["status"] = fsi.STRemoved + } else if leftDs.Commit == nil && rightDs.Commit != nil { + res.Commit.About["status"] = fsi.STAdd + } else if leftDs.Commit != nil && rightDs.Commit != nil { + if leftDs.Commit.Path == rightDs.Commit.Path { + res.Commit.About["status"] = fsi.STUnmodified + } else { + res.Commit.About["status"] = fsi.STChange + } + } else { + res.Commit.About["status"] = fsi.STMissing + } + } + + if leftDs.Meta != nil || rightDs.Meta != nil { + res.Meta = &ChangeReportComponent{} + hasLeftMeta := leftDs.Meta != nil && !leftDs.Meta.IsEmpty() + hasRightMeta := rightDs.Meta != nil && !rightDs.Meta.IsEmpty() + + if hasLeftMeta { + res.Meta.Left = leftDs.Meta + } else { + res.Meta.Left = EmptyObject{} + } + if hasRightMeta { + res.Meta.Right = rightDs.Meta + } else { + res.Meta.Right = EmptyObject{} + } + res.Meta.About = EmptyObject{} + + if hasLeftMeta && !hasRightMeta { + res.Meta.About["status"] = fsi.STRemoved + } else if !hasLeftMeta && hasRightMeta { + res.Meta.About["status"] = fsi.STAdd + } else if hasLeftMeta && hasRightMeta { + if leftDs.Meta.Path == rightDs.Meta.Path { + res.Meta.About["status"] = fsi.STUnmodified + } else { + res.Meta.About["status"] = fsi.STChange + } + } else { + res.Meta.About["status"] = fsi.STMissing + } + } + + if leftDs.Readme != nil || rightDs.Readme != nil { + res.Readme = &ChangeReportComponent{} + if leftDs.Readme != nil { + res.Readme.Left = string(leftDs.Readme.ScriptBytes) + } else { + res.Readme.Left = "" + } + if rightDs.Readme != nil { + res.Readme.Right = string(rightDs.Readme.ScriptBytes) + } else { + res.Readme.Right = "" + } + res.Readme.About = EmptyObject{} + + if res.Readme.Left != "" && res.Readme.Right == "" { + res.Readme.About["status"] = fsi.STRemoved + } else if res.Readme.Left == "" && res.Readme.Right != "" { + res.Readme.About["status"] = fsi.STAdd + } else if res.Readme.Left != "" && res.Readme.Right != "" { + if res.Readme.Left == res.Readme.Right { + res.Readme.About["status"] = fsi.STUnmodified + } else { + res.Readme.About["status"] = fsi.STChange + } + } else { + res.Readme.About["status"] = fsi.STMissing + } + } + + if leftDs.Structure != nil || rightDs.Structure != nil { + res.Structure = &ChangeReportComponent{} + if leftDs.Structure != nil { + if leftDs.Structure.Format != "csv" { + return nil, errors.New("changes are only supported for CSV datasets") + } + res.Structure.Left = leftDs.Structure + } else { + res.Structure.Left = EmptyObject{} + } + if rightDs.Structure != nil { + if rightDs.Structure.Format != "csv" { + return nil, errors.New("changes are only supported for CSV datasets") + } + res.Structure.Right = rightDs.Structure + } else { + res.Structure.Right = EmptyObject{} + } + res.Structure.About = EmptyObject{} + + if leftDs.Structure != nil && rightDs.Structure == nil { + res.Structure.About["status"] = fsi.STRemoved + } else if leftDs.Structure == nil && rightDs.Structure != nil { + res.Structure.About["status"] = fsi.STAdd + } else if leftDs.Structure != nil && rightDs.Structure != nil { + if leftDs.Structure.Path == rightDs.Structure.Path { + res.Structure.About["status"] = fsi.STUnmodified + } else { + res.Structure.About["status"] = fsi.STChange + } + } else { + res.Structure.About["status"] = fsi.STMissing + } + } + + if leftDs.Transform != nil || rightDs.Transform != nil { + res.Transform = &ChangeReportComponent{} + if leftDs.Transform != nil { + res.Transform.Left = string(leftDs.Transform.ScriptBytes) + } else { + res.Transform.Left = "" + } + if rightDs.Transform != nil { + res.Transform.Right = string(rightDs.Transform.ScriptBytes) + } else { + res.Transform.Right = "" + } + res.Transform.About = EmptyObject{} + + if res.Transform.Left != "" && res.Transform.Right == "" { + res.Transform.About["status"] = fsi.STRemoved + } else if res.Transform.Left == "" && res.Transform.Right != "" { + res.Transform.About["status"] = fsi.STAdd + } else if res.Transform.Left != "" && res.Transform.Right != "" { + if res.Transform.Left == res.Transform.Right { + res.Transform.About["status"] = fsi.STUnmodified + } else { + res.Transform.About["status"] = fsi.STChange + } + } else { + res.Transform.About["status"] = fsi.STMissing + } + } + + res.Stats, err = svc.statsDiff(ctx, leftDs, rightDs) + if err != nil { + return nil, err + } + return res, nil +} diff --git a/changes/changes_test.go b/changes/changes_test.go new file mode 100644 index 000000000..c43407256 --- /dev/null +++ b/changes/changes_test.go @@ -0,0 +1,672 @@ +package changes + +import ( + "context" + "io/ioutil" + "os" + "sort" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/qri-io/dataset" + "github.com/qri-io/dataset/tabular" + "github.com/qri-io/qfs" + "github.com/qri-io/qri/base" + "github.com/qri-io/qri/base/dsfs" + "github.com/qri-io/qri/dsref" + "github.com/qri-io/qri/fsi" + "github.com/qri-io/qri/repo" + repotest "github.com/qri-io/qri/repo/test" + "github.com/qri-io/qri/stats" +) + +func newTestService(t *testing.T, r repo.Repo, workDir string) *Service { + cache, err := stats.NewLocalCache(workDir, 1000<<8) + if err != nil { + t.Fatal(err) + } + statsSvc := stats.New(cache) + loader := base.NewLocalDatasetLoader(r.Filesystem()) + + return New(loader, statsSvc) +} + +func updateDataset(t *testing.T, r repo.Repo, ds *dataset.Dataset, newBody string) dsref.Ref { + ctx := context.Background() + currRef := dsref.ConvertDatasetToVersionInfo(ds).SimpleRef() + + ds.SetBodyFile(qfs.NewMemfileBytes("body.csv", []byte(newBody))) + ds.PreviousPath = currRef.Path + + // force recalculate structure as that is what we rely on for the change reports + ds.Structure = nil + if err := base.InferStructure(ds); err != nil { + t.Fatal(err.Error()) + } + + res, err := base.CreateDataset(ctx, r, r.Filesystem().DefaultWriteFS(), ds, nil, dsfs.SaveSwitches{Pin: true, ShouldRender: true}) + if err != nil { + t.Fatal(err.Error()) + } + return dsref.ConvertDatasetToVersionInfo(res).SimpleRef() +} + +func getBaseCols() []*ChangeReportDeltaComponent { + return []*ChangeReportDeltaComponent{ + &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: EmptyObject{ + "count": float64(5), + "max": float64(65.25), + "min": float64(44.4), + "mean": float64(52.04), + "median": float64(50.65), + "histogram": map[string]interface{}{ + "bins": []interface{}{ + float64(44.4), + float64(50.65), + float64(55.5), + float64(65.25), + float64(66.25), + }, + "frequencies": []interface{}{ + float64(2), + float64(1), + float64(1), + float64(1), + }, + }, + "type": "numeric", + }, + Right: EmptyObject{ + "count": float64(5), + "max": float64(5000.65), + "min": float64(44), + "mean": float64(1238.06), + "median": float64(440.4), + "histogram": map[string]interface{}{ + "bins": []interface{}{ + float64(44), + float64(55), + float64(440.4), + float64(650.25), + float64(5000.65), + float64(5001.65), + }, + "frequencies": []interface{}{ + float64(1), + float64(1), + float64(1), + float64(1), + float64(1), + }, + }, + "type": "numeric", + }, + About: map[string]interface{}{ + "status": fsi.STChange, + }, + }, + Title: "avg_age", + Delta: map[string]interface{}{ + "count": float64(0), + "max": float64(4935.4), + "mean": float64(1186.02), + "median": float64(389.75), + "min": float64(-0.3999999999999986), + }, + }, + &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: EmptyObject{ + "count": float64(5), + "maxLength": float64(8), + "minLength": float64(7), + "unique": float64(5), + "frequencies": map[string]interface{}{ + "chatham": float64(1), + "chicago": float64(1), + "new york": float64(1), + "raleigh": float64(1), + "toronto": float64(1), + }, + "type": "string", + }, + Right: EmptyObject{ + "count": float64(5), + "maxLength": float64(8), + "minLength": float64(7), + "unique": float64(5), + "frequencies": map[string]interface{}{ + "chatham": float64(1), + "chicago": float64(1), + "new york": float64(1), + "raleigh": float64(1), + "toronto": float64(1), + }, + "type": "string", + }, + About: map[string]interface{}{ + "status": fsi.STUnmodified, + }, + }, + Title: "city", + Delta: map[string]interface{}{ + "count": float64(0), + "maxLength": float64(0), + "minLength": float64(0), + "unique": float64(0), + }, + }, + &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: EmptyObject{ + "count": float64(5), + "falseCount": float64(1), + "trueCount": float64(4), + "type": "boolean", + }, + Right: EmptyObject{ + "count": float64(5), + "falseCount": float64(5), + "trueCount": float64(0), + "type": "boolean", + }, + About: map[string]interface{}{ + "status": fsi.STUnmodified, + }, + }, + Title: "in_usa", + Delta: map[string]interface{}{ + "count": float64(0), + "falseCount": float64(4), + "trueCount": float64(-4), + }, + }, + &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: EmptyObject{ + "count": float64(5), + "max": float64(40000000), + "min": float64(35000), + "mean": float64(9817000), + "median": float64(300000), + "histogram": map[string]interface{}{ + "bins": []interface{}{ + float64(35000), + float64(250000), + float64(300000), + float64(8500000), + float64(40000000), + float64(40000001), + }, + "frequencies": []interface{}{ + float64(1), + float64(1), + float64(1), + float64(1), + float64(1), + }, + }, + "type": "numeric", + }, + Right: EmptyObject{ + "count": float64(5), + "max": float64(4000000), + "min": float64(3500), + "mean": float64(981700), + "median": float64(30000), + "histogram": map[string]interface{}{ + "bins": []interface{}{ + float64(3500), + float64(25000), + float64(30000), + float64(850000), + float64(4000000), + float64(4000001), + }, + "frequencies": []interface{}{ + float64(1), + float64(1), + float64(1), + float64(1), + float64(1), + }, + }, + "type": "numeric", + }, + About: map[string]interface{}{ + "status": fsi.STChange, + }, + }, + Title: "pop", + Delta: map[string]interface{}{ + "count": float64(0), + "max": float64(-36000000), + "mean": float64(-8835300), + "median": float64(-270000), + "min": float64(-31500), + }, + }, + } +} + +func TestStatsDiff(t *testing.T) { + ctx := context.Background() + + workDir, err := ioutil.TempDir("", "qri_test_changes_service") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(workDir) + + mr, err := repotest.NewTestRepo() + if err != nil { + t.Fatal(err) + } + + svc := newTestService(t, mr, workDir) + + ref := dsref.MustParse("peer/cities") + if _, err := mr.ResolveRef(ctx, &ref); err != nil { + t.Fatal(err) + } + + ds, err := dsfs.LoadDataset(ctx, mr.Filesystem(), ref.Path) + if err != nil { + t.Fatal(err) + } + if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { + t.Fatal(err) + } + + ds.Name = "cities" + leftDs := *ds + + // alter body file + const alteredBodyData = `city,pop,avg_age,in_usa +toronto,4000000,55.0,false +new york,850000,44.0,false +chicago,30000,440.4,false +chatham,3500,650.25,false +raleigh,25000,5000.65,false` + + updateDataset(t, mr, ds, alteredBodyData) + + res, err := svc.statsDiff(ctx, &leftDs, ds) + if err != nil { + t.Fatal(err) + } + + // output order is not strict so we need to enfore it here + sort.SliceStable(res.Columns, func(i, j int) bool { + return res.Columns[i].Title < res.Columns[j].Title + }) + + expect := &StatsChangeComponent{ + Summary: &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: StatsChangeSummaryFields{Entries: 5, Columns: 4, TotalSize: 155}, + Right: StatsChangeSummaryFields{Entries: 5, Columns: 4, TotalSize: 157}, + }, + Delta: StatsChangeSummaryFields{ + Entries: 0, + Columns: 0, + TotalSize: 2, + }, + }, + Columns: getBaseCols(), + } + + if diff := cmp.Diff(res, expect); diff != "" { + t.Errorf("stat component result mismatch. (-want +got):%s\n", diff) + } +} + +func TestParseColumns(t *testing.T) { + ctx := context.Background() + + workDir, err := ioutil.TempDir("", "qri_test_changes_service") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(workDir) + + mr, err := repotest.NewTestRepo() + if err != nil { + t.Fatal(err) + } + + svc := newTestService(t, mr, workDir) + + ref := dsref.MustParse("peer/cities") + if _, err := mr.ResolveRef(ctx, &ref); err != nil { + t.Fatal(err) + } + + ds, err := dsfs.LoadDataset(ctx, mr.Filesystem(), ref.Path) + if err != nil { + t.Fatal(err) + } + if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { + t.Fatal(err) + } + + var colItems tabular.Columns + summary, err := svc.parseColumns(&colItems, ds) + if err != nil { + t.Fatal(err) + } + + expectColItems := tabular.Columns{ + tabular.Column{ + Title: "city", + Type: &tabular.ColType{"string"}, + }, + tabular.Column{ + Title: "pop", + Type: &tabular.ColType{"integer"}, + }, + tabular.Column{ + Title: "avg_age", + Type: &tabular.ColType{"number"}, + }, + tabular.Column{ + Title: "in_usa", + Type: &tabular.ColType{"boolean"}, + }, + } + + if diff := cmp.Diff(colItems, expectColItems); diff != "" { + t.Errorf("column items result mismatch. (-want +got):%s\n", diff) + } + + expectSummary := StatsChangeSummaryFields{ + Entries: 5, + Columns: 4, + TotalSize: 155, + } + + if diff := cmp.Diff(summary, expectSummary); diff != "" { + t.Errorf("stats summary result mismatch. (-want +got):%s\n", diff) + } +} + +func TestMaybeLoadStats(t *testing.T) { + ctx := context.Background() + + workDir, err := ioutil.TempDir("", "qri_test_changes_service") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(workDir) + + mr, err := repotest.NewTestRepo() + if err != nil { + t.Fatal(err) + } + + svc := newTestService(t, mr, workDir) + + ref := dsref.MustParse("peer/cities") + if _, err := mr.ResolveRef(ctx, &ref); err != nil { + t.Fatal(err) + } + + ds, err := dsfs.LoadDataset(ctx, mr.Filesystem(), ref.Path) + if err != nil { + t.Fatal(err) + } + if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { + t.Fatal(err) + } + + if ds.Stats == nil { + t.Fatal("stats are nil") + } + + ds.Stats = nil + + svc.maybeLoadStats(ctx, ds) + if ds.Stats == nil { + t.Fatal("stats are nil") + } +} + +func TestMatchColumns(t *testing.T) { + ctx := context.Background() + + workDir, err := ioutil.TempDir("", "qri_test_changes_service") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(workDir) + + mr, err := repotest.NewTestRepo() + if err != nil { + t.Fatal(err) + } + + svc := newTestService(t, mr, workDir) + + ref := dsref.MustParse("peer/cities") + if _, err := mr.ResolveRef(ctx, &ref); err != nil { + t.Fatal(err) + } + + ds, err := dsfs.LoadDataset(ctx, mr.Filesystem(), ref.Path) + if err != nil { + t.Fatal(err) + } + if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { + t.Fatal(err) + } + + ds.Name = "cities" + leftDs := *ds + + // alter body file + const alteredBodyData = `city,pop,avg_age,in_usa +toronto,4000000,55.0,false +new york,850000,44.0,false +chicago,30000,440.4,false +chatham,3500,650.25,false +raleigh,25000,5000.65,false` + + updateDataset(t, mr, ds, alteredBodyData) + + var leftColItems tabular.Columns + _, err = svc.parseColumns(&leftColItems, &leftDs) + if err != nil { + t.Fatal(err) + } + leftStats, err := svc.parseStats(&leftDs) + if err != nil { + t.Fatal(err) + } + + var rightColItems tabular.Columns + _, err = svc.parseColumns(&rightColItems, ds) + if err != nil { + t.Fatal(err) + } + rightStats, err := svc.parseStats(ds) + if err != nil { + t.Fatal(err) + } + + report, err := svc.matchColumns(4, 4, leftColItems, rightColItems, leftStats, rightStats) + if err != nil { + t.Fatal(err) + } + + // output order is not strict so we need to enfore it here + sort.SliceStable(report, func(i, j int) bool { + return report[i].Title < report[j].Title + }) + + expect := getBaseCols() + + if diff := cmp.Diff(report, expect); diff != "" { + t.Errorf("column items result mismatch. (-want +got):%s\n", diff) + } + + // alter body file - remove column + const alteredBodyDataColumns1 = `city,avg_age,in_usa +toronto,55.0,false +new york,44.0,false +chicago,440.4,false +chatham,650.25,false +raleigh,5000.65,false` + + ds.Name = "cities" + + updateDataset(t, mr, ds, alteredBodyDataColumns1) + if err = base.OpenDataset(ctx, mr.Filesystem(), ds); err != nil { + t.Fatal(err) + } + + _, err = svc.parseColumns(&rightColItems, ds) + if err != nil { + t.Fatal(err) + } + rightStats, err = svc.parseStats(ds) + if err != nil { + t.Fatal(err) + } + + report, err = svc.matchColumns(4, 3, leftColItems, rightColItems, leftStats, rightStats) + if err != nil { + t.Fatal(err) + } + + // output order is not strict so we need to enfore it here + sort.SliceStable(report, func(i, j int) bool { + return report[i].Title < report[j].Title + }) + + expect = getBaseCols() + expect[3] = &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: EmptyObject{ + "count": float64(5), + "max": float64(40000000), + "min": float64(35000), + "mean": float64(9817000), + "median": float64(300000), + "histogram": map[string]interface{}{ + "bins": []interface{}{ + float64(35000), + float64(250000), + float64(300000), + float64(8500000), + float64(40000000), + float64(40000001), + }, + "frequencies": []interface{}{ + float64(1), + float64(1), + float64(1), + float64(1), + float64(1), + }, + }, + "type": "numeric", + }, + Right: EmptyObject{}, + About: map[string]interface{}{ + "status": fsi.STRemoved, + }, + }, + Title: "pop", + Delta: map[string]interface{}{ + "count": float64(-5), + "max": float64(-40000000), + "mean": float64(-9817000), + "median": float64(-300000), + "min": float64(-35000), + }, + } + + if diff := cmp.Diff(report, expect); diff != "" { + t.Errorf("column items result mismatch. (-want +got):%s\n", diff) + } + + // alter body file - add column + const alteredBodyDataColumns2 = `city,pop,avg_age,in_usa,score +toronto,4000000,55.0,false,1 +new york,850000,44.0,false,2 +chicago,30000,440.4,false,3 +chatham,3500,650.25,false,4 +raleigh,25000,5000.65,false,5` + + ds.Name = "cities" + + updateDataset(t, mr, ds, alteredBodyDataColumns2) + + _, err = svc.parseColumns(&rightColItems, ds) + if err != nil { + t.Fatal(err) + } + rightStats, err = svc.parseStats(ds) + if err != nil { + t.Fatal(err) + } + + report, err = svc.matchColumns(4, 5, leftColItems, rightColItems, leftStats, rightStats) + if err != nil { + t.Fatal(err) + } + + // output order is not strict so we need to enfore it here + sort.SliceStable(report, func(i, j int) bool { + return report[i].Title < report[j].Title + }) + + expect = getBaseCols() + expect = append(expect, &ChangeReportDeltaComponent{ + ChangeReportComponent: ChangeReportComponent{ + Left: EmptyObject{}, + Right: EmptyObject{ + "count": float64(5), + "max": float64(5), + "min": float64(1), + "mean": float64(3), + "median": float64(3), + "histogram": map[string]interface{}{ + "bins": []interface{}{ + float64(1), + float64(2), + float64(3), + float64(4), + float64(5), + float64(6), + }, + "frequencies": []interface{}{ + float64(1), + float64(1), + float64(1), + float64(1), + float64(1), + }, + }, + "type": "numeric", + }, + About: map[string]interface{}{ + "status": fsi.STAdd, + }, + }, + Title: "score", + Delta: map[string]interface{}{ + "count": float64(5), + "max": float64(5), + "mean": float64(3), + "median": float64(3), + "min": float64(1), + }, + }) + + if diff := cmp.Diff(report, expect); diff != "" { + t.Errorf("column items result mismatch. (-want +got):%s\n", diff) + } +} diff --git a/fsi/status.go b/fsi/status.go index f3b6a9fe9..bfe7bde42 100644 --- a/fsi/status.go +++ b/fsi/status.go @@ -25,6 +25,8 @@ var ( STRemoved = "removed" // STParseError is a component that didn't parse STParseError = "parse error" + // STMissing is a component that is missing + STMissing = "missing" // STConflictError is a component with a conflict STConflictError = "conflict error" // ErrWorkingDirectoryDirty is the error for when the working directory is not clean diff --git a/go.mod b/go.mod index 3644e4add..050b7c7cc 100644 --- a/go.mod +++ b/go.mod @@ -39,7 +39,7 @@ require ( github.com/olekukonko/tablewriter v0.0.4 github.com/pkg/errors v0.9.1 github.com/qri-io/dag v0.2.2-0.20201110155527-8fad5beb70f5 - github.com/qri-io/dataset v0.2.1-0.20201124144731-82162a0f76e6 + github.com/qri-io/dataset v0.2.1-0.20201201155506-9b4fc79ffde8 github.com/qri-io/deepdiff v0.2.1-0.20200807143746-d02d9f531f5b github.com/qri-io/didmod v0.0.0-20201123165422-8b2e224c993a github.com/qri-io/doggos v0.1.0 diff --git a/go.sum b/go.sum index c5b0aa732..ac6f4f7bd 100644 --- a/go.sum +++ b/go.sum @@ -1128,6 +1128,8 @@ github.com/qri-io/dag v0.2.2-0.20201110155527-8fad5beb70f5 h1:xeMaT6fLTvdrFOOP2N github.com/qri-io/dag v0.2.2-0.20201110155527-8fad5beb70f5/go.mod h1:1AwOy3yhcZTAXzaF4wGSdnrp87u3PBOrsWXUjOtQCXo= github.com/qri-io/dataset v0.2.1-0.20201124144731-82162a0f76e6 h1:a9CYZQ+DCzwqg8BgEN5oKboBoxueaYf0EKPnXeR/Mhk= github.com/qri-io/dataset v0.2.1-0.20201124144731-82162a0f76e6/go.mod h1:HtwGskdCECbOON0iVQHEEm6fykwDqharlqabc1ssj3Y= +github.com/qri-io/dataset v0.2.1-0.20201201155506-9b4fc79ffde8 h1:/9pbWabRT9BbjFp1AjdAsXKT2NQp+mGmyvnTylPyEHY= +github.com/qri-io/dataset v0.2.1-0.20201201155506-9b4fc79ffde8/go.mod h1:HtwGskdCECbOON0iVQHEEm6fykwDqharlqabc1ssj3Y= github.com/qri-io/deepdiff v0.2.1-0.20200807143746-d02d9f531f5b h1:T8qEIv+qLi5mVWvSS329wJ+HbN7cfMwCWjRVzh/+upo= github.com/qri-io/deepdiff v0.2.1-0.20200807143746-d02d9f531f5b/go.mod h1:NrL/b7YvexgpGb4HEO3Rlx5RrMLDfxuKDf/XDAq5ac0= github.com/qri-io/didmod v0.0.0-20201123165422-8b2e224c993a h1:40BIa59lae2xZ7iieb3UU4/X57jZsWZ6QgqwdjDQhig= diff --git a/lib/changes.go b/lib/changes.go new file mode 100644 index 000000000..d056279db --- /dev/null +++ b/lib/changes.go @@ -0,0 +1,49 @@ +package lib + +import ( + "context" + + "github.com/qri-io/qri/changes" + "github.com/qri-io/qri/dsref" +) + +// ChangeReportParams defines parameters for diffing two sources +type ChangeReportParams struct { + LeftRefstr string `json:"left"` + RightRefstr string `json:"right"` +} + +// ChangeReport is a simple utility type declaration +type ChangeReport = changes.ChangeReportResponse + +// ChangeReport resolves the requested datasets and tries to generate a change report +func (m *DatasetMethods) ChangeReport(p *ChangeReportParams, res *ChangeReport) error { + ctx := context.TODO() + reportSource := "" + + if m.inst.rpc != nil { + return checkRPCError(m.inst.rpc.Call("DatasetMethods.ChangeReport", p, res)) + } + + right, _, err := m.inst.ParseAndResolveRef(ctx, p.RightRefstr, reportSource) + if err != nil { + return err + } + + var left dsref.Ref + if p.LeftRefstr != "" { + if left, _, err = m.inst.ParseAndResolveRef(ctx, p.LeftRefstr, reportSource); err != nil { + return err + } + } else { + left = dsref.Ref{Username: right.Username, Name: right.Name} + } + + report, err := changes.New(m.inst, m.inst.stats).Report(ctx, left, right, reportSource) + if err != nil { + return err + } + + *res = *report + return nil +} diff --git a/lib/lib.go b/lib/lib.go index 60aea7513..0cc5fb338 100644 --- a/lib/lib.go +++ b/lib/lib.go @@ -384,7 +384,7 @@ func NewInstance(ctx context.Context, repoPath string, opts ...Option) (qri *Ins // if logAll is enabled, turn on debug level logging for all qri packages. Packages need to // be explicitly enumerated here if o.logAll { - allPackages := []string{"qriapi", "qrip2p", "base", "cmd", "config", "dsref", "dsfs", "friendly", "fsi", "lib", "logbook", "profile", "repo"} + allPackages := []string{"qriapi", "qrip2p", "base", "changes", "cmd", "config", "dsref", "dsfs", "friendly", "fsi", "lib", "logbook", "profile", "repo"} for _, name := range allPackages { golog.SetLogLevel(name, "debug") }