From d055f00965d9465ed9472cf35f870b4b7f038894 Mon Sep 17 00:00:00 2001
From: Julie Tibshirani
Date: Fri, 13 Sep 2024 21:26:02 -0700
Subject: [PATCH] Skip other sections when reading metadata

---
Review notes (not part of the commit message):
- compoundSection.skip skips the index descriptor instead of fetching the
  index blob and discarding it; skip is documented as not reading section data.
- readHeader, both skip methods and the benchmark carry doc comments; the
  diffstat and hunk line counts below include them.

 read.go      | 77 ++++++++++++++++++++++++++++++++++++----------------
 read_test.go | 32 +++++++++++++++++++++-
 section.go   | 26 +++++++++++++++---
 3 files changed, 108 insertions(+), 27 deletions(-)

diff --git a/read.go b/read.go
index c3dc2b3db..6c1b4aa51 100644
--- a/read.go
+++ b/read.go
@@ -21,6 +21,7 @@ import (
 	"hash/crc64"
 	"log"
 	"os"
+	"slices"
 	"sort"
 
 	"github.com/rs/xid"
@@ -94,20 +95,15 @@ func (r *reader) Str() (string, error) {
 }
 
 func (r *reader) readTOC(toc *indexTOC) error {
-	sz, err := r.r.Size()
-	if err != nil {
-		return err
-	}
-	r.off = sz - 8
-
-	var tocSection simpleSection
-	if err := tocSection.read(r); err != nil {
-		return err
-	}
-
-	r.seek(tocSection.off)
+	return r.readTOCSections(toc, nil)
+}
 
-	sectionCount, err := r.U32()
+// readTOCSections reads the table of contents of the index file.
+//
+// If the tags parameter is non-empty, it reads only those tagged sections for efficiency
+// and does not populate the other sections.
+func (r *reader) readTOCSections(toc *indexTOC, tags []string) error {
+	tocSection, sectionCount, err := r.readHeader()
 	if err != nil {
 		return err
 	}
@@ -126,11 +122,14 @@ func (r *reader) readTOC(toc *indexTOC) error {
 				return err
 			}
 
+			skipSection := len(tags) > 0 && !slices.Contains(tags, tag)
 			sec := secs[tag]
 			if sec == nil || sec.kind() != sectionKind(kind) {
 				// If we don't recognize the section, we may be reading a newer index than the current version. Use
 				// a "dummy section" struct to skip over it.
-				log.Printf("encountered unrecognized index section (%s), skipping over it", tag)
+				skipSection = true
+				log.Printf("encountered malformed index section (%s), skipping over it", tag)
+
 				switch sectionKind(kind) {
 				case sectionKindSimple:
 					sec = &simpleSection{}
@@ -143,8 +142,14 @@ func (r *reader) readTOC(toc *indexTOC) error {
 				}
 			}
 
-			if err := sec.read(r); err != nil {
-				return err
+			if skipSection {
+				if err := sec.skip(r); err != nil {
+					return err
+				}
+			} else {
+				if err := sec.read(r); err != nil {
+					return err
+				}
 			}
 		}
 	} else {
@@ -169,6 +174,30 @@ func (r *reader) readTOC(toc *indexTOC) error {
 	return nil
 }
 
+// readHeader positions the reader 8 bytes before the end of the file, reads the
+// TOC's section descriptor from there, seeks to the TOC, and returns that
+// descriptor together with the section count stored at the start of the TOC.
+func (r *reader) readHeader() (simpleSection, uint32, error) {
+	sz, err := r.r.Size()
+	if err != nil {
+		return simpleSection{}, 0, err
+	}
+	r.off = sz - 8
+
+	var tocSection simpleSection
+	if err := tocSection.read(r); err != nil {
+		return simpleSection{}, 0, err
+	}
+
+	r.seek(tocSection.off)
+
+	sectionCount, err := r.U32()
+	if err != nil {
+		return simpleSection{}, 0, err
+	}
+	return tocSection, sectionCount, nil
+}
+
 func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) {
 	return r.file.Read(sec.off, sec.sz)
 }
@@ -205,7 +234,7 @@ func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) {
 	return arr, nil
 }
 
-func (r *reader) readJSON(data interface{}, sec *simpleSection) error {
+func (r *reader) readJSON(data interface{}, sec simpleSection) error {
 	blob, err := r.r.Read(sec.off, sec.sz)
 	if err != nil {
 		return err
@@ -228,7 +257,7 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
 		branchNames: []map[uint]string{},
 	}
 
-	repos, md, err := r.readMetadata(toc)
+	repos, md, err := r.parseMetadata(toc.metaData, toc.repoMetaData)
 	if md != nil && !canReadVersion(md) {
 		return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion)
 	} else if err != nil {
@@ -395,9 +424,9 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
 	return &d, nil
 }
 
-func (r *reader) readMetadata(toc *indexTOC) ([]*Repository, *IndexMetadata, error) {
+func (r *reader) parseMetadata(metaData simpleSection, repoMetaData simpleSection) ([]*Repository, *IndexMetadata, error) {
 	var md IndexMetadata
-	if err := r.readJSON(&md, &toc.metaData); err != nil {
+	if err := r.readJSON(&md, metaData); err != nil {
 		return nil, nil, err
 	}
 
@@ -410,7 +439,7 @@ func (r *reader) readMetadata(toc *indexTOC) ([]*Repository, *IndexMetadata, err
 	}
 
 	if len(blob) == 0 {
-		blob, err = r.r.Read(toc.repoMetaData.off, toc.repoMetaData.sz)
+		blob, err = r.r.Read(repoMetaData.off, repoMetaData.sz)
 		if err != nil {
 			return nil, &md, err
 		}
@@ -573,11 +602,11 @@ func NewSearcher(r IndexFile) (Searcher, error) {
 func ReadMetadata(inf IndexFile) ([]*Repository, *IndexMetadata, error) {
 	rd := &reader{r: inf}
 	var toc indexTOC
-	if err := rd.readTOC(&toc); err != nil {
+	err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"})
+	if err != nil {
 		return nil, nil, err
 	}
-
-	return rd.readMetadata(&toc)
+	return rd.parseMetadata(toc.metaData, toc.repoMetaData)
 }
 
 // ReadMetadataPathAlive is like ReadMetadataPath except that it only returns
diff --git a/read_test.go b/read_test.go
index 9e7acd135..c230ed9cd 100644
--- a/read_test.go
+++ b/read_test.go
@@ -32,7 +32,6 @@ import (
 
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
-
 	"github.com/sourcegraph/zoekt/query"
 )
 
@@ -467,3 +466,34 @@ func TestEncodeRanks(t *testing.T) {
 		return true
 	}, nil)
 }
+
+// BenchmarkReadMetadata measures ReadMetadata against the shard checked in
+// under testdata/benchmark, reusing a single open IndexFile for every iteration.
+func BenchmarkReadMetadata(b *testing.B) {
+	file, err := os.Open("testdata/benchmark/zoekt_v16.00000.zoekt")
+	if err != nil {
+		b.Fatalf("Failed to open test file: %v", err)
+	}
+	defer file.Close()
+
+	indexFile, err := NewIndexFile(file)
+	if err != nil {
+		b.Fatalf("could not open index: %v", err)
+	}
+
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		repos, metadata, err := ReadMetadata(indexFile)
+		if err != nil {
+			b.Fatalf("ReadMetadata failed: %v", err)
+		}
+		if len(repos) != 1 {
+			b.Fatalf("expected 1 repository, got %d", len(repos))
+		}
+		if metadata == nil {
+			b.Fatal("expected non-nil metadata")
+		}
+	}
+}
diff --git a/section.go b/section.go
index c686faaaa..adb758e3c 100644
--- a/section.go
+++ b/section.go
@@ -127,8 +127,12 @@ func (s *simpleSection) end(w *writer) {
 // section is a range of bytes in the index file.
 type section interface {
 	read(*reader) error
+	// skip advances over the data in the section without reading it.
+	// NOTE: the section will not contain valid data after this call, and it should not be used.
+	skip(*reader) error
 	write(*writer)
-	kind() sectionKind // simple or complex, used in serialization
+	// kind encodes whether the section is simple or compound, and is used in serialization
+	kind() sectionKind
 }
 
 type sectionKind int
@@ -156,10 +160,19 @@ func (s *simpleSection) read(r *reader) error {
 		return err
 	}
 	s.sz, err = r.U32()
+	return err
+}
+
+// skip consumes the two uint32 fields of a simple section descriptor from the
+// reader and discards them; s itself is left unmodified.
+func (s *simpleSection) skip(r *reader) error {
+	var err error
+	_, err = r.U32()
 	if err != nil {
 		return err
 	}
-	return nil
+	_, err = r.U32()
+	return err
 }
 
 func (s *simpleSection) write(w *writer) {
@@ -215,6 +228,15 @@ func (s *compoundSection) read(r *reader) error {
 	return err
 }
 
+// skip advances the reader past the data and index descriptors of a compound
+// section. Nothing is fetched from the section bodies, and s is left unmodified.
+func (s *compoundSection) skip(r *reader) error {
+	if err := s.data.skip(r); err != nil {
+		return err
+	}
+	return s.index.skip(r)
+}
+
 // relativeIndex returns the relative offsets of the items (first
 // element is 0), plus a final marking the end of the last item.
 func (s *compoundSection) relativeIndex() []uint32 {