diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2719d437..fd0fec90 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -2,6 +2,7 @@ on: push: branches: - master + - v15.x - v14.x - v13.x - v12.x @@ -12,7 +13,7 @@ jobs: test: strategy: matrix: - go-version: [1.18.x, 1.19.x, 1.20.x] + go-version: [1.19.x, 1.20.x, 1.21.x] platform: [ubuntu-latest, macos-latest] runs-on: ${{ matrix.platform }} steps: diff --git a/go.mod b/go.mod index aebb90b5..f5946ccb 100644 --- a/go.mod +++ b/go.mod @@ -1,28 +1,23 @@ module github.com/blevesearch/zapx/v15 -go 1.19 +go 1.20 require ( github.com/RoaringBitmap/roaring v1.2.3 - github.com/blevesearch/bleve_index_api v1.0.5 - github.com/blevesearch/go-faiss v0.2.1-0.20230718193937-72c2455dad4c + github.com/blevesearch/bleve_index_api v1.1.1 + github.com/blevesearch/go-faiss v1.0.1 github.com/blevesearch/mmap-go v1.0.4 - github.com/blevesearch/scorch_segment_api/v2 v2.1.5 + github.com/blevesearch/scorch_segment_api/v2 v2.2.1 github.com/blevesearch/vellum v1.0.10 github.com/golang/snappy v0.0.1 - github.com/spf13/cobra v1.4.0 + github.com/spf13/cobra v1.7.0 + golang.org/x/exp v0.0.0-20231006140011-7918f672742d ) require ( github.com/bits-and-blooms/bitset v1.2.0 // indirect - github.com/inconshreveable/mousetrap v1.0.0 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/mschoch/smat v0.2.0 // indirect github.com/spf13/pflag v1.0.5 // indirect - golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a // indirect + golang.org/x/sys v0.13.0 // indirect ) - -replace github.com/blevesearch/bleve_index_api => ../bleve_index_api - -replace github.com/blevesearch/go-faiss => /Users/thejasbhat/fts/vector_search/go-faiss - -replace github.com/blevesearch/scorch_segment_api/v2 => ../scorch_segment_api diff --git a/go.sum b/go.sum index e8ebd286..5d360331 100644 --- a/go.sum +++ b/go.sum @@ -2,32 +2,41 @@ github.com/RoaringBitmap/roaring v1.2.3 
h1:yqreLINqIrX22ErkKI0vY47/ivtJr6n+kMhVO github.com/RoaringBitmap/roaring v1.2.3/go.mod h1:plvDsJQpxOC5bw8LRteu/MLWHsHez/3y6cubLI4/1yE= github.com/bits-and-blooms/bitset v1.2.0 h1:Kn4yilvwNtMACtf1eYDlG8H77R07mZSPbMjLyS07ChA= github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= +github.com/blevesearch/bleve_index_api v1.1.1 h1:Z81Hhga4I+WnxNdrdgoDQWD5K282/x0n2QyRKcslVEo= +github.com/blevesearch/bleve_index_api v1.1.1/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8= +github.com/blevesearch/go-faiss v1.0.1 h1:B0/FGdmcdxHIM0DRPyy4aWk0ZjMTFbCsmIzra77GAxE= +github.com/blevesearch/go-faiss v1.0.1/go.mod h1:jrxHrbl42X/RnDPI+wBoZU8joxxuRwedrxqswQ3xfU8= github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= +github.com/blevesearch/scorch_segment_api/v2 v2.2.1 h1:ssWvVSGzGsNSwZe1QozqsBvgz8fmFDzMlM0ep1bz+HM= +github.com/blevesearch/scorch_segment_api/v2 v2.2.1/go.mod h1:EhYElKqeTdLz9g8VzclpW2RKHnPRj/R4g/N6B0q37rM= github.com/blevesearch/vellum v1.0.10 h1:HGPJDT2bTva12hrHepVT3rOyIKFFF4t7Gf6yMxyMIPI= github.com/blevesearch/vellum v1.0.10/go.mod h1:ul1oT0FhSMDIExNjIxHqJoGpVrBpKCdgDQNxfqgJt7k= -github.com/cpuguy83/go-md2man/v2 v2.0.1/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4= github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM= -github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= 
+github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/spf13/cobra v1.4.0 h1:y+wJpx64xcgO1V+RcnwW0LEHxTKRi2ZDPSBjWnrg88Q= -github.com/spf13/cobra v1.4.0/go.mod h1:Wo4iy3BUC+X2Fybo0PDqwJIv3dNRiZLHQymsfxlB84g= +github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I= +github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a h1:dGzPydgVsqGcTRVwiLJ1jVbufYwmzD3LfVPLKsKg+0k= +golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI= +golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= gopkg.in/check.v1 
v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/section_vector_index.go b/section_vector_index.go index 7345e81c..b0c25967 100644 --- a/section_vector_index.go +++ b/section_vector_index.go @@ -11,6 +11,7 @@ import ( "github.com/RoaringBitmap/roaring" index "github.com/blevesearch/bleve_index_api" faiss "github.com/blevesearch/go-faiss" + "golang.org/x/exp/maps" ) func init() { @@ -46,6 +47,7 @@ func (v *vectorIndexSection) AddrForField(opaque map[int]resetable, fieldID int) type vecIndexMeta struct { startOffset int indexSize uint64 + vecIds []int64 } func remapDocIDs(oldIDs *roaring.Bitmap, newIDs []uint64) *roaring.Bitmap { @@ -68,7 +70,7 @@ LOOP: for fieldID, _ := range fieldsInv { var indexes []vecIndexMeta - vecToDocID := make(map[uint64]*roaring.Bitmap) + vecToDocID := make(map[int64]*roaring.Bitmap) // todo: would parallely fetching the following stuff from segments // be beneficial in terms of perf? 
@@ -101,6 +103,7 @@ LOOP: numVecs, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) pos += n + indexes[len(indexes)-1].vecIds = make([]int64, 0, numVecs) for i := 0; i < int(numVecs); i++ { vecID, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) @@ -119,15 +122,17 @@ LOOP: } bitMap = remapDocIDs(bitMap, newDocNumsIn[segI]) - if vecToDocID[vecID] == nil { + if vecToDocID[int64(vecID)] == nil { if drops[segI] != nil && !drops[segI].IsEmpty() { - vecToDocID[vecID] = roaring.AndNot(bitMap, drops[segI]) + vecToDocID[int64(vecID)] = roaring.AndNot(bitMap, drops[segI]) } else { - vecToDocID[vecID] = bitMap + vecToDocID[int64(vecID)] = bitMap } } else { - vecToDocID[vecID].Or(bitMap) + vecToDocID[int64(vecID)].Or(bitMap) } + + indexes[len(indexes)-1].vecIds = append(indexes[len(indexes)-1].vecIds, int64(vecID)) } } err := vo.mergeAndWriteVectorIndexes(fieldID, segments, vecToDocID, indexes, w, closeCh) @@ -139,7 +144,7 @@ LOOP: return nil } -func (v *vectorIndexOpaque) flushVectorSection(vecToDocID map[uint64]*roaring.Bitmap, +func (v *vectorIndexOpaque) flushVectorSection(vecToDocID map[int64]*roaring.Bitmap, serializedIndex []byte, w *CountHashWriter) (int, error) { tempBuf := v.grabBuf(binary.MaxVarintLen64) fieldStart := w.Count() @@ -193,12 +198,7 @@ func (v *vectorIndexOpaque) flushVectorSection(vecToDocID map[uint64]*roaring.Bi // todo: naive implementation. need to keep in mind the perf implications and improve on this. // perhaps, parallelized merging can help speed things up over here. func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(fieldID int, sbs []*SegmentBase, - vecToDocID map[uint64]*roaring.Bitmap, indexes []vecIndexMeta, w *CountHashWriter, closeCh chan struct{}) error { - if len(vecToDocID) >= 100000 { - // merging of more complex index types (for eg ivf family) with reconstruction - // method. 
- return fmt.Errorf("to be implemented") - } + vecToDocID map[int64]*roaring.Bitmap, indexes []vecIndexMeta, w *CountHashWriter, closeCh chan struct{}) error { var vecIndexes []*faiss.IndexImpl for segI, seg := range sbs { @@ -211,6 +211,70 @@ func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(fieldID int, sbs []*Segme vecIndexes = append(vecIndexes, index) } + if len(vecToDocID) > 10000 { + // merging of more complex index types (for eg ivf family) with reconstruction + // method. + var indexData []float32 + for i := 0; i < len(vecIndexes); i++ { + if isClosed(closeCh) { + return fmt.Errorf("merging of vector sections aborted") + } + // todo: parallelize reconstruction + recons, err := vecIndexes[i].ReconstructBatch(int64(len(indexes[i].vecIds)), indexes[i].vecIds) + if err != nil { + return err + } + indexData = append(indexData, recons...) + } + + // safe to assume that all the indexes are of the same config values, given + // that they are extracted from the field mapping info. + dims := vecIndexes[0].D() + metric := vecIndexes[0].MetricType() + finalVecIDs := maps.Keys(vecToDocID) + + index, err := faiss.IndexFactory(dims, "IDMap2,IVF100,SQ8", metric) + if err != nil { + return err + } + + index, err = index.GetIVFSubIndex() + if err != nil { + return err + } + + err = index.SetDirectMap(2) + if err != nil { + return err + } + + err = index.Train(indexData) + if err != nil { + return err + } + + // capture the result of AddWithIDs; otherwise the check below re-tests the err left over from Train. + err = index.AddWithIDs(indexData, finalVecIDs) + if err != nil { + return err + } + + mergedIndexBytes, err := faiss.WriteIndexIntoBuffer(index) + if err != nil { + return err + } + + fieldStart, err := v.flushVectorSection(vecToDocID, mergedIndexBytes, w) + if err != nil { + return err + } + v.fieldAddrs[uint16(fieldID)] = fieldStart + + return nil + } + + // todo: ivf -> flat index when there were huge number of vector deletes for this field + for i := 1; i < len(vecIndexes); i++ { if isClosed(closeCh) { return fmt.Errorf("merging of vector sections aborted") diff 
--git a/segment.go b/segment.go index 32d6248e..abc51981 100644 --- a/segment.go +++ b/segment.go @@ -529,7 +529,6 @@ func (s *SegmentBase) visitStoredFields(vdc *visitDocumentCtx, num uint64, arrayPos[i] = ap } } - value := uncompressed[offset : offset+l] keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos) }