From a83030c46ec7c9dc58163c51bc0e74cdcaa14acb Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 2 Sep 2024 22:01:01 -0700 Subject: [PATCH] Added experimental knn option --- CHANGELOG.md | 1 + README.md | 16 ++++++- examples/Gemfile | 9 ---- examples/elasticsearch_knn.rb | 62 --------------------------- examples/opensearch_knn.rb | 74 --------------------------------- lib/searchkick.rb | 9 ++++ lib/searchkick/index_options.rb | 35 ++++++++++++++++ lib/searchkick/model.rb | 2 +- lib/searchkick/query.rb | 34 ++++++++++++++- test/knn_test.rb | 36 ++++++++++++++++ test/models/product.rb | 3 +- test/support/activerecord.rb | 7 ++++ test/support/mongoid.rb | 1 + 13 files changed, 139 insertions(+), 150 deletions(-) delete mode 100644 examples/Gemfile delete mode 100644 examples/elasticsearch_knn.rb delete mode 100644 examples/opensearch_knn.rb create mode 100644 test/knn_test.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 8508de0c..8c1e6e4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 5.4.0 (unreleased) +- Added experimental `knn` option - Added experimental support for `_raw` to `where` option - Added warning for `exists` with non-`true` values - Added warning for full reindex and `:queue` mode diff --git a/README.md b/README.md index a08f38d0..c4fb5c02 100644 --- a/README.md +++ b/README.md @@ -1845,9 +1845,21 @@ To query nested data, use dot notation. Product.search("san", fields: ["store.city"], where: {"store.zip_code" => 12345}) ``` -## Nearest Neighbors +## Nearest Neighbors [experimental] -You can use custom mapping and searching to index vectors and perform k-nearest neighbor search. See the examples for [Elasticsearch](examples/elasticsearch_knn.rb) and [OpenSearch](examples/opensearch_knn.rb). +*Available for Elasticsearch 8.6+ and OpenSearch 2.4+* + +```ruby +class Product < ApplicationRecord + searchkick knn: {embedding: {dimensions: 3}} +end +``` + +Reindex and search with: + +```ruby +Product.search(knn: {field: :embedding, vector: [1, 2, 3]}) +``` ## Reference diff --git a/examples/Gemfile b/examples/Gemfile deleted file mode 100644 index d77ec765..00000000 --- a/examples/Gemfile +++ /dev/null @@ -1,9 +0,0 @@ -source "https://rubygems.org" - -gemspec path: ".." - -gem "activerecord" -gem "disco" -gem "elasticsearch" -gem "opensearch-ruby" -gem "sqlite3" diff --git a/examples/elasticsearch_knn.rb b/examples/elasticsearch_knn.rb deleted file mode 100644 index a779c3b0..00000000 --- a/examples/elasticsearch_knn.rb +++ /dev/null @@ -1,62 +0,0 @@ -require "active_record" -require "disco" -require "elasticsearch" -require "searchkick" - -ActiveRecord::Base.establish_connection adapter: "sqlite3", database: ":memory:" -ActiveRecord::Schema.verbose = false -ActiveRecord::Schema.define do - create_table :movies do |t| - t.string :name - t.text :embedding - end -end - -class Movie < ActiveRecord::Base - # remove "coder: " for Active Record < 7.1 - serialize :embedding, coder: JSON - - searchkick \ - mappings: { - properties: { - embedding: { - type: "dense_vector", - dims: 20, - index: true, - similarity: "cosine" - } - } - }, - merge_mappings: true - - def search_data - { - name: name, - embedding: embedding - } - end -end - -data = Disco.load_movielens -recommender = Disco::Recommender.new(factors: 20) -recommender.fit(data) - -movies = [] -recommender.item_ids.each do |item_id| - movies << {name: item_id, embedding: recommender.item_factors(item_id).to_a} -end -Movie.insert_all!(movies) - -Movie.reindex - -movie = Movie.find_by!(name: "Star Wars (1977)") -body = { - knn: { - filter: {bool: {must_not: {term: {_id: movie.id}}}}, - field: "embedding", - k: 5, - num_candidates: 5, - query_vector: movie.embedding - } -} -pp Movie.search(body: body).map(&:name) diff --git a/examples/opensearch_knn.rb b/examples/opensearch_knn.rb deleted file mode 100644 index dddc6d1a..00000000 --- a/examples/opensearch_knn.rb +++ /dev/null @@ -1,74 +0,0 @@ -require "active_record" -require "disco" -require "opensearch-ruby" -require "searchkick" - -ActiveRecord::Base.establish_connection adapter: "sqlite3", database: ":memory:" -ActiveRecord::Schema.verbose = false -ActiveRecord::Schema.define do - create_table :movies do |t| - t.string :name - t.text :embedding - end -end - -class Movie < ActiveRecord::Base - # remove "coder: " for Active Record < 7.1 - serialize :embedding, coder: JSON - - searchkick \ - settings: { - index: { - knn: true - } - }, - mappings: { - properties: { - embedding: { - type: "knn_vector", - dimension: 20, - method: { - name: "hnsw", - space_type: "cosinesimil", - engine: "lucene" - } - } - } - }, - merge_mappings: true - - def search_data - { - name: name, - embedding: embedding - } - end -end - -data = Disco.load_movielens -recommender = Disco::Recommender.new(factors: 20) -recommender.fit(data) - -movies = [] -recommender.item_ids.each do |item_id| - movies << {name: item_id, embedding: recommender.item_factors(item_id).to_a} -end -Movie.insert_all!(movies) - -Movie.reindex - -movie = Movie.find_by!(name: "Star Wars (1977)") -# uses efficient filtering available in OpenSearch 2.4+ -# https://opensearch.org/docs/latest/search-plugins/knn/filter-search-knn/ -body = { - query: { - knn: { - embedding: { - filter: {bool: {must_not: {term: {_id: movie.id}}}}, - vector: movie.embedding, - k: 5 - } - } - } -} -pp Movie.search(body: body).map(&:name) diff --git a/lib/searchkick.rb b/lib/searchkick.rb index 69a021f4..28e3305c 100644 --- a/lib/searchkick.rb +++ b/lib/searchkick.rb @@ -142,6 +142,15 @@ def self.server_below?(version, true_version = false) Gem::Version.new(server_version.split("-")[0]) < Gem::Version.new(version.split("-")[0]) end + # private + def self.knn_support? + if opensearch? + !server_below?("2.4.0", true) + else + !server_below?("8.6.0") + end + end + def self.search(term = "*", model: nil, **options, &block) options = options.dup klass = model diff --git a/lib/searchkick/index_options.rb b/lib/searchkick/index_options.rb index f7cdf4ea..73576f4f 100644 --- a/lib/searchkick/index_options.rb +++ b/lib/searchkick/index_options.rb @@ -169,6 +169,20 @@ def generate_settings max_shingle_diff: 4 } + if options[:knn] + unless Searchkick.knn_support? + if Searchkick.opensearch? + raise Error, "knn requires OpenSearch 2.4+" + else + raise Error, "knn requires Elasticsearch 8.6+" + end + end + + if Searchkick.opensearch? + settings[:index][:knn] = true + end + end + if options[:case_sensitive] settings[:analysis][:analyzer].each do |_, analyzer| analyzer[:filter].delete("lowercase") @@ -406,6 +420,27 @@ def generate_mappings mapping[field] = shape_options.merge(type: "geo_shape") end + (options[:knn] || []).each do |field, knn_options| + if Searchkick.opensearch? + mapping[field.to_s] = { + type: "knn_vector", + dimension: knn_options[:dimensions], + method: { + name: "hnsw", + space_type: "cosinesimil", + engine: "lucene" + } + } + else + mapping[field.to_s] = { + type: "dense_vector", + dims: knn_options[:dimensions], + index: true, + similarity: "cosine" + } + end + end + if options[:inheritance] mapping[:type] = keyword_mapping end diff --git a/lib/searchkick/model.rb b/lib/searchkick/model.rb index 41aa4875..ed13e28b 100644 --- a/lib/searchkick/model.rb +++ b/lib/searchkick/model.rb @@ -4,7 +4,7 @@ def searchkick(**options) options = Searchkick.model_options.merge(options) unknown_keywords = options.keys - [:_all, :_type, :batch_size, :callbacks, :case_sensitive, :conversions, :deep_paging, :default_fields, - :filterable, :geo_shape, :highlight, :ignore_above, :index_name, :index_prefix, :inheritance, :language, + :filterable, :geo_shape, :highlight, :ignore_above, :index_name, :index_prefix, :inheritance, :knn, :language, :locations, :mappings, :match, :max_result_window, :merge_mappings, :routing, :searchable, :search_synonyms, :settings, :similarity, :special_characters, :stem, :stemmer, :stem_conversions, :stem_exclusion, :stemmer_override, :suggest, :synonyms, :text_end, :text_middle, :text_start, :unscope, :word, :word_end, :word_middle, :word_start] diff --git a/lib/searchkick/query.rb b/lib/searchkick/query.rb index bef8be1b..277cfec5 100644 --- a/lib/searchkick/query.rb +++ b/lib/searchkick/query.rb @@ -19,7 +19,7 @@ class Query def initialize(klass, term = "*", **options) unknown_keywords = options.keys - [:aggs, :block, :body, :body_options, :boost, :boost_by, :boost_by_distance, :boost_by_recency, :boost_where, :conversions, :conversions_term, :debug, :emoji, :exclude, :explain, - :fields, :highlight, :includes, :index_name, :indices_boost, :limit, :load, + :fields, :highlight, :includes, :index_name, :indices_boost, :knn, :limit, :load, :match, :misspellings, :models, :model_includes, :offset, :operator, :order, :padding, :page, :per_page, :profile, :request_params, :routing, :scope_results, :scroll, :select, :similar, :smart_aggs, :suggest, :total_entries, :track, :type, :where] raise ArgumentError, "unknown keywords: #{unknown_keywords.join(", ")}" if unknown_keywords.any? @@ -526,6 +526,38 @@ def prepare end end + # knn + knn = options[:knn] + if knn + if term != "*" + raise ArgumentError, "Hybrid search not supported yet" + end + + field = knn[:field] + vector = knn[:vector] + k = per_page + offset + filter = payload.delete(:query) + + if Searchkick.opensearch? + payload[:query] = { + knn: { + field.to_sym => { + vector: vector, + k: k, + filter: filter + } + } + } + else + payload[:knn] = { + field: field, + query_vector: vector, + k: k, + filter: filter + } + end + end + # pagination pagination_options = options[:page] || options[:limit] || options[:per_page] || options[:offset] || options[:padding] if !options[:body] || pagination_options diff --git a/test/knn_test.rb b/test/knn_test.rb new file mode 100644 index 00000000..8d9ec99c --- /dev/null +++ b/test/knn_test.rb @@ -0,0 +1,36 @@ +require_relative "test_helper" + +class KnnTest < Minitest::Test + def setup + skip unless Searchkick.knn_support? + super + end + + def test_basic + store [{name: "A", embedding: [1, 2, 3]}, {name: "B", embedding: [-1, -2, -3]}] + assert_order "*", ["A", "B"], knn: {field: :embedding, vector: [1, 2, 3]} + + scores = Product.search(knn: {field: :embedding, vector: [1, 2, 3]}).hits.map { |v| v["_score"] } + assert_in_delta 1, scores[0] + assert_in_delta 0, scores[1] + end + + def test_where + store [ + {name: "A", store_id: 1, embedding: [1, 2, 3]}, + {name: "B", store_id: 2, embedding: [1, 2, 3]}, + {name: "C", store_id: 1, embedding: [-1, -2, -3]}, + ] + assert_order "*", ["A", "C"], knn: {field: :embedding, vector: [1, 2, 3]}, where: {store_id: 1} + end + + def test_pagination + store [ + {name: "A", embedding: [1, 2, 3]}, + {name: "B", embedding: [1, 2, 0]}, + {name: "C", embedding: [-1, -2, 0]}, + {name: "D", embedding: [-1, -2, -3]} + ] + assert_order "*", ["B", "C"], knn: {field: :embedding, vector: [1, 2, 3]}, limit: 2, offset: 1 + end +end diff --git a/test/models/product.rb b/test/models/product.rb index b6179df4..66810b83 100644 --- a/test/models/product.rb +++ b/test/models/product.rb @@ -20,7 +20,8 @@ class Product highlight: [:name], filterable: [:name, :color, :description], similarity: "BM25", - match: ENV["MATCH"] ? ENV["MATCH"].to_sym : nil + match: ENV["MATCH"] ? ENV["MATCH"].to_sym : nil, + knn: Searchkick.knn_support? ? {embedding: {dimensions: 3}} : nil attr_accessor :conversions, :user_ids, :aisle, :details diff --git a/test/support/activerecord.rb b/test/support/activerecord.rb index e8e07e4a..7d7e1c78 100644 --- a/test/support/activerecord.rb +++ b/test/support/activerecord.rb @@ -32,6 +32,7 @@ t.decimal :longitude, precision: 10, scale: 7 t.text :description t.text :alt_description + t.text :embedding t.timestamps null: true end @@ -75,6 +76,12 @@ class Product < ActiveRecord::Base belongs_to :store + + if ActiveRecord::VERSION::STRING.to_f >= 7.1 + serialize :embedding, coder: JSON + else + serialize :embedding, JSON + end end class Store < ActiveRecord::Base diff --git a/test/support/mongoid.rb b/test/support/mongoid.rb index 9bcd6bf3..137616e2 100644 --- a/test/support/mongoid.rb +++ b/test/support/mongoid.rb @@ -21,6 +21,7 @@ class Product field :longitude, type: BigDecimal field :description field :alt_description + field :embedding, type: Array end class Store