-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathexample.rb
53 lines (48 loc) · 1.67 KB
/
example.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
require "informers"
require "pg"
require "pgvector"
conn = PG.connect(dbname: "pgvector_example")
conn.exec("CREATE EXTENSION IF NOT EXISTS vector")
conn.exec("DROP TABLE IF EXISTS documents")
conn.exec("CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(384))")
conn.exec("CREATE INDEX ON documents USING GIN (to_tsvector('english', content))")
model = Informers.pipeline("embedding", "Xenova/multi-qa-MiniLM-L6-cos-v1")
input = [
"The dog is barking",
"The cat is purring",
"The bear is growling"
]
embeddings = model.(input)
input.zip(embeddings) do |content, embedding|
conn.exec_params("INSERT INTO documents (content, embedding) VALUES ($1, $2)", [content, embedding])
end
sql = <<~SQL
WITH semantic_search AS (
SELECT id, RANK () OVER (ORDER BY embedding <=> $2) AS rank
FROM documents
ORDER BY embedding <=> $2
LIMIT 20
),
keyword_search AS (
SELECT id, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', content), query) DESC)
FROM documents, plainto_tsquery('english', $1) query
WHERE to_tsvector('english', content) @@ query
ORDER BY ts_rank_cd(to_tsvector('english', content), query) DESC
LIMIT 20
)
SELECT
COALESCE(semantic_search.id, keyword_search.id) AS id,
COALESCE(1.0 / ($3 + semantic_search.rank), 0.0) +
COALESCE(1.0 / ($3 + keyword_search.rank), 0.0) AS score
FROM semantic_search
FULL OUTER JOIN keyword_search ON semantic_search.id = keyword_search.id
ORDER BY score DESC
LIMIT 5
SQL
query = "growling bear"
query_embedding = model.(query)
k = 60
result = conn.exec_params(sql, [query, query_embedding, k])
result.each do |row|
puts "document: #{row["id"]}, RRF score: #{row["score"]}"
end