Skip to content

Commit

Permalink
feat: basic chat rag works
Browse files Browse the repository at this point in the history
  • Loading branch information
sshivaditya committed Oct 6, 2024
1 parent 51454d4 commit 0f82015
Show file tree
Hide file tree
Showing 13 changed files with 164 additions and 16 deletions.
5 changes: 4 additions & 1 deletion .dev.vars.example
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
OPENAI_API_KEY="MY_SECRET"
OPENAI_API_KEY=""
SUPABASE_URL=""
SUPABASE_KEY=""
VOYAGEAI_API_KEY=""
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"knip-ci": "knip --no-exit-code --reporter json --config .github/knip.ts",
"prepare": "husky install",
"test": "jest --setupFiles dotenv/config --coverage",
"worker": "wrangler dev --env dev --port 4000"
"worker": "wrangler dev --env dev --port 5000"
},
"keywords": [
"typescript",
Expand Down
2 changes: 1 addition & 1 deletion src/adapters/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import { SuperSupabase } from "./supabase/helpers/supabase";
import { Embedding as VoyageEmbedding } from "./voyage/helpers/embedding";
import { SuperVoyage } from "./voyage/helpers/voyage";
import { VoyageAIClient } from "voyageai";
import { Issue } from "./supabase/helpers/issue";
import { Issue } from "./supabase/helpers/issues";
import { SuperOpenAi } from "./openai/helpers/openai";
import OpenAI from "openai";
import { Completions } from "./openai/helpers/completions";
Expand Down
4 changes: 2 additions & 2 deletions src/adapters/openai/helpers/completions.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import OpenAI from "openai";
import { Context } from "../../../types";
import { SuperOpenAi } from "./openai";
const MAX_TOKENS = 3072;
const MAX_TOKENS = 3000;

export interface CompletionsType {
answer: string;
Expand Down Expand Up @@ -46,7 +46,7 @@ export class Completions extends SuperOpenAi {
],
},
],
temperature: 0,
temperature: 0.2,
max_tokens: MAX_TOKENS,
top_p: 1,
frequency_penalty: 0,
Expand Down
14 changes: 12 additions & 2 deletions src/adapters/supabase/helpers/comment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@ export interface CommentType {
embedding: number[];
}

/**
 * One row returned by the `find_similar_comments` Postgres RPC:
 * a comment matched against the query text by vector search and/or
 * full-text search.
 */
export interface CommentSimilaritySearchResult {
  /** ID of the matched comment. */
  comment_id: string;
  /** Plain-text body of the matched comment. */
  comment_plaintext: string;
  /** ID of the issue the comment belongs to. */
  comment_issue_id: string;
  /** Embedding-based similarity score computed by the RPC. */
  similarity: number;
  /** Full-text-search rank; 0 when the comment had no full-text match. */
  text_similarity: number;
}

export class Comment extends SuperSupabase {
constructor(supabase: SupabaseClient, context: Context) {
super(supabase, context);
Expand All @@ -24,12 +32,14 @@ export class Comment extends SuperSupabase {
return data;
}

async findSimilarComments(query: string, threshold: number, currentId: string): Promise<CommentType[] | null> {
async findSimilarComments(query: string, threshold: number, currentId: string): Promise<CommentSimilaritySearchResult[] | null> {
const embedding = await this.context.adapters.voyage.embedding.createEmbedding(query);
const { data, error } = await this.supabase.rpc("find_similar_comments_with_vector_search_ftse", {
const { data, error } = await this.supabase.rpc("find_similar_comments", {
current_id: currentId,
query_text: query,
query_embedding: embedding,
threshold: threshold,
max_results: 10,
});
if (error) {
this.context.logger.error("Error finding similar comments", error);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export interface IssueSimilaritySearchResult {
issue_id: string;
issue_plaintext: string;
similarity: number;
text_similarity: number;
}

export interface IssueType {
Expand Down Expand Up @@ -33,10 +34,12 @@ export class Issue extends SuperSupabase {
}
async findSimilarIssues(plaintext: string, threshold: number, currentId: string): Promise<IssueSimilaritySearchResult[] | null> {
const embedding = await this.context.adapters.voyage.embedding.createEmbedding(plaintext);
const { data, error } = await this.supabase.rpc("find_similar_issues_vector_search_ftse", {
const { data, error } = await this.supabase.rpc("find_similar_issue_ftse", {
current_id: currentId,
query_text: plaintext,
query_embedding: embedding,
threshold: threshold,
max_results: 10,
});
if (error) {
this.context.logger.error("Error finding similar issues", error);
Expand Down
2 changes: 1 addition & 1 deletion src/adapters/voyage/helpers/embedding.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ export class Embedding extends SuperVoyage {
} else {
const response = await this.client.embed({
input: text,
model: "voyage-large-3",
model: "voyage-large-2-instruct",
});
return (response.data && response.data[0]?.embedding) || [];
}
Expand Down
3 changes: 2 additions & 1 deletion src/adapters/voyage/helpers/rerankers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ export class Rerankers extends SuperVoyage {
const response = await this.client.rerank({
query,
documents: results,
model: "voyage-large-3",
model: "rerank-2",
returnDocuments: true,
topK: 5,
});
const rerankedResults = response.data || [];
return rerankedResults.map((result) => result.document).filter((document): document is string => document !== undefined);
Expand Down
18 changes: 15 additions & 3 deletions src/handlers/ask-gpt.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { Context } from "../types";
import { CompletionsType } from "../adapters/openai/helpers/completions";
import { CommentType } from "../adapters/supabase/helpers/comment";
import { CommentSimilaritySearchResult } from "../adapters/supabase/helpers/comment";
import { IssueSimilaritySearchResult } from "../adapters/supabase/helpers/issues";

export async function askQuestion(context: Context, question: string) {
if (!question) {
Expand All @@ -20,12 +21,23 @@ export async function askGpt(context: Context, question: string): Promise<Comple
const similarComments = (await context.adapters.supabase.comment.findSimilarComments(question, similarityThreshold, "")) || [];
const similarIssues = (await context.adapters.supabase.issue.findSimilarIssues(question, similarityThreshold, "")) || [];
//Create a new object with plain text from both the objects
const similarText = similarComments.map((comment: CommentType) => comment.plaintext);
similarText.push(...similarIssues.map((issue) => issue.issue_plaintext));
const similarText = similarComments.map((comment: CommentSimilaritySearchResult) => comment.comment_plaintext);
similarText.push(...similarIssues.map((issue: IssueSimilaritySearchResult) => issue.issue_plaintext));
//Rerank Similar Comments and Issues
const rerankedText = await context.adapters.voyage.reranker.reRankResults(similarText, question);
//Remove unwanted characters from the text
rerankedText.forEach((text) => removeUnwantedChars(text));
//TODO: Temporary workaround
//const chat = createChatHistory(formattedChat);
//logger.info(`Sending chat to OpenAI`, { chat });
return context.adapters.openai.completions.createCompletion(question, model, rerankedText);
}

/**
 * Removes unwanted characters from the text (emojis, punctuation,
 * markup symbols — anything outside ASCII letters, digits, and
 * whitespace).
 *
 * NOTE(review): this also strips accented/non-English letters
 * (e.g. "é", "ü"), which may be too aggressive for multilingual
 * comments — confirm this is intended.
 *
 * @param text - Raw text to sanitize.
 * @returns The text with every character outside `[a-zA-Z0-9\s]` removed.
 */
const removeUnwantedChars = (text: string): string => {
  return text.replace(/[^a-zA-Z0-9\s]/g, "");
};
3 changes: 2 additions & 1 deletion src/plugin.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@ export async function plugin(inputs: PluginInputs, env: Env) {
const voyageClient = new VoyageAIClient({
apiKey: env.VOYAGEAI_API_KEY,
});

const openaiClient = new OpenAI({
apiKey: env.OPENAI_API_KEY,
baseURL: inputs.settings.openAiBaseUrl || "https://api.openai.com",
...(inputs.settings.openAiBaseUrl && { baseUrl: inputs.settings.openAiBaseUrl }),
});
const context: Context = {
eventName: inputs.eventName,
Expand Down
2 changes: 1 addition & 1 deletion src/types/plugin-inputs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ export interface PluginInputs<T extends SupportedEventsU = SupportedEventsU, TU
export const pluginSettingsSchema = T.Object({
model: T.String({ default: "o1-mini" }),
openAiBaseUrl: T.Optional(T.String()),
similarityThreshold: T.Number({ default: 0.8 }),
similarityThreshold: T.Number({ default: 0.1 }),
});

export const pluginSettingsValidator = new StandardValidator(pluginSettingsSchema);
Expand Down
119 changes: 119 additions & 0 deletions supabase/migrations/20241005200943_comments_function.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
-- Hybrid similarity search over the issues table: blends pgvector
-- embedding similarity with Postgres full-text rank and returns up to
-- max_results rows (excluding the issue identified by current_id),
-- ordered by the sum of the two scores.
CREATE OR REPLACE FUNCTION find_similar_issue_ftse(
    current_id VARCHAR,
    query_text TEXT,
    query_embedding VECTOR(1024),
    threshold DOUBLE PRECISION,
    max_results INTEGER DEFAULT 10
)
RETURNS TABLE(
    issue_id VARCHAR,
    issue_plaintext TEXT,
    similarity DOUBLE PRECISION,       -- vector score: 1 - (embedding <-> query_embedding)
    text_similarity DOUBLE PRECISION   -- ts_rank score; 0 when no full-text match
) AS $$
DECLARE
    query_tokens TEXT[];
    query_tsquery TSQUERY;
BEGIN
    -- Generate query tokens: distinct, lowercased words longer than 2 chars.
    SELECT array_agg(DISTINCT lower(word))
    INTO query_tokens
    FROM unnest(regexp_split_to_array(query_text, '\s+')) AS word
    WHERE length(word) > 2;

    -- Create a prefix-matching, OR-combined tsquery from the tokens.
    -- NOTE(review): if every word is <= 2 chars, query_tokens is NULL, so
    -- query_tsquery ends up NULL and the text_similarity CTE matches
    -- nothing — confirm that silent fallback is intended.
    SELECT to_tsquery(string_agg(lexeme || ':*', ' | '))
    INTO query_tsquery
    FROM unnest(query_tokens) lexeme;

    RETURN QUERY
    WITH vector_similarity AS (
        -- NOTE(review): <-> is L2 distance in pgvector, so 1 - distance is
        -- not bounded to [0, 1] and can be negative for distant vectors;
        -- cosine distance (<=>) may be what was intended — confirm, and
        -- check the threshold default used by callers against this scale.
        SELECT
            id,
            plaintext,
            (1 - (embedding <-> query_embedding))::DOUBLE PRECISION AS vec_similarity
        FROM issues
        WHERE id <> current_id
        AND (1 - (embedding <-> query_embedding))::DOUBLE PRECISION > threshold
    ),
    text_similarity AS (
        -- Full-text rank over the same table; rows here need not have
        -- passed the vector threshold (they only contribute via the JOIN).
        SELECT
            id,
            plaintext,
            ts_rank(to_tsvector('english', plaintext), query_tsquery)::DOUBLE PRECISION AS text_sim
        FROM issues
        WHERE to_tsvector('english', plaintext) @@ query_tsquery
    )
    -- LEFT JOIN keeps vector matches without a full-text hit; their
    -- text_similarity is coalesced to 0 so ordering still works.
    SELECT
        vs.id AS issue_id,
        vs.plaintext AS issue_plaintext,
        vs.vec_similarity AS similarity,
        COALESCE(ts.text_sim, 0::DOUBLE PRECISION) AS text_similarity
    FROM vector_similarity vs
    LEFT JOIN text_similarity ts ON vs.id = ts.id
    ORDER BY (vs.vec_similarity + COALESCE(ts.text_sim, 0::DOUBLE PRECISION)) DESC
    LIMIT max_results;
END;
$$ LANGUAGE plpgsql;

-- Hybrid similarity search over issue_comments: blends pgvector embedding
-- similarity with Postgres full-text rank and returns up to max_results
-- rows (excluding the comment identified by current_id), ordered by the
-- sum of the two scores.
CREATE OR REPLACE FUNCTION find_similar_comments(
    current_id VARCHAR,
    query_text TEXT,
    query_embedding VECTOR(1024),
    threshold DOUBLE PRECISION,
    max_results INTEGER DEFAULT 10
)
RETURNS TABLE(
    comment_id VARCHAR,
    comment_plaintext TEXT,
    comment_issue_id VARCHAR,
    similarity DOUBLE PRECISION,       -- vector score: 1 - l2_distance(query, embedding)
    text_similarity DOUBLE PRECISION   -- ts_rank score; 0 when no full-text match
) AS $$
DECLARE
    query_tokens TEXT[];
    query_tsquery TSQUERY;
BEGIN
    -- Generate query tokens: distinct, lowercased words longer than 2 chars.
    SELECT array_agg(DISTINCT lower(word))
    INTO query_tokens
    FROM unnest(regexp_split_to_array(query_text, '\s+')) AS word
    WHERE length(word) > 2;

    -- Create a prefix-matching, OR-combined tsquery from the tokens.
    -- NOTE(review): if every word is <= 2 chars, query_tokens is NULL, so
    -- query_tsquery ends up NULL and the text_similarity CTE matches
    -- nothing — confirm that silent fallback is intended.
    SELECT to_tsquery(string_agg(lexeme || ':*', ' | '))
    INTO query_tsquery
    FROM unnest(query_tokens) lexeme;

    RETURN QUERY
    WITH vector_similarity AS (
        -- NOTE(review): l2_distance is unbounded, so 1 - distance is not
        -- confined to [0, 1] and can go negative; this also differs only
        -- in spelling from the <-> operator used by the sibling
        -- find_similar_issue_ftse function — consider unifying, and
        -- confirm cosine distance was not intended instead.
        SELECT
            id,
            plaintext,
            issue_id,
            1 - (l2_distance(query_embedding, embedding))::DOUBLE PRECISION AS vec_similarity
        FROM issue_comments
        WHERE id <> current_id
        AND 1 - (l2_distance(query_embedding, embedding))::DOUBLE PRECISION > threshold
    ),
    text_similarity AS (
        -- Full-text rank over the same table; rows here need not have
        -- passed the vector threshold (they only contribute via the JOIN).
        SELECT
            id,
            plaintext,
            issue_id,
            ts_rank(to_tsvector('english', plaintext), query_tsquery)::DOUBLE PRECISION AS text_sim
        FROM issue_comments
        WHERE to_tsvector('english', plaintext) @@ query_tsquery
    )
    -- LEFT JOIN keeps vector matches without a full-text hit; their
    -- text_similarity is coalesced to 0 so ordering still works.
    SELECT
        vs.id AS comment_id,
        vs.plaintext AS comment_plaintext,
        vs.issue_id AS comment_issue_id,
        vs.vec_similarity AS similarity,
        COALESCE(ts.text_sim, 0::DOUBLE PRECISION) AS text_similarity
    FROM vector_similarity vs
    LEFT JOIN text_similarity ts ON vs.id = ts.id
    ORDER BY (vs.vec_similarity + COALESCE(ts.text_sim, 0::DOUBLE PRECISION)) DESC
    LIMIT max_results;
END;
$$ LANGUAGE plpgsql;
1 change: 0 additions & 1 deletion tests/main.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ describe("Ask plugin tests", () => {

it("should ask GPT a question", async () => {
const ctx = createContext(TEST_SLASH_COMMAND);
console.log(ctx.adapters);
createComments([transformCommentTemplate(1, 1, TEST_QUESTION, "ubiquity", "test-repo", true)]);
const res = await askQuestion(ctx, TEST_QUESTION);

Expand Down

0 comments on commit 0f82015

Please sign in to comment.