diff --git a/.dev.vars.example b/.dev.vars.example
index b9e5cff..e39f3dd 100644
--- a/.dev.vars.example
+++ b/.dev.vars.example
@@ -1 +1,4 @@
-OPENAI_API_KEY="MY_SECRET"
+OPENAI_API_KEY=""
+SUPABASE_URL=""
+SUPABASE_KEY=""
+VOYAGEAI_API_KEY=""
\ No newline at end of file
diff --git a/package.json b/package.json
index ba6f618..c6bb216 100644
--- a/package.json
+++ b/package.json
@@ -17,7 +17,7 @@
     "knip-ci": "knip --no-exit-code --reporter json --config .github/knip.ts",
     "prepare": "husky install",
     "test": "jest --setupFiles dotenv/config --coverage",
-    "worker": "wrangler dev --env dev --port 4000"
+    "worker": "wrangler dev --env dev --port 5000"
   },
   "keywords": [
     "typescript",
diff --git a/src/adapters/index.ts b/src/adapters/index.ts
index c72c5f8..5040ebd 100644
--- a/src/adapters/index.ts
+++ b/src/adapters/index.ts
@@ -5,7 +5,7 @@ import { SuperSupabase } from "./supabase/helpers/supabase";
 import { Embedding as VoyageEmbedding } from "./voyage/helpers/embedding";
 import { SuperVoyage } from "./voyage/helpers/voyage";
 import { VoyageAIClient } from "voyageai";
-import { Issue } from "./supabase/helpers/issue";
+import { Issue } from "./supabase/helpers/issues";
 import { SuperOpenAi } from "./openai/helpers/openai";
 import OpenAI from "openai";
 import { Completions } from "./openai/helpers/completions";
diff --git a/src/adapters/openai/helpers/completions.ts b/src/adapters/openai/helpers/completions.ts
index 32669ac..83636ad 100644
--- a/src/adapters/openai/helpers/completions.ts
+++ b/src/adapters/openai/helpers/completions.ts
@@ -1,7 +1,7 @@
 import OpenAI from "openai";
 import { Context } from "../../../types";
 import { SuperOpenAi } from "./openai";
-const MAX_TOKENS = 3072;
+const MAX_TOKENS = 3000;
 
 export interface CompletionsType {
   answer: string;
@@ -46,7 +46,7 @@ export class Completions extends SuperOpenAi {
           ],
         },
       ],
-      temperature: 0,
+      temperature: 0.2,
      max_tokens: MAX_TOKENS,
      top_p: 1,
      frequency_penalty: 0,
diff --git a/src/adapters/supabase/helpers/comment.ts b/src/adapters/supabase/helpers/comment.ts
index 90dd8c2..e6aff31 100644
--- a/src/adapters/supabase/helpers/comment.ts
+++ b/src/adapters/supabase/helpers/comment.ts
@@ -12,6 +12,14 @@ export interface CommentType {
   embedding: number[];
 }
 
+export interface CommentSimilaritySearchResult {
+  comment_id: string;
+  comment_plaintext: string;
+  comment_issue_id: string;
+  similarity: number;
+  text_similarity: number;
+}
+
 export class Comment extends SuperSupabase {
   constructor(supabase: SupabaseClient, context: Context) {
     super(supabase, context);
@@ -24,12 +32,14 @@
     return data;
   }
 
-  async findSimilarComments(query: string, threshold: number, currentId: string): Promise<CommentType[] | null> {
+  async findSimilarComments(query: string, threshold: number, currentId: string): Promise<CommentSimilaritySearchResult[] | null> {
     const embedding = await this.context.adapters.voyage.embedding.createEmbedding(query);
-    const { data, error } = await this.supabase.rpc("find_similar_comments_with_vector_search_ftse", {
+    const { data, error } = await this.supabase.rpc("find_similar_comments", {
       current_id: currentId,
+      query_text: query,
       query_embedding: embedding,
       threshold: threshold,
+      max_results: 10,
     });
     if (error) {
       this.context.logger.error("Error finding similar comments", error);
diff --git a/src/adapters/supabase/helpers/issue.ts b/src/adapters/supabase/helpers/issues.ts
similarity index 93%
rename from src/adapters/supabase/helpers/issue.ts
rename to src/adapters/supabase/helpers/issues.ts
index bee8223..8bd083e 100644
--- a/src/adapters/supabase/helpers/issue.ts
+++ b/src/adapters/supabase/helpers/issues.ts
@@ -6,6 +6,7 @@ export interface IssueSimilaritySearchResult {
   issue_id: string;
   issue_plaintext: string;
   similarity: number;
+  text_similarity: number;
 }
 
 export interface IssueType {
@@ -33,10 +34,12 @@ export class Issue extends SuperSupabase {
   }
   async findSimilarIssues(plaintext: string, threshold: number, currentId: string): Promise<IssueSimilaritySearchResult[] | null> {
     const embedding = await this.context.adapters.voyage.embedding.createEmbedding(plaintext);
-    const { data, error } = await this.supabase.rpc("find_similar_issues_vector_search_ftse", {
+    const { data, error } = await this.supabase.rpc("find_similar_issue_ftse", {
       current_id: currentId,
+      query_text: plaintext,
       query_embedding: embedding,
       threshold: threshold,
+      max_results: 10,
     });
     if (error) {
       this.context.logger.error("Error finding similar issues", error);
diff --git a/src/adapters/voyage/helpers/embedding.ts b/src/adapters/voyage/helpers/embedding.ts
index 9943882..575543e 100644
--- a/src/adapters/voyage/helpers/embedding.ts
+++ b/src/adapters/voyage/helpers/embedding.ts
@@ -17,7 +17,7 @@ export class Embedding extends SuperVoyage {
     } else {
       const response = await this.client.embed({
         input: text,
-        model: "voyage-large-3",
+        model: "voyage-large-2-instruct",
       });
       return (response.data && response.data[0]?.embedding) || [];
     }
diff --git a/src/adapters/voyage/helpers/rerankers.ts b/src/adapters/voyage/helpers/rerankers.ts
index 08fadae..734063d 100644
--- a/src/adapters/voyage/helpers/rerankers.ts
+++ b/src/adapters/voyage/helpers/rerankers.ts
@@ -14,8 +14,9 @@ export class Rerankers extends SuperVoyage {
     const response = await this.client.rerank({
       query,
       documents: results,
-      model: "voyage-large-3",
+      model: "rerank-2",
       returnDocuments: true,
+      topK: 5,
     });
     const rerankedResults = response.data || [];
     return rerankedResults.map((result) => result.document).filter((document): document is string => document !== undefined);
diff --git a/src/handlers/ask-gpt.ts b/src/handlers/ask-gpt.ts
index 5dc83bc..90e5222 100644
--- a/src/handlers/ask-gpt.ts
+++ b/src/handlers/ask-gpt.ts
@@ -1,6 +1,7 @@
 import { Context } from "../types";
 import { CompletionsType } from "../adapters/openai/helpers/completions";
-import { CommentType } from "../adapters/supabase/helpers/comment";
+import { CommentSimilaritySearchResult } from "../adapters/supabase/helpers/comment";
+import { IssueSimilaritySearchResult } from "../adapters/supabase/helpers/issues";
 
 export async function askQuestion(context: Context, question: string) {
   if (!question) {
@@ -20,12 +21,23 @@ export async function askGpt(context: Context, question: string): Promise<CompletionsType> {
-  const similarText = similarComments.map((comment: CommentType) => comment.plaintext);
-  similarText.push(...similarIssues.map((issue) => issue.issue_plaintext));
+  const similarText = similarComments.map((comment: CommentSimilaritySearchResult) => comment.comment_plaintext);
+  similarText.push(...similarIssues.map((issue: IssueSimilaritySearchResult) => issue.issue_plaintext));
   //Rerank Similar Comments and Issues
   const rerankedText = await context.adapters.voyage.reranker.reRankResults(similarText, question);
+  //Remove unwanted characters from the reranked text in place
+  rerankedText.forEach((text, index) => (rerankedText[index] = removeUnwantedChars(text))); //TODO: Temporary workaround
   //const chat = createChatHistory(formattedChat);
   //logger.info(`Sending chat to OpenAI`, { chat });
   return context.adapters.openai.completions.createCompletion(question, model, rerankedText);
 }
+
+/**
+ * Removes unwanted characters from the text (anything that is not alphanumeric or whitespace, e.g. emojis and punctuation).
+ * @param text The text to sanitize
+ * @returns The sanitized text
+ */
+const removeUnwantedChars = (text: string) => {
+  return text.replace(/[^a-zA-Z0-9\s]/g, "");
+};
diff --git a/src/plugin.ts b/src/plugin.ts
index ab0de20..33f441d 100644
--- a/src/plugin.ts
+++ b/src/plugin.ts
@@ -16,9 +16,10 @@ export async function plugin(inputs: PluginInputs, env: Env) {
   const voyageClient = new VoyageAIClient({
     apiKey: env.VOYAGEAI_API_KEY,
   });
+
   const openaiClient = new OpenAI({
     apiKey: env.OPENAI_API_KEY,
-    baseURL: inputs.settings.openAiBaseUrl || "https://api.openai.com",
+    ...(inputs.settings.openAiBaseUrl && { baseURL: inputs.settings.openAiBaseUrl }),
   });
   const context: Context = {
     eventName: inputs.eventName,
diff --git a/src/types/plugin-inputs.ts b/src/types/plugin-inputs.ts
index 2c0546d..694bb50 100644
--- a/src/types/plugin-inputs.ts
+++ b/src/types/plugin-inputs.ts
@@ -22,7 +22,7 @@ export interface PluginInputs
+CREATE OR REPLACE FUNCTION find_similar_issue_ftse(
+  current_id VARCHAR,
+  query_text TEXT,
+  query_embedding VECTOR(1024),
+  threshold DOUBLE PRECISION,
+  max_results INTEGER DEFAULT 10
+)
+RETURNS TABLE(
+  issue_id VARCHAR,
+  issue_plaintext TEXT,
+  similarity DOUBLE PRECISION,
+  text_similarity DOUBLE PRECISION
+) AS $$
+DECLARE
+  query_tokens TEXT[];
+  query_tsquery TSQUERY;
+BEGIN
+  -- Generate query tokens
+  SELECT array_agg(DISTINCT lower(word))
+  INTO query_tokens
+  FROM unnest(regexp_split_to_array(query_text, '\s+')) AS word
+  WHERE length(word) > 2;
+
+  -- Create tsquery from tokens
+  SELECT to_tsquery(string_agg(lexeme || ':*', ' | '))
+  INTO query_tsquery
+  FROM unnest(query_tokens) lexeme;
+
+  RETURN QUERY
+  WITH vector_similarity AS (
+    SELECT
+      id,
+      plaintext,
+      (1 - (embedding <-> query_embedding))::DOUBLE PRECISION AS vec_similarity
+    FROM issues
+    WHERE id <> current_id
+    AND (1 - (embedding <-> query_embedding))::DOUBLE PRECISION > threshold
+  ),
+  text_similarity AS (
+    SELECT
+      id,
+      plaintext,
+      ts_rank(to_tsvector('english', plaintext), query_tsquery)::DOUBLE PRECISION AS text_sim
+    FROM issues
+    WHERE to_tsvector('english', plaintext) @@ query_tsquery
+  )
+  SELECT
+    vs.id AS issue_id,
+    vs.plaintext AS issue_plaintext,
+    vs.vec_similarity AS similarity,
+    COALESCE(ts.text_sim, 0::DOUBLE PRECISION) AS text_similarity
+  FROM vector_similarity vs
+  LEFT JOIN text_similarity ts ON vs.id = ts.id
+  ORDER BY (vs.vec_similarity + COALESCE(ts.text_sim, 0::DOUBLE PRECISION)) DESC
+  LIMIT max_results;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION find_similar_comments(
+  current_id VARCHAR,
+  query_text TEXT,
+  query_embedding VECTOR(1024),
+  threshold DOUBLE PRECISION,
+  max_results INTEGER DEFAULT 10
+)
+RETURNS TABLE(
+  comment_id VARCHAR,
+  comment_plaintext TEXT,
+  comment_issue_id VARCHAR,
+  similarity DOUBLE PRECISION,
+  text_similarity DOUBLE PRECISION
+) AS $$
+DECLARE
+  query_tokens TEXT[];
+  query_tsquery TSQUERY;
+BEGIN
+  -- Generate query tokens
+  SELECT array_agg(DISTINCT lower(word))
+  INTO query_tokens
+  FROM unnest(regexp_split_to_array(query_text, '\s+')) AS word
+  WHERE length(word) > 2;
+
+  -- Create tsquery from tokens
+  SELECT to_tsquery(string_agg(lexeme || ':*', ' | '))
+  INTO query_tsquery
+  FROM unnest(query_tokens) lexeme;
+
+  RETURN QUERY
+  WITH vector_similarity AS (
+    SELECT
+      id,
+      plaintext,
+      issue_id,
+      1 - (l2_distance(query_embedding, embedding))::DOUBLE PRECISION AS vec_similarity
+    FROM issue_comments
+    WHERE id <> current_id
+    AND 1 - (l2_distance(query_embedding, embedding))::DOUBLE PRECISION > threshold
+  ),
+  text_similarity AS (
+    SELECT
+      id,
+      plaintext,
+      issue_id,
+      ts_rank(to_tsvector('english', plaintext), query_tsquery)::DOUBLE PRECISION AS text_sim
+    FROM issue_comments
+    WHERE to_tsvector('english', plaintext) @@ query_tsquery
+  )
+  SELECT
+    vs.id AS comment_id,
+    vs.plaintext AS comment_plaintext,
+    vs.issue_id AS comment_issue_id,
+    vs.vec_similarity AS similarity,
+    COALESCE(ts.text_sim, 0::DOUBLE PRECISION) AS text_similarity
+  FROM vector_similarity vs
+  LEFT JOIN text_similarity ts ON vs.id = ts.id
+  ORDER BY (vs.vec_similarity + COALESCE(ts.text_sim, 0::DOUBLE PRECISION)) DESC
+  LIMIT max_results;
+END;
+$$ LANGUAGE plpgsql;
\ No newline at end of file
diff --git a/tests/main.test.ts b/tests/main.test.ts
index 7720ab8..067158e 100644
--- a/tests/main.test.ts
+++ b/tests/main.test.ts
@@ -61,7 +61,6 @@ describe("Ask plugin tests", () => {
   it("should ask GPT a question", async () => {
     const ctx = createContext(TEST_SLASH_COMMAND);
-    console.log(ctx.adapters);
     createComments([transformCommentTemplate(1, 1, TEST_QUESTION, "ubiquity", "test-repo", true)]);
     const res = await askQuestion(ctx, TEST_QUESTION);
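
Note on the new RPCs: both functions first filter rows by the pgvector similarity threshold, then order the survivors by the sum of the vector similarity and the `ts_rank` full-text score, with `COALESCE` zero-filling `text_similarity` for rows that pass the vector filter but have no keyword match. A quick way to sanity-check `find_similar_issue_ftse` against a local database is to reuse a stored embedding as the query vector. This is a minimal sketch, not part of the change itself; the issue id, query text, threshold, and limit below are placeholder values:

```sql
-- Hypothetical smoke test for find_similar_issue_ftse.
-- Reuses the stored 1024-dim embedding of an existing issue as the query vector,
-- so only the id, query text, threshold, and limit are made up here.
SELECT issue_id, similarity, text_similarity
FROM find_similar_issue_ftse(
  'some-issue-id',                                               -- current_id: excluded from results
  'pull request not counted toward rewards',                     -- query_text: drives the ts_rank score
  (SELECT embedding FROM issues WHERE id = 'some-issue-id'),     -- query_embedding: borrow a stored vector
  0.5,                                                           -- threshold on vector similarity
  5                                                              -- max_results
);
```

The same pattern applies to `find_similar_comments`, which additionally returns `comment_issue_id` so callers can group matching comments by their parent issue.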