Skip to content

Commit

Permalink
feat: turbopuffer vectorstore
Browse files Browse the repository at this point in the history
  • Loading branch information
mattzcarey committed Dec 23, 2023
1 parent 9006b80 commit 7088a85
Show file tree
Hide file tree
Showing 17 changed files with 551 additions and 51 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ dist
!jest.config.js
*.d.ts

**/data

cdk.context.json

# nextjs and sst
Expand Down
2 changes: 2 additions & 0 deletions packages/crgpt-loader/.env_example
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
OPENAI_API_KEY=<paste-key-here>
TURBOPUFFER_API_KEY=<paste-key-here>
48 changes: 0 additions & 48 deletions packages/crgpt-loader/crgpt-loader.ts

This file was deleted.

6 changes: 4 additions & 2 deletions packages/crgpt-loader/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
"name": "crgpt-loader",
"version": "0.0.1",
"description": "",
"main": "index.js",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
"test": "ts-node test.ts"
},
"keywords": [
"github",
Expand All @@ -16,6 +17,7 @@
"author": "Matt Carey",
"license": "MIT",
"dependencies": {
"dotenv": "^16.3.1",
"ignore": "^5.3.0",
"langchain": "^0.0.204"
},
Expand Down
8 changes: 8 additions & 0 deletions packages/crgpt-loader/pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions packages/crgpt-loader/src/constants.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/**
 * Directory names that are removed (recursively) from a checked-out
 * repository by {@link removeFoldersCommand}.
 */
export const removeFolders = [
  "node_modules",
  ".git",
  ".github",
  ".vscode",
  ".idea",
  "dist",
  "build",
  "out",
  "coverage",
  "tmp",
  "temp",
  "log",
  "logs",
];

/** Package-manager lock files that are removed by {@link removeFilesCommand}. */
export const lockFiles = ["package-lock.json", "pnpm-lock.yaml", "yarn.lock"];

/**
 * Wraps a value in single quotes for a POSIX shell, escaping any embedded
 * single quote, so the shell passes it through as one literal word.
 */
const shellQuote = (value: string): string =>
  `'${value.replace(/'/g, "'\\''")}'`;

/**
 * Builds a `find` command that deletes every directory under `dir` whose
 * name is listed in {@link removeFolders}.
 *
 * @param dir - Root directory to clean. It is shell-quoted, so paths that
 *   contain spaces or shell metacharacters are handled literally.
 * @returns The shell command as a single string.
 */
export const removeFoldersCommand = (dir: string): string => {
  const nameTests = removeFolders
    .map((folder) => `-name ${shellQuote(folder)}`)
    .join(" -o ");

  // -mindepth 1: never match (and delete) the root itself, e.g. when `dir`
  //              happens to be called "tmp" or "build".
  // -prune:      do not walk into a directory that is about to be removed.
  return `find ${shellQuote(
    dir
  )} -mindepth 1 -type d \\( ${nameTests} \\) -prune -exec rm -rf {} +`;
};

/**
 * Builds a `find` command that deletes every regular file under `dir` whose
 * name is listed in {@link lockFiles}.
 *
 * @param dir - Root directory to clean. It is shell-quoted, so paths that
 *   contain spaces or shell metacharacters are handled literally.
 * @returns The shell command as a single string.
 */
export const removeFilesCommand = (dir: string): string => {
  const nameTests = lockFiles
    .map((file) => `-name ${shellQuote(file)}`)
    .join(" -o ");

  return `find ${shellQuote(dir)} -type f \\( ${nameTests} \\) -delete`;
};
212 changes: 212 additions & 0 deletions packages/crgpt-loader/src/crgpt-loader.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
import axios, { AxiosResponse } from "axios";
import dotenv from "dotenv";
import { promises as fsPromises } from "fs";
import { Document } from "langchain/document";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import os from "os";
import path from "path";
import { removeFilesCommand, removeFoldersCommand } from "./constants";
import { executeCommand, openFile, savePage } from "./utils";

dotenv.config();

/**
 * Clones a git repository, turns its files into embedded text chunks and
 * stores / reads / deletes them in a turbopuffer namespace named after the
 * repository.
 *
 * Requires `TURBOPUFFER_API_KEY` in the environment for every turbopuffer
 * call.
 */
export class CRGPTLoader {
  private readonly link: string;
  private readonly embeddings: OpenAIEmbeddings;

  /**
   * @param link - Clone URL of the repository (anything `git clone` accepts).
   */
  constructor(link: string) {
    this.link = link;
    this.embeddings = new OpenAIEmbeddings();
  }

  /**
   * Wraps a value in single quotes for a POSIX shell, escaping embedded
   * single quotes, so it is passed through as one literal word.
   */
  private static shellQuote(value: string): string {
    return `'${value.replace(/'/g, "'\\''")}'`;
  }

  /**
   * Derives the repository name (used as the default namespace) from the
   * link: the last path segment, ignoring trailing slashes and a trailing
   * `.git`.
   *
   * @throws Error if no name can be derived (e.g. an empty link).
   */
  private extractRepoName(): string {
    const trimmed = this.link.trim().replace(/\/+$/, "");
    const lastSegment = trimmed.slice(trimmed.lastIndexOf("/") + 1);
    const name = lastSegment.replace(/\.git$/i, "");

    if (!name) {
      throw new Error(
        `Cannot derive a repository name from link: "${this.link}"`
      );
    }
    return name;
  }

  /**
   * Builds the Authorization header for turbopuffer.
   *
   * @throws Error if `TURBOPUFFER_API_KEY` is not set, instead of sending
   *   `Bearer undefined`.
   */
  private authHeaders(): { Authorization: string } {
    const apiKey = process.env.TURBOPUFFER_API_KEY;
    if (!apiKey) {
      throw new Error("TURBOPUFFER_API_KEY is not set");
    }
    return { Authorization: `Bearer ${apiKey}` };
  }

  /** Returns the vectors endpoint for a namespace (URL-encoded). */
  private endpointFor(namespace: string): string {
    return `https://api.turbopuffer.com/v1/vectors/${encodeURIComponent(
      namespace
    )}`;
  }

  /** Splits documents into chunks of at most 1500 characters. */
  private async splitDocuments(documents: Document[]): Promise<Document[]> {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1500,
    });

    return splitter.splitDocuments(documents);
  }

  /** Reads one file and wraps its content in a Document tagged with its path. */
  private async buildDocument(
    filePath: string
  ): Promise<Document<{ source: string }>> {
    return new Document({
      pageContent: await openFile(filePath),
      metadata: {
        source: filePath,
      },
    });
  }

  /** Embeds the page content of every document, in input order. */
  private async getEmbeddings(documents: Document[]): Promise<number[][]> {
    return this.embeddings.embedDocuments(
      documents.map((doc) => doc.pageContent)
    );
  }

  /**
   * Posts documents and their vectors to turbopuffer in a single request.
   *
   * @param documents - Chunks to store; ids are their array indices.
   * @param embeddings - One vector per document, in the same order.
   * @param indexName - Target namespace; defaults to the repository name.
   * @throws Rethrows any request or configuration error after logging it.
   */
  private async storeDocuments(
    documents: Document[],
    embeddings: number[][],
    indexName = this.extractRepoName()
  ): Promise<void> {
    try {
      const ids = documents.map((_, index) => index);
      const attributes = {
        source: documents.map((doc) => doc.metadata.source),
        pageContent: documents.map((doc) => doc.pageContent),
      };

      const headers = {
        ...this.authHeaders(),
        "Content-Type": "application/json",
      };

      await axios.post(
        this.endpointFor(indexName),
        {
          ids,
          vectors: embeddings,
          attributes,
        },
        { headers }
      );
    } catch (error) {
      console.error("Error storing documents:", error);
      throw error;
    }
  }

  /**
   * Clones the repository, strips unwanted files, embeds the remaining
   * content and stores it. Errors are logged, not rethrown. The temporary
   * clone is removed whether or not the run succeeded.
   */
  public async load(): Promise<void> {
    let tempDir: string | undefined;

    try {
      tempDir = await this.cloneRepository();
      await this.removeUnwantedFilesAndFolders(tempDir);

      const documents = await this.createDocuments(tempDir);

      const chunks = await this.splitDocuments(documents);
      const embeddings = await this.getEmbeddings(chunks);

      await this.storeDocuments(chunks, embeddings);
      console.log("Documents stored");
    } catch (error) {
      console.error("Error in CRGPTLoader:", error);
    } finally {
      // Always remove the clone, including when a step above failed.
      if (tempDir !== undefined) {
        await this.cleanup(tempDir);
      }
    }
  }

  /**
   * Shallow-clones the repository into a fresh temporary directory.
   *
   * @returns Path of the temporary directory containing the clone.
   * @throws Rethrows the clone error after removing the temporary directory.
   */
  private async cloneRepository(): Promise<string> {
    const tempDir = await fsPromises.mkdtemp(
      path.join(os.tmpdir(), "CRGPTLoader-")
    );

    // Both arguments are quoted and placed after `--` so a link cannot be
    // parsed as a git option or as additional shell syntax. This presumes
    // executeCommand runs its string through a shell, as the find commands
    // built in ./constants already rely on.
    const cloneCommand = `git clone --depth 1 -- ${CRGPTLoader.shellQuote(
      this.link
    )} ${CRGPTLoader.shellQuote(tempDir)}`;

    try {
      await executeCommand(cloneCommand);
    } catch (error) {
      await this.cleanup(tempDir);
      throw error;
    }
    return tempDir;
  }

  /**
   * Best-effort removal of ignored folders and lock files from the clone;
   * failures are logged and do not abort the load.
   */
  private async removeUnwantedFilesAndFolders(tempDir: string): Promise<void> {
    try {
      await executeCommand(removeFoldersCommand(tempDir));
      await executeCommand(removeFilesCommand(tempDir));
    } catch (error) {
      console.error("Error removing files or folders:", error);
    }
  }

  /**
   * Removes the temporary clone. Never throws: a failed cleanup is logged so
   * it cannot mask the error that triggered it.
   */
  private async cleanup(tempDir: string): Promise<void> {
    try {
      await fsPromises.rm(tempDir, { recursive: true, force: true });
    } catch (error) {
      console.error("Error cleaning up temporary directory:", error);
    }
  }

  /**
   * Recursively builds one Document per regular file under `directory`.
   * Files that cannot be read are logged and skipped.
   */
  private async createDocuments(
    directory: string
  ): Promise<Document<{ source: string }>[]> {
    const entries = await fsPromises.readdir(directory, {
      withFileTypes: true,
    });
    const documents: Document<{ source: string }>[] = [];

    for (const entry of entries) {
      const fullPath = path.join(directory, entry.name);
      if (entry.isDirectory()) {
        documents.push(...(await this.createDocuments(fullPath)));
      } else if (entry.isFile()) {
        try {
          const document = await this.buildDocument(fullPath);
          documents.push(document);
        } catch (error) {
          console.error(`Error reading file ${entry.name}:`, error);
        }
      }
    }

    return documents;
  }

  /**
   * Downloads every page of the repository's namespace and hands each page
   * to `savePage` under `./data`.
   *
   * A 202 response is treated as "not ready yet": the same cursor is
   * requested again after 5 seconds. There is no retry limit.
   *
   * @throws Rethrows any request error after logging it.
   */
  public async read(): Promise<void> {
    const namespace = this.extractRepoName();
    const dataDir = "./data";
    let nextCursor: string | null = null;
    let pageIndex = 0;

    try {
      // An explicit loop is used instead of `do … while (nextCursor)`:
      // `continue` in a do-while jumps to the condition, which would end the
      // loop on a 202 for the first page (cursor still null).
      for (;;) {
        const response: AxiosResponse = await this.fetchPage(
          namespace,
          nextCursor
        );

        if (response.status === 202) {
          // Data not ready, wait 5 seconds and retry the same cursor.
          await new Promise((resolve) => setTimeout(resolve, 5000));
          continue;
        }

        const { ids, vectors, attributes, next_cursor } = response.data;
        // savePage's signature is not visible here; awaiting is harmless if
        // it is synchronous and surfaces failures if it returns a promise.
        await savePage(dataDir, pageIndex, ids, vectors, attributes);
        pageIndex++;

        if (!next_cursor) {
          break;
        }
        nextCursor = next_cursor;
      }
    } catch (error) {
      console.error("Error fetching data:", error);
      throw error;
    }
  }

  /** Fetches one page of vectors, optionally continuing from `cursor`. */
  private async fetchPage(
    namespace: string,
    cursor: string | null
  ): Promise<AxiosResponse> {
    const params = cursor ? { cursor } : {};

    return axios.get(this.endpointFor(namespace), {
      headers: this.authHeaders(),
      params,
      maxContentLength: Infinity,
      maxBodyLength: Infinity,
    });
  }

  /**
   * Deletes a namespace from turbopuffer.
   *
   * @param indexName - Namespace to delete; defaults to the repository name.
   * @throws Rethrows any request or configuration error after logging it.
   */
  public async delete(indexName = this.extractRepoName()): Promise<void> {
    try {
      const response = await axios.delete(this.endpointFor(indexName), {
        headers: this.authHeaders(),
      });

      console.log("Delete response:", response.data);
    } catch (error) {
      console.error("Error deleting documents:", error);
      throw error;
    }
  }
}
File renamed without changes.
2 changes: 2 additions & 0 deletions packages/crgpt-loader/src/lc_wip/githubLoader.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
//WIP GitHub loader integration using UNIX commands

Loading

0 comments on commit 7088a85

Please sign in to comment.