-
Notifications
You must be signed in to change notification settings - Fork 179
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
9006b80
commit 7088a85
Showing
17 changed files
with
551 additions
and
51 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,8 @@ dist | |
!jest.config.js | ||
*.d.ts | ||
|
||
**/data | ||
|
||
cdk.context.json | ||
|
||
# nextjs and sst | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
OPENAI_API_KEY=<paste-key-here> | ||
TURBOPUFFER_API_KEY=<paste-key-here> |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
/**
 * Directory names that `removeFoldersCommand` matches (via `find -name`)
 * and deletes recursively.
 */
export const removeFolders = [
  // dependencies, VCS and editor metadata
  "node_modules", ".git", ".github", ".vscode", ".idea",
  // build output and reports
  "dist", "build", "out", "coverage",
  // scratch and log directories
  "tmp", "temp", "log", "logs",
];

/** Lock-file names that `removeFilesCommand` matches (via `find -name`) and deletes. */
export const lockFiles = [
  "package-lock.json",
  "pnpm-lock.yaml",
  "yarn.lock",
];
|
||
/**
 * Builds a `find` command that recursively deletes every directory under
 * `dir` whose name appears in `removeFolders`.
 *
 * @param dir - Root directory to search. It is single-quoted for a POSIX
 *   shell, so paths containing spaces or shell metacharacters are passed to
 *   `find` as one literal argument.
 * @returns The shell command string; nothing is executed here.
 */
export const removeFoldersCommand = (dir: string): string => {
  // POSIX single-quote escaping: close the quote, emit an escaped quote, reopen.
  const quotedDir = `'${dir.replace(/'/g, "'\\''")}'`;
  const nameTests = removeFolders
    .map((folder) => `-name '${folder}'`)
    .join(" -o ");

  // -prune stops find from descending into a directory that is about to be
  // removed (e.g. walking all of node_modules first).
  return `find ${quotedDir} -type d \\( ${nameTests} \\) -prune -exec rm -rf {} +`;
};
|
||
/**
 * Builds a `find` command that deletes every regular file under `dir` whose
 * name appears in `lockFiles`.
 *
 * @param dir - Root directory to search. It is single-quoted for a POSIX
 *   shell, so paths containing spaces or shell metacharacters are passed to
 *   `find` as one literal argument.
 * @returns The shell command string; nothing is executed here.
 */
export const removeFilesCommand = (dir: string): string => {
  // POSIX single-quote escaping: close the quote, emit an escaped quote, reopen.
  const quotedDir = `'${dir.replace(/'/g, "'\\''")}'`;
  const nameTests = lockFiles.map((file) => `-name '${file}'`).join(" -o ");

  return `find ${quotedDir} -type f \\( ${nameTests} \\) -delete`;
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,212 @@ | ||
import axios, { AxiosResponse } from "axios"; | ||
import dotenv from "dotenv"; | ||
import { promises as fsPromises } from "fs"; | ||
import { Document } from "langchain/document"; | ||
import { OpenAIEmbeddings } from "langchain/embeddings/openai"; | ||
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; | ||
import os from "os"; | ||
import path from "path"; | ||
import { removeFilesCommand, removeFoldersCommand } from "./constants"; | ||
import { executeCommand, openFile, savePage } from "./utils"; | ||
|
||
// Load variables from a local .env file into process.env so that values such
// as TURBOPUFFER_API_KEY (read further down in this module) are available.
dotenv.config(); | ||
|
||
/**
 * Loads the contents of a Git repository into a Turbopuffer vector namespace.
 *
 * `load()` shallow-clones the repository into a temporary directory, removes
 * unwanted folders and lock files, splits every remaining file into chunks,
 * embeds the chunks with `OpenAIEmbeddings`, and POSTs them to Turbopuffer.
 * `read()` pages the stored vectors back out and hands each page to
 * `savePage`; `delete()` removes the namespace.
 *
 * The namespace defaults to the last path segment of the repository link.
 */
export class CRGPTLoader {
  private readonly link: string;
  private readonly embeddings: OpenAIEmbeddings;

  /**
   * @param link - Repository URL passed to `git clone`; its last non-empty
   *   path segment is used as the Turbopuffer namespace.
   */
  constructor(link: string) {
    this.link = link;
    this.embeddings = new OpenAIEmbeddings();
  }

  /**
   * Quotes a value as a single POSIX-shell argument.
   * NOTE(review): assumes `executeCommand` runs its string through a POSIX
   * shell (the `find` commands it already receives rely on shell escapes).
   */
  private static shellQuote(value: string): string {
    return `'${value.replace(/'/g, "'\\''")}'`;
  }

  /** Builds the Turbopuffer vectors endpoint for a namespace. */
  private namespaceUrl(namespace: string): string {
    return `https://api.turbopuffer.com/v1/vectors/${encodeURIComponent(
      namespace
    )}`;
  }

  /**
   * Builds the bearer-token header for Turbopuffer requests.
   *
   * @throws Error when `TURBOPUFFER_API_KEY` is missing, instead of sending
   *   the literal token "undefined".
   */
  private authHeaders(): { Authorization: string } {
    const apiKey = process.env.TURBOPUFFER_API_KEY;
    if (!apiKey) {
      throw new Error("TURBOPUFFER_API_KEY is not set");
    }
    return { Authorization: `Bearer ${apiKey}` };
  }

  /**
   * Returns the last non-empty `/`-separated segment of the link, so a
   * trailing slash does not produce an empty namespace.
   */
  private extractRepoName(): string {
    const segments = this.link.split("/").filter((part) => part.length > 0);
    return segments[segments.length - 1] ?? "";
  }

  /** Splits documents into chunks of at most 1500 characters. */
  private async splitDocuments(documents: Document[]): Promise<Document[]> {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1500,
    });

    return splitter.splitDocuments(documents);
  }

  /** Reads one file via `openFile` and wraps it in a Document tagged with its path. */
  private async buildDocument(
    filePath: string
  ): Promise<Document<{ source: string }>> {
    return new Document({
      pageContent: await openFile(filePath),
      metadata: {
        source: filePath,
      },
    });
  }

  /** Embeds the page content of every document, one vector per document. */
  private async getEmbeddings(documents: Document[]): Promise<number[][]> {
    return this.embeddings.embedDocuments(
      documents.map((doc) => doc.pageContent)
    );
  }

  /**
   * POSTs documents and their vectors to the Turbopuffer namespace.
   *
   * Ids are the positional indices of `documents`; `source` and
   * `pageContent` are sent as column-oriented attributes.
   *
   * @param documents - Chunks to store.
   * @param embeddings - One vector per entry of `documents`, in the same order.
   * @param indexName - Target namespace; defaults to the repository name.
   * @throws Rethrows any request or configuration error after logging it.
   */
  private async storeDocuments(
    documents: Document[],
    embeddings: number[][],
    indexName = this.extractRepoName()
  ): Promise<void> {
    try {
      const ids = documents.map((_, index) => index);
      const attributes = {
        source: documents.map((doc) => doc.metadata.source),
        pageContent: documents.map((doc) => doc.pageContent),
      };

      const headers = {
        ...this.authHeaders(),
        "Content-Type": "application/json",
      };

      await axios.post(
        this.namespaceUrl(indexName),
        {
          ids,
          vectors: embeddings,
          attributes,
        },
        { headers }
      );
    } catch (error) {
      console.error("Error storing documents:", error);
      throw error;
    }
  }

  /**
   * Clones, cleans, chunks, embeds and stores the repository.
   *
   * Failures are logged and not rethrown. The temporary clone is removed in
   * every case, including when an intermediate step fails.
   */
  public async load(): Promise<void> {
    let tempDir: string | undefined;

    try {
      tempDir = await this.cloneRepository();
      await this.removeUnwantedFilesAndFolders(tempDir);

      const documents = await this.createDocuments(tempDir);

      const chunks = await this.splitDocuments(documents);
      const embeddings = await this.getEmbeddings(chunks);

      await this.storeDocuments(chunks, embeddings);
      console.log("Documents stored");
    } catch (error) {
      console.error("Error in CRGPTLoader:", error);
    } finally {
      // Runs on success and failure so the clone never lingers in the OS temp dir.
      if (tempDir !== undefined) {
        try {
          await this.cleanup(tempDir);
        } catch (cleanupError) {
          console.error("Error in CRGPTLoader:", cleanupError);
        }
      }
    }
  }

  /**
   * Shallow-clones the repository into a fresh temporary directory.
   *
   * @returns Path of the directory containing the clone.
   * @throws Rethrows the clone failure after removing the temporary directory.
   */
  private async cloneRepository(): Promise<string> {
    const tempDir = await fsPromises.mkdtemp(
      path.join(os.tmpdir(), "CRGPTLoader-")
    );

    // Quote both arguments and end option parsing with `--` so the link can
    // neither inject shell syntax nor be read by git as an option.
    const cloneCommand = `git clone --depth 1 -- ${CRGPTLoader.shellQuote(
      this.link
    )} ${CRGPTLoader.shellQuote(tempDir)}`;

    try {
      await executeCommand(cloneCommand);
    } catch (error) {
      try {
        await this.cleanup(tempDir);
      } catch (cleanupError) {
        // Keep the clone failure as the error that propagates.
        console.error("Error removing files or folders:", cleanupError);
      }
      throw error;
    }

    return tempDir;
  }

  /**
   * Deletes unwanted folders and lock files from the clone. Best-effort:
   * failures are logged and loading continues.
   */
  private async removeUnwantedFilesAndFolders(tempDir: string): Promise<void> {
    try {
      await executeCommand(removeFoldersCommand(tempDir));
      await executeCommand(removeFilesCommand(tempDir));
    } catch (error) {
      console.error("Error removing files or folders:", error);
    }
  }

  /** Recursively removes the temporary clone; a missing directory is not an error. */
  private async cleanup(tempDir: string): Promise<void> {
    await fsPromises.rm(tempDir, { recursive: true, force: true });
  }

  /**
   * Recursively walks `directory` and builds one Document per regular file.
   * Files that cannot be read are logged and skipped.
   */
  private async createDocuments(
    directory: string
  ): Promise<Document<{ source: string }>[]> {
    const entries = await fsPromises.readdir(directory, {
      withFileTypes: true,
    });
    const documents: Document<{ source: string }>[] = [];

    for (const entry of entries) {
      const fullPath = path.join(directory, entry.name);
      if (entry.isDirectory()) {
        documents.push(...(await this.createDocuments(fullPath)));
      } else if (entry.isFile()) {
        try {
          const document = await this.buildDocument(fullPath);
          documents.push(document);
        } catch (error) {
          console.error(`Error reading file ${entry.name}:`, error);
        }
      }
    }

    return documents;
  }

  /**
   * Pages through the namespace and passes each page to `savePage` under
   * `./data`.
   *
   * A 202 response means the data is not ready yet: the same cursor is
   * requested again after 5 seconds. Retries are unbounded.
   *
   * @throws Rethrows any fetch or save error after logging it.
   */
  public async read(): Promise<void> {
    const namespace = this.extractRepoName();
    const dataDir = "./data";
    let nextCursor: string | null = null;
    let pageIndex = 0;
    let hasMore = true;

    // An explicit flag is used rather than `do…while (nextCursor)`: a
    // `continue` in a do-while jumps to the condition, so a 202 on the first
    // page (cursor still null) would end the loop instead of retrying.
    while (hasMore) {
      try {
        const response = await this.fetchPage(namespace, nextCursor);

        if (response.status === 202) {
          // Data not ready, wait and retry
          await new Promise<void>((resolve) => setTimeout(resolve, 5000)); // wait for 5 seconds
          continue;
        }

        const { ids, vectors, attributes, next_cursor } = response.data;
        // Awaited in case savePage is asynchronous; harmless if it is not.
        await savePage(dataDir, pageIndex, ids, vectors, attributes);

        nextCursor = next_cursor ?? null;
        pageIndex++;
        hasMore = Boolean(nextCursor);
      } catch (error) {
        console.error("Error fetching data:", error);
        throw error;
      }
    }
  }

  /**
   * Fetches one page of vectors from the namespace.
   *
   * @param namespace - Turbopuffer namespace to read.
   * @param cursor - Cursor from the previous page, or `null` for the first page.
   */
  private async fetchPage(
    namespace: string,
    cursor: string | null
  ): Promise<AxiosResponse> {
    const params = cursor ? { cursor } : {};

    return axios.get(this.namespaceUrl(namespace), {
      headers: this.authHeaders(),
      params,
      // Lift axios' size limits for this request.
      maxContentLength: Infinity,
      maxBodyLength: Infinity,
    });
  }

  /**
   * Deletes the Turbopuffer namespace.
   *
   * @param indexName - Namespace to delete; defaults to the repository name.
   * @throws Rethrows any request or configuration error after logging it.
   */
  public async delete(indexName = this.extractRepoName()): Promise<void> {
    try {
      const response = await axios.delete(this.namespaceUrl(indexName), {
        headers: this.authHeaders(),
      });

      // Log the response body
      console.log("Delete response:", response.data);
    } catch (error) {
      console.error("Error deleting documents:", error);
      throw error;
    }
  }
}
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
// WIP: GitHub loader integration using UNIX commands | ||
|
Oops, something went wrong.