From 0bb50f56b155bd673b78f5f2ff85e14b3cba7e34 Mon Sep 17 00:00:00 2001
From: Rob Gordon
Date: Wed, 10 Jan 2024 17:56:52 -0500
Subject: [PATCH] Speech-to-text working

---
 .gitignore                        |   4 +-
 api/package.json                  |   4 +
 api/prompt/speech-to-text.ts      |  27 +++++++
 app/src/components/EditWithAI.tsx |  59 +++++++-----
 app/src/components/Microphone.tsx |  70 ++++++++++++++++
 app/src/lib/useMicrophoneStore.ts |  61 ++++++++++++++
 pnpm-lock.yaml                    | 129 +++++++++++++++++++++++++++---
 7 files changed, 320 insertions(+), 34 deletions(-)
 create mode 100644 api/prompt/speech-to-text.ts
 create mode 100644 app/src/components/Microphone.tsx
 create mode 100644 app/src/lib/useMicrophoneStore.ts

diff --git a/.gitignore b/.gitignore
index 96f044b59..67374d2fe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,6 @@ TODO.md
 keys
 ERROR.png
 flowchart-fun.feature-reacher.json
-.parcel-cache
\ No newline at end of file
+.parcel-cache
+
+speech*.mp4
\ No newline at end of file
diff --git a/api/package.json b/api/package.json
index 0047e7b8e..cd021e8ad 100644
--- a/api/package.json
+++ b/api/package.json
@@ -16,12 +16,15 @@
     "@sendgrid/mail": "^7.4.6",
     "@supabase/supabase-js": "^2.31.0",
     "ajv": "^8.12.0",
+    "axios": "^0.27.2",
     "csv-parse": "^5.3.6",
     "date-fns": "^2.29.3",
     "graph-selector": "^0.10.0",
     "highlight.js": "^11.8.0",
     "marked": "^4.1.1",
+    "micro": "^10.0.1",
     "moniker": "^0.1.2",
+    "multer": "1.4.5-lts.1",
     "notion-to-md": "^2.5.5",
     "openai": "^4.24.2",
     "shared": "workspace:*",
@@ -33,6 +36,7 @@
     "@swc/jest": "^0.2.24",
     "@types/jest": "^29.0.0",
     "@types/marked": "^4.0.7",
+    "@types/multer": "^1.4.11",
     "@types/node": "^18.16.17",
     "@typescript-eslint/eslint-plugin": "^6.2.0",
     "@typescript-eslint/parser": "^6.2.0",
diff --git a/api/prompt/speech-to-text.ts b/api/prompt/speech-to-text.ts
new file mode 100644
index 000000000..d85e60892
--- /dev/null
+++ b/api/prompt/speech-to-text.ts
@@ -0,0 +1,27 @@
+import { VercelApiHandler } from "@vercel/node";
+import { openai } from "../_lib/_openai";
+import { toFile } from "openai";
+
+const handler: VercelApiHandler = async (req, res) => {
+  try {
+    const { audioUrl } = req.body;
+
+    if (!audioUrl) {
+      res.status(400).json({ ok: false, error: "No audioUrl provided" });
+      return;
+    }
+
+    const base64Data = audioUrl.split(";base64,").pop();
+    const binaryData = Buffer.from(base64Data, "base64");
+    const transcription = await openai.audio.transcriptions.create({
+      file: await toFile(binaryData, "audio.mp4"),
+      model: "whisper-1",
+    });
+    res.send(transcription.text);
+  } catch (error) {
+    console.error(error);
+    res.status(500).json({ ok: false, error: "Something went wrong" });
+  }
+};
+
+export default handler;
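
Note on the new endpoint: with Vercel's file-based routing, api/prompt/speech-to-text.ts is served at /api/prompt/speech-to-text. The handler expects a JSON body whose audioUrl field is a base64 data URL (e.g. "data:audio/mp4;base64,...") and, because it uses res.send(transcription.text), responds with the plain transcript text rather than JSON. A minimal client call might look like the following sketch; it is illustrative only and not part of the patch:

// Sketch: POST a recorded clip (as a base64 data URL) to the new endpoint
// and read back the Whisper transcript. Error handling kept minimal.
async function transcribe(audioUrl: string): Promise<string> {
  const response = await fetch("/api/prompt/speech-to-text", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ audioUrl }),
  });
  if (!response.ok) throw new Error("Transcription request failed");
  // The handler sends the transcription as plain text, not JSON.
  return response.text();
}
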
diff --git a/app/src/components/EditWithAI.tsx b/app/src/components/EditWithAI.tsx
index 0dde05c83..f0f029d79 100644
--- a/app/src/components/EditWithAI.tsx
+++ b/app/src/components/EditWithAI.tsx
@@ -1,5 +1,5 @@
-import { MagicWand, Microphone, Robot } from "phosphor-react";
-import { Button2, IconButton2 } from "../ui/Shared";
+import { MagicWand, Robot } from "phosphor-react";
+import { Button2 } from "../ui/Shared";
 import * as Popover from "@radix-ui/react-popover";
 import { Trans, t } from "@lingui/macro";
 import { useCallback, useRef, useState } from "react";
@@ -7,6 +7,7 @@ import { useDoc } from "../lib/useDoc";
 import { parse, stringify, Graph as GSGraph } from "graph-selector";
 import { useMutation } from "react-query";
 import * as Toast from "@radix-ui/react-toast";
+import { Microphone } from "./Microphone";
 
 // The Graph type we send to AI is slightly different from the internal representation
 type GraphForAI = {
@@ -24,7 +25,8 @@ type GraphForAI = {
 export function EditWithAI() {
   const [message, setMessage] = useState<string | null>(null);
   const [isOpen, setIsOpen] = useState(false);
-  const { mutate: edit, isLoading } = useMutation({
+  const [transcriptionLoading, setTranscriptionLoading] = useState(false);
+  const { mutate: edit, isLoading: editIsLoading } = useMutation({
     mutationFn: async (body: { prompt: string; graph: GraphForAI }) => {
       // /api/prompt/edit
       const response = await fetch("/api/prompt/edit", {
@@ -35,6 +37,7 @@ export function EditWithAI() {
         },
       });
       const data = await response.json();
+
       return data as {
         message: string;
         toolCalls: {
@@ -43,7 +46,6 @@ export function EditWithAI() {
         }[];
       };
     },
-    onMutate: () => setIsOpen(false),
     onSuccess(data) {
       if (data.message) {
         setMessage(data.message);
@@ -59,14 +61,14 @@ export function EditWithAI() {
         }
       }
     },
+    onSettled() {
+      setTranscriptionLoading(false);
+    },
   });
 
-  const handleSubmit = useCallback(
-    (e: React.FormEvent<HTMLFormElement>) => {
-      e.preventDefault();
-      const formData = new FormData(e.currentTarget);
-      const prompt = formData.get("prompt") as string;
-      if (!prompt) return;
+  const submitPrompt = useCallback(
+    (prompt: string) => {
+      setIsOpen(false);
 
       const text = useDoc.getState().text;
       const _graph = parse(text);
@@ -117,8 +119,28 @@ export function EditWithAI() {
     [edit]
   );
 
+  const handleSubmit = useCallback(
+    (e: React.FormEvent<HTMLFormElement>) => {
+      e.preventDefault();
+
+      const formData = new FormData(e.currentTarget);
+      const prompt = formData.get("prompt") as string;
+      if (!prompt) return;
+
+      submitPrompt(prompt);
+    },
+    [submitPrompt]
+  );
+
+  const handleSend = useCallback(() => {
+    setTranscriptionLoading(true);
+    setIsOpen(false);
+  }, []);
+
   const formRef = useRef<HTMLFormElement>(null);
 
+  const isLoading = editIsLoading || transcriptionLoading;
+
   return (
     <>
@@ -142,13 +164,13 @@ export function EditWithAI() {
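
The patch also creates app/src/components/Microphone.tsx and app/src/lib/useMicrophoneStore.ts, but their hunks are not shown above. As a rough, hypothetical sketch of the capture side they imply — not the actual component code — a MediaRecorder-based recorder that produces the data URL the speech-to-text handler expects could look like this:

// Hypothetical sketch only: capture microphone audio for a fixed duration and
// resolve a base64 data URL ("data:<mime>;base64,...") suitable for the
// speech-to-text handler. The actual MIME type depends on the browser; the
// server names the upload "audio.mp4" regardless.
async function recordClip(durationMs: number): Promise<string> {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const recorder = new MediaRecorder(stream);
  const chunks: Blob[] = [];
  recorder.ondataavailable = (e) => chunks.push(e.data);

  return new Promise<string>((resolve, reject) => {
    recorder.onstop = () => {
      // Release the microphone, then encode the recording as a data URL.
      stream.getTracks().forEach((track) => track.stop());
      const blob = new Blob(chunks, { type: recorder.mimeType });
      const reader = new FileReader();
      reader.onloadend = () => resolve(reader.result as string);
      reader.onerror = reject;
      reader.readAsDataURL(blob); // yields "data:...;base64,..."
    };
    recorder.start();
    setTimeout(() => recorder.stop(), durationMs);
  });
}

A component along these lines would hand the resulting string to a call like the transcribe() sketch above and feed the transcript into submitPrompt, which is what EditWithAI's handleSend and transcriptionLoading wiring suggest.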