Skip to content

Commit

Permalink
Speech-to-text working
Browse files Browse the repository at this point in the history
  • Loading branch information
rob-gordon committed Jan 10, 2024
1 parent c9bccf4 commit 0bb50f5
Show file tree
Hide file tree
Showing 7 changed files with 320 additions and 34 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@ TODO.md
keys
ERROR.png
flowchart-fun.feature-reacher.json
.parcel-cache
.parcel-cache

speech*.mp4
4 changes: 4 additions & 0 deletions api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@
"@sendgrid/mail": "^7.4.6",
"@supabase/supabase-js": "^2.31.0",
"ajv": "^8.12.0",
"axios": "^0.27.2",
"csv-parse": "^5.3.6",
"date-fns": "^2.29.3",
"graph-selector": "^0.10.0",
"highlight.js": "^11.8.0",
"marked": "^4.1.1",
"micro": "^10.0.1",
"moniker": "^0.1.2",
"multer": "1.4.5-lts.1",
"notion-to-md": "^2.5.5",
"openai": "^4.24.2",
"shared": "workspace:*",
Expand All @@ -33,6 +36,7 @@
"@swc/jest": "^0.2.24",
"@types/jest": "^29.0.0",
"@types/marked": "^4.0.7",
"@types/multer": "^1.4.11",
"@types/node": "^18.16.17",
"@typescript-eslint/eslint-plugin": "^6.2.0",
"@typescript-eslint/parser": "^6.2.0",
Expand Down
27 changes: 27 additions & 0 deletions api/prompt/speech-to-text.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import { VercelApiHandler } from "@vercel/node";
import { openai } from "../_lib/_openai";
import { toFile } from "openai";

/**
 * POST /api/prompt/speech-to-text
 *
 * Accepts a JSON body containing `audioUrl`, a base64 data URL of recorded
 * audio, transcribes it with OpenAI Whisper, and responds with the plain-text
 * transcription. Failures are reported as JSON `{ ok: false, error }`.
 */
const handler: VercelApiHandler = async (req, res) => {
  try {
    const { audioUrl } = req.body;

    // Reject missing or non-string payloads before attempting to decode.
    if (!audioUrl || typeof audioUrl !== "string") {
      res.status(400).json({ ok: false, error: "No audioUrl provided" });
      return;
    }

    // Strip the "data:<mime>;base64," prefix; reject inputs without a payload.
    const base64Data = audioUrl.split(";base64,").pop();
    if (!base64Data) {
      res.status(400).json({ ok: false, error: "Invalid audioUrl" });
      return;
    }

    const binaryData = Buffer.from(base64Data, "base64");

    // Whisper sniffs the container format from the filename extension;
    // the client records audio/mp4 (see Microphone component).
    const transcription = await openai.audio.transcriptions.create({
      file: await toFile(binaryData, "audio.mp4"),
      model: "whisper-1",
    });

    // Success responds with raw text — the client reads it via res.text().
    res.send(transcription.text);
  } catch (error) {
    console.error(error);
    res.status(500).json({ ok: false, error: "Something went wrong" });
  }
};

export default handler;
59 changes: 39 additions & 20 deletions app/src/components/EditWithAI.tsx
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import { MagicWand, Microphone, Robot } from "phosphor-react";
import { Button2, IconButton2 } from "../ui/Shared";
import { MagicWand, Robot } from "phosphor-react";
import { Button2 } from "../ui/Shared";
import * as Popover from "@radix-ui/react-popover";
import { Trans, t } from "@lingui/macro";
import { useCallback, useRef, useState } from "react";
import { useDoc } from "../lib/useDoc";
import { parse, stringify, Graph as GSGraph } from "graph-selector";
import { useMutation } from "react-query";
import * as Toast from "@radix-ui/react-toast";
import { Microphone } from "./Microphone";

// The Graph type we send to AI is slightly different from internal representation
type GraphForAI = {
Expand All @@ -24,7 +25,8 @@ type GraphForAI = {
export function EditWithAI() {
const [message, setMessage] = useState<string | null>(null);
const [isOpen, setIsOpen] = useState(false);
const { mutate: edit, isLoading } = useMutation({
const [transcriptionLoading, setTranscriptionLoading] = useState(false);
const { mutate: edit, isLoading: editIsLoading } = useMutation({
mutationFn: async (body: { prompt: string; graph: GraphForAI }) => {
// /api/prompt/edit
const response = await fetch("/api/prompt/edit", {
Expand All @@ -35,6 +37,7 @@ export function EditWithAI() {
},
});
const data = await response.json();

return data as {
message: string;
toolCalls: {
Expand All @@ -43,7 +46,6 @@ export function EditWithAI() {
}[];
};
},
onMutate: () => setIsOpen(false),
onSuccess(data) {
if (data.message) {
setMessage(data.message);
Expand All @@ -59,14 +61,14 @@ export function EditWithAI() {
}
}
},
onSettled() {
setTranscriptionLoading(false);
},
});
const handleSubmit = useCallback(
(e: React.FormEvent<HTMLFormElement>) => {
e.preventDefault();

const formData = new FormData(e.currentTarget);
const prompt = formData.get("prompt") as string;
if (!prompt) return;
const submitPrompt = useCallback(
(prompt: string) => {
setIsOpen(false);

const text = useDoc.getState().text;
const _graph = parse(text);
Expand Down Expand Up @@ -117,8 +119,28 @@ export function EditWithAI() {
[edit]
);

const handleSubmit = useCallback(
(e: React.FormEvent<HTMLFormElement>) => {
e.preventDefault();

const formData = new FormData(e.currentTarget);
const prompt = formData.get("prompt") as string;
if (!prompt) return;

submitPrompt(prompt);
},
[submitPrompt]
);

const handleSend = useCallback(() => {
setTranscriptionLoading(true);
setIsOpen(false);
}, []);

const formRef = useRef<HTMLFormElement>(null);

const isLoading = editIsLoading || transcriptionLoading;

return (
<>
<Popover.Root open={isOpen} onOpenChange={setIsOpen}>
Expand All @@ -142,13 +164,13 @@ export function EditWithAI() {
<Popover.Content
side="bottom"
sideOffset={10}
align="center"
className="w-[300px] bg-white rounded shadow border p-2 !z-[100] animate-slideDownAndFade"
align="end"
className="w-[300px] bg-white rounded shadow border border-purple-300 p-2 !z-[100] animate-slideDownAndFade"
>
<form className="grid gap-2" onSubmit={handleSubmit} ref={formRef}>
<div className="relative">
<textarea
placeholder={t`Write your prompt here or press and hold the button to speak...`}
placeholder={t`Write your prompt here or click to enable the microphone, then press and hold to record.`}
className="text-xs w-full resize-none h-24 p-2 leading-normal"
name="prompt"
required
Expand All @@ -162,13 +184,10 @@ export function EditWithAI() {
}
}}
/>
<IconButton2
size="xs"
className="!absolute bottom-0 right-0"
type="button"
>
<Microphone size={16} />
</IconButton2>
<Microphone
onTranscription={submitPrompt}
onSend={handleSend}
/>
</div>
<Button2 size="sm" color="purple">
<Trans>Submit</Trans>
Expand Down
70 changes: 70 additions & 0 deletions app/src/components/Microphone.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import { Microphone as MicrophoneIcon } from "phosphor-react";
import { IconButton2 } from "../ui/Shared";
import { useCallback } from "react";
import cx from "classnames";
import {
startRecording,
stopRecording,
turnOnMicrophone,
useMicrophoneStore,
} from "../lib/useMicrophoneStore";

/**
 * Push-to-talk microphone button.
 *
 * First click asks for mic permission and arms the recorder; after that the
 * user presses and holds to record. When the recording stops, the captured
 * audio is base64-encoded and POSTed to /api/prompt/speech-to-text, and the
 * transcribed text is handed to `onTranscription`.
 *
 * NOTE(review): only mouse events are wired up — press-and-hold won't work on
 * touch devices; consider pointer events. TODO confirm intended platforms.
 */
export function Microphone({
  onSend,
  onTranscription,
}: {
  /** Fired as soon as the recording finishes, before transcription returns */
  onSend: () => void;
  /** Receives the transcribed text from the server */
  onTranscription: (text: string) => void | Promise<void>;
}) {
  const isMicOn = useMicrophoneStore((s) => s.isMicOn);
  const isRecording = useMicrophoneStore((s) => s.isRecording);

  const handleFinishRecording = useCallback(() => {
    onSend();

    const audioBlob = new Blob(useMicrophoneStore.getState().data, {
      type: "audio/mp4",
    });

    // Base64-encode the blob as a data URL, then ship it to the server.
    const reader = new FileReader();
    // Attach the handler before starting the read so a fast load can't race it.
    reader.onloadend = () => {
      const audioUrl = reader.result as string;

      fetch("/api/prompt/speech-to-text", {
        method: "POST",
        body: JSON.stringify({ audioUrl }),
        headers: {
          "Content-Type": "application/json",
        },
      })
        .then((res) => {
          // The endpoint returns JSON on error — don't treat it as a transcript.
          if (!res.ok) throw new Error(`Transcription failed: ${res.status}`);
          return res.text();
        })
        .then(onTranscription)
        .catch(console.error);
    };
    reader.readAsDataURL(audioBlob);
  }, [onSend, onTranscription]);

  const turnOn = useCallback(() => {
    turnOnMicrophone(handleFinishRecording);
  }, [handleFinishRecording]);

  return (
    <IconButton2
      size="xs"
      className={cx("!absolute bottom-0 right-0", {
        // Armed but idle: black; actively recording: red
        "!bg-black !text-white": isMicOn && !isRecording,
        "!bg-red-500 !text-white": isMicOn && isRecording,
      })}
      type="button"
      onClick={turnOn}
      onMouseDown={startRecording}
      onMouseUp={stopRecording}
    >
      <MicrophoneIcon size={16} />
    </IconButton2>
  );
}
61 changes: 61 additions & 0 deletions app/src/lib/useMicrophoneStore.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import { create } from "zustand";
// Shared state for the push-to-talk microphone feature.
type MicrophoneStore = {
  // True once the user has granted mic access and a MediaRecorder is armed
  isMicOn: boolean;
  // True while a recording is in progress (between recorder start and stop)
  isRecording: boolean;
  // The active recorder, or null until the microphone is turned on
  mediaRecorder: MediaRecorder | null;
  // Audio chunks captured during the current recording
  data: Blob[];
};

// Global microphone state. All mutations go through the helper functions
// below (turnOnMicrophone / startRecording / stopRecording) rather than
// store actions, so the creator needs no `set` parameter.
export const useMicrophoneStore = create<MicrophoneStore>(() => ({
  isMicOn: false,
  isRecording: false,
  mediaRecorder: null,
  data: [],
}));

/**
 * Requests microphone access and arms a MediaRecorder.
 *
 * No-op when the microphone is already on. `onFinish` fires after every
 * recording stops. Permission errors are logged to the console.
 */
export function turnOnMicrophone(onFinish: () => void | Promise<void>) {
  if (useMicrophoneStore.getState().isMicOn) return;

  // Ask the browser for an audio stream and wire a recorder to it
  navigator.mediaDevices
    .getUserMedia({ audio: true })
    .then((stream) => {
      const recorder = new MediaRecorder(stream);

      // Accumulate captured chunks as they arrive
      recorder.ondataavailable = (event) => {
        useMicrophoneStore.setState((prev) => ({
          data: [...prev.data, event.data],
        }));
      };

      // Mirror the recorder lifecycle into the store
      recorder.onstart = () => {
        useMicrophoneStore.setState({ isRecording: true });
      };
      recorder.onstop = () => {
        useMicrophoneStore.setState({ isRecording: false });
        onFinish();
      };

      useMicrophoneStore.setState({ isMicOn: true, mediaRecorder: recorder });
    })
    .catch(console.error);
}

/**
 * Begins capturing audio. Does nothing until the microphone has been turned
 * on. Chunks from any previous take are discarded first.
 */
export function startRecording() {
  const { mediaRecorder } = useMicrophoneStore.getState();
  if (mediaRecorder) {
    // Drop leftover chunks so each recording starts from a clean buffer
    useMicrophoneStore.setState({ data: [] });
    mediaRecorder.start();
  }
}

/**
 * Stops the in-flight recording, which triggers the recorder's "stop"
 * handler (and ultimately the onFinish callback passed to turnOnMicrophone).
 */
export function stopRecording() {
  // A null recorder means the mic was never enabled; nothing to stop.
  useMicrophoneStore.getState().mediaRecorder?.stop();
}
Loading

0 comments on commit 0bb50f5

Please sign in to comment.