Skip to content

Commit

Permalink
Speech-to-text working
Browse files Browse the repository at this point in the history
  • Loading branch information
rob-gordon committed Jan 10, 2024
1 parent c9bccf4 commit 0bb50f5
Show file tree
Hide file tree
Showing 7 changed files with 320 additions and 34 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@ TODO.md
keys
ERROR.png
flowchart-fun.feature-reacher.json
.parcel-cache
.parcel-cache

speech*.mp4
4 changes: 4 additions & 0 deletions api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@
"@sendgrid/mail": "^7.4.6",
"@supabase/supabase-js": "^2.31.0",
"ajv": "^8.12.0",
"axios": "^0.27.2",
"csv-parse": "^5.3.6",
"date-fns": "^2.29.3",
"graph-selector": "^0.10.0",
"highlight.js": "^11.8.0",
"marked": "^4.1.1",
"micro": "^10.0.1",
"moniker": "^0.1.2",
"multer": "1.4.5-lts.1",
"notion-to-md": "^2.5.5",
"openai": "^4.24.2",
"shared": "workspace:*",
Expand All @@ -33,6 +36,7 @@
"@swc/jest": "^0.2.24",
"@types/jest": "^29.0.0",
"@types/marked": "^4.0.7",
"@types/multer": "^1.4.11",
"@types/node": "^18.16.17",
"@typescript-eslint/eslint-plugin": "^6.2.0",
"@typescript-eslint/parser": "^6.2.0",
Expand Down
27 changes: 27 additions & 0 deletions api/prompt/speech-to-text.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import { VercelApiHandler } from "@vercel/node";
import { openai } from "../_lib/_openai";
import { toFile } from "openai";

/**
 * POST /api/prompt/speech-to-text
 *
 * Accepts a JSON body containing `audioUrl`, a base64 data URL of recorded
 * audio, transcribes it with OpenAI Whisper, and responds with the plain-text
 * transcription. Failures are reported as JSON `{ ok: false, error }`.
 */
const handler: VercelApiHandler = async (req, res) => {
  try {
    const { audioUrl } = req.body;

    // Reject missing or non-string payloads before attempting to decode.
    if (!audioUrl || typeof audioUrl !== "string") {
      res.status(400).json({ ok: false, error: "No audioUrl provided" });
      return;
    }

    // Strip the "data:<mime>;base64," prefix; reject inputs without a payload.
    const base64Data = audioUrl.split(";base64,").pop();
    if (!base64Data) {
      res.status(400).json({ ok: false, error: "Invalid audioUrl" });
      return;
    }

    const binaryData = Buffer.from(base64Data, "base64");

    // Whisper sniffs the container format from the filename extension;
    // the client records audio/mp4 (see Microphone component).
    const transcription = await openai.audio.transcriptions.create({
      file: await toFile(binaryData, "audio.mp4"),
      model: "whisper-1",
    });

    // Success responds with raw text — the client reads it via res.text().
    res.send(transcription.text);
  } catch (error) {
    console.error(error);
    res.status(500).json({ ok: false, error: "Something went wrong" });
  }
};

export default handler;
59 changes: 39 additions & 20 deletions app/src/components/EditWithAI.tsx
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import { MagicWand, Microphone, Robot } from "phosphor-react";
import { Button2, IconButton2 } from "../ui/Shared";
import { MagicWand, Robot } from "phosphor-react";
import { Button2 } from "../ui/Shared";
import * as Popover from "@radix-ui/react-popover";
import { Trans, t } from "@lingui/macro";
import { useCallback, useRef, useState } from "react";
import { useDoc } from "../lib/useDoc";
import { parse, stringify, Graph as GSGraph } from "graph-selector";
import { useMutation } from "react-query";
import * as Toast from "@radix-ui/react-toast";
import { Microphone } from "./Microphone";

// The Graph type we send to AI is slightly different from internal representation
type GraphForAI = {
Expand All @@ -24,7 +25,8 @@ type GraphForAI = {
export function EditWithAI() {
const [message, setMessage] = useState<string | null>(null);
const [isOpen, setIsOpen] = useState(false);
const { mutate: edit, isLoading } = useMutation({
const [transcriptionLoading, setTranscriptionLoading] = useState(false);
const { mutate: edit, isLoading: editIsLoading } = useMutation({
mutationFn: async (body: { prompt: string; graph: GraphForAI }) => {
// /api/prompt/edit
const response = await fetch("/api/prompt/edit", {
Expand All @@ -35,6 +37,7 @@ export function EditWithAI() {
},
});
const data = await response.json();

return data as {
message: string;
toolCalls: {
Expand All @@ -43,7 +46,6 @@ export function EditWithAI() {
}[];
};
},
onMutate: () => setIsOpen(false),
onSuccess(data) {
if (data.message) {
setMessage(data.message);
Expand All @@ -59,14 +61,14 @@ export function EditWithAI() {
}
}
},
onSettled() {
setTranscriptionLoading(false);
},
});
const handleSubmit = useCallback(
(e: React.FormEvent<HTMLFormElement>) => {
e.preventDefault();

const formData = new FormData(e.currentTarget);
const prompt = formData.get("prompt") as string;
if (!prompt) return;
const submitPrompt = useCallback(
(prompt: string) => {
setIsOpen(false);

const text = useDoc.getState().text;
const _graph = parse(text);
Expand Down Expand Up @@ -117,8 +119,28 @@ export function EditWithAI() {
[edit]
);

const handleSubmit = useCallback(
(e: React.FormEvent<HTMLFormElement>) => {
e.preventDefault();

const formData = new FormData(e.currentTarget);
const prompt = formData.get("prompt") as string;
if (!prompt) return;

submitPrompt(prompt);
},
[submitPrompt]
);

const handleSend = useCallback(() => {
setTranscriptionLoading(true);
setIsOpen(false);
}, []);

const formRef = useRef<HTMLFormElement>(null);

const isLoading = editIsLoading || transcriptionLoading;

return (
<>
<Popover.Root open={isOpen} onOpenChange={setIsOpen}>
Expand All @@ -142,13 +164,13 @@ export function EditWithAI() {
<Popover.Content
side="bottom"
sideOffset={10}
align="center"
className="w-[300px] bg-white rounded shadow border p-2 !z-[100] animate-slideDownAndFade"
align="end"
className="w-[300px] bg-white rounded shadow border border-purple-300 p-2 !z-[100] animate-slideDownAndFade"
>
<form className="grid gap-2" onSubmit={handleSubmit} ref={formRef}>
<div className="relative">
<textarea
placeholder={t`Write your prompt here or press and hold the button to speak...`}
placeholder={t`Write your prompt here or click to enable the microphone, then press and hold to record.`}
className="text-xs w-full resize-none h-24 p-2 leading-normal"
name="prompt"
required
Expand All @@ -162,13 +184,10 @@ export function EditWithAI() {
}
}}
/>
<IconButton2
size="xs"
className="!absolute bottom-0 right-0"
type="button"
>
<Microphone size={16} />
</IconButton2>
<Microphone
onTranscription={submitPrompt}
onSend={handleSend}
/>
</div>
<Button2 size="sm" color="purple">
<Trans>Submit</Trans>
Expand Down
70 changes: 70 additions & 0 deletions app/src/components/Microphone.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import { Microphone as MicrophoneIcon } from "phosphor-react";
import { IconButton2 } from "../ui/Shared";
import { useCallback } from "react";
import cx from "classnames";
import {
startRecording,
stopRecording,
turnOnMicrophone,
useMicrophoneStore,
} from "../lib/useMicrophoneStore";

/**
 * Push-to-talk microphone button.
 *
 * First click asks for mic permission and arms the recorder; after that the
 * user presses and holds to record. When the recording stops, the captured
 * audio is base64-encoded and POSTed to /api/prompt/speech-to-text, and the
 * transcribed text is handed to `onTranscription`.
 *
 * NOTE(review): only mouse events are wired up — press-and-hold won't work on
 * touch devices; consider pointer events. TODO confirm intended platforms.
 */
export function Microphone({
  onSend,
  onTranscription,
}: {
  /** Fired as soon as the recording finishes, before transcription returns */
  onSend: () => void;
  /** Receives the transcribed text from the server */
  onTranscription: (text: string) => void | Promise<void>;
}) {
  const isMicOn = useMicrophoneStore((s) => s.isMicOn);
  const isRecording = useMicrophoneStore((s) => s.isRecording);

  const handleFinishRecording = useCallback(() => {
    onSend();

    const audioBlob = new Blob(useMicrophoneStore.getState().data, {
      type: "audio/mp4",
    });

    // Base64-encode the blob as a data URL, then ship it to the server.
    const reader = new FileReader();
    // Attach the handler before starting the read so a fast load can't race it.
    reader.onloadend = () => {
      const audioUrl = reader.result as string;

      fetch("/api/prompt/speech-to-text", {
        method: "POST",
        body: JSON.stringify({ audioUrl }),
        headers: {
          "Content-Type": "application/json",
        },
      })
        .then((res) => {
          // The endpoint returns JSON on error — don't treat it as a transcript.
          if (!res.ok) throw new Error(`Transcription failed: ${res.status}`);
          return res.text();
        })
        .then(onTranscription)
        .catch(console.error);
    };
    reader.readAsDataURL(audioBlob);
  }, [onSend, onTranscription]);

  const turnOn = useCallback(() => {
    turnOnMicrophone(handleFinishRecording);
  }, [handleFinishRecording]);

  return (
    <IconButton2
      size="xs"
      className={cx("!absolute bottom-0 right-0", {
        // Armed but idle: black; actively recording: red
        "!bg-black !text-white": isMicOn && !isRecording,
        "!bg-red-500 !text-white": isMicOn && isRecording,
      })}
      type="button"
      onClick={turnOn}
      onMouseDown={startRecording}
      onMouseUp={stopRecording}
    >
      <MicrophoneIcon size={16} />
    </IconButton2>
  );
}
61 changes: 61 additions & 0 deletions app/src/lib/useMicrophoneStore.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import { create } from "zustand";
// Shared state for the push-to-talk microphone feature.
type MicrophoneStore = {
  // True once the user has granted mic access and a MediaRecorder is armed
  isMicOn: boolean;
  // True while a recording is in progress (between recorder start and stop)
  isRecording: boolean;
  // The active recorder, or null until the microphone is turned on
  mediaRecorder: MediaRecorder | null;
  // Audio chunks captured during the current recording
  data: Blob[];
};

// Global microphone state. All mutations go through the helper functions
// below (turnOnMicrophone / startRecording / stopRecording) rather than
// store actions, so the creator needs no `set` parameter.
export const useMicrophoneStore = create<MicrophoneStore>(() => ({
  isMicOn: false,
  isRecording: false,
  mediaRecorder: null,
  data: [],
}));

/**
 * Requests microphone access and arms a MediaRecorder.
 *
 * No-op when the microphone is already on. `onFinish` fires after every
 * recording stops. Permission errors are logged to the console.
 */
export function turnOnMicrophone(onFinish: () => void | Promise<void>) {
  if (useMicrophoneStore.getState().isMicOn) return;

  // Ask the browser for an audio stream and wire a recorder to it
  navigator.mediaDevices
    .getUserMedia({ audio: true })
    .then((stream) => {
      const recorder = new MediaRecorder(stream);

      // Accumulate captured chunks as they arrive
      recorder.ondataavailable = (event) => {
        useMicrophoneStore.setState((prev) => ({
          data: [...prev.data, event.data],
        }));
      };

      // Mirror the recorder lifecycle into the store
      recorder.onstart = () => {
        useMicrophoneStore.setState({ isRecording: true });
      };
      recorder.onstop = () => {
        useMicrophoneStore.setState({ isRecording: false });
        onFinish();
      };

      useMicrophoneStore.setState({ isMicOn: true, mediaRecorder: recorder });
    })
    .catch(console.error);
}

/**
 * Begins capturing audio. Does nothing until the microphone has been turned
 * on. Chunks from any previous take are discarded first.
 */
export function startRecording() {
  const { mediaRecorder } = useMicrophoneStore.getState();
  if (mediaRecorder) {
    // Drop leftover chunks so each recording starts from a clean buffer
    useMicrophoneStore.setState({ data: [] });
    mediaRecorder.start();
  }
}

/**
 * Stops the in-flight recording, which triggers the recorder's "stop"
 * handler (and ultimately the onFinish callback passed to turnOnMicrophone).
 */
export function stopRecording() {
  // A null recorder means the mic was never enabled; nothing to stop.
  useMicrophoneStore.getState().mediaRecorder?.stop();
}
Loading

0 comments on commit 0bb50f5

Please sign in to comment.