Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MetaVoice 1B TTS: New and Improved Artificial Intelligence Capabilities as well as Improved User Interface. #194

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 173 additions & 0 deletions meta-ai-voice-enhancement.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import os
import sys
import gradio as gr
import tyro
from fam.llm.fast_inference import TTS
from fam.llm.utils import check_audio_file

# Put the directory that holds this script at the front of the import search
# path, unless it is already listed there.
# NOTE(review): the `fam` imports at the top of the file execute before this
# insertion, so it cannot help them resolve - confirm whether the imports were
# meant to come after this step.
project_root = os.path.dirname(os.path.abspath(__file__))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Build the shared TTS instance once at import time. tyro parses the explicit
# argument list below instead of the process command line.
TTS_MODEL = tyro.cli(
    TTS,
    args=["--telemetry_origin", "webapp"],
)

# --- Interface parameters -------------------------------------------------

# Longest text that is synthesised; anything beyond this is cut off.
MAX_CHARS = 220

# Labels of the voice-source radio button. The first entry selects a preset
# voice; the second selects an uploaded reference sample.
RADIO_CHOICES = [
    "Preset voices",
    "Upload target voice (at least 30s)",
]

# Display name -> URL of the reference recording for each bundled voice.
PRESET_VOICES = {
    # female
    "Bria": "https://cdn.themetavoice.xyz/speakers/bria.mp3",
    # male
    "Alex": "https://cdn.themetavoice.xyz/speakers/alex.mp3",
    "Jacob": "https://cdn.themetavoice.xyz/speakers/jacob.wav",
}

def denormalise_top_p(top_p):
    """Convert the normalised slider value into the model's ``top_p``.

    The slider value is divided by 100 and added to 0.9, so the UI's 0-10
    range lands in [0.9, 1.0]. The result is rounded to two decimals.
    """
    offset = top_p / 100
    return round(0.9 + offset, 2)

def denormalise_guidance(guidance):
    """Linearly rescale the guidance slider value from [1, 5] to [1.0, 3.0]."""
    in_lo, in_hi = 1, 5
    out_lo, out_hi = 1, 3
    scaled = (guidance - in_lo) * (out_hi - out_lo)
    return out_lo + scaled / (in_hi - in_lo)

def _check_file_size(path):
if not path:
return
filesize = os.path.getsize(path)
filesize_mb = filesize / 1024 / 1024
if filesize_mb >= 50:
raise gr.Error(f"Please upload a sample less than 50MB for voice cloning. Provided: {round(filesize_mb)} MB")

def _handle_edge_cases(to_say, upload_target):
    """Validate a synthesis request before any model work is done.

    Args:
        to_say: Text the user wants spoken.
        upload_target: Path of the uploaded reference sample, or a falsy
            value when nothing was uploaded.

    Raises:
        gr.Error: If ``to_say`` is empty, or if the size check on the
            uploaded sample fails.
    """
    if not to_say:
        raise gr.Error("Please provide text to synthesise")

    # Over-long text is not rejected here; the user is only warned.
    too_long = len(to_say) > MAX_CHARS
    if too_long:
        gr.Warning(
            f"Max {MAX_CHARS} characters allowed. Provided: {len(to_say)} characters. Truncating and generating speech...Result at the end can be unstable as a result."
        )

    if not upload_target:
        return

    # The original author's note says check_audio_file enforces a minimum
    # duration of 30s; its implementation lives in fam.llm.utils.
    check_audio_file(upload_target)
    _check_file_size(upload_target)

def tts(to_say, top_p, guidance, toggle, preset_dropdown, upload_target):
    """Synthesise ``to_say`` in the selected voice.

    Args:
        to_say: Text to speak. Only the first ``MAX_CHARS`` characters are used.
        top_p: Normalised slider value, converted with ``denormalise_top_p``.
        guidance: Normalised slider value, converted with
            ``denormalise_guidance``.
        toggle: Selected entry of ``RADIO_CHOICES``; the first entry means
            "use a preset voice".
        preset_dropdown: Key into ``PRESET_VOICES``; used when a preset voice
            is selected.
        upload_target: Path of the uploaded reference sample; used otherwise.

    Returns:
        Whatever ``TTS_MODEL.synthesise`` returns. The UI feeds it to an audio
        component configured with ``type="filepath"``, so presumably a file
        path - confirm against ``fam.llm.fast_inference``.

    Raises:
        gr.Error: Validation errors are passed through with their original
            message; any other failure is reported as
            "Something went wrong. Reason: ...".
    """
    try:
        d_top_p = denormalise_top_p(top_p)
        d_guidance = denormalise_guidance(guidance)

        _handle_edge_cases(to_say, upload_target)

        # Slicing is a no-op for text that is already short enough.
        to_say = to_say[:MAX_CHARS]

        # Reference audio comes from the preset table or from the upload.
        if toggle == RADIO_CHOICES[0]:
            spk_ref_path = PRESET_VOICES[preset_dropdown]
        else:
            spk_ref_path = upload_target

        return TTS_MODEL.synthesise(
            text=to_say,
            spk_ref_path=spk_ref_path,
            top_p=d_top_p,
            guidance_scale=d_guidance,
        )
    except gr.Error:
        # Already a user-facing message (e.g. from input validation); wrapping
        # it again would bury it behind "Something went wrong".
        raise
    except Exception as e:
        # Keep the original exception as the cause so the traceback survives.
        raise gr.Error(f"Something went wrong. Reason: {str(e)}") from e

def change_voice_selection_layout(choice):
    """Return visibility updates for the preset row and the upload row.

    Exactly one of the two is visible: the preset row when ``choice`` is the
    first entry of ``RADIO_CHOICES``, the upload row otherwise. The updates
    are returned in that order (preset row first).
    """
    show_presets = choice == RADIO_CHOICES[0]
    return [
        gr.update(visible=show_presets),
        gr.update(visible=not show_presets),
    ]

# Header content passed to gr.Markdown at the top of the page: the MetaVoice
# banner image followed by a level-1 heading. The string is not raw, so the
# `\n` before the heading is a real newline.
# NOTE(review): the dark-scheme <source> and the fallback <img> both point at
# banner_light_transparent.png - confirm that is intentional.
title = """
<picture>
<source srcset="https://cdn.themetavoice.xyz/banner_light_transparent.png" media="(prefers-color-scheme: dark)" />
<img alt="MetaVoice logo" src="https://cdn.themetavoice.xyz/banner_light_transparent.png" style="width: 20%; margin: 0 auto;" />
</picture>

\n# TTS by MetaVoice-1B
"""

# Introductory copy passed to gr.Markdown below the header: a summary of the
# model, a bullet list of its priorities, and licence / repository links.
# NOTE(review): leading whitespace is missing throughout this copy of the file;
# if the "We have had success..." bullet was meant to be nested under the
# finetuning bullet, its indentation needs restoring - confirm.
description = """
<strong>MetaVoice-1B</strong> is a 1.2B parameter base model for TTS (text-to-speech). It has been built with the following priorities:
\n
* <strong>Emotional speech rhythm and tone</strong> in English.
* <strong>Zero-shot cloning for American & British voices</strong>, with 30s reference audio.
* Support for <strong>voice cloning with finetuning</strong>.
* We have had success with as little as 1 minute training data for Indian speakers.
* Support for <strong>long-form synthesis</strong>.

We are releasing the model under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0). See [Github](https://github.com/metavoiceio/metavoice-src) for details and to contribute.
"""

# Assemble the web UI inside a gr.Blocks context bound to `demo`: header and
# description, the text box, two tuning sliders, the voice-source controls, the
# output player, and the button that triggers synthesis.
# NOTE(review): leading indentation is missing from this copy of the file, so
# which row/column owns which widget cannot be read off the code below -
# restore the nesting from the original before running.
with gr.Blocks(title="TTS by MetaVoice") as demo:
# Page header (banner + heading).
gr.Markdown(title)

# Model description.
with gr.Row():
gr.Markdown(description)

with gr.Row():
with gr.Column():
# Text to synthesise. The label advertises MAX_CHARS; tts() truncates
# anything longer.
to_say = gr.TextArea(
label=f"What should I say!? (max {MAX_CHARS} characters).",
lines=4,
value="This is a demo of text to speech by MetaVoice-1B, an open-source foundational audio model by MetaVoice.",
)
with gr.Row(), gr.Column():
# Voice settings. Both sliders hold normalised values that tts() converts
# with denormalise_top_p / denormalise_guidance before calling the model.
top_p = gr.Slider(
value=5.0,
minimum=0.0,
maximum=10.0,
step=1.0,
label="Speech Stability - improves text following for a challenging speaker",
)
# Defaults to the top of its 1-5 range.
guidance = gr.Slider(
value=5.0,
minimum=1.0,
maximum=5.0,
step=1.0,
label="Speaker similarity - How closely to match speaker identity and speech style.",
)

# Voice selection: radio button choosing between a preset voice (the
# default) and an uploaded reference sample.
toggle = gr.Radio(choices=RADIO_CHOICES, label="Choose voice", value=RADIO_CHOICES[0])

# Preset-voice controls; this row starts out visible.
with gr.Row(visible=True) as row_1:
preset_dropdown = gr.Dropdown(
PRESET_VOICES.keys(), label="Preset voices", value=list(PRESET_VOICES.keys())[0]
)
# Initially collapsed accordion with one audio player per preset sample.
with gr.Accordion("Preview: Preset voices", open=False):
for label, path in PRESET_VOICES.items():
gr.Audio(value=path, label=label)

# Upload control; this row starts out hidden. The value is handed to tts()
# as a file path (it is passed on to os.path.getsize during validation).
with gr.Row(visible=False) as row_2:
upload_target = gr.Audio(
sources=["upload"],
type="filepath",
label="Upload a clean sample to clone. Sample should contain 1 speaker, be between 30-90 seconds and not contain background noise.",
)

# Swap the visibility of row_1 / row_2 whenever the radio selection changes.
toggle.change(
change_voice_selection_layout,
inputs=toggle,
outputs=[row_1, row_2],
)

# Output player for the synthesised speech.
with gr.Column():
speech = gr.Audio(
type="filepath",
label="MetaVoice-1B says...",
)

# Clicking the button runs tts() with the current widget values (in the
# order of its parameters) and sends the result to `speech`.
submit = gr.Button("Generate Speech")
submit.click(
fn=tts,
inputs=[to_say, top_p, guidance, toggle, preset_dropdown, upload_target],
outputs=speech,
)

# Turn on the app's request queue, then start the web server.
demo.queue()

# The favicon is located relative to this file, not the working directory.
_here = os.path.dirname(os.path.abspath(__file__))
demo.launch(
    server_name="0.0.0.0",
    server_port=7861,
    favicon_path=os.path.join(_here, "assets/favicon.ico"),
)