From 63d71456470651d122ace6b67135b3728bfde66b Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 17 Nov 2023 12:10:46 +0100
Subject: [PATCH 1/4] Update versions

---
 requirements.txt | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 836de40ab6..864215117e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,31 +1,31 @@
 # core deps
 numpy==1.22.0;python_version<="3.10"
-numpy==1.24.3;python_version>"3.10"
-cython==0.29.30
+numpy>=1.24.3;python_version>"3.10"
+cython>=0.29.30
 scipy>=1.11.2
 torch>=2.1
 torchaudio
-soundfile==0.12.*
-librosa==0.10.*
-scikit-learn==1.3.0
+soundfile>=0.12.0
+librosa>=0.10.0
+scikit-learn>=1.3.0
 numba==0.55.1;python_version<"3.9"
-numba==0.57.0;python_version>="3.9"
-inflect==5.6.*
-tqdm==4.64.*
-anyascii==0.3.*
-pyyaml==6.*
-fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail
-aiohttp==3.8.*
-packaging==23.1
+numba>=0.57.0;python_version>="3.9"
+inflect>=5.6.0
+tqdm>=4.64.1
+anyascii>=0.3.0
+pyyaml>=6.0
+fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
+aiohttp>=3.8.1
+packaging>=23.1
 # deps for examples
-flask==2.*
+flask>=2.0.1
 # deps for inference
-pysbd==0.3.4
+pysbd>=0.3.4
 # deps for notebooks
-umap-learn==0.5.*
+umap-learn>=0.5.1
 pandas>=1.4,<2.0
 # deps for training
-matplotlib==3.7.*
+matplotlib>=3.7.0
 # coqui stack
 trainer
 # config management
@@ -47,11 +47,11 @@ bnnumerizer
 bnunicodenormalizer
 #deps for tortoise
 k_diffusion
-einops==0.6.*
-transformers==4.33.*
+einops>=0.6.0
+transformers>=4.33.0
 #deps for bark
-encodec==0.1.*
+encodec>=0.1.1
 # deps for XTTS
-unidecode==1.3.*
+unidecode>=1.3.2
 num2words
 spacy[ja]>=3
\ No newline at end of file

From 08d11e91987e3af3c8d1e423715e1decfa99f756 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 17 Nov 2023 13:01:32 +0100
Subject: [PATCH 2/4] Update CI version

---
 .github/workflows/pypi-release.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml
index 49a5b3004e..2bbcf3cd70 100644
--- a/.github/workflows/pypi-release.yml
+++ b/.github/workflows/pypi-release.yml
@@ -10,7 +10,7 @@ jobs:
   build-sdist:
     runs-on: ubuntu-20.04
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Verify tag matches version
        run: |
          set -ex
@@ -38,7 +38,7 @@
      matrix:
        python-version: ["3.9", "3.10", "3.11"]
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
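Patch 1 above replaces most exact pins (==) with minimum-version floors (>=), so pip may now resolve newer releases instead of exactly one. As a quick illustration, a minimal sketch using the packaging library (which requirements.txt itself lists); the version numbers are only examples:

    from packaging.specifiers import SpecifierSet

    exact_pin = SpecifierSet("==1.24.3")  # old style: accepts only 1.24.3
    floor = SpecifierSet(">=1.24.3")      # new style: accepts 1.24.3 and anything newer

    print("1.26.0" in exact_pin)  # False
    print("1.26.0" in floor)      # True

Environment markers such as ;python_version>"3.10" are evaluated separately by pip and are unchanged by the patch.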
= {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m} +except ImportError: + K_DIFFUSION_SAMPLERS = None + + SAMPLERS = ["dpm++2m", "p", "ddim"] @@ -531,6 +538,8 @@ def sample_loop(self, *args, **kwargs): if self.conditioning_free is not True: raise RuntimeError("cond_free must be true") with tqdm(total=self.num_timesteps) as pbar: + if K_DIFFUSION_SAMPLERS is None: + raise ModuleNotFoundError("Install k_diffusion for using k_diffusion samplers") return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs) else: raise RuntimeError("sampler not impl") diff --git a/requirements.txt b/requirements.txt index 864215117e..ce0e5d9207 100644 --- a/requirements.txt +++ b/requirements.txt @@ -46,7 +46,6 @@ bangla bnnumerizer bnunicodenormalizer #deps for tortoise -k_diffusion einops>=0.6.0 transformers>=4.33.0 #deps for bark From 44880f09ed6e4accfb9794a44cc5cf1c383ccc34 Mon Sep 17 00:00:00 2001 From: Eren G??lge Date: Fri, 17 Nov 2023 13:43:34 +0100 Subject: [PATCH 4/4] Make style --- TTS/tts/layers/tortoise/diffusion.py | 1 - TTS/tts/layers/xtts/gpt.py | 4 +++- TTS/tts/layers/xtts/tokenizer.py | 23 ++++++++++++----------- TTS/tts/models/xtts.py | 14 +++++--------- tests/zoo_tests/test_models.py | 4 ++-- 5 files changed, 22 insertions(+), 24 deletions(-) diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py index fcdaa9d76e..7bea02ca08 100644 --- a/TTS/tts/layers/tortoise/diffusion.py +++ b/TTS/tts/layers/tortoise/diffusion.py @@ -17,7 +17,6 @@ from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper - try: from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py index d914ebf90f..e7b186b858 100644 --- a/TTS/tts/layers/xtts/gpt.py +++ b/TTS/tts/layers/xtts/gpt.py @@ -441,7 +441,9 @@ def forward( audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token) # Pad mel codes with stop_audio_token - audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3) # -3 to get the real code lengths without consider start and stop tokens that was not added yet + audio_codes = self.set_mel_padding( + audio_codes, code_lengths - 3 + ) # -3 to get the real code lengths without consider start and stop tokens that was not added yet # Build input and target tensors # Prepend start token to inputs and append stop token to targets diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index 1ef655a3cc..5284874397 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -1,23 +1,22 @@ import os import re -import torch -import pypinyin import textwrap - from functools import cached_property + +import pypinyin +import torch from hangul_romanize import Transliter from hangul_romanize.rule import academic from num2words import num2words +from spacy.lang.ar import Arabic +from spacy.lang.en import English +from spacy.lang.es import Spanish +from spacy.lang.ja import Japanese +from spacy.lang.zh import Chinese from tokenizers import Tokenizer from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words -from spacy.lang.en import English -from spacy.lang.zh import Chinese -from spacy.lang.ja import Japanese -from spacy.lang.ar import Arabic -from spacy.lang.es import Spanish - def get_spacy_lang(lang): if lang == "zh": @@ -32,6 +31,7 @@ def get_spacy_lang(lang): # For most languages, Enlish does the job return English() + def 
From 44880f09ed6e4accfb9794a44cc5cf1c383ccc34 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Fri, 17 Nov 2023 13:43:34 +0100
Subject: [PATCH 4/4] Make style

---
 TTS/tts/layers/tortoise/diffusion.py |  1 -
 TTS/tts/layers/xtts/gpt.py           |  4 +++-
 TTS/tts/layers/xtts/tokenizer.py     | 23 ++++++++++++-----------
 TTS/tts/models/xtts.py               | 14 +++++---------
 tests/zoo_tests/test_models.py       |  4 ++--
 5 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py
index fcdaa9d76e..7bea02ca08 100644
--- a/TTS/tts/layers/tortoise/diffusion.py
+++ b/TTS/tts/layers/tortoise/diffusion.py
@@ -17,7 +17,6 @@ import numpy as np
 import torch
 import torch as th
 from tqdm import tqdm
 
 from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper
 
-
 try:
     from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
 
diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py
index d914ebf90f..e7b186b858 100644
--- a/TTS/tts/layers/xtts/gpt.py
+++ b/TTS/tts/layers/xtts/gpt.py
@@ -441,7 +441,9 @@ def forward(
         audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)
 
         # Pad mel codes with stop_audio_token
-        audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3) # -3 to get the real code lengths without consider start and stop tokens that was not added yet
+        audio_codes = self.set_mel_padding(
+            audio_codes, code_lengths - 3
+        )  # -3 to get the real code lengths without considering the start and stop tokens, which have not been added yet
diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
index 1ef655a3cc..5284874397 100644
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -1,23 +1,22 @@
 import os
 import re
-import torch
-import pypinyin
 import textwrap
-
 from functools import cached_property
+
+import pypinyin
+import torch
 from hangul_romanize import Transliter
 from hangul_romanize.rule import academic
 from num2words import num2words
+from spacy.lang.ar import Arabic
+from spacy.lang.en import English
+from spacy.lang.es import Spanish
+from spacy.lang.ja import Japanese
+from spacy.lang.zh import Chinese
 from tokenizers import Tokenizer
 
 from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
 
-from spacy.lang.en import English
-from spacy.lang.zh import Chinese
-from spacy.lang.ja import Japanese
-from spacy.lang.ar import Arabic
-from spacy.lang.es import Spanish
-
 
 def get_spacy_lang(lang):
     if lang == "zh":
@@ -32,6 +31,7 @@
         # For most languages, English does the job
         return English()
 
+
 def split_sentence(text, lang, text_split_length=250):
     """Preprocess the input text"""
     text_splits = []
@@ -67,6 +67,7 @@ def split_sentence(text, lang, text_split_length=250):
 
     return text_splits
 
+
 _whitespace_re = re.compile(r"\s+")
 
 # List of (regular expression, replacement) pairs for abbreviations:
@@ -619,7 +620,7 @@ def katsu(self):
         return cutlet.Cutlet()
 
     def check_input_length(self, txt, lang):
-        lang = lang.split("-")[0] # remove the region
+        lang = lang.split("-")[0]  # remove the region
         limit = self.char_limits.get(lang, 250)
         if len(txt) > limit:
             print(
@@ -640,7 +641,7 @@ def preprocess_text(self, txt, lang):
         return txt
 
     def encode(self, txt, lang):
-        lang = lang.split("-")[0] # remove the region
+        lang = lang.split("-")[0]  # remove the region
         self.check_input_length(txt, lang)
         txt = self.preprocess_text(txt, lang)
         lang = "zh-cn" if lang == "zh" else lang
diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py
index 3583591f8b..208ec4d561 100644
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -513,13 +513,13 @@ def inference(
         enable_text_splitting=False,
         **hf_generate_kwargs,
     ):
-        language = language.split("-")[0] # remove the country code
+        language = language.split("-")[0]  # remove the country code
         length_scale = 1.0 / max(speed, 0.05)
         if enable_text_splitting:
             text = split_sentence(text, language, self.tokenizer.char_limits[language])
         else:
             text = [text]
-
+
         wavs = []
         gpt_latents_list = []
         for sent in text:
@@ -563,9 +563,7 @@ def inference(
 
                 if length_scale != 1.0:
                     gpt_latents = F.interpolate(
-                        gpt_latents.transpose(1, 2),
-                        scale_factor=length_scale,
-                        mode="linear"
+                        gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
                     ).transpose(1, 2)
                 gpt_latents_list.append(gpt_latents.cpu())
@@ -623,7 +621,7 @@ def inference_stream(
         enable_text_splitting=False,
         **hf_generate_kwargs,
     ):
-        language = language.split("-")[0] # remove the country code
+        language = language.split("-")[0]  # remove the country code
         length_scale = 1.0 / max(speed, 0.05)
         if enable_text_splitting:
             text = split_sentence(text, language, self.tokenizer.char_limits[language])
@@ -675,9 +673,7 @@ def inference_stream(
                     gpt_latents = torch.cat(all_latents, dim=0)[None, :]
                     if length_scale != 1.0:
                         gpt_latents = F.interpolate(
-                            gpt_latents.transpose(1, 2),
-                            scale_factor=length_scale,
-                            mode="linear"
+                            gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
                         ).transpose(1, 2)
                     wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
                     wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(
diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py
index a5aad5c1ea..8fa56e287a 100644
--- a/tests/zoo_tests/test_models.py
+++ b/tests/zoo_tests/test_models.py
@@ -186,7 +186,7 @@ def test_xtts_v2_streaming():
         "en",
         gpt_cond_latent,
         speaker_embedding,
-        speed=1.5
+        speed=1.5,
     )
     wav_chuncks = []
     for i, chunk in enumerate(chunks):
@@ -198,7 +198,7 @@ def test_xtts_v2_streaming():
         "en",
         gpt_cond_latent,
         speaker_embedding,
-        speed=0.66
+        speed=0.66,
     )
    wav_chuncks = []
    for i, chunk in enumerate(chunks):
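Patch 4 is a style pass (import ordering, black-style wrapping, trailing commas) over the code the earlier patches touched, including the split_sentence helper that the XTTS inference paths call when enable_text_splitting=True. For reference, a minimal usage sketch; the import path and signature are taken from the diff above, while the sample text is illustrative:

    from TTS.tts.layers.xtts.tokenizer import split_sentence

    long_text = "This is one sentence. Here is another one, and a third follows. " * 10
    # Splits on sentence boundaries so each chunk stays within the character budget.
    chunks = split_sentence(long_text, "en", text_split_length=250)
    for chunk in chunks:
        print(len(chunk), chunk[:40])

Xtts.inference and Xtts.inference_stream use the same entry point, passing self.tokenizer.char_limits[language] as the per-language limit.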