diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml
index 49a5b3004e..2bbcf3cd70 100644
--- a/.github/workflows/pypi-release.yml
+++ b/.github/workflows/pypi-release.yml
@@ -10,7 +10,7 @@ jobs:
   build-sdist:
     runs-on: ubuntu-20.04
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Verify tag matches version
         run: |
           set -ex
@@ -38,7 +38,7 @@ jobs:
       matrix:
         python-version: ["3.9", "3.10", "3.11"]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - uses: actions/setup-python@v2
         with:
          python-version: ${{ matrix.python-version }}
diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml
index c167f7ca44..b7c6393baa 100644
--- a/.github/workflows/style_check.yml
+++ b/.github/workflows/style_check.yml
@@ -42,6 +42,5 @@ jobs:
         run: |
           python3 -m pip install .[all]
           python3 setup.py egg_info
-      # - name: Lint check
-      #   run: |
-      #     make lint
\ No newline at end of file
+      - name: Style check
+        run: make style
diff --git a/.github/workflows/xtts_tests.yml b/.github/workflows/xtts_tests.yml
new file mode 100644
index 0000000000..be367f3547
--- /dev/null
+++ b/.github/workflows/xtts_tests.yml
@@ -0,0 +1,53 @@
+name: xtts-tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    types: [opened, synchronize, reopened]
+jobs:
+  check_skip:
+    runs-on: ubuntu-latest
+    if: "! contains(github.event.head_commit.message, '[ci skip]')"
+    steps:
+      - run: echo "${{ github.event.head_commit.message }}"
+
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.9, "3.10", "3.11"]
+        experimental: [false]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: x64
+          cache: 'pip'
+          cache-dependency-path: 'requirements*'
+      - name: check OS
+        run: cat /etc/os-release
+      - name: set ENV
+        run: export TRAINER_TELEMETRY=0
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends git make gcc
+          sudo apt-get install espeak
+          sudo apt-get install espeak-ng
+          make system-deps
+      - name: Install/upgrade Python setup deps
+        run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
+      - name: Install TTS
+        run: |
+          python3 -m pip install .[all]
+          python3 setup.py egg_info
+      - name: Unit tests
+        run: make test_xtts
diff --git a/.gitignore b/.gitignore
index 563040e8da..22ec6e410a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -169,3 +169,4 @@ wandb
 depot/*
 coqui_recipes/*
 local_scripts/*
+coqui_demos/*
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ade35507d2..cae35993dc 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -128,6 +128,32 @@ The following steps are tested on an Ubuntu system.

 14. Once things look perfect, We merge it to the ```dev``` branch and make it ready for the next version.

+## Development in Docker container
+
+If you prefer working within a Docker container as your development environment, you can do the following:
+
+1. Fork [🐸TTS](https://github.com/coqui-ai/TTS) by clicking the fork button at the top right corner of the project page.
+
+2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
+
+    ```bash
+    $ git clone git@github.com:<your Github name>/TTS.git
+    $ cd TTS
+    $ git remote add upstream https://github.com/coqui-ai/TTS.git
+    ```
+
+3. Build the Docker Image as your development environment (it installs all of the dependencies for you):
+
+    ```bash
+    docker build --tag=tts-dev:latest -f ./dockerfiles/Dockerfile.dev .
+    ```
+
+4. Run the container with GPU support:
+
+    ```bash
+    docker run -it --gpus all tts-dev:latest /bin/bash
+    ```
+
 Feel free to ping us at any step you need help using our communication channels.

 If you are new to Github or open-source contribution, These are good resources.
diff --git a/Dockerfile b/Dockerfile
index 30dfb23d0d..9fb3005ef4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,13 +1,19 @@
 ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
 FROM ${BASE}
+
 RUN apt-get update && apt-get upgrade -y
 RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
 RUN pip3 install llvmlite --ignore-installed

-WORKDIR /root
-COPY . /root
+# Install Dependencies:
 RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
 RUN rm -rf /root/.cache/pip
+
+# Copy TTS repository contents:
+WORKDIR /root
+COPY . /root
+
 RUN make install
+
 ENTRYPOINT ["tts"]
 CMD ["--help"]
diff --git a/Makefile b/Makefile
index ab992ec52e..54aa6eeb18 100644
--- a/Makefile
+++ b/Makefile
@@ -22,6 +22,9 @@ test_tts: ## run tts tests.
 test_tts2: ## run tts tests.
 	nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests2

+test_xtts: ## run xtts tests.
+	nose2 -F -v -B --with-coverage --coverage TTS tests.xtts_tests
+
 test_aux: ## run aux tests.
 	nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
 	./run_bash_tests.sh
diff --git a/README.md b/README.md
index 594777c116..ef16c9b6a1 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,8 @@
 ## 🐸Coqui.ai News

+- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
+- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
+- 📣 ⓍTTS can now stream with <200ms latency.
 - 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
 - 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
 - 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
@@ -25,7 +28,7 @@
 📚 Utilities for dataset analysis and curation.
______________________________________________________________________ -[![Dicord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) +[![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) [![License]()](https://opensource.org/licenses/MPL-2.0) [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS) [![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md) @@ -69,7 +72,7 @@ Please use our dedicated channels for questions and discussion. Help is much mor | Type | Links | | ------------------------------- | --------------------------------------- | | 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/) -| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#install-tts)| +| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#installation)| | 👩‍💻 **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)| | 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378) | 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)| @@ -202,7 +205,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu" print(TTS().list_models()) # Init TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) # Run TTS # ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language @@ -264,19 +267,13 @@ models = TTS(cs_api_model="XTTS").list_models() # Init TTS with the target studio speaker tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False) # Run TTS -tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH) +tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH) # V1 model models = TTS(cs_api_model="V1").list_models() # Run TTS with emotion and speed control # Emotion control only works with V1 model tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5) - -# XTTS-multilingual -models = TTS(cs_api_model="XTTS-multilingual").list_models() -# Run TTS with emotion and speed control -# Emotion control only works with V1 model -tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0) ``` #### Example text to speech using **Fairseq models in ~1100 languages** 🤯. 
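Taken together, the README and `TTS/api.py` changes in this patch add up to the usage pattern below — a minimal sketch, assuming the `xtts_v2` model entry added in `TTS/.models.json` further down and a placeholder reference wav:

```python
import torch

from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"

# xtts_v1 -> xtts_v2: the updated multilingual voice-cloning model
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# XTTS is a multilingual voice-cloning model, so both `speaker_wav` and
# `language` are required; `split_sentences` is the new chunking switch
# introduced in TTS/api.py below.
tts.tts_to_file(
    text="Hello world! This is a second sentence.",
    speaker_wav="/path/to/reference.wav",  # placeholder path
    language="en",
    file_path="output.wav",
    split_sentences=True,
)
```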
diff --git a/TTS/.models.json b/TTS/.models.json index 8e35893bef..1957d78adb 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -2,15 +2,17 @@ "tts_models": { "multilingual": { "multi-dataset": { - "xtts_v1": { - "description": "XTTS-v1 by Coqui with 13 languages and cross-language voice cloning.", + "xtts_v2": { + "description": "XTTS-v2.0.2 by Coqui with 16 languages.", "hf_url": [ - "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/model.pth", - "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/config.json", - "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/vocab.json" + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth", + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json", + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json", + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5" ], + "model_hash": "10f92b55c512af7a8d39d650547a15a7", "default_vocoder": null, - "commit": "e5140314", + "commit": "480a6cdf7", "license": "CPML", "contact": "info@coqui.ai", "tos_required": true @@ -18,12 +20,12 @@ "xtts_v1.1": { "description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.", "hf_url": [ - "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1/model.pth", - "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1/config.json", - "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1/vocab.json", - "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1/hash.md5" + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth", + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/config.json", + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json", + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/hash.md5" ], - "model_hash": "10163afc541dc86801b33d1f3217b456", + "model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59", "default_vocoder": null, "commit": "82910a63", "license": "CPML", diff --git a/TTS/VERSION b/TTS/VERSION index 66333910a4..16eb94e711 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.18.0 +0.21.3 diff --git a/TTS/api.py b/TTS/api.py index dd5820f8a4..b3aa531b7f 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -10,6 +10,7 @@ from TTS.utils.audio.numpy_transforms import save_wav from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer +from TTS.config import load_config class TTS(nn.Module): @@ -60,27 +61,28 @@ def __init__( vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None. progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True. cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are - "XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API" for more control. + "XTTS", "V1". You can also use `TTS.cs_api.CS_API" for more control. Defaults to "XTTS". gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ super().__init__() self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False) - + self.config = load_config(config_path) if config_path else None self.synthesizer = None self.voice_converter = None self.csapi = None self.cs_api_model = cs_api_model - self.model_name = None - + self.model_name = "" if gpu: warnings.warn("`gpu` will be deprecated. 
Please use `tts.to(device)` instead.") - if model_name is not None: + if model_name is not None and len(model_name) > 0: if "tts_models" in model_name or "coqui_studio" in model_name: self.load_tts_model_by_name(model_name, gpu) elif "voice_conversion_models" in model_name: self.load_vc_model_by_name(model_name, gpu) + else: + self.load_model_by_name(model_name, gpu) if model_path: self.load_tts_model_by_path( @@ -106,7 +108,12 @@ def is_coqui_studio(self): @property def is_multi_lingual(self): # Not sure what sets this to None, but applied a fix to prevent crashing. - if isinstance(self.model_name, str) and "xtts" in self.model_name: + if ( + isinstance(self.model_name, str) + and "xtts" in self.model_name + or self.config + and ("xtts" in self.config.model or len(self.config.languages) > 1) + ): return True if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager: return self.synthesizer.tts_model.language_manager.num_languages > 1 @@ -149,6 +156,15 @@ def download_model_by_name(self, model_name: str): vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"]) return model_path, config_path, vocoder_path, vocoder_config_path, None + def load_model_by_name(self, model_name: str, gpu: bool = False): + """Load one of the 🐸TTS models by name. + + Args: + model_name (str): Model name to load. You can list models by ```tts.models```. + gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. + """ + self.load_tts_model_by_name(model_name, gpu) + def load_vc_model_by_name(self, model_name: str, gpu: bool = False): """Load one of the voice conversion models by name. @@ -264,7 +280,7 @@ def tts_coqui_studio( language: str = None, emotion: str = None, speed: float = 1.0, - pipe_out = None, + pipe_out=None, file_path: str = None, ) -> Union[np.ndarray, str]: """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API. @@ -275,7 +291,7 @@ def tts_coqui_studio( speaker_name (str, optional): Speaker name from Coqui Studio. Defaults to None. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. emotion (str, optional): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available with "V1" model. Defaults to None. @@ -310,6 +326,7 @@ def tts( speaker_wav: str = None, emotion: str = None, speed: float = None, + split_sentences: bool = True, **kwargs, ): """Convert text to speech. @@ -321,7 +338,7 @@ def tts( Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. speaker_wav (str, optional): Path to a reference wav file to use for voice cloning with supporting models like YourTTS. Defaults to None. @@ -330,6 +347,12 @@ def tts( speed (float, optional): Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0. Defaults to None. 
+            split_sentences (bool, optional):
+                Split the text into sentences and synthesize each sentence separately, then concatenate the
+                resulting audio. Setting it to False uses more VRAM and may hit model-specific text-length or
+                VRAM limits. Only applicable to the 🐸TTS models. Defaults to True.
+            kwargs (dict, optional):
+                Additional arguments for the model.
         """
         self._check_arguments(
             speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
@@ -347,6 +370,7 @@ def tts(
             style_wav=None,
             style_text=None,
             reference_speaker_name=None,
+            split_sentences=split_sentences,
             **kwargs,
         )
         return wav
@@ -359,8 +383,9 @@ def tts_to_file(
         speaker_wav: str = None,
         emotion: str = None,
         speed: float = 1.0,
-        pipe_out = None,
+        pipe_out=None,
         file_path: str = "output.wav",
+        split_sentences: bool = True,
         **kwargs,
     ):
         """Convert text to speech.
@@ -385,6 +410,10 @@ def tts_to_file(
                 Flag to stdout the generated TTS wav file for shell pipe.
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
+            split_sentences (bool, optional):
+                Split the text into sentences and synthesize each sentence separately, then concatenate the
+                resulting audio. Setting it to False uses more VRAM and may hit model-specific text-length or
+                VRAM limits. Only applicable to the 🐸TTS models. Defaults to True.
             kwargs (dict, optional):
                 Additional arguments for the model.
         """
@@ -400,7 +429,14 @@ def tts_to_file(
             file_path=file_path,
             pipe_out=pipe_out,
         )
-        wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
+        wav = self.tts(
+            text=text,
+            speaker=speaker,
+            language=language,
+            speaker_wav=speaker_wav,
+            split_sentences=split_sentences,
+            **kwargs,
+        )
         self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
         return file_path
@@ -440,7 +476,14 @@ def voice_conversion_to_file(
         save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
         return file_path

-    def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
+    def tts_with_vc(
+        self,
+        text: str,
+        language: str = None,
+        speaker_wav: str = None,
+        speaker: str = None,
+        split_sentences: bool = True,
+    ):
         """Convert text to speech with voice conversion.

         It combines tts with voice conversion to fake voice cloning.
@@ -457,17 +500,32 @@ def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
             speaker_wav (str, optional):
                 Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                 Defaults to None.
+            speaker (str, optional):
+                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
+                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+            split_sentences (bool, optional):
+                Split the text into sentences and synthesize each sentence separately, then concatenate the
+                resulting audio. Setting it to False uses more VRAM and may hit model-specific text-length or
+                VRAM limits. Only applicable to the 🐸TTS models. Defaults to True.
         """
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            # Lazy code... save it to a temp file to resample it while reading it for VC
-            self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name)
+            self.tts_to_file(
+                text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences
+            )
         if self.voice_converter is None:
             self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
         wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
         return wav

     def tts_with_vc_to_file(
-        self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav"
+        self,
+        text: str,
+        language: str = None,
+        speaker_wav: str = None,
+        file_path: str = "output.wav",
+        speaker: str = None,
+        split_sentences: bool = True,
     ):
         """Convert text to speech with voice conversion and save to file.
@@ -484,6 +542,15 @@ def tts_with_vc_to_file(
             Defaults to None.
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
+            speaker (str, optional):
+                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
+                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+            split_sentences (bool, optional):
+                Split the text into sentences and synthesize each sentence separately, then concatenate the
+                resulting audio. Setting it to False uses more VRAM and may hit model-specific text-length or
+                VRAM limits. Only applicable to the 🐸TTS models. Defaults to True.
         """
-        wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav)
+        wav = self.tts_with_vc(
+            text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences
+        )
         save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
index 9eadee070e..c6048626b3 100755
--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -15,6 +15,7 @@
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.numpy_transforms import quantize
 from TTS.utils.generic_utils import count_parameters

 use_cuda = torch.cuda.is_available()
@@ -159,7 +160,7 @@ def inference(


 def extract_spectrograms(
-    data_loader, model, ap, output_path, quantized_wav=False, save_audio=False, debug=False, metada_name="metada.txt"
+    data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
 ):
     model.eval()
     export_metadata = []
@@ -196,8 +197,8 @@ def extract_spectrograms(
         _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)

         # quantize and save wav
-        if quantized_wav:
-            wavq = ap.quantize(wav)
+        if quantize_bits > 0:
+            wavq = quantize(wav, quantize_bits)
             np.save(wavq_path, wavq)

         # save TTS mel
@@ -263,7 +264,7 @@ def main(args):  # pylint: disable=redefined-outer-name
         model,
         ap,
         args.output_path,
-        quantized_wav=args.quantized,
+        quantize_bits=args.quantize_bits,
         save_audio=args.save_audio,
         debug=args.debug,
         metada_name="metada.txt",
@@ -277,7 +278,7 @@ def main(args):  # pylint: disable=redefined-outer-name
     parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
     parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
     parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
-
parser.add_argument("--quantized", action="store_true", help="Save quantized audio files") + parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") parser.add_argument("--eval", type=bool, help="compute eval.", default=True) args = parser.parse_args() diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 78a20c2566..d9ec3063e6 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -227,7 +227,7 @@ def main(): parser.add_argument( "--cs_model", type=str, - help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.", + help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.", ) parser.add_argument( "--emotion", @@ -238,7 +238,7 @@ def main(): parser.add_argument( "--language", type=str, - help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.", + help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.", default=None, ) parser.add_argument( @@ -419,6 +419,13 @@ def main(): print(" > Saving output to ", args.out_path) return + if args.language_idx is None and args.language is not None: + msg = ( + "--language is only supported for Coqui Studio models. " + "Use --language_idx to specify the target language for multilingual models." + ) + raise ValueError(msg) + # CASE4: load pre-trained model paths if args.model_name is not None and not args.model_path: model_path, config_path, model_item = manager.download_model(args.model_name) @@ -427,7 +434,9 @@ def main(): tts_path = model_path tts_config_path = config_path if "default_vocoder" in model_item: - args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name + args.vocoder_name = ( + model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name + ) # voice conversion model if model_item["model_type"] == "voice_conversion_models": diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index f2e7779c0c..448fefc712 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -8,17 +8,17 @@ import torch from torch.utils.data import DataLoader +from trainer.io import copy_model_files, save_best_model, save_checkpoint from trainer.torch import NoamLR from trainer.trainer_utils import get_optimizer from TTS.encoder.dataset import EncoderDataset -from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model +from TTS.encoder.utils.generic_utils import setup_encoder_model from TTS.encoder.utils.training import init_training from TTS.encoder.utils.visual import plot_embeddings from TTS.tts.datasets import load_tts_samples from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters, remove_experiment_folder -from TTS.utils.io import copy_model_files from TTS.utils.samplers import PerfectBatchSampler from TTS.utils.training import check_update @@ -222,7 +222,9 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, if global_step % c.save_step == 0: # save model - save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch) + save_checkpoint( + c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict() + ) end_time = time.time() @@ -245,7 +247,18 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, flush=True, ) # save the best checkpoint - best_loss = save_best_model(model, 
optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch) + best_loss = save_best_model( + eval_loss, + best_loss, + c, + model, + optimizer, + None, + global_step, + epoch, + OUT_PATH, + criterion=criterion.state_dict(), + ) model.train() return best_loss, global_step @@ -276,7 +289,7 @@ def main(args): # pylint: disable=redefined-outer-name if c.loss == "softmaxproto" and c.model != "speaker_encoder": c.map_classid_to_classname = map_classid_to_classname - copy_model_files(c, OUT_PATH) + copy_model_files(c, OUT_PATH, new_fields={}) if args.restore_path: criterion, args.restore_step = model.load_checkpoint( diff --git a/TTS/cs_api.py b/TTS/cs_api.py index 4a44b535fd..9dc6c30dd4 100644 --- a/TTS/cs_api.py +++ b/TTS/cs_api.py @@ -43,7 +43,7 @@ class CS_API: Args: api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable `COQUI_STUDIO_TOKEN`. - model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`. + model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`. Default is `XTTS`. Example listing all available speakers: @@ -65,7 +65,7 @@ class CS_API: Example with multi-language model: >>> from TTS.api import CS_API - >>> tts = CS_API(model="XTTS-multilang") + >>> tts = CS_API(model="XTTS") >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en") """ @@ -78,16 +78,11 @@ class CS_API: "XTTS": { "list_speakers": "https://app.coqui.ai/api/v2/speakers", "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/", - "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/", - }, - "XTTS-multilang": { - "list_speakers": "https://app.coqui.ai/api/v2/speakers", - "synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/", - "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/", + "list_voices": "https://app.coqui.ai/api/v2/voices/xtts", }, } - SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"] + SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"] def __init__(self, api_token=None, model="XTTS"): self.api_token = api_token @@ -139,7 +134,7 @@ def list_speakers(self): self._check_token() conn = http.client.HTTPSConnection("app.coqui.ai") url = self.MODEL_ENDPOINTS[self.model]["list_speakers"] - conn.request("GET", f"{url}?per_page=100", headers=self.headers) + conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) res = conn.getresponse() data = res.read() return [Speaker(s) for s in json.loads(data)["result"]] @@ -148,7 +143,7 @@ def list_voices(self): """List custom voices created by the user.""" conn = http.client.HTTPSConnection("app.coqui.ai") url = self.MODEL_ENDPOINTS[self.model]["list_voices"] - conn.request("GET", f"{url}", headers=self.headers) + conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) res = conn.getresponse() data = res.read() return [Speaker(s, True) for s in json.loads(data)["result"]] @@ -197,14 +192,6 @@ def _create_payload(model, text, speaker, speed, emotion, language): } ) elif model == "XTTS": - payload.update( - { - "name": speaker.name, - "text": text, - "speed": speed, - } - ) - elif model == "XTTS-multilang": payload.update( { "name": speaker.name, @@ -226,13 +213,10 @@ def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, langua assert language is None, "❗ language is not supported for V1 model." 
elif self.model == "XTTS": assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model." - assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model." - elif self.model == "XTTS-multilang": - assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model." - assert language is not None, "❗ Language is required for XTTS-multilang model." + assert language is not None, "❗ Language is required for XTTS model." assert ( language in self.SUPPORTED_LANGUAGES - ), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl" + ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create." return text, speaker_name, speaker_id, emotion, speed, language def tts( @@ -255,7 +239,7 @@ def tts( supported by `V1` model. Defaults to None. speed (float): Speed of the speech. 1.0 is normal speed. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. """ self._check_token() self.ping_api() @@ -305,7 +289,7 @@ def tts_to_file( speed (float): Speed of the speech. 1.0 is normal speed. pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". file_path (str): Path to save the file. If None, a temporary file is created. 
""" if file_path is None: @@ -322,21 +306,12 @@ def tts_to_file( print(api.speakers) print(api.list_speakers_as_tts_models()) - ts = time.time() - wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name) - print(f" [i] XTTS took {time.time() - ts:.2f}s") - - filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav") - - api = CS_API(model="XTTS-multilang") - print(api.speakers) - ts = time.time() wav, sr = api.tts( - "It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en" + "It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name ) print(f" [i] XTTS took {time.time() - ts:.2f}s") filepath = api.tts_to_file( - text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en" + text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav" ) diff --git a/TTS/demos/xtts_ft_demo/requirements.txt b/TTS/demos/xtts_ft_demo/requirements.txt new file mode 100644 index 0000000000..cb5b16f66e --- /dev/null +++ b/TTS/demos/xtts_ft_demo/requirements.txt @@ -0,0 +1,2 @@ +faster_whisper==0.9.0 +gradio==4.7.1 \ No newline at end of file diff --git a/TTS/demos/xtts_ft_demo/utils/formatter.py b/TTS/demos/xtts_ft_demo/utils/formatter.py new file mode 100644 index 0000000000..536faa0108 --- /dev/null +++ b/TTS/demos/xtts_ft_demo/utils/formatter.py @@ -0,0 +1,160 @@ +import os +import gc +import torchaudio +import pandas +from faster_whisper import WhisperModel +from glob import glob + +from tqdm import tqdm + +import torch +import torchaudio +# torch.set_num_threads(1) + +from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners + +torch.set_num_threads(16) + + +import os + +audio_types = (".wav", ".mp3", ".flac") + + +def list_audios(basePath, contains=None): + # return the set of files that are valid + return list_files(basePath, validExts=audio_types, contains=contains) + +def list_files(basePath, validExts=None, contains=None): + # loop over the directory structure + for (rootDir, dirNames, filenames) in os.walk(basePath): + # loop over the filenames in the current directory + for filename in filenames: + # if the contains string is not none and the filename does not contain + # the supplied string, then ignore the file + if contains is not None and filename.find(contains) == -1: + continue + + # determine the file extension of the current file + ext = filename[filename.rfind("."):].lower() + + # check to see if the file is an audio and should be processed + if validExts is None or ext.endswith(validExts): + # construct the path to the audio and yield it + audioPath = os.path.join(rootDir, filename) + yield audioPath + +def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None): + audio_total_size = 0 + # make sure that ooutput file exists + os.makedirs(out_path, exist_ok=True) + + # Loading Whisper + device = "cuda" if torch.cuda.is_available() else "cpu" + + print("Loading Whisper Model!") + asr_model = WhisperModel("large-v2", device=device, compute_type="float16") + + metadata = {"audio_file": [], "text": [], "speaker_name": []} + + if gradio_progress is not None: + tqdm_object = gradio_progress.tqdm(audio_files, desc="Formatting...") + else: + tqdm_object = tqdm(audio_files) + + for audio_path in tqdm_object: + wav, sr = torchaudio.load(audio_path) + # stereo to 
mono if needed + if wav.size(0) != 1: + wav = torch.mean(wav, dim=0, keepdim=True) + + wav = wav.squeeze() + audio_total_size += (wav.size(-1) / sr) + + segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language) + segments = list(segments) + i = 0 + sentence = "" + sentence_start = None + first_word = True + # added all segments words in a unique list + words_list = [] + for _, segment in enumerate(segments): + words = list(segment.words) + words_list.extend(words) + + # process each word + for word_idx, word in enumerate(words_list): + if first_word: + sentence_start = word.start + # If it is the first sentence, add buffer or get the begining of the file + if word_idx == 0: + sentence_start = max(sentence_start - buffer, 0) # Add buffer to the sentence start + else: + # get previous sentence end + previous_word_end = words_list[word_idx - 1].end + # add buffer or get the silence midle between the previous sentence and the current one + sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2) + + sentence = word.word + first_word = False + else: + sentence += word.word + + if word.word[-1] in ["!", ".", "?"]: + sentence = sentence[1:] + # Expand number and abbreviations plus normalization + sentence = multilingual_cleaners(sentence, target_language) + audio_file_name, _ = os.path.splitext(os.path.basename(audio_path)) + + audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav" + + # Check for the next word's existence + if word_idx + 1 < len(words_list): + next_word_start = words_list[word_idx + 1].start + else: + # If don't have more words it means that it is the last sentence then use the audio len as next word start + next_word_start = (wav.shape[0] - 1) / sr + + # Average the current word end and next word start + word_end = min((word.end + next_word_start) / 2, word.end + buffer) + + absoulte_path = os.path.join(out_path, audio_file) + os.makedirs(os.path.dirname(absoulte_path), exist_ok=True) + i += 1 + first_word = True + + audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0) + # if the audio is too short ignore it (i.e < 0.33 seconds) + if audio.size(-1) >= sr/3: + torchaudio.save(absoulte_path, + audio, + sr + ) + else: + continue + + metadata["audio_file"].append(audio_file) + metadata["text"].append(sentence) + metadata["speaker_name"].append(speaker_name) + + df = pandas.DataFrame(metadata) + df = df.sample(frac=1) + num_val_samples = int(len(df)*eval_percentage) + + df_eval = df[:num_val_samples] + df_train = df[num_val_samples:] + + df_train = df_train.sort_values('audio_file') + train_metadata_path = os.path.join(out_path, "metadata_train.csv") + df_train.to_csv(train_metadata_path, sep="|", index=False) + + eval_metadata_path = os.path.join(out_path, "metadata_eval.csv") + df_eval = df_eval.sort_values('audio_file') + df_eval.to_csv(eval_metadata_path, sep="|", index=False) + + # deallocate VRAM and RAM + del asr_model, df_train, df_eval, df, metadata + gc.collect() + + return train_metadata_path, eval_metadata_path, audio_total_size \ No newline at end of file diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py new file mode 100644 index 0000000000..a98765c3e7 --- /dev/null +++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py @@ -0,0 +1,172 @@ +import os +import gc + +from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.datasets import load_tts_samples +from 
+from TTS.utils.manage import ModelManager
+
+
+def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path, max_audio_length=255995):
+    # Logging parameters
+    RUN_NAME = "GPT_XTTS_FT"
+    PROJECT_NAME = "XTTS_trainer"
+    DASHBOARD_LOGGER = "tensorboard"
+    LOGGER_URI = None
+
+    # Set here the path that the checkpoints will be saved. Default: ./run/training/
+    OUT_PATH = os.path.join(output_path, "run", "training")
+
+    # Training Parameters
+    OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training please make it False
+    START_WITH_EVAL = False  # if True it will start with evaluation
+    BATCH_SIZE = batch_size  # set here the batch size
+    GRAD_ACUMM_STEPS = grad_acumm  # set here the grad accumulation steps
+
+    # Define the dataset that you want to use for fine-tuning.
+    config_dataset = BaseDatasetConfig(
+        formatter="coqui",
+        dataset_name="ft_dataset",
+        path=os.path.dirname(train_csv),
+        meta_file_train=train_csv,
+        meta_file_val=eval_csv,
+        language=language,
+    )
+
+    # Add here the configs of the datasets
+    DATASETS_CONFIG_LIST = [config_dataset]
+
+    # Define the path where XTTS v2.0.1 files will be downloaded
+    CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
+    os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
+
+    # DVAE files
+    DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
+    MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
+
+    # Set the path to the downloaded files
+    DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
+    MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
+
+    # download DVAE files if needed
+    if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
+        print(" > Downloading DVAE files!")
+        ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+
+    # Download XTTS v2.0 checkpoint if needed
+    TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
+    XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
+    XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json"
+
+    # XTTS transfer learning parameters: provide the paths of the XTTS checkpoints that you want to fine-tune.
+ TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK)) # vocab.json file + XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK)) # model.pth file + XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK)) # config.json file + + # download XTTS v2.0 files if needed + if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT): + print(" > Downloading XTTS v2.0 files!") + ModelManager._download_model_files( + [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True + ) + + # init args and config + model_args = GPTArgs( + max_conditioning_length=132300, # 6 secs + min_conditioning_length=66150, # 3 secs + debug_loading_failures=False, + max_wav_length=max_audio_length, # ~11.6 seconds + max_text_length=200, + mel_norm_file=MEL_NORM_FILE, + dvae_checkpoint=DVAE_CHECKPOINT, + xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune + tokenizer_file=TOKENIZER_FILE, + gpt_num_audio_tokens=1026, + gpt_start_audio_token=1024, + gpt_stop_audio_token=1025, + gpt_use_masking_gt_prompt_approach=True, + gpt_use_perceiver_resampler=True, + ) + # define audio config + audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) + # training parameters config + config = GPTTrainerConfig( + epochs=num_epochs, + output_path=OUT_PATH, + model_args=model_args, + run_name=RUN_NAME, + project_name=PROJECT_NAME, + run_description=""" + GPT XTTS training + """, + dashboard_logger=DASHBOARD_LOGGER, + logger_uri=LOGGER_URI, + audio=audio_config, + batch_size=BATCH_SIZE, + batch_group_size=48, + eval_batch_size=BATCH_SIZE, + num_loader_workers=8, + eval_split_max_size=256, + print_step=50, + plot_step=100, + log_model_step=100, + save_step=1000, + save_n_checkpoints=1, + save_checkpoints=True, + # target_loss="loss", + print_eval=False, + # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. 
+        optimizer="AdamW",
+        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+        lr=5e-06,  # learning rate
+        lr_scheduler="MultiStepLR",
+        # it was adjusted accordingly for the new step scheme
+        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+        test_sentences=[],
+    )
+
+    # init the model from config
+    model = GPTTrainer.init_from_config(config)
+
+    # load training samples
+    train_samples, eval_samples = load_tts_samples(
+        DATASETS_CONFIG_LIST,
+        eval_split=True,
+        eval_split_max_size=config.eval_split_max_size,
+        eval_split_size=config.eval_split_size,
+    )
+
+    # init the trainer and 🚀
+    trainer = Trainer(
+        TrainerArgs(
+            restore_path=None,  # the XTTS checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with the Trainer's restore_path parameter
+            skip_train_epoch=False,
+            start_with_eval=START_WITH_EVAL,
+            grad_accum_steps=GRAD_ACUMM_STEPS,
+        ),
+        config,
+        output_path=OUT_PATH,
+        model=model,
+        train_samples=train_samples,
+        eval_samples=eval_samples,
+    )
+    trainer.fit()
+
+    # get the longest text audio file to use as speaker reference
+    samples_len = [len(item["text"].split(" ")) for item in train_samples]
+    longest_text_idx = samples_len.index(max(samples_len))
+    speaker_ref = train_samples[longest_text_idx]["audio_file"]
+
+    trainer_out_path = trainer.output_path
+
+    # deallocate VRAM and RAM
+    del model, trainer, train_samples, eval_samples
+    gc.collect()
+
+    return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref
diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py
new file mode 100644
index 0000000000..ebb11f29d1
--- /dev/null
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -0,0 +1,415 @@
+import argparse
+import os
+import sys
+import tempfile
+import traceback
+
+import gradio as gr
+import librosa.display
+import numpy as np
+import torch
+import torchaudio
+
+from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
+from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+
+
+def clear_gpu_cache():
+    # clear the GPU cache
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+
+XTTS_MODEL = None
+
+
+def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
+    global XTTS_MODEL
+    clear_gpu_cache()
+    if not xtts_checkpoint or not xtts_config or not xtts_vocab:
+        return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields!"
+    config = XttsConfig()
+    config.load_json(xtts_config)
+    XTTS_MODEL = Xtts.init_from_config(config)
+    print("Loading XTTS model!")
+    XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
+    if torch.cuda.is_available():
+        XTTS_MODEL.cuda()
+
+    print("Model Loaded!")
+    return "Model Loaded!"
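+
+# Usage sketch (illustrative only; the paths below are hypothetical): once the
+# three fields are set, load_model() is called once and run_tts() any number of times:
+#   load_model("run/training/GPT_XTTS_FT/best_model.pth",
+#              "run/training/GPT_XTTS_FT/config.json",
+#              "run/training/XTTS_v2.0_original_model_files/vocab.json")
+#   run_tts("en", "This is a test.", "reference.wav")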
+ +def run_tts(lang, tts_text, speaker_audio_file): + if XTTS_MODEL is None or not speaker_audio_file: + return "You need to run the previous step to load the model !!", None, None + + gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs) + out = XTTS_MODEL.inference( + text=tts_text, + language=lang, + gpt_cond_latent=gpt_cond_latent, + speaker_embedding=speaker_embedding, + temperature=XTTS_MODEL.config.temperature, # Add custom parameters here + length_penalty=XTTS_MODEL.config.length_penalty, + repetition_penalty=XTTS_MODEL.config.repetition_penalty, + top_k=XTTS_MODEL.config.top_k, + top_p=XTTS_MODEL.config.top_p, + ) + + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: + out["wav"] = torch.tensor(out["wav"]).unsqueeze(0) + out_path = fp.name + torchaudio.save(out_path, out["wav"], 24000) + + return "Speech generated !", out_path, speaker_audio_file + + + + +# define a logger to redirect +class Logger: + def __init__(self, filename="log.out"): + self.log_file = filename + self.terminal = sys.stdout + self.log = open(self.log_file, "w") + + def write(self, message): + self.terminal.write(message) + self.log.write(message) + + def flush(self): + self.terminal.flush() + self.log.flush() + + def isatty(self): + return False + +# redirect stdout and stderr to a file +sys.stdout = Logger() +sys.stderr = sys.stdout + + +# logging.basicConfig(stream=sys.stdout, level=logging.INFO) +import logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + handlers=[ + logging.StreamHandler(sys.stdout) + ] +) + +def read_logs(): + sys.stdout.flush() + with open(sys.stdout.log_file, "r") as f: + return f.read() + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + description="""XTTS fine-tuning demo\n\n""" + """ + Example runs: + python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port + """, + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "--port", + type=int, + help="Port to run the gradio demo. Default: 5003", + default=5003, + ) + parser.add_argument( + "--out_path", + type=str, + help="Output path (where data and checkpoints will be saved) Default: /tmp/xtts_ft/", + default="/tmp/xtts_ft/", + ) + + parser.add_argument( + "--num_epochs", + type=int, + help="Number of epochs to train. Default: 10", + default=10, + ) + parser.add_argument( + "--batch_size", + type=int, + help="Batch size. Default: 4", + default=4, + ) + parser.add_argument( + "--grad_acumm", + type=int, + help="Grad accumulation steps. Default: 1", + default=1, + ) + parser.add_argument( + "--max_audio_length", + type=int, + help="Max permitted audio size in seconds. 
Default: 11",
+        default=11,
+    )
+
+    args = parser.parse_args()
+
+    with gr.Blocks() as demo:
+        with gr.Tab("1 - Data processing"):
+            out_path = gr.Textbox(
+                label="Output path (where data and checkpoints will be saved):",
+                value=args.out_path,
+            )
+            # upload_file = gr.Audio(
+            #     sources="upload",
+            #     label="Select here the audio files that you want to use for XTTS training!",
+            #     type="filepath",
+            # )
+            upload_file = gr.File(
+                file_count="multiple",
+                label="Select here the audio files that you want to use for XTTS training (Supported formats: wav, mp3, and flac)",
+            )
+            lang = gr.Dropdown(
+                label="Dataset Language",
+                value="en",
+                choices=[
+                    "en",
+                    "es",
+                    "fr",
+                    "de",
+                    "it",
+                    "pt",
+                    "pl",
+                    "tr",
+                    "ru",
+                    "nl",
+                    "cs",
+                    "ar",
+                    "zh",
+                    "hu",
+                    "ko",
+                    "ja"
+                ],
+            )
+            progress_data = gr.Label(
+                label="Progress:"
+            )
+            logs = gr.Textbox(
+                label="Logs:",
+                interactive=False,
+            )
+            demo.load(read_logs, None, logs, every=1)
+
+            prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
+
+            def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
+                clear_gpu_cache()
+                out_path = os.path.join(out_path, "dataset")
+                os.makedirs(out_path, exist_ok=True)
+                if audio_path is None:
+                    return "You should provide one or multiple audio files! If you did, the upload may not have finished yet!", "", ""
+                else:
+                    try:
+                        train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
+                    except:
+                        traceback.print_exc()
+                        error = traceback.format_exc()
+                        return f"The data processing was interrupted due to an error! Please check the console to verify the full error message! \n Error summary: {error}", "", ""
+
+                clear_gpu_cache()
+
+                # if audio total len is less than 2 minutes raise an error
+                if audio_total_size < 120:
+                    message = "The sum of the duration of the audios that you provided should be at least 2 minutes!"
+                    print(message)
+                    return message, "", ""
+
+                print("Dataset Processed!")
+                return "Dataset Processed!", train_meta, eval_meta
+
+        with gr.Tab("2 - Fine-tuning XTTS Encoder"):
+            train_csv = gr.Textbox(
+                label="Train CSV:",
+            )
+            eval_csv = gr.Textbox(
+                label="Eval CSV:",
+            )
+            num_epochs = gr.Slider(
+                label="Number of epochs:",
+                minimum=1,
+                maximum=100,
+                step=1,
+                value=args.num_epochs,
+            )
+            batch_size = gr.Slider(
+                label="Batch size:",
+                minimum=2,
+                maximum=512,
+                step=1,
+                value=args.batch_size,
+            )
+            grad_acumm = gr.Slider(
+                label="Grad accumulation steps:",
+                minimum=1,
+                maximum=128,
+                step=1,
+                value=args.grad_acumm,
+            )
+            max_audio_length = gr.Slider(
+                label="Max permitted audio size in seconds:",
+                minimum=2,
+                maximum=20,
+                step=1,
+                value=args.max_audio_length,
+            )
+            progress_train = gr.Label(
+                label="Progress:"
+            )
+            logs_tts_train = gr.Textbox(
+                label="Logs:",
+                interactive=False,
+            )
+            demo.load(read_logs, None, logs_tts_train, every=1)
+            train_btn = gr.Button(value="Step 2 - Run the training")
+
+            def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length):
+                clear_gpu_cache()
+                if not train_csv or not eval_csv:
+                    return "You need to run the data processing step or manually set the `Train CSV` and `Eval CSV` fields!", "", "", "", ""
+                try:
+                    # convert seconds to waveform frames
+                    max_audio_length = int(max_audio_length * 22050)
+                    config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length)
+                except:
+                    traceback.print_exc()
+                    error = traceback.format_exc()
+                    return f"The training was interrupted due to an error! Please check the console to see the full error message!
\n Error summary: {error}", "", "", "", "" + + # copy original files to avoid parameters changes issues + os.system(f"cp {config_path} {exp_path}") + os.system(f"cp {vocab_file} {exp_path}") + + ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth") + print("Model training done!") + clear_gpu_cache() + return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav + + with gr.Tab("3 - Inference"): + with gr.Row(): + with gr.Column() as col1: + xtts_checkpoint = gr.Textbox( + label="XTTS checkpoint path:", + value="", + ) + xtts_config = gr.Textbox( + label="XTTS config path:", + value="", + ) + + xtts_vocab = gr.Textbox( + label="XTTS vocab path:", + value="", + ) + progress_load = gr.Label( + label="Progress:" + ) + load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model") + + with gr.Column() as col2: + speaker_reference_audio = gr.Textbox( + label="Speaker reference audio:", + value="", + ) + tts_language = gr.Dropdown( + label="Language", + value="en", + choices=[ + "en", + "es", + "fr", + "de", + "it", + "pt", + "pl", + "tr", + "ru", + "nl", + "cs", + "ar", + "zh", + "hu", + "ko", + "ja", + ] + ) + tts_text = gr.Textbox( + label="Input Text.", + value="This model sounds really good and above all, it's reasonably fast.", + ) + tts_btn = gr.Button(value="Step 4 - Inference") + + with gr.Column() as col3: + progress_gen = gr.Label( + label="Progress:" + ) + tts_output_audio = gr.Audio(label="Generated Audio.") + reference_audio = gr.Audio(label="Reference audio used.") + + prompt_compute_btn.click( + fn=preprocess_dataset, + inputs=[ + upload_file, + lang, + out_path, + ], + outputs=[ + progress_data, + train_csv, + eval_csv, + ], + ) + + + train_btn.click( + fn=train_model, + inputs=[ + lang, + train_csv, + eval_csv, + num_epochs, + batch_size, + grad_acumm, + out_path, + max_audio_length, + ], + outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio], + ) + + load_btn.click( + fn=load_model, + inputs=[ + xtts_checkpoint, + xtts_config, + xtts_vocab + ], + outputs=[progress_load], + ) + + tts_btn.click( + fn=run_tts, + inputs=[ + tts_language, + tts_text, + speaker_reference_audio, + ], + outputs=[progress_gen, tts_output_audio, reference_audio], + ) + + demo.launch( + share=True, + debug=False, + server_port=args.port, + server_name="0.0.0.0" + ) diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 1da029611b..236d6fe937 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -1,15 +1,12 @@ -import datetime import glob import os import random -import re import numpy as np from scipy import signal from TTS.encoder.models.lstm import LSTMSpeakerEncoder from TTS.encoder.models.resnet import ResNetSpeakerEncoder -from TTS.utils.io import save_fsspec class AugmentWAV(object): @@ -118,11 +115,6 @@ def apply_one(self, audio): return self.additive_noise(noise_type, audio) -def to_camel(text): - text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) - - def setup_encoder_model(config: "Coqpit"): if config.model_params["model_name"].lower() == "lstm": model = LSTMSpeakerEncoder( @@ -142,41 +134,3 @@ def setup_encoder_model(config: "Coqpit"): audio_config=config.audio, ) return model - - -def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch): - checkpoint_path = "checkpoint_{}.pth".format(current_step) - checkpoint_path = os.path.join(out_path, checkpoint_path) - 
print(" | | > Checkpoint saving : {}".format(checkpoint_path)) - - new_state_dict = model.state_dict() - state = { - "model": new_state_dict, - "optimizer": optimizer.state_dict() if optimizer is not None else None, - "criterion": criterion.state_dict(), - "step": current_step, - "epoch": epoch, - "loss": model_loss, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - save_fsspec(state, checkpoint_path) - - -def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch): - if model_loss < best_loss: - new_state_dict = model.state_dict() - state = { - "model": new_state_dict, - "optimizer": optimizer.state_dict(), - "criterion": criterion.state_dict(), - "step": current_step, - "epoch": epoch, - "loss": model_loss, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - best_loss = model_loss - bestmodel_path = "best_model.pth" - bestmodel_path = os.path.join(out_path, bestmodel_path) - print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) - save_fsspec(state, bestmodel_path) - return best_loss diff --git a/TTS/encoder/utils/io.py b/TTS/encoder/utils/io.py deleted file mode 100644 index d1dad3e24d..0000000000 --- a/TTS/encoder/utils/io.py +++ /dev/null @@ -1,38 +0,0 @@ -import datetime -import os - -from TTS.utils.io import save_fsspec - - -def save_checkpoint(model, optimizer, model_loss, out_path, current_step): - checkpoint_path = "checkpoint_{}.pth".format(current_step) - checkpoint_path = os.path.join(out_path, checkpoint_path) - print(" | | > Checkpoint saving : {}".format(checkpoint_path)) - - new_state_dict = model.state_dict() - state = { - "model": new_state_dict, - "optimizer": optimizer.state_dict() if optimizer is not None else None, - "step": current_step, - "loss": model_loss, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - save_fsspec(state, checkpoint_path) - - -def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step): - if model_loss < best_loss: - new_state_dict = model.state_dict() - state = { - "model": new_state_dict, - "optimizer": optimizer.state_dict(), - "step": current_step, - "loss": model_loss, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - best_loss = model_loss - bestmodel_path = "best_model.pth" - bestmodel_path = os.path.join(out_path, bestmodel_path) - print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) - save_fsspec(state, bestmodel_path) - return best_loss diff --git a/TTS/encoder/utils/training.py b/TTS/encoder/utils/training.py index 7c58a232e7..ff8f271d80 100644 --- a/TTS/encoder/utils/training.py +++ b/TTS/encoder/utils/training.py @@ -3,13 +3,13 @@ from coqpit import Coqpit from trainer import TrainerArgs, get_last_checkpoint +from trainer.io import copy_model_files from trainer.logging import logger_factory from trainer.logging.console_logger import ConsoleLogger from TTS.config import load_config, register_config from TTS.tts.utils.text.characters import parse_symbols from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch -from TTS.utils.io import copy_model_files @dataclass diff --git a/TTS/tts/configs/xtts_config.py b/TTS/tts/configs/xtts_config.py index b968559047..bbf048e1ab 100644 --- a/TTS/tts/configs/xtts_config.py +++ b/TTS/tts/configs/xtts_config.py @@ -30,35 +30,32 @@ class XttsConfig(BaseTTSConfig): which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. 
negative), length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
-        reperation_penalty (float):
+        repetition_penalty (float):
             The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.
 
         top_p (float):
             If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or
             higher are kept for generation. Defaults to `0.8`.
 
-        cond_free_k (float):
-            Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
-            As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
-            Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k. Defaults to `2.0`.
-
-        diffusion_temperature (float):
-            Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
-            are the "mean" prediction of the diffusion network and will sound bland and smeared.
-            Defaults to `1.0`.
-
         num_gpt_outputs (int):
             Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
             As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
             Defaults to `16`.
 
-        decoder_iterations (int):
-            Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
-            the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
-            however. Defaults to `30`.
+        gpt_cond_len (int):
+            Seconds of audio to be used as conditioning for the autoregressive model. Defaults to `12`.
+
+        gpt_cond_chunk_len (int):
+            Audio chunk size in seconds. Audio is split into chunks and latents are extracted for each chunk. Then the
+            latents are averaged. Chunking improves stability. It must be <= gpt_cond_len.
+            If gpt_cond_len == gpt_cond_chunk_len, no chunking is applied. Defaults to `4`.
+
+        max_ref_len (int):
+            Maximum number of seconds of audio to be used as conditioning for the decoder. Defaults to `10`.
+
+        sound_norm_refs (bool):
+            Whether to normalize the conditioning audio. Defaults to `False`.
 
-        decoder_sampler (str):
-            Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
 
     Note: Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
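The chunked conditioning that `gpt_cond_len` and `gpt_cond_chunk_len` describe amounts to slicing the reference audio into fixed-size windows, extracting one latent per window, and averaging the results. A minimal sketch of that idea, assuming a caller-supplied `extract_latent` callable as a hypothetical stand-in for the model's per-chunk latent extractor (this is illustrative, not the actual XTTS code path):

    import torch

    def chunked_cond_latent(wav, sr, extract_latent, gpt_cond_len=12, gpt_cond_chunk_len=4):
        # Keep at most gpt_cond_len seconds of reference audio.
        wav = wav[..., : gpt_cond_len * sr]
        chunk = gpt_cond_chunk_len * sr
        # One latent per chunk; `extract_latent` is a placeholder assumed to return a
        # fixed-size latent for any chunk length (the last chunk may be shorter).
        latents = [extract_latent(wav[..., i : i + chunk]) for i in range(0, wav.shape[-1], chunk)]
        # Averaging across chunks is what stabilizes the conditioning estimate.
        return torch.stack(latents).mean(dim=0)

With gpt_cond_len == gpt_cond_chunk_len the loop runs exactly once, i.e. no chunking is applied.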
@@ -74,17 +71,37 @@ class XttsConfig(BaseTTSConfig): audio: XttsAudioConfig = field(default_factory=XttsAudioConfig) model_dir: str = None languages: List[str] = field( - default_factory=lambda: ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"] + default_factory=lambda: [ + "en", + "es", + "fr", + "de", + "it", + "pt", + "pl", + "tr", + "ru", + "nl", + "cs", + "ar", + "zh-cn", + "hu", + "ko", + "ja", + "hi", + ] ) # inference params - temperature: float = 0.2 + temperature: float = 0.85 length_penalty: float = 1.0 repetition_penalty: float = 2.0 top_k: int = 50 - top_p: float = 0.8 - cond_free_k: float = 2.0 - diffusion_temperature: float = 1.0 - num_gpt_outputs: int = 16 - decoder_iterations: int = 30 - decoder_sampler: str = "ddim" + top_p: float = 0.85 + num_gpt_outputs: int = 1 + + # cloning + gpt_cond_len: int = 12 + gpt_cond_chunk_len: int = 4 + max_ref_len: int = 10 + sound_norm_refs: bool = False diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index fbf6881f04..053444b0c1 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -280,7 +280,7 @@ def css10(root_path, meta_file, **kwargs): # pylint: disable=unused-argument cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) text = cols[1] - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -294,7 +294,7 @@ def nancy(root_path, meta_file, **kwargs): # pylint: disable=unused-argument utt_id = line.split()[1] text = line[line.find('"') + 1 : line.rfind('"') - 1] wav_file = os.path.join(root_path, "wavn", utt_id + ".wav") - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items diff --git a/TTS/tts/layers/delightful_tts/conv_layers.py b/TTS/tts/layers/delightful_tts/conv_layers.py index 354a0336a1..fb9aa4495f 100644 --- a/TTS/tts/layers/delightful_tts/conv_layers.py +++ b/TTS/tts/layers/delightful_tts/conv_layers.py @@ -3,6 +3,7 @@ import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F +from torch.nn.utils import parametrize from TTS.tts.layers.delightful_tts.kernel_predictor import KernelPredictor @@ -73,7 +74,7 @@ def __init__( ) nn.init.xavier_uniform_(self.conv.weight, gain=nn.init.calculate_gain(w_init_gain)) if self.use_weight_norm: - self.conv = nn.utils.weight_norm(self.conv) + self.conv = nn.utils.parametrizations.weight_norm(self.conv) def forward(self, signal, mask=None): conv_signal = self.conv(signal) @@ -113,7 +114,7 @@ def __init__( dilation=1, w_init_gain="relu", ) - conv_layer = nn.utils.weight_norm(conv_layer.conv, name="weight") + conv_layer = nn.utils.parametrizations.weight_norm(conv_layer.conv, name="weight") convolutions.append(conv_layer) self.convolutions = nn.ModuleList(convolutions) @@ -567,7 +568,7 @@ def __init__( # pylint: disable=dangerous-default-value self.convt_pre = nn.Sequential( nn.LeakyReLU(lReLU_slope), - nn.utils.weight_norm( + nn.utils.parametrizations.weight_norm( nn.ConvTranspose1d( in_channels, in_channels, @@ -584,7 +585,7 @@ def __init__( # pylint: disable=dangerous-default-value self.conv_blocks.append( nn.Sequential( nn.LeakyReLU(lReLU_slope), - nn.utils.weight_norm( + nn.utils.parametrizations.weight_norm( nn.Conv1d( in_channels, 
in_channels, @@ -665,6 +666,6 @@ def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=25 def remove_weight_norm(self): self.kernel_predictor.remove_weight_norm() - nn.utils.remove_weight_norm(self.convt_pre[1]) + parametrize.remove_parametrizations(self.convt_pre[1], "weight") for block in self.conv_blocks: - nn.utils.remove_weight_norm(block[1]) + parametrize.remove_parametrizations(block[1], "weight") diff --git a/TTS/tts/layers/delightful_tts/kernel_predictor.py b/TTS/tts/layers/delightful_tts/kernel_predictor.py index 19dfd57e7b..96c550b6c2 100644 --- a/TTS/tts/layers/delightful_tts/kernel_predictor.py +++ b/TTS/tts/layers/delightful_tts/kernel_predictor.py @@ -1,4 +1,5 @@ import torch.nn as nn # pylint: disable=consider-using-from-import +from torch.nn.utils import parametrize class KernelPredictor(nn.Module): @@ -36,7 +37,9 @@ def __init__( # pylint: disable=dangerous-default-value kpnet_bias_channels = conv_out_channels * conv_layers # l_b self.input_conv = nn.Sequential( - nn.utils.weight_norm(nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)), + nn.utils.parametrizations.weight_norm( + nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True) + ), getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), ) @@ -46,7 +49,7 @@ def __init__( # pylint: disable=dangerous-default-value self.residual_convs.append( nn.Sequential( nn.Dropout(kpnet_dropout), - nn.utils.weight_norm( + nn.utils.parametrizations.weight_norm( nn.Conv1d( kpnet_hidden_channels, kpnet_hidden_channels, @@ -56,7 +59,7 @@ def __init__( # pylint: disable=dangerous-default-value ) ), getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - nn.utils.weight_norm( + nn.utils.parametrizations.weight_norm( nn.Conv1d( kpnet_hidden_channels, kpnet_hidden_channels, @@ -68,7 +71,7 @@ def __init__( # pylint: disable=dangerous-default-value getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), ) ) - self.kernel_conv = nn.utils.weight_norm( + self.kernel_conv = nn.utils.parametrizations.weight_norm( nn.Conv1d( kpnet_hidden_channels, kpnet_kernel_channels, @@ -77,7 +80,7 @@ def __init__( # pylint: disable=dangerous-default-value bias=True, ) ) - self.bias_conv = nn.utils.weight_norm( + self.bias_conv = nn.utils.parametrizations.weight_norm( nn.Conv1d( kpnet_hidden_channels, kpnet_bias_channels, @@ -117,9 +120,9 @@ def forward(self, c): return kernels, bias def remove_weight_norm(self): - nn.utils.remove_weight_norm(self.input_conv[0]) - nn.utils.remove_weight_norm(self.kernel_conv) - nn.utils.remove_weight_norm(self.bias_conv) + parametrize.remove_parametrizations(self.input_conv[0], "weight") + parametrize.remove_parametrizations(self.kernel_conv, "weight") + parametrize.remove_parametrizations(self.bias_conv, "weight") for block in self.residual_convs: - nn.utils.remove_weight_norm(block[1]) - nn.utils.remove_weight_norm(block[3]) + parametrize.remove_parametrizations(block[1], "weight") + parametrize.remove_parametrizations(block[3], "weight") diff --git a/TTS/tts/layers/generic/wavenet.py b/TTS/tts/layers/generic/wavenet.py index bc89da4fbe..f8de63b49f 100644 --- a/TTS/tts/layers/generic/wavenet.py +++ b/TTS/tts/layers/generic/wavenet.py @@ -1,5 +1,6 @@ import torch from torch import nn +from torch.nn.utils import parametrize @torch.jit.script @@ -62,7 +63,7 @@ def __init__( # init conditioning layer if c_in_channels > 0: cond_layer = torch.nn.Conv1d(c_in_channels, 2 * hidden_channels * num_layers, 1) 
- self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + self.cond_layer = torch.nn.utils.parametrizations.weight_norm(cond_layer, name="weight") # intermediate layers for i in range(num_layers): dilation = dilation_rate**i @@ -75,7 +76,7 @@ def __init__( in_layer = torch.nn.Conv1d( hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding ) - in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + in_layer = torch.nn.utils.parametrizations.weight_norm(in_layer, name="weight") self.in_layers.append(in_layer) if i < num_layers - 1: @@ -84,7 +85,7 @@ def __init__( res_skip_channels = hidden_channels res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer, name="weight") self.res_skip_layers.append(res_skip_layer) # setup weight norm if not weight_norm: @@ -115,11 +116,11 @@ def forward(self, x, x_mask=None, g=None, **kwargs): # pylint: disable=unused-a def remove_weight_norm(self): if self.c_in_channels != 0: - torch.nn.utils.remove_weight_norm(self.cond_layer) + parametrize.remove_parametrizations(self.cond_layer, "weight") for l in self.in_layers: - torch.nn.utils.remove_weight_norm(l) + parametrize.remove_parametrizations(l, "weight") for l in self.res_skip_layers: - torch.nn.utils.remove_weight_norm(l) + parametrize.remove_parametrizations(l, "weight") class WNBlocks(nn.Module): diff --git a/TTS/tts/layers/glow_tts/glow.py b/TTS/tts/layers/glow_tts/glow.py index 273c62a5c0..b02c311808 100644 --- a/TTS/tts/layers/glow_tts/glow.py +++ b/TTS/tts/layers/glow_tts/glow.py @@ -186,7 +186,7 @@ def __init__( self.sigmoid_scale = sigmoid_scale # input layer start = torch.nn.Conv1d(in_channels // 2, hidden_channels, 1) - start = torch.nn.utils.weight_norm(start) + start = torch.nn.utils.parametrizations.weight_norm(start) self.start = start # output layer # Initializing last layer to 0 makes the affine coupling layers diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py index cb350af779..7bea02ca08 100644 --- a/TTS/tts/layers/tortoise/diffusion.py +++ b/TTS/tts/layers/tortoise/diffusion.py @@ -13,12 +13,18 @@ import numpy as np import torch import torch as th -from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral from tqdm import tqdm from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper -K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m} +try: + from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral + + K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m} +except ImportError: + K_DIFFUSION_SAMPLERS = None + + SAMPLERS = ["dpm++2m", "p", "ddim"] @@ -531,6 +537,8 @@ def sample_loop(self, *args, **kwargs): if self.conditioning_free is not True: raise RuntimeError("cond_free must be true") with tqdm(total=self.num_timesteps) as pbar: + if K_DIFFUSION_SAMPLERS is None: + raise ModuleNotFoundError("Install k_diffusion for using k_diffusion samplers") return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs) else: raise RuntimeError("sampler not impl") diff --git a/TTS/tts/layers/tortoise/vocoder.py b/TTS/tts/layers/tortoise/vocoder.py index 47365eb58d..a5200c2673 100644 --- a/TTS/tts/layers/tortoise/vocoder.py +++ b/TTS/tts/layers/tortoise/vocoder.py @@ -1,4 +1,3 @@ -import 
json
 from dataclasses import dataclass
 from enum import Enum
 from typing import Callable, Optional
@@ -6,6 +5,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import torch.nn.utils.parametrize as parametrize
 
 MAX_WAV_VALUE = 32768.0
 
@@ -44,7 +44,9 @@ def __init__(
         kpnet_bias_channels = conv_out_channels * conv_layers  # l_b
 
         self.input_conv = nn.Sequential(
-            nn.utils.weight_norm(nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)),
+            nn.utils.parametrizations.weight_norm(
+                nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)
+            ),
             getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
         )
 
@@ -54,7 +56,7 @@ def __init__(
             self.residual_convs.append(
                 nn.Sequential(
                     nn.Dropout(kpnet_dropout),
-                    nn.utils.weight_norm(
+                    nn.utils.parametrizations.weight_norm(
                         nn.Conv1d(
                             kpnet_hidden_channels,
                             kpnet_hidden_channels,
@@ -64,7 +66,7 @@ def __init__(
                         )
                     ),
                     getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
-                    nn.utils.weight_norm(
+                    nn.utils.parametrizations.weight_norm(
                         nn.Conv1d(
                             kpnet_hidden_channels,
                             kpnet_hidden_channels,
@@ -76,7 +78,7 @@ def __init__(
                     getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
                 )
             )
-        self.kernel_conv = nn.utils.weight_norm(
+        self.kernel_conv = nn.utils.parametrizations.weight_norm(
            nn.Conv1d(
                 kpnet_hidden_channels,
                 kpnet_kernel_channels,
@@ -85,7 +87,7 @@ def __init__(
                 bias=True,
             )
         )
-        self.bias_conv = nn.utils.weight_norm(
+        self.bias_conv = nn.utils.parametrizations.weight_norm(
             nn.Conv1d(
                 kpnet_hidden_channels,
                 kpnet_bias_channels,
@@ -125,12 +127,12 @@ def forward(self, c):
         return kernels, bias
 
     def remove_weight_norm(self):
-        nn.utils.remove_weight_norm(self.input_conv[0])
-        nn.utils.remove_weight_norm(self.kernel_conv)
-        nn.utils.remove_weight_norm(self.bias_conv)
+        parametrize.remove_parametrizations(self.input_conv[0], "weight")
+        parametrize.remove_parametrizations(self.kernel_conv, "weight")
+        parametrize.remove_parametrizations(self.bias_conv, "weight")
         for block in self.residual_convs:
-            nn.utils.remove_weight_norm(block[1])
-            nn.utils.remove_weight_norm(block[3])
+            parametrize.remove_parametrizations(block[1], "weight")
+            parametrize.remove_parametrizations(block[3], "weight")
 
 
 class LVCBlock(torch.nn.Module):
@@ -169,7 +171,7 @@ def __init__(
 
         self.convt_pre = nn.Sequential(
             nn.LeakyReLU(lReLU_slope),
-            nn.utils.weight_norm(
+            nn.utils.parametrizations.weight_norm(
                 nn.ConvTranspose1d(
                     in_channels,
                     in_channels,
@@ -186,7 +188,7 @@ def __init__(
             self.conv_blocks.append(
                 nn.Sequential(
                     nn.LeakyReLU(lReLU_slope),
-                    nn.utils.weight_norm(
+                    nn.utils.parametrizations.weight_norm(
                         nn.Conv1d(
                             in_channels,
                             in_channels,
@@ -267,9 +269,9 @@ def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=25
 
     def remove_weight_norm(self):
         self.kernel_predictor.remove_weight_norm()
-        nn.utils.remove_weight_norm(self.convt_pre[1])
+        parametrize.remove_parametrizations(self.convt_pre[1], "weight")
         for block in self.conv_blocks:
-            nn.utils.remove_weight_norm(block[1])
+            parametrize.remove_parametrizations(block[1], "weight")
 
 
 class UnivNetGenerator(nn.Module):
@@ -314,11 +316,13 @@ def __init__(
                 )
             )
 
-        self.conv_pre = nn.utils.weight_norm(nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode="reflect"))
+        self.conv_pre = nn.utils.parametrizations.weight_norm(
+            nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode="reflect")
+        )
 
         self.conv_post = nn.Sequential(
             nn.LeakyReLU(lReLU_slope),
-            
nn.utils.weight_norm(nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode="reflect")), + nn.utils.parametrizations.weight_norm(nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode="reflect")), nn.Tanh(), ) @@ -346,11 +350,11 @@ def eval(self, inference=False): self.remove_weight_norm() def remove_weight_norm(self): - nn.utils.remove_weight_norm(self.conv_pre) + parametrize.remove_parametrizations(self.conv_pre, "weight") for layer in self.conv_post: if len(layer.state_dict()) != 0: - nn.utils.remove_weight_norm(layer) + parametrize.remove_parametrizations(layer, "weight") for res_block in self.res_stack: res_block.remove_weight_norm() diff --git a/TTS/tts/layers/vits/discriminator.py b/TTS/tts/layers/vits/discriminator.py index 148f283c90..c27d11bef6 100644 --- a/TTS/tts/layers/vits/discriminator.py +++ b/TTS/tts/layers/vits/discriminator.py @@ -14,7 +14,7 @@ class DiscriminatorS(torch.nn.Module): def __init__(self, use_spectral_norm=False): super().__init__() - norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm + norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.parametrizations.weight_norm self.convs = nn.ModuleList( [ norm_f(Conv1d(1, 16, 15, 1, padding=7)), diff --git a/TTS/tts/layers/xtts/diffusion.py b/TTS/tts/layers/xtts/diffusion.py deleted file mode 100644 index 37665bc676..0000000000 --- a/TTS/tts/layers/xtts/diffusion.py +++ /dev/null @@ -1,1319 +0,0 @@ -import enum -import math - -import numpy as np -import torch -import torch as th -from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral -from tqdm import tqdm - -from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper - -K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m} -SAMPLERS = ["dpm++2m", "p", "ddim"] - - -def normal_kl(mean1, logvar1, mean2, logvar2): - """ - Compute the KL divergence between two gaussians. - - Shapes are automatically broadcasted, so batches can be compared to - scalars, among other use cases. - """ - tensor = None - for obj in (mean1, logvar1, mean2, logvar2): - if isinstance(obj, th.Tensor): - tensor = obj - break - assert tensor is not None, "at least one argument must be a Tensor" - - # Force variances to be Tensors. Broadcasting helps convert scalars to - # Tensors, but it does not work for th.exp(). - logvar1, logvar2 = [x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) for x in (logvar1, logvar2)] - - return 0.5 * (-1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * th.exp(-logvar2)) - - -def approx_standard_normal_cdf(x): - """ - A fast approximation of the cumulative distribution function of the - standard normal. - """ - return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) - - -def discretized_gaussian_log_likelihood(x, *, means, log_scales): - """ - Compute the log-likelihood of a Gaussian distribution discretizing to a - given image. - - :param x: the target images. It is assumed that this was uint8 values, - rescaled to the range [-1, 1]. - :param means: the Gaussian mean Tensor. - :param log_scales: the Gaussian log stddev Tensor. - :return: a tensor like x of log probabilities (in nats). 
- """ - assert x.shape == means.shape == log_scales.shape - centered_x = x - means - inv_stdv = th.exp(-log_scales) - plus_in = inv_stdv * (centered_x + 1.0 / 255.0) - cdf_plus = approx_standard_normal_cdf(plus_in) - min_in = inv_stdv * (centered_x - 1.0 / 255.0) - cdf_min = approx_standard_normal_cdf(min_in) - log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) - log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) - cdf_delta = cdf_plus - cdf_min - log_probs = th.where( - x < -0.999, - log_cdf_plus, - th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), - ) - assert log_probs.shape == x.shape - return log_probs - - -def mean_flat(tensor): - """ - Take the mean over all non-batch dimensions. - """ - return tensor.mean(dim=list(range(1, len(tensor.shape)))) - - -def get_named_beta_schedule(schedule_name, num_diffusion_timesteps): - """ - Get a pre-defined beta schedule for the given name. - - The beta schedule library consists of beta schedules which remain similar - in the limit of num_diffusion_timesteps. - Beta schedules may be added, but should not be removed or changed once - they are committed to maintain backwards compatibility. - """ - if schedule_name == "linear": - # Linear schedule from Ho et al, extended to work for any number of - # diffusion steps. - scale = 1000 / num_diffusion_timesteps - beta_start = scale * 0.0001 - beta_end = scale * 0.02 - return np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64) - elif schedule_name == "cosine": - return betas_for_alpha_bar( - num_diffusion_timesteps, - lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2, - ) - else: - raise NotImplementedError(f"unknown beta schedule: {schedule_name}") - - -def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, - which defines the cumulative product of (1-beta) over time from t = [0,1]. - - :param num_diffusion_timesteps: the number of betas to produce. - :param alpha_bar: a lambda that takes an argument t from 0 to 1 and - produces the cumulative product of (1-beta) up to that - part of the diffusion process. - :param max_beta: the maximum beta to use; use values lower than 1 to - prevent singularities. - """ - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) - return np.array(betas) - - -class ModelMeanType(enum.Enum): - """ - Which type of output the model predicts. - """ - - PREVIOUS_X = "previous_x" # the model predicts x_{t-1} - START_X = "start_x" # the model predicts x_0 - EPSILON = "epsilon" # the model predicts epsilon - - -class ModelVarType(enum.Enum): - """ - What is used as the model's output variance. - - The LEARNED_RANGE option has been added to allow the model to predict - values between FIXED_SMALL and FIXED_LARGE, making its job easier. 
- """ - - LEARNED = "learned" - FIXED_SMALL = "fixed_small" - FIXED_LARGE = "fixed_large" - LEARNED_RANGE = "learned_range" - - -class LossType(enum.Enum): - MSE = "mse" # use raw MSE loss (and KL when learning variances) - RESCALED_MSE = "rescaled_mse" # use raw MSE loss (with RESCALED_KL when learning variances) - KL = "kl" # use the variational lower-bound - RESCALED_KL = "rescaled_kl" # like KL, but rescale to estimate the full VLB - - def is_vb(self): - return self == LossType.KL or self == LossType.RESCALED_KL - - -class GaussianDiffusion: - """ - Utilities for training and sampling diffusion models. - - Ported directly from here, and then adapted over time to further experimentation. - https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42 - - :param betas: a 1-D numpy array of betas for each diffusion timestep, - starting at T and going to 1. - :param model_mean_type: a ModelMeanType determining what the model outputs. - :param model_var_type: a ModelVarType determining how variance is output. - :param loss_type: a LossType determining the loss function to use. - :param rescale_timesteps: if True, pass floating point timesteps into the - model so that they are always scaled like in the - original paper (0 to 1000). - """ - - def __init__( - self, - *, - betas, - model_mean_type, - model_var_type, - loss_type, - rescale_timesteps=False, # this is generally False - conditioning_free=False, - conditioning_free_k=1, - ramp_conditioning_free=True, - sampler="ddim", - ): - self.sampler = sampler - self.model_mean_type = ModelMeanType(model_mean_type) - self.model_var_type = ModelVarType(model_var_type) - self.loss_type = LossType(loss_type) - self.rescale_timesteps = rescale_timesteps - self.conditioning_free = conditioning_free - self.conditioning_free_k = conditioning_free_k - self.ramp_conditioning_free = ramp_conditioning_free - - # Use float64 for accuracy. - betas = np.array(betas, dtype=np.float64) - self.betas = betas - assert len(betas.shape) == 1, "betas must be 1-D" - assert (betas > 0).all() and (betas <= 1).all() - - self.num_timesteps = int(betas.shape[0]) - - alphas = 1.0 - betas - self.alphas_cumprod = np.cumprod(alphas, axis=0) - self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) - self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0) - assert self.alphas_cumprod_prev.shape == (self.num_timesteps,) - - # calculations for diffusion q(x_t | x_{t-1}) and others - self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod) - self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod) - self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod) - self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod) - self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1) - - # calculations for posterior q(x_{t-1} | x_t, x_0) - self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) - # log calculation clipped because the posterior variance is 0 at the - # beginning of the diffusion chain. - self.posterior_log_variance_clipped = np.log(np.append(self.posterior_variance[1], self.posterior_variance[1:])) - self.posterior_mean_coef1 = betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) - self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod) - - def q_mean_variance(self, x_start, t): - """ - Get the distribution q(x_t | x_0). 
- - :param x_start: the [N x C x ...] tensor of noiseless inputs. - :param t: the number of diffusion steps (minus 1). Here, 0 means one step. - :return: A tuple (mean, variance, log_variance), all of x_start's shape. - """ - mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start - variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) - log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape) - return mean, variance, log_variance - - def q_sample(self, x_start, t, noise=None): - """ - Diffuse the data for a given number of diffusion steps. - - In other words, sample from q(x_t | x_0). - - :param x_start: the initial data batch. - :param t: the number of diffusion steps (minus 1). Here, 0 means one step. - :param noise: if specified, the split-out normal noise. - :return: A noisy version of x_start. - """ - if noise is None: - noise = th.randn_like(x_start) - assert noise.shape == x_start.shape - return ( - _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start - + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise - ) - - def q_posterior_mean_variance(self, x_start, x_t, t): - """ - Compute the mean and variance of the diffusion posterior: - - q(x_{t-1} | x_t, x_0) - - """ - assert x_start.shape == x_t.shape - posterior_mean = ( - _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start - + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t - ) - posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape) - posterior_log_variance_clipped = _extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape) - assert ( - posterior_mean.shape[0] - == posterior_variance.shape[0] - == posterior_log_variance_clipped.shape[0] - == x_start.shape[0] - ) - return posterior_mean, posterior_variance, posterior_log_variance_clipped - - def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None): - """ - Apply the model to get p(x_{t-1} | x_t), as well as a prediction of - the initial x, x_0. - - :param model: the model, which takes a signal and a batch of timesteps - as input. - :param x: the [N x C x ...] tensor at time t. - :param t: a 1-D Tensor of timesteps. - :param clip_denoised: if True, clip the denoised signal into [-1, 1]. - :param denoised_fn: if not None, a function which applies to the - x_start prediction before it is used to sample. Applies before - clip_denoised. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :return: a dict with the following keys: - - 'mean': the model mean output. - - 'variance': the model variance output. - - 'log_variance': the log of 'variance'. - - 'pred_xstart': the prediction for x_0. 
- """ - if model_kwargs is None: - model_kwargs = {} - - assert self.model_var_type == ModelVarType.LEARNED_RANGE - assert self.model_mean_type == ModelMeanType.EPSILON - assert denoised_fn is None - assert clip_denoised is True - B, C = x.shape[:2] - assert t.shape == (B,) - model_output = model(x, self._scale_timesteps(t), **model_kwargs) - if self.conditioning_free: - model_output_no_conditioning = model(x, self._scale_timesteps(t), conditioning_free=True, **model_kwargs) - - if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]: - assert model_output.shape == (B, C * 2, *x.shape[2:]) - model_output, model_var_values = th.split(model_output, C, dim=1) - if self.conditioning_free: - model_output_no_conditioning, _ = th.split(model_output_no_conditioning, C, dim=1) - if self.model_var_type == ModelVarType.LEARNED: - assert False - model_log_variance = model_var_values - model_variance = th.exp(model_log_variance) - else: - min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape) - max_log = _extract_into_tensor(np.log(self.betas), t, x.shape) - # The model_var_values is [-1, 1] for [min_var, max_var]. - frac = (model_var_values + 1) / 2 - model_log_variance = frac * max_log + (1 - frac) * min_log - model_variance = th.exp(model_log_variance) - else: - assert False - model_variance, model_log_variance = { - # for fixedlarge, we set the initial (log-)variance like so - # to get a better decoder log likelihood. - ModelVarType.FIXED_LARGE: ( - np.append(self.posterior_variance[1], self.betas[1:]), - np.log(np.append(self.posterior_variance[1], self.betas[1:])), - ), - ModelVarType.FIXED_SMALL: ( - self.posterior_variance, - self.posterior_log_variance_clipped, - ), - }[self.model_var_type] - model_variance = _extract_into_tensor(model_variance, t, x.shape) - model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape) - - if self.conditioning_free: - if self.ramp_conditioning_free: - assert t.shape[0] == 1 # This should only be used in inference. 
- cfk = self.conditioning_free_k * (1 - self._scale_timesteps(t)[0].item() / self.num_timesteps) - else: - cfk = self.conditioning_free_k - model_output = (1 + cfk) * model_output - cfk * model_output_no_conditioning - - def process_xstart(x): - if denoised_fn is not None: - assert False - x = denoised_fn(x) - if clip_denoised: - return x.clamp(-1, 1) - assert False - return x - - if self.model_mean_type == ModelMeanType.PREVIOUS_X: - assert False - pred_xstart = process_xstart(self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)) - model_mean = model_output - elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]: - if self.model_mean_type == ModelMeanType.START_X: - assert False - pred_xstart = process_xstart(model_output) - else: - pred_xstart = process_xstart(self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)) - model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t) - else: - raise NotImplementedError(self.model_mean_type) - - assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape - return { - "mean": model_mean, - "variance": model_variance, - "log_variance": model_log_variance, - "pred_xstart": pred_xstart, - } - - def _predict_xstart_from_eps(self, x_t, t, eps): - assert x_t.shape == eps.shape - return ( - _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps - ) - - def _predict_xstart_from_xprev(self, x_t, t, xprev): - assert x_t.shape == xprev.shape - return ( # (xprev - coef2*x_t) / coef1 - _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev - - _extract_into_tensor(self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape) * x_t - ) - - def _predict_eps_from_xstart(self, x_t, t, pred_xstart): - return ( - _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart - ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) - - def _scale_timesteps(self, t): - if self.rescale_timesteps: - return t.float() * (1000.0 / self.num_timesteps) - return t - - def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None): - """ - Compute the mean for the previous step, given a function cond_fn that - computes the gradient of a conditional log probability with respect to - x. In particular, cond_fn computes grad(log(p(y|x))), and we want to - condition on y. - - This uses the conditioning strategy from Sohl-Dickstein et al. (2015). - """ - gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs) - new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float() - return new_mean - - def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None): - """ - Compute what the p_mean_variance output would have been, should the - model's score function be conditioned by cond_fn. - - See condition_mean() for details on cond_fn. - - Unlike condition_mean(), this instead uses the conditioning strategy - from Song et al (2020). 
- """ - alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) - - eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) - eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, self._scale_timesteps(t), **model_kwargs) - - out = p_mean_var.copy() - out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) - out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t) - return out - - def p_sample( - self, - model, - x, - t, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - ): - """ - Sample x_{t-1} from the model at the given timestep. - - :param model: the model to sample from. - :param x: the current tensor at x_{t-1}. - :param t: the value of t, starting at 0 for the first diffusion step. - :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. - :param denoised_fn: if not None, a function which applies to the - x_start prediction before it is used to sample. - :param cond_fn: if not None, this is a gradient function that acts - similarly to the model. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :return: a dict containing the following keys: - - 'sample': a random sample from the model. - - 'pred_xstart': a prediction of x_0. - """ - out = self.p_mean_variance( - model, - x, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - model_kwargs=model_kwargs, - ) - noise = th.randn_like(x) - nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) # no noise when t == 0 - if cond_fn is not None: - out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs) - sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise - return {"sample": sample, "pred_xstart": out["pred_xstart"]} - - def k_diffusion_sample_loop( - self, - k_sampler, - pbar, - model, - shape, - noise=None, # all given - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - device=None, # ALL UNUSED - model_kwargs=None, # {'precomputed_aligned_embeddings': precomputed_embeddings}, - progress=False, # unused as well - ): - assert isinstance(model_kwargs, dict) - if device is None: - device = next(model.parameters()).device - s_in = noise.new_ones([noise.shape[0]]) - - def model_split(*args, **kwargs): - model_output = model(*args, **kwargs) - model_epsilon, model_var = th.split(model_output, model_output.shape[1] // 2, dim=1) - return model_epsilon, model_var - - # - """ - print(self.betas) - print(th.tensor(self.betas)) - noise_schedule = NoiseScheduleVP(schedule='discrete', betas=th.tensor(self.betas)) - """ - noise_schedule = NoiseScheduleVP(schedule="linear", continuous_beta_0=0.1 / 4, continuous_beta_1=20.0 / 4) - - def model_fn_prewrap(x, t, *args, **kwargs): - """ - x_in = torch.cat([x] * 2) - t_in = torch.cat([t_continuous] * 2) - c_in = torch.cat([unconditional_condition, condition]) - noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2) - print(t) - print(self.timestep_map) - exit() - """ - """ - model_output = model(x, self._scale_timesteps(t*4000), **model_kwargs) - out = self.p_mean_variance(model, x, t*4000, model_kwargs=model_kwargs) - return out['pred_xstart'] - """ - x, _ = x.chunk(2) - t, _ = (t * 1000).chunk(2) - res = torch.cat( - [ - model_split(x, t, conditioning_free=True, **model_kwargs)[0], - model_split(x, t, **model_kwargs)[0], - ] - ) - pbar.update(1) - return res - - model_fn = model_wrapper( - model_fn_prewrap, - noise_schedule, - 
model_type="noise", # "noise" or "x_start" or "v" or "score" - model_kwargs=model_kwargs, - guidance_type="classifier-free", - condition=th.Tensor(1), - unconditional_condition=th.Tensor(1), - guidance_scale=self.conditioning_free_k, - ) - """ - model_fn = model_wrapper( - model_fn_prewrap, - noise_schedule, - model_type='x_start', - model_kwargs={} - ) - # - dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver") - x_sample = dpm_solver.sample( - noise, - steps=20, - order=3, - skip_type="time_uniform", - method="singlestep", - ) - """ - dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++") - x_sample = dpm_solver.sample( - noise, - steps=self.num_timesteps, - order=2, - skip_type="time_uniform", - method="multistep", - ) - #''' - return x_sample - - # HF DIFFUSION ATTEMPT - """ - from .hf_diffusion import EulerAncestralDiscreteScheduler - Scheduler = EulerAncestralDiscreteScheduler() - Scheduler.set_timesteps(100) - for timestep in Scheduler.timesteps: - noise_input = Scheduler.scale_model_input(noise, timestep) - ts = s_in * timestep - model_output = model(noise_input, ts, **model_kwargs) - model_epsilon, _model_var = th.split(model_output, model_output.shape[1]//2, dim=1) - noise, _x0 = Scheduler.step(model_epsilon, timestep, noise) - return noise - """ - - # KARRAS DIFFUSION ATTEMPT - """ - TRAINED_DIFFUSION_STEPS = 4000 # HARDCODED - ratio = TRAINED_DIFFUSION_STEPS/14.5 - def call_model(*args, **kwargs): - model_output = model(*args, **kwargs) - model_output, model_var_values = th.split(model_output, model_output.shape[1]//2, dim=1) - return model_output - print(get_sigmas_karras(self.num_timesteps, sigma_min=0.0, sigma_max=4000, device=device)) - exit() - sigmas = get_sigmas_karras(self.num_timesteps, sigma_min=0.03, sigma_max=14.5, device=device) - return k_sampler(call_model, noise, sigmas, extra_args=model_kwargs, disable=not progress) - ''' - sigmas = get_sigmas_karras(self.num_timesteps, sigma_min=0.03, sigma_max=14.5, device=device) - step = 0 # LMAO - global_sigmas = None - # - def fakemodel(x, t, **model_kwargs): - print(t,global_sigmas*ratio) - return model(x, t, **model_kwargs) - def denoised(x, sigmas, **extra_args): - t = th.tensor([self.num_timesteps-step-1] * shape[0], device=device) - nonlocal global_sigmas - global_sigmas = sigmas - with th.no_grad(): - out = self.p_sample( - fakemodel, - x, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - ) - return out["sample"] - def callback(d): - nonlocal step - step += 1 - - return k_sampler(denoised, noise, sigmas, extra_args=model_kwargs, callback=callback, disable=not progress) - ''' - """ - - def sample_loop(self, *args, **kwargs): - s = self.sampler - if s == "p": - return self.p_sample_loop(*args, **kwargs) - elif s == "ddim": - return self.ddim_sample_loop(*args, **kwargs) - elif s == "dpm++2m": - if self.conditioning_free is not True: - raise RuntimeError("cond_free must be true") - with tqdm(total=self.num_timesteps) as pbar: - return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs) - else: - raise RuntimeError("sampler not impl") - - def p_sample_loop( - self, - model, - shape, - noise=None, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - device=None, - progress=False, - ): - """ - Generate samples from the model. - - :param model: the model module. - :param shape: the shape of the samples, (N, C, H, W). 
- :param noise: if specified, the noise from the encoder to sample. - Should be of the same shape as `shape`. - :param clip_denoised: if True, clip x_start predictions to [-1, 1]. - :param denoised_fn: if not None, a function which applies to the - x_start prediction before it is used to sample. - :param cond_fn: if not None, this is a gradient function that acts - similarly to the model. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :param device: if specified, the device to create the samples on. - If not specified, use a model parameter's device. - :param progress: if True, show a tqdm progress bar. - :return: a non-differentiable batch of samples. - """ - final = None - for sample in self.p_sample_loop_progressive( - model, - shape, - noise=noise, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - device=device, - progress=progress, - ): - final = sample - return final["sample"] - - def p_sample_loop_progressive( - self, - model, - shape, - noise=None, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - device=None, - progress=False, - ): - """ - Generate samples from the model and yield intermediate samples from - each timestep of diffusion. - - Arguments are the same as p_sample_loop(). - Returns a generator over dicts, where each dict is the return value of - p_sample(). - """ - if device is None: - device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) - if noise is not None: - img = noise - else: - img = th.randn(*shape, device=device) - indices = list(range(self.num_timesteps))[::-1] - - for i in tqdm(indices, disable=not progress): - t = th.tensor([i] * shape[0], device=device) - with th.no_grad(): - out = self.p_sample( - model, - img, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - ) - yield out - img = out["sample"] - - def ddim_sample( - self, - model, - x, - t, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - eta=0.0, - ): - """ - Sample x_{t-1} from the model using DDIM. - - Same usage as p_sample(). - """ - out = self.p_mean_variance( - model, - x, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - model_kwargs=model_kwargs, - ) - if cond_fn is not None: - out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs) - - # Usually our model outputs epsilon, but we re-derive it - # in case we used x_start or x_prev prediction. - eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) - - alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) - alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) - sigma = eta * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * th.sqrt(1 - alpha_bar / alpha_bar_prev) - # Equation 12. - noise = th.randn_like(x) - mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev - sigma**2) * eps - nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) # no noise when t == 0 - sample = mean_pred + nonzero_mask * sigma * noise - return {"sample": sample, "pred_xstart": out["pred_xstart"]} - - def ddim_reverse_sample( - self, - model, - x, - t, - clip_denoised=True, - denoised_fn=None, - model_kwargs=None, - eta=0.0, - ): - """ - Sample x_{t+1} from the model using DDIM reverse ODE. 
- """ - assert eta == 0.0, "Reverse ODE only for deterministic path" - out = self.p_mean_variance( - model, - x, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - model_kwargs=model_kwargs, - ) - # Usually our model outputs epsilon, but we re-derive it - # in case we used x_start or x_prev prediction. - eps = ( - _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x - out["pred_xstart"] - ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape) - alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape) - - # Equation 12. reversed - mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps - - return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]} - - def ddim_sample_loop( - self, - model, - shape, - noise=None, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - device=None, - progress=False, - eta=0.0, - ): - """ - Generate samples from the model using DDIM. - - Same usage as p_sample_loop(). - """ - final = None - for sample in self.ddim_sample_loop_progressive( - model, - shape, - noise=noise, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - device=device, - progress=progress, - eta=eta, - ): - final = sample - return final["sample"] - - def ddim_sample_loop_progressive( - self, - model, - shape, - noise=None, - clip_denoised=True, - denoised_fn=None, - cond_fn=None, - model_kwargs=None, - device=None, - progress=False, - eta=0.0, - ): - """ - Use DDIM to sample from the model and yield intermediate samples from - each timestep of DDIM. - - Same usage as p_sample_loop_progressive(). - """ - if device is None: - device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) - if noise is not None: - img = noise - else: - img = th.randn(*shape, device=device) - indices = list(range(self.num_timesteps))[::-1] - - if progress: - # Lazy import so that we don't depend on tqdm. - from tqdm.auto import tqdm - - indices = tqdm(indices, disable=not progress) - - for i in indices: - t = th.tensor([i] * shape[0], device=device) - with th.no_grad(): - out = self.ddim_sample( - model, - img, - t, - clip_denoised=clip_denoised, - denoised_fn=denoised_fn, - cond_fn=cond_fn, - model_kwargs=model_kwargs, - eta=eta, - ) - yield out - img = out["sample"] - - def _vb_terms_bpd(self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None): - """ - Get a term for the variational lower-bound. - - The resulting units are bits (rather than nats, as one might expect). - This allows for comparison to other papers. - - :return: a dict with the following keys: - - 'output': a shape [N] tensor of NLLs or KLs. - - 'pred_xstart': the x_0 predictions. 
- """ - true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t) - out = self.p_mean_variance(model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs) - kl = normal_kl(true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]) - kl = mean_flat(kl) / np.log(2.0) - - decoder_nll = -discretized_gaussian_log_likelihood( - x_start, means=out["mean"], log_scales=0.5 * out["log_variance"] - ) - assert decoder_nll.shape == x_start.shape - decoder_nll = mean_flat(decoder_nll) / np.log(2.0) - - # At the first timestep return the decoder NLL, - # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t)) - output = th.where((t == 0), decoder_nll, kl) - return {"output": output, "pred_xstart": out["pred_xstart"]} - - def training_losses(self, model, x_start, t, model_kwargs=None, noise=None): - """ - Compute training losses for a single timestep. - - :param model: the model to evaluate loss on. - :param x_start: the [N x C x ...] tensor of inputs. - :param t: a batch of timestep indices. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :param noise: if specified, the specific Gaussian noise to try to remove. - :return: a dict with the key "loss" containing a tensor of shape [N]. - Some mean or variance settings may also have other keys. - """ - if model_kwargs is None: - model_kwargs = {} - if noise is None: - noise = th.randn_like(x_start) - x_t = self.q_sample(x_start, t, noise=noise) - - terms = {} - - if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL: - # TODO: support multiple model outputs for this mode. - terms["loss"] = self._vb_terms_bpd( - model=model, - x_start=x_start, - x_t=x_t, - t=t, - clip_denoised=False, - model_kwargs=model_kwargs, - )["output"] - if self.loss_type == LossType.RESCALED_KL: - terms["loss"] *= self.num_timesteps - elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: - model_outputs = model(x_t, self._scale_timesteps(t), **model_kwargs) - if isinstance(model_outputs, tuple): - model_output = model_outputs[0] - terms["extra_outputs"] = model_outputs[1:] - else: - model_output = model_outputs - - if self.model_var_type in [ - ModelVarType.LEARNED, - ModelVarType.LEARNED_RANGE, - ]: - B, C = x_t.shape[:2] - assert model_output.shape == (B, C * 2, *x_t.shape[2:]) - model_output, model_var_values = th.split(model_output, C, dim=1) - # Learn the variance using the variational bound, but don't let - # it affect our mean prediction. - frozen_out = th.cat([model_output.detach(), model_var_values], dim=1) - terms["vb"] = self._vb_terms_bpd( - model=lambda *args, r=frozen_out: r, - x_start=x_start, - x_t=x_t, - t=t, - clip_denoised=False, - )["output"] - if self.loss_type == LossType.RESCALED_MSE: - # Divide by 1000 for equivalence with initial implementation. - # Without a factor of 1/1000, the VB term hurts the MSE term. - terms["vb"] *= self.num_timesteps / 1000.0 - - if self.model_mean_type == ModelMeanType.PREVIOUS_X: - target = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0] - x_start_pred = torch.zeros(x_start) # Not supported. 
- elif self.model_mean_type == ModelMeanType.START_X: - target = x_start - x_start_pred = model_output - elif self.model_mean_type == ModelMeanType.EPSILON: - target = noise - x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output) - else: - raise NotImplementedError(self.model_mean_type) - assert model_output.shape == target.shape == x_start.shape - terms["mse"] = mean_flat((target - model_output) ** 2) - terms["x_start_predicted"] = x_start_pred - if "vb" in terms: - terms["loss"] = terms["mse"] + terms["vb"] - else: - terms["loss"] = terms["mse"] - else: - raise NotImplementedError(self.loss_type) - - return terms - - def autoregressive_training_losses( - self, - model, - x_start, - t, - model_output_keys, - gd_out_key, - model_kwargs=None, - noise=None, - ): - """ - Compute training losses for a single timestep. - - :param model: the model to evaluate loss on. - :param x_start: the [N x C x ...] tensor of inputs. - :param t: a batch of timestep indices. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - :param noise: if specified, the specific Gaussian noise to try to remove. - :return: a dict with the key "loss" containing a tensor of shape [N]. - Some mean or variance settings may also have other keys. - """ - if model_kwargs is None: - model_kwargs = {} - if noise is None: - noise = th.randn_like(x_start) - x_t = self.q_sample(x_start, t, noise=noise) - terms = {} - if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL: - assert False # not currently supported for this type of diffusion. - elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: - model_outputs = model(x_t, x_start, self._scale_timesteps(t), **model_kwargs) - terms.update({k: o for k, o in zip(model_output_keys, model_outputs)}) - model_output = terms[gd_out_key] - if self.model_var_type in [ - ModelVarType.LEARNED, - ModelVarType.LEARNED_RANGE, - ]: - B, C = x_t.shape[:2] - assert model_output.shape == (B, C, 2, *x_t.shape[2:]) - model_output, model_var_values = ( - model_output[:, :, 0], - model_output[:, :, 1], - ) - # Learn the variance using the variational bound, but don't let - # it affect our mean prediction. - frozen_out = th.cat([model_output.detach(), model_var_values], dim=1) - terms["vb"] = self._vb_terms_bpd( - model=lambda *args, r=frozen_out: r, - x_start=x_start, - x_t=x_t, - t=t, - clip_denoised=False, - )["output"] - if self.loss_type == LossType.RESCALED_MSE: - # Divide by 1000 for equivalence with initial implementation. - # Without a factor of 1/1000, the VB term hurts the MSE term. - terms["vb"] *= self.num_timesteps / 1000.0 - - if self.model_mean_type == ModelMeanType.PREVIOUS_X: - target = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0] - x_start_pred = torch.zeros(x_start) # Not supported. 
- elif self.model_mean_type == ModelMeanType.START_X: - target = x_start - x_start_pred = model_output - elif self.model_mean_type == ModelMeanType.EPSILON: - target = noise - x_start_pred = self._predict_xstart_from_eps(x_t, t, model_output) - else: - raise NotImplementedError(self.model_mean_type) - assert model_output.shape == target.shape == x_start.shape - terms["mse"] = mean_flat((target - model_output) ** 2) - terms["x_start_predicted"] = x_start_pred - if "vb" in terms: - terms["loss"] = terms["mse"] + terms["vb"] - else: - terms["loss"] = terms["mse"] - else: - raise NotImplementedError(self.loss_type) - - return terms - - def _prior_bpd(self, x_start): - """ - Get the prior KL term for the variational lower-bound, measured in - bits-per-dim. - - This term can't be optimized, as it only depends on the encoder. - - :param x_start: the [N x C x ...] tensor of inputs. - :return: a batch of [N] KL values (in bits), one per batch element. - """ - batch_size = x_start.shape[0] - t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device) - qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t) - kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0) - return mean_flat(kl_prior) / np.log(2.0) - - def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None): - """ - Compute the entire variational lower-bound, measured in bits-per-dim, - as well as other related quantities. - - :param model: the model to evaluate loss on. - :param x_start: the [N x C x ...] tensor of inputs. - :param clip_denoised: if True, clip denoised samples. - :param model_kwargs: if not None, a dict of extra keyword arguments to - pass to the model. This can be used for conditioning. - - :return: a dict containing the following keys: - - total_bpd: the total variational lower-bound, per batch element. - - prior_bpd: the prior term in the lower-bound. - - vb: an [N x T] tensor of terms in the lower-bound. - - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep. - - mse: an [N x T] tensor of epsilon MSEs for each timestep. - """ - device = x_start.device - batch_size = x_start.shape[0] - - vb = [] - xstart_mse = [] - mse = [] - for t in list(range(self.num_timesteps))[::-1]: - t_batch = th.tensor([t] * batch_size, device=device) - noise = th.randn_like(x_start) - x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise) - # Calculate VLB term at the current timestep - with th.no_grad(): - out = self._vb_terms_bpd( - model, - x_start=x_start, - x_t=x_t, - t=t_batch, - clip_denoised=clip_denoised, - model_kwargs=model_kwargs, - ) - vb.append(out["output"]) - xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2)) - eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"]) - mse.append(mean_flat((eps - noise) ** 2)) - - vb = th.stack(vb, dim=1) - xstart_mse = th.stack(xstart_mse, dim=1) - mse = th.stack(mse, dim=1) - - prior_bpd = self._prior_bpd(x_start) - total_bpd = vb.sum(dim=1) + prior_bpd - return { - "total_bpd": total_bpd, - "prior_bpd": prior_bpd, - "vb": vb, - "xstart_mse": xstart_mse, - "mse": mse, - } - - -class SpacedDiffusion(GaussianDiffusion): - """ - A diffusion process which can skip steps in a base diffusion process. - - :param use_timesteps: a collection (sequence or set) of timesteps from the - original diffusion process to retain. - :param kwargs: the kwargs to create the base diffusion process. 
- """ - - def __init__(self, use_timesteps, **kwargs): - self.use_timesteps = set(use_timesteps) - self.timestep_map = [] - self.original_num_steps = len(kwargs["betas"]) - - base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa - last_alpha_cumprod = 1.0 - new_betas = [] - for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod): - if i in self.use_timesteps: - new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) - last_alpha_cumprod = alpha_cumprod - self.timestep_map.append(i) - kwargs["betas"] = np.array(new_betas) - super().__init__(**kwargs) - - def p_mean_variance(self, model, *args, **kwargs): # pylint: disable=signature-differs - return super().p_mean_variance(self._wrap_model(model), *args, **kwargs) - - def training_losses(self, model, *args, **kwargs): # pylint: disable=signature-differs - return super().training_losses(self._wrap_model(model), *args, **kwargs) - - def autoregressive_training_losses(self, model, *args, **kwargs): # pylint: disable=signature-differs - return super().autoregressive_training_losses(self._wrap_model(model, True), *args, **kwargs) - - def condition_mean(self, cond_fn, *args, **kwargs): - return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs) - - def condition_score(self, cond_fn, *args, **kwargs): - return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs) - - def _wrap_model(self, model, autoregressive=False): - if isinstance(model, _WrappedModel) or isinstance(model, _WrappedAutoregressiveModel): - return model - mod = _WrappedAutoregressiveModel if autoregressive else _WrappedModel - return mod(model, self.timestep_map, self.rescale_timesteps, self.original_num_steps) - - def _scale_timesteps(self, t): - # Scaling is done by the wrapped model. - return t - - -def space_timesteps(num_timesteps, section_counts): - """ - Create a list of timesteps to use from an original diffusion process, - given the number of timesteps we want to take from equally-sized portions - of the original process. - - For example, if there's 300 timesteps and the section counts are [10,15,20] - then the first 100 timesteps are strided to be 10 timesteps, the second 100 - are strided to be 15 timesteps, and the final 100 are strided to be 20. - - If the stride is a string starting with "ddim", then the fixed striding - from the DDIM paper is used, and only one section is allowed. - - :param num_timesteps: the number of diffusion steps in the original - process to divide up. - :param section_counts: either a list of numbers, or a string containing - comma-separated numbers, indicating the step count - per section. As a special case, use "ddimN" where N - is a number of steps to use the striding from the - DDIM paper. - :return: a set of diffusion steps from the original process to use. 
- """ - if isinstance(section_counts, str): - if section_counts.startswith("ddim"): - desired_count = int(section_counts[len("ddim") :]) - for i in range(1, num_timesteps): - if len(range(0, num_timesteps, i)) == desired_count: - return set(range(0, num_timesteps, i)) - raise ValueError(f"cannot create exactly {num_timesteps} steps with an integer stride") - section_counts = [int(x) for x in section_counts.split(",")] - size_per = num_timesteps // len(section_counts) - extra = num_timesteps % len(section_counts) - start_idx = 0 - all_steps = [] - for i, section_count in enumerate(section_counts): - size = size_per + (1 if i < extra else 0) - if size < section_count: - raise ValueError(f"cannot divide section of {size} steps into {section_count}") - if section_count <= 1: - frac_stride = 1 - else: - frac_stride = (size - 1) / (section_count - 1) - cur_idx = 0.0 - taken_steps = [] - for _ in range(section_count): - taken_steps.append(start_idx + round(cur_idx)) - cur_idx += frac_stride - all_steps += taken_steps - start_idx += size - return set(all_steps) - - -class _WrappedModel: - def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): - self.model = model - self.timestep_map = timestep_map - self.rescale_timesteps = rescale_timesteps - self.original_num_steps = original_num_steps - - def __call__(self, x, ts, **kwargs): - map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) - new_ts = map_tensor[ts] - if self.rescale_timesteps: - new_ts = new_ts.float() * (1000.0 / self.original_num_steps) - return self.model(x, new_ts, **kwargs) - - -class _WrappedAutoregressiveModel: - def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): - self.model = model - self.timestep_map = timestep_map - self.rescale_timesteps = rescale_timesteps - self.original_num_steps = original_num_steps - - def __call__(self, x, x0, ts, **kwargs): - map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) - new_ts = map_tensor[ts] - if self.rescale_timesteps: - new_ts = new_ts.float() * (1000.0 / self.original_num_steps) - return self.model(x, x0, new_ts, **kwargs) - - -def _extract_into_tensor(arr, timesteps, broadcast_shape): - """ - Extract values from a 1-D numpy array for a batch of indices. - - :param arr: the 1-D numpy array. - :param timesteps: a tensor of indices into the array to extract. - :param broadcast_shape: a larger shape of K dimensions with the batch - dimension equal to the length of timesteps. - :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. 
- """ - res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() - while len(res.shape) < len(broadcast_shape): - res = res[..., None] - return res.expand(broadcast_shape) diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py index 88ce100c72..e7b186b858 100644 --- a/TTS/tts/layers/xtts/gpt.py +++ b/TTS/tts/layers/xtts/gpt.py @@ -11,6 +11,7 @@ from TTS.tts.layers.xtts.gpt_inference import GPT2InferenceModel from TTS.tts.layers.xtts.latent_encoder import ConditioningEncoder +from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler def null_position_embeddings(range, dim): @@ -105,6 +106,8 @@ def __init__( checkpointing=False, average_conditioning_embeddings=False, label_smoothing=0.0, + use_perceiver_resampler=False, + perceiver_cond_length_compression=256, ): """ Args: @@ -125,6 +128,7 @@ def __init__( self.heads = heads self.model_dim = model_dim self.max_conditioning_inputs = max_conditioning_inputs + self.max_gen_mel_tokens = max_mel_tokens - self.max_conditioning_inputs - 2 self.max_mel_tokens = -1 if max_mel_tokens == -1 else max_mel_tokens + 2 + self.max_conditioning_inputs self.max_text_tokens = -1 if max_text_tokens == -1 else max_text_tokens + 2 self.max_prompt_tokens = max_prompt_tokens @@ -132,13 +136,12 @@ def __init__( self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads) self.conditioning_dropout = nn.Dropout1d(0.1) self.average_conditioning_embeddings = average_conditioning_embeddings + self.use_perceiver_resampler = use_perceiver_resampler + self.perceiver_cond_length_compression = perceiver_cond_length_compression self.text_embedding = nn.Embedding(self.number_text_tokens, model_dim) self.mel_embedding = nn.Embedding(self.num_audio_tokens, model_dim) - self.prompt_embedding = nn.Embedding(self.num_audio_tokens, model_dim) - self.prompt_pos_embedding = LearnedPositionEmbeddings(24 * 9, model_dim) - ( self.gpt, self.mel_pos_embedding, @@ -165,9 +168,29 @@ def __init__( self.text_head = nn.Linear(model_dim, self.number_text_tokens) self.mel_head = nn.Linear(model_dim, self.num_audio_tokens) + if self.use_perceiver_resampler: + # XTTS v2 + self.conditioning_perceiver = PerceiverResampler( + dim=model_dim, + depth=2, + dim_context=model_dim, + num_latents=32, + dim_head=64, + heads=8, + ff_mult=4, + use_flash_attn=False, + ) + else: + # XTTS v1 + self.prompt_embedding = nn.Embedding(self.num_audio_tokens, model_dim) + self.prompt_pos_embedding = LearnedPositionEmbeddings(24 * 9, model_dim) + def get_grad_norm_parameter_groups(self): return { "conditioning_encoder": list(self.conditioning_encoder.parameters()), + "conditioning_perceiver": list(self.conditioning_perceiver.parameters()) + if self.use_perceiver_resampler + else None, "gpt": list(self.gpt.parameters()), "heads": list(self.text_head.parameters()) + list(self.mel_head.parameters()), } @@ -197,6 +220,7 @@ def init_gpt_for_inference(self, kv_cache=True, use_deepspeed=False): if use_deepspeed: import deepspeed + self.ds_engine = deepspeed.init_inference( model=self.gpt_inference.half(), # Transformers models mp_size=1, # Number of GPU @@ -233,6 +257,7 @@ def get_logits( prompt=None, get_attns=False, return_latent=False, + attn_mask_cond=None, attn_mask_text=None, attn_mask_mel=None, ): @@ -248,8 +273,8 @@ def get_logits( if attn_mask_text is not None: attn_mask = torch.cat([attn_mask_text, attn_mask_mel], dim=1) if prompt is not None: - attn_mask_prompt = torch.ones(prompt.shape[0], offset, dtype=torch.bool, device=emb.device) - attn_mask = 
torch.cat([attn_mask_prompt, attn_mask], dim=1)
+                attn_mask_cond = torch.ones(prompt.shape[0], offset, dtype=torch.bool, device=emb.device)
+                attn_mask = torch.cat([attn_mask_cond, attn_mask], dim=1)
 
         gpt_out = self.gpt(
             inputs_embeds=emb,
@@ -313,7 +338,6 @@ def get_prompts(self, prompt_codes):
             prompt_len = 3
         prompt_len = prompt_len * 24  # in frames
         if prompt_codes.shape[-1] >= prompt_len:
-            new_prompt = []
             for i in range(prompt_codes.shape[0]):
                 if lengths[i] < prompt_len:
                     start = 0
@@ -326,7 +350,7 @@ def get_prompts(self, prompt_codes):
             prompt = F.pad(prompt, (0, 1), value=self.stop_prompt_token)
         return prompt
 
-    def get_style_emb(self, cond_input, cond_lens=None, cond_seg_len=None, return_latent=False, sample=True):
+    def get_style_emb(self, cond_input, return_latent=False):
         """
         cond_input: (b, 80, s) or (b, 1, 80, s)
         conds: (b, 1024, s)
@@ -335,26 +359,9 @@ def get_style_emb(self, cond_input, cond_lens=None, cond_seg_len=None, return_la
         if not return_latent:
             if cond_input.ndim == 4:
                 cond_input = cond_input.squeeze(1)
-            if sample:
-                _len_secs = random.randint(2, 6)  # in secs
-                cond_seg_len = int((22050 / 1024) * _len_secs)  # in frames
-                if cond_input.shape[-1] >= cond_seg_len:
-                    new_conds = []
-                    for i in range(cond_input.shape[0]):
-                        cond_len = int(cond_lens[i] / 1024)
-                        if cond_len < cond_seg_len:
-                            start = 0
-                        else:
-                            start = random.randint(0, cond_len - cond_seg_len)
-                        cond_vec = cond_input[i, :, start : start + cond_seg_len]
-                        new_conds.append(cond_vec)
-                    conds = torch.stack(new_conds, dim=0)
-            else:
-                cond_seg_len = 5 if cond_seg_len is None else cond_seg_len  # secs
-                cond_frame_len = int((22050 / 1024) * cond_seg_len)
-                conds = cond_input[:, :, -cond_frame_len:]
-
-            conds = self.conditioning_encoder(conds)
+            conds = self.conditioning_encoder(cond_input)  # (b, d, s)
+            if self.use_perceiver_resampler:
+                conds = self.conditioning_perceiver(conds.permute(0, 2, 1)).transpose(1, 2)  # (b, d, 32)
         else:
             # already computed
             conds = cond_input.unsqueeze(1)
@@ -366,10 +373,10 @@ def forward(
         self,
         text_lengths,
         audio_codes,
         wav_lengths,
-        cond_lens=None,
         cond_mels=None,
+        cond_idxs=None,
+        cond_lens=None,
         cond_latents=None,
-        loss_weights=None,
         return_attentions=False,
         return_latent=False,
     ):
@@ -377,11 +384,12 @@ def forward(
         Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode
         (actuated by `text_first`).
 
-        cond_mels: MEL float tensor, (b, 1, 80,s)
         text_inputs: long tensor, (b,t)
         text_lengths: long tensor, (b,)
        mel_inputs: long tensor, (b,m)
         wav_lengths: long tensor, (b,)
+        cond_mels: MEL float tensor, (b, 1, 80,s)
+        cond_idxs: cond start and end indices, (b, 2)
 
         If return_attentions is specified, only logits are returned.
         If return_latent is specified, loss & logits are not computed or returned. Only the predicted latents are returned.
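With `use_perceiver_resampler` enabled, `get_style_emb()` above no longer samples a random conditioning segment: the reference mel goes through `ConditioningEncoder` and then through the new `PerceiverResampler`, which compresses a prompt of any length down to 32 latent vectors. Below is a minimal sketch of that shape flow, assuming this branch of 🐸TTS is installed; the constructor arguments mirror the diff, while the batch size, frame count, and `model_dim` are illustrative assumptions:

```python
import torch

from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler

model_dim = 1024  # illustrative; in the model this is the GPT `model_dim`
resampler = PerceiverResampler(
    dim=model_dim,
    depth=2,
    dim_context=model_dim,
    num_latents=32,
    dim_head=64,
    heads=8,
    ff_mult=4,
    use_flash_attn=False,
)

# Stand-in for the (b, d, s) output of ConditioningEncoder on a reference mel.
conds = torch.randn(2, model_dim, 400)

# Same permute/transpose as in get_style_emb(): any s is compressed to 32 latents.
conds = resampler(conds.permute(0, 2, 1)).transpose(1, 2)
print(conds.shape)  # torch.Size([2, 1024, 32])
```

Because the resampler output length is fixed, the conditioning prompt contributes a constant number of positions to the GPT input regardless of how long the reference audio is.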
@@ -393,21 +401,31 @@ def forward(
         max_text_len = text_lengths.max()
         code_lengths = torch.ceil(wav_lengths / self.code_stride_len).long() + 3
+        if cond_lens is not None:
+            if self.use_perceiver_resampler:
+                cond_lens = cond_lens // self.perceiver_cond_length_compression
+            else:
+                cond_lens = cond_lens // self.code_stride_len
+
+        if cond_idxs is not None:
+            # recompute cond idxs for mel lengths
+            for idx in range(cond_idxs.size(0)):
+                if self.use_perceiver_resampler:
+                    cond_idxs[idx] = cond_idxs[idx] // self.perceiver_cond_length_compression
+                else:
+                    cond_idxs[idx] = cond_idxs[idx] // self.code_stride_len
+
+        # ensure that the cond_mel does not have padding
+        # if cond_lens is not None and cond_idxs is None:
+        #     min_cond_len = torch.min(cond_lens)
+        #     cond_mels = cond_mels[:, :, :, :min_cond_len]
+
         # If len(codes) + 3 is larger than maximum allowed length, we truncate the codes.
         max_mel_len = code_lengths.max()
 
         if max_mel_len > audio_codes.shape[-1]:
             audio_codes = F.pad(audio_codes, (0, max_mel_len - audio_codes.shape[-1]))
 
-        silence = True
-        for idx, l in enumerate(code_lengths):
-            length = l.item()
-            while silence:
-                if audio_codes[idx, length - 1] != 83:
-                    break
-                length -= 1
-            code_lengths[idx] = length
-
         # 💖 Lovely assertions
         assert (
             max_mel_len <= audio_codes.shape[-1]
@@ -423,7 +441,9 @@ def forward(
         audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)
 
         # Pad mel codes with stop_audio_token
-        audio_codes = self.set_mel_padding(audio_codes, code_lengths)
+        audio_codes = self.set_mel_padding(
+            audio_codes, code_lengths - 3
+        )  # -3 to get the real code lengths without considering the start and stop tokens, which have not been added yet
 
         # Build input and target tensors
         # Prepend start token to inputs and append stop token to targets
@@ -435,9 +455,16 @@ def forward(
         )
 
         # Set attn_mask
+        attn_mask_cond = None
         attn_mask_text = None
         attn_mask_mel = None
         if not return_latent:
+            attn_mask_cond = torch.ones(
+                cond_mels.shape[0],
+                cond_mels.shape[-1],
+                dtype=torch.bool,
+                device=text_inputs.device,
+            )
             attn_mask_text = torch.ones(
                 text_inputs.shape[0],
                 text_inputs.shape[1],
@@ -451,6 +478,15 @@ def forward(
                 device=audio_codes.device,
             )
 
+            if cond_idxs is not None:
+                # use masking approach
+                for idx, r in enumerate(cond_idxs):
+                    l = r[1] - r[0]
+                    attn_mask_cond[idx, l:] = 0.0
+            elif cond_lens is not None:
+                for idx, l in enumerate(cond_lens):
+                    attn_mask_cond[idx, l:] = 0.0
+
             for idx, l in enumerate(text_lengths):
                 attn_mask_text[idx, l + 1 :] = 0.0
 
@@ -465,7 +501,7 @@ def forward(
 
         # Compute speech conditioning input
         if cond_latents is None:
-            cond_latents = self.get_style_emb(cond_mels, cond_lens).transpose(1, 2)
+            cond_latents = self.get_style_emb(cond_mels).transpose(1, 2)
 
         # Get logits
         sub = -5  # don't ask me why 😄
@@ -480,6 +516,7 @@ def forward(
             prompt=cond_latents,
             get_attns=return_attentions,
             return_latent=return_latent,
+            attn_mask_cond=attn_mask_cond,
             attn_mask_text=attn_mask_text,
             attn_mask_mel=attn_mask_mel,
         )
@@ -501,6 +538,13 @@ def forward(
             0
         ], f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row."
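The hunks above recompute `cond_lens`/`cond_idxs` in code-frame units and build `attn_mask_cond` so that padded conditioning frames are hidden from attention. A small self-contained illustration of that masking step, with shapes and indices invented for the example:

```python
import torch

# Two samples: the first has a real conditioning segment of 4 frames out of 6,
# the second uses all 6 frames. cond_idxs holds (start, end) per sample.
cond_mels = torch.randn(2, 1, 80, 6)  # (b, 1, 80, s)
cond_idxs = torch.tensor([[0, 4], [0, 6]])

attn_mask_cond = torch.ones(cond_mels.shape[0], cond_mels.shape[-1], dtype=torch.bool)
for idx, r in enumerate(cond_idxs):
    l = r[1] - r[0]
    attn_mask_cond[idx, l:] = 0.0  # mask everything past the real segment

print(attn_mask_cond)
# tensor([[ True,  True,  True,  True, False, False],
#         [ True,  True,  True,  True,  True,  True]])
```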
+ # ignore the loss for the segment used for conditioning + # coin flip for the segment to be ignored + if cond_idxs is not None: + cond_start = cond_idxs[idx, 0] + cond_end = cond_idxs[idx, 1] + mel_targets[idx, cond_start:cond_end] = -1 + # Compute losses loss_text = F.cross_entropy( text_logits, text_targets.long(), ignore_index=-1, label_smoothing=self.label_smoothing @@ -512,7 +556,7 @@ def forward( def inference(self, cond_latents, text_inputs, **hf_generate_kwargs): self.compute_embeddings(cond_latents, text_inputs) - return self.generate(cond_latents, text_inputs, input_tokens=None, **hf_generate_kwargs) + return self.generate(cond_latents, text_inputs, **hf_generate_kwargs) def compute_embeddings( self, @@ -548,7 +592,7 @@ def generate( bos_token_id=self.start_audio_token, pad_token_id=self.stop_audio_token, eos_token_id=self.stop_audio_token, - max_length=self.max_mel_tokens * 2 + self.max_prompt_tokens + self.max_text_tokens, + max_length=self.max_gen_mel_tokens + gpt_inputs.shape[-1], **hf_generate_kwargs, ) if "return_dict_in_generate" in hf_generate_kwargs: @@ -561,7 +605,7 @@ def get_generator(self, fake_inputs, **hf_generate_kwargs): bos_token_id=self.start_audio_token, pad_token_id=self.stop_audio_token, eos_token_id=self.stop_audio_token, - max_length=self.max_mel_tokens * 2 + self.max_prompt_tokens + self.max_text_tokens, + max_length=self.max_gen_mel_tokens + fake_inputs.shape[-1], do_stream=True, **hf_generate_kwargs, ) diff --git a/TTS/tts/layers/xtts/hifigan_decoder.py b/TTS/tts/layers/xtts/hifigan_decoder.py index 6439b455a0..9add7826e6 100644 --- a/TTS/tts/layers/xtts/hifigan_decoder.py +++ b/TTS/tts/layers/xtts/hifigan_decoder.py @@ -1,13 +1,13 @@ import torch +import torchaudio from torch import nn from torch.nn import Conv1d, ConvTranspose1d from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, weight_norm -import torchaudio +from torch.nn.utils.parametrizations import weight_norm +from torch.nn.utils.parametrize import remove_parametrizations from TTS.utils.io import load_fsspec - LRELU_SLOPE = 0.1 @@ -121,9 +121,9 @@ def forward(self, x): def remove_weight_norm(self): for l in self.convs1: - remove_weight_norm(l) + remove_parametrizations(l, "weight") for l in self.convs2: - remove_weight_norm(l) + remove_parametrizations(l, "weight") class ResBlock2(torch.nn.Module): @@ -177,7 +177,7 @@ def forward(self, x): def remove_weight_norm(self): for l in self.convs: - remove_weight_norm(l) + remove_parametrizations(l, "weight") class HifiganGenerator(torch.nn.Module): @@ -224,9 +224,7 @@ def __init__( self.cond_in_each_up_layer = cond_in_each_up_layer # initial upsampling layers - self.conv_pre = weight_norm( - Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3) - ) + self.conv_pre = weight_norm(Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)) resblock = ResBlock1 if resblock_type == "1" else ResBlock2 # upsampling layers self.ups = nn.ModuleList() @@ -246,22 +244,18 @@ def __init__( self.resblocks = nn.ModuleList() for i in range(len(self.ups)): ch = upsample_initial_channel // (2 ** (i + 1)) - for _, (k, d) in enumerate( - zip(resblock_kernel_sizes, resblock_dilation_sizes) - ): + for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): self.resblocks.append(resblock(ch, k, d)) # post convolution layer - self.conv_post = weight_norm( - Conv1d(ch, out_channels, 7, 1, padding=3, bias=conv_post_bias) - ) + self.conv_post = weight_norm(Conv1d(ch, out_channels, 7, 1, padding=3, 
bias=conv_post_bias)) if cond_channels > 0: self.cond_layer = nn.Conv1d(cond_channels, upsample_initial_channel, 1) if not conv_pre_weight_norm: - remove_weight_norm(self.conv_pre) + remove_parametrizations(self.conv_pre, "weight") if not conv_post_weight_norm: - remove_weight_norm(self.conv_post) + remove_parametrizations(self.conv_post, "weight") if self.cond_in_each_up_layer: self.conds = nn.ModuleList() @@ -318,19 +312,17 @@ def inference(self, c): Tensor: [B, 1, T] """ c = c.to(self.conv_pre.weight.device) - c = torch.nn.functional.pad( - c, (self.inference_padding, self.inference_padding), "replicate" - ) + c = torch.nn.functional.pad(c, (self.inference_padding, self.inference_padding), "replicate") return self.forward(c) def remove_weight_norm(self): print("Removing weight norm...") for l in self.ups: - remove_weight_norm(l) + remove_parametrizations(l, "weight") for l in self.resblocks: l.remove_weight_norm() - remove_weight_norm(self.conv_pre) - remove_weight_norm(self.conv_post) + remove_parametrizations(self.conv_pre, "weight") + remove_parametrizations(self.conv_post, "weight") def load_checkpoint( self, config, checkpoint_path, eval=False, cache=False @@ -342,6 +334,7 @@ def load_checkpoint( assert not self.training self.remove_weight_norm() + class SELayer(nn.Module): def __init__(self, channel, reduction=8): super(SELayer, self).__init__() @@ -425,10 +418,8 @@ def forward(self, x): return torch.nn.functional.conv1d(x, self.filter).squeeze(1) - class ResNetSpeakerEncoder(nn.Module): - """This is copied from 🐸TTS to remove it from the dependencies. - """ + """This is copied from 🐸TTS to remove it from the dependencies.""" # pylint: disable=W0102 def __init__( @@ -620,6 +611,7 @@ def load_checkpoint( return criterion, state["step"] return criterion + class HifiDecoder(torch.nn.Module): def __init__( self, @@ -724,9 +716,7 @@ def inference(self, c, g): """ return self.forward(c, g=g) - def load_checkpoint( - self, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) # remove unused keys state = state["model"] diff --git a/TTS/tts/layers/xtts/perceiver_encoder.py b/TTS/tts/layers/xtts/perceiver_encoder.py new file mode 100644 index 0000000000..7b7ee79b50 --- /dev/null +++ b/TTS/tts/layers/xtts/perceiver_encoder.py @@ -0,0 +1,319 @@ +# Adapted from https://github.com/lucidrains/naturalspeech2-pytorch/blob/659bec7f7543e7747e809e950cc2f84242fbeec7/naturalspeech2_pytorch/naturalspeech2_pytorch.py#L532 + +from collections import namedtuple +from functools import wraps + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from einops.layers.torch import Rearrange +from packaging import version +from torch import einsum, nn + + +def exists(val): + return val is not None + + +def once(fn): + called = False + + @wraps(fn) + def inner(x): + nonlocal called + if called: + return + called = True + return fn(x) + + return inner + + +print_once = once(print) + +# main class + + +class Attend(nn.Module): + def __init__(self, dropout=0.0, causal=False, use_flash=False): + super().__init__() + self.dropout = dropout + self.attn_dropout = nn.Dropout(dropout) + + self.causal = causal + self.register_buffer("mask", None, persistent=False) + + self.use_flash = use_flash + assert not ( + use_flash and version.parse(torch.__version__) < 
version.parse("2.0.0") + ), "in order to use flash attention, you must be using pytorch 2.0 or above" + + # determine efficient attention configs for cuda and cpu + self.config = namedtuple("EfficientAttentionConfig", ["enable_flash", "enable_math", "enable_mem_efficient"]) + self.cpu_config = self.config(True, True, True) + self.cuda_config = None + + if not torch.cuda.is_available() or not use_flash: + return + + device_properties = torch.cuda.get_device_properties(torch.device("cuda")) + + if device_properties.major == 8 and device_properties.minor == 0: + print_once("A100 GPU detected, using flash attention if input tensor is on cuda") + self.cuda_config = self.config(True, False, False) + else: + print_once("Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda") + self.cuda_config = self.config(False, True, True) + + def get_mask(self, n, device): + if exists(self.mask) and self.mask.shape[-1] >= n: + return self.mask[:n, :n] + + mask = torch.ones((n, n), device=device, dtype=torch.bool).triu(1) + self.register_buffer("mask", mask, persistent=False) + return mask + + def flash_attn(self, q, k, v, mask=None): + _, heads, q_len, _, k_len, is_cuda = *q.shape, k.shape[-2], q.is_cuda + + # Recommended for multi-query single-key-value attention by Tri Dao + # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64]) + + if k.ndim == 3: + k = rearrange(k, "b ... -> b 1 ...").expand_as(q) + + if v.ndim == 3: + v = rearrange(v, "b ... -> b 1 ...").expand_as(q) + + # Check if mask exists and expand to compatible shape + # The mask is B L, so it would have to be expanded to B H N L + + if exists(mask): + mask = rearrange(mask, "b j -> b 1 1 j") + mask = mask.expand(-1, heads, q_len, -1) + + # Check if there is a compatible device for flash attention + + config = self.cuda_config if is_cuda else self.cpu_config + + # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale + + with torch.backends.cuda.sdp_kernel(**config._asdict()): + out = F.scaled_dot_product_attention( + q, k, v, attn_mask=mask, dropout_p=self.dropout if self.training else 0.0, is_causal=self.causal + ) + + return out + + def forward(self, q, k, v, mask=None): + """ + einstein notation + b - batch + h - heads + n, i, j - sequence length (base sequence length, source, target) + d - feature dimension + """ + + n, device = q.shape[-2], q.device + + scale = q.shape[-1] ** -0.5 + + if self.use_flash: + return self.flash_attn(q, k, v, mask=mask) + + kv_einsum_eq = "b j d" if k.ndim == 3 else "b h j d" + + # similarity + + sim = einsum(f"b h i d, {kv_einsum_eq} -> b h i j", q, k) * scale + + # key padding mask + + if exists(mask): + mask = rearrange(mask, "b j -> b 1 1 j") + sim = sim.masked_fill(~mask, -torch.finfo(sim.dtype).max) + + # causal mask + + if self.causal: + causal_mask = self.get_mask(n, device) + sim = sim.masked_fill(causal_mask, -torch.finfo(sim.dtype).max) + + # attention + + attn = sim.softmax(dim=-1) + attn = self.attn_dropout(attn) + + # aggregate values + + out = einsum(f"b h i j, {kv_einsum_eq} -> b h i d", attn, v) + + return out + + +def Sequential(*mods): + return nn.Sequential(*filter(exists, mods)) + + +def exists(x): + return x is not None + + +def default(val, d): + if exists(val): + return val + return d() if callable(d) else d + + +class RMSNorm(nn.Module): + def __init__(self, dim, scale=True, dim_cond=None): + super().__init__() + self.cond = exists(dim_cond) + self.to_gamma_beta = nn.Linear(dim_cond, dim * 2) if self.cond else None + + 
self.scale = dim**0.5 + self.gamma = nn.Parameter(torch.ones(dim)) if scale else None + + def forward(self, x, cond=None): + gamma = default(self.gamma, 1) + out = F.normalize(x, dim=-1) * self.scale * gamma + + if not self.cond: + return out + + assert exists(cond) + gamma, beta = self.to_gamma_beta(cond).chunk(2, dim=-1) + gamma, beta = map(lambda t: rearrange(t, "b d -> b 1 d"), (gamma, beta)) + return out * gamma + beta + + +class CausalConv1d(nn.Conv1d): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + (kernel_size,) = self.kernel_size + (dilation,) = self.dilation + (stride,) = self.stride + + assert stride == 1 + self.causal_padding = dilation * (kernel_size - 1) + + def forward(self, x): + causal_padded_x = F.pad(x, (self.causal_padding, 0), value=0.0) + return super().forward(causal_padded_x) + + +class GEGLU(nn.Module): + def forward(self, x): + x, gate = x.chunk(2, dim=-1) + return F.gelu(gate) * x + + +def FeedForward(dim, mult=4, causal_conv=False): + dim_inner = int(dim * mult * 2 / 3) + + conv = None + if causal_conv: + conv = nn.Sequential( + Rearrange("b n d -> b d n"), + CausalConv1d(dim_inner, dim_inner, 3), + Rearrange("b d n -> b n d"), + ) + + return Sequential(nn.Linear(dim, dim_inner * 2), GEGLU(), conv, nn.Linear(dim_inner, dim)) + + +class PerceiverResampler(nn.Module): + def __init__( + self, + *, + dim, + depth=2, + dim_context=None, + num_latents=32, + dim_head=64, + heads=8, + ff_mult=4, + use_flash_attn=False, + ): + super().__init__() + dim_context = default(dim_context, dim) + + self.proj_context = nn.Linear(dim_context, dim) if dim_context != dim else nn.Identity() + + self.latents = nn.Parameter(torch.randn(num_latents, dim)) + nn.init.normal_(self.latents, std=0.02) + + self.layers = nn.ModuleList([]) + for _ in range(depth): + self.layers.append( + nn.ModuleList( + [ + Attention( + dim=dim, + dim_head=dim_head, + heads=heads, + use_flash=use_flash_attn, + cross_attn_include_queries=True, + ), + FeedForward(dim=dim, mult=ff_mult), + ] + ) + ) + + self.norm = RMSNorm(dim) + + def forward(self, x, mask=None): + batch = x.shape[0] + + x = self.proj_context(x) + + latents = repeat(self.latents, "n d -> b n d", b=batch) + + for attn, ff in self.layers: + latents = attn(latents, x, mask=mask) + latents + latents = ff(latents) + latents + + return self.norm(latents) + + +class Attention(nn.Module): + def __init__( + self, + dim, + *, + dim_context=None, + causal=False, + dim_head=64, + heads=8, + dropout=0.0, + use_flash=False, + cross_attn_include_queries=False, + ): + super().__init__() + self.scale = dim_head**-0.5 + self.heads = heads + self.cross_attn_include_queries = cross_attn_include_queries + + dim_inner = dim_head * heads + dim_context = default(dim_context, dim) + + self.attend = Attend(causal=causal, dropout=dropout, use_flash=use_flash) + self.to_q = nn.Linear(dim, dim_inner, bias=False) + self.to_kv = nn.Linear(dim_context, dim_inner * 2, bias=False) + self.to_out = nn.Linear(dim_inner, dim, bias=False) + + def forward(self, x, context=None, mask=None): + h, has_context = self.heads, exists(context) + + context = default(context, x) + + if has_context and self.cross_attn_include_queries: + context = torch.cat((x, context), dim=-2) + + q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, dim=-1)) + q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v)) + + out = self.attend(q, k, v, mask=mask) + + out = rearrange(out, "b h n d -> b n (h d)") + return self.to_out(out) diff --git 
a/TTS/tts/layers/xtts/stream_generator.py b/TTS/tts/layers/xtts/stream_generator.py index 8bdd2291ff..e12f8995cf 100644 --- a/TTS/tts/layers/xtts/stream_generator.py +++ b/TTS/tts/layers/xtts/stream_generator.py @@ -1,26 +1,27 @@ # Adapted from: https://github.com/LowinLi/transformers-stream-generator +import copy +import inspect +import random +import warnings +from typing import Callable, List, Optional, Union + +import numpy as np +import torch +import torch.distributed as dist +from torch import nn from transformers import ( + BeamSearchScorer, + ConstrainedBeamSearchScorer, + DisjunctiveConstraint, GenerationConfig, GenerationMixin, LogitsProcessorList, - StoppingCriteriaList, - DisjunctiveConstraint, - BeamSearchScorer, PhrasalConstraint, - ConstrainedBeamSearchScorer, PreTrainedModel, + StoppingCriteriaList, ) -import numpy as np -import random -import warnings -import inspect from transformers.generation.utils import GenerateOutput, SampleOutput, logger -import torch -from typing import Callable, List, Optional, Union -from torch import nn -import torch.distributed as dist -import copy def setup_seed(seed): @@ -48,9 +49,7 @@ def generate( generation_config: Optional[StreamGenerationConfig] = None, logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[ - Callable[[int, torch.Tensor], List[int]] - ] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, synced_gpus: Optional[bool] = False, seed=0, **kwargs, @@ -125,7 +124,7 @@ def generate( - [`~generation.BeamSearchEncoderDecoderOutput`], - [`~generation.BeamSampleEncoderDecoderOutput`] """ - #setup_seed(seed) + # setup_seed(seed) # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call self._validate_model_class() @@ -134,9 +133,7 @@ def generate( # legacy: users may modify the model configuration to control generation -- update the generation config # model attribute accordingly, if it was created from the model config if self.generation_config._from_model_config: - new_generation_config = StreamGenerationConfig.from_model_config( - self.config - ) + new_generation_config = StreamGenerationConfig.from_model_config(self.config) if new_generation_config != self.generation_config: warnings.warn( "You have modified the pretrained model configuration to control generation. This is a" @@ -148,25 +145,14 @@ def generate( generation_config = self.generation_config generation_config = copy.deepcopy(generation_config) - model_kwargs = generation_config.update( - **kwargs - ) # All unused kwargs must be model kwargs + model_kwargs = generation_config.update(**kwargs) # All unused kwargs must be model kwargs # self._validate_model_kwargs(model_kwargs.copy()) # 2. 
Set generation parameters if not already defined - logits_processor = ( - logits_processor if logits_processor is not None else LogitsProcessorList() - ) - stopping_criteria = ( - stopping_criteria - if stopping_criteria is not None - else StoppingCriteriaList() - ) + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if ( - generation_config.pad_token_id is None - and generation_config.eos_token_id is not None - ): + if generation_config.pad_token_id is None and generation_config.eos_token_id is not None: if model_kwargs.get("attention_mask", None) is None: logger.warning( "The attention mask and the pad token id were not set. As a consequence, you may observe " @@ -175,9 +161,7 @@ def generate( eos_token_id = generation_config.eos_token_id if isinstance(eos_token_id, list): eos_token_id = eos_token_id[0] - logger.warning( - f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation." - ) + logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.") generation_config.pad_token_id = eos_token_id # 3. Define model inputs @@ -195,19 +179,11 @@ def generate( model_kwargs["output_hidden_states"] = generation_config.output_hidden_states model_kwargs["use_cache"] = generation_config.use_cache - accepts_attention_mask = "attention_mask" in set( - inspect.signature(self.forward).parameters.keys() - ) + accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys()) requires_attention_mask = "encoder_outputs" not in model_kwargs - if ( - model_kwargs.get("attention_mask", None) is None - and requires_attention_mask - and accepts_attention_mask - ): - model_kwargs[ - "attention_mask" - ] = self._prepare_attention_mask_for_generation( + if model_kwargs.get("attention_mask", None) is None and requires_attention_mask and accepts_attention_mask: + model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( inputs_tensor, generation_config.pad_token_id, generation_config.eos_token_id, @@ -217,8 +193,7 @@ def generate( if not self.config.is_encoder_decoder: if ( generation_config.pad_token_id is not None - and torch.sum(inputs_tensor[:, -1] == generation_config.pad_token_id) - > 0 + and torch.sum(inputs_tensor[:, -1] == generation_config.pad_token_id) > 0 ): logger.warning( "A decoder-only architecture is being used, but right-padding was detected! For correct " @@ -247,10 +222,7 @@ def generate( # 6. Prepare `max_length` depending on other stopping criteria. 
input_ids_seq_length = input_ids.shape[-1] - has_default_max_length = ( - kwargs.get("max_length") is None - and generation_config.max_length is not None - ) + has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None if has_default_max_length and generation_config.max_new_tokens is None: warnings.warn( "Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to" @@ -260,12 +232,8 @@ def generate( UserWarning, ) elif has_default_max_length and generation_config.max_new_tokens is not None: - generation_config.max_length = ( - generation_config.max_new_tokens + input_ids_seq_length - ) - elif ( - not has_default_max_length and generation_config.max_new_tokens is not None - ): + generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + elif not has_default_max_length and generation_config.max_new_tokens is not None: raise ValueError( "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a" " limit to the generated output length. Remove one of those arguments. Please refer to the" @@ -273,18 +241,13 @@ def generate( "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" ) - if ( - generation_config.min_length is not None - and generation_config.min_length > generation_config.max_length - ): + if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length: raise ValueError( f"Unfeasible length constraints: the minimum length ({generation_config.min_length}) is larger than" f" the maximum length ({generation_config.max_length})" ) if input_ids_seq_length >= generation_config.max_length: - input_ids_string = ( - "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" - ) + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" logger.warning( f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" @@ -293,8 +256,7 @@ def generate( # 7. determine generation mode is_constraint_gen_mode = ( - generation_config.constraints is not None - or generation_config.force_words_ids is not None + generation_config.constraints is not None or generation_config.force_words_ids is not None ) is_contrastive_search_gen_mode = ( @@ -349,9 +311,7 @@ def generate( ) if generation_config.num_beam_groups > generation_config.num_beams: - raise ValueError( - "`num_beam_groups` has to be smaller or equal to `num_beams`" - ) + raise ValueError("`num_beam_groups` has to be smaller or equal to `num_beams`") if is_group_beam_gen_mode and generation_config.do_sample is True: raise ValueError( "Diverse beam search cannot be used in sampling mode. Make sure that `do_sample` is set to `False`." @@ -474,14 +434,10 @@ def generate( ) elif is_beam_gen_mode: if generation_config.num_return_sequences > generation_config.num_beams: - raise ValueError( - "`num_return_sequences` has to be smaller or equal to `num_beams`." - ) + raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.") if stopping_criteria.max_length is None: - raise ValueError( - "`max_length` needs to be a stopping_criteria for now." - ) + raise ValueError("`max_length` needs to be a stopping_criteria for now.") # 11. 
prepare beam search scorer beam_scorer = BeamSearchScorer( @@ -518,9 +474,7 @@ def generate( logits_warper = self._get_logits_warper(generation_config) if stopping_criteria.max_length is None: - raise ValueError( - "`max_length` needs to be a stopping_criteria for now." - ) + raise ValueError("`max_length` needs to be a stopping_criteria for now.") # 12. prepare beam search scorer beam_scorer = BeamSearchScorer( batch_size=batch_size * generation_config.num_return_sequences, @@ -533,8 +487,7 @@ def generate( # 13. interleave input_ids with `num_beams` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( input_ids=input_ids, - expand_size=generation_config.num_beams - * generation_config.num_return_sequences, + expand_size=generation_config.num_beams * generation_config.num_return_sequences, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs, ) @@ -556,27 +509,17 @@ def generate( elif is_group_beam_gen_mode: if generation_config.num_return_sequences > generation_config.num_beams: - raise ValueError( - "`num_return_sequences` has to be smaller or equal to `num_beams`." - ) + raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.") if generation_config.num_beams % generation_config.num_beam_groups != 0: - raise ValueError( - "`num_beams` should be divisible by `num_beam_groups` for group beam search." - ) + raise ValueError("`num_beams` should be divisible by `num_beam_groups` for group beam search.") if stopping_criteria.max_length is None: - raise ValueError( - "`max_length` needs to be a stopping_criteria for now." - ) + raise ValueError("`max_length` needs to be a stopping_criteria for now.") - has_default_typical_p = ( - kwargs.get("typical_p") is None and generation_config.typical_p == 1.0 - ) + has_default_typical_p = kwargs.get("typical_p") is None and generation_config.typical_p == 1.0 if not has_default_typical_p: - raise ValueError( - "Decoder argument `typical_p` is not supported with beam groups." - ) + raise ValueError("Decoder argument `typical_p` is not supported with beam groups.") # 11. prepare beam search scorer beam_scorer = BeamSearchScorer( @@ -612,32 +555,19 @@ def generate( elif is_constraint_gen_mode: if generation_config.num_return_sequences > generation_config.num_beams: - raise ValueError( - "`num_return_sequences` has to be smaller or equal to `num_beams`." - ) + raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.") if stopping_criteria.max_length is None: - raise ValueError( - "`max_length` needs to be a stopping_criteria for now." - ) + raise ValueError("`max_length` needs to be a stopping_criteria for now.") if generation_config.num_beams <= 1: - raise ValueError( - "`num_beams` needs to be greater than 1 for constrained generation." - ) + raise ValueError("`num_beams` needs to be greater than 1 for constrained generation.") if generation_config.do_sample: - raise ValueError( - "`do_sample` needs to be false for constrained generation." - ) + raise ValueError("`do_sample` needs to be false for constrained generation.") - if ( - generation_config.num_beam_groups is not None - and generation_config.num_beam_groups > 1 - ): - raise ValueError( - "`num_beam_groups` not supported yet for constrained generation." 
- ) + if generation_config.num_beam_groups is not None and generation_config.num_beam_groups > 1: + raise ValueError("`num_beam_groups` not supported yet for constrained generation.") final_constraints = [] if generation_config.constraints is not None: @@ -661,15 +591,10 @@ def typeerror(): if isinstance(word_ids[0], list): if not isinstance(word_ids, list) or len(word_ids) == 0: typeerror() - if any( - not isinstance(token_ids, list) for token_ids in word_ids - ): + if any(not isinstance(token_ids, list) for token_ids in word_ids): typeerror() if any( - any( - (not isinstance(token_id, int) or token_id < 0) - for token_id in token_ids - ) + any((not isinstance(token_id, int) or token_id < 0) for token_id in token_ids) for token_ids in word_ids ): typeerror() @@ -678,10 +603,7 @@ def typeerror(): else: if not isinstance(word_ids, list) or len(word_ids) == 0: typeerror() - if any( - (not isinstance(token_id, int) or token_id < 0) - for token_id in word_ids - ): + if any((not isinstance(token_id, int) or token_id < 0) for token_id in word_ids): typeerror() constraint = PhrasalConstraint(word_ids) @@ -843,52 +765,26 @@ def sample_stream( ['Today is a beautiful day, and a wonderful day.\n\nI was lucky enough to meet the'] ```""" # init values - logits_processor = ( - logits_processor if logits_processor is not None else LogitsProcessorList() - ) - stopping_criteria = ( - stopping_criteria - if stopping_criteria is not None - else StoppingCriteriaList() - ) + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() if max_length is not None: warnings.warn( "`max_length` is deprecated in this function, use" " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", UserWarning, ) - stopping_criteria = validate_stopping_criteria( - stopping_criteria, max_length - ) - logits_warper = ( - logits_warper if logits_warper is not None else LogitsProcessorList() - ) - pad_token_id = ( - pad_token_id - if pad_token_id is not None - else self.generation_config.pad_token_id - ) - eos_token_id = ( - eos_token_id - if eos_token_id is not None - else self.generation_config.eos_token_id - ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList() + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] - output_scores = ( - output_scores - if output_scores is not None - else self.generation_config.output_scores - ) + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores output_attentions = ( - output_attentions - if output_attentions is not None - else self.generation_config.output_attentions + output_attentions if output_attentions is not None else self.generation_config.output_attentions ) output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.generation_config.output_hidden_states + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states ) return_dict_in_generate = ( return_dict_in_generate @@ -898,15 +794,9 @@ def sample_stream( # init attention / hidden states / scores tuples scores = 
() if (return_dict_in_generate and output_scores) else None - decoder_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - cross_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - decoder_hidden_states = ( - () if (return_dict_in_generate and output_hidden_states) else None - ) + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None # keep track of which sequences are already finished unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) @@ -917,9 +807,7 @@ def sample_stream( if synced_gpus: # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor( - 0.0 if this_peer_finished else 1.0 - ).to(input_ids.device) + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) # send 0.0 if we finished, 1.0 otherwise dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) # did all peers finish? the reduced sum will be 0.0 then @@ -952,18 +840,14 @@ def sample_stream( scores += (next_token_scores,) if output_attentions: decoder_attentions += ( - (outputs.decoder_attentions,) - if self.config.is_encoder_decoder - else (outputs.attentions,) + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) ) if self.config.is_encoder_decoder: cross_attentions += (outputs.cross_attentions,) if output_hidden_states: decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) + (outputs.decoder_hidden_states,) if self.config.is_encoder_decoder else (outputs.hidden_states,) ) # sample @@ -973,12 +857,8 @@ def sample_stream( # finished sentences should have their next token be a padding token if eos_token_id is not None: if pad_token_id is None: - raise ValueError( - "If `eos_token_id` is defined, make sure that `pad_token_id` is defined." 
-                )
-            next_tokens = next_tokens * unfinished_sequences + pad_token_id * (
-                1 - unfinished_sequences
-            )
+                raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+            next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
             yield next_tokens, self.final_norm(outputs.hidden_states[-1][:, -1])
             # update generated ids, model inputs, and length for next step
             input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
@@ -988,9 +868,7 @@
 
         # if eos_token was found in one sentence, set sentence to finished
         if eos_token_id is not None:
-            unfinished_sequences = unfinished_sequences.mul(
-                (sum(next_tokens != i for i in eos_token_id)).long()
-            )
+            unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long())
 
         # stop when each sentence is finished, or if we exceed the maximum length
         if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
@@ -1007,22 +885,17 @@ def init_stream_support():
 
 
 if __name__ == "__main__":
-    from transformers import PreTrainedModel
-    from transformers import AutoTokenizer, AutoModelForCausalLM
+    from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel
 
     PreTrainedModel.generate = NewGenerationMixin.generate
     PreTrainedModel.sample_stream = NewGenerationMixin.sample_stream
-    model = AutoModelForCausalLM.from_pretrained(
-        "bigscience/bloom-560m", torch_dtype=torch.float16
-    )
+    model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", torch_dtype=torch.float16)
     tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
     model = model.to("cuda:0")
     model = model.eval()
     prompt_text = "hello? \n"
-    input_ids = tokenizer(
-        prompt_text, return_tensors="pt", add_special_tokens=False
-    ).input_ids
+    input_ids = tokenizer(prompt_text, return_tensors="pt", add_special_tokens=False).input_ids
     input_ids = input_ids.to("cuda:0")
 
     with torch.no_grad():
diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
index f34a7ac036..1a3cc47aaf 100644
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -1,15 +1,73 @@
 import os
 import re
-import json
+import textwrap
+from functools import cached_property
 
+import pypinyin
 import torch
+from hangul_romanize import Transliter
+from hangul_romanize.rule import academic
+from num2words import num2words
+from spacy.lang.ar import Arabic
+from spacy.lang.en import English
+from spacy.lang.es import Spanish
+from spacy.lang.ja import Japanese
+from spacy.lang.zh import Chinese
 from tokenizers import Tokenizer
-import pypinyin
-import cutlet
-from num2words import num2words
 
 from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
 
+
+def get_spacy_lang(lang):
+    if lang == "zh":
+        return Chinese()
+    elif lang == "ja":
+        return Japanese()
+    elif lang == "ar":
+        return Arabic()
+    elif lang == "es":
+        return Spanish()
+    else:
+        # For most languages, English does the job
+        return English()
+
+
+def split_sentence(text, lang, text_split_length=250):
+    """Preprocess the input text: split it into chunks of at most `text_split_length` characters, on sentence boundaries where possible."""
+    text_splits = []
+    if text_split_length is not None and len(text) >= text_split_length:
+        text_splits.append("")
+        nlp = get_spacy_lang(lang)
+        nlp.add_pipe("sentencizer")
+        doc = nlp(text)
+        for sentence in doc.sents:
+            if len(text_splits[-1]) + len(str(sentence)) <= text_split_length:
+                # if the last sentence + the current sentence is less than the text_split_length
+                # then add the current sentence to the last sentence
+
text_splits[-1] += " " + str(sentence) + text_splits[-1] = text_splits[-1].lstrip() + elif len(str(sentence)) > text_split_length: + # if the current sentence is greater than the text_split_length + for line in textwrap.wrap( + str(sentence), + width=text_split_length, + drop_whitespace=True, + break_on_hyphens=False, + tabsize=1, + ): + text_splits.append(str(line)) + else: + text_splits.append(str(sentence)) + + if len(text_splits) > 1: + if text_splits[0] == "": + del text_splits[0] + else: + text_splits = [text.lstrip()] + + return text_splits + + _whitespace_re = re.compile(r"\s+") # List of (regular expression, replacement) pairs for abbreviations: @@ -88,7 +146,7 @@ "it": [ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [ - #("sig.ra", "signora"), + # ("sig.ra", "signora"), ("sig", "signore"), ("dr", "dottore"), ("st", "santo"), @@ -113,7 +171,7 @@ # There are not many common abbreviations in Arabic as in English. ] ], - "zh-cn": [ + "zh": [ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [ # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts. @@ -122,49 +180,66 @@ "cs": [ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [ - ("dr", "doktor"), # doctor - ("ing", "inženýr"), # engineer - ("p", "pan"), # Could also map to pani for woman but no easy way to do it + ("dr", "doktor"), # doctor + ("ing", "inženýr"), # engineer + ("p", "pan"), # Could also map to pani for woman but no easy way to do it # Other abbreviations would be specialized and not as common. ] ], "ru": [ (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1]) for x in [ - ("г-жа", "госпожа"), # Mrs. - ("г-н", "господин"), # Mr. - ("д-р", "доктор"), # doctor + ("г-жа", "госпожа"), # Mrs. + ("г-н", "господин"), # Mr. + ("д-р", "доктор"), # doctor # Other abbreviations are less common or specialized. ] ], "nl": [ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [ - ("dhr", "de heer"), # Mr. + ("dhr", "de heer"), # Mr. ("mevr", "mevrouw"), # Mrs. - ("dr", "dokter"), # doctor - ("jhr", "jonkheer"), # young lord or nobleman + ("dr", "dokter"), # doctor + ("jhr", "jonkheer"), # young lord or nobleman # Dutch uses more abbreviations, but these are the most common ones. ] ], "tr": [ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [ - ("b", "bay"), # Mr. + ("b", "bay"), # Mr. ("byk", "büyük"), # büyük - ("dr", "doktor"), # doctor + ("dr", "doktor"), # doctor # Add other Turkish abbreviations here if needed. ] ], + "hu": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + ("dr", "doktor"), # doctor + ("b", "bácsi"), # Mr. + ("nőv", "nővér"), # nurse + # Add other Hungarian abbreviations here if needed. + ] + ], + "ko": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + # Korean doesn't typically use abbreviations in the same way as Latin-based scripts. 
+ ] + ], } -def expand_abbreviations_multilingual(text, lang='en'): + +def expand_abbreviations_multilingual(text, lang="en"): for regex, replacement in _abbreviations[lang]: text = re.sub(regex, replacement, text) return text + _symbols_multilingual = { - 'en': [ + "en": [ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) for x in [ ("&", " and "), @@ -173,10 +248,10 @@ def expand_abbreviations_multilingual(text, lang='en'): ("#", " hash "), ("$", " dollar "), ("£", " pound "), - ("°", " degree ") + ("°", " degree "), ] ], - 'es': [ + "es": [ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) for x in [ ("&", " y "), @@ -185,10 +260,10 @@ def expand_abbreviations_multilingual(text, lang='en'): ("#", " numeral "), ("$", " dolar "), ("£", " libra "), - ("°", " grados ") + ("°", " grados "), ] ], - 'fr': [ + "fr": [ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) for x in [ ("&", " et "), @@ -197,10 +272,10 @@ def expand_abbreviations_multilingual(text, lang='en'): ("#", " dièse "), ("$", " dollar "), ("£", " livre "), - ("°", " degrés ") + ("°", " degrés "), ] ], - 'de': [ + "de": [ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) for x in [ ("&", " und "), @@ -209,10 +284,10 @@ def expand_abbreviations_multilingual(text, lang='en'): ("#", " raute "), ("$", " dollar "), ("£", " pfund "), - ("°", " grad ") + ("°", " grad "), ] ], - 'pt': [ + "pt": [ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) for x in [ ("&", " e "), @@ -221,10 +296,10 @@ def expand_abbreviations_multilingual(text, lang='en'): ("#", " cardinal "), ("$", " dólar "), ("£", " libra "), - ("°", " graus ") + ("°", " graus "), ] ], - 'it': [ + "it": [ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) for x in [ ("&", " e "), @@ -233,10 +308,10 @@ def expand_abbreviations_multilingual(text, lang='en'): ("#", " cancelletto "), ("$", " dollaro "), ("£", " sterlina "), - ("°", " gradi ") + ("°", " gradi "), ] ], - 'pl': [ + "pl": [ (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) for x in [ ("&", " i "), @@ -245,7 +320,7 @@ def expand_abbreviations_multilingual(text, lang='en'): ("#", " krzyżyk "), ("$", " dolar "), ("£", " funt "), - ("°", " stopnie ") + ("°", " stopnie "), ] ], "ar": [ @@ -258,10 +333,10 @@ def expand_abbreviations_multilingual(text, lang='en'): ("#", " رقم "), ("$", " دولار "), ("£", " جنيه "), - ("°", " درجة ") + ("°", " درجة "), ] ], - "zh-cn": [ + "zh": [ # Chinese (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) for x in [ @@ -271,7 +346,7 @@ def expand_abbreviations_multilingual(text, lang='en'): ("#", " 号 "), ("$", " 美元 "), ("£", " 英镑 "), - ("°", " 度 ") + ("°", " 度 "), ] ], "cs": [ @@ -284,7 +359,7 @@ def expand_abbreviations_multilingual(text, lang='en'): ("#", " křížek "), ("$", " dolar "), ("£", " libra "), - ("°", " stupně ") + ("°", " stupně "), ] ], "ru": [ @@ -297,7 +372,7 @@ def expand_abbreviations_multilingual(text, lang='en'): ("#", " номер "), ("$", " доллар "), ("£", " фунт "), - ("°", " градус ") + ("°", " градус "), ] ], "nl": [ @@ -310,7 +385,7 @@ def expand_abbreviations_multilingual(text, lang='en'): ("#", " hekje "), ("$", " dollar "), ("£", " pond "), - ("°", " graden ") + ("°", " graden "), ] ], "tr": [ @@ -322,15 +397,41 @@ def expand_abbreviations_multilingual(text, lang='en'): ("#", " diyez "), ("$", " dolar "), ("£", " sterlin "), - ("°", " derece ") + ("°", " derece "), + ] + ], + "hu": [ + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " és "), + ("@", " kukac "), + 
("%", " százalék "), + ("#", " kettőskereszt "), + ("$", " dollár "), + ("£", " font "), + ("°", " fok "), + ] + ], + "ko": [ + # Korean + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " 그리고 "), + ("@", " 에 "), + ("%", " 퍼센트 "), + ("#", " 번호 "), + ("$", " 달러 "), + ("£", " 파운드 "), + ("°", " 도 "), ] ], } -def expand_symbols_multilingual(text, lang='en'): + +def expand_symbols_multilingual(text, lang="en"): for regex, replacement in _symbols_multilingual[lang]: text = re.sub(regex, replacement, text) - text = text.replace(' ', ' ') # Ensure there are no double spaces + text = text.replace(" ", " ") # Ensure there are no double spaces return text.strip() @@ -343,41 +444,47 @@ def expand_symbols_multilingual(text, lang='en'): "it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"), "pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"), "ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"), - "cs": re.compile(r"([0-9]+)\.(?=\s|$)"), # In Czech, a dot is often used after the number to indicate ordinals. + "cs": re.compile(r"([0-9]+)\.(?=\s|$)"), # In Czech, a dot is often used after the number to indicate ordinals. "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"), "nl": re.compile(r"([0-9]+)(de|ste|e)"), "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"), + "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"), + "ko": re.compile(r"([0-9]+)(번째|번|차|째)"), } _number_re = re.compile(r"[0-9]+") _currency_re = { - 'USD': re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"), - 'GBP': re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"), - 'EUR': re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))") + "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"), + "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"), + "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"), } _comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b") _dot_number_re = re.compile(r"\b\d{1,3}(.\d{3})*(\,\d+)?\b") _decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)") + def _remove_commas(m): text = m.group(0) if "," in text: text = text.replace(",", "") return text + def _remove_dots(m): text = m.group(0) if "." 
in text: text = text.replace(".", "") return text -def _expand_decimal_point(m, lang='en'): + +def _expand_decimal_point(m, lang="en"): amount = m.group(1).replace(",", ".") return num2words(float(amount), lang=lang if lang != "cs" else "cz") -def _expand_currency(m, lang='en', currency='USD'): - amount = float((re.sub(r'[^\d.]', '', m.group(0).replace(",", ".")))) - full_amount = num2words(amount, to='currency', currency=currency, lang=lang if lang != "cs" else "cz") + +def _expand_currency(m, lang="en", currency="USD"): + amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", ".")))) + full_amount = num2words(amount, to="currency", currency=currency, lang=lang if lang != "cs" else "cz") and_equivalents = { "en": ", ", @@ -392,6 +499,8 @@ def _expand_currency(m, lang='en', currency='USD'): "nl": ", ", "ar": ", ", "tr": ", ", + "hu": ", ", + "ko": ", ", } if amount.is_integer(): @@ -401,14 +510,17 @@ def _expand_currency(m, lang='en', currency='USD'): return full_amount -def _expand_ordinal(m, lang='en'): + +def _expand_ordinal(m, lang="en"): return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz") -def _expand_number(m, lang='en'): + +def _expand_number(m, lang="en"): return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz") -def expand_numbers_multilingual(text, lang='en'): - if lang == "zh-cn": + +def expand_numbers_multilingual(text, lang="en"): + if lang == "zh": text = zh_num2words()(text) else: if lang in ["en", "ru"]: @@ -416,9 +528,9 @@ def expand_numbers_multilingual(text, lang='en'): else: text = re.sub(_dot_number_re, _remove_dots, text) try: - text = re.sub(_currency_re['GBP'], lambda m: _expand_currency(m, lang, 'GBP'), text) - text = re.sub(_currency_re['USD'], lambda m: _expand_currency(m, lang, 'USD'), text) - text = re.sub(_currency_re['EUR'], lambda m: _expand_currency(m, lang, 'EUR'), text) + text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text) + text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text) + text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text) except: pass if lang != "tr": @@ -427,15 +539,18 @@ def expand_numbers_multilingual(text, lang='en'): text = re.sub(_number_re, lambda m: _expand_number(m, lang), text) return text + def lowercase(text): return text.lower() + def collapse_whitespace(text): return re.sub(_whitespace_re, " ", text) + def multilingual_cleaners(text, lang): - text = text.replace('"', '') - if lang=="tr": + text = text.replace('"', "") + if lang == "tr": text = text.replace("İ", "i") text = text.replace("Ö", "ö") text = text.replace("Ü", "ü") @@ -446,55 +561,94 @@ def multilingual_cleaners(text, lang): text = collapse_whitespace(text) return text + def basic_cleaners(text): """Basic pipeline that lowercases and collapses whitespace without transliteration.""" text = lowercase(text) text = collapse_whitespace(text) return text + def chinese_transliterate(text): - return "".join([p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)]) + return "".join( + [p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)] + ) + def japanese_cleaners(text, katsu): text = katsu.romaji(text) text = lowercase(text) return text -class VoiceBpeTokenizer: - def __init__(self, vocab_file=None, preprocess=None): - self.tokenizer = None - self.katsu = None - if vocab_file is not None: - with open(vocab_file, "r", 
encoding="utf-8") as f: - vocab = json.load(f) +def korean_transliterate(text): + r = Transliter(academic) + return r.translit(text) - self.language = vocab["model"]["language"] if "language" in vocab["model"] else None - if preprocess is None: - self.preprocess = "pre_tokenizer" in vocab and vocab["pre_tokenizer"] - else: - self.preprocess = preprocess +DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tokenizer.json") + +class VoiceBpeTokenizer: + def __init__(self, vocab_file=None): + self.tokenizer = None + if vocab_file is not None: self.tokenizer = Tokenizer.from_file(vocab_file) + self.char_limits = { + "en": 250, + "de": 253, + "fr": 273, + "es": 239, + "it": 213, + "pt": 203, + "pl": 224, + "zh": 82, + "ar": 166, + "cs": 186, + "ru": 182, + "nl": 251, + "tr": 226, + "ja": 71, + "hu": 224, + "ko": 95, + } + + @cached_property + def katsu(self): + import cutlet + + return cutlet.Cutlet() + + def check_input_length(self, txt, lang): + lang = lang.split("-")[0] # remove the region + limit = self.char_limits.get(lang, 250) + if len(txt) > limit: + print( + f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio." + ) def preprocess_text(self, txt, lang): - if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]: + if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}: txt = multilingual_cleaners(txt, lang) - if lang == "zh-cn": + if lang == "zh": txt = chinese_transliterate(txt) + if lang == "ko": + txt = korean_transliterate(txt) elif lang == "ja": - if self.katsu is None: - import cutlet - self.katsu = cutlet.Cutlet() txt = japanese_cleaners(txt, self.katsu) + elif lang == "hi": + # @manmay will implement this + txt = basic_cleaners(txt) else: - raise NotImplementedError() + raise NotImplementedError(f"Language '{lang}' is not supported.") return txt def encode(self, txt, lang): - if self.preprocess: - txt = self.preprocess_text(txt, lang) + lang = lang.split("-")[0] # remove the region + self.check_input_length(txt, lang) + txt = self.preprocess_text(txt, lang) + lang = "zh-cn" if lang == "zh" else lang + txt = f"[{lang}]{txt}" txt = txt.replace(" ", "[SPACE]") return self.tokenizer.encode(txt).ids @@ -512,3 +666,178 @@ def __len__(self): def get_number_tokens(self): return max(self.tokenizer.get_vocab().values()) + 1 + + +def test_expand_numbers_multilingual(): + test_cases = [ + # English + ("In 12.5 seconds.", "In twelve point five seconds.", "en"), + ("There were 50 soldiers.", "There were fifty soldiers.", "en"), + ("This is a 1st test", "This is a first test", "en"), + ("That will be $20 sir.", "That will be twenty dollars sir.", "en"), + ("That will be 20€ sir.", "That will be twenty euro sir.", "en"), + ("That will be 20.15€ sir.", "That will be twenty euro, fifteen cents sir.", "en"), + ("That's 100,000.5.", "That's one hundred thousand point five.", "en"), + # French + ("En 12,5 secondes.", "En douze virgule cinq secondes.", "fr"), + ("Il y avait 50 soldats.", "Il y avait cinquante soldats.", "fr"), + ("Ceci est un 1er test", "Ceci est un premier test", "fr"), + ("Cela vous fera $20 monsieur.", "Cela vous fera vingt dollars monsieur.", "fr"), + ("Cela vous fera 20€ monsieur.", "Cela vous fera vingt euros monsieur.", "fr"), + ("Cela vous fera 20,15€ monsieur.", "Cela vous fera vingt euros et quinze centimes monsieur.", "fr"), + ("Ce sera 100.000,5.", "Ce sera cent mille virgule 
cinq.", "fr"), + # German + ("In 12,5 Sekunden.", "In zwölf Komma fünf Sekunden.", "de"), + ("Es gab 50 Soldaten.", "Es gab fünfzig Soldaten.", "de"), + ("Dies ist ein 1. Test", "Dies ist ein erste Test", "de"), # Issue with gender + ("Das macht $20 Herr.", "Das macht zwanzig Dollar Herr.", "de"), + ("Das macht 20€ Herr.", "Das macht zwanzig Euro Herr.", "de"), + ("Das macht 20,15€ Herr.", "Das macht zwanzig Euro und fünfzehn Cent Herr.", "de"), + # Spanish + ("En 12,5 segundos.", "En doce punto cinco segundos.", "es"), + ("Había 50 soldados.", "Había cincuenta soldados.", "es"), + ("Este es un 1er test", "Este es un primero test", "es"), + ("Eso le costará $20 señor.", "Eso le costará veinte dólares señor.", "es"), + ("Eso le costará 20€ señor.", "Eso le costará veinte euros señor.", "es"), + ("Eso le costará 20,15€ señor.", "Eso le costará veinte euros con quince céntimos señor.", "es"), + # Italian + ("In 12,5 secondi.", "In dodici virgola cinque secondi.", "it"), + ("C'erano 50 soldati.", "C'erano cinquanta soldati.", "it"), + ("Questo è un 1° test", "Questo è un primo test", "it"), + ("Ti costerà $20 signore.", "Ti costerà venti dollari signore.", "it"), + ("Ti costerà 20€ signore.", "Ti costerà venti euro signore.", "it"), + ("Ti costerà 20,15€ signore.", "Ti costerà venti euro e quindici centesimi signore.", "it"), + # Portuguese + ("Em 12,5 segundos.", "Em doze vírgula cinco segundos.", "pt"), + ("Havia 50 soldados.", "Havia cinquenta soldados.", "pt"), + ("Este é um 1º teste", "Este é um primeiro teste", "pt"), + ("Isso custará $20 senhor.", "Isso custará vinte dólares senhor.", "pt"), + ("Isso custará 20€ senhor.", "Isso custará vinte euros senhor.", "pt"), + ( + "Isso custará 20,15€ senhor.", + "Isso custará vinte euros e quinze cêntimos senhor.", + "pt", + ), # "cêntimos" should be "centavos" num2words issue + # Polish + ("W 12,5 sekundy.", "W dwanaście przecinek pięć sekundy.", "pl"), + ("Było 50 żołnierzy.", "Było pięćdziesiąt żołnierzy.", "pl"), + ("To będzie kosztować 20€ panie.", "To będzie kosztować dwadzieścia euro panie.", "pl"), + ("To będzie kosztować 20,15€ panie.", "To będzie kosztować dwadzieścia euro, piętnaście centów panie.", "pl"), + # Arabic + ("في الـ 12,5 ثانية.", "في الـ اثنا عشر , خمسون ثانية.", "ar"), + ("كان هناك 50 جنديًا.", "كان هناك خمسون جنديًا.", "ar"), + # ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are mising from num2words + # ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'), + # Czech + ("Za 12,5 vteřiny.", "Za dvanáct celá pět vteřiny.", "cs"), + ("Bylo tam 50 vojáků.", "Bylo tam padesát vojáků.", "cs"), + ("To bude stát 20€ pane.", "To bude stát dvacet euro pane.", "cs"), + ("To bude 20.15€ pane.", "To bude dvacet euro, patnáct centů pane.", "cs"), + # Russian + ("Через 12.5 секунды.", "Через двенадцать запятая пять секунды.", "ru"), + ("Там было 50 солдат.", "Там было пятьдесят солдат.", "ru"), + ("Это будет 20.15€ сэр.", "Это будет двадцать евро, пятнадцать центов сэр.", "ru"), + ("Это будет стоить 20€ господин.", "Это будет стоить двадцать евро господин.", "ru"), + # Dutch + ("In 12,5 seconden.", "In twaalf komma vijf seconden.", "nl"), + ("Er waren 50 soldaten.", "Er waren vijftig soldaten.", "nl"), + ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"), + ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"), + # Chinese (Simplified) + ("在12.5秒内", "在十二点五秒内", "zh"), + ("有50名士兵", "有五十名士兵", "zh"), + # ("那将是$20先生", '那将是二十美元先生', 'zh'), 
currency doesn't work + # ("那将是20€先生", '那将是二十欧元先生', 'zh'), + # Turkish + # ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR + ("50 asker vardı.", "elli asker vardı.", "tr"), + ("Bu 1. test", "Bu birinci test", "tr"), + # ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'), + # Hungarian + ("12,5 másodperc alatt.", "tizenkettő egész öt tized másodperc alatt.", "hu"), + ("50 katona volt.", "ötven katona volt.", "hu"), + ("Ez az 1. teszt", "Ez az első teszt", "hu"), + # Korean + ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"), + ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"), + ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"), + ] + for a, b, lang in test_cases: + out = expand_numbers_multilingual(a, lang=lang) + assert out == b, f"'{out}' vs '{b}'" + + +def test_abbreviations_multilingual(): + test_cases = [ + # English + ("Hello Mr. Smith.", "Hello mister Smith.", "en"), + ("Dr. Jones is here.", "doctor Jones is here.", "en"), + # Spanish + ("Hola Sr. Garcia.", "Hola señor Garcia.", "es"), + ("La Dra. Martinez es muy buena.", "La doctora Martinez es muy buena.", "es"), + # French + ("Bonjour Mr. Dupond.", "Bonjour monsieur Dupond.", "fr"), + ("Mme. Moreau est absente aujourd'hui.", "madame Moreau est absente aujourd'hui.", "fr"), + # German + ("Frau Dr. Müller ist sehr klug.", "Frau doktor Müller ist sehr klug.", "de"), + # Portuguese + ("Olá Sr. Silva.", "Olá senhor Silva.", "pt"), + ("Dra. Costa, você está disponível?", "doutora Costa, você está disponível?", "pt"), + # Italian + ("Buongiorno, Sig. Rossi.", "Buongiorno, signore Rossi.", "it"), + # ("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern + # Polish + ("Dzień dobry, P. Kowalski.", "Dzień dobry, pani Kowalski.", "pl"), + ("M. Nowak, czy mogę zadać pytanie?", "pan Nowak, czy mogę zadać pytanie?", "pl"), + # Czech + ("P. Novák", "pan Novák", "cs"), + ("Dr. Vojtěch", "doktor Vojtěch", "cs"), + # Dutch + ("Dhr. Jansen", "de heer Jansen", "nl"), + ("Mevr. de Vries", "mevrouw de Vries", "nl"), + # Russian + ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", "ru"), + ("Д-р Смирнов здесь, чтобы увидеть вас.", "доктор Смирнов здесь, чтобы увидеть вас.", "ru"), + # Turkish + ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", "tr"), + ("Dr. Ayşe burada.", "doktor Ayşe burada.", "tr"), + # Hungarian + ("Dr. 
Szabó itt van.", "doktor Szabó itt van.", "hu"), + ] + + for a, b, lang in test_cases: + out = expand_abbreviations_multilingual(a, lang=lang) + assert out == b, f"'{out}' vs '{b}'" + + +def test_symbols_multilingual(): + test_cases = [ + ("I have 14% battery", "I have 14 percent battery", "en"), + ("Te veo @ la fiesta", "Te veo arroba la fiesta", "es"), + ("J'ai 14° de fièvre", "J'ai 14 degrés de fièvre", "fr"), + ("Die Rechnung beträgt £ 20", "Die Rechnung beträgt pfund 20", "de"), + ("O meu email é ana&joao@gmail.com", "O meu email é ana e joao arroba gmail.com", "pt"), + ("linguaggio di programmazione C#", "linguaggio di programmazione C cancelletto", "it"), + ("Moja temperatura to 36.6°", "Moja temperatura to 36.6 stopnie", "pl"), + ("Mám 14% baterie", "Mám 14 procento baterie", "cs"), + ("Těším se na tebe @ party", "Těším se na tebe na party", "cs"), + ("У меня 14% заряда", "У меня 14 процентов заряда", "ru"), + ("Я буду @ дома", "Я буду собака дома", "ru"), + ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"), + ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"), + ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"), + ("我的电量为 14%", "我的电量为 14 百分之", "zh"), + ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"), + ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"), + ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"), + ] + + for a, b, lang in test_cases: + out = expand_symbols_multilingual(a, lang=lang) + assert out == b, f"'{out}' vs '{b}'" + + +if __name__ == "__main__": + test_expand_numbers_multilingual() + test_abbreviations_multilingual() + test_symbols_multilingual() diff --git a/TTS/tts/layers/xtts/trainer/dataset.py b/TTS/tts/layers/xtts/trainer/dataset.py new file mode 100644 index 0000000000..2f958cb5a5 --- /dev/null +++ b/TTS/tts/layers/xtts/trainer/dataset.py @@ -0,0 +1,239 @@ +import os +import random +import sys + +import torch +import torch.nn.functional as F +import torch.utils.data + +from TTS.tts.models.xtts import load_audio + +torch.set_num_threads(1) + + +def key_samples_by_col(samples, col): + """Returns a dictionary of samples keyed by language.""" + samples_by_col = {} + for sample in samples: + col_val = sample[col] + assert isinstance(col_val, str) + if col_val not in samples_by_col: + samples_by_col[col_val] = [] + samples_by_col[col_val].append(sample) + return samples_by_col + + +def get_prompt_slice(gt_path, max_sample_length, min_sample_length, sample_rate, is_eval=False): + rel_clip = load_audio(gt_path, sample_rate) + # if eval uses a middle size sample when it is possible to be more reproducible + if is_eval: + sample_length = int((min_sample_length + max_sample_length) / 2) + else: + sample_length = random.randint(min_sample_length, max_sample_length) + gap = rel_clip.shape[-1] - sample_length + if gap < 0: + sample_length = rel_clip.shape[-1] // 2 + gap = rel_clip.shape[-1] - sample_length + + # if eval start always from the position 0 to be more reproducible + if is_eval: + rand_start = 0 + else: + rand_start = random.randint(0, gap) + + rand_end = rand_start + sample_length + rel_clip = rel_clip[:, rand_start:rand_end] + rel_clip = F.pad(rel_clip, pad=(0, max_sample_length - rel_clip.shape[-1])) + cond_idxs = [rand_start, rand_end] + return rel_clip, rel_clip.shape[-1], cond_idxs + + +class XTTSDataset(torch.utils.data.Dataset): + def __init__(self, config, samples, tokenizer, sample_rate, is_eval=False): + self.config = config + model_args = config.model_args + self.failed_samples = set() 
+        self.debug_failures = model_args.debug_loading_failures
+        self.max_conditioning_length = model_args.max_conditioning_length
+        self.min_conditioning_length = model_args.min_conditioning_length
+        self.is_eval = is_eval
+        self.tokenizer = tokenizer
+        self.sample_rate = sample_rate
+        self.max_wav_len = model_args.max_wav_length
+        self.max_text_len = model_args.max_text_length
+        self.use_masking_gt_prompt_approach = model_args.gpt_use_masking_gt_prompt_approach
+        assert self.max_wav_len is not None and self.max_text_len is not None
+
+        self.samples = samples
+        if not is_eval:
+            random.seed(config.training_seed)
+            # random.shuffle(self.samples)
+            random.shuffle(self.samples)
+            # order by language
+            self.samples = key_samples_by_col(self.samples, "language")
+            print(" > Sampling by language:", self.samples.keys())
+        else:
+            # for evaluation, load and check the samples upfront, dropping corrupted ones to ensure reproducibility
+            self.check_eval_samples()
+
+    def check_eval_samples(self):
+        print(" > Filtering invalid eval samples!!")
+        new_samples = []
+        for sample in self.samples:
+            try:
+                tseq, _, wav, _, _, _ = self.load_item(sample)
+            except:
+                continue
+            # Basically, this audio file is nonexistent or too long to be supported by the dataset.
+            if (
+                wav is None
+                or (self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len)
+                or (self.max_text_len is not None and tseq.shape[0] > self.max_text_len)
+            ):
+                continue
+            new_samples.append(sample)
+        self.samples = new_samples
+        print(" > Total eval samples after filtering:", len(self.samples))
+
+    def get_text(self, text, lang):
+        tokens = self.tokenizer.encode(text, lang)
+        tokens = torch.IntTensor(tokens)
+        assert not torch.any(tokens == 1), f"UNK token found in {text} -> {self.tokenizer.decode(tokens)}"
+        # The stop token should always be sacred.
+        assert not torch.any(tokens == 0), f"Stop token found in {text}"
+        return tokens
+
+    def load_item(self, sample):
+        text = str(sample["text"])
+        tseq = self.get_text(text, sample["language"])
+        audiopath = sample["audio_file"]
+        wav = load_audio(audiopath, self.sample_rate)
+        if text is None or len(text.strip()) == 0:
+            raise ValueError
+        if wav is None or wav.shape[-1] < (0.5 * self.sample_rate):
+            # Ultra short clips are also useless (and can cause problems within some models).
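+            # (anything shorter than 0.5 seconds of audio at the target sample rate is rejected here)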
+            raise ValueError
+
+        if self.use_masking_gt_prompt_approach:
+            # get a slice from GT to condition the model
+            cond, _, cond_idxs = get_prompt_slice(
+                audiopath, self.max_conditioning_length, self.min_conditioning_length, self.sample_rate, self.is_eval
+            )
+            # when masking is used, cond_len is not needed
+            cond_len = torch.nan
+        else:
+            ref_sample = (
+                sample["reference_path"]
+                if "reference_path" in sample and sample["reference_path"] is not None
+                else audiopath
+            )
+            cond, cond_len, _ = get_prompt_slice(
+                ref_sample, self.max_conditioning_length, self.min_conditioning_length, self.sample_rate, self.is_eval
+            )
+            # when masking is not used, use cond_len instead of cond_idxs
+            cond_idxs = torch.nan
+
+        return tseq, audiopath, wav, cond, cond_len, cond_idxs
+
+    def __getitem__(self, index):
+        if self.is_eval:
+            sample = self.samples[index]
+            sample_id = str(index)
+        else:
+            # select a random language
+            lang = random.choice(list(self.samples.keys()))
+            # select a random sample
+            index = random.randint(0, len(self.samples[lang]) - 1)
+            sample = self.samples[lang][index]
+            # a unique id for each sample to track loading failures
+            sample_id = lang + "_" + str(index)
+
+        # skip samples that are already known to be invalid
+        if sample_id in self.failed_samples:
+            if self.debug_failures:
+                print(f"Ignoring sample {sample['audio_file']} because it was already ignored before !!")
+            # call __getitem__ again to draw another sample
+            return self[1]
+
+        # try to load the sample; if it fails, add it to the failed samples list
+        try:
+            tseq, audiopath, wav, cond, cond_len, cond_idxs = self.load_item(sample)
+        except:
+            if self.debug_failures:
+                print(f"error loading {sample['audio_file']} {sys.exc_info()}")
+            self.failed_samples.add(sample_id)
+            return self[1]
+
+        # check the audio and text size limits; if the sample is out of bounds, add it to failed_samples
+        if (
+            wav is None
+            or (self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len)
+            or (self.max_text_len is not None and tseq.shape[0] > self.max_text_len)
+        ):
+            # Basically, this audio file is nonexistent or too long to be supported by the dataset.
+            # It's hard to handle this situation properly. Best bet is to return another random valid sample and skew the dataset somewhat as a result.
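+            # Returning self[1] triggers another draw: in training mode __getitem__ ignores
+            # the given index and picks a fresh random (language, sample) pair anyway.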
+ if self.debug_failures and wav is not None and tseq is not None: + print( + f"error loading {sample['audio_file']}: ranges are out of bounds; {wav.shape[-1]}, {tseq.shape[0]}" + ) + self.failed_samples.add(sample_id) + return self[1] + + res = { + # 'real_text': text, + "text": tseq, + "text_lengths": torch.tensor(tseq.shape[0], dtype=torch.long), + "wav": wav, + "wav_lengths": torch.tensor(wav.shape[-1], dtype=torch.long), + "filenames": audiopath, + "conditioning": cond.unsqueeze(1), + "cond_lens": torch.tensor(cond_len, dtype=torch.long) + if cond_len is not torch.nan + else torch.tensor([cond_len]), + "cond_idxs": torch.tensor(cond_idxs) if cond_idxs is not torch.nan else torch.tensor([cond_idxs]), + } + return res + + def __len__(self): + if self.is_eval: + return len(self.samples) + return sum([len(v) for v in self.samples.values()]) + + def collate_fn(self, batch): + # convert list of dicts to dict of lists + B = len(batch) + + batch = {k: [dic[k] for dic in batch] for k in batch[0]} + + # stack for features that already have the same shape + batch["wav_lengths"] = torch.stack(batch["wav_lengths"]) + batch["text_lengths"] = torch.stack(batch["text_lengths"]) + batch["conditioning"] = torch.stack(batch["conditioning"]) + batch["cond_lens"] = torch.stack(batch["cond_lens"]) + batch["cond_idxs"] = torch.stack(batch["cond_idxs"]) + + if torch.any(batch["cond_idxs"].isnan()): + batch["cond_idxs"] = None + + if torch.any(batch["cond_lens"].isnan()): + batch["cond_lens"] = None + + max_text_len = batch["text_lengths"].max() + max_wav_len = batch["wav_lengths"].max() + + # create padding tensors + text_padded = torch.IntTensor(B, max_text_len) + wav_padded = torch.FloatTensor(B, 1, max_wav_len) + + # initialize tensors for zero padding + text_padded = text_padded.zero_() + wav_padded = wav_padded.zero_() + for i in range(B): + text = batch["text"][i] + text_padded[i, : batch["text_lengths"][i]] = torch.IntTensor(text) + wav = batch["wav"][i] + wav_padded[i, :, : batch["wav_lengths"][i]] = torch.FloatTensor(wav) + + batch["wav"] = wav_padded + batch["padded_text"] = text_padded + return batch diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py new file mode 100644 index 0000000000..6276f60af6 --- /dev/null +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -0,0 +1,500 @@ +from dataclasses import dataclass, field +from typing import Dict, List, Tuple, Union + +import torch +import torch.nn as nn +import torchaudio +from coqpit import Coqpit +from torch.nn import functional as F +from torch.utils.data import DataLoader +from trainer.torch import DistributedSampler +from trainer.trainer_utils import get_optimizer, get_scheduler + +from TTS.tts.configs.xtts_config import XttsConfig +from TTS.tts.datasets.dataset import TTSDataset +from TTS.tts.layers.tortoise.arch_utils import TorchMelSpectrogram +from TTS.tts.layers.xtts.dvae import DiscreteVAE +from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer +from TTS.tts.layers.xtts.trainer.dataset import XTTSDataset +from TTS.tts.models.base_tts import BaseTTS +from TTS.tts.models.xtts import Xtts, XttsArgs, XttsAudioConfig +from TTS.utils.io import load_fsspec + + +@dataclass +class GPTTrainerConfig(XttsConfig): + lr: float = 5e-06 + training_seed: int = 1 + optimizer_wd_only_on_weights: bool = False + weighted_loss_attrs: dict = field(default_factory=lambda: {}) + weighted_loss_multipliers: dict = field(default_factory=lambda: {}) + test_sentences: List[dict] = field(default_factory=lambda: 
[])
+
+
+@dataclass
+class XttsAudioConfig(XttsAudioConfig):
+    dvae_sample_rate: int = 22050
+
+
+@dataclass
+class GPTArgs(XttsArgs):
+    min_conditioning_length: int = 66150
+    max_conditioning_length: int = 132300
+    gpt_loss_text_ce_weight: float = 0.01
+    gpt_loss_mel_ce_weight: float = 1.0
+    gpt_num_audio_tokens: int = 8194
+    debug_loading_failures: bool = False
+    max_wav_length: int = 255995  # ~11.6 seconds
+    max_text_length: int = 200
+    tokenizer_file: str = ""
+    mel_norm_file: str = "https://coqui.gateway.scarf.sh/v0.14.0_models/mel_norms.pth"
+    dvae_checkpoint: str = ""
+    xtts_checkpoint: str = ""
+    gpt_checkpoint: str = ""  # if defined, it will replace the GPT weights of the XTTS model
+    vocoder: str = ""  # override the vocoder key in the config to avoid JSON write issues
+
+
+def callback_clearml_load_save(operation_type, model_info):
+    # returning None skips the file upload/log; returning model_info continues with the log/upload
+    # you can also change the upload destination file name via model_info.upload_filename, or check the local file size with Path(model_info.local_model_path).stat().st_size
+    assert operation_type in ("load", "save")
+    # print(operation_type, model_info.__dict__)
+
+    if "similarities.pth" in model_info.__dict__["local_model_path"]:
+        return None
+
+    return model_info
+
+
+class GPTTrainer(BaseTTS):
+    def __init__(self, config: Coqpit):
+        """
+        XTTS GPT training class
+        """
+        super().__init__(config, ap=None, tokenizer=None)
+        self.config = config
+        # init XTTS model
+        self.xtts = Xtts(self.config)
+        # create the tokenizer with the target vocabulary
+        self.xtts.tokenizer = VoiceBpeTokenizer(self.args.tokenizer_file)
+        # init gpt encoder and hifigan decoder
+        self.xtts.init_models()
+
+        if self.args.xtts_checkpoint:
+            self.load_checkpoint(self.config, self.args.xtts_checkpoint, eval=False, strict=False)
+
+        # set mel stats
+        if self.args.mel_norm_file:
+            self.xtts.mel_stats = load_fsspec(self.args.mel_norm_file)
+
+        # load GPT if available
+        if self.args.gpt_checkpoint:
+            gpt_checkpoint = torch.load(self.args.gpt_checkpoint, map_location=torch.device("cpu"))
+            # handle models exported by the coqui Trainer
+            if "model" in gpt_checkpoint.keys() and "config" in gpt_checkpoint.keys():
+                print("Coqui Trainer checkpoint detected! Converting it!")
+                gpt_checkpoint = gpt_checkpoint["model"]
+            states_keys = list(gpt_checkpoint.keys())
+            for key in states_keys:
+                if "gpt." in key:
+                    new_key = key.replace("gpt.", "")
+                    gpt_checkpoint[new_key] = gpt_checkpoint[key]
+                    del gpt_checkpoint[key]
+                else:
+                    del gpt_checkpoint[key]
+
+            # edit the checkpoint if the number of text tokens changed, to enable the best possible transfer learning
+            if (
+                "text_embedding.weight" in gpt_checkpoint
+                and gpt_checkpoint["text_embedding.weight"].shape != self.xtts.gpt.text_embedding.weight.shape
+            ):
+                num_new_tokens = (
+                    self.xtts.gpt.text_embedding.weight.shape[0] - gpt_checkpoint["text_embedding.weight"].shape[0]
+                )
+                print(f" > Loading checkpoint with {num_new_tokens} additional tokens.")
+
+                # add new rows to the text embedding, keeping the original start token in the last row
+                emb_g = gpt_checkpoint["text_embedding.weight"]
+                new_row = torch.randn(num_new_tokens, emb_g.shape[1])
+                start_token_row = emb_g[-1, :]
+                emb_g = torch.cat([emb_g, new_row], axis=0)
+                emb_g[-1, :] = start_token_row
+                gpt_checkpoint["text_embedding.weight"] = emb_g
+
+                # add new weights to the linear layer (text_head)
+                text_head_weight = gpt_checkpoint["text_head.weight"]
+                start_token_row = text_head_weight[-1, :]
+                new_entry = torch.randn(num_new_tokens, self.xtts.gpt.text_head.weight.shape[1])
+                text_head_weight = torch.cat([text_head_weight, new_entry], axis=0)
+                text_head_weight[-1, :] = start_token_row
+                gpt_checkpoint["text_head.weight"] = text_head_weight
+
+                # add new biases to the linear layer (text_head)
+                text_head_bias = gpt_checkpoint["text_head.bias"]
+                start_token_row = text_head_bias[-1]
+                new_bias_entry = torch.zeros(num_new_tokens)
+                text_head_bias = torch.cat([text_head_bias, new_bias_entry], axis=0)
+                text_head_bias[-1] = start_token_row
+                gpt_checkpoint["text_head.bias"] = text_head_bias
+
+            self.xtts.gpt.load_state_dict(gpt_checkpoint, strict=True)
+            print(">> GPT weights restored from:", self.args.gpt_checkpoint)
+
+        # Mel spectrogram extractor for conditioning
+        if self.args.gpt_use_perceiver_resampler:
+            self.torch_mel_spectrogram_style_encoder = TorchMelSpectrogram(
+                filter_length=2048,
+                hop_length=256,
+                win_length=1024,
+                normalize=False,
+                sampling_rate=config.audio.sample_rate,
+                mel_fmin=0,
+                mel_fmax=8000,
+                n_mel_channels=80,
+                mel_norm_file=self.args.mel_norm_file,
+            )
+        else:
+            self.torch_mel_spectrogram_style_encoder = TorchMelSpectrogram(
+                filter_length=4096,
+                hop_length=1024,
+                win_length=4096,
+                normalize=False,
+                sampling_rate=config.audio.sample_rate,
+                mel_fmin=0,
+                mel_fmax=8000,
+                n_mel_channels=80,
+                mel_norm_file=self.args.mel_norm_file,
+            )
+
+        # Load DVAE
+        self.dvae = DiscreteVAE(
+            channels=80,
+            normalization=None,
+            positional_dims=1,
+            num_tokens=self.args.gpt_num_audio_tokens - 2,
+            codebook_dim=512,
+            hidden_dim=512,
+            num_resnet_blocks=3,
+            kernel_size=3,
+            num_layers=2,
+            use_transposed_convs=False,
+        )
+
+        self.dvae.eval()
+        if self.args.dvae_checkpoint:
+            dvae_checkpoint = torch.load(self.args.dvae_checkpoint, map_location=torch.device("cpu"))
+            self.dvae.load_state_dict(dvae_checkpoint, strict=False)
+            print(">> DVAE weights restored from:", self.args.dvae_checkpoint)
+        else:
+            raise RuntimeError(
+                "You need to specify config.model_args.dvae_checkpoint path to be able to train the GPT decoder!!"
+            )
+
+        # Mel spectrogram extractor for DVAE
+        self.torch_mel_spectrogram_dvae = TorchMelSpectrogram(
+            mel_norm_file=self.args.mel_norm_file, sampling_rate=config.audio.dvae_sample_rate
+        )
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
+    def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels, cond_idxs, cond_lens):
+        """
+        Forward pass that conditions the GPT on both text and voice.
+
+        text_inputs: long tensor, (b,t)
+        text_lengths: long tensor, (b,)
+        audio_codes: long tensor, (b,m)
+        wav_lengths: long tensor, (b,)
+        cond_mels: MEL float tensor, (b, num_samples, 80, t_m)
+        cond_idxs: cond start and end indices, (b, 2)
+        cond_lens: long tensor, (b,)
+        """
+        losses = self.xtts.gpt(
+            text_inputs,
+            text_lengths,
+            audio_codes,
+            wav_lengths,
+            cond_mels=cond_mels,
+            cond_idxs=cond_idxs,
+            cond_lens=cond_lens,
+        )
+        return losses
+
+    @torch.no_grad()
+    def test_run(self, assets) -> Tuple[Dict, Dict]:  # pylint: disable=W0613
+        test_audios = {}
+        if self.config.test_sentences:
+            # init gpt for inference mode
+            self.xtts.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache, use_deepspeed=False)
+            self.xtts.gpt.eval()
+            print(" | > Synthesizing test sentences.")
+            for idx, s_info in enumerate(self.config.test_sentences):
+                wav = self.xtts.synthesize(
+                    s_info["text"],
+                    self.config,
+                    s_info["speaker_wav"],
+                    s_info["language"],
+                    gpt_cond_len=3,
+                )["wav"]
+                test_audios["{}-audio".format(idx)] = wav
+
+            # delete inference layers
+            del self.xtts.gpt.gpt_inference
+            del self.xtts.gpt.gpt.wte
+        return {"audios": test_audios}
+
+    def test_log(
+        self, outputs: dict, logger: "Logger", assets: dict, steps: int  # pylint: disable=unused-argument
+    ) -> None:
+        logger.test_audios(steps, outputs["audios"], self.args.output_sample_rate)
+
+    def format_batch(self, batch: Dict) -> Dict:
+        return batch
+
+    @torch.no_grad()  # no grad to avoid gradients from the pre-processing and DVAE code extraction
+    def format_batch_on_device(self, batch):
+        """Compute spectrograms on the device."""
+        batch["text_lengths"] = batch["text_lengths"]
+        batch["wav_lengths"] = batch["wav_lengths"]
+        batch["text_inputs"] = batch["padded_text"]
+        batch["cond_idxs"] = batch["cond_idxs"]
+        # compute conditioning mel specs
+        # reshape waves from torch.Size([B, num_cond_samples, 1, T]) to torch.Size([B * num_cond_samples, 1, T]) because it is faster than iterating over the tensor
+        B, num_cond_samples, C, T = batch["conditioning"].size()
+        conditioning_reshaped = batch["conditioning"].view(B * num_cond_samples, C, T)
+        paired_conditioning_mel = self.torch_mel_spectrogram_style_encoder(conditioning_reshaped)
+        # reshape torch.Size([B * num_cond_samples, n_mel, T_mel]) into torch.Size([B, num_cond_samples, n_mel, T_mel])
+        n_mel = self.torch_mel_spectrogram_style_encoder.n_mel_channels  # paired_conditioning_mel.size(1)
+        T_mel = paired_conditioning_mel.size(2)
+        paired_conditioning_mel = paired_conditioning_mel.view(B, num_cond_samples, n_mel, T_mel)
+        # get the conditioning embeddings
+        batch["cond_mels"] = paired_conditioning_mel
+        # compute codes using DVAE
+        if self.config.audio.sample_rate != self.config.audio.dvae_sample_rate:
+            dvae_wav = torchaudio.functional.resample(
+                batch["wav"],
+                orig_freq=self.config.audio.sample_rate,
+                new_freq=self.config.audio.dvae_sample_rate,
+                lowpass_filter_width=64,
+                rolloff=0.9475937167399596,
+                resampling_method="kaiser_window",
+                beta=14.769656459379492,
+            )
+        else:
+            dvae_wav = batch["wav"]
+        dvae_mel_spec = self.torch_mel_spectrogram_dvae(dvae_wav)
+        codes = self.dvae.get_codebook_indices(dvae_mel_spec)
+
+        batch["audio_codes"] = codes
+        # delete batch tensors that are no longer needed
+        del batch["padded_text"]
+        del batch["wav"]
+        del batch["conditioning"]
+        return batch
+
+    def train_step(self, batch, criterion):
+        loss_dict = {}
+        cond_mels = batch["cond_mels"]
+        text_inputs = batch["text_inputs"]
+        text_lengths = batch["text_lengths"]
+        audio_codes = batch["audio_codes"]
+        wav_lengths = batch["wav_lengths"]
+        cond_idxs = batch["cond_idxs"]
+        cond_lens = batch["cond_lens"]
+
+        loss_text, loss_mel, _ = self.forward(
+            text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels, cond_idxs, cond_lens
+        )
+        loss_dict["loss_text_ce"] = loss_text * self.args.gpt_loss_text_ce_weight
+        loss_dict["loss_mel_ce"] = loss_mel * self.args.gpt_loss_mel_ce_weight
+        loss_dict["loss"] = loss_dict["loss_text_ce"] + loss_dict["loss_mel_ce"]
+        return {"model_outputs": None}, loss_dict
+
+    def eval_step(self, batch, criterion):
+        # ignore masking for more consistent evaluation
+        batch["cond_idxs"] = None
+        return self.train_step(batch, criterion)
+
+    def on_train_epoch_start(self, trainer):
+        trainer.model.eval()  # set the whole model to eval mode
+        # put the gpt model back in training mode
+        trainer.model.xtts.gpt.train()
+
+    def on_init_end(self, trainer):  # pylint: disable=W0613
+        # ignore similarities.pth on clearml save/upload
+        if self.config.dashboard_logger.lower() == "clearml":
+            from clearml.binding.frameworks import WeightsFileHandler
+
+            WeightsFileHandler.add_pre_callback(callback_clearml_load_save)
+
+    @torch.no_grad()
+    def inference(
+        self,
+        x,
+        aux_input=None,
+    ):  # pylint: disable=dangerous-default-value
+        return None
+
+    @staticmethod
+    def get_criterion():
+        return None
+
+    def get_sampler(self, dataset: TTSDataset, num_gpus=1):
+        # sampler for DDP
+        batch_sampler = DistributedSampler(dataset) if num_gpus > 1 else None
+        return batch_sampler
+
+    def get_data_loader(
+        self,
+        config: Coqpit,
+        assets: Dict,
+        is_eval: bool,
+        samples: Union[List[Dict], List[List]],
+        verbose: bool,
+        num_gpus: int,
+        rank: int = None,
+    ) -> "DataLoader":  # pylint: disable=W0613
+        if is_eval and not config.run_eval:
+            loader = None
+        else:
+            # init dataloader
+            dataset = XTTSDataset(self.config, samples, self.xtts.tokenizer, config.audio.sample_rate, is_eval)
+
+            # wait for all DDP processes to be ready
+            if num_gpus > 1:
+                torch.distributed.barrier()
+
+            # sort input sequences from short to long
+            # dataset.preprocess_samples()
+
+            # get samplers
+            sampler = self.get_sampler(dataset, num_gpus)
+
+            # ignore the sampler during eval; changing sampler parameters would make runs incomparable
+            if sampler is None or is_eval:
+                loader = DataLoader(
+                    dataset,
+                    batch_size=config.eval_batch_size if is_eval else config.batch_size,
+                    shuffle=False,
+                    drop_last=False,
+                    collate_fn=dataset.collate_fn,
+                    num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers,
+                    pin_memory=False,
+                )
+            else:
+                loader = DataLoader(
+                    dataset,
+                    batch_sampler=sampler,
+                    collate_fn=dataset.collate_fn,
+                    num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers,
+                    pin_memory=False,
+                )
+        return loader
+
+    def get_optimizer(self) -> List:
+        """Initiate and return the optimizer based on the config parameters."""
+        # ToDo: deal with multi GPU training
+        if self.config.optimizer_wd_only_on_weights:
+            # restrict to the GPT model parameters only
+            net =
self.xtts.gpt + + # normalizations + norm_modules = ( + nn.BatchNorm2d, + nn.InstanceNorm2d, + nn.BatchNorm1d, + nn.InstanceNorm1d, + nn.BatchNorm3d, + nn.InstanceNorm3d, + nn.GroupNorm, + nn.LayerNorm, + ) + # nn.Embedding + emb_modules = (nn.Embedding, nn.EmbeddingBag) + + param_names_notweights = set() + all_param_names = set() + param_map = {} + for mn, m in net.named_modules(): + for k, v in m.named_parameters(): + v.is_bias = k.endswith(".bias") + v.is_weight = k.endswith(".weight") + v.is_norm = isinstance(m, norm_modules) + v.is_emb = isinstance(m, emb_modules) + + fpn = "%s.%s" % (mn, k) if mn else k # full param name + all_param_names.add(fpn) + param_map[fpn] = v + if v.is_bias or v.is_norm or v.is_emb: + param_names_notweights.add(fpn) + + params_names_notweights = sorted(list(param_names_notweights)) + params_notweights = [param_map[k] for k in params_names_notweights] + params_names_weights = sorted(list(all_param_names ^ param_names_notweights)) + params_weights = [param_map[k] for k in params_names_weights] + + groups = [ + {"params": params_weights, "weight_decay": self.config.optimizer_params["weight_decay"]}, + {"params": params_notweights, "weight_decay": 0}, + ] + # torch.optim.AdamW + opt = get_optimizer( + self.config.optimizer, + self.config.optimizer_params, + self.config.lr, + parameters=groups, + ) + opt._group_names = [params_names_weights, params_names_notweights] + return opt + + return get_optimizer( + self.config.optimizer, + self.config.optimizer_params, + self.config.lr, + # optimize only for the GPT model + parameters=self.xtts.gpt.parameters(), + ) + + def get_scheduler(self, optimizer) -> List: + """Set the scheduler for the optimizer. + + Args: + optimizer: `torch.optim.Optimizer`. + """ + return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, optimizer) + + def load_checkpoint( + self, + config, + checkpoint_path, + eval=False, + strict=True, + cache_storage="/tmp/tts_cache", + target_protocol="s3", + target_options={"anon": True}, + ): # pylint: disable=unused-argument, disable=W0201, disable=W0102, redefined-builtin + """Load the model checkpoint and setup for training or inference""" + + state = self.xtts.get_compatible_checkpoint_state_dict(checkpoint_path) + + # load the model weights + self.xtts.load_state_dict(state, strict=strict) + + if eval: + self.xtts.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache, use_deepspeed=False) + self.eval() + assert not self.training + + @staticmethod + def init_from_config(config: "GPTTrainerConfig", samples: Union[List[List], List[Dict]] = None): + """Initiate model from config + + Args: + config (GPTTrainerConfig): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. 
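+
+        Returns:
+            GPTTrainer: a trainer instance initialized from `config`.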
+ """ + return GPTTrainer(config) diff --git a/TTS/tts/layers/xtts/vocoder.py b/TTS/tts/layers/xtts/vocoder.py deleted file mode 100644 index 0f4991b886..0000000000 --- a/TTS/tts/layers/xtts/vocoder.py +++ /dev/null @@ -1,385 +0,0 @@ -import json -from dataclasses import dataclass -from enum import Enum -from typing import Callable, Optional - -import torch -import torch.nn as nn -import torch.nn.functional as F - -MAX_WAV_VALUE = 32768.0 - - -class KernelPredictor(torch.nn.Module): - """Kernel predictor for the location-variable convolutions""" - - def __init__( - self, - cond_channels, - conv_in_channels, - conv_out_channels, - conv_layers, - conv_kernel_size=3, - kpnet_hidden_channels=64, - kpnet_conv_size=3, - kpnet_dropout=0.0, - kpnet_nonlinear_activation="LeakyReLU", - kpnet_nonlinear_activation_params={"negative_slope": 0.1}, - ): - """ - Args: - cond_channels (int): number of channel for the conditioning sequence, - conv_in_channels (int): number of channel for the input sequence, - conv_out_channels (int): number of channel for the output sequence, - conv_layers (int): number of layers - """ - super().__init__() - - self.conv_in_channels = conv_in_channels - self.conv_out_channels = conv_out_channels - self.conv_kernel_size = conv_kernel_size - self.conv_layers = conv_layers - - kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w - kpnet_bias_channels = conv_out_channels * conv_layers # l_b - - self.input_conv = nn.Sequential( - nn.utils.weight_norm(nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - ) - - self.residual_convs = nn.ModuleList() - padding = (kpnet_conv_size - 1) // 2 - for _ in range(3): - self.residual_convs.append( - nn.Sequential( - nn.Dropout(kpnet_dropout), - nn.utils.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_hidden_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - nn.utils.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_hidden_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - ) - ) - self.kernel_conv = nn.utils.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_kernel_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ) - self.bias_conv = nn.utils.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_bias_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ) - - def forward(self, c): - """ - Args: - c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) - """ - batch, _, cond_length = c.shape - c = self.input_conv(c) - for residual_conv in self.residual_convs: - residual_conv.to(c.device) - c = c + residual_conv(c) - k = self.kernel_conv(c) - b = self.bias_conv(c) - kernels = k.contiguous().view( - batch, - self.conv_layers, - self.conv_in_channels, - self.conv_out_channels, - self.conv_kernel_size, - cond_length, - ) - bias = b.contiguous().view( - batch, - self.conv_layers, - self.conv_out_channels, - cond_length, - ) - - return kernels, bias - - def remove_weight_norm(self): - nn.utils.remove_weight_norm(self.input_conv[0]) - nn.utils.remove_weight_norm(self.kernel_conv) - nn.utils.remove_weight_norm(self.bias_conv) - for block in self.residual_convs: - nn.utils.remove_weight_norm(block[1]) - 
nn.utils.remove_weight_norm(block[3]) - - -class LVCBlock(torch.nn.Module): - """the location-variable convolutions""" - - def __init__( - self, - in_channels, - cond_channels, - stride, - dilations=[1, 3, 9, 27], - lReLU_slope=0.2, - conv_kernel_size=3, - cond_hop_length=256, - kpnet_hidden_channels=64, - kpnet_conv_size=3, - kpnet_dropout=0.0, - ): - super().__init__() - - self.cond_hop_length = cond_hop_length - self.conv_layers = len(dilations) - self.conv_kernel_size = conv_kernel_size - - self.kernel_predictor = KernelPredictor( - cond_channels=cond_channels, - conv_in_channels=in_channels, - conv_out_channels=2 * in_channels, - conv_layers=len(dilations), - conv_kernel_size=conv_kernel_size, - kpnet_hidden_channels=kpnet_hidden_channels, - kpnet_conv_size=kpnet_conv_size, - kpnet_dropout=kpnet_dropout, - kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope}, - ) - - self.convt_pre = nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.weight_norm( - nn.ConvTranspose1d( - in_channels, - in_channels, - 2 * stride, - stride=stride, - padding=stride // 2 + stride % 2, - output_padding=stride % 2, - ) - ), - ) - - self.conv_blocks = nn.ModuleList() - for dilation in dilations: - self.conv_blocks.append( - nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.weight_norm( - nn.Conv1d( - in_channels, - in_channels, - conv_kernel_size, - padding=dilation * (conv_kernel_size - 1) // 2, - dilation=dilation, - ) - ), - nn.LeakyReLU(lReLU_slope), - ) - ) - - def forward(self, x, c): - """forward propagation of the location-variable convolutions. - Args: - x (Tensor): the input sequence (batch, in_channels, in_length) - c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) - - Returns: - Tensor: the output sequence (batch, in_channels, in_length) - """ - _, in_channels, _ = x.shape # (B, c_g, L') - - x = self.convt_pre(x) # (B, c_g, stride * L') - kernels, bias = self.kernel_predictor(c) - - for i, conv in enumerate(self.conv_blocks): - output = conv(x) # (B, c_g, stride * L') - - k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length) - b = bias[:, i, :, :] # (B, 2 * c_g, cond_length) - - output = self.location_variable_convolution( - output, k, b, hop_size=self.cond_hop_length - ) # (B, 2 * c_g, stride * L'): LVC - x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh( - output[:, in_channels:, :] - ) # (B, c_g, stride * L'): GAU - - return x - - def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): - """perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. - Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. - Args: - x (Tensor): the input sequence (batch, in_channels, in_length). - kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) - bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) - dilation (int): the dilation of convolution. - hop_size (int): the hop_size of the conditioning sequence. - Returns: - (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). 
- """ - batch, _, in_length = x.shape - batch, _, out_channels, kernel_size, kernel_length = kernel.shape - assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched" - - padding = dilation * int((kernel_size - 1) / 2) - x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) - x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding) - - if hop_size < dilation: - x = F.pad(x, (0, dilation), "constant", 0) - x = x.unfold( - 3, dilation, dilation - ) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) - x = x[:, :, :, :, :hop_size] - x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) - x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size) - - o = torch.einsum("bildsk,biokl->bolsd", x, kernel) - o = o.to(memory_format=torch.channels_last_3d) - bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d) - o = o + bias - o = o.contiguous().view(batch, out_channels, -1) - - return o - - def remove_weight_norm(self): - self.kernel_predictor.remove_weight_norm() - nn.utils.remove_weight_norm(self.convt_pre[1]) - for block in self.conv_blocks: - nn.utils.remove_weight_norm(block[1]) - - -class UnivNetGenerator(nn.Module): - """ - UnivNet Generator - - Originally from https://github.com/mindslab-ai/univnet/blob/master/model/generator.py. - """ - - def __init__( - self, - noise_dim=64, - channel_size=32, - dilations=[1, 3, 9, 27], - strides=[8, 8, 4], - lReLU_slope=0.2, - kpnet_conv_size=3, - # Below are MEL configurations options that this generator requires. - hop_length=256, - n_mel_channels=100, - ): - super(UnivNetGenerator, self).__init__() - self.mel_channel = n_mel_channels - self.noise_dim = noise_dim - self.hop_length = hop_length - channel_size = channel_size - kpnet_conv_size = kpnet_conv_size - - self.res_stack = nn.ModuleList() - hop_length = 1 - for stride in strides: - hop_length = stride * hop_length - self.res_stack.append( - LVCBlock( - channel_size, - n_mel_channels, - stride=stride, - dilations=dilations, - lReLU_slope=lReLU_slope, - cond_hop_length=hop_length, - kpnet_conv_size=kpnet_conv_size, - ) - ) - - self.conv_pre = nn.utils.weight_norm(nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode="reflect")) - - self.conv_post = nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.weight_norm(nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode="reflect")), - nn.Tanh(), - ) - - def forward(self, c, z): - """ - Args: - c (Tensor): the conditioning sequence of mel-spectrogram (batch, mel_channels, in_length) - z (Tensor): the noise sequence (batch, noise_dim, in_length) - - """ - z = self.conv_pre(z) # (B, c_g, L) - - for res_block in self.res_stack: - res_block.to(z.device) - z = res_block(z, c) # (B, c_g, L * s_0 * ... 
* s_i) - - z = self.conv_post(z) # (B, 1, L * 256) - - return z - - def eval(self, inference=False): - super(UnivNetGenerator, self).eval() - # don't remove weight norm while validation in training loop - if inference: - self.remove_weight_norm() - - def remove_weight_norm(self): - nn.utils.remove_weight_norm(self.conv_pre) - - for layer in self.conv_post: - if len(layer.state_dict()) != 0: - nn.utils.remove_weight_norm(layer) - - for res_block in self.res_stack: - res_block.remove_weight_norm() - - def inference(self, c, z=None): - # pad input mel with zeros to cut artifact - # see https://github.com/seungwonpark/melgan/issues/8 - zero = torch.full((c.shape[0], self.mel_channel, 10), -11.5129).to(c.device) - mel = torch.cat((c, zero), dim=2) - - if z is None: - z = torch.randn(c.shape[0], self.noise_dim, mel.size(2)).to(mel.device) - - audio = self.forward(mel, z) - audio = audio[:, :, : -(self.hop_length * 10)] - audio = audio.clamp(min=-1, max=1) - return audio - - -if __name__ == "__main__": - model = UnivNetGenerator() - - c = torch.randn(3, 100, 10) - z = torch.randn(3, 64, 10) - print(c.shape) - - y = model(c, z) - print(y.shape) - assert y.shape == torch.Size([3, 1, 2560]) - - pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - print(pytorch_total_params) diff --git a/TTS/tts/layers/xtts/zh_num2words.py b/TTS/tts/layers/xtts/zh_num2words.py index d51174746e..e59ccb6630 100644 --- a/TTS/tts/layers/xtts/zh_num2words.py +++ b/TTS/tts/layers/xtts/zh_num2words.py @@ -2,382 +2,391 @@ # 2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git) # 2019.9 - 2022 Jiayu DU -import sys, os, argparse -import string, re +import argparse import csv +import os +import re +import string +import sys + +# fmt: off # ================================================================================ # # basic constant # ================================================================================ # -CHINESE_DIGIS = u'零一二三四五六七八九' -BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖' -BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖' -SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万' -SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬' -LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载' -LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載' -SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万' -SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬' - -ZERO_ALT = u'〇' -ONE_ALT = u'幺' -TWO_ALTS = [u'两', u'兩'] - -POSITIVE = [u'正', u'正'] -NEGATIVE = [u'负', u'負'] -POINT = [u'点', u'點'] +CHINESE_DIGIS = "零一二三四五六七八九" +BIG_CHINESE_DIGIS_SIMPLIFIED = "零壹贰叁肆伍陆柒捌玖" +BIG_CHINESE_DIGIS_TRADITIONAL = "零壹貳參肆伍陸柒捌玖" +SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = "十百千万" +SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = "拾佰仟萬" +LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "亿兆京垓秭穰沟涧正载" +LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = "億兆京垓秭穰溝澗正載" +SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "十百千万" +SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = "拾佰仟萬" + +ZERO_ALT = "〇" +ONE_ALT = "幺" +TWO_ALTS = ["两", "兩"] + +POSITIVE = ["正", "正"] +NEGATIVE = ["负", "負"] +POINT = ["点", "點"] # PLUS = [u'加', u'加'] # SIL = [u'杠', u'槓'] -FILLER_CHARS = ['呃', '啊'] +FILLER_CHARS = ["呃", "啊"] -ER_WHITELIST = '(儿女|儿子|儿孙|女儿|儿媳|妻儿|' \ - '胎儿|婴儿|新生儿|婴幼儿|幼儿|少儿|小儿|儿歌|儿童|儿科|托儿所|孤儿|' \ - '儿戏|儿化|台儿庄|鹿儿岛|正儿八经|吊儿郎当|生儿育女|托儿带女|养儿防老|痴儿呆女|' \ - '佳儿佳妇|儿怜兽扰|儿无常父|儿不嫌母丑|儿行千里母担忧|儿大不由爷|苏乞儿)' +ER_WHITELIST = ( + "(儿女|儿子|儿孙|女儿|儿媳|妻儿|" + "胎儿|婴儿|新生儿|婴幼儿|幼儿|少儿|小儿|儿歌|儿童|儿科|托儿所|孤儿|" + "儿戏|儿化|台儿庄|鹿儿岛|正儿八经|吊儿郎当|生儿育女|托儿带女|养儿防老|痴儿呆女|" + 
"佳儿佳妇|儿怜兽扰|儿无常父|儿不嫌母丑|儿行千里母担忧|儿大不由爷|苏乞儿)" +) ER_WHITELIST_PATTERN = re.compile(ER_WHITELIST) # 中文数字系统类型 -NUMBERING_TYPES = ['low', 'mid', 'high'] - -CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \ - '里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)' -CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' -COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \ - '砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \ - '针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|' \ - '毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \ - '盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \ - '纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)' +NUMBERING_TYPES = ["low", "mid", "high"] + +CURRENCY_NAMES = "(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|" "里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)" +CURRENCY_UNITS = "((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)" +COM_QUANTIFIERS = ( + "(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|" + "砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|" + "针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|" + "毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|" + "盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|" + "纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)" +) # Punctuation information are based on Zhon project (https://github.com/tsroten/zhon.git) -CN_PUNCS_STOP = '!?。。' -CN_PUNCS_NONSTOP = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏·〈〉-' +CN_PUNCS_STOP = "!?。。" +CN_PUNCS_NONSTOP = ""#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏·〈〉-" CN_PUNCS = CN_PUNCS_STOP + CN_PUNCS_NONSTOP PUNCS = CN_PUNCS + string.punctuation -PUNCS_TRANSFORM = str.maketrans(PUNCS, ' ' * len(PUNCS), '') # replace puncs with space +PUNCS_TRANSFORM = str.maketrans(PUNCS, "," * len(PUNCS), "") # replace puncs with English comma # https://zh.wikipedia.org/wiki/全行和半行 QJ2BJ = { - ' ': ' ', - '!': '!', - '"': '"', - '#': '#', - '$': '$', - '%': '%', - '&': '&', - ''': "'", - '(': '(', - ')': ')', - '*': '*', - '+': '+', - ',': ',', - '-': '-', - '.': '.', - '/': '/', - '0': '0', - '1': '1', - '2': '2', - '3': '3', - '4': '4', - '5': '5', - '6': '6', - '7': '7', - '8': '8', - '9': '9', - ':': ':', - ';': ';', - '<': '<', - '=': '=', - '>': '>', - '?': '?', - '@': '@', - 'A': 'A', - 'B': 'B', - 'C': 'C', - 'D': 'D', - 'E': 'E', - 'F': 'F', - 'G': 'G', - 'H': 'H', - 'I': 'I', - 'J': 'J', - 'K': 'K', - 'L': 'L', - 'M': 'M', - 'N': 'N', - 'O': 'O', - 'P': 'P', - 'Q': 'Q', - 'R': 'R', - 'S': 'S', - 'T': 'T', - 'U': 'U', - 'V': 'V', - 'W': 'W', - 'X': 'X', - 'Y': 'Y', - 'Z': 'Z', - '[': '[', - '\': '\\', - ']': ']', - '^': '^', - '_': '_', - '`': '`', - 'a': 'a', - 'b': 'b', - 'c': 'c', - 'd': 'd', - 'e': 'e', - 'f': 'f', - 'g': 'g', - 'h': 'h', - 'i': 'i', - 'j': 'j', - 'k': 'k', - 'l': 'l', - 'm': 'm', - 'n': 'n', - 'o': 'o', - 'p': 'p', - 'q': 'q', - 'r': 'r', - 's': 's', - 't': 't', - 'u': 'u', - 'v': 'v', - 'w': 'w', - 'x': 'x', - 'y': 'y', - 'z': 'z', - '{': '{', - '|': '|', - '}': '}', - '~': '~', + " ": " ", + "!": "!", + """: '"', + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", 
+ "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "A", + "B": "B", + "C": "C", + "D": "D", + "E": "E", + "F": "F", + "G": "G", + "H": "H", + "I": "I", + "J": "J", + "K": "K", + "L": "L", + "M": "M", + "N": "N", + "O": "O", + "P": "P", + "Q": "Q", + "R": "R", + "S": "S", + "T": "T", + "U": "U", + "V": "V", + "W": "W", + "X": "X", + "Y": "Y", + "Z": "Z", + "[": "[", + "\": "\\", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "a", + "b": "b", + "c": "c", + "d": "d", + "e": "e", + "f": "f", + "g": "g", + "h": "h", + "i": "i", + "j": "j", + "k": "k", + "l": "l", + "m": "m", + "n": "n", + "o": "o", + "p": "p", + "q": "q", + "r": "r", + "s": "s", + "t": "t", + "u": "u", + "v": "v", + "w": "w", + "x": "x", + "y": "y", + "z": "z", + "{": "{", + "|": "|", + "}": "}", + "~": "~", } -QJ2BJ_TRANSFORM = str.maketrans(''.join(QJ2BJ.keys()), ''.join(QJ2BJ.values()), '') +QJ2BJ_TRANSFORM = str.maketrans("".join(QJ2BJ.keys()), "".join(QJ2BJ.values()), "") # 2013 China National Standard: https://zh.wikipedia.org/wiki/通用规范汉字表, raw resources: # https://github.com/mozillazg/pinyin-data/blob/master/kMandarin_8105.txt with 8105 chinese chars in total CN_CHARS_COMMON = ( - '一丁七万丈三上下不与丏丐丑专且丕世丘丙业丛东丝丞丢两严丧个丫中丰串临丸丹为主丽举' - '乂乃久么义之乌乍乎乏乐乒乓乔乖乘乙乜九乞也习乡书乩买乱乳乸乾了予争事二亍于亏云互' - '亓五井亘亚些亟亡亢交亥亦产亨亩享京亭亮亲亳亵亶亸亹人亿什仁仂仃仄仅仆仇仉今介仍从' - '仑仓仔仕他仗付仙仝仞仟仡代令以仨仪仫们仰仲仳仵件价任份仿企伈伉伊伋伍伎伏伐休众优' - '伙会伛伞伟传伢伣伤伥伦伧伪伫伭伯估伲伴伶伸伺似伽伾佁佃但位低住佐佑体何佖佗佘余佚' - '佛作佝佞佟你佣佤佥佩佬佯佰佳佴佶佸佺佻佼佽佾使侁侂侃侄侈侉例侍侏侑侔侗侘供依侠侣' - '侥侦侧侨侩侪侬侮侯侴侵侹便促俄俅俊俍俎俏俐俑俗俘俙俚俜保俞俟信俣俦俨俩俪俫俭修俯' - '俱俳俵俶俸俺俾倌倍倏倒倓倔倕倘候倚倜倞借倡倥倦倧倨倩倪倬倭倮倴债倻值倾偁偃假偈偌' - '偎偏偓偕做停偡健偬偭偰偲偶偷偻偾偿傀傃傅傈傉傍傒傕傣傥傧储傩催傲傺傻僇僎像僔僖僚' - '僦僧僬僭僮僰僳僵僻儆儇儋儒儡儦儳儴儿兀允元兄充兆先光克免兑兔兕兖党兜兢入全八公六' - '兮兰共关兴兵其具典兹养兼兽冀冁内冈冉册再冏冒冔冕冗写军农冠冢冤冥冬冮冯冰冱冲决况' - '冶冷冻冼冽净凄准凇凉凋凌减凑凓凘凛凝几凡凤凫凭凯凰凳凶凸凹出击凼函凿刀刁刃分切刈' - '刊刍刎刑划刖列刘则刚创初删判刨利别刬刭刮到刳制刷券刹刺刻刽刿剀剁剂剃剅削剋剌前剐' - '剑剔剕剖剜剞剟剡剥剧剩剪副割剽剿劁劂劄劈劐劓力劝办功加务劢劣动助努劫劬劭励劲劳劼' - '劾势勃勇勉勋勍勐勒勔勖勘勚募勠勤勰勺勾勿匀包匆匈匍匏匐匕化北匙匜匝匠匡匣匦匪匮匹' - '区医匼匾匿十千卅升午卉半华协卑卒卓单卖南博卜卞卟占卡卢卣卤卦卧卫卬卮卯印危即却卵' - '卷卸卺卿厂厄厅历厉压厌厍厕厖厘厚厝原厢厣厥厦厨厩厮去厾县叁参叆叇又叉及友双反发叔' - '叕取受变叙叚叛叟叠口古句另叨叩只叫召叭叮可台叱史右叵叶号司叹叻叼叽吁吃各吆合吉吊' - '同名后吏吐向吒吓吕吖吗君吝吞吟吠吡吣否吧吨吩含听吭吮启吱吲吴吵吸吹吻吼吽吾呀呃呆' - '呇呈告呋呐呒呓呔呕呖呗员呙呛呜呢呣呤呦周呱呲味呵呶呷呸呻呼命咀咂咄咆咇咉咋和咍咎' - '咏咐咒咔咕咖咙咚咛咝咡咣咤咥咦咧咨咩咪咫咬咯咱咳咴咸咺咻咽咿哀品哂哃哄哆哇哈哉哌' - '响哎哏哐哑哒哓哔哕哗哙哚哝哞哟哢哥哦哧哨哩哪哭哮哱哲哳哺哼哽哿唁唆唇唉唏唐唑唔唛' - '唝唠唢唣唤唧唪唬售唯唰唱唳唵唷唼唾唿啁啃啄商啉啊啐啕啖啜啡啤啥啦啧啪啫啬啭啮啰啴' - '啵啶啷啸啻啼啾喀喁喂喃善喆喇喈喉喊喋喏喑喔喘喙喜喝喟喤喧喱喳喵喷喹喻喽喾嗄嗅嗉嗌' - '嗍嗐嗑嗒嗓嗔嗖嗜嗝嗞嗟嗡嗣嗤嗥嗦嗨嗪嗫嗬嗯嗲嗳嗵嗷嗽嗾嘀嘁嘈嘉嘌嘎嘏嘘嘚嘛嘞嘟嘡' - '嘣嘤嘧嘬嘭嘱嘲嘴嘶嘹嘻嘿噀噂噇噌噍噎噔噗噘噙噜噢噤器噩噪噫噬噱噶噻噼嚄嚅嚆嚎嚏嚓' - '嚚嚣嚭嚯嚷嚼囊囔囚四回囟因囡团囤囫园困囱围囵囷囹固国图囿圃圄圆圈圉圊圌圐圙圜土圢' - '圣在圩圪圫圬圭圮圯地圲圳圹场圻圾址坂均坉坊坋坌坍坎坏坐坑坒块坚坛坜坝坞坟坠坡坤坥' - '坦坨坩坪坫坬坭坯坰坳坷坻坼坽垂垃垄垆垈型垌垍垎垏垒垓垕垙垚垛垞垟垠垡垢垣垤垦垧垩' - '垫垭垮垯垱垲垴垵垸垺垾垿埂埃埆埇埋埌城埏埒埔埕埗埘埙埚埝域埠埤埪埫埭埯埴埵埸培基' - '埼埽堂堃堆堇堉堋堌堍堎堐堑堕堙堞堠堡堤堧堨堪堰堲堵堼堽堾塄塅塆塌塍塑塔塘塝塞塥填' - '塬塱塾墀墁境墅墈墉墐墒墓墕墘墙墚增墟墡墣墦墨墩墼壁壅壑壕壤士壬壮声壳壶壸壹处备复' - '夏夐夔夕外夙多夜够夤夥大天太夫夬夭央夯失头夷夸夹夺夼奁奂奄奇奈奉奋奎奏契奓奔奕奖' - '套奘奚奠奡奢奥奭女奴奶奸她好妁如妃妄妆妇妈妊妍妒妓妖妗妘妙妞妣妤妥妧妨妩妪妫妭妮' - '妯妲妹妻妾姆姈姊始姐姑姒姓委姗姘姚姜姝姞姣姤姥姨姬姮姱姶姹姻姽姿娀威娃娄娅娆娇娈' - '娉娌娑娓娘娜娟娠娣娥娩娱娲娴娵娶娼婀婆婉婊婌婍婕婘婚婞婠婢婤婧婪婫婳婴婵婶婷婺婻' - '婼婿媂媄媆媒媓媖媚媛媞媪媭媱媲媳媵媸媾嫁嫂嫄嫉嫌嫒嫔嫕嫖嫘嫚嫜嫠嫡嫣嫦嫩嫪嫫嫭嫱' - '嫽嬉嬖嬗嬛嬥嬬嬴嬷嬿孀孅子孑孓孔孕孖字存孙孚孛孜孝孟孢季孤孥学孩孪孬孰孱孳孵孺孽' - '宁它宄宅宇守安宋完宏宓宕宗官宙定宛宜宝实宠审客宣室宥宦宧宪宫宬宰害宴宵家宸容宽宾' - '宿寁寂寄寅密寇富寐寒寓寝寞察寡寤寥寨寮寰寸对寺寻导寿封射将尉尊小少尔尕尖尘尚尜尝' - '尢尤尥尧尨尪尬就尴尸尹尺尻尼尽尾尿局屁层屃居屈屉届屋屎屏屐屑展屙属屠屡屣履屦屯山' - '屹屺屼屾屿岁岂岈岊岌岍岐岑岔岖岗岘岙岚岛岜岞岠岢岣岨岩岫岬岭岱岳岵岷岸岽岿峁峂峃' - '峄峋峒峗峘峙峛峡峣峤峥峦峧峨峪峭峰峱峻峿崀崁崂崃崄崆崇崌崎崒崔崖崚崛崞崟崡崤崦崧' - '崩崭崮崴崶崽崾崿嵁嵅嵇嵊嵋嵌嵎嵖嵘嵚嵛嵝嵩嵫嵬嵯嵲嵴嶂嶅嶍嶒嶓嶙嶝嶟嶦嶲嶷巅巇巉' - '巍川州巡巢工左巧巨巩巫差巯己已巳巴巷巽巾币市布帅帆师希帏帐帑帔帕帖帘帙帚帛帜帝帡' - '带帧帨席帮帱帷常帻帼帽幂幄幅幌幔幕幖幛幞幡幢幪干平年并幸幺幻幼幽广庄庆庇床庋序庐' - '庑库应底庖店庙庚府庞废庠庤庥度座庭庱庳庵庶康庸庹庼庾廆廉廊廋廑廒廓廖廙廛廨廪延廷' - '建廿开弁异弃弄弆弇弈弊弋式弑弓引弗弘弛弟张弢弥弦弧弨弩弭弯弱弶弸弹强弼彀归当录彖' - '彗彘彝彟形彤彦彧彩彪彬彭彰影彳彷役彻彼往征徂径待徇很徉徊律徐徒徕得徘徙徛徜御徨循' - '徭微徵德徼徽心必忆忉忌忍忏忐忑忒忖志忘忙忝忞忠忡忤忧忪快忭忮忱忳念忸忺忻忽忾忿怀' - '态怂怃怄怅怆怊怍怎怏怒怔怕怖怙怛怜思怠怡急怦性怨怩怪怫怯怵总怼怿恁恂恃恋恍恐恒恓' - '恔恕恙恚恝恢恣恤恧恨恩恪恫恬恭息恰恳恶恸恹恺恻恼恽恿悃悄悆悈悉悌悍悒悔悖悚悛悝悟' - 
'悠悢患悦您悫悬悭悯悰悱悲悴悸悻悼情惆惇惊惋惎惑惔惕惘惙惚惛惜惝惟惠惦惧惨惩惫惬惭' - '惮惯惰想惴惶惹惺愀愁愃愆愈愉愍愎意愐愔愕愚感愠愣愤愦愧愫愭愿慆慈慊慌慎慑慕慝慢慥' - '慧慨慬慭慰慵慷憋憎憔憕憙憧憨憩憬憭憷憺憾懂懈懊懋懑懒懔懦懵懿戆戈戊戋戌戍戎戏成我' - '戒戕或戗战戚戛戟戡戢戣戤戥截戬戭戮戳戴户戽戾房所扁扂扃扅扆扇扈扉扊手才扎扑扒打扔' - '托扛扞扣扦执扩扪扫扬扭扮扯扰扳扶批扺扼扽找承技抃抄抉把抑抒抓抔投抖抗折抚抛抟抠抡' - '抢护报抨披抬抱抵抹抻押抽抿拂拃拄担拆拇拈拉拊拌拍拎拐拒拓拔拖拗拘拙招拜拟拢拣拤拥' - '拦拧拨择括拭拮拯拱拳拴拶拷拼拽拾拿持挂指挈按挎挑挓挖挚挛挝挞挟挠挡挣挤挥挦挨挪挫' - '振挲挹挺挽捂捃捅捆捉捋捌捍捎捏捐捕捞损捡换捣捧捩捭据捯捶捷捺捻捽掀掂掇授掉掊掌掎' - '掏掐排掖掘掞掠探掣接控推掩措掬掭掮掰掳掴掷掸掺掼掾揄揆揉揍描提插揕揖揠握揣揩揪揭' - '揳援揶揸揽揿搀搁搂搅搋搌搏搐搒搓搔搛搜搞搠搡搦搪搬搭搴携搽摁摄摅摆摇摈摊摏摒摔摘' - '摛摞摧摩摭摴摸摹摽撂撄撅撇撑撒撕撖撙撞撤撩撬播撮撰撵撷撸撺撼擀擂擅操擎擐擒擘擞擢' - '擤擦擿攀攉攒攘攥攫攮支收攸改攻攽放政故效敉敌敏救敔敕敖教敛敝敞敢散敦敩敫敬数敲整' - '敷文斋斌斐斑斓斗料斛斜斝斟斠斡斤斥斧斩斫断斯新斶方於施旁旃旄旅旆旋旌旎族旐旒旖旗' - '旞无既日旦旧旨早旬旭旮旯旰旱旴旵时旷旸旺旻旿昀昂昃昄昆昇昈昉昊昌明昏昒易昔昕昙昝' - '星映昡昣昤春昧昨昪昫昭是昱昳昴昵昶昺昼昽显晁晃晅晊晋晌晏晐晒晓晔晕晖晗晙晚晞晟晡' - '晢晤晦晨晪晫普景晰晱晴晶晷智晾暂暄暅暇暌暑暕暖暗暝暧暨暮暲暴暵暶暹暾暿曈曌曙曛曜' - '曝曦曩曰曲曳更曷曹曼曾替最月有朋服朏朐朓朔朕朗望朝期朦木未末本札术朱朳朴朵朸机朽' - '杀杂权杄杆杈杉杌李杏材村杓杕杖杙杜杞束杠条来杧杨杩杪杭杯杰杲杳杵杷杻杼松板极构枅' - '枇枉枋枍析枕林枘枚果枝枞枢枣枥枧枨枪枫枭枯枰枲枳枵架枷枸枹柁柃柄柈柊柏某柑柒染柔' - '柖柘柙柚柜柝柞柠柢查柩柬柯柰柱柳柴柷柽柿栀栅标栈栉栊栋栌栎栏栐树栒栓栖栗栝栟校栩' - '株栲栳栴样核根栻格栽栾桀桁桂桃桄桅框案桉桊桌桎桐桑桓桔桕桠桡桢档桤桥桦桧桨桩桫桯' - '桲桴桶桷桹梁梃梅梆梌梏梓梗梠梢梣梦梧梨梭梯械梳梴梵梼梽梾梿检棁棂棉棋棍棐棒棓棕棘' - '棚棠棣棤棨棪棫棬森棰棱棵棹棺棻棼棽椀椁椅椆椋植椎椐椑椒椓椟椠椤椪椭椰椴椸椹椽椿楂' - '楒楔楗楙楚楝楞楠楣楦楩楪楫楮楯楷楸楹楼概榃榄榅榆榇榈榉榍榑榔榕榖榛榜榧榨榫榭榰榱' - '榴榷榻槁槃槊槌槎槐槔槚槛槜槟槠槭槱槲槽槿樊樗樘樟模樨横樯樱樵樽樾橄橇橐橑橘橙橛橞' - '橡橥橦橱橹橼檀檄檎檐檑檗檞檠檩檫檬櫆欂欠次欢欣欤欧欲欸欹欺欻款歃歅歆歇歉歌歙止正' - '此步武歧歪歹死歼殁殂殃殄殆殇殉殊残殍殒殓殖殚殛殡殣殪殳殴段殷殿毁毂毅毋毌母每毐毒' - '毓比毕毖毗毙毛毡毪毫毯毳毵毹毽氅氆氇氍氏氐民氓气氕氖氘氙氚氛氟氡氢氤氦氧氨氩氪氮' - '氯氰氲水永氾氿汀汁求汆汇汈汉汊汋汐汔汕汗汛汜汝汞江池污汤汧汨汩汪汫汭汰汲汴汶汹汽' - '汾沁沂沃沄沅沆沇沈沉沌沏沐沓沔沘沙沚沛沟没沣沤沥沦沧沨沩沪沫沭沮沱河沸油沺治沼沽' - '沾沿泂泃泄泅泇泉泊泌泐泓泔法泖泗泙泚泛泜泞泠泡波泣泥注泪泫泮泯泰泱泳泵泷泸泺泻泼' - '泽泾洁洄洇洈洋洌洎洑洒洓洗洘洙洚洛洞洢洣津洧洨洪洫洭洮洱洲洳洴洵洸洹洺活洼洽派洿' - '流浃浅浆浇浈浉浊测浍济浏浐浑浒浓浔浕浙浚浛浜浞浟浠浡浣浥浦浩浪浬浭浮浯浰浲浴海浸' - '浼涂涄涅消涉涌涍涎涐涑涓涔涕涘涛涝涞涟涠涡涢涣涤润涧涨涩涪涫涮涯液涴涵涸涿淀淄淅' - '淆淇淋淌淏淑淖淘淙淜淝淞淟淠淡淤淦淫淬淮淯深淳淴混淹添淼清渊渌渍渎渐渑渔渗渚渝渟' - '渠渡渣渤渥温渫渭港渰渲渴游渺渼湃湄湉湍湎湑湓湔湖湘湛湜湝湟湣湫湮湲湴湾湿溁溃溅溆' - '溇溉溍溏源溘溚溜溞溟溠溢溥溦溧溪溯溱溲溴溵溶溷溹溺溻溽滁滂滃滆滇滉滋滍滏滑滓滔滕' - '滗滘滚滞滟滠满滢滤滥滦滧滨滩滪滫滴滹漂漆漈漉漋漏漓演漕漖漠漤漦漩漪漫漭漯漱漳漴漶' - '漷漹漻漼漾潆潇潋潍潏潖潘潜潞潟潢潦潩潭潮潲潴潵潸潺潼潽潾澂澄澈澉澌澍澎澛澜澡澥澧' - '澪澭澳澴澶澹澼澽激濂濉濋濑濒濞濠濡濩濮濯瀌瀍瀑瀔瀚瀛瀣瀱瀵瀹瀼灈灌灏灞火灭灯灰灵' - '灶灸灼灾灿炀炅炆炉炊炌炎炒炔炕炖炘炙炜炝炟炣炫炬炭炮炯炱炳炷炸点炻炼炽烀烁烂烃烈' - '烊烔烘烙烛烜烝烟烠烤烦烧烨烩烫烬热烯烶烷烹烺烻烽焆焉焊焌焐焓焕焖焗焘焙焚焜焞焦焯' - '焰焱然煁煃煅煊煋煌煎煓煜煞煟煤煦照煨煮煲煳煴煸煺煽熄熇熊熏熔熘熙熛熜熟熠熥熨熬熵' - '熹熻燃燊燋燎燏燔燕燚燠燥燧燮燹爆爇爔爚爝爟爨爪爬爰爱爵父爷爸爹爻爽爿牁牂片版牌牍' - '牒牖牙牚牛牝牟牡牢牤牥牦牧物牮牯牲牵特牺牻牾牿犀犁犄犇犊犋犍犏犒犟犨犬犯犰犴状犷' - '犸犹狁狂狃狄狈狉狍狎狐狒狗狙狝狞狠狡狨狩独狭狮狯狰狱狲狳狴狷狸狺狻狼猁猃猄猇猊猎' - '猕猖猗猛猜猝猞猡猢猥猩猪猫猬献猯猰猱猴猷猹猺猾猿獍獐獒獗獠獬獭獯獴獾玃玄率玉王玎' - '玑玒玓玕玖玘玙玚玛玞玟玠玡玢玤玥玦玩玫玭玮环现玱玲玳玶玷玹玺玻玼玿珀珂珅珇珈珉珊' - '珋珌珍珏珐珑珒珕珖珙珛珝珞珠珢珣珥珦珧珩珪珫班珰珲珵珷珸珹珺珽琀球琄琅理琇琈琉琊' - '琎琏琐琔琚琛琟琡琢琤琥琦琨琪琫琬琭琮琯琰琲琳琴琵琶琼瑀瑁瑂瑃瑄瑅瑆瑑瑓瑔瑕瑖瑗瑙' - '瑚瑛瑜瑝瑞瑟瑢瑧瑨瑬瑭瑰瑱瑳瑶瑷瑾璀璁璃璆璇璈璋璎璐璒璘璜璞璟璠璥璧璨璩璪璬璮璱' - '璲璺瓀瓒瓖瓘瓜瓞瓠瓢瓣瓤瓦瓮瓯瓴瓶瓷瓻瓿甄甍甏甑甓甗甘甚甜生甡甥甦用甩甪甫甬甭甯' - '田由甲申电男甸町画甾畀畅畈畋界畎畏畔畖留畚畛畜畤略畦番畬畯畲畴畸畹畿疁疃疆疍疏疐' - '疑疔疖疗疙疚疝疟疠疡疢疣疤疥疫疬疭疮疯疰疱疲疳疴疵疸疹疼疽疾痂痃痄病症痈痉痊痍痒' - '痓痔痕痘痛痞痢痣痤痦痧痨痪痫痰痱痴痹痼痿瘀瘁瘃瘅瘆瘊瘌瘐瘕瘗瘘瘙瘛瘟瘠瘢瘤瘥瘦瘩' - '瘪瘫瘭瘰瘳瘴瘵瘸瘼瘾瘿癀癃癌癍癔癖癗癜癞癣癫癯癸登白百癿皂的皆皇皈皋皎皑皓皕皖皙' - '皛皞皤皦皭皮皱皲皴皿盂盅盆盈盉益盍盎盏盐监盒盔盖盗盘盛盟盥盦目盯盱盲直盷相盹盼盾' - '省眄眇眈眉眊看眍眙眚真眠眢眦眨眩眬眭眯眵眶眷眸眺眼着睁睃睄睇睎睐睑睚睛睡睢督睥睦' - '睨睫睬睹睽睾睿瞀瞄瞅瞋瞌瞍瞎瞑瞒瞟瞠瞢瞥瞧瞩瞪瞫瞬瞭瞰瞳瞵瞻瞽瞿矍矗矛矜矞矢矣知' - '矧矩矫矬短矮矰石矶矸矻矼矾矿砀码砂砄砆砉砌砍砑砒研砖砗砘砚砜砝砟砠砣砥砧砫砬砭砮' - '砰破砵砷砸砹砺砻砼砾础硁硅硇硊硌硍硎硐硒硔硕硖硗硙硚硝硪硫硬硭确硼硿碃碇碈碉碌碍' - '碎碏碑碓碗碘碚碛碜碟碡碣碥碧碨碰碱碲碳碴碶碹碾磁磅磉磊磋磏磐磔磕磙磜磡磨磬磲磴磷' - '磹磻礁礅礌礓礞礴礵示礼社祀祁祃祆祇祈祉祊祋祎祏祐祓祕祖祗祚祛祜祝神祟祠祢祥祧票祭' - '祯祲祷祸祺祼祾禀禁禄禅禊禋福禒禔禘禚禛禤禧禳禹禺离禽禾秀私秃秆秉秋种科秒秕秘租秣' - '秤秦秧秩秫秬秭积称秸移秽秾稀稂稃稆程稌稍税稑稔稗稙稚稞稠稣稳稷稹稻稼稽稿穄穆穑穗' - '穙穜穟穰穴究穷穸穹空穿窀突窃窄窅窈窊窍窎窑窒窕窖窗窘窜窝窟窠窣窥窦窨窬窭窳窸窿立' - '竑竖竘站竞竟章竣童竦竫竭端竹竺竽竿笃笄笆笈笊笋笏笑笔笕笙笛笞笠笤笥符笨笪笫第笮笯' - '笱笳笸笺笼笾筀筅筇等筋筌筏筐筑筒答策筘筚筛筜筝筠筢筤筥筦筮筱筲筵筶筷筹筻筼签简箅' - '箍箐箓箔箕箖算箜管箢箦箧箨箩箪箫箬箭箱箴箸篁篆篇篌篑篓篙篚篝篡篥篦篪篮篯篱篷篼篾' - '簃簇簉簋簌簏簕簖簝簟簠簧簪簰簸簿籀籁籍籥米籴类籼籽粉粑粒粕粗粘粜粝粞粟粢粤粥粪粮' - '粱粲粳粹粼粽精粿糁糅糇糈糊糌糍糒糕糖糗糙糜糟糠糨糯糵系紊素索紧紫累絜絮絷綦綮縠縢' - '縻繁繄繇纂纛纠纡红纣纤纥约级纨纩纪纫纬纭纮纯纰纱纲纳纴纵纶纷纸纹纺纻纼纽纾线绀绁' - '绂练组绅细织终绉绊绋绌绍绎经绐绑绒结绔绕绖绗绘给绚绛络绝绞统绠绡绢绣绤绥绦继绨绩' - '绪绫续绮绯绰绱绲绳维绵绶绷绸绹绺绻综绽绾绿缀缁缂缃缄缅缆缇缈缉缊缌缎缐缑缒缓缔缕' - '编缗缘缙缚缛缜缝缞缟缠缡缢缣缤缥缦缧缨缩缪缫缬缭缮缯缰缱缲缳缴缵缶缸缺罂罄罅罍罐' - '网罔罕罗罘罚罟罡罢罨罩罪置罱署罴罶罹罽罾羁羊羌美羑羓羔羕羖羚羝羞羟羡群羧羯羰羱羲' - '羸羹羼羽羿翀翁翂翃翅翈翊翌翎翔翕翘翙翚翛翟翠翡翥翦翩翮翯翰翱翳翷翻翼翾耀老考耄者' - '耆耇耋而耍耏耐耑耒耔耕耖耗耘耙耜耠耢耤耥耦耧耨耩耪耰耱耳耵耶耷耸耻耽耿聂聃聆聊聋' - '职聍聒联聘聚聩聪聱聿肃肄肆肇肉肋肌肓肖肘肚肛肝肟肠股肢肤肥肩肪肫肭肮肯肱育肴肷肸' - 
'肺肼肽肾肿胀胁胂胃胄胆胈背胍胎胖胗胙胚胛胜胝胞胠胡胣胤胥胧胨胩胪胫胬胭胯胰胱胲胳' - '胴胶胸胺胼能脂脆脉脊脍脎脏脐脑脒脓脔脖脘脚脞脟脩脬脯脱脲脶脸脾脿腆腈腊腋腌腐腑腒' - '腓腔腕腘腙腚腠腥腧腨腩腭腮腯腰腱腴腹腺腻腼腽腾腿膀膂膈膊膏膑膘膙膛膜膝膦膨膳膺膻' - '臀臂臃臆臊臌臑臜臣臧自臬臭至致臻臼臾舀舁舂舄舅舆舌舍舐舒舔舛舜舞舟舠舢舣舥航舫般' - '舭舯舰舱舲舳舴舵舶舷舸船舻舾艄艅艇艉艋艎艏艘艚艟艨艮良艰色艳艴艺艽艾艿节芃芄芈芊' - '芋芍芎芏芑芒芗芘芙芜芝芟芠芡芣芤芥芦芨芩芪芫芬芭芮芯芰花芳芴芷芸芹芼芽芾苁苄苇苈' - '苉苊苋苌苍苎苏苑苒苓苔苕苗苘苛苜苞苟苠苡苣苤若苦苧苫苯英苴苷苹苻苾茀茁茂范茄茅茆' - '茈茉茋茌茎茏茑茓茔茕茗茚茛茜茝茧茨茫茬茭茯茱茳茴茵茶茸茹茺茼茽荀荁荃荄荆荇草荏荐' - '荑荒荓荔荖荙荚荛荜荞荟荠荡荣荤荥荦荧荨荩荪荫荬荭荮药荷荸荻荼荽莅莆莉莎莒莓莘莙莛' - '莜莝莞莠莨莩莪莫莰莱莲莳莴莶获莸莹莺莼莽莿菀菁菂菅菇菉菊菌菍菏菔菖菘菜菝菟菠菡菥' - '菩菪菰菱菲菹菼菽萁萃萄萆萋萌萍萎萏萑萘萚萜萝萣萤营萦萧萨萩萱萳萸萹萼落葆葎葑葖著' - '葙葚葛葜葡董葩葫葬葭葰葱葳葴葵葶葸葺蒂蒄蒇蒈蒉蒋蒌蒎蒐蒗蒙蒜蒟蒡蒨蒯蒱蒲蒴蒸蒹蒺' - '蒻蒽蒿蓁蓂蓄蓇蓉蓊蓍蓏蓐蓑蓓蓖蓝蓟蓠蓢蓣蓥蓦蓬蓰蓼蓿蔀蔃蔈蔊蔌蔑蔓蔗蔚蔟蔡蔫蔬蔷' - '蔸蔹蔺蔻蔼蔽蕃蕈蕉蕊蕖蕗蕙蕞蕤蕨蕰蕲蕴蕹蕺蕻蕾薁薄薅薇薏薛薜薢薤薨薪薮薯薰薳薷薸' - '薹薿藁藉藏藐藓藕藜藟藠藤藦藨藩藻藿蘅蘑蘖蘘蘧蘩蘸蘼虎虏虐虑虒虓虔虚虞虢虤虫虬虮虱' - '虷虸虹虺虻虼虽虾虿蚀蚁蚂蚄蚆蚊蚋蚌蚍蚓蚕蚜蚝蚣蚤蚧蚨蚩蚪蚬蚯蚰蚱蚲蚴蚶蚺蛀蛃蛄蛆' - '蛇蛉蛊蛋蛎蛏蛐蛑蛔蛘蛙蛛蛞蛟蛤蛩蛭蛮蛰蛱蛲蛳蛴蛸蛹蛾蜀蜂蜃蜇蜈蜉蜊蜍蜎蜐蜒蜓蜕蜗' - '蜘蜚蜜蜞蜡蜢蜣蜥蜩蜮蜱蜴蜷蜻蜾蜿蝇蝈蝉蝌蝎蝓蝗蝘蝙蝠蝣蝤蝥蝮蝰蝲蝴蝶蝻蝼蝽蝾螂螃' - '螅螈螋融螗螟螠螣螨螫螬螭螯螱螳螵螺螽蟀蟆蟊蟋蟏蟑蟒蟛蟠蟥蟪蟫蟮蟹蟾蠃蠊蠋蠓蠕蠖蠡' - '蠢蠲蠹蠼血衃衄衅行衍衎衒衔街衙衠衡衢衣补表衩衫衬衮衰衲衷衽衾衿袁袂袄袅袆袈袋袍袒' - '袖袗袜袢袤袪被袭袯袱袷袼裁裂装裆裈裉裎裒裔裕裘裙裛裟裢裣裤裥裨裰裱裳裴裸裹裼裾褂' - '褊褐褒褓褕褙褚褛褟褡褥褪褫褯褰褴褶襁襄襕襚襜襞襟襦襫襻西要覃覆见观觃规觅视觇览觉' - '觊觋觌觎觏觐觑角觖觚觜觞觟解觥触觫觭觯觱觳觿言訄訇訚訾詈詟詹誉誊誓謇警譬计订讣认' - '讥讦讧讨让讪讫训议讯记讱讲讳讴讵讶讷许讹论讻讼讽设访诀证诂诃评诅识诇诈诉诊诋诌词' - '诎诏诐译诒诓诔试诖诗诘诙诚诛诜话诞诟诠诡询诣诤该详诧诨诩诫诬语诮误诰诱诲诳说诵请' - '诸诹诺读诼诽课诿谀谁谂调谄谅谆谇谈谊谋谌谍谎谏谐谑谒谓谔谕谖谗谙谚谛谜谝谞谟谠谡' - '谢谣谤谥谦谧谨谩谪谫谬谭谮谯谰谱谲谳谴谵谶谷谼谿豁豆豇豉豌豕豚象豢豨豪豫豮豳豸豹' - '豺貂貅貆貉貊貌貔貘贝贞负贡财责贤败账货质贩贪贫贬购贮贯贰贱贲贳贴贵贶贷贸费贺贻贼' - '贽贾贿赀赁赂赃资赅赆赇赈赉赊赋赌赍赎赏赐赑赒赓赔赕赖赗赘赙赚赛赜赝赞赟赠赡赢赣赤' - '赦赧赪赫赭走赳赴赵赶起趁趄超越趋趑趔趟趣趯趱足趴趵趸趺趼趾趿跂跃跄跆跋跌跎跏跐跑' - '跖跗跚跛距跞跟跣跤跨跪跬路跱跳践跶跷跸跹跺跻跽踅踉踊踌踏踒踔踝踞踟踢踣踦踩踪踬踮' - '踯踱踵踶踹踺踽蹀蹁蹂蹄蹅蹇蹈蹉蹊蹋蹐蹑蹒蹙蹚蹜蹢蹦蹩蹬蹭蹯蹰蹲蹴蹶蹼蹽蹾蹿躁躅躇' - '躏躐躔躜躞身躬躯躲躺车轧轨轩轪轫转轭轮软轰轱轲轳轴轵轶轷轸轹轺轻轼载轾轿辀辁辂较' - '辄辅辆辇辈辉辊辋辌辍辎辏辐辑辒输辔辕辖辗辘辙辚辛辜辞辟辣辨辩辫辰辱边辽达辿迁迂迄' - '迅过迈迎运近迓返迕还这进远违连迟迢迤迥迦迨迩迪迫迭迮述迳迷迸迹迺追退送适逃逄逅逆' - '选逊逋逍透逐逑递途逖逗通逛逝逞速造逡逢逦逭逮逯逴逵逶逸逻逼逾遁遂遄遆遇遍遏遐遑遒' - '道遗遘遛遢遣遥遨遭遮遴遵遹遽避邀邂邃邈邋邑邓邕邗邘邙邛邝邠邡邢那邦邨邪邬邮邯邰邱' - '邲邳邴邵邶邸邹邺邻邽邾邿郁郃郄郅郇郈郊郎郏郐郑郓郗郚郛郜郝郡郢郤郦郧部郪郫郭郯郴' - '郸都郾郿鄀鄂鄃鄄鄅鄌鄑鄗鄘鄙鄚鄜鄞鄠鄢鄣鄫鄯鄱鄹酂酃酅酆酉酊酋酌配酎酏酐酒酗酚酝' - '酞酡酢酣酤酥酦酩酪酬酮酯酰酱酲酴酵酶酷酸酹酺酽酾酿醅醇醉醋醌醍醐醑醒醚醛醢醨醪醭' - '醮醯醴醵醺醾采釉释里重野量釐金釜鉴銎銮鋆鋈錾鍪鎏鏊鏖鐾鑫钆钇针钉钊钋钌钍钎钏钐钒' - '钓钔钕钖钗钘钙钚钛钜钝钞钟钠钡钢钣钤钥钦钧钨钩钪钫钬钭钮钯钰钱钲钳钴钵钷钹钺钻钼' - '钽钾钿铀铁铂铃铄铅铆铈铉铊铋铌铍铎铏铐铑铒铕铖铗铘铙铚铛铜铝铞铟铠铡铢铣铤铥铧铨' - '铩铪铫铬铭铮铯铰铱铲铳铴铵银铷铸铹铺铻铼铽链铿销锁锂锃锄锅锆锇锈锉锊锋锌锍锎锏锐' - '锑锒锓锔锕锖锗锘错锚锛锜锝锞锟锡锢锣锤锥锦锧锨锩锪锫锬锭键锯锰锱锲锳锴锵锶锷锸锹' - '锺锻锼锽锾锿镀镁镂镃镄镅镆镇镈镉镊镋镌镍镎镏镐镑镒镓镔镕镖镗镘镚镛镜镝镞镠镡镢镣' - '镤镥镦镧镨镩镪镫镬镭镮镯镰镱镲镳镴镵镶长门闩闪闫闭问闯闰闱闲闳间闵闶闷闸闹闺闻闼' - '闽闾闿阀阁阂阃阄阅阆阇阈阉阊阋阌阍阎阏阐阑阒阔阕阖阗阘阙阚阜队阡阪阮阱防阳阴阵阶' - '阻阼阽阿陀陂附际陆陇陈陉陋陌降陎限陑陔陕陛陞陟陡院除陧陨险陪陬陲陴陵陶陷隃隅隆隈' - '隋隍随隐隔隗隘隙障隧隩隰隳隶隹隺隼隽难雀雁雄雅集雇雉雊雌雍雎雏雒雕雠雨雩雪雯雱雳' - '零雷雹雾需霁霄霅霆震霈霉霍霎霏霓霖霜霞霨霪霭霰露霸霹霾青靓靖静靛非靠靡面靥革靬靰' - '靳靴靶靸靺靼靽靿鞁鞅鞋鞍鞑鞒鞔鞘鞠鞡鞣鞧鞨鞫鞬鞭鞮鞯鞲鞳鞴韂韦韧韨韩韪韫韬韭音韵' - '韶页顶顷顸项顺须顼顽顾顿颀颁颂颃预颅领颇颈颉颊颋颌颍颎颏颐频颓颔颖颗题颙颚颛颜额' - '颞颟颠颡颢颤颥颦颧风飏飐飑飒飓飔飕飗飘飙飞食飧飨餍餐餮饔饕饥饧饨饩饪饫饬饭饮饯饰' - '饱饲饳饴饵饶饷饸饹饺饻饼饽饿馁馃馄馅馆馇馈馉馊馋馌馍馏馐馑馒馓馔馕首馗馘香馝馞馥' - '馧馨马驭驮驯驰驱驲驳驴驵驶驷驸驹驺驻驼驽驾驿骀骁骂骃骄骅骆骇骈骉骊骋验骍骎骏骐骑' - '骒骓骕骖骗骘骙骚骛骜骝骞骟骠骡骢骣骤骥骦骧骨骰骱骶骷骸骺骼髀髁髂髃髅髋髌髎髑髓高' - '髡髢髦髫髭髯髹髻髽鬃鬈鬏鬒鬓鬘鬟鬣鬯鬲鬶鬷鬻鬼魁魂魃魄魅魆魇魈魉魋魍魏魑魔鱼鱽鱾' - '鱿鲀鲁鲂鲃鲅鲆鲇鲈鲉鲊鲋鲌鲍鲎鲏鲐鲑鲒鲔鲕鲖鲗鲘鲙鲚鲛鲜鲝鲞鲟鲠鲡鲢鲣鲤鲥鲦鲧鲨' - '鲩鲪鲫鲬鲭鲮鲯鲰鲱鲲鲳鲴鲵鲷鲸鲹鲺鲻鲼鲽鲾鲿鳀鳁鳂鳃鳄鳅鳇鳈鳉鳊鳌鳍鳎鳏鳐鳑鳒鳓' - '鳔鳕鳖鳗鳘鳙鳚鳛鳜鳝鳞鳟鳠鳡鳢鳣鳤鸟鸠鸡鸢鸣鸤鸥鸦鸧鸨鸩鸪鸫鸬鸭鸮鸯鸰鸱鸲鸳鸵鸶' - '鸷鸸鸹鸺鸻鸼鸽鸾鸿鹀鹁鹂鹃鹄鹅鹆鹇鹈鹉鹊鹋鹌鹍鹎鹏鹐鹑鹒鹔鹕鹖鹗鹘鹙鹚鹛鹜鹝鹞鹟' - '鹠鹡鹢鹣鹤鹦鹧鹨鹩鹪鹫鹬鹭鹮鹯鹰鹱鹲鹳鹴鹾鹿麀麂麇麈麋麑麒麓麖麝麟麦麸麹麻麽麾黄' - '黇黉黍黎黏黑黔默黛黜黝黟黠黡黢黥黧黩黪黯黹黻黼黾鼋鼍鼎鼐鼒鼓鼗鼙鼠鼢鼩鼫鼬鼯鼱鼷' - '鼹鼻鼽鼾齁齇齉齐齑齿龀龁龂龃龄龅龆龇龈龉龊龋龌龙龚龛龟龠龢鿍鿎鿏㑇㑊㕮㘎㙍㙘㙦㛃' - '㛚㛹㟃㠇㠓㤘㥄㧐㧑㧟㫰㬊㬎㬚㭎㭕㮾㰀㳇㳘㳚㴔㵐㶲㸆㸌㺄㻬㽏㿠䁖䂮䃅䃎䅟䌹䎃䎖䏝䏡' - '䏲䐃䓖䓛䓨䓫䓬䗖䗛䗪䗴䜣䝙䢺䢼䣘䥽䦃䲟䲠䲢䴓䴔䴕䴖䴗䴘䴙䶮𠅤𠙶𠳐𡎚𡐓𣗋𣲗𣲘𣸣𤧛𤩽' - '𤫉𥔲𥕢𥖨𥻗𦈡𦒍𦙶𦝼𦭜𦰡𧿹𨐈𨙸𨚕𨟠𨭉𨱇𨱏𨱑𨱔𨺙𩽾𩾃𩾌𪟝𪣻𪤗𪨰𪨶𪩘𪾢𫄧𫄨𫄷𫄸𫇭𫌀𫍣𫍯' - '𫍲𫍽𫐄𫐐𫐓𫑡𫓧𫓯𫓶𫓹𫔍𫔎𫔶𫖮𫖯𫖳𫗧𫗴𫘜𫘝𫘦𫘧𫘨𫘪𫘬𫚕𫚖𫚭𫛭𫞩𫟅𫟦𫟹𫟼𫠆𫠊𫠜𫢸𫫇𫭟' - '𫭢𫭼𫮃𫰛𫵷𫶇𫷷𫸩𬀩𬀪𬂩𬃊𬇕𬇙𬇹𬉼𬊈𬊤𬌗𬍛𬍡𬍤𬒈𬒔𬒗𬕂𬘓𬘘𬘡𬘩𬘫𬘬𬘭𬘯𬙂𬙊𬙋𬜬𬜯𬞟' - '𬟁𬟽𬣙𬣞𬣡𬣳𬤇𬤊𬤝𬨂𬨎𬩽𬪩𬬩𬬭𬬮𬬱𬬸𬬹𬬻𬬿𬭁𬭊𬭎𬭚𬭛𬭤𬭩𬭬𬭯𬭳𬭶𬭸𬭼𬮱𬮿𬯀𬯎𬱖𬱟' - '𬳵𬳶𬳽𬳿𬴂𬴃𬴊𬶋𬶍𬶏𬶐𬶟𬶠𬶨𬶭𬶮𬷕𬸘𬸚𬸣𬸦𬸪𬹼𬺈𬺓' + "一丁七万丈三上下不与丏丐丑专且丕世丘丙业丛东丝丞丢两严丧个丫中丰串临丸丹为主丽举" + "乂乃久么义之乌乍乎乏乐乒乓乔乖乘乙乜九乞也习乡书乩买乱乳乸乾了予争事二亍于亏云互" + "亓五井亘亚些亟亡亢交亥亦产亨亩享京亭亮亲亳亵亶亸亹人亿什仁仂仃仄仅仆仇仉今介仍从" + "仑仓仔仕他仗付仙仝仞仟仡代令以仨仪仫们仰仲仳仵件价任份仿企伈伉伊伋伍伎伏伐休众优" + "伙会伛伞伟传伢伣伤伥伦伧伪伫伭伯估伲伴伶伸伺似伽伾佁佃但位低住佐佑体何佖佗佘余佚" + "佛作佝佞佟你佣佤佥佩佬佯佰佳佴佶佸佺佻佼佽佾使侁侂侃侄侈侉例侍侏侑侔侗侘供依侠侣" + "侥侦侧侨侩侪侬侮侯侴侵侹便促俄俅俊俍俎俏俐俑俗俘俙俚俜保俞俟信俣俦俨俩俪俫俭修俯" + 
"俱俳俵俶俸俺俾倌倍倏倒倓倔倕倘候倚倜倞借倡倥倦倧倨倩倪倬倭倮倴债倻值倾偁偃假偈偌" + "偎偏偓偕做停偡健偬偭偰偲偶偷偻偾偿傀傃傅傈傉傍傒傕傣傥傧储傩催傲傺傻僇僎像僔僖僚" + "僦僧僬僭僮僰僳僵僻儆儇儋儒儡儦儳儴儿兀允元兄充兆先光克免兑兔兕兖党兜兢入全八公六" + "兮兰共关兴兵其具典兹养兼兽冀冁内冈冉册再冏冒冔冕冗写军农冠冢冤冥冬冮冯冰冱冲决况" + "冶冷冻冼冽净凄准凇凉凋凌减凑凓凘凛凝几凡凤凫凭凯凰凳凶凸凹出击凼函凿刀刁刃分切刈" + "刊刍刎刑划刖列刘则刚创初删判刨利别刬刭刮到刳制刷券刹刺刻刽刿剀剁剂剃剅削剋剌前剐" + "剑剔剕剖剜剞剟剡剥剧剩剪副割剽剿劁劂劄劈劐劓力劝办功加务劢劣动助努劫劬劭励劲劳劼" + "劾势勃勇勉勋勍勐勒勔勖勘勚募勠勤勰勺勾勿匀包匆匈匍匏匐匕化北匙匜匝匠匡匣匦匪匮匹" + "区医匼匾匿十千卅升午卉半华协卑卒卓单卖南博卜卞卟占卡卢卣卤卦卧卫卬卮卯印危即却卵" + "卷卸卺卿厂厄厅历厉压厌厍厕厖厘厚厝原厢厣厥厦厨厩厮去厾县叁参叆叇又叉及友双反发叔" + "叕取受变叙叚叛叟叠口古句另叨叩只叫召叭叮可台叱史右叵叶号司叹叻叼叽吁吃各吆合吉吊" + "同名后吏吐向吒吓吕吖吗君吝吞吟吠吡吣否吧吨吩含听吭吮启吱吲吴吵吸吹吻吼吽吾呀呃呆" + "呇呈告呋呐呒呓呔呕呖呗员呙呛呜呢呣呤呦周呱呲味呵呶呷呸呻呼命咀咂咄咆咇咉咋和咍咎" + "咏咐咒咔咕咖咙咚咛咝咡咣咤咥咦咧咨咩咪咫咬咯咱咳咴咸咺咻咽咿哀品哂哃哄哆哇哈哉哌" + "响哎哏哐哑哒哓哔哕哗哙哚哝哞哟哢哥哦哧哨哩哪哭哮哱哲哳哺哼哽哿唁唆唇唉唏唐唑唔唛" + "唝唠唢唣唤唧唪唬售唯唰唱唳唵唷唼唾唿啁啃啄商啉啊啐啕啖啜啡啤啥啦啧啪啫啬啭啮啰啴" + "啵啶啷啸啻啼啾喀喁喂喃善喆喇喈喉喊喋喏喑喔喘喙喜喝喟喤喧喱喳喵喷喹喻喽喾嗄嗅嗉嗌" + "嗍嗐嗑嗒嗓嗔嗖嗜嗝嗞嗟嗡嗣嗤嗥嗦嗨嗪嗫嗬嗯嗲嗳嗵嗷嗽嗾嘀嘁嘈嘉嘌嘎嘏嘘嘚嘛嘞嘟嘡" + "嘣嘤嘧嘬嘭嘱嘲嘴嘶嘹嘻嘿噀噂噇噌噍噎噔噗噘噙噜噢噤器噩噪噫噬噱噶噻噼嚄嚅嚆嚎嚏嚓" + "嚚嚣嚭嚯嚷嚼囊囔囚四回囟因囡团囤囫园困囱围囵囷囹固国图囿圃圄圆圈圉圊圌圐圙圜土圢" + "圣在圩圪圫圬圭圮圯地圲圳圹场圻圾址坂均坉坊坋坌坍坎坏坐坑坒块坚坛坜坝坞坟坠坡坤坥" + "坦坨坩坪坫坬坭坯坰坳坷坻坼坽垂垃垄垆垈型垌垍垎垏垒垓垕垙垚垛垞垟垠垡垢垣垤垦垧垩" + "垫垭垮垯垱垲垴垵垸垺垾垿埂埃埆埇埋埌城埏埒埔埕埗埘埙埚埝域埠埤埪埫埭埯埴埵埸培基" + "埼埽堂堃堆堇堉堋堌堍堎堐堑堕堙堞堠堡堤堧堨堪堰堲堵堼堽堾塄塅塆塌塍塑塔塘塝塞塥填" + "塬塱塾墀墁境墅墈墉墐墒墓墕墘墙墚增墟墡墣墦墨墩墼壁壅壑壕壤士壬壮声壳壶壸壹处备复" + "夏夐夔夕外夙多夜够夤夥大天太夫夬夭央夯失头夷夸夹夺夼奁奂奄奇奈奉奋奎奏契奓奔奕奖" + "套奘奚奠奡奢奥奭女奴奶奸她好妁如妃妄妆妇妈妊妍妒妓妖妗妘妙妞妣妤妥妧妨妩妪妫妭妮" + "妯妲妹妻妾姆姈姊始姐姑姒姓委姗姘姚姜姝姞姣姤姥姨姬姮姱姶姹姻姽姿娀威娃娄娅娆娇娈" + "娉娌娑娓娘娜娟娠娣娥娩娱娲娴娵娶娼婀婆婉婊婌婍婕婘婚婞婠婢婤婧婪婫婳婴婵婶婷婺婻" + "婼婿媂媄媆媒媓媖媚媛媞媪媭媱媲媳媵媸媾嫁嫂嫄嫉嫌嫒嫔嫕嫖嫘嫚嫜嫠嫡嫣嫦嫩嫪嫫嫭嫱" + "嫽嬉嬖嬗嬛嬥嬬嬴嬷嬿孀孅子孑孓孔孕孖字存孙孚孛孜孝孟孢季孤孥学孩孪孬孰孱孳孵孺孽" + "宁它宄宅宇守安宋完宏宓宕宗官宙定宛宜宝实宠审客宣室宥宦宧宪宫宬宰害宴宵家宸容宽宾" + "宿寁寂寄寅密寇富寐寒寓寝寞察寡寤寥寨寮寰寸对寺寻导寿封射将尉尊小少尔尕尖尘尚尜尝" + "尢尤尥尧尨尪尬就尴尸尹尺尻尼尽尾尿局屁层屃居屈屉届屋屎屏屐屑展屙属屠屡屣履屦屯山" + "屹屺屼屾屿岁岂岈岊岌岍岐岑岔岖岗岘岙岚岛岜岞岠岢岣岨岩岫岬岭岱岳岵岷岸岽岿峁峂峃" + "峄峋峒峗峘峙峛峡峣峤峥峦峧峨峪峭峰峱峻峿崀崁崂崃崄崆崇崌崎崒崔崖崚崛崞崟崡崤崦崧" + "崩崭崮崴崶崽崾崿嵁嵅嵇嵊嵋嵌嵎嵖嵘嵚嵛嵝嵩嵫嵬嵯嵲嵴嶂嶅嶍嶒嶓嶙嶝嶟嶦嶲嶷巅巇巉" + "巍川州巡巢工左巧巨巩巫差巯己已巳巴巷巽巾币市布帅帆师希帏帐帑帔帕帖帘帙帚帛帜帝帡" + "带帧帨席帮帱帷常帻帼帽幂幄幅幌幔幕幖幛幞幡幢幪干平年并幸幺幻幼幽广庄庆庇床庋序庐" + "庑库应底庖店庙庚府庞废庠庤庥度座庭庱庳庵庶康庸庹庼庾廆廉廊廋廑廒廓廖廙廛廨廪延廷" + "建廿开弁异弃弄弆弇弈弊弋式弑弓引弗弘弛弟张弢弥弦弧弨弩弭弯弱弶弸弹强弼彀归当录彖" + "彗彘彝彟形彤彦彧彩彪彬彭彰影彳彷役彻彼往征徂径待徇很徉徊律徐徒徕得徘徙徛徜御徨循" + "徭微徵德徼徽心必忆忉忌忍忏忐忑忒忖志忘忙忝忞忠忡忤忧忪快忭忮忱忳念忸忺忻忽忾忿怀" + "态怂怃怄怅怆怊怍怎怏怒怔怕怖怙怛怜思怠怡急怦性怨怩怪怫怯怵总怼怿恁恂恃恋恍恐恒恓" + "恔恕恙恚恝恢恣恤恧恨恩恪恫恬恭息恰恳恶恸恹恺恻恼恽恿悃悄悆悈悉悌悍悒悔悖悚悛悝悟" + "悠悢患悦您悫悬悭悯悰悱悲悴悸悻悼情惆惇惊惋惎惑惔惕惘惙惚惛惜惝惟惠惦惧惨惩惫惬惭" + "惮惯惰想惴惶惹惺愀愁愃愆愈愉愍愎意愐愔愕愚感愠愣愤愦愧愫愭愿慆慈慊慌慎慑慕慝慢慥" + "慧慨慬慭慰慵慷憋憎憔憕憙憧憨憩憬憭憷憺憾懂懈懊懋懑懒懔懦懵懿戆戈戊戋戌戍戎戏成我" + "戒戕或戗战戚戛戟戡戢戣戤戥截戬戭戮戳戴户戽戾房所扁扂扃扅扆扇扈扉扊手才扎扑扒打扔" + "托扛扞扣扦执扩扪扫扬扭扮扯扰扳扶批扺扼扽找承技抃抄抉把抑抒抓抔投抖抗折抚抛抟抠抡" + "抢护报抨披抬抱抵抹抻押抽抿拂拃拄担拆拇拈拉拊拌拍拎拐拒拓拔拖拗拘拙招拜拟拢拣拤拥" + "拦拧拨择括拭拮拯拱拳拴拶拷拼拽拾拿持挂指挈按挎挑挓挖挚挛挝挞挟挠挡挣挤挥挦挨挪挫" + "振挲挹挺挽捂捃捅捆捉捋捌捍捎捏捐捕捞损捡换捣捧捩捭据捯捶捷捺捻捽掀掂掇授掉掊掌掎" + "掏掐排掖掘掞掠探掣接控推掩措掬掭掮掰掳掴掷掸掺掼掾揄揆揉揍描提插揕揖揠握揣揩揪揭" + "揳援揶揸揽揿搀搁搂搅搋搌搏搐搒搓搔搛搜搞搠搡搦搪搬搭搴携搽摁摄摅摆摇摈摊摏摒摔摘" + "摛摞摧摩摭摴摸摹摽撂撄撅撇撑撒撕撖撙撞撤撩撬播撮撰撵撷撸撺撼擀擂擅操擎擐擒擘擞擢" + "擤擦擿攀攉攒攘攥攫攮支收攸改攻攽放政故效敉敌敏救敔敕敖教敛敝敞敢散敦敩敫敬数敲整" + "敷文斋斌斐斑斓斗料斛斜斝斟斠斡斤斥斧斩斫断斯新斶方於施旁旃旄旅旆旋旌旎族旐旒旖旗" + "旞无既日旦旧旨早旬旭旮旯旰旱旴旵时旷旸旺旻旿昀昂昃昄昆昇昈昉昊昌明昏昒易昔昕昙昝" + "星映昡昣昤春昧昨昪昫昭是昱昳昴昵昶昺昼昽显晁晃晅晊晋晌晏晐晒晓晔晕晖晗晙晚晞晟晡" + "晢晤晦晨晪晫普景晰晱晴晶晷智晾暂暄暅暇暌暑暕暖暗暝暧暨暮暲暴暵暶暹暾暿曈曌曙曛曜" + "曝曦曩曰曲曳更曷曹曼曾替最月有朋服朏朐朓朔朕朗望朝期朦木未末本札术朱朳朴朵朸机朽" + "杀杂权杄杆杈杉杌李杏材村杓杕杖杙杜杞束杠条来杧杨杩杪杭杯杰杲杳杵杷杻杼松板极构枅" + "枇枉枋枍析枕林枘枚果枝枞枢枣枥枧枨枪枫枭枯枰枲枳枵架枷枸枹柁柃柄柈柊柏某柑柒染柔" + "柖柘柙柚柜柝柞柠柢查柩柬柯柰柱柳柴柷柽柿栀栅标栈栉栊栋栌栎栏栐树栒栓栖栗栝栟校栩" + "株栲栳栴样核根栻格栽栾桀桁桂桃桄桅框案桉桊桌桎桐桑桓桔桕桠桡桢档桤桥桦桧桨桩桫桯" + "桲桴桶桷桹梁梃梅梆梌梏梓梗梠梢梣梦梧梨梭梯械梳梴梵梼梽梾梿检棁棂棉棋棍棐棒棓棕棘" + "棚棠棣棤棨棪棫棬森棰棱棵棹棺棻棼棽椀椁椅椆椋植椎椐椑椒椓椟椠椤椪椭椰椴椸椹椽椿楂" + "楒楔楗楙楚楝楞楠楣楦楩楪楫楮楯楷楸楹楼概榃榄榅榆榇榈榉榍榑榔榕榖榛榜榧榨榫榭榰榱" + "榴榷榻槁槃槊槌槎槐槔槚槛槜槟槠槭槱槲槽槿樊樗樘樟模樨横樯樱樵樽樾橄橇橐橑橘橙橛橞" + "橡橥橦橱橹橼檀檄檎檐檑檗檞檠檩檫檬櫆欂欠次欢欣欤欧欲欸欹欺欻款歃歅歆歇歉歌歙止正" + "此步武歧歪歹死歼殁殂殃殄殆殇殉殊残殍殒殓殖殚殛殡殣殪殳殴段殷殿毁毂毅毋毌母每毐毒" + "毓比毕毖毗毙毛毡毪毫毯毳毵毹毽氅氆氇氍氏氐民氓气氕氖氘氙氚氛氟氡氢氤氦氧氨氩氪氮" + "氯氰氲水永氾氿汀汁求汆汇汈汉汊汋汐汔汕汗汛汜汝汞江池污汤汧汨汩汪汫汭汰汲汴汶汹汽" + "汾沁沂沃沄沅沆沇沈沉沌沏沐沓沔沘沙沚沛沟没沣沤沥沦沧沨沩沪沫沭沮沱河沸油沺治沼沽" + "沾沿泂泃泄泅泇泉泊泌泐泓泔法泖泗泙泚泛泜泞泠泡波泣泥注泪泫泮泯泰泱泳泵泷泸泺泻泼" + "泽泾洁洄洇洈洋洌洎洑洒洓洗洘洙洚洛洞洢洣津洧洨洪洫洭洮洱洲洳洴洵洸洹洺活洼洽派洿" + "流浃浅浆浇浈浉浊测浍济浏浐浑浒浓浔浕浙浚浛浜浞浟浠浡浣浥浦浩浪浬浭浮浯浰浲浴海浸" + "浼涂涄涅消涉涌涍涎涐涑涓涔涕涘涛涝涞涟涠涡涢涣涤润涧涨涩涪涫涮涯液涴涵涸涿淀淄淅" + 
"淆淇淋淌淏淑淖淘淙淜淝淞淟淠淡淤淦淫淬淮淯深淳淴混淹添淼清渊渌渍渎渐渑渔渗渚渝渟" + "渠渡渣渤渥温渫渭港渰渲渴游渺渼湃湄湉湍湎湑湓湔湖湘湛湜湝湟湣湫湮湲湴湾湿溁溃溅溆" + "溇溉溍溏源溘溚溜溞溟溠溢溥溦溧溪溯溱溲溴溵溶溷溹溺溻溽滁滂滃滆滇滉滋滍滏滑滓滔滕" + "滗滘滚滞滟滠满滢滤滥滦滧滨滩滪滫滴滹漂漆漈漉漋漏漓演漕漖漠漤漦漩漪漫漭漯漱漳漴漶" + "漷漹漻漼漾潆潇潋潍潏潖潘潜潞潟潢潦潩潭潮潲潴潵潸潺潼潽潾澂澄澈澉澌澍澎澛澜澡澥澧" + "澪澭澳澴澶澹澼澽激濂濉濋濑濒濞濠濡濩濮濯瀌瀍瀑瀔瀚瀛瀣瀱瀵瀹瀼灈灌灏灞火灭灯灰灵" + "灶灸灼灾灿炀炅炆炉炊炌炎炒炔炕炖炘炙炜炝炟炣炫炬炭炮炯炱炳炷炸点炻炼炽烀烁烂烃烈" + "烊烔烘烙烛烜烝烟烠烤烦烧烨烩烫烬热烯烶烷烹烺烻烽焆焉焊焌焐焓焕焖焗焘焙焚焜焞焦焯" + "焰焱然煁煃煅煊煋煌煎煓煜煞煟煤煦照煨煮煲煳煴煸煺煽熄熇熊熏熔熘熙熛熜熟熠熥熨熬熵" + "熹熻燃燊燋燎燏燔燕燚燠燥燧燮燹爆爇爔爚爝爟爨爪爬爰爱爵父爷爸爹爻爽爿牁牂片版牌牍" + "牒牖牙牚牛牝牟牡牢牤牥牦牧物牮牯牲牵特牺牻牾牿犀犁犄犇犊犋犍犏犒犟犨犬犯犰犴状犷" + "犸犹狁狂狃狄狈狉狍狎狐狒狗狙狝狞狠狡狨狩独狭狮狯狰狱狲狳狴狷狸狺狻狼猁猃猄猇猊猎" + "猕猖猗猛猜猝猞猡猢猥猩猪猫猬献猯猰猱猴猷猹猺猾猿獍獐獒獗獠獬獭獯獴獾玃玄率玉王玎" + "玑玒玓玕玖玘玙玚玛玞玟玠玡玢玤玥玦玩玫玭玮环现玱玲玳玶玷玹玺玻玼玿珀珂珅珇珈珉珊" + "珋珌珍珏珐珑珒珕珖珙珛珝珞珠珢珣珥珦珧珩珪珫班珰珲珵珷珸珹珺珽琀球琄琅理琇琈琉琊" + "琎琏琐琔琚琛琟琡琢琤琥琦琨琪琫琬琭琮琯琰琲琳琴琵琶琼瑀瑁瑂瑃瑄瑅瑆瑑瑓瑔瑕瑖瑗瑙" + "瑚瑛瑜瑝瑞瑟瑢瑧瑨瑬瑭瑰瑱瑳瑶瑷瑾璀璁璃璆璇璈璋璎璐璒璘璜璞璟璠璥璧璨璩璪璬璮璱" + "璲璺瓀瓒瓖瓘瓜瓞瓠瓢瓣瓤瓦瓮瓯瓴瓶瓷瓻瓿甄甍甏甑甓甗甘甚甜生甡甥甦用甩甪甫甬甭甯" + "田由甲申电男甸町画甾畀畅畈畋界畎畏畔畖留畚畛畜畤略畦番畬畯畲畴畸畹畿疁疃疆疍疏疐" + "疑疔疖疗疙疚疝疟疠疡疢疣疤疥疫疬疭疮疯疰疱疲疳疴疵疸疹疼疽疾痂痃痄病症痈痉痊痍痒" + "痓痔痕痘痛痞痢痣痤痦痧痨痪痫痰痱痴痹痼痿瘀瘁瘃瘅瘆瘊瘌瘐瘕瘗瘘瘙瘛瘟瘠瘢瘤瘥瘦瘩" + "瘪瘫瘭瘰瘳瘴瘵瘸瘼瘾瘿癀癃癌癍癔癖癗癜癞癣癫癯癸登白百癿皂的皆皇皈皋皎皑皓皕皖皙" + "皛皞皤皦皭皮皱皲皴皿盂盅盆盈盉益盍盎盏盐监盒盔盖盗盘盛盟盥盦目盯盱盲直盷相盹盼盾" + "省眄眇眈眉眊看眍眙眚真眠眢眦眨眩眬眭眯眵眶眷眸眺眼着睁睃睄睇睎睐睑睚睛睡睢督睥睦" + "睨睫睬睹睽睾睿瞀瞄瞅瞋瞌瞍瞎瞑瞒瞟瞠瞢瞥瞧瞩瞪瞫瞬瞭瞰瞳瞵瞻瞽瞿矍矗矛矜矞矢矣知" + "矧矩矫矬短矮矰石矶矸矻矼矾矿砀码砂砄砆砉砌砍砑砒研砖砗砘砚砜砝砟砠砣砥砧砫砬砭砮" + "砰破砵砷砸砹砺砻砼砾础硁硅硇硊硌硍硎硐硒硔硕硖硗硙硚硝硪硫硬硭确硼硿碃碇碈碉碌碍" + "碎碏碑碓碗碘碚碛碜碟碡碣碥碧碨碰碱碲碳碴碶碹碾磁磅磉磊磋磏磐磔磕磙磜磡磨磬磲磴磷" + "磹磻礁礅礌礓礞礴礵示礼社祀祁祃祆祇祈祉祊祋祎祏祐祓祕祖祗祚祛祜祝神祟祠祢祥祧票祭" + "祯祲祷祸祺祼祾禀禁禄禅禊禋福禒禔禘禚禛禤禧禳禹禺离禽禾秀私秃秆秉秋种科秒秕秘租秣" + "秤秦秧秩秫秬秭积称秸移秽秾稀稂稃稆程稌稍税稑稔稗稙稚稞稠稣稳稷稹稻稼稽稿穄穆穑穗" + "穙穜穟穰穴究穷穸穹空穿窀突窃窄窅窈窊窍窎窑窒窕窖窗窘窜窝窟窠窣窥窦窨窬窭窳窸窿立" + "竑竖竘站竞竟章竣童竦竫竭端竹竺竽竿笃笄笆笈笊笋笏笑笔笕笙笛笞笠笤笥符笨笪笫第笮笯" + "笱笳笸笺笼笾筀筅筇等筋筌筏筐筑筒答策筘筚筛筜筝筠筢筤筥筦筮筱筲筵筶筷筹筻筼签简箅" + "箍箐箓箔箕箖算箜管箢箦箧箨箩箪箫箬箭箱箴箸篁篆篇篌篑篓篙篚篝篡篥篦篪篮篯篱篷篼篾" + "簃簇簉簋簌簏簕簖簝簟簠簧簪簰簸簿籀籁籍籥米籴类籼籽粉粑粒粕粗粘粜粝粞粟粢粤粥粪粮" + "粱粲粳粹粼粽精粿糁糅糇糈糊糌糍糒糕糖糗糙糜糟糠糨糯糵系紊素索紧紫累絜絮絷綦綮縠縢" + "縻繁繄繇纂纛纠纡红纣纤纥约级纨纩纪纫纬纭纮纯纰纱纲纳纴纵纶纷纸纹纺纻纼纽纾线绀绁" + "绂练组绅细织终绉绊绋绌绍绎经绐绑绒结绔绕绖绗绘给绚绛络绝绞统绠绡绢绣绤绥绦继绨绩" + "绪绫续绮绯绰绱绲绳维绵绶绷绸绹绺绻综绽绾绿缀缁缂缃缄缅缆缇缈缉缊缌缎缐缑缒缓缔缕" + "编缗缘缙缚缛缜缝缞缟缠缡缢缣缤缥缦缧缨缩缪缫缬缭缮缯缰缱缲缳缴缵缶缸缺罂罄罅罍罐" + "网罔罕罗罘罚罟罡罢罨罩罪置罱署罴罶罹罽罾羁羊羌美羑羓羔羕羖羚羝羞羟羡群羧羯羰羱羲" + "羸羹羼羽羿翀翁翂翃翅翈翊翌翎翔翕翘翙翚翛翟翠翡翥翦翩翮翯翰翱翳翷翻翼翾耀老考耄者" + "耆耇耋而耍耏耐耑耒耔耕耖耗耘耙耜耠耢耤耥耦耧耨耩耪耰耱耳耵耶耷耸耻耽耿聂聃聆聊聋" + "职聍聒联聘聚聩聪聱聿肃肄肆肇肉肋肌肓肖肘肚肛肝肟肠股肢肤肥肩肪肫肭肮肯肱育肴肷肸" + "肺肼肽肾肿胀胁胂胃胄胆胈背胍胎胖胗胙胚胛胜胝胞胠胡胣胤胥胧胨胩胪胫胬胭胯胰胱胲胳" + "胴胶胸胺胼能脂脆脉脊脍脎脏脐脑脒脓脔脖脘脚脞脟脩脬脯脱脲脶脸脾脿腆腈腊腋腌腐腑腒" + "腓腔腕腘腙腚腠腥腧腨腩腭腮腯腰腱腴腹腺腻腼腽腾腿膀膂膈膊膏膑膘膙膛膜膝膦膨膳膺膻" + "臀臂臃臆臊臌臑臜臣臧自臬臭至致臻臼臾舀舁舂舄舅舆舌舍舐舒舔舛舜舞舟舠舢舣舥航舫般" + "舭舯舰舱舲舳舴舵舶舷舸船舻舾艄艅艇艉艋艎艏艘艚艟艨艮良艰色艳艴艺艽艾艿节芃芄芈芊" + "芋芍芎芏芑芒芗芘芙芜芝芟芠芡芣芤芥芦芨芩芪芫芬芭芮芯芰花芳芴芷芸芹芼芽芾苁苄苇苈" + "苉苊苋苌苍苎苏苑苒苓苔苕苗苘苛苜苞苟苠苡苣苤若苦苧苫苯英苴苷苹苻苾茀茁茂范茄茅茆" + "茈茉茋茌茎茏茑茓茔茕茗茚茛茜茝茧茨茫茬茭茯茱茳茴茵茶茸茹茺茼茽荀荁荃荄荆荇草荏荐" + "荑荒荓荔荖荙荚荛荜荞荟荠荡荣荤荥荦荧荨荩荪荫荬荭荮药荷荸荻荼荽莅莆莉莎莒莓莘莙莛" + "莜莝莞莠莨莩莪莫莰莱莲莳莴莶获莸莹莺莼莽莿菀菁菂菅菇菉菊菌菍菏菔菖菘菜菝菟菠菡菥" + "菩菪菰菱菲菹菼菽萁萃萄萆萋萌萍萎萏萑萘萚萜萝萣萤营萦萧萨萩萱萳萸萹萼落葆葎葑葖著" + "葙葚葛葜葡董葩葫葬葭葰葱葳葴葵葶葸葺蒂蒄蒇蒈蒉蒋蒌蒎蒐蒗蒙蒜蒟蒡蒨蒯蒱蒲蒴蒸蒹蒺" + "蒻蒽蒿蓁蓂蓄蓇蓉蓊蓍蓏蓐蓑蓓蓖蓝蓟蓠蓢蓣蓥蓦蓬蓰蓼蓿蔀蔃蔈蔊蔌蔑蔓蔗蔚蔟蔡蔫蔬蔷" + "蔸蔹蔺蔻蔼蔽蕃蕈蕉蕊蕖蕗蕙蕞蕤蕨蕰蕲蕴蕹蕺蕻蕾薁薄薅薇薏薛薜薢薤薨薪薮薯薰薳薷薸" + "薹薿藁藉藏藐藓藕藜藟藠藤藦藨藩藻藿蘅蘑蘖蘘蘧蘩蘸蘼虎虏虐虑虒虓虔虚虞虢虤虫虬虮虱" + "虷虸虹虺虻虼虽虾虿蚀蚁蚂蚄蚆蚊蚋蚌蚍蚓蚕蚜蚝蚣蚤蚧蚨蚩蚪蚬蚯蚰蚱蚲蚴蚶蚺蛀蛃蛄蛆" + "蛇蛉蛊蛋蛎蛏蛐蛑蛔蛘蛙蛛蛞蛟蛤蛩蛭蛮蛰蛱蛲蛳蛴蛸蛹蛾蜀蜂蜃蜇蜈蜉蜊蜍蜎蜐蜒蜓蜕蜗" + "蜘蜚蜜蜞蜡蜢蜣蜥蜩蜮蜱蜴蜷蜻蜾蜿蝇蝈蝉蝌蝎蝓蝗蝘蝙蝠蝣蝤蝥蝮蝰蝲蝴蝶蝻蝼蝽蝾螂螃" + "螅螈螋融螗螟螠螣螨螫螬螭螯螱螳螵螺螽蟀蟆蟊蟋蟏蟑蟒蟛蟠蟥蟪蟫蟮蟹蟾蠃蠊蠋蠓蠕蠖蠡" + "蠢蠲蠹蠼血衃衄衅行衍衎衒衔街衙衠衡衢衣补表衩衫衬衮衰衲衷衽衾衿袁袂袄袅袆袈袋袍袒" + "袖袗袜袢袤袪被袭袯袱袷袼裁裂装裆裈裉裎裒裔裕裘裙裛裟裢裣裤裥裨裰裱裳裴裸裹裼裾褂" + "褊褐褒褓褕褙褚褛褟褡褥褪褫褯褰褴褶襁襄襕襚襜襞襟襦襫襻西要覃覆见观觃规觅视觇览觉" + "觊觋觌觎觏觐觑角觖觚觜觞觟解觥触觫觭觯觱觳觿言訄訇訚訾詈詟詹誉誊誓謇警譬计订讣认" + "讥讦讧讨让讪讫训议讯记讱讲讳讴讵讶讷许讹论讻讼讽设访诀证诂诃评诅识诇诈诉诊诋诌词" + "诎诏诐译诒诓诔试诖诗诘诙诚诛诜话诞诟诠诡询诣诤该详诧诨诩诫诬语诮误诰诱诲诳说诵请" + "诸诹诺读诼诽课诿谀谁谂调谄谅谆谇谈谊谋谌谍谎谏谐谑谒谓谔谕谖谗谙谚谛谜谝谞谟谠谡" + "谢谣谤谥谦谧谨谩谪谫谬谭谮谯谰谱谲谳谴谵谶谷谼谿豁豆豇豉豌豕豚象豢豨豪豫豮豳豸豹" + "豺貂貅貆貉貊貌貔貘贝贞负贡财责贤败账货质贩贪贫贬购贮贯贰贱贲贳贴贵贶贷贸费贺贻贼" + "贽贾贿赀赁赂赃资赅赆赇赈赉赊赋赌赍赎赏赐赑赒赓赔赕赖赗赘赙赚赛赜赝赞赟赠赡赢赣赤" + "赦赧赪赫赭走赳赴赵赶起趁趄超越趋趑趔趟趣趯趱足趴趵趸趺趼趾趿跂跃跄跆跋跌跎跏跐跑" + "跖跗跚跛距跞跟跣跤跨跪跬路跱跳践跶跷跸跹跺跻跽踅踉踊踌踏踒踔踝踞踟踢踣踦踩踪踬踮" + "踯踱踵踶踹踺踽蹀蹁蹂蹄蹅蹇蹈蹉蹊蹋蹐蹑蹒蹙蹚蹜蹢蹦蹩蹬蹭蹯蹰蹲蹴蹶蹼蹽蹾蹿躁躅躇" + "躏躐躔躜躞身躬躯躲躺车轧轨轩轪轫转轭轮软轰轱轲轳轴轵轶轷轸轹轺轻轼载轾轿辀辁辂较" + "辄辅辆辇辈辉辊辋辌辍辎辏辐辑辒输辔辕辖辗辘辙辚辛辜辞辟辣辨辩辫辰辱边辽达辿迁迂迄" + 
"迅过迈迎运近迓返迕还这进远违连迟迢迤迥迦迨迩迪迫迭迮述迳迷迸迹迺追退送适逃逄逅逆" + "选逊逋逍透逐逑递途逖逗通逛逝逞速造逡逢逦逭逮逯逴逵逶逸逻逼逾遁遂遄遆遇遍遏遐遑遒" + "道遗遘遛遢遣遥遨遭遮遴遵遹遽避邀邂邃邈邋邑邓邕邗邘邙邛邝邠邡邢那邦邨邪邬邮邯邰邱" + "邲邳邴邵邶邸邹邺邻邽邾邿郁郃郄郅郇郈郊郎郏郐郑郓郗郚郛郜郝郡郢郤郦郧部郪郫郭郯郴" + "郸都郾郿鄀鄂鄃鄄鄅鄌鄑鄗鄘鄙鄚鄜鄞鄠鄢鄣鄫鄯鄱鄹酂酃酅酆酉酊酋酌配酎酏酐酒酗酚酝" + "酞酡酢酣酤酥酦酩酪酬酮酯酰酱酲酴酵酶酷酸酹酺酽酾酿醅醇醉醋醌醍醐醑醒醚醛醢醨醪醭" + "醮醯醴醵醺醾采釉释里重野量釐金釜鉴銎銮鋆鋈錾鍪鎏鏊鏖鐾鑫钆钇针钉钊钋钌钍钎钏钐钒" + "钓钔钕钖钗钘钙钚钛钜钝钞钟钠钡钢钣钤钥钦钧钨钩钪钫钬钭钮钯钰钱钲钳钴钵钷钹钺钻钼" + "钽钾钿铀铁铂铃铄铅铆铈铉铊铋铌铍铎铏铐铑铒铕铖铗铘铙铚铛铜铝铞铟铠铡铢铣铤铥铧铨" + "铩铪铫铬铭铮铯铰铱铲铳铴铵银铷铸铹铺铻铼铽链铿销锁锂锃锄锅锆锇锈锉锊锋锌锍锎锏锐" + "锑锒锓锔锕锖锗锘错锚锛锜锝锞锟锡锢锣锤锥锦锧锨锩锪锫锬锭键锯锰锱锲锳锴锵锶锷锸锹" + "锺锻锼锽锾锿镀镁镂镃镄镅镆镇镈镉镊镋镌镍镎镏镐镑镒镓镔镕镖镗镘镚镛镜镝镞镠镡镢镣" + "镤镥镦镧镨镩镪镫镬镭镮镯镰镱镲镳镴镵镶长门闩闪闫闭问闯闰闱闲闳间闵闶闷闸闹闺闻闼" + "闽闾闿阀阁阂阃阄阅阆阇阈阉阊阋阌阍阎阏阐阑阒阔阕阖阗阘阙阚阜队阡阪阮阱防阳阴阵阶" + "阻阼阽阿陀陂附际陆陇陈陉陋陌降陎限陑陔陕陛陞陟陡院除陧陨险陪陬陲陴陵陶陷隃隅隆隈" + "隋隍随隐隔隗隘隙障隧隩隰隳隶隹隺隼隽难雀雁雄雅集雇雉雊雌雍雎雏雒雕雠雨雩雪雯雱雳" + "零雷雹雾需霁霄霅霆震霈霉霍霎霏霓霖霜霞霨霪霭霰露霸霹霾青靓靖静靛非靠靡面靥革靬靰" + "靳靴靶靸靺靼靽靿鞁鞅鞋鞍鞑鞒鞔鞘鞠鞡鞣鞧鞨鞫鞬鞭鞮鞯鞲鞳鞴韂韦韧韨韩韪韫韬韭音韵" + "韶页顶顷顸项顺须顼顽顾顿颀颁颂颃预颅领颇颈颉颊颋颌颍颎颏颐频颓颔颖颗题颙颚颛颜额" + "颞颟颠颡颢颤颥颦颧风飏飐飑飒飓飔飕飗飘飙飞食飧飨餍餐餮饔饕饥饧饨饩饪饫饬饭饮饯饰" + "饱饲饳饴饵饶饷饸饹饺饻饼饽饿馁馃馄馅馆馇馈馉馊馋馌馍馏馐馑馒馓馔馕首馗馘香馝馞馥" + "馧馨马驭驮驯驰驱驲驳驴驵驶驷驸驹驺驻驼驽驾驿骀骁骂骃骄骅骆骇骈骉骊骋验骍骎骏骐骑" + "骒骓骕骖骗骘骙骚骛骜骝骞骟骠骡骢骣骤骥骦骧骨骰骱骶骷骸骺骼髀髁髂髃髅髋髌髎髑髓高" + "髡髢髦髫髭髯髹髻髽鬃鬈鬏鬒鬓鬘鬟鬣鬯鬲鬶鬷鬻鬼魁魂魃魄魅魆魇魈魉魋魍魏魑魔鱼鱽鱾" + "鱿鲀鲁鲂鲃鲅鲆鲇鲈鲉鲊鲋鲌鲍鲎鲏鲐鲑鲒鲔鲕鲖鲗鲘鲙鲚鲛鲜鲝鲞鲟鲠鲡鲢鲣鲤鲥鲦鲧鲨" + "鲩鲪鲫鲬鲭鲮鲯鲰鲱鲲鲳鲴鲵鲷鲸鲹鲺鲻鲼鲽鲾鲿鳀鳁鳂鳃鳄鳅鳇鳈鳉鳊鳌鳍鳎鳏鳐鳑鳒鳓" + "鳔鳕鳖鳗鳘鳙鳚鳛鳜鳝鳞鳟鳠鳡鳢鳣鳤鸟鸠鸡鸢鸣鸤鸥鸦鸧鸨鸩鸪鸫鸬鸭鸮鸯鸰鸱鸲鸳鸵鸶" + "鸷鸸鸹鸺鸻鸼鸽鸾鸿鹀鹁鹂鹃鹄鹅鹆鹇鹈鹉鹊鹋鹌鹍鹎鹏鹐鹑鹒鹔鹕鹖鹗鹘鹙鹚鹛鹜鹝鹞鹟" + "鹠鹡鹢鹣鹤鹦鹧鹨鹩鹪鹫鹬鹭鹮鹯鹰鹱鹲鹳鹴鹾鹿麀麂麇麈麋麑麒麓麖麝麟麦麸麹麻麽麾黄" + "黇黉黍黎黏黑黔默黛黜黝黟黠黡黢黥黧黩黪黯黹黻黼黾鼋鼍鼎鼐鼒鼓鼗鼙鼠鼢鼩鼫鼬鼯鼱鼷" + "鼹鼻鼽鼾齁齇齉齐齑齿龀龁龂龃龄龅龆龇龈龉龊龋龌龙龚龛龟龠龢鿍鿎鿏㑇㑊㕮㘎㙍㙘㙦㛃" + "㛚㛹㟃㠇㠓㤘㥄㧐㧑㧟㫰㬊㬎㬚㭎㭕㮾㰀㳇㳘㳚㴔㵐㶲㸆㸌㺄㻬㽏㿠䁖䂮䃅䃎䅟䌹䎃䎖䏝䏡" + "䏲䐃䓖䓛䓨䓫䓬䗖䗛䗪䗴䜣䝙䢺䢼䣘䥽䦃䲟䲠䲢䴓䴔䴕䴖䴗䴘䴙䶮𠅤𠙶𠳐𡎚𡐓𣗋𣲗𣲘𣸣𤧛𤩽" + "𤫉𥔲𥕢𥖨𥻗𦈡𦒍𦙶𦝼𦭜𦰡𧿹𨐈𨙸𨚕𨟠𨭉𨱇𨱏𨱑𨱔𨺙𩽾𩾃𩾌𪟝𪣻𪤗𪨰𪨶𪩘𪾢𫄧𫄨𫄷𫄸𫇭𫌀𫍣𫍯" + "𫍲𫍽𫐄𫐐𫐓𫑡𫓧𫓯𫓶𫓹𫔍𫔎𫔶𫖮𫖯𫖳𫗧𫗴𫘜𫘝𫘦𫘧𫘨𫘪𫘬𫚕𫚖𫚭𫛭𫞩𫟅𫟦𫟹𫟼𫠆𫠊𫠜𫢸𫫇𫭟" + "𫭢𫭼𫮃𫰛𫵷𫶇𫷷𫸩𬀩𬀪𬂩𬃊𬇕𬇙𬇹𬉼𬊈𬊤𬌗𬍛𬍡𬍤𬒈𬒔𬒗𬕂𬘓𬘘𬘡𬘩𬘫𬘬𬘭𬘯𬙂𬙊𬙋𬜬𬜯𬞟" + "𬟁𬟽𬣙𬣞𬣡𬣳𬤇𬤊𬤝𬨂𬨎𬩽𬪩𬬩𬬭𬬮𬬱𬬸𬬹𬬻𬬿𬭁𬭊𬭎𬭚𬭛𬭤𬭩𬭬𬭯𬭳𬭶𬭸𬭼𬮱𬮿𬯀𬯎𬱖𬱟" + "𬳵𬳶𬳽𬳿𬴂𬴃𬴊𬶋𬶍𬶏𬶐𬶟𬶠𬶨𬶭𬶮𬷕𬸘𬸚𬸣𬸦𬸪𬹼𬺈𬺓" ) -CN_CHARS_EXT = '吶诶屌囧飚屄' +CN_CHARS_EXT = "吶诶屌囧飚屄" CN_CHARS = CN_CHARS_COMMON + CN_CHARS_EXT -IN_CH_CHARS = { c : True for c in CN_CHARS } +IN_CH_CHARS = {c: True for c in CN_CHARS} EN_CHARS = string.ascii_letters + string.digits -IN_EN_CHARS = { c : True for c in EN_CHARS } +IN_EN_CHARS = {c: True for c in EN_CHARS} + +VALID_CHARS = CN_CHARS + EN_CHARS + " " +IN_VALID_CHARS = {c: True for c in VALID_CHARS} -VALID_CHARS = CN_CHARS + EN_CHARS + ' ' -IN_VALID_CHARS = { c : True for c in VALID_CHARS } # ================================================================================ # # basic class @@ -393,7 +402,7 @@ class ChineseChar(object): def __init__(self, simplified, traditional): self.simplified = simplified self.traditional = traditional - #self.__repr__ = self.__str__ + # self.__repr__ = self.__str__ def __str__(self): return self.simplified or self.traditional or None @@ -416,26 +425,28 @@ def __init__(self, power, simplified, traditional, big_s, big_t): self.big_t = big_t def __str__(self): - return '10^{}'.format(self.power) + return "10^{}".format(self.power) @classmethod def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False): - if small_unit: - return ChineseNumberUnit(power=index + 1, - simplified=value[0], traditional=value[1], big_s=value[1], big_t=value[1]) + return ChineseNumberUnit( + power=index + 1, simplified=value[0], traditional=value[1], big_s=value[1], big_t=value[1] + ) elif numbering_type == NUMBERING_TYPES[0]: - return ChineseNumberUnit(power=index + 8, - simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]) + return ChineseNumberUnit( + power=index + 8, simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1] + ) elif numbering_type == NUMBERING_TYPES[1]: - return ChineseNumberUnit(power=(index + 2) * 4, - simplified=value[0], 
traditional=value[1], big_s=value[0], big_t=value[1]) + return ChineseNumberUnit( + power=(index + 2) * 4, simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1] + ) elif numbering_type == NUMBERING_TYPES[2]: - return ChineseNumberUnit(power=pow(2, index + 3), - simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]) + return ChineseNumberUnit( + power=pow(2, index + 3), simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1] + ) else: - raise ValueError( - 'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type)) + raise ValueError("Counting type should be in {0} ({1} provided).".format(NUMBERING_TYPES, numbering_type)) class ChineseNumberDigit(ChineseChar): @@ -479,6 +490,7 @@ class NumberSystem(object): """ 中文数字系统 """ + pass @@ -527,28 +539,22 @@ def create_system(numbering_type=NUMBERING_TYPES[1]): """ # chinese number units of '亿' and larger - all_larger_units = zip( - LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL) - larger_units = [CNU.create(i, v, numbering_type, False) - for i, v in enumerate(all_larger_units)] + all_larger_units = zip(LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL) + larger_units = [CNU.create(i, v, numbering_type, False) for i, v in enumerate(all_larger_units)] # chinese number units of '十, 百, 千, 万' - all_smaller_units = zip( - SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL) - smaller_units = [CNU.create(i, v, small_unit=True) - for i, v in enumerate(all_smaller_units)] + all_smaller_units = zip(SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL) + smaller_units = [CNU.create(i, v, small_unit=True) for i, v in enumerate(all_smaller_units)] # digis - chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS, - BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL) + chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS, BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL) digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)] digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1] # symbols - positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x) - negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x) - point_cn = CM(POINT[0], POINT[1], '.', lambda x, - y: float(str(x) + '.' + str(y))) + positive_cn = CM(POSITIVE[0], POSITIVE[1], "+", lambda x: x) + negative_cn = CM(NEGATIVE[0], NEGATIVE[1], "-", lambda x: -x) + point_cn = CM(POINT[0], POINT[1], ".", lambda x, y: float(str(x) + "." 
+ str(y))) # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y))) system = NumberSystem() system.units = smaller_units + larger_units @@ -559,7 +565,6 @@ def create_system(numbering_type=NUMBERING_TYPES[1]): def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]): - def get_symbol(char, system): for u in system.units: if char in [u.traditional, u.simplified, u.big_s, u.big_t]: @@ -572,13 +577,12 @@ def get_symbol(char, system): return m def string2symbols(chinese_string, system): - int_string, dec_string = chinese_string, '' + int_string, dec_string = chinese_string, "" for p in [system.math.point.simplified, system.math.point.traditional]: if p in chinese_string: int_string, dec_string = chinese_string.split(p) break - return [get_symbol(c, system) for c in int_string], \ - [get_symbol(c, system) for c in dec_string] + return [get_symbol(c, system) for c in int_string], [get_symbol(c, system) for c in dec_string] def correct_symbols(integer_symbols, system): """ @@ -592,8 +596,7 @@ def correct_symbols(integer_symbols, system): if len(integer_symbols) > 1: if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU): - integer_symbols.append( - CNU(integer_symbols[-2].power - 1, None, None, None, None)) + integer_symbols.append(CNU(integer_symbols[-2].power - 1, None, None, None, None)) result = [] unit_count = 0 @@ -610,8 +613,7 @@ def correct_symbols(integer_symbols, system): elif unit_count > 1: for i in range(len(result)): if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power: - result[-i - 1] = CNU(result[-i - 1].power + - current_unit.power, None, None, None, None) + result[-i - 1] = CNU(result[-i - 1].power + current_unit.power, None, None, None, None) return result def compute_value(integer_symbols): @@ -628,8 +630,7 @@ def compute_value(integer_symbols): elif isinstance(s, CNU): value[-1] *= pow(10, s.power) if s.power > last_power: - value[:-1] = list(map(lambda v: v * - pow(10, s.power), value[:-1])) + value[:-1] = list(map(lambda v: v * pow(10, s.power), value[:-1])) last_power = s.power value.append(0) return sum(value) @@ -638,20 +639,26 @@ def compute_value(integer_symbols): int_part, dec_part = string2symbols(chinese_string, system) int_part = correct_symbols(int_part, system) int_str = str(compute_value(int_part)) - dec_str = ''.join([str(d.value) for d in dec_part]) + dec_str = "".join([str(d.value) for d in dec_part]) if dec_part: - return '{0}.{1}'.format(int_str, dec_str) + return "{0}.{1}".format(int_str, dec_str) else: return int_str -def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False, - traditional=False, alt_zero=False, alt_one=False, alt_two=True, - use_zeros=True, use_units=True): - +def num2chn( + number_string, + numbering_type=NUMBERING_TYPES[1], + big=False, + traditional=False, + alt_zero=False, + alt_one=False, + alt_two=True, + use_zeros=True, + use_units=True, +): def get_value(value_string, use_zeros=True): - - striped_string = value_string.lstrip('0') + striped_string = value_string.lstrip("0") # record nothing if all zeros if not striped_string: @@ -666,14 +673,13 @@ def get_value(value_string, use_zeros=True): # recursively record multiple digits else: - result_unit = next(u for u in reversed( - system.units) if u.power < len(striped_string)) - result_string = value_string[:-result_unit.power] - return get_value(result_string) + [result_unit] + get_value(striped_string[-result_unit.power:]) + result_unit = next(u for u in reversed(system.units) if u.power 
< len(striped_string)) + result_string = value_string[: -result_unit.power] + return get_value(result_string) + [result_unit] + get_value(striped_string[-result_unit.power :]) system = create_system(numbering_type) - int_dec = number_string.split('.') + int_dec = number_string.split(".") if len(int_dec) == 1: int_string = int_dec[0] dec_string = "" @@ -681,8 +687,7 @@ def get_value(value_string, use_zeros=True): int_string = int_dec[0] dec_string = int_dec[1] else: - raise ValueError( - "invalid input num string with more than one dot: {}".format(number_string)) + raise ValueError("invalid input num string with more than one dot: {}".format(number_string)) if use_units and len(int_string) > 1: result_symbols = get_value(int_string) @@ -693,12 +698,10 @@ def get_value(value_string, use_zeros=True): result_symbols += [system.math.point] + dec_symbols if alt_two: - liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t, - system.digits[2].big_s, system.digits[2].big_t) + liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t, system.digits[2].big_s, system.digits[2].big_t) for i, v in enumerate(result_symbols): if isinstance(v, CND) and v.value == 2: - next_symbol = result_symbols[i + - 1] if i < len(result_symbols) - 1 else None + next_symbol = result_symbols[i + 1] if i < len(result_symbols) - 1 else None previous_symbol = result_symbols[i - 1] if i > 0 else None if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))): if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)): @@ -706,38 +709,38 @@ def get_value(value_string, use_zeros=True): # if big is True, '两' will not be used and `alt_two` has no impact on output if big: - attr_name = 'big_' + attr_name = "big_" if traditional: - attr_name += 't' + attr_name += "t" else: - attr_name += 's' + attr_name += "s" else: if traditional: - attr_name = 'traditional' + attr_name = "traditional" else: - attr_name = 'simplified' + attr_name = "simplified" - result = ''.join([getattr(s, attr_name) for s in result_symbols]) + result = "".join([getattr(s, attr_name) for s in result_symbols]) # if not use_zeros: # result = result.strip(getattr(system.digits[0], attr_name)) if alt_zero: - result = result.replace( - getattr(system.digits[0], attr_name), system.digits[0].alt_s) + result = result.replace(getattr(system.digits[0], attr_name), system.digits[0].alt_s) if alt_one: - result = result.replace( - getattr(system.digits[1], attr_name), system.digits[1].alt_s) + result = result.replace(getattr(system.digits[1], attr_name), system.digits[1].alt_s) for i, p in enumerate(POINT): if result.startswith(p): return CHINESE_DIGIS[0] + result # ^10, 11, .., 19 - if len(result) >= 2 and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0], - SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] and \ - result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]: + if ( + len(result) >= 2 + and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0], SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] + and result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]] + ): result = result[1:] return result @@ -761,6 +764,7 @@ def chntext2cardinal(self): def cardinal2chntext(self): return num2chn(self.cardinal) + class Digit: """ DIGIT类 @@ -795,19 +799,14 @@ def __init__(self, telephone=None, raw_chntext=None, chntext=None): # return self.telephone def telephone2chntext(self, fixed=False): - if fixed: - sil_parts = 
self.telephone.split('-') - self.raw_chntext = ''.join([ - num2chn(part, alt_two=False, use_units=False) for part in sil_parts - ]) - self.chntext = self.raw_chntext.replace('', '') + sil_parts = self.telephone.split("-") + self.raw_chntext = "".join([num2chn(part, alt_two=False, use_units=False) for part in sil_parts]) + self.chntext = self.raw_chntext.replace("", "") else: - sp_parts = self.telephone.strip('+').split() - self.raw_chntext = ''.join([ - num2chn(part, alt_two=False, use_units=False) for part in sp_parts - ]) - self.chntext = self.raw_chntext.replace('', '') + sp_parts = self.telephone.strip("+").split() + self.raw_chntext = "".join([num2chn(part, alt_two=False, use_units=False) for part in sp_parts]) + self.chntext = self.raw_chntext.replace("", "") return self.chntext @@ -821,12 +820,12 @@ def __init__(self, fraction=None, chntext=None): self.chntext = chntext def chntext2fraction(self): - denominator, numerator = self.chntext.split('分之') - return chn2num(numerator) + '/' + chn2num(denominator) + denominator, numerator = self.chntext.split("分之") + return chn2num(numerator) + "/" + chn2num(denominator) def fraction2chntext(self): - numerator, denominator = self.fraction.split('/') - return num2chn(denominator) + '分之' + num2chn(numerator) + numerator, denominator = self.fraction.split("/") + return num2chn(denominator) + "分之" + num2chn(numerator) class Date: @@ -865,23 +864,23 @@ def __init__(self, date=None, chntext=None): def date2chntext(self): date = self.date try: - year, other = date.strip().split('年', 1) - year = Digit(digit=year).digit2chntext() + '年' + year, other = date.strip().split("年", 1) + year = Digit(digit=year).digit2chntext() + "年" except ValueError: other = date - year = '' + year = "" if other: try: - month, day = other.strip().split('月', 1) - month = Cardinal(cardinal=month).cardinal2chntext() + '月' + month, day = other.strip().split("月", 1) + month = Cardinal(cardinal=month).cardinal2chntext() + "月" except ValueError: day = date - month = '' + month = "" if day: day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1] else: - month = '' - day = '' + month = "" + day = "" chntext = year + month + day self.chntext = chntext return self.chntext @@ -901,7 +900,7 @@ def __init__(self, money=None, chntext=None): def money2chntext(self): money = self.money - pattern = re.compile(r'(\d+(\.\d+)?)') + pattern = re.compile(r"(\d+(\.\d+)?)") matchers = pattern.findall(money) if matchers: for matcher in matchers: @@ -920,20 +919,20 @@ def __init__(self, percentage=None, chntext=None): self.chntext = chntext def chntext2percentage(self): - return chn2num(self.chntext.strip().strip('百分之')) + '%' + return chn2num(self.chntext.strip().strip("百分之")) + "%" def percentage2chntext(self): - return '百分之' + num2chn(self.percentage.strip().strip('%')) + return "百分之" + num2chn(self.percentage.strip().strip("%")) def normalize_nsw(raw_text): - text = '^' + raw_text + '$' + text = "^" + raw_text + "$" # 规范化日期 pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)") matchers = pattern.findall(text) if matchers: - #print('date') + # print('date') for matcher in matchers: text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1) @@ -941,7 +940,7 @@ def normalize_nsw(raw_text): pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" 
+ CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)") matchers = pattern.findall(text) if matchers: - #print('money') + # print('money') for matcher in matchers: text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1) @@ -954,7 +953,7 @@ def normalize_nsw(raw_text): pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D") matchers = pattern.findall(text) if matchers: - #print('telephone') + # print('telephone') for matcher in matchers: text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1) # 固话 @@ -969,16 +968,16 @@ def normalize_nsw(raw_text): pattern = re.compile(r"(\d+/\d+)") matchers = pattern.findall(text) if matchers: - #print('fraction') + # print('fraction') for matcher in matchers: text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1) # 规范化百分数 - text = text.replace('%', '%') + text = text.replace("%", "%") pattern = re.compile(r"(\d+(\.\d+)?%)") matchers = pattern.findall(text) if matchers: - #print('percentage') + # print('percentage') for matcher in matchers: text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1) @@ -986,7 +985,7 @@ def normalize_nsw(raw_text): pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS) matchers = pattern.findall(text) if matchers: - #print('cardinal+quantifier') + # print('cardinal+quantifier') for matcher in matchers: text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1) @@ -994,7 +993,7 @@ def normalize_nsw(raw_text): pattern = re.compile(r"(\d{4,32})") matchers = pattern.findall(text) if matchers: - #print('digit') + # print('digit') for matcher in matchers: text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1) @@ -1002,20 +1001,19 @@ def normalize_nsw(raw_text): pattern = re.compile(r"(\d+(\.\d+)?)") matchers = pattern.findall(text) if matchers: - #print('cardinal') + # print('cardinal') for matcher in matchers: text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1) - # restore P2P, O2O, B2C, B2B etc pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))") matchers = pattern.findall(text) if matchers: # print('particular') for matcher in matchers: - text = text.replace(matcher[0], matcher[1]+'2'+matcher[2], 1) + text = text.replace(matcher[0], matcher[1] + "2" + matcher[2], 1) - return text.lstrip('^').rstrip('$') + return text.lstrip("^").rstrip("$") def remove_erhua(text): @@ -1024,9 +1022,9 @@ def remove_erhua(text): 他女儿在那边儿 -> 他女儿在那边 """ - new_str='' - while re.search('儿',text): - a = re.search('儿',text).span() + new_str = "" + while re.search("儿", text): + a = re.search("儿", text).span() remove_er_flag = 0 if ER_WHITELIST_PATTERN.search(text): @@ -1034,12 +1032,12 @@ def remove_erhua(text): if b[0] <= a[0]: remove_er_flag = 1 - if remove_er_flag == 0 : - new_str = new_str + text[0:a[0]] - text = text[a[1]:] + if remove_er_flag == 0: + new_str = new_str + text[0 : a[0]] + text = text[a[1] :] else: - new_str = new_str + text[0:b[1]] - text = text[b[1]:] + new_str = new_str + text[0 : b[1]] + text = text[b[1] :] text = new_str + text return text @@ -1048,25 +1046,26 @@ def remove_erhua(text): def remove_space(text): tokens = text.split() new = [] - for k,t in enumerate(tokens): + for k, t in enumerate(tokens): if k != 0: - if IN_EN_CHARS.get(tokens[k-1][-1]) and IN_EN_CHARS.get(t[0]): - new.append(' ') + if IN_EN_CHARS.get(tokens[k - 1][-1]) and IN_EN_CHARS.get(t[0]): + new.append(" ") new.append(t) - return ''.join(new) + return 
"".join(new) class TextNorm: - def __init__(self, - to_banjiao:bool = False, - to_upper:bool = False, - to_lower:bool = False, - remove_fillers:bool = False, - remove_erhua:bool = False, - check_chars:bool = False, - remove_space:bool = False, - cc_mode:str = '', - ) : + def __init__( + self, + to_banjiao: bool = False, + to_upper: bool = False, + to_lower: bool = False, + remove_fillers: bool = False, + remove_erhua: bool = False, + check_chars: bool = False, + remove_space: bool = False, + cc_mode: str = "", + ): self.to_banjiao = to_banjiao self.to_upper = to_upper self.to_lower = to_lower @@ -1078,6 +1077,7 @@ def __init__(self, self.cc = None if cc_mode: from opencc import OpenCC # Open Chinese Convert: pip install opencc + self.cc = OpenCC(cc_mode) def __call__(self, text): @@ -1095,7 +1095,7 @@ def __call__(self, text): if self.remove_fillers: for c in FILLER_CHARS: - text = text.replace(c, '') + text = text.replace(c, "") if self.remove_erhua: text = remove_erhua(text) @@ -1107,8 +1107,8 @@ def __call__(self, text): if self.check_chars: for c in text: if not IN_VALID_CHARS.get(c): - print(f'WARNING: illegal char {c} in: {text}', file=sys.stderr) - return '' + print(f"WARNING: illegal char {c} in: {text}", file=sys.stderr) + return "" if self.remove_space: text = remove_space(text) @@ -1116,79 +1116,81 @@ def __call__(self, text): return text -if __name__ == '__main__': +if __name__ == "__main__": p = argparse.ArgumentParser() # normalizer options - p.add_argument('--to_banjiao', action='store_true', help='convert quanjiao chars to banjiao') - p.add_argument('--to_upper', action='store_true', help='convert to upper case') - p.add_argument('--to_lower', action='store_true', help='convert to lower case') - p.add_argument('--remove_fillers', action='store_true', help='remove filler chars such as "呃, 啊"') - p.add_argument('--remove_erhua', action='store_true', help='remove erhua chars such as "他女儿在那边儿 -> 他女儿在那边"') - p.add_argument('--check_chars', action='store_true' , help='skip sentences containing illegal chars') - p.add_argument('--remove_space', action='store_true' , help='remove whitespace') - p.add_argument('--cc_mode', choices=['', 't2s', 's2t'], default='', help='convert between traditional to simplified') + p.add_argument("--to_banjiao", action="store_true", help="convert quanjiao chars to banjiao") + p.add_argument("--to_upper", action="store_true", help="convert to upper case") + p.add_argument("--to_lower", action="store_true", help="convert to lower case") + p.add_argument("--remove_fillers", action="store_true", help='remove filler chars such as "呃, 啊"') + p.add_argument("--remove_erhua", action="store_true", help='remove erhua chars such as "他女儿在那边儿 -> 他女儿在那边"') + p.add_argument("--check_chars", action="store_true", help="skip sentences containing illegal chars") + p.add_argument("--remove_space", action="store_true", help="remove whitespace") + p.add_argument( + "--cc_mode", choices=["", "t2s", "s2t"], default="", help="convert between traditional to simplified" + ) # I/O options - p.add_argument('--log_interval', type=int, default=10000, help='log interval in number of processed lines') - p.add_argument('--has_key', action='store_true', help="will be deprecated, set --format ark instead") - p.add_argument('--format', type=str, choices=['txt', 'ark', 'tsv'], default='txt', help='input format') - p.add_argument('ifile', help='input filename, assume utf-8 encoding') - p.add_argument('ofile', help='output filename') + p.add_argument("--log_interval", type=int, 
default=10000, help="log interval in number of processed lines") + p.add_argument("--has_key", action="store_true", help="will be deprecated, set --format ark instead") + p.add_argument("--format", type=str, choices=["txt", "ark", "tsv"], default="txt", help="input format") + p.add_argument("ifile", help="input filename, assume utf-8 encoding") + p.add_argument("ofile", help="output filename") args = p.parse_args() if args.has_key: - args.format = 'ark' + args.format = "ark" normalizer = TextNorm( - to_banjiao = args.to_banjiao, - to_upper = args.to_upper, - to_lower = args.to_lower, - remove_fillers = args.remove_fillers, - remove_erhua = args.remove_erhua, - check_chars = args.check_chars, - remove_space = args.remove_space, - cc_mode = args.cc_mode, + to_banjiao=args.to_banjiao, + to_upper=args.to_upper, + to_lower=args.to_lower, + remove_fillers=args.remove_fillers, + remove_erhua=args.remove_erhua, + check_chars=args.check_chars, + remove_space=args.remove_space, + cc_mode=args.cc_mode, ) normalizer = TextNorm( - to_banjiao = args.to_banjiao, - to_upper = args.to_upper, - to_lower = args.to_lower, - remove_fillers = args.remove_fillers, - remove_erhua = args.remove_erhua, - check_chars = args.check_chars, - remove_space = args.remove_space, - cc_mode = args.cc_mode, + to_banjiao=args.to_banjiao, + to_upper=args.to_upper, + to_lower=args.to_lower, + remove_fillers=args.remove_fillers, + remove_erhua=args.remove_erhua, + check_chars=args.check_chars, + remove_space=args.remove_space, + cc_mode=args.cc_mode, ) ndone = 0 - with open(args.ifile, 'r', encoding = 'utf8') as istream, open(args.ofile, 'w+', encoding = 'utf8') as ostream: - if args.format == 'tsv': - reader = csv.DictReader(istream, delimiter = '\t') - assert('TEXT' in reader.fieldnames) - print('\t'.join(reader.fieldnames), file=ostream) + with open(args.ifile, "r", encoding="utf8") as istream, open(args.ofile, "w+", encoding="utf8") as ostream: + if args.format == "tsv": + reader = csv.DictReader(istream, delimiter="\t") + assert "TEXT" in reader.fieldnames + print("\t".join(reader.fieldnames), file=ostream) for item in reader: - text = item['TEXT'] + text = item["TEXT"] if text: text = normalizer(text) if text: - item['TEXT'] = text - print('\t'.join([ item[f] for f in reader.fieldnames ]), file = ostream) + item["TEXT"] = text + print("\t".join([item[f] for f in reader.fieldnames]), file=ostream) ndone += 1 if ndone % args.log_interval == 0: - print(f'text norm: {ndone} lines done.', file = sys.stderr, flush = True) + print(f"text norm: {ndone} lines done.", file=sys.stderr, flush=True) else: for l in istream: - key, text = '', '' - if args.format == 'ark': # KALDI archive, line format: "key text" + key, text = "", "" + if args.format == "ark": # KALDI archive, line format: "key text" cols = l.strip().split(maxsplit=1) - key, text = cols[0], cols[1] if len(cols) == 2 else '' + key, text = cols[0], cols[1] if len(cols) == 2 else "" else: text = l.strip() @@ -1196,12 +1198,12 @@ def __call__(self, text): text = normalizer(text) if text: - if args.format == 'ark': - print(key + '\t' + text, file = ostream) + if args.format == "ark": + print(key + "\t" + text, file=ostream) else: - print(text, file = ostream) + print(text, file=ostream) ndone += 1 if ndone % args.log_interval == 0: - print(f'text norm: {ndone} lines done.', file = sys.stderr, flush = True) - print(f'text norm: {ndone} lines done in total.', file = sys.stderr, flush = True) + print(f"text norm: {ndone} lines done.", file=sys.stderr, flush=True) + print(f"text 
norm: {ndone} lines done in total.", file=sys.stderr, flush=True) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 76c5595ec3..6b8cc59101 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -1,55 +1,36 @@ import os -from contextlib import contextmanager from dataclasses import dataclass +import librosa import torch import torch.nn.functional as F import torchaudio from coqpit import Coqpit -from TTS.tts.layers.tortoise.audio_utils import denormalize_tacotron_mel, wav_to_univnet_mel -from TTS.tts.layers.tortoise.diffusion_decoder import DiffusionTts -from TTS.tts.layers.xtts.diffusion import SpacedDiffusion, get_named_beta_schedule, space_timesteps from TTS.tts.layers.xtts.gpt import GPT -from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer -from TTS.tts.layers.xtts.vocoder import UnivNetGenerator from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder from TTS.tts.layers.xtts.stream_generator import init_stream_support +from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence from TTS.tts.models.base_tts import BaseTTS from TTS.utils.io import load_fsspec init_stream_support() -def load_audio(audiopath, sr=22050): - """ - Load an audio file from disk and resample it to the specified sampling rate. - - Args: - audiopath (str): Path to the audio file. - sr (int): Target sampling rate. - - Returns: - Tensor: Audio waveform tensor with shape (1, T), where T is the number of samples. - """ - audio, sampling_rate = torchaudio.load(audiopath) - - if len(audio.shape) > 1: - if audio.shape[0] < 5: - audio = audio[0] - else: - assert audio.shape[1] < 5 - audio = audio[:, 0] - - if sampling_rate != sr: - resampler = torchaudio.transforms.Resample(sampling_rate, sr) - audio = resampler(audio) - - audio = audio.clamp_(-1, 1) - return audio.unsqueeze(0) - def wav_to_mel_cloning( - wav, mel_norms_file="../experiments/clips_mel_norms.pth", mel_norms=None, device=torch.device("cpu") + wav, + mel_norms_file="../experiments/clips_mel_norms.pth", + mel_norms=None, + device=torch.device("cpu"), + n_fft=4096, + hop_length=1024, + win_length=4096, + power=2, + normalized=False, + sample_rate=22050, + f_min=0, + f_max=8000, + n_mels=80, ): """ Convert waveform to mel-spectrogram with hard-coded parameters for cloning. @@ -64,15 +45,15 @@ def wav_to_mel_cloning( torch.Tensor: Mel-spectrogram tensor. """ mel_stft = torchaudio.transforms.MelSpectrogram( - n_fft=4096, - hop_length=1024, - win_length=4096, - power=2, - normalized=False, - sample_rate=22050, - f_min=0, - f_max=8000, - n_mels=80, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + power=power, + normalized=normalized, + sample_rate=sample_rate, + f_min=f_min, + f_max=f_max, + n_mels=n_mels, norm="slaney", ).to(device) wav = wav.to(device) @@ -84,6 +65,28 @@ def wav_to_mel_cloning( return mel +def load_audio(audiopath, sampling_rate): + # better load setting following: https://github.com/faroit/python_audio_loading_benchmark + + # torchaudio should chose proper backend to load audio depending on platform + audio, lsr = torchaudio.load(audiopath) + + # stereo to mono if needed + if audio.size(0) != 1: + audio = torch.mean(audio, dim=0, keepdim=True) + + if lsr != sampling_rate: + audio = torchaudio.functional.resample(audio, lsr, sampling_rate) + + # Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk. 
+ # '10' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds. + if torch.any(audio > 10) or not torch.any(audio < 0): + print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}") + # clip audio invalid values + audio.clip_(-1, 1) + return audio + + def pad_or_truncate(t, length): """ Ensure a given tensor t has a specified sequence length by either padding it with zeros or clipping it. @@ -103,78 +106,6 @@ def pad_or_truncate(t, length): return tp -def load_discrete_vocoder_diffuser( - trained_diffusion_steps=4000, - desired_diffusion_steps=200, - cond_free=True, - cond_free_k=1, - sampler="ddim", -): - """ - Load a GaussianDiffusion instance configured for use as a decoder. - - Args: - trained_diffusion_steps (int): The number of diffusion steps used during training. - desired_diffusion_steps (int): The number of diffusion steps to use during inference. - cond_free (bool): Whether to use a conditioning-free model. - cond_free_k (int): The number of samples to use for conditioning-free models. - sampler (str): The name of the sampler to use. - - Returns: - A SpacedDiffusion instance configured with the given parameters. - """ - return SpacedDiffusion( - use_timesteps=space_timesteps(trained_diffusion_steps, [desired_diffusion_steps]), - model_mean_type="epsilon", - model_var_type="learned_range", - loss_type="mse", - betas=get_named_beta_schedule("linear", trained_diffusion_steps), - conditioning_free=cond_free, - conditioning_free_k=cond_free_k, - sampler=sampler, - ) - - -def do_spectrogram_diffusion( - diffusion_model, - diffuser, - latents, - conditioning_latents, - temperature=1, -): - """ - Generate a mel-spectrogram using a diffusion model and a diffuser. - - Args: - diffusion_model (nn.Module): A diffusion model that converts from 22kHz spectrogram codes to a 24kHz spectrogram signal. - diffuser (Diffuser): A diffuser that generates a mel-spectrogram from noise. - latents (torch.Tensor): A tensor of shape (batch_size, seq_len, code_size) containing the input spectrogram codes. - conditioning_latents (torch.Tensor): A tensor of shape (batch_size, code_size) containing the conditioning codes. - temperature (float, optional): The temperature of the noise used by the diffuser. Defaults to 1. - - Returns: - torch.Tensor: A tensor of shape (batch_size, mel_channels, mel_seq_len) containing the generated mel-spectrogram. - """ - with torch.no_grad(): - output_seq_len = ( - latents.shape[1] * 4 * 24000 // 22050 - ) # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal. - output_shape = (latents.shape[0], 100, output_seq_len) - precomputed_embeddings = diffusion_model.timestep_independent( - latents, conditioning_latents, output_seq_len, False - ) - - noise = torch.randn(output_shape, device=latents.device) * temperature - mel = diffuser.sample_loop( - diffusion_model, - output_shape, - noise=noise, - model_kwargs={"precomputed_aligned_embeddings": precomputed_embeddings}, - progress=False, - ) - return denormalize_tacotron_mel(mel)[:, :, :output_seq_len] - - @dataclass class XttsAudioConfig(Coqpit): """ @@ -182,12 +113,10 @@ class XttsAudioConfig(Coqpit): Args: sample_rate (int): The sample rate in which the GPT operates. - diffusion_sample_rate (int): The sample rate of the diffusion audio waveform. output_sample_rate (int): The sample rate of the output audio waveform. 
""" sample_rate: int = 22050 - diffusion_sample_rate: int = 24000 output_sample_rate: int = 24000 @@ -203,32 +132,21 @@ class XttsArgs(Coqpit): clvp_checkpoint (str, optional): The checkpoint for the ConditionalLatentVariablePerseq model. Defaults to None. decoder_checkpoint (str, optional): The checkpoint for the DiffTTS model. Defaults to None. num_chars (int, optional): The maximum number of characters to generate. Defaults to 255. - use_hifigan (bool, optional): Whether to use hifigan or diffusion + univnet as a decoder. Defaults to True. For GPT model: - ar_max_audio_tokens (int, optional): The maximum mel tokens for the autoregressive model. Defaults to 604. - ar_max_text_tokens (int, optional): The maximum text tokens for the autoregressive model. Defaults to 402. - ar_max_prompt_tokens (int, optional): The maximum prompt tokens or the autoregressive model. Defaults to 70. - ar_layers (int, optional): The number of layers for the autoregressive model. Defaults to 30. - ar_n_model_channels (int, optional): The model dimension for the autoregressive model. Defaults to 1024. - ar_n_heads (int, optional): The number of heads for the autoregressive model. Defaults to 16. - ar_number_text_tokens (int, optional): The number of text tokens for the autoregressive model. Defaults to 255. - ar_start_text_token (int, optional): The start text token for the autoregressive model. Defaults to 255. + gpt_max_audio_tokens (int, optional): The maximum mel tokens for the autoregressive model. Defaults to 604. + gpt_max_text_tokens (int, optional): The maximum text tokens for the autoregressive model. Defaults to 402. + gpt_max_prompt_tokens (int, optional): The maximum prompt tokens or the autoregressive model. Defaults to 70. + gpt_layers (int, optional): The number of layers for the autoregressive model. Defaults to 30. + gpt_n_model_channels (int, optional): The model dimension for the autoregressive model. Defaults to 1024. + gpt_n_heads (int, optional): The number of heads for the autoregressive model. Defaults to 16. + gpt_number_text_tokens (int, optional): The number of text tokens for the autoregressive model. Defaults to 255. + gpt_start_text_token (int, optional): The start text token for the autoregressive model. Defaults to 255. gpt_checkpointing (bool, optional): Whether to use checkpointing for the autoregressive model. Defaults to False. - ar_train_solo_embeddings (bool, optional): Whether to train embeddings for the autoregressive model. Defaults to False. - - For DiffTTS model: - diff_model_channels (int, optional): The number of channels for the DiffTTS model. Defaults to 1024. - diff_num_layers (int, optional): The number of layers for the DiffTTS model. Defaults to 10. - diff_in_channels (int, optional): The input channels for the DiffTTS model. Defaults to 100. - diff_out_channels (int, optional): The output channels for the DiffTTS model. Defaults to 200. - diff_in_latent_channels (int, optional): The input latent channels for the DiffTTS model. Defaults to 1024. - diff_in_tokens (int, optional): The input tokens for the DiffTTS model. Defaults to 8193. - diff_dropout (int, optional): The dropout percentage for the DiffTTS model. Defaults to 0. - diff_use_fp16 (bool, optional): Whether to use fp16 for the DiffTTS model. Defaults to False. - diff_num_heads (int, optional): The number of heads for the DiffTTS model. Defaults to 16. - diff_layer_drop (int, optional): The layer dropout percentage for the DiffTTS model. Defaults to 0. 
- diff_unconditioned_percentage (int, optional): The percentage of unconditioned inputs for the DiffTTS model. Defaults to 0. + gpt_train_solo_embeddings (bool, optional): Whether to train embeddings for the autoregressive model. Defaults to False. + gpt_code_stride_len (int, optional): The hop_size of dvae and consequently of the gpt output. Defaults to 1024. + gpt_use_masking_gt_prompt_approach (bool, optional): If True, it will use ground truth as prompt and it will mask the loss to avoid repetition. Defaults to True. + gpt_use_perceiver_resampler (bool, optional): If True, it will use perceiver resampler from flamingo paper - https://arxiv.org/abs/2204.14198. Defaults to False. """ gpt_batch_size: int = 1 @@ -238,8 +156,6 @@ class XttsArgs(Coqpit): clvp_checkpoint: str = None decoder_checkpoint: str = None num_chars: int = 255 - use_hifigan: bool = True - use_ne_hifigan: bool = False # XTTS GPT Encoder params tokenizer_file: str = "" @@ -255,25 +171,14 @@ class XttsArgs(Coqpit): gpt_num_audio_tokens: int = 8194 gpt_start_audio_token: int = 8192 gpt_stop_audio_token: int = 8193 - - # Diffusion Decoder params - diff_model_channels: int = 1024 - diff_num_layers: int = 10 - diff_in_channels: int = 100 - diff_out_channels: int = 200 - diff_in_latent_channels: int = 1024 - diff_in_tokens: int = 8193 - diff_dropout: int = 0 - diff_use_fp16: bool = False - diff_num_heads: int = 16 - diff_layer_drop: int = 0 - diff_unconditioned_percentage: int = 0 + gpt_code_stride_len: int = 1024 + gpt_use_masking_gt_prompt_approach: bool = True + gpt_use_perceiver_resampler: bool = False # HifiGAN Decoder params input_sample_rate: int = 22050 output_sample_rate: int = 24000 output_hop_length: int = 256 - ar_mel_length_compression: int = 1024 decoder_input_dim: int = 1024 d_vector_dim: int = 512 cond_d_vector_in_each_upsampling_layer: bool = True @@ -330,113 +235,148 @@ def init_models(self): num_audio_tokens=self.args.gpt_num_audio_tokens, start_audio_token=self.args.gpt_start_audio_token, stop_audio_token=self.args.gpt_stop_audio_token, + use_perceiver_resampler=self.args.gpt_use_perceiver_resampler, + code_stride_len=self.args.gpt_code_stride_len, ) - - if self.args.use_hifigan: - self.hifigan_decoder = HifiDecoder( - input_sample_rate=self.args.input_sample_rate, - output_sample_rate=self.args.output_sample_rate, - output_hop_length=self.args.output_hop_length, - ar_mel_length_compression=self.args.ar_mel_length_compression, - decoder_input_dim=self.args.decoder_input_dim, - d_vector_dim=self.args.d_vector_dim, - cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer, - ) - - if self.args.use_ne_hifigan: - self.ne_hifigan_decoder = HifiDecoder( - input_sample_rate=self.args.input_sample_rate, - output_sample_rate=self.args.output_sample_rate, - output_hop_length=self.args.output_hop_length, - ar_mel_length_compression=self.args.ar_mel_length_compression, - decoder_input_dim=self.args.decoder_input_dim, - d_vector_dim=self.args.d_vector_dim, - cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer, - ) - - if not (self.args.use_hifigan or self.args.use_ne_hifigan): - self.diffusion_decoder = DiffusionTts( - model_channels=self.args.diff_model_channels, - num_layers=self.args.diff_num_layers, - in_channels=self.args.diff_in_channels, - out_channels=self.args.diff_out_channels, - in_latent_channels=self.args.diff_in_latent_channels, - in_tokens=self.args.diff_in_tokens, - dropout=self.args.diff_dropout, - use_fp16=self.args.diff_use_fp16, - 
-                num_heads=self.args.diff_num_heads,
-                layer_drop=self.args.diff_layer_drop,
-                unconditioned_percentage=self.args.diff_unconditioned_percentage,
-            )
-            self.vocoder = UnivNetGenerator()
+        self.hifigan_decoder = HifiDecoder(
+            input_sample_rate=self.args.input_sample_rate,
+            output_sample_rate=self.args.output_sample_rate,
+            output_hop_length=self.args.output_hop_length,
+            ar_mel_length_compression=self.args.gpt_code_stride_len,
+            decoder_input_dim=self.args.decoder_input_dim,
+            d_vector_dim=self.args.d_vector_dim,
+            cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer,
+        )

    @property
    def device(self):
        return next(self.parameters()).device

    @torch.inference_mode()
-    def get_gpt_cond_latents(self, audio_path: str, length: int = 3):
+    def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = 6):
        """Compute the conditioning latents for the GPT model from the given audio.

        Args:
-            audio_path (str): Path to the audio file.
-            length (int): Length of the audio in seconds. Defaults to 3.
+            audio (tensor): audio tensor.
+            sr (int): Sample rate of the audio.
+            length (int): Length of the audio in seconds. If < 0, use the whole audio. Defaults to 30.
+            chunk_length (int): Length of the audio chunks in seconds. When `length == chunk_length`, the whole audio
+                is used without chunking. It must be <= `length`. Defaults to 6.
        """
+        if sr != 22050:
+            audio = torchaudio.functional.resample(audio, sr, 22050)
+        if length > 0:
+            audio = audio[:, : 22050 * length]
+        if self.args.gpt_use_perceiver_resampler:
+            style_embs = []
+            for i in range(0, audio.shape[1], 22050 * chunk_length):
+                audio_chunk = audio[:, i : i + 22050 * chunk_length]
+
+                # if the chunk is too short, ignore it
+                if audio_chunk.size(-1) < 22050 * 0.33:
+                    continue
+
+                mel_chunk = wav_to_mel_cloning(
+                    audio_chunk,
+                    mel_norms=self.mel_stats.cpu(),
+                    n_fft=2048,
+                    hop_length=256,
+                    win_length=1024,
+                    power=2,
+                    normalized=False,
+                    sample_rate=22050,
+                    f_min=0,
+                    f_max=8000,
+                    n_mels=80,
+                )
+                style_emb = self.gpt.get_style_emb(mel_chunk.to(self.device), None)
+                style_embs.append(style_emb)
-
-        audio = load_audio(audio_path)
-        audio = audio[:, : 22050 * length]
-        mel = wav_to_mel_cloning(audio, mel_norms=self.mel_stats.cpu())
-        cond_latent = self.gpt.get_style_emb(mel.to(self.device), sample=False)
+            # mean style embedding
+            cond_latent = torch.stack(style_embs).mean(dim=0)
+        else:
+            mel = wav_to_mel_cloning(
+                audio,
+                mel_norms=self.mel_stats.cpu(),
+                n_fft=4096,
+                hop_length=1024,
+                win_length=4096,
+                power=2,
+                normalized=False,
+                sample_rate=22050,
+                f_min=0,
+                f_max=8000,
+                n_mels=80,
+            )
+            cond_latent = self.gpt.get_style_emb(mel.to(self.device))
        return cond_latent.transpose(1, 2)
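The chunking logic above is worth seeing in isolation: with the perceiver resampler enabled, the reference audio is embedded in fixed-size windows and the per-chunk style embeddings are averaged. A minimal sketch of that idea, where `embed_fn` stands in for `self.gpt.get_style_emb` (a placeholder, not the real call chain):

```python
import torch

def mean_style_embedding(audio: torch.Tensor, embed_fn, sr: int = 22050,
                         chunk_s: int = 6, min_s: float = 0.33) -> torch.Tensor:
    """Average per-chunk style embeddings over an audio tensor of shape [1, T]."""
    embs = []
    for start in range(0, audio.shape[1], sr * chunk_s):
        chunk = audio[:, start : start + sr * chunk_s]
        if chunk.size(-1) < sr * min_s:  # skip chunks that are too short to embed
            continue
        embs.append(embed_fn(chunk))
    return torch.stack(embs).mean(dim=0)
```

Averaging over chunks keeps the conditioning cost bounded while still letting long references contribute, which is why `gpt_cond_len` can grow to 30 s without blowing up memory.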
    @torch.inference_mode()
-    def get_diffusion_cond_latents(
-        self,
-        audio_path,
-    ):
-        from math import ceil
-
-        diffusion_conds = []
-        CHUNK_SIZE = 102400
-        audio = load_audio(audio_path, 24000)
-        for chunk in range(ceil(audio.shape[1] / CHUNK_SIZE)):
-            current_sample = audio[:, chunk * CHUNK_SIZE : (chunk + 1) * CHUNK_SIZE]
-            current_sample = pad_or_truncate(current_sample, CHUNK_SIZE)
-            cond_mel = wav_to_univnet_mel(
-                current_sample.to(self.device),
-                do_normalization=False,
-                device=self.device,
-            )
-            diffusion_conds.append(cond_mel)
-        diffusion_conds = torch.stack(diffusion_conds, dim=1)
-        diffusion_latent = self.diffusion_decoder.get_conditioning(diffusion_conds)
-        return diffusion_latent
+    def get_speaker_embedding(self, audio, sr):
+        audio_16k = torchaudio.functional.resample(audio, sr, 16000)
+        return (
+            self.hifigan_decoder.speaker_encoder.forward(audio_16k.to(self.device), l2_norm=True)
+            .unsqueeze(-1)
+            .to(self.device)
+        )

    @torch.inference_mode()
-    def get_speaker_embedding(
-        self,
-        audio_path
-    ):
-        audio = load_audio(audio_path, self.hifigan_decoder.speaker_encoder_audio_config["sample_rate"])
-        speaker_embedding = self.hifigan_decoder.speaker_encoder.forward(
-            audio.to(self.device), l2_norm=True
-        ).unsqueeze(-1).to(self.device)
-        return speaker_embedding
-
    def get_conditioning_latents(
        self,
        audio_path,
-        gpt_cond_len=3,
-    ):
-        speaker_embedding = None
-        diffusion_cond_latents = None
-        if self.args.use_hifigan:
-            speaker_embedding = self.get_speaker_embedding(audio_path)
+        max_ref_length=30,
+        gpt_cond_len=6,
+        gpt_cond_chunk_len=6,
+        librosa_trim_db=None,
+        sound_norm_refs=False,
+        load_sr=22050,
+    ):
+        """Get the conditioning latents for the GPT model from the given audio.
+
+        Args:
+            audio_path (str or List[str]): Path to reference audio file(s).
+            max_ref_length (int): Maximum length of each reference audio in seconds. Defaults to 30.
+            gpt_cond_len (int): Length of the audio used for gpt latents. Defaults to 6.
+            gpt_cond_chunk_len (int): Chunk length used for gpt latents. It must be <= gpt_cond_len. Defaults to 6.
+            librosa_trim_db (int, optional): Trim the audio using this value. If None, not trimming. Defaults to None.
+            sound_norm_refs (bool, optional): Whether to normalize the audio. Defaults to False.
+            load_sr (int, optional): Sample rate to load the audio. Defaults to 22050.
+        """
+        # deal with multiple references
+        if not isinstance(audio_path, list):
+            audio_paths = [audio_path]
        else:
-            diffusion_cond_latents = self.get_diffusion_cond_latents(audio_path)
-        gpt_cond_latents = self.get_gpt_cond_latents(audio_path, length=gpt_cond_len)  # [1, 1024, T]
-        return gpt_cond_latents, diffusion_cond_latents, speaker_embedding
+            audio_paths = audio_path
+
+        speaker_embeddings = []
+        audios = []
+        speaker_embedding = None
+        for file_path in audio_paths:
+            audio = load_audio(file_path, load_sr)
+            audio = audio[:, : load_sr * max_ref_length].to(self.device)
+            if sound_norm_refs:
+                audio = (audio / torch.abs(audio).max()) * 0.75
+            if librosa_trim_db is not None:
+                audio = librosa.effects.trim(audio, top_db=librosa_trim_db)[0]
+
+            # compute latents for the decoder
+            speaker_embedding = self.get_speaker_embedding(audio, load_sr)
+            speaker_embeddings.append(speaker_embedding)
+
+            audios.append(audio)
+
+        # merge all the audios and compute the latents for the gpt
+        full_audio = torch.cat(audios, dim=-1)
+        gpt_cond_latents = self.get_gpt_cond_latents(
+            full_audio, load_sr, length=gpt_cond_len, chunk_length=gpt_cond_chunk_len
+        )  # [1, 1024, T]
+
+        if speaker_embeddings:
+            speaker_embedding = torch.stack(speaker_embeddings)
+            speaker_embedding = speaker_embedding.mean(dim=0)
+
+        return gpt_cond_latents, speaker_embedding

    def synthesize(self, text, config, speaker_wav, language, **kwargs):
        """Synthesize speech with the given input text.
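With this change, `get_conditioning_latents` accepts either a single path or a list: per-file speaker embeddings are averaged, while the clips are concatenated before computing the GPT latents. A hypothetical call (file names and the loaded `model` are placeholders):

```python
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
    audio_path=["speaker_a_1.wav", "speaker_a_2.wav"],  # one or many references
    max_ref_length=30,     # cap each clip at 30 s
    gpt_cond_len=30,       # audio length used for the GPT latents
    gpt_cond_chunk_len=6,  # chunk size for the perceiver-resampler path
)
```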
""" - - # Make the synthesizer happy 🥳 - if isinstance(speaker_wav, list): - speaker_wav = speaker_wav[0] - return self.inference_with_config(text, config, ref_audio_path=speaker_wav, language=language, **kwargs) def inference_with_config(self, text, config, ref_audio_path, language, **kwargs): @@ -466,7 +401,7 @@ def inference_with_config(self, text, config, ref_audio_path, language, **kwargs inference with config """ assert ( - language in self.config.languages + "zh-cn" if language == "zh" else language in self.config.languages ), f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}" # Use generally found best tuning knobs for generation. settings = { @@ -475,10 +410,10 @@ def inference_with_config(self, text, config, ref_audio_path, language, **kwargs "repetition_penalty": config.repetition_penalty, "top_k": config.top_k, "top_p": config.top_p, - "cond_free_k": config.cond_free_k, - "diffusion_temperature": config.diffusion_temperature, - "decoder_iterations": config.decoder_iterations, - "decoder_sampler": config.decoder_sampler, + "gpt_cond_len": config.gpt_cond_len, + "gpt_cond_chunk_len": config.gpt_cond_chunk_len, + "max_ref_len": config.max_ref_len, + "sound_norm_refs": config.sound_norm_refs, } settings.update(kwargs) # allow overriding of preset settings with kwargs return self.full_inference(text, ref_audio_path, language, **settings) @@ -490,20 +425,17 @@ def full_inference( ref_audio_path, language, # GPT inference - temperature=0.65, - length_penalty=1, - repetition_penalty=2.0, + temperature=0.75, + length_penalty=1.0, + repetition_penalty=10.0, top_k=50, top_p=0.85, - gpt_cond_len=4, do_sample=True, - # Decoder inference - decoder_iterations=100, - cond_free=True, - cond_free_k=2, - diffusion_temperature=1.0, - decoder_sampler="ddim", - decoder="hifigan", + # Cloning + gpt_cond_len=30, + gpt_cond_chunk_len=6, + max_ref_len=10, + sound_norm_refs=False, **hf_generate_kwargs, ): """ @@ -532,28 +464,10 @@ def full_inference( (aka boring) outputs. Defaults to 0.8. gpt_cond_len: (int) Length of the audio used for cloning. If audio is shorter, then audio length is used - else the first `gpt_cond_len` secs is used. Defaults to 3 seconds. - - decoder_iterations: (int) Number of diffusion steps to perform. [0,4000]. More steps means the network has - more chances to iteratively refine the output, which should theoretically mean a higher quality output. - Generally a value above 250 is not noticeably better, however. Defaults to 100. + else the first `gpt_cond_len` secs is used. Defaults to 30 seconds. - cond_free: (bool) Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion - performs two forward passes for each diffusion step: one with the outputs of the autoregressive model - and one with no conditioning priors. The output of the two is blended according to the cond_free_k - value below. Conditioning-free diffusion is the real deal, and dramatically improves realism. - Defaults to True. - - cond_free_k: (float) Knob that determines how to balance the conditioning free signal with the - conditioning-present signal. [0,inf]. As cond_free_k increases, the output becomes dominated by the - conditioning-free signal. Defaults to 2.0. - - diffusion_temperature: (float) Controls the variance of the noise fed into the diffusion model. [0,1]. - Values at 0 re the "mean" prediction of the diffusion network and will sound bland and smeared. - Defaults to 1.0. 
@@ -490,20 +425,17 @@ def full_inference(
        self,
        text,
        ref_audio_path,
        language,
        # GPT inference
-        temperature=0.65,
-        length_penalty=1,
-        repetition_penalty=2.0,
+        temperature=0.75,
+        length_penalty=1.0,
+        repetition_penalty=10.0,
        top_k=50,
        top_p=0.85,
-        gpt_cond_len=4,
        do_sample=True,
-        # Decoder inference
-        decoder_iterations=100,
-        cond_free=True,
-        cond_free_k=2,
-        diffusion_temperature=1.0,
-        decoder_sampler="ddim",
-        decoder="hifigan",
+        # Cloning
+        gpt_cond_len=30,
+        gpt_cond_chunk_len=6,
+        max_ref_len=10,
+        sound_norm_refs=False,
        **hf_generate_kwargs,
    ):
        """
@@ -532,28 +464,10 @@ def full_inference(
            (aka boring) outputs. Defaults to 0.8.

            gpt_cond_len: (int) Length of the audio used for cloning. If audio is shorter, then audio length is used
-                else the first `gpt_cond_len` secs is used. Defaults to 3 seconds.
-
-            decoder_iterations: (int) Number of diffusion steps to perform. [0,4000]. More steps means the network has
-                more chances to iteratively refine the output, which should theoretically mean a higher quality output.
-                Generally a value above 250 is not noticeably better, however. Defaults to 100.
+                else the first `gpt_cond_len` secs is used. Defaults to 30 seconds.

-            cond_free: (bool) Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion
-                performs two forward passes for each diffusion step: one with the outputs of the autoregressive model
-                and one with no conditioning priors. The output of the two is blended according to the cond_free_k
-                value below. Conditioning-free diffusion is the real deal, and dramatically improves realism.
-                Defaults to True.
-
-            cond_free_k: (float) Knob that determines how to balance the conditioning free signal with the
-                conditioning-present signal. [0,inf]. As cond_free_k increases, the output becomes dominated by the
-                conditioning-free signal. Defaults to 2.0.
-
-            diffusion_temperature: (float) Controls the variance of the noise fed into the diffusion model. [0,1].
-                Values at 0 re the "mean" prediction of the diffusion network and will sound bland and smeared.
-                Defaults to 1.0.
-
-            decoder: (str) Selects the decoder to use between ("hifigan", "ne_hifigan" and "diffusion")
-                Defaults to hifigan
+            gpt_cond_chunk_len: (int) Chunk length used for cloning. It must be <= `gpt_cond_len`.
+                If gpt_cond_len == gpt_cond_chunk_len, no chunking. Defaults to 6 seconds.

            hf_generate_kwargs: (**kwargs) The huggingface Transformers generate API is used for the autoregressive
                transformer. Extra keyword args fed to this function get forwarded directly to that API. Documentation

        Returns:
            Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length.
            Sample rate is 24kHz.
        """
-        (
-            gpt_cond_latent,
-            diffusion_conditioning,
-            speaker_embedding
-        ) = self.get_conditioning_latents(audio_path=ref_audio_path, gpt_cond_len=gpt_cond_len)
+        (gpt_cond_latent, speaker_embedding) = self.get_conditioning_latents(
+            audio_path=ref_audio_path,
+            gpt_cond_len=gpt_cond_len,
+            gpt_cond_chunk_len=gpt_cond_chunk_len,
+            max_ref_length=max_ref_len,
+            sound_norm_refs=sound_norm_refs,
+        )
+
        return self.inference(
            text,
            language,
            gpt_cond_latent,
            speaker_embedding,
-            diffusion_conditioning,
            temperature=temperature,
            length_penalty=length_penalty,
            repetition_penalty=repetition_penalty,
            top_k=top_k,
            top_p=top_p,
            do_sample=do_sample,
-            decoder_iterations=decoder_iterations,
-            cond_free=cond_free,
-            cond_free_k=cond_free_k,
-            diffusion_temperature=diffusion_temperature,
-            decoder_sampler=decoder_sampler,
-            decoder=decoder,
            **hf_generate_kwargs,
        )
-
+
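End to end, the new cloning knobs replace the old diffusion settings. A hypothetical call (the loaded `model` and the file path are placeholders):

```python
out = model.full_inference(
    "It took me quite a long time to develop a voice.",
    ref_audio_path=["reference.wav"],  # placeholder reference clip(s)
    language="en",
    gpt_cond_len=30,       # use up to 30 s of reference audio for GPT latents
    gpt_cond_chunk_len=6,  # averaged in 6 s chunks
    max_ref_len=10,        # cap each clip for the speaker embedding
)
wav = out["wav"]           # 24 kHz waveform as a numpy array
```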
    @torch.inference_mode()
    def inference(
        self,
@@ -596,95 +506,79 @@ def inference(
        text,
        language,
        gpt_cond_latent,
        speaker_embedding,
-        diffusion_conditioning,
        # GPT inference
-        temperature=0.65,
-        length_penalty=1,
-        repetition_penalty=2.0,
+        temperature=0.75,
+        length_penalty=1.0,
+        repetition_penalty=10.0,
        top_k=50,
        top_p=0.85,
        do_sample=True,
-        # Decoder inference
-        decoder_iterations=100,
-        cond_free=True,
-        cond_free_k=2,
-        diffusion_temperature=1.0,
-        decoder_sampler="ddim",
-        decoder="hifigan",
+        num_beams=1,
+        speed=1.0,
+        enable_text_splitting=False,
        **hf_generate_kwargs,
    ):
-        text = f"[{language}]{text.strip().lower()}"
-        text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device)
-
-        assert (
-            text_tokens.shape[-1] < self.args.gpt_max_text_tokens
-        ), " ❗ XTTS can only generate text with a maximum of 400 tokens."
-
-        if not self.args.use_hifigan:
-            diffuser = load_discrete_vocoder_diffuser(
-                desired_diffusion_steps=decoder_iterations,
-                cond_free=cond_free,
-                cond_free_k=cond_free_k,
-                sampler=decoder_sampler,
-            )
+        language = language.split("-")[0]  # remove the country code
+        length_scale = 1.0 / max(speed, 0.05)
+        if enable_text_splitting:
+            text = split_sentence(text, language, self.tokenizer.char_limits[language])
+        else:
+            text = [text]
+
+        wavs = []
+        gpt_latents_list = []
+        for sent in text:
+            sent = sent.strip().lower()
+            text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device)
+
+            assert (
                text_tokens.shape[-1] < self.args.gpt_max_text_tokens
+            ), " ❗ XTTS can only generate text with a maximum of 400 tokens."
+
+            with torch.no_grad():
+                gpt_codes = self.gpt.generate(
+                    cond_latents=gpt_cond_latent,
+                    text_inputs=text_tokens,
+                    input_tokens=None,
+                    do_sample=do_sample,
+                    top_p=top_p,
+                    top_k=top_k,
+                    temperature=temperature,
+                    num_return_sequences=self.gpt_batch_size,
+                    num_beams=num_beams,
+                    length_penalty=length_penalty,
+                    repetition_penalty=repetition_penalty,
+                    output_attentions=False,
+                    **hf_generate_kwargs,
+                )
+                expected_output_len = torch.tensor(
+                    [gpt_codes.shape[-1] * self.gpt.code_stride_len], device=text_tokens.device
+                )

-        with torch.no_grad():
-            gpt_codes = self.gpt.generate(
-                cond_latents=gpt_cond_latent,
-                text_inputs=text_tokens,
-                input_tokens=None,
-                do_sample=do_sample,
-                top_p=top_p,
-                top_k=top_k,
-                temperature=temperature,
-                num_return_sequences=self.gpt_batch_size,
-                length_penalty=length_penalty,
-                repetition_penalty=repetition_penalty,
-                output_attentions=False,
-                **hf_generate_kwargs,
-            )
-            expected_output_len = torch.tensor(
-                [gpt_codes.shape[-1] * self.gpt.code_stride_len], device=text_tokens.device
-            )
-            text_len = torch.tensor([text_tokens.shape[-1]], device=self.device)
-            gpt_latents = self.gpt(
-                text_tokens,
-                text_len,
-                gpt_codes,
-                expected_output_len,
-                cond_latents=gpt_cond_latent,
-                return_attentions=False,
-                return_latent=True,
-            )
-            silence_token = 83
-            ctokens = 0
-            for k in range(gpt_codes.shape[-1]):
-                if gpt_codes[0, k] == silence_token:
-                    ctokens += 1
-                else:
-                    ctokens = 0
-                if ctokens > 8:
-                    gpt_latents = gpt_latents[:, :k]
-                    break
-
-        if decoder == "hifigan":
-            assert hasattr(self, "hifigan_decoder"), "You must enable hifigan decoder to use it by setting config `use_hifigan: true`"
-            wav = self.hifigan_decoder(gpt_latents, g=speaker_embedding)
-        elif decoder == "ne_hifigan":
-            assert hasattr(self, "ne_hifigan_decoder"), "You must enable ne_hifigan decoder to use it by setting config `use_ne_hifigan: true`"
-            wav = self.ne_hifigan_decoder(gpt_latents, g=speaker_embedding)
-        else:
-            assert hasattr(self, "diffusion_decoder"), "You must disable hifigan decoders to use difffusion by setting config `use_ne_hifigan: false` and `use_hifigan: false`"
-            mel = do_spectrogram_diffusion(
-                self.diffusion_decoder,
-                diffuser,
-                gpt_latents,
-                diffusion_conditioning,
-                temperature=diffusion_temperature,
+            text_len = torch.tensor([text_tokens.shape[-1]], device=self.device)
+            gpt_latents = self.gpt(
+                text_tokens,
+                text_len,
+                gpt_codes,
+                expected_output_len,
+                cond_latents=gpt_cond_latent,
+                return_attentions=False,
+                return_latent=True,
            )
-            wav = self.vocoder.inference(mel)

-        return {"wav": wav.cpu().numpy().squeeze()}
+            if length_scale != 1.0:
+                gpt_latents = F.interpolate(
+                    gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
+                ).transpose(1, 2)
+
+            gpt_latents_list.append(gpt_latents.cpu())
+            wavs.append(self.hifigan_decoder(gpt_latents, g=speaker_embedding).cpu().squeeze())
+
+        return {
+            "wav": torch.cat(wavs, dim=0).numpy(),
+            "gpt_latents": torch.cat(gpt_latents_list, dim=1).numpy(),
+            "speaker_embedding": speaker_embedding,
        }

    def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len):
        """Handle chunk formatting in streaming mode"""
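The new `speed` argument works by time-stretching the GPT latents with linear interpolation before vocoding; a `length_scale` above 1.0 slows speech down. A minimal sketch of just that step, with random values standing in for `gpt_latents`:

```python
import torch
import torch.nn.functional as F

gpt_latents = torch.randn(1, 120, 1024)  # [batch, time, channels], stand-in values
length_scale = 1.0 / max(0.8, 0.05)      # speed=0.8 -> latents stretched by 1.25x
stretched = F.interpolate(
    gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
).transpose(1, 2)
print(stretched.shape)                   # torch.Size([1, 150, 1024])
```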
@@ -692,10 +586,21 @@ def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len):
        if wav_gen_prev is not None:
            wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len) : -overlap_len]
        if wav_overlap is not None:
-            crossfade_wav = wav_chunk[:overlap_len]
-            crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device)
-            wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device)
-            wav_chunk[:overlap_len] += crossfade_wav
+            # cross fade the overlap section
+            if overlap_len > len(wav_chunk):
+                # wav_chunk is shorter than overlap_len; pass the last wav_gen along instead
+                if wav_gen_prev is not None:
+                    wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len) :]
+                else:
+                    # we do not expect to reach this branch, as the problem only occurs on the last chunk
+                    wav_chunk = wav_gen[-overlap_len:]
+                return wav_chunk, wav_gen, None
+            else:
+                crossfade_wav = wav_chunk[:overlap_len]
+                crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device)
+                wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device)
+                wav_chunk[:overlap_len] += crossfade_wav
+
        wav_overlap = wav_gen[-overlap_len:]
        wav_gen_prev = wav_gen
        return wav_chunk, wav_gen_prev, wav_overlap
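The crossfade itself is a pair of linear ramps: the tail of the previous chunk fades out while the head of the new one fades in, and the two are summed. A minimal sketch of the blend used above:

```python
import torch

def crossfade(prev_tail: torch.Tensor, new_head: torch.Tensor) -> torch.Tensor:
    """Blend two 1-D waveform segments of equal length."""
    n = prev_tail.shape[0]
    fade_out = torch.linspace(1.0, 0.0, n, device=prev_tail.device)
    fade_in = torch.linspace(0.0, 1.0, n, device=new_head.device)
    return prev_tail * fade_out + new_head * fade_in
```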
@@ -711,74 +616,86 @@ def inference_stream(
        self,
        text,
        language,
        gpt_cond_latent,
        speaker_embedding,
        # Streaming
        stream_chunk_size=20,
        overlap_wav_len=1024,
        # GPT inference
-        temperature=0.65,
-        length_penalty=1,
-        repetition_penalty=2.0,
+        temperature=0.75,
+        length_penalty=1.0,
+        repetition_penalty=10.0,
        top_k=50,
        top_p=0.85,
        do_sample=True,
-        # Decoder inference
-        decoder="hifigan",
+        speed=1.0,
+        enable_text_splitting=False,
        **hf_generate_kwargs,
    ):
-        assert hasattr(self, "hifigan_decoder"), "`inference_stream` requires use_hifigan to be set to true in the config.model_args, diffusion is too slow to stream."
-        text = f"[{language}]{text.strip().lower()}"
-        text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device)
+        language = language.split("-")[0]  # remove the country code
+        length_scale = 1.0 / max(speed, 0.05)
+        if enable_text_splitting:
+            text = split_sentence(text, language, self.tokenizer.char_limits[language])
+        else:
+            text = [text]

-        fake_inputs = self.gpt.compute_embeddings(
-            gpt_cond_latent.to(self.device),
-            text_tokens,
-        )
-        gpt_generator = self.gpt.get_generator(
-            fake_inputs=fake_inputs,
-            top_k=top_k,
-            top_p=top_p,
-            temperature=temperature,
-            do_sample=do_sample,
-            num_beams=1,
-            num_return_sequences=1,
-            length_penalty=float(length_penalty),
-            repetition_penalty=float(repetition_penalty),
-            output_attentions=False,
-            output_hidden_states=True,
-            **hf_generate_kwargs,
-        )
+        for sent in text:
+            sent = sent.strip().lower()
+            text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device)
+
+            assert (
+                text_tokens.shape[-1] < self.args.gpt_max_text_tokens
+            ), " ❗ XTTS can only generate text with a maximum of 400 tokens."

-        last_tokens = []
-        all_latents = []
-        wav_gen_prev = None
-        wav_overlap = None
-        is_end = False
-
-        while not is_end:
-            try:
-                x, latent = next(gpt_generator)
-                last_tokens += [x]
-                all_latents += [latent]
-            except StopIteration:
-                is_end = True
-
-            if is_end or (stream_chunk_size > 0 and len(last_tokens) >= stream_chunk_size):
-                gpt_latents = torch.cat(all_latents, dim=0)[None, :]
-                if decoder == "hifigan":
-                    assert hasattr(self, "hifigan_decoder"), "You must enable hifigan decoder to use it by setting config `use_hifigan: true`"
+            fake_inputs = self.gpt.compute_embeddings(
+                gpt_cond_latent.to(self.device),
+                text_tokens,
+            )
+            gpt_generator = self.gpt.get_generator(
+                fake_inputs=fake_inputs,
+                top_k=top_k,
+                top_p=top_p,
+                temperature=temperature,
+                do_sample=do_sample,
+                num_beams=1,
+                num_return_sequences=1,
+                length_penalty=float(length_penalty),
+                repetition_penalty=float(repetition_penalty),
+                output_attentions=False,
+                output_hidden_states=True,
+                **hf_generate_kwargs,
+            )
+
+            last_tokens = []
+            all_latents = []
+            wav_gen_prev = None
+            wav_overlap = None
+            is_end = False
+
+            while not is_end:
+                try:
+                    x, latent = next(gpt_generator)
+                    last_tokens += [x]
+                    all_latents += [latent]
+                except StopIteration:
+                    is_end = True
+
+                if is_end or (stream_chunk_size > 0 and len(last_tokens) >= stream_chunk_size):
+                    gpt_latents = torch.cat(all_latents, dim=0)[None, :]
+                    if length_scale != 1.0:
+                        gpt_latents = F.interpolate(
+                            gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
+                        ).transpose(1, 2)
                    wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
-                elif decoder == "ne_hifigan":
-                    assert hasattr(self, "ne_hifigan_decoder"), "You must enable ne_hifigan decoder to use it by setting config `use_ne_hifigan: true`"
-                    wav_gen = self.ne_hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
-                else:
-                    raise NotImplementedError("Diffusion for streaming inference not implemented.")
-                wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(
-                    wav_gen.squeeze(), wav_gen_prev, wav_overlap, overlap_wav_len
-                )
-                last_tokens = []
-                yield wav_chunk
+                    wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(
+                        wav_gen.squeeze(), wav_gen_prev, wav_overlap, overlap_wav_len
+                    )
+                    last_tokens = []
+                    yield wav_chunk
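A hypothetical consumer of the generator above: latents accumulate in batches of roughly `stream_chunk_size` tokens and are vocoded as they arrive, so playback can start well before the sentence finishes (`model`, the conditioning latents, and the embedding are placeholders obtained beforehand, e.g. via `get_conditioning_latents`):

```python
import torch

chunks = []
for i, chunk in enumerate(
    model.inference_stream(
        "Streaming synthesis starts playing almost immediately.",
        "en",
        gpt_cond_latent,
        speaker_embedding,
        stream_chunk_size=20,
    )
):
    print(f"chunk {i}: {chunk.shape[-1]} samples")  # hand each chunk to the player
    chunks.append(chunk)
wav = torch.cat(chunks, dim=0)  # full 24 kHz waveform
```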
key.split(".")[0] in ignore_keys: + del checkpoint[key] + + return checkpoint + def load_checkpoint( self, config, checkpoint_dir=None, - checkpoint_path=None, + checkpoint_path=None, vocab_path=None, eval=True, strict=True, @@ -821,25 +756,23 @@ def load_checkpoint( self.tokenizer = VoiceBpeTokenizer(vocab_file=vocab_path) self.init_models() - if eval: - self.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache) - checkpoint = load_fsspec(model_path, map_location=torch.device("cpu"))["model"] - ignore_keys = ["diffusion_decoder", "vocoder"] if self.args.use_hifigan or self.args.use_ne_hifigan else [] - ignore_keys += [] if self.args.use_hifigan else ["hifigan_decoder"] - ignore_keys += [] if self.args.use_ne_hifigan else ["ne_hifigan_decoder"] - for key in list(checkpoint.keys()): - if key.split(".")[0] in ignore_keys: - del checkpoint[key] - self.load_state_dict(checkpoint, strict=strict) + checkpoint = self.get_compatible_checkpoint_state_dict(model_path) + + # deal with v1 and v1.1. V1 has the init_gpt_for_inference keys, v1.1 do not + try: + self.load_state_dict(checkpoint, strict=strict) + except: + if eval: + self.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache) + self.load_state_dict(checkpoint, strict=strict) if eval: - if hasattr(self, "hifigan_decoder"): self.hifigan_decoder.eval() - if hasattr(self, "ne_hifigan_decoder"): self.hifigan_decoder.eval() - if hasattr(self, "diffusion_decoder"): self.diffusion_decoder.eval() - if hasattr(self, "vocoder"): self.vocoder.eval() + self.hifigan_decoder.eval() self.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache, use_deepspeed=use_deepspeed) self.gpt.eval() def train_step(self): - raise NotImplementedError("XTTS Training is not implemented") + raise NotImplementedError( + "XTTS has a dedicated trainer, please check the XTTS docs: https://tts.readthedocs.io/en/dev/models/xtts.html#training" + ) diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index 8982a89377..328e52f369 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -185,20 +185,16 @@ def phonemize_espeak(self, text: str, separator: str = "|", tie=False) -> str: if tie: args.append("--tie=%s" % tie) - args.append('"' + text + '"') + args.append(text) # compute phonemes phonemes = "" for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True): logging.debug("line: %s", repr(line)) ph_decoded = line.decode("utf8").strip() - # espeak need to skip first two characters of the retuned text: - # version 1.48.03: "_ p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n" + # espeak: # version 1.48.15: " p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n" - # espeak-ng need to skip the first character of the retuned text: - # "_p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n" - - # dealing with the conditions descrived above - ph_decoded = ph_decoded[:1].replace("_", "") + ph_decoded[1:] + # espeak-ng: + # "p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n" # espeak-ng backend can add language flags that need to be removed: # "sɛʁtˈɛ̃ mˈo kɔm (en)fˈʊtbɔːl(fr) ʒenˈɛʁ de- flˈaɡ də- lˈɑ̃ɡ." 
diff --git a/TTS/tts/utils/text/punctuation.py b/TTS/tts/utils/text/punctuation.py index 8d199cc545..36c467d083 100644 --- a/TTS/tts/utils/text/punctuation.py +++ b/TTS/tts/utils/text/punctuation.py @@ -15,7 +15,6 @@ class PuncPosition(Enum): BEGIN = 0 END = 1 MIDDLE = 2 - ALONE = 3 class Punctuation: @@ -92,7 +91,7 @@ def _strip_to_restore(self, text): return [text], [] # the text is only punctuations if len(matches) == 1 and matches[0].group() == text: - return [], [_PUNC_IDX(text, PuncPosition.ALONE)] + return [], [_PUNC_IDX(text, PuncPosition.BEGIN)] # build a punctuation map to be used later to restore punctuations puncs = [] for match in matches: @@ -107,11 +106,14 @@ def _strip_to_restore(self, text): for idx, punc in enumerate(puncs): split = text.split(punc.punc) prefix, suffix = split[0], punc.punc.join(split[1:]) + text = suffix + if prefix == "": + # We don't want to insert an empty string in case of initial punctuation + continue splitted_text.append(prefix) # if the text does not end with a punctuation, add it to the last item if idx == len(puncs) - 1 and len(suffix) > 0: splitted_text.append(suffix) - text = suffix return splitted_text, puncs @classmethod @@ -127,10 +129,10 @@ def restore(cls, text, puncs): ['This is', 'example'], ['.', '!'] -> "This is. example!" """ - return cls._restore(text, puncs, 0) + return cls._restore(text, puncs) @classmethod - def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statements + def _restore(cls, text, puncs): # pylint: disable=too-many-return-statements """Auxiliary method for Punctuation.restore()""" if not puncs: return text @@ -142,21 +144,18 @@ def _restore(cls, text, puncs, num): # pylint: disable=too-many-return-statemen current = puncs[0] if current.position == PuncPosition.BEGIN: - return cls._restore([current.punc + text[0]] + text[1:], puncs[1:], num) + return cls._restore([current.punc + text[0]] + text[1:], puncs[1:]) if current.position == PuncPosition.END: - return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1) - - if current.position == PuncPosition.ALONE: - return [current.mark] + cls._restore(text, puncs[1:], num + 1) + return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:]) # POSITION == MIDDLE if len(text) == 1: # pragma: nocover # a corner case where the final part of an intermediate # mark (I) has not been phonemized - return cls._restore([text[0] + current.punc], puncs[1:], num) + return cls._restore([text[0] + current.punc], puncs[1:]) - return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num) + return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:]) # if __name__ == "__main__": diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index e2b71fb2fe..af88569fc3 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -201,7 +201,6 @@ def stft( def istft( *, y: np.ndarray = None, - fft_size: int = None, hop_length: int = None, win_length: int = None, window: str = "hann", @@ -428,7 +427,7 @@ def load_wav(*, filename: str, sample_rate: int = None, resample: bool = False, return x -def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, pipe_out = None, **kwargs) -> None: +def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, pipe_out=None, **kwargs) -> None: """Save float waveform to a file using Scipy. 
Args: diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index 248e15b888..c53bad562e 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -5,10 +5,26 @@ import numpy as np import scipy.io.wavfile import scipy.signal -import soundfile as sf from TTS.tts.utils.helpers import StandardScaler -from TTS.utils.audio.numpy_transforms import compute_f0 +from TTS.utils.audio.numpy_transforms import ( + amp_to_db, + build_mel_basis, + compute_f0, + db_to_amp, + deemphasis, + find_endpoint, + griffin_lim, + load_wav, + mel_to_spec, + millisec_to_length, + preemphasis, + rms_volume_norm, + spec_to_mel, + stft, + trim_silence, + volume_norm, +) # pylint: disable=too-many-public-methods @@ -200,7 +216,9 @@ def __init__( # setup stft parameters if hop_length is None: # compute stft parameters from given time values - self.hop_length, self.win_length = self._stft_parameters() + self.win_length, self.hop_length = millisec_to_length( + frame_length_ms=self.frame_length_ms, frame_shift_ms=self.frame_shift_ms, sample_rate=self.sample_rate + ) else: # use stft parameters from config file self.hop_length = hop_length @@ -215,8 +233,13 @@ def __init__( for key, value in members.items(): print(" | > {}:{}".format(key, value)) # create spectrogram utils - self.mel_basis = self._build_mel_basis() - self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis()) + self.mel_basis = build_mel_basis( + sample_rate=self.sample_rate, + fft_size=self.fft_size, + num_mels=self.num_mels, + mel_fmax=self.mel_fmax, + mel_fmin=self.mel_fmin, + ) # setup scaler if stats_path and signal_norm: mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path) @@ -232,35 +255,6 @@ def init_from_config(config: "Coqpit", verbose=True): return AudioProcessor(verbose=verbose, **config.audio) return AudioProcessor(verbose=verbose, **config) - ### setting up the parameters ### - def _build_mel_basis( - self, - ) -> np.ndarray: - """Build melspectrogram basis. - - Returns: - np.ndarray: melspectrogram basis. - """ - if self.mel_fmax is not None: - assert self.mel_fmax <= self.sample_rate // 2 - return librosa.filters.mel( - sr=self.sample_rate, n_fft=self.fft_size, n_mels=self.num_mels, fmin=self.mel_fmin, fmax=self.mel_fmax - ) - - def _stft_parameters( - self, - ) -> Tuple[int, int]: - """Compute the real STFT parameters from the time values. - - Returns: - Tuple[int, int]: hop length and window length for STFT. - """ - factor = self.frame_length_ms / self.frame_shift_ms - assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms" - hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) - win_length = int(hop_length * factor) - return hop_length, win_length - ### normalization ### def normalize(self, S: np.ndarray) -> np.ndarray: """Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]` @@ -386,31 +380,6 @@ def setup_scaler( self.linear_scaler = StandardScaler() self.linear_scaler.set_stats(linear_mean, linear_std) - ### DB and AMP conversion ### - # pylint: disable=no-self-use - def _amp_to_db(self, x: np.ndarray) -> np.ndarray: - """Convert amplitude values to decibels. - - Args: - x (np.ndarray): Amplitude spectrogram. - - Returns: - np.ndarray: Decibels spectrogram. - """ - return self.spec_gain * _log(np.maximum(1e-5, x), self.base) - - # pylint: disable=no-self-use - def _db_to_amp(self, x: np.ndarray) -> np.ndarray: - """Convert decibels spectrogram to amplitude spectrogram. 
- - Args: - x (np.ndarray): Decibels spectrogram. - - Returns: - np.ndarray: Amplitude spectrogram. - """ - return _exp(x / self.spec_gain, self.base) - ### Preemphasis ### def apply_preemphasis(self, x: np.ndarray) -> np.ndarray: """Apply pre-emphasis to the audio signal. Useful to reduce the correlation between neighbouring signal values. @@ -424,32 +393,13 @@ def apply_preemphasis(self, x: np.ndarray) -> np.ndarray: Returns: np.ndarray: Decorrelated audio signal. """ - if self.preemphasis == 0: - raise RuntimeError(" [!] Preemphasis is set 0.0.") - return scipy.signal.lfilter([1, -self.preemphasis], [1], x) + return preemphasis(x=x, coef=self.preemphasis) def apply_inv_preemphasis(self, x: np.ndarray) -> np.ndarray: """Reverse pre-emphasis.""" - if self.preemphasis == 0: - raise RuntimeError(" [!] Preemphasis is set 0.0.") - return scipy.signal.lfilter([1], [1, -self.preemphasis], x) + return deemphasis(x=x, coef=self.preemphasis) ### SPECTROGRAMs ### - def _linear_to_mel(self, spectrogram: np.ndarray) -> np.ndarray: - """Project a full scale spectrogram to a melspectrogram. - - Args: - spectrogram (np.ndarray): Full scale spectrogram. - - Returns: - np.ndarray: Melspectrogram - """ - return np.dot(self.mel_basis, spectrogram) - - def _mel_to_linear(self, mel_spec: np.ndarray) -> np.ndarray: - """Convert a melspectrogram to full scale spectrogram.""" - return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec)) - def spectrogram(self, y: np.ndarray) -> np.ndarray: """Compute a spectrogram from a waveform. @@ -460,11 +410,16 @@ def spectrogram(self, y: np.ndarray) -> np.ndarray: np.ndarray: Spectrogram. """ if self.preemphasis != 0: - D = self._stft(self.apply_preemphasis(y)) - else: - D = self._stft(y) + y = self.apply_preemphasis(y) + D = stft( + y=y, + fft_size=self.fft_size, + hop_length=self.hop_length, + win_length=self.win_length, + pad_mode=self.stft_pad_mode, + ) if self.do_amp_to_db_linear: - S = self._amp_to_db(np.abs(D)) + S = amp_to_db(x=np.abs(D), gain=self.spec_gain, base=self.base) else: S = np.abs(D) return self.normalize(S).astype(np.float32) @@ -472,32 +427,35 @@ def spectrogram(self, y: np.ndarray) -> np.ndarray: def melspectrogram(self, y: np.ndarray) -> np.ndarray: """Compute a melspectrogram from a waveform.""" if self.preemphasis != 0: - D = self._stft(self.apply_preemphasis(y)) - else: - D = self._stft(y) + y = self.apply_preemphasis(y) + D = stft( + y=y, + fft_size=self.fft_size, + hop_length=self.hop_length, + win_length=self.win_length, + pad_mode=self.stft_pad_mode, + ) + S = spec_to_mel(spec=np.abs(D), mel_basis=self.mel_basis) if self.do_amp_to_db_mel: - S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - else: - S = self._linear_to_mel(np.abs(D)) + S = amp_to_db(x=S, gain=self.spec_gain, base=self.base) + return self.normalize(S).astype(np.float32) def inv_spectrogram(self, spectrogram: np.ndarray) -> np.ndarray: """Convert a spectrogram to a waveform using Griffi-Lim vocoder.""" S = self.denormalize(spectrogram) - S = self._db_to_amp(S) + S = db_to_amp(x=S, gain=self.spec_gain, base=self.base) # Reconstruct phase - if self.preemphasis != 0: - return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) - return self._griffin_lim(S**self.power) + W = self._griffin_lim(S**self.power) + return self.apply_inv_preemphasis(W) if self.preemphasis != 0 else W def inv_melspectrogram(self, mel_spectrogram: np.ndarray) -> np.ndarray: """Convert a melspectrogram to a waveform using Griffi-Lim vocoder.""" D = self.denormalize(mel_spectrogram) - S = 
self._db_to_amp(D) - S = self._mel_to_linear(S) # Convert back to linear - if self.preemphasis != 0: - return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) - return self._griffin_lim(S**self.power) + S = db_to_amp(x=D, gain=self.spec_gain, base=self.base) + S = mel_to_spec(mel=S, mel_basis=self.mel_basis) # Convert back to linear + W = self._griffin_lim(S**self.power) + return self.apply_inv_preemphasis(W) if self.preemphasis != 0 else W def out_linear_to_mel(self, linear_spec: np.ndarray) -> np.ndarray: """Convert a full scale linear spectrogram output of a network to a melspectrogram. @@ -509,60 +467,22 @@ def out_linear_to_mel(self, linear_spec: np.ndarray) -> np.ndarray: np.ndarray: Normalized melspectrogram. """ S = self.denormalize(linear_spec) - S = self._db_to_amp(S) - S = self._linear_to_mel(np.abs(S)) - S = self._amp_to_db(S) + S = db_to_amp(x=S, gain=self.spec_gain, base=self.base) + S = spec_to_mel(spec=np.abs(S), mel_basis=self.mel_basis) + S = amp_to_db(x=S, gain=self.spec_gain, base=self.base) mel = self.normalize(S) return mel - ### STFT and ISTFT ### - def _stft(self, y: np.ndarray) -> np.ndarray: - """Librosa STFT wrapper. - - Args: - y (np.ndarray): Audio signal. - - Returns: - np.ndarray: Complex number array. - """ - return librosa.stft( - y=y, - n_fft=self.fft_size, + def _griffin_lim(self, S): + return griffin_lim( + spec=S, + num_iter=self.griffin_lim_iters, hop_length=self.hop_length, win_length=self.win_length, + fft_size=self.fft_size, pad_mode=self.stft_pad_mode, - window="hann", - center=True, ) - def _istft(self, y: np.ndarray) -> np.ndarray: - """Librosa iSTFT wrapper.""" - return librosa.istft(y, hop_length=self.hop_length, win_length=self.win_length) - - def _griffin_lim(self, S): - angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) - try: - S_complex = np.abs(S).astype(np.complex) - except AttributeError: # np.complex is deprecated since numpy 1.20.0 - S_complex = np.abs(S).astype(complex) - y = self._istft(S_complex * angles) - if not np.isfinite(y).all(): - print(" [!] Waveform is not finite everywhere. Skipping the GL.") - return np.array([0.0]) - for _ in range(self.griffin_lim_iters): - angles = np.exp(1j * np.angle(self._stft(y))) - y = self._istft(S_complex * angles) - return y - - def compute_stft_paddings(self, x, pad_sides=1): - """Compute paddings used by Librosa's STFT. Compute right padding (final frame) or both sides padding - (first and final frames)""" - assert pad_sides in (1, 2) - pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0] - if pad_sides == 1: - return 0, pad - return pad // 2, pad // 2 + pad % 2 - def compute_f0(self, x: np.ndarray) -> np.ndarray: """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram. @@ -581,8 +501,6 @@ def compute_f0(self, x: np.ndarray) -> np.ndarray: >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate] >>> pitch = ap.compute_f0(wav) """ - assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`." - assert self.pitch_fmin is not None, " [!] Set `pitch_fmin` before caling `compute_f0`." # align F0 length to the spectrogram length if len(x) % self.hop_length == 0: x = np.pad(x, (0, self.hop_length // 2), mode=self.stft_pad_mode) @@ -612,21 +530,24 @@ def find_endpoint(self, wav: np.ndarray, min_silence_sec=0.8) -> int: Returns: int: Last point without silence. 
""" - window_length = int(self.sample_rate * min_silence_sec) - hop_length = int(window_length / 4) - threshold = self._db_to_amp(-self.trim_db) - for x in range(hop_length, len(wav) - window_length, hop_length): - if np.max(wav[x : x + window_length]) < threshold: - return x + hop_length - return len(wav) + return find_endpoint( + wav=wav, + trim_db=self.trim_db, + sample_rate=self.sample_rate, + min_silence_sec=min_silence_sec, + gain=self.spec_gain, + base=self.base, + ) def trim_silence(self, wav): """Trim silent parts with a threshold and 0.01 sec margin""" - margin = int(self.sample_rate * 0.01) - wav = wav[margin:-margin] - return librosa.effects.trim(wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[ - 0 - ] + return trim_silence( + wav=wav, + sample_rate=self.sample_rate, + trim_db=self.trim_db, + win_length=self.win_length, + hop_length=self.hop_length, + ) @staticmethod def sound_norm(x: np.ndarray) -> np.ndarray: @@ -638,13 +559,7 @@ def sound_norm(x: np.ndarray) -> np.ndarray: Returns: np.ndarray: Volume normalized waveform. """ - return x / abs(x).max() * 0.95 - - @staticmethod - def _rms_norm(wav, db_level=-27): - r = 10 ** (db_level / 20) - a = np.sqrt((len(wav) * (r**2)) / np.sum(wav**2)) - return wav * a + return volume_norm(x=x) def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray: """Normalize the volume based on RMS of the signal. @@ -657,9 +572,7 @@ def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray: """ if db_level is None: db_level = self.db_level - assert -99 <= db_level <= 0, " [!] db_level should be between -99 and 0" - wav = self._rms_norm(x, db_level) - return wav + return rms_volume_norm(x=x, db_level=db_level) ### save and load ### def load_wav(self, filename: str, sr: int = None) -> np.ndarray: @@ -674,15 +587,10 @@ def load_wav(self, filename: str, sr: int = None) -> np.ndarray: Returns: np.ndarray: Loaded waveform. """ - if self.resample: - # loading with resampling. It is significantly slower. - x, sr = librosa.load(filename, sr=self.sample_rate) - elif sr is None: - # SF is faster than librosa for loading files - x, sr = sf.read(filename) - assert self.sample_rate == sr, "%s vs %s" % (self.sample_rate, sr) + if sr is not None: + x = load_wav(filename=filename, sample_rate=sr, resample=True) else: - x, sr = librosa.load(filename, sr=sr) + x = load_wav(filename=filename, sample_rate=self.sample_rate, resample=self.resample) if self.do_trim_silence: try: x = self.trim_silence(x) @@ -694,7 +602,7 @@ def load_wav(self, filename: str, sr: int = None) -> np.ndarray: x = self.rms_volume_norm(x, self.db_level) return x - def save_wav(self, wav: np.ndarray, path: str, sr: int = None, pipe_out = None) -> None: + def save_wav(self, wav: np.ndarray, path: str, sr: int = None, pipe_out=None) -> None: """Save a waveform to a file using Scipy. Args: @@ -723,55 +631,3 @@ def get_duration(self, filename: str) -> float: filename (str): Path to the wav file. """ return librosa.get_duration(filename=filename) - - @staticmethod - def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray: - mu = 2**qc - 1 - # wav_abs = np.minimum(np.abs(wav), 1.0) - signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1.0 + mu) - # Quantize signal to the specified number of levels. 
- signal = (signal + 1) / 2 * mu + 0.5 - return np.floor( - signal, - ) - - @staticmethod - def mulaw_decode(wav, qc): - """Recovers waveform from quantized values.""" - mu = 2**qc - 1 - x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) - return x - - @staticmethod - def encode_16bits(x): - return np.clip(x * 2**15, -(2**15), 2**15 - 1).astype(np.int16) - - @staticmethod - def quantize(x: np.ndarray, bits: int) -> np.ndarray: - """Quantize a waveform to a given number of bits. - - Args: - x (np.ndarray): Waveform to quantize. Must be normalized into the range `[-1, 1]`. - bits (int): Number of quantization bits. - - Returns: - np.ndarray: Quantized waveform. - """ - return (x + 1.0) * (2**bits - 1) / 2 - - @staticmethod - def dequantize(x, bits): - """Dequantize a waveform from the given number of bits.""" - return 2 * x / (2**bits - 1) - 1 - - -def _log(x, base): - if base == 10: - return np.log10(x) - return np.log(x) - - -def _exp(x, base): - if base == 10: - return np.power(10, x) - return np.exp(x) diff --git a/TTS/utils/io.py b/TTS/utils/io.py index e9bdf3e686..3107ba661b 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -1,13 +1,9 @@ -import datetime -import json import os import pickle as pickle_tts -import shutil from typing import Any, Callable, Dict, Union import fsspec import torch -from coqpit import Coqpit from TTS.utils.generic_utils import get_user_data_dir @@ -28,34 +24,6 @@ def __init__(self, *args, **kwargs): self.__dict__ = self -def copy_model_files(config: Coqpit, out_path, new_fields=None): - """Copy config.json and other model files to training folder and add - new fields. - - Args: - config (Coqpit): Coqpit config defining the training run. - out_path (str): output path to copy the file. - new_fields (dict): new fileds to be added or edited - in the config file. - """ - copy_config_path = os.path.join(out_path, "config.json") - # add extra information fields - if new_fields: - config.update(new_fields, allow_new=True) - # TODO: Revert to config.save_json() once Coqpit supports arbitrary paths. - with fsspec.open(copy_config_path, "w", encoding="utf8") as f: - json.dump(config.to_dict(), f, indent=4) - - # copy model stats file if available - if config.audio.stats_path is not None: - copy_stats_path = os.path.join(out_path, "scale_stats.npy") - filesystem = fsspec.get_mapper(copy_stats_path).fs - if not filesystem.exists(copy_stats_path): - with fsspec.open(config.audio.stats_path, "rb") as source_file: - with fsspec.open(copy_stats_path, "wb") as target_file: - shutil.copyfileobj(source_file, target_file) - - def load_fsspec( path: str, map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None, @@ -100,117 +68,3 @@ def load_checkpoint( if eval: model.eval() return model, state - - -def save_fsspec(state: Any, path: str, **kwargs): - """Like torch.save but can save to other locations (e.g. s3:// , gs://). - - Args: - state: State object to save - path: Any path or url supported by fsspec. - **kwargs: Keyword arguments forwarded to torch.save. 
- """ - with fsspec.open(path, "wb") as f: - torch.save(state, f, **kwargs) - - -def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs): - if hasattr(model, "module"): - model_state = model.module.state_dict() - else: - model_state = model.state_dict() - if isinstance(optimizer, list): - optimizer_state = [optim.state_dict() for optim in optimizer] - elif optimizer.__class__.__name__ == "CapacitronOptimizer": - optimizer_state = [optimizer.primary_optimizer.state_dict(), optimizer.secondary_optimizer.state_dict()] - else: - optimizer_state = optimizer.state_dict() if optimizer is not None else None - - if isinstance(scaler, list): - scaler_state = [s.state_dict() for s in scaler] - else: - scaler_state = scaler.state_dict() if scaler is not None else None - - if isinstance(config, Coqpit): - config = config.to_dict() - - state = { - "config": config, - "model": model_state, - "optimizer": optimizer_state, - "scaler": scaler_state, - "step": current_step, - "epoch": epoch, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - state.update(kwargs) - save_fsspec(state, output_path) - - -def save_checkpoint( - config, - model, - optimizer, - scaler, - current_step, - epoch, - output_folder, - **kwargs, -): - file_name = "checkpoint_{}.pth".format(current_step) - checkpoint_path = os.path.join(output_folder, file_name) - print("\n > CHECKPOINT : {}".format(checkpoint_path)) - save_model( - config, - model, - optimizer, - scaler, - current_step, - epoch, - checkpoint_path, - **kwargs, - ) - - -def save_best_model( - current_loss, - best_loss, - config, - model, - optimizer, - scaler, - current_step, - epoch, - out_path, - keep_all_best=False, - keep_after=10000, - **kwargs, -): - if current_loss < best_loss: - best_model_name = f"best_model_{current_step}.pth" - checkpoint_path = os.path.join(out_path, best_model_name) - print(" > BEST MODEL : {}".format(checkpoint_path)) - save_model( - config, - model, - optimizer, - scaler, - current_step, - epoch, - checkpoint_path, - model_loss=current_loss, - **kwargs, - ) - fs = fsspec.get_mapper(out_path).fs - # only delete previous if current is saved successfully - if not keep_all_best or (current_step < keep_after): - model_names = fs.glob(os.path.join(out_path, "best_model*.pth")) - for model_name in model_names: - if os.path.basename(model_name) != best_model_name: - fs.rm(model_name) - # create a shortcut which always points to the currently best model - shortcut_name = "best_model.pth" - shortcut_path = os.path.join(out_path, shortcut_name) - fs.copy(checkpoint_path, shortcut_path) - best_loss = current_loss - return best_loss diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index eef987efd4..3952504d0b 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -1,5 +1,6 @@ import json import os +import re import tarfile import zipfile from pathlib import Path @@ -27,6 +28,7 @@ class ModelManager(object): + tqdm_progress = None """Manage TTS models defined in .models.json. 
It provides an interface to list and download models defines in '.model.json' @@ -109,7 +111,6 @@ def _list_models(self, model_type, model_count=0): def _list_for_model_type(self, model_type): models_name_list = [] model_count = 1 - model_type = "tts_models" models_name_list.extend(self._list_models(model_type, model_count)) return models_name_list @@ -275,13 +276,15 @@ def set_model_url(model_item: Dict): model_item["model_url"] = model_item["hf_url"] elif "fairseq" in model_item["model_name"]: model_item["model_url"] = "https://coqui.gateway.scarf.sh/fairseq/" + elif "xtts" in model_item["model_name"]: + model_item["model_url"] = "https://coqui.gateway.scarf.sh/xtts/" return model_item def _set_model_item(self, model_name): # fetch model info from the dict - model_type, lang, dataset, model = model_name.split("/") - model_full_name = f"{model_type}--{lang}--{dataset}--{model}" if "fairseq" in model_name: + model_type = "tts_models" + lang = model_name.split("/")[1] model_item = { "model_type": "tts_models", "license": "CC BY-NC 4.0", @@ -290,30 +293,57 @@ def _set_model_item(self, model_name): "description": "this model is released by Meta under Fairseq repo. Visit https://github.com/facebookresearch/fairseq/tree/main/examples/mms for more info.", } model_item["model_name"] = model_name + elif "xtts" in model_name and len(model_name.split("/")) != 4: + # loading xtts models with only model name (e.g. xtts_v2.0.2) + # check model name has the version number with regex + version_regex = r"v\d+\.\d+\.\d+" + if re.search(version_regex, model_name): + model_version = model_name.split("_")[-1] + else: + model_version = "main" + model_type = "tts_models" + lang = "multilingual" + dataset = "multi-dataset" + model = model_name + model_item = { + "default_vocoder": None, + "license": "CPML", + "contact": "info@coqui.ai", + "tos_required": True, + "hf_url": [ + f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/model.pth", + f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/config.json", + f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/vocab.json", + f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/hash.md5", + ], + } else: # get model from models.json + model_type, lang, dataset, model = model_name.split("/") model_item = self.models_dict[model_type][lang][dataset][model] model_item["model_type"] = model_type + + model_full_name = f"{model_type}--{lang}--{dataset}--{model}" md5hash = model_item["model_hash"] if "model_hash" in model_item else None model_item = self.set_model_url(model_item) return model_item, model_full_name, model, md5hash - def ask_tos(self, model_full_path): + @staticmethod + def ask_tos(model_full_path): """Ask the user to agree to the terms of service""" tos_path = os.path.join(model_full_path, "tos_agreed.txt") - if not os.path.exists(tos_path): - print(" > You must agree to the terms of service to use this model.") - print(" | > Please see the terms of service at https://coqui.ai/cpml.txt") - print(' | > "I have read, understood and agreed the Terms and Conditions." - [y/n]') - answer = input(" | | > ") - if answer.lower() == "y": - with open(tos_path, "w") as f: - f.write("I have read, understood ad agree the Terms and Conditions.") - return True - else: - return False + print(" > You must agree to the terms of service to use this model.") + print(" | > Please see the terms of service at https://coqui.ai/cpml.txt") + print(' | > "I have read, understood and agreed to the Terms and Conditions." 
- [y/n]') + answer = input(" | | > ") + if answer.lower() == "y": + with open(tos_path, "w", encoding="utf-8") as f: + f.write("I have read, understood and agreed to the Terms and Conditions.") + return True + return False - def tos_agreed(self, model_item, model_full_path): + @staticmethod + def tos_agreed(model_item, model_full_path): """Check if the user has agreed to the terms of service""" if "tos_required" in model_item and model_item["tos_required"]: tos_path = os.path.join(model_full_path, "tos_agreed.txt") @@ -392,7 +422,7 @@ def download_model(self, model_name): self.create_dir_and_download_model(model_name, model_item, output_path) # if the configs are different, redownload it # ToDo: we need a better way to handle it - if "xtts_v1" in model_name: + if "xtts" in model_name: try: self.check_if_configs_are_equal(model_name, model_item, output_path) except: @@ -406,7 +436,7 @@ def download_model(self, model_name): output_model_path = output_path output_config_path = None if ( - model not in ["tortoise-v2", "bark", "xtts_v1", "xtts_v1.1"] and "fairseq" not in model_name + model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name and "xtts" not in model_name ): # TODO:This is stupid but don't care for now. output_model_path, output_config_path = self._find_files(output_path) # update paths in the config.json @@ -526,12 +556,12 @@ def _download_zip_file(file_url, output_folder, progress_bar): total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 # 1 Kibibyte if progress_bar: - progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1]) with open(temp_zip_name, "wb") as file: for data in r.iter_content(block_size): if progress_bar: - progress_bar.update(len(data)) + ModelManager.tqdm_progress.update(len(data)) file.write(data) with zipfile.ZipFile(temp_zip_name) as z: z.extractall(output_folder) @@ -561,12 +591,12 @@ def _download_tar_file(file_url, output_folder, progress_bar): total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 # 1 Kibibyte if progress_bar: - progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) temp_tar_name = os.path.join(output_folder, file_url.split("/")[-1]) with open(temp_tar_name, "wb") as file: for data in r.iter_content(block_size): if progress_bar: - progress_bar.update(len(data)) + ModelManager.tqdm_progress.update(len(data)) file.write(data) with tarfile.open(temp_tar_name) as t: t.extractall(output_folder) @@ -597,10 +627,10 @@ def _download_model_files(file_urls, output_folder, progress_bar): block_size = 1024 # 1 Kibibyte with open(temp_zip_name, "wb") as file: if progress_bar: - progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) for data in r.iter_content(block_size): if progress_bar: - progress_bar.update(len(data)) + ModelManager.tqdm_progress.update(len(data)) file.write(data) @staticmethod diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index a7370cd2c9..781561f973 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -235,7 +235,7 @@ def split_into_sentences(self, text) -> List[str]: """ return self.seg.segment(text) - def save_wav(self, wav: List[int], 
path: str, pipe_out = None) -> None: + def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None: """Save the waveform as a file. Args: @@ -264,6 +264,7 @@ def tts( style_text=None, reference_wav=None, reference_speaker_name=None, + split_sentences: bool = True, **kwargs, ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. @@ -277,6 +278,8 @@ def tts( style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None. reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None. reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None. + split_sentences (bool, optional): split the input text into sentences. Defaults to True. + **kwargs: additional arguments to pass to the TTS model. Returns: List[int]: [description] """ @@ -289,8 +292,10 @@ def tts( ) if text: - sens = self.split_into_sentences(text) - print(" > Text splitted to sentences.") + sens = [text] + if split_sentences: + print(" > Text split into sentences.") + sens = self.split_into_sentences(text) print(sens) # handle multi-speaker @@ -358,7 +363,11 @@ def tts( ) # compute a new d_vector from the given clip. - if speaker_wav is not None and self.tts_model.speaker_manager is not None: + if ( + speaker_wav is not None + and self.tts_model.speaker_manager is not None + and self.tts_model.speaker_manager.encoder_ap is not None + ): speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav) vocoder_device = "cpu" diff --git a/TTS/vc/configs/freevc_config.py b/TTS/vc/configs/freevc_config.py index 890a269303..207181b303 100644 --- a/TTS/vc/configs/freevc_config.py +++ b/TTS/vc/configs/freevc_config.py @@ -1,5 +1,278 @@ from dataclasses import dataclass, field -from typing import List +from typing import List, Optional + +from coqpit import Coqpit from TTS.vc.configs.shared_configs import BaseVCConfig -from TTS.vc.models.freevc import FreeVCArgs, FreeVCAudioConfig, FreeVCConfig + + +@dataclass +class FreeVCAudioConfig(Coqpit): + """Audio configuration + + Args: + max_wav_value (float): + The maximum value of the waveform. + + input_sample_rate (int): + The sampling rate of the input waveform. + + output_sample_rate (int): + The sampling rate of the output waveform. + + filter_length (int): + The length of the filter. + + hop_length (int): + The hop length. + + win_length (int): + The window length. + + n_mel_channels (int): + The number of mel channels. + + mel_fmin (float): + The minimum frequency of the mel filterbank. + + mel_fmax (Optional[float]): + The maximum frequency of the mel filterbank. + """ + + max_wav_value: float = field(default=32768.0) + input_sample_rate: int = field(default=16000) + output_sample_rate: int = field(default=24000) + filter_length: int = field(default=1280) + hop_length: int = field(default=320) + win_length: int = field(default=1280) + n_mel_channels: int = field(default=80) + mel_fmin: float = field(default=0.0) + mel_fmax: Optional[float] = field(default=None) + + +@dataclass +class FreeVCArgs(Coqpit): + """FreeVC model arguments + + Args: + spec_channels (int): + The number of channels in the spectrogram. + + inter_channels (int): + The number of channels in the intermediate layers. + + hidden_channels (int): + The number of channels in the hidden layers. + + filter_channels (int): + The number of channels in the filter layers. + + n_heads (int): + The number of attention heads. + + n_layers (int): + The number of layers.
+ + kernel_size (int): + The size of the kernel. + + p_dropout (float): + The dropout probability. + + resblock (str): + The type of residual block. + + resblock_kernel_sizes (List[int]): + The kernel sizes for the residual blocks. + + resblock_dilation_sizes (List[List[int]]): + The dilation sizes for the residual blocks. + + upsample_rates (List[int]): + The upsample rates. + + upsample_initial_channel (int): + The number of channels in the initial upsample layer. + + upsample_kernel_sizes (List[int]): + The kernel sizes for the upsample layers. + + n_layers_q (int): + The number of layers in the quantization network. + + use_spectral_norm (bool): + Whether to use spectral normalization. + + gin_channels (int): + The number of channels in the global conditioning vector. + + ssl_dim (int): + The dimension of the self-supervised learning embedding. + + use_spk (bool): + Whether to use external speaker encoder. + """ + + spec_channels: int = field(default=641) + inter_channels: int = field(default=192) + hidden_channels: int = field(default=192) + filter_channels: int = field(default=768) + n_heads: int = field(default=2) + n_layers: int = field(default=6) + kernel_size: int = field(default=3) + p_dropout: float = field(default=0.1) + resblock: str = field(default="1") + resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2]) + upsample_initial_channel: int = field(default=512) + upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) + n_layers_q: int = field(default=3) + use_spectral_norm: bool = field(default=False) + gin_channels: int = field(default=256) + ssl_dim: int = field(default=1024) + use_spk: bool = field(default=False) + num_spks: int = field(default=0) + segment_size: int = field(default=8960) + + +@dataclass +class FreeVCConfig(BaseVCConfig): + """Defines parameters for FreeVC End2End TTS model. + + Args: + model (str): + Model name. Do not change unless you know what you are doing. + + model_args (FreeVCArgs): + Model architecture arguments. Defaults to `FreeVCArgs()`. + + audio (FreeVCAudioConfig): + Audio processing configuration. Defaults to `FreeVCAudioConfig()`. + + grad_clip (List): + Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`. + + lr_gen (float): + Initial learning rate for the generator. Defaults to 0.0002. + + lr_disc (float): + Initial learning rate for the discriminator. Defaults to 0.0002. + + lr_scheduler_gen (str): + Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to + `ExponentialLR`. + + lr_scheduler_gen_params (dict): + Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`. + + lr_scheduler_disc (str): + Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to + `ExponentialLR`. + + lr_scheduler_disc_params (dict): + Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`. + + scheduler_after_epoch (bool): + If true, step the schedulers after each epoch else after each step. Defaults to `False`. + + optimizer (str): + Name of the optimizer to use with both the generator and the discriminator networks. One of the + `torch.optim.*`. Defaults to `AdamW`. 
+ + kl_loss_alpha (float): + Loss weight for KL loss. Defaults to 1.0. + + disc_loss_alpha (float): + Loss weight for the discriminator loss. Defaults to 1.0. + + gen_loss_alpha (float): + Loss weight for the generator loss. Defaults to 1.0. + + feat_loss_alpha (float): + Loss weight for the feature matching loss. Defaults to 1.0. + + mel_loss_alpha (float): + Loss weight for the mel loss. Defaults to 45.0. + + return_wav (bool): + If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`. + + compute_linear_spec (bool): + If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`. + + use_weighted_sampler (bool): + If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`. + + weighted_sampler_attrs (dict): + Key returned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities + by overweighting `root_path` by 2.0. Defaults to `{}`. + + weighted_sampler_multipliers (dict): + Weight each unique value of a key returned by the formatter for weighted sampling. + For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}}`. + It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`. + + r (int): + Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`. + + add_blank (bool): + If true, a blank token is added in between every character. Defaults to `True`. + + test_sentences (List[List]): + List of sentences with speaker and language information to be used for testing. + + language_ids_file (str): + Path to the language ids file. + + use_language_embedding (bool): + If true, language embedding is used. Defaults to `False`. + + Note: + Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
+ + Example: + + >>> from TTS.vc.configs.freevc_config import FreeVCConfig + >>> config = FreeVCConfig() + """ + + model: str = "freevc" + # model specific params + model_args: FreeVCArgs = field(default_factory=FreeVCArgs) + audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig) + + # optimizer + # TODO with training support + + # loss params + # TODO with training support + + # data loader params + return_wav: bool = True + compute_linear_spec: bool = True + + # sampler params + use_weighted_sampler: bool = False # TODO: move it to the base config + weighted_sampler_attrs: dict = field(default_factory=lambda: {}) + weighted_sampler_multipliers: dict = field(default_factory=lambda: {}) + + # overrides + r: int = 1 # DO NOT CHANGE + add_blank: bool = True + + # multi-speaker settings + # use speaker embedding layer + num_speakers: int = 0 + speakers_file: str = None + speaker_embedding_channels: int = 256 + + # use d-vectors + use_d_vector_file: bool = False + d_vector_file: List[str] = None + d_vector_dim: int = None + + def __post_init__(self): + for key, val in self.model_args.items(): + if hasattr(self, key): + self[key] = val diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py index ae22ad28c1..8bb9989224 100644 --- a/TTS/vc/models/freevc.py +++ b/TTS/vc/models/freevc.py @@ -1,4 +1,3 @@ -from dataclasses import dataclass, field from typing import Dict, List, Optional, Tuple, Union import librosa @@ -6,15 +5,17 @@ import torch from coqpit import Coqpit from torch import nn -from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn import Conv1d, Conv2d, ConvTranspose1d from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm +from torch.nn.utils import spectral_norm +from torch.nn.utils.parametrizations import weight_norm +from torch.nn.utils.parametrize import remove_parametrizations import TTS.vc.modules.freevc.commons as commons import TTS.vc.modules.freevc.modules as modules from TTS.tts.utils.speakers import SpeakerManager -from TTS.utils.io import load_fsspec, save_checkpoint -from TTS.vc.configs.shared_configs import BaseVCConfig +from TTS.utils.io import load_fsspec +from TTS.vc.configs.freevc_config import FreeVCConfig from TTS.vc.models.base_vc import BaseVC from TTS.vc.modules.freevc.commons import get_padding, init_weights from TTS.vc.modules.freevc.mel_processing import mel_spectrogram_torch @@ -153,9 +154,9 @@ def forward(self, x, g=None): def remove_weight_norm(self): print("Removing weight norm...") for l in self.ups: - remove_weight_norm(l) + remove_parametrizations(l, "weight") for l in self.resblocks: - l.remove_weight_norm() + remove_parametrizations(l, "weight") class DiscriminatorP(torch.nn.Module): @@ -294,136 +295,6 @@ def embed_utterance(self, mel, partial_frames=128, partial_hop=64): return embed -@dataclass -class FreeVCAudioConfig(Coqpit): - """Audio configuration - - Args: - max_wav_value (float): - The maximum value of the waveform. - - input_sample_rate (int): - The sampling rate of the input waveform. - - output_sample_rate (int): - The sampling rate of the output waveform. - - filter_length (int): - The length of the filter. - - hop_length (int): - The hop length. - - win_length (int): - The window length. - - n_mel_channels (int): - The number of mel channels. - - mel_fmin (float): - The minimum frequency of the mel filterbank. - - mel_fmax (Optional[float]): - The maximum frequency of the mel filterbank. 
- """ - - max_wav_value: float = field(default=32768.0) - input_sample_rate: int = field(default=16000) - output_sample_rate: int = field(default=24000) - filter_length: int = field(default=1280) - hop_length: int = field(default=320) - win_length: int = field(default=1280) - n_mel_channels: int = field(default=80) - mel_fmin: float = field(default=0.0) - mel_fmax: Optional[float] = field(default=None) - - -@dataclass -class FreeVCArgs(Coqpit): - """FreeVC model arguments - - Args: - spec_channels (int): - The number of channels in the spectrogram. - - inter_channels (int): - The number of channels in the intermediate layers. - - hidden_channels (int): - The number of channels in the hidden layers. - - filter_channels (int): - The number of channels in the filter layers. - - n_heads (int): - The number of attention heads. - - n_layers (int): - The number of layers. - - kernel_size (int): - The size of the kernel. - - p_dropout (float): - The dropout probability. - - resblock (str): - The type of residual block. - - resblock_kernel_sizes (List[int]): - The kernel sizes for the residual blocks. - - resblock_dilation_sizes (List[List[int]]): - The dilation sizes for the residual blocks. - - upsample_rates (List[int]): - The upsample rates. - - upsample_initial_channel (int): - The number of channels in the initial upsample layer. - - upsample_kernel_sizes (List[int]): - The kernel sizes for the upsample layers. - - n_layers_q (int): - The number of layers in the quantization network. - - use_spectral_norm (bool): - Whether to use spectral normalization. - - gin_channels (int): - The number of channels in the global conditioning vector. - - ssl_dim (int): - The dimension of the self-supervised learning embedding. - - use_spk (bool): - Whether to use external speaker encoder. - """ - - spec_channels: int = field(default=641) - inter_channels: int = field(default=192) - hidden_channels: int = field(default=192) - filter_channels: int = field(default=768) - n_heads: int = field(default=2) - n_layers: int = field(default=6) - kernel_size: int = field(default=3) - p_dropout: float = field(default=0.1) - resblock: str = field(default="1") - resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2]) - upsample_initial_channel: int = field(default=512) - upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) - n_layers_q: int = field(default=3) - use_spectral_norm: bool = field(default=False) - gin_channels: int = field(default=256) - ssl_dim: int = field(default=1024) - use_spk: bool = field(default=False) - num_spks: int = field(default=0) - segment_size: int = field(default=8960) - - class FreeVC(BaseVC): """ @@ -677,7 +548,7 @@ def eval_step(): ... @staticmethod - def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None, verbose=True): + def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None, verbose=True): model = FreeVC(config) return model @@ -689,145 +560,3 @@ def load_checkpoint(self, config, checkpoint_path, eval=False, strict=True, cach def train_step(): ... - - -@dataclass -class FreeVCConfig(BaseVCConfig): - """Defines parameters for FreeVC End2End TTS model. - - Args: - model (str): - Model name. Do not change unless you know what you are doing. 
- - model_args (FreeVCArgs): - Model architecture arguments. Defaults to `FreeVCArgs()`. - - audio (FreeVCAudioConfig): - Audio processing configuration. Defaults to `FreeVCAudioConfig()`. - - grad_clip (List): - Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`. - - lr_gen (float): - Initial learning rate for the generator. Defaults to 0.0002. - - lr_disc (float): - Initial learning rate for the discriminator. Defaults to 0.0002. - - lr_scheduler_gen (str): - Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to - `ExponentialLR`. - - lr_scheduler_gen_params (dict): - Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`. - - lr_scheduler_disc (str): - Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to - `ExponentialLR`. - - lr_scheduler_disc_params (dict): - Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`. - - scheduler_after_epoch (bool): - If true, step the schedulers after each epoch else after each step. Defaults to `False`. - - optimizer (str): - Name of the optimizer to use with both the generator and the discriminator networks. One of the - `torch.optim.*`. Defaults to `AdamW`. - - kl_loss_alpha (float): - Loss weight for KL loss. Defaults to 1.0. - - disc_loss_alpha (float): - Loss weight for the discriminator loss. Defaults to 1.0. - - gen_loss_alpha (float): - Loss weight for the generator loss. Defaults to 1.0. - - feat_loss_alpha (float): - Loss weight for the feature matching loss. Defaults to 1.0. - - mel_loss_alpha (float): - Loss weight for the mel loss. Defaults to 45.0. - - return_wav (bool): - If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`. - - compute_linear_spec (bool): - If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`. - - use_weighted_sampler (bool): - If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`. - - weighted_sampler_attrs (dict): - Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities - by overweighting `root_path` by 2.0. Defaults to `{}`. - - weighted_sampler_multipliers (dict): - Weight each unique value of a key returned by the formatter for weighted sampling. - For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`. - It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`. - - r (int): - Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`. - - add_blank (bool): - If true, a blank token is added in between every character. Defaults to `True`. - - test_sentences (List[List]): - List of sentences with speaker and language information to be used for testing. - - language_ids_file (str): - Path to the language ids file. - - use_language_embedding (bool): - If true, language embedding is used. Defaults to `False`. - - Note: - Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters. 
- - Example: - - >>> from TTS.tts.configs.freevc_config import FreeVCConfig - >>> config = FreeVCConfig() - """ - - model: str = "freevc" - # model specific params - model_args: FreeVCArgs = field(default_factory=FreeVCArgs) - audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig) - - # optimizer - # TODO with training support - - # loss params - # TODO with training support - - # data loader params - return_wav: bool = True - compute_linear_spec: bool = True - - # sampler params - use_weighted_sampler: bool = False # TODO: move it to the base config - weighted_sampler_attrs: dict = field(default_factory=lambda: {}) - weighted_sampler_multipliers: dict = field(default_factory=lambda: {}) - - # overrides - r: int = 1 # DO NOT CHANGE - add_blank: bool = True - - # multi-speaker settings - # use speaker embedding layer - num_speakers: int = 0 - speakers_file: str = None - speaker_embedding_channels: int = 256 - - # use d-vectors - use_d_vector_file: bool = False - d_vector_file: List[str] = None - d_vector_dim: int = None - - def __post_init__(self): - for key, val in self.model_args.items(): - if hasattr(self, key): - self[key] = val diff --git a/TTS/vc/modules/freevc/modules.py b/TTS/vc/modules/freevc/modules.py index 0503a13c8a..9bb5499003 100644 --- a/TTS/vc/modules/freevc/modules.py +++ b/TTS/vc/modules/freevc/modules.py @@ -1,13 +1,9 @@ -import copy -import math - -import numpy as np -import scipy import torch from torch import nn -from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn import Conv1d from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, weight_norm +from torch.nn.utils.parametrizations import weight_norm +from torch.nn.utils.parametrize import remove_parametrizations import TTS.vc.modules.freevc.commons as commons from TTS.vc.modules.freevc.commons import get_padding, init_weights @@ -122,7 +118,7 @@ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_ch if gin_channels != 0: cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1) - self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + self.cond_layer = torch.nn.utils.parametrizations.weight_norm(cond_layer, name="weight") for i in range(n_layers): dilation = dilation_rate**i @@ -130,7 +126,7 @@ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_ch in_layer = torch.nn.Conv1d( hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding ) - in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + in_layer = torch.nn.utils.parametrizations.weight_norm(in_layer, name="weight") self.in_layers.append(in_layer) # last one is not necessary @@ -140,7 +136,7 @@ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_ch res_skip_channels = hidden_channels res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + res_skip_layer = torch.nn.utils.parametrizations.weight_norm(res_skip_layer, name="weight") self.res_skip_layers.append(res_skip_layer) def forward(self, x, x_mask, g=None, **kwargs): @@ -172,11 +168,11 @@ def forward(self, x, x_mask, g=None, **kwargs): def remove_weight_norm(self): if self.gin_channels != 0: - torch.nn.utils.remove_weight_norm(self.cond_layer) + remove_parametrizations(self.cond_layer, "weight") for l in self.in_layers: - torch.nn.utils.remove_weight_norm(l) + remove_parametrizations(l, "weight") for 
l in self.res_skip_layers: - torch.nn.utils.remove_weight_norm(l) + remove_parametrizations(l, "weight") class ResBlock1(torch.nn.Module): @@ -250,9 +246,9 @@ def forward(self, x, x_mask=None): def remove_weight_norm(self): for l in self.convs1: - remove_weight_norm(l) + remove_parametrizations(l, "weight") for l in self.convs2: - remove_weight_norm(l) + remove_parametrizations(l, "weight") class ResBlock2(torch.nn.Module): @@ -297,7 +293,7 @@ def forward(self, x, x_mask=None): def remove_weight_norm(self): for l in self.convs: - remove_weight_norm(l) + remove_parametrizations(l, "weight") class Log(nn.Module): diff --git a/TTS/vc/modules/freevc/wavlm/wavlm.py b/TTS/vc/modules/freevc/wavlm/wavlm.py index 7efb11bfc6..fc93bd4f50 100644 --- a/TTS/vc/modules/freevc/wavlm/wavlm.py +++ b/TTS/vc/modules/freevc/wavlm/wavlm.py @@ -497,7 +497,7 @@ def __init__(self, args): nn.init.normal_(self.pos_conv.weight, mean=0, std=std) nn.init.constant_(self.pos_conv.bias, 0) - self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2) + self.pos_conv = nn.utils.parametrizations.weight_norm(self.pos_conv, name="weight", dim=2) self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU()) if hasattr(args, "relative_position_embedding"): diff --git a/TTS/vocoder/configs/parallel_wavegan_config.py b/TTS/vocoder/configs/parallel_wavegan_config.py index 7845dd6bf8..6059d7f04f 100644 --- a/TTS/vocoder/configs/parallel_wavegan_config.py +++ b/TTS/vocoder/configs/parallel_wavegan_config.py @@ -94,6 +94,7 @@ class ParallelWaveganConfig(BaseGANVocoderConfig): use_noise_augment: bool = False use_cache: bool = True steps_to_start_discriminator: int = 200000 + target_loss: str = "loss_1" # LOSS PARAMETERS - overrides use_stft_loss: bool = True diff --git a/TTS/vocoder/datasets/preprocess.py b/TTS/vocoder/datasets/preprocess.py index 0f69b812fa..503bb04b2f 100644 --- a/TTS/vocoder/datasets/preprocess.py +++ b/TTS/vocoder/datasets/preprocess.py @@ -7,6 +7,7 @@ from tqdm import tqdm from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.numpy_transforms import mulaw_encode, quantize def preprocess_wav_files(out_path: str, config: Coqpit, ap: AudioProcessor): @@ -29,7 +30,11 @@ def preprocess_wav_files(out_path: str, config: Coqpit, ap: AudioProcessor): mel = ap.melspectrogram(y) np.save(mel_path, mel) if isinstance(config.mode, int): - quant = ap.mulaw_encode(y, qc=config.mode) if config.model_args.mulaw else ap.quantize(y, bits=config.mode) + quant = ( + mulaw_encode(wav=y, mulaw_qc=config.mode) + if config.model_args.mulaw + else quantize(x=y, quantize_bits=config.mode) + ) np.save(quant_path, quant) diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index c390796428..a67c5b31a0 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -2,6 +2,8 @@ import torch from torch.utils.data import Dataset +from TTS.utils.audio.numpy_transforms import mulaw_encode, quantize + class WaveRNNDataset(Dataset): """ @@ -66,7 +68,9 @@ def load_item(self, index): x_input = audio elif isinstance(self.mode, int): x_input = ( - self.ap.mulaw_encode(audio, qc=self.mode) if self.mulaw else self.ap.quantize(audio, bits=self.mode) + mulaw_encode(wav=audio, mulaw_qc=self.mode) + if self.mulaw + else quantize(x=audio, quantize_bits=self.mode) ) else: raise RuntimeError("Unknown dataset mode - ", self.mode) diff --git a/TTS/vocoder/layers/hifigan.py b/TTS/vocoder/layers/hifigan.py index f512007248..8dd75133bb 100644 --- 
a/TTS/vocoder/layers/hifigan.py +++ b/TTS/vocoder/layers/hifigan.py @@ -1,4 +1,5 @@ from torch import nn +from torch.nn.utils.parametrize import remove_parametrizations # pylint: disable=dangerous-default-value @@ -10,14 +11,16 @@ def __init__(self, kernel, channel, padding, dilations=[1, 3, 5]): resstack += [ nn.LeakyReLU(0.2), nn.ReflectionPad1d(dilation), - nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=kernel, dilation=dilation)), + nn.utils.parametrizations.weight_norm( + nn.Conv1d(channel, channel, kernel_size=kernel, dilation=dilation) + ), nn.LeakyReLU(0.2), nn.ReflectionPad1d(padding), - nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)), + nn.utils.parametrizations.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)), ] self.resstack = nn.Sequential(*resstack) - self.shortcut = nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)) + self.shortcut = nn.utils.parametrizations.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)) def forward(self, x): x1 = self.shortcut(x) @@ -25,13 +28,13 @@ def forward(self, x): return x1 + x2 def remove_weight_norm(self): - nn.utils.remove_weight_norm(self.shortcut) - nn.utils.remove_weight_norm(self.resstack[2]) - nn.utils.remove_weight_norm(self.resstack[5]) - nn.utils.remove_weight_norm(self.resstack[8]) - nn.utils.remove_weight_norm(self.resstack[11]) - nn.utils.remove_weight_norm(self.resstack[14]) - nn.utils.remove_weight_norm(self.resstack[17]) + remove_parametrizations(self.shortcut, "weight") + remove_parametrizations(self.resstack[2], "weight") + remove_parametrizations(self.resstack[5], "weight") + remove_parametrizations(self.resstack[8], "weight") + remove_parametrizations(self.resstack[11], "weight") + remove_parametrizations(self.resstack[14], "weight") + remove_parametrizations(self.resstack[17], "weight") class MRF(nn.Module): diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index befc43cca6..74cfc7262b 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -195,10 +195,10 @@ def _apply_D_loss(scores_fake, scores_real, loss_func): if isinstance(scores_fake, list): # multi-scale loss for score_fake, score_real in zip(scores_fake, scores_real): - total_loss, real_loss, fake_loss = loss_func(score_fake=score_fake, score_real=score_real) + total_loss, real_loss_, fake_loss_ = loss_func(score_fake=score_fake, score_real=score_real) loss += total_loss - real_loss += real_loss - fake_loss += fake_loss + real_loss += real_loss_ + fake_loss += fake_loss_ # normalize loss values with number of scales (discriminators) loss /= len(scores_fake) real_loss /= len(scores_real) diff --git a/TTS/vocoder/layers/melgan.py b/TTS/vocoder/layers/melgan.py index 4bb328e983..7ad41a0f78 100644 --- a/TTS/vocoder/layers/melgan.py +++ b/TTS/vocoder/layers/melgan.py @@ -1,5 +1,6 @@ from torch import nn -from torch.nn.utils import weight_norm +from torch.nn.utils.parametrizations import weight_norm +from torch.nn.utils.parametrize import remove_parametrizations class ResidualStack(nn.Module): @@ -27,7 +28,7 @@ def __init__(self, channels, num_res_blocks, kernel_size): ] self.shortcuts = nn.ModuleList( - [weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)) for i in range(num_res_blocks)] + [weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)) for _ in range(num_res_blocks)] ) def forward(self, x): @@ -37,6 +38,6 @@ def forward(self, x): def remove_weight_norm(self): for block, shortcut in zip(self.blocks, self.shortcuts): - 
nn.utils.remove_weight_norm(block[2]) - nn.utils.remove_weight_norm(block[4]) - nn.utils.remove_weight_norm(shortcut) + remove_parametrizations(block[2], "weight") + remove_parametrizations(block[4], "weight") + remove_parametrizations(shortcut, "weight") diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py index 24b905f994..9f1512c6d4 100644 --- a/TTS/vocoder/layers/wavegrad.py +++ b/TTS/vocoder/layers/wavegrad.py @@ -1,7 +1,8 @@ import torch import torch.nn.functional as F from torch import nn -from torch.nn.utils import weight_norm +from torch.nn.utils.parametrizations import weight_norm +from torch.nn.utils.parametrize import remove_parametrizations class Conv1d(nn.Conv1d): @@ -56,8 +57,8 @@ def forward(self, x, noise_scale): return shift, scale def remove_weight_norm(self): - nn.utils.remove_weight_norm(self.input_conv) - nn.utils.remove_weight_norm(self.output_conv) + remove_parametrizations(self.input_conv, "weight") + remove_parametrizations(self.output_conv, "weight") def apply_weight_norm(self): self.input_conv = weight_norm(self.input_conv) @@ -111,13 +112,13 @@ def forward(self, x, shift, scale): return o def remove_weight_norm(self): - nn.utils.remove_weight_norm(self.res_block) + remove_parametrizations(self.res_block, "weight") for _, layer in enumerate(self.main_block): if len(layer.state_dict()) != 0: - nn.utils.remove_weight_norm(layer) + remove_parametrizations(layer, "weight") for _, layer in enumerate(self.out_block): if len(layer.state_dict()) != 0: - nn.utils.remove_weight_norm(layer) + remove_parametrizations(layer, "weight") def apply_weight_norm(self): self.res_block = weight_norm(self.res_block) @@ -153,10 +154,10 @@ def forward(self, x): return o + res def remove_weight_norm(self): - nn.utils.remove_weight_norm(self.res_block) + remove_parametrizations(self.res_block, "weight") for _, layer in enumerate(self.main_block): if len(layer.state_dict()) != 0: - nn.utils.remove_weight_norm(layer) + remove_parametrizations(layer, "weight") def apply_weight_norm(self): self.res_block = weight_norm(self.res_block) diff --git a/TTS/vocoder/models/hifigan_discriminator.py b/TTS/vocoder/models/hifigan_discriminator.py index ca5eaf408c..7447a5fbc4 100644 --- a/TTS/vocoder/models/hifigan_discriminator.py +++ b/TTS/vocoder/models/hifigan_discriminator.py @@ -30,7 +30,7 @@ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): super().__init__() self.period = period get_padding = lambda k, d: int((k * d - d) / 2) - norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm + norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.parametrizations.weight_norm self.convs = nn.ModuleList( [ norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), @@ -125,7 +125,7 @@ class DiscriminatorS(torch.nn.Module): def __init__(self, use_spectral_norm=False): super().__init__() - norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm + norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.parametrizations.weight_norm self.convs = nn.ModuleList( [ norm_f(nn.Conv1d(1, 128, 15, 1, padding=7)), diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index 4916d1e697..9247532259 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -3,7 +3,8 @@ from torch import nn from torch.nn import Conv1d, ConvTranspose1d from torch.nn import functional as F -from 
torch.nn.utils import remove_weight_norm, weight_norm +from torch.nn.utils.parametrizations import weight_norm +from torch.nn.utils.parametrize import remove_parametrizations from TTS.utils.io import load_fsspec @@ -99,9 +100,9 @@ def forward(self, x): def remove_weight_norm(self): for l in self.convs1: - remove_weight_norm(l) + remove_parametrizations(l, "weight") for l in self.convs2: - remove_weight_norm(l) + remove_parametrizations(l, "weight") class ResBlock2(torch.nn.Module): @@ -155,7 +156,7 @@ def forward(self, x): def remove_weight_norm(self): for l in self.convs: - remove_weight_norm(l) + remove_parametrizations(l, "weight") class HifiganGenerator(torch.nn.Module): @@ -227,10 +228,10 @@ def __init__( self.cond_layer = nn.Conv1d(cond_channels, upsample_initial_channel, 1) if not conv_pre_weight_norm: - remove_weight_norm(self.conv_pre) + remove_parametrizations(self.conv_pre, "weight") if not conv_post_weight_norm: - remove_weight_norm(self.conv_post) + remove_parametrizations(self.conv_post, "weight") def forward(self, x, g=None): """ @@ -283,11 +284,11 @@ def inference(self, c): def remove_weight_norm(self): print("Removing weight norm...") for l in self.ups: - remove_weight_norm(l) + remove_parametrizations(l, "weight") for l in self.resblocks: l.remove_weight_norm() - remove_weight_norm(self.conv_pre) - remove_weight_norm(self.conv_post) + remove_parametrizations(self.conv_pre, "weight") + remove_parametrizations(self.conv_post, "weight") def load_checkpoint( self, config, checkpoint_path, eval=False, cache=False diff --git a/TTS/vocoder/models/melgan_discriminator.py b/TTS/vocoder/models/melgan_discriminator.py index 14f00c5927..e41467da3c 100644 --- a/TTS/vocoder/models/melgan_discriminator.py +++ b/TTS/vocoder/models/melgan_discriminator.py @@ -1,6 +1,6 @@ import numpy as np from torch import nn -from torch.nn.utils import weight_norm +from torch.nn.utils.parametrizations import weight_norm class MelganDiscriminator(nn.Module): diff --git a/TTS/vocoder/models/melgan_generator.py b/TTS/vocoder/models/melgan_generator.py index 989797f0b8..bb3fee789c 100644 --- a/TTS/vocoder/models/melgan_generator.py +++ b/TTS/vocoder/models/melgan_generator.py @@ -1,6 +1,6 @@ import torch from torch import nn -from torch.nn.utils import weight_norm +from torch.nn.utils.parametrizations import weight_norm from TTS.utils.io import load_fsspec from TTS.vocoder.layers.melgan import ResidualStack @@ -80,7 +80,7 @@ def remove_weight_norm(self): for _, layer in enumerate(self.layers): if len(layer.state_dict()) != 0: try: - nn.utils.remove_weight_norm(layer) + nn.utils.parametrize.remove_parametrizations(layer, "weight") except ValueError: layer.remove_weight_norm() diff --git a/TTS/vocoder/models/parallel_wavegan_discriminator.py b/TTS/vocoder/models/parallel_wavegan_discriminator.py index adf1bdaea0..d02af75f05 100644 --- a/TTS/vocoder/models/parallel_wavegan_discriminator.py +++ b/TTS/vocoder/models/parallel_wavegan_discriminator.py @@ -2,6 +2,7 @@ import torch from torch import nn +from torch.nn.utils.parametrize import remove_parametrizations from TTS.vocoder.layers.parallel_wavegan import ResidualBlock @@ -68,7 +69,7 @@ def forward(self, x): def apply_weight_norm(self): def _apply_weight_norm(m): if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): - torch.nn.utils.weight_norm(m) + torch.nn.utils.parametrizations.weight_norm(m) self.apply(_apply_weight_norm) @@ -76,7 +77,7 @@ def remove_weight_norm(self): def _remove_weight_norm(m): try: # print(f"Weight norm is removed from {m}.") - 
nn.utils.remove_weight_norm(m) + remove_parametrizations(m, "weight") except ValueError: # this module didn't have weight norm return @@ -171,7 +172,7 @@ def forward(self, x): def apply_weight_norm(self): def _apply_weight_norm(m): if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): - torch.nn.utils.weight_norm(m) + torch.nn.utils.parametrizations.weight_norm(m) self.apply(_apply_weight_norm) @@ -179,7 +180,7 @@ def remove_weight_norm(self): def _remove_weight_norm(m): try: print(f"Weight norm is removed from {m}.") - nn.utils.remove_weight_norm(m) + remove_parametrizations(m, "weight") except ValueError: # this module didn't have weight norm return diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py index 5587fb7264..8338d94653 100644 --- a/TTS/vocoder/models/parallel_wavegan_generator.py +++ b/TTS/vocoder/models/parallel_wavegan_generator.py @@ -2,6 +2,7 @@ import numpy as np import torch +from torch.nn.utils.parametrize import remove_parametrizations from TTS.utils.io import load_fsspec from TTS.vocoder.layers.parallel_wavegan import ResidualBlock @@ -126,7 +127,7 @@ def remove_weight_norm(self): def _remove_weight_norm(m): try: # print(f"Weight norm is removed from {m}.") - torch.nn.utils.remove_weight_norm(m) + remove_parametrizations(m, "weight") except ValueError: # this module didn't have weight norm return @@ -135,7 +136,7 @@ def _remove_weight_norm(m): def apply_weight_norm(self): def _apply_weight_norm(m): if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): - torch.nn.utils.weight_norm(m) + torch.nn.utils.parametrizations.weight_norm(m) # print(f"Weight norm is applied to {m}.") self.apply(_apply_weight_norm) diff --git a/TTS/vocoder/models/univnet_discriminator.py b/TTS/vocoder/models/univnet_discriminator.py index 4c09520c2a..497d67ac76 100644 --- a/TTS/vocoder/models/univnet_discriminator.py +++ b/TTS/vocoder/models/univnet_discriminator.py @@ -1,7 +1,8 @@ import torch import torch.nn.functional as F from torch import nn -from torch.nn.utils import spectral_norm, weight_norm +from torch.nn.utils import spectral_norm +from torch.nn.utils.parametrizations import weight_norm from TTS.utils.audio.torch_transforms import TorchSTFT from TTS.vocoder.models.hifigan_discriminator import MultiPeriodDiscriminator diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py index 2ee28c7b85..5e66b70df8 100644 --- a/TTS/vocoder/models/univnet_generator.py +++ b/TTS/vocoder/models/univnet_generator.py @@ -3,6 +3,7 @@ import numpy as np import torch import torch.nn.functional as F +from torch.nn.utils import parametrize from TTS.vocoder.layers.lvc_block import LVCBlock @@ -113,7 +114,7 @@ def remove_weight_norm(self): def _remove_weight_norm(m): try: # print(f"Weight norm is removed from {m}.") - torch.nn.utils.remove_weight_norm(m) + parametrize.remove_parametrizations(m, "weight") except ValueError: # this module didn't have weight norm return @@ -124,7 +125,7 @@ def apply_weight_norm(self): def _apply_weight_norm(m): if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): - torch.nn.utils.weight_norm(m) + torch.nn.utils.parametrizations.weight_norm(m) # print(f"Weight norm is applied to {m}.") self.apply(_apply_weight_norm) diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index a0f9221a8f..c1166e0914 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -5,7 +5,8 @@ import torch from coqpit import Coqpit from torch import nn -from 
torch.nn.utils import weight_norm +from torch.nn.utils.parametrizations import weight_norm +from torch.nn.utils.parametrize import remove_parametrizations from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler from trainer.trainer_utils import get_optimizer, get_scheduler @@ -178,27 +179,27 @@ def remove_weight_norm(self): for _, layer in enumerate(self.dblocks): if len(layer.state_dict()) != 0: try: - nn.utils.remove_weight_norm(layer) + remove_parametrizations(layer, "weight") except ValueError: layer.remove_weight_norm() for _, layer in enumerate(self.film): if len(layer.state_dict()) != 0: try: - nn.utils.remove_weight_norm(layer) + remove_parametrizations(layer, "weight") except ValueError: layer.remove_weight_norm() for _, layer in enumerate(self.ublocks): if len(layer.state_dict()) != 0: try: - nn.utils.remove_weight_norm(layer) + remove_parametrizations(layer, "weight") except ValueError: layer.remove_weight_norm() - nn.utils.remove_weight_norm(self.x_conv) - nn.utils.remove_weight_norm(self.out_conv) - nn.utils.remove_weight_norm(self.y_conv) + remove_parametrizations(self.x_conv, "weight") + remove_parametrizations(self.out_conv, "weight") + remove_parametrizations(self.y_conv, "weight") def apply_weight_norm(self): for _, layer in enumerate(self.dblocks): diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 903f4b7e63..7f74ba3ebf 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -13,6 +13,7 @@ from TTS.tts.utils.visual import plot_spectrogram from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.numpy_transforms import mulaw_decode from TTS.utils.io import load_fsspec from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.layers.losses import WaveRNNLoss @@ -399,7 +400,7 @@ def inference(self, mels, batched=None, target=None, overlap=None): output = output[0] if self.args.mulaw and isinstance(self.args.mode, int): - output = AudioProcessor.mulaw_decode(output, self.args.mode) + output = mulaw_decode(wav=output, mulaw_qc=self.args.mode) # Fade-out at the end to avoid signal cutting out suddenly fade_out = np.linspace(1, 0, 20 * self.config.audio.hop_length) diff --git a/dockerfiles/Dockerfile.dev b/dockerfiles/Dockerfile.dev new file mode 100644 index 0000000000..58baee53e2 --- /dev/null +++ b/dockerfiles/Dockerfile.dev @@ -0,0 +1,44 @@ +ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04 +FROM ${BASE} + +# Install OS dependencies: +RUN apt-get update && apt-get upgrade -y +RUN apt-get install -y --no-install-recommends \ + gcc g++ \ + make \ + python3 python3-dev python3-pip python3-venv python3-wheel \ + espeak-ng libsndfile1-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install Major Python Dependencies: +RUN pip3 install llvmlite --ignore-installed +RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 +RUN rm -rf /root/.cache/pip + +WORKDIR /root + +# Copy Dependency Lock Files: +COPY \ + Makefile \ + pyproject.toml \ + setup.py \ + requirements.dev.txt \ + requirements.ja.txt \ + requirements.notebooks.txt \ + requirements.txt \ + /root/ + +# Install Project Dependencies +# Separate stage to limit re-downloading: +RUN pip install \ + -r requirements.txt \ + -r requirements.dev.txt \ + -r requirements.ja.txt \ + -r requirements.notebooks.txt + +# Copy TTS repository contents: +COPY . 
/root + +# Installing the TTS package itself: +RUN make install + diff --git a/docs/source/configuration.md b/docs/source/configuration.md index cde7e073e9..ada61e16db 100644 --- a/docs/source/configuration.md +++ b/docs/source/configuration.md @@ -56,4 +56,4 @@ ModelConfig() In the example above, ```ModelConfig()``` is the final configuration that the model receives and it has all the fields necessary for the model. -We host pre-defined model configurations under ```TTS/<model_class>/configs/```.Although we recommend a unified config class, you can decompose it as you like as for your custom models as long as all the fields for the trainer, model, and inference APIs are provided. \ No newline at end of file +We host pre-defined model configurations under ```TTS/<model_class>/configs/```. Although we recommend a unified config class, you can decompose it as you like for your custom models as long as all the fields for the trainer, model, and inference APIs are provided. diff --git a/docs/source/finetuning.md b/docs/source/finetuning.md index c236260d0c..069f565137 100644 --- a/docs/source/finetuning.md +++ b/docs/source/finetuning.md @@ -21,7 +21,7 @@ them and fine-tune it for your own dataset. This will help you in two main ways: Fine-tuning comes to the rescue in this case. You can take one of our pre-trained models and fine-tune it on your own speech dataset and achieve reasonable results with only a couple of hours of data. - However, note that, fine-tuning does not ensure great results. The model performance is still depends on the + However, note that fine-tuning does not ensure great results. The model performance still depends on the {ref}`dataset quality ` and the hyper-parameters you choose for fine-tuning. Therefore, it still takes a bit of tinkering. @@ -41,7 +41,7 @@ them and fine-tune it for your own dataset. This will help you in two main ways: tts --list_models ``` - The command above lists the the models in a naming format as ```<model_type>/<language>/<dataset>/<model_name>```. + The command above lists the models in a naming format as ```<model_type>/<language>/<dataset>/<model_name>```. Or you can manually check the `.model.json` file in the project directory. diff --git a/docs/source/formatting_your_dataset.md b/docs/source/formatting_your_dataset.md index 796c7b6d06..23c497d0bf 100644 --- a/docs/source/formatting_your_dataset.md +++ b/docs/source/formatting_your_dataset.md @@ -7,7 +7,7 @@ If you have a single audio file and you need to split it into clips, there are d It is also important to use a lossless audio file format to prevent compression artifacts. We recommend using `wav` file format. -Let's assume you created the audio clips and their transcription. You can collect all your clips under a folder. Let's call this folder `wavs`. +Let's assume you created the audio clips and their transcription. You can collect all your clips in a folder. Let's call this folder `wavs`. ``` /wavs - audio1.wav - audio2.wav - audio3.wav ... ``` -You can either create separate transcription files for each clip or create a text file that maps each audio clip to its transcription. In this file, each column must be delimitered by a special character separating the audio file name, the transcription and the normalized transcription. And make sure that the delimiter is not used in the transcription text. +You can either create separate transcription files for each clip or create a text file that maps each audio clip to its transcription.
In this file, each column must be delimited by a special character separating the audio file name, the transcription and the normalized transcription. And make sure that the delimiter is not used in the transcription text. We recommend the following format delimited by `|`. In the following example, `audio1`, `audio2` refer to files `audio1.wav`, `audio2.wav` etc. @@ -55,7 +55,7 @@ For more info about dataset qualities and properties check our [post](https://gi After you collect and format your dataset, you need to check two things. Whether you need a `formatter` and a `text_cleaner`. The `formatter` loads the text file (created above) as a list and the `text_cleaner` performs a sequence of text normalization operations that converts the raw text into the spoken representation (e.g. converting numbers to text, acronyms, and symbols to the spoken format). -If you use a different dataset format then the LJSpeech or the other public datasets that 🐸TTS supports, then you need to write your own `formatter`. +If you use a different dataset format than the LJSpeech or the other public datasets that 🐸TTS supports, then you need to write your own `formatter`. If your dataset is in a new language or it needs special normalization steps, then you need a new `text_cleaner`. diff --git a/docs/source/implementing_a_new_language_frontend.md b/docs/source/implementing_a_new_language_frontend.md index f4f6a04a5f..2041352d64 100644 --- a/docs/source/implementing_a_new_language_frontend.md +++ b/docs/source/implementing_a_new_language_frontend.md @@ -2,11 +2,11 @@ - Language frontends are located under `TTS.tts.utils.text` - Each special language has a separate folder. -- Each folder containst all the utilities for processing the text input. +- Each folder contains all the utilities for processing the text input. - `TTS.tts.utils.text.phonemizers` contains the main phonemizer for a language. This is the class that uses the utilities from the previous step and used to convert the text to phonemes or graphemes for the model. - After you implement your phonemizer, you need to add it to the `TTS/tts/utils/text/phonemizers/__init__.py` to be able to map the language code in the model config - `config.phoneme_language` - to the phonemizer class and initiate the phonemizer automatically. - You should also add tests to `tests/text_tests` if you want to make a PR. -We suggest you to check the available implementations as reference. Good luck! \ No newline at end of file +We suggest you check the available implementations as a reference. Good luck! diff --git a/docs/source/implementing_a_new_model.md b/docs/source/implementing_a_new_model.md index e2a0437e9a..1bf7a8822e 100644 --- a/docs/source/implementing_a_new_model.md +++ b/docs/source/implementing_a_new_model.md @@ -145,7 +145,7 @@ class MyModel(BaseTTS): Args: ap (AudioProcessor): audio processor used at training. batch (Dict): Model inputs used at the previous training step. - outputs (Dict): Model outputs generated at the previoud training step. + outputs (Dict): Model outputs generated at the previous training step. Returns: Tuple[Dict, np.ndarray]: training plots and output waveform. @@ -183,7 +183,7 @@ class MyModel(BaseTTS): ...
def get_optimizer(self) -> Union["Optimizer", List["Optimizer"]]: - """Setup an return optimizer or optimizers.""" + """Set up and return the optimizer or optimizers.""" pass def get_lr(self) -> Union[float, List[float]]: diff --git a/docs/source/inference.md b/docs/source/inference.md index 4de9ecdd14..611a2445bf 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -124,7 +124,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu" print(TTS().list_models()) # Init TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) # Run TTS # ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language @@ -198,19 +198,12 @@ from TTS.api import CS_API # Init 🐸 Coqui Studio API # you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument. -# XTTS - Best quality and life-like speech in EN +# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. api = CS_API(api_token=<token>, model="XTTS") api.speakers # all the speakers are available with all the models. api.list_speakers() api.list_voices() -wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5) - -# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon) -api = CS_API(api_token=<token>, model="XTTS-multilingual") -api.speakers -api.list_speakers() -api.list_voices() -wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5) +wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", language="en", speed=1.5) # V1 - Fast and lightweight TTS in EN with emotion control. api = CS_API(api_token=<token>, model="V1") @@ -238,4 +231,4 @@ api.tts_with_vc_to_file( speaker_wav="target/speaker.wav", file_path="ouptut.wav" ) -``` \ No newline at end of file +``` diff --git a/docs/source/marytts.md b/docs/source/marytts.md index 81d547107d..9091ca330f 100644 --- a/docs/source/marytts.md +++ b/docs/source/marytts.md @@ -2,13 +2,13 @@ ## What is Mary-TTS? -[Mary (Modular Architecture for Research in sYynthesis) Text-to-Speech](http://mary.dfki.de/) is an open-source (GNU LGPL license), multilingual Text-to-Speech Synthesis platform written in Java. It was originally developed as a collaborative project of [DFKI’s](http://www.dfki.de/web) Language Technology Lab and the [Institute of Phonetics](http://www.coli.uni-saarland.de/groups/WB/Phonetics/) at Saarland University, Germany. It is now maintained by the Multimodal Speech Processing Group in the [Cluster of Excellence MMCI](https://www.mmci.uni-saarland.de/) and DFKI. +[Mary (Modular Architecture for Research in sYnthesis) Text-to-Speech](http://mary.dfki.de/) is an open-source (GNU LGPL license), multilingual Text-to-Speech Synthesis platform written in Java. It was originally developed as a collaborative project of [DFKI’s](http://www.dfki.de/web) Language Technology Lab and the [Institute of Phonetics](http://www.coli.uni-saarland.de/groups/WB/Phonetics/) at Saarland University, Germany. It is now maintained by the Multimodal Speech Processing Group in the [Cluster of Excellence MMCI](https://www.mmci.uni-saarland.de/) and DFKI. MaryTTS has been around for a very! long time.
Version 3.0 even dates back to 2006, long before Deep Learning was a broadly known term and the last official release was version 5.2 in 2016. You can check out this OpenVoice-Tech page to learn more: https://openvoice-tech.net/index.php/MaryTTS ## Why Mary-TTS compatibility is relevant -Due to it's open-source nature, relatively high quality voices and fast synthetization speed Mary-TTS was a popular choice in the past and many tools implemented API support over the years like screen-readers (NVDA + SpeechHub), smart-home HUBs (openHAB, Home Assistant) or voice assistants (Rhasspy, Mycroft, SEPIA). A compatibility layer for Coqui-TTS will ensure that these tools can use Coqui as a drop-in replacement and get even better voices right away. +Due to its open-source nature, relatively high-quality voices, and fast synthesis speed, Mary-TTS was a popular choice in the past, and many tools implemented API support over the years, such as screen-readers (NVDA + SpeechHub), smart-home HUBs (openHAB, Home Assistant) or voice assistants (Rhasspy, Mycroft, SEPIA). A compatibility layer for Coqui-TTS will ensure that these tools can use Coqui as a drop-in replacement and get even better voices right away. ## API and code examples @@ -40,4 +40,4 @@ You can enter the same URLs in your browser and check-out the results there as w ### How it works and limitations A classic Mary-TTS server would usually show all installed locales and voices via the corresponding endpoints and accept the parameters `LOCALE` and `VOICE` for processing. For Coqui-TTS we usually start the server with one specific locale and model and thus cannot return all available options. Instead we return the active locale and use the model name as "voice". Since we only have one active model and always want to return a WAV-file, we currently ignore all other processing parameters except `INPUT_TEXT`. Since the gender is not defined for models in Coqui-TTS we always return `u` (undefined). -We think that this is an acceptable compromise, since users are often only interested in one specific voice anyways, but the API might get extended in the future to support multiple languages and voices at the same time. \ No newline at end of file +We think that this is an acceptable compromise, since users are often only interested in one specific voice anyway, but the API might get extended in the future to support multiple languages and voices at the same time. diff --git a/docs/source/models/tortoise.md b/docs/source/models/tortoise.md index 2df6da7649..1a8e9ca8e9 100644 --- a/docs/source/models/tortoise.md +++ b/docs/source/models/tortoise.md @@ -1,6 +1,6 @@ # 🐢 Tortoise Tortoise is a very expressive TTS system with impressive voice cloning capabilities. It is based on an GPT like autogressive acoustic model that converts input -text to discritized acouistic tokens, a diffusion model that converts these tokens to melspeectrogram frames and a Univnet vocoder to convert the spectrograms to +text to discretized acoustic tokens, a diffusion model that converts these tokens to melspectrogram frames and a Univnet vocoder to convert the spectrograms to the final audio signal. The important downside is that Tortoise is very slow compared to the parallel TTS models like VITS. Big thanks to 👑[@manmay-nakhashi](https://github.com/manmay-nakhashi) who helped us implement Tortoise in 🐸TTS.
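> Note: the `weight_norm` changes that run through the FreeVC and vocoder hunks above are all the same PyTorch migration: `torch.nn.utils.weight_norm`/`remove_weight_norm` are deprecated in favor of the parametrization API. A minimal sketch of the new pattern (the `Conv1d` layer here is illustrative, not taken from the repo):

```python
import torch
from torch import nn
from torch.nn.utils.parametrizations import weight_norm
from torch.nn.utils.parametrize import remove_parametrizations

# New-style weight norm: the weight is recomputed from its magnitude and
# direction components on every forward pass, and the parametrization is
# stored under conv.parametrizations.weight.
conv = weight_norm(nn.Conv1d(1, 32, kernel_size=3), name="weight")
y = conv(torch.randn(4, 1, 100))

# Equivalent of the old remove_weight_norm(): bake the currently computed
# value back into a plain conv.weight tensor (typically before inference).
remove_parametrizations(conv, "weight")
```

This is why the diff passes the explicit `"weight"` tensor name everywhere: `remove_parametrizations` operates on a named parametrized tensor rather than on the module as a whole.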
diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md
index ff6bcf974a..e5da50ff4e 100644
--- a/docs/source/models/xtts.md
+++ b/docs/source/models/xtts.md
@@ -7,17 +7,24 @@ This is the same model that powers [Coqui Studio](https://coqui.ai/), and [Coqui
 a few tricks to make it faster and support streaming inference.
 
 ### Features
-- Voice cloning with just a 3-second audio clip.
+- Voice cloning.
 - Cross-language voice cloning.
 - Multi-lingual speech generation.
 - 24khz sampling rate.
+- Streaming inference with < 200ms latency. (See [Streaming inference](#streaming-inference))
+- Fine-tuning support. (See [Training](#training))
+
+### Updates with v2
+- Improved voice cloning.
+- Voices can be cloned with a single audio file or multiple audio files, without any effect on the runtime.
+- 2 new languages: Hungarian and Korean.
+- Across-the-board quality improvements.
 
 ### Code
 Current implementation only supports inference.
 
 ### Languages
-As of now, XTTS-v1 supports 13 languages: English, Spanish, French, German, Italian, Portuguese,
-Polish, Turkish, Russian, Dutch, Czech, Arabic, and Chinese (Simplified).
+As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko).
 
 Stay tuned as we continue to add support for more languages. If you have any language requests, please feel free to reach out.
 
@@ -31,33 +38,124 @@ You can also mail us at info@coqui.ai.
 ### Inference
 
 #### 🐸TTS API
+##### Single reference
+
+Splits the text into sentences and generates audio for each sentence. The audio files are then concatenated to produce the final audio.
+You can optionally disable sentence splitting for better coherence, at the cost of more VRAM and of possibly hitting the model's context length limit.
+
 ```python
 from TTS.api import TTS
-tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True)
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
 
 # generate speech by cloning a voice using default settings
 tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                 file_path="output.wav",
-                speaker_wav="/path/to/target/speaker.wav",
+                speaker_wav=["/path/to/target/speaker.wav"],
+                language="en",
+                split_sentences=True
+                )
+```
+
+##### Multiple references
+
+You can pass multiple audio files to the `speaker_wav` argument for better voice cloning.
+
+```python
+from TTS.api import TTS
+
+# using the default version set in 🐸TTS
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
+
+# using a specific version
+# 👀 see the branch names for versions on https://huggingface.co/coqui/XTTS-v2/tree/main
+# ❗some versions might be incompatible with the API
+tts = TTS("xtts_v2.0.2", gpu=True)
+
+# getting the latest XTTS_v2
+tts = TTS("xtts", gpu=True)
+
+# generate speech by cloning a voice using default settings
+tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+                file_path="output.wav",
+                speaker_wav=["/path/to/target/speaker.wav", "/path/to/target/speaker_2.wav", "/path/to/target/speaker_3.wav"],
                 language="en")
 ```
 
+##### Streaming inference
+
+XTTS supports streaming inference. This is useful for real-time applications.
+
+```python
+import time
+import torch
+import torchaudio
+from TTS.api import TTS
+
+print("Loading model...")
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
+model = tts.synthesizer.tts_model
+
+print("Computing speaker latents...")
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
+
+print("Inference...")
+t0 = time.time()
+stream_generator = model.inference_stream(
+    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+    "en",
+    gpt_cond_latent,
+    speaker_embedding
+)
+
+wav_chunks = []
+for i, chunk in enumerate(stream_generator):
+    if i == 0:
+        print(f"Time to first chunk: {time.time() - t0}")
+    print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+    wav_chunks.append(chunk)
+wav = torch.cat(wav_chunks, dim=0)
+torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
+```
+
 #### 🐸TTS Command line
+##### Single reference
 ```console
- tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 \
+ tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
      --text "Bugün okula gitmek istemiyorum." \
      --speaker_wav /path/to/target/speaker.wav \
      --language_idx tr \
      --use_cuda true
 ```
-#### model directly
+##### Multiple references
+```console
+ tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
+    --text "Bugün okula gitmek istemiyorum." \
+    --speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \
+    --language_idx tr \
+    --use_cuda true
+```
+or for all wav files in a directory you can use:
+
+```console
+ tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
+    --text "Bugün okula gitmek istemiyorum." \
+    --speaker_wav /path/to/target/*.wav \
+    --language_idx tr \
+    --use_cuda true
+```
+
+#### 🐸TTS Model API
+
+To use the model API, you need to download the model files and pass config and model file paths manually.
 
-If you want to be able to run with `use_deepspeed=True` and enjoy the speedup, you need to install deepspeed first.
+##### Calling manually
+
+If you want to be able to run with `use_deepspeed=True` and **enjoy the speedup**, you need to install deepspeed first.
 
 ```console
-pip install deepspeed==0.8.3
+pip install deepspeed==0.10.3
 ```
 
 ```python
@@ -73,9 +171,9 @@ config.load_json("/path/to/xtts/config.json")
 model = Xtts.init_from_config(config)
 model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=True)
 model.cuda()
- 
+
 print("Computing speaker latents...")
-gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path="reference.wav")
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
 
 print("Inference...")
 out = model.inference(
@@ -83,14 +181,13 @@ out = model.inference(
     "en",
     gpt_cond_latent,
     speaker_embedding,
-    diffusion_conditioning,
     temperature=0.7, # Add custom parameters here
 )
 torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
 ```
 
-#### streaming inference
+##### Streaming manually
 
 Here the goal is to stream the audio as it is being generated. This is useful for real-time applications.
 Streaming inference is typically slower than regular inference, but it allows to get a first chunk of audio faster.
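A side note on the streaming snippets above and below: the generator returned by `inference_stream` can also feed playback directly, instead of being collected and saved to a file. A hedged sketch, assuming the optional `sounddevice` package (not a 🐸TTS dependency) and `model`, `gpt_cond_latent`, and `speaker_embedding` prepared as in the snippets above:

```python
# Sketch: play XTTS streaming chunks as they are produced, so audio starts
# before synthesis finishes. `sounddevice` is an assumed extra dependency.
import sounddevice as sd

stream = sd.OutputStream(samplerate=24000, channels=1, dtype="float32")
stream.start()
for chunk in model.inference_stream(
    "Streaming playback can begin before synthesis finishes.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
):
    # each chunk is a torch tensor; move it to host memory before writing it out
    stream.write(chunk.squeeze().cpu().numpy())
stream.stop()
stream.close()
```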
@@ -112,7 +209,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
 model.cuda()
 
 print("Computing speaker latents...")
-gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path="reference.wav")
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
 
 print("Inference...")
 t0 = time.time()
@@ -122,7 +219,7 @@ chunks = model.inference_stream(
     gpt_cond_latent,
     speaker_embedding
 )
- 
+
 wav_chuncks = []
 for i, chunk in enumerate(chunks):
     if i == 0:
@@ -134,13 +231,108 @@ torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
 ```
 
-## Important resources & papers
+### Training
+
+#### Easy training
+To make `XTTS_v2` GPT encoder training easier for beginner users, we built a Gradio demo that implements the whole fine-tuning pipeline. The Gradio demo enables the user to easily do the following steps:
+
+- Preprocess the uploaded audio file or files with the 🐸 TTS Coqui formatter
+- Train the XTTS GPT encoder with the processed data
+- Run inference with the fine-tuned model
+
+The user can run this Gradio demo locally or remotely using a Colab Notebook.
+
+##### Run demo on Colab
+To make the `XTTS_v2` fine-tuning more accessible for users who do not have good GPUs available, we also provide a Google Colab Notebook.
+
+The Colab Notebook is available [here](https://colab.research.google.com/drive/1GiI4_X724M8q2W-zZ-jXo7cWTV7RfaH-?usp=sharing).
+
+To learn how to use this Colab Notebook, please check the [XTTS fine-tuning video]().
+
+If you are not able to access the video, follow these steps:
+
+1. Open the Colab notebook and start the demo by running the first two cells (ignore pip install errors in the first one).
+2. Click on the link "Running on public URL:" on the second cell output.
+3. On the first Tab (1 - Data processing) select the audio file or files, wait for the upload, and then click on the button "Step 1 - Create dataset" and wait until the dataset processing is done.
+4. As soon as the dataset processing is done, go to the second Tab (2 - Fine-tuning XTTS Encoder) and press the button "Step 2 - Run the training" and then wait until the training is finished. Note that it can take up to 40 minutes.
+5. As soon as the training is done, go to the third Tab (3 - Inference) and click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. Then you can run inference with the model by clicking on the button "Step 4 - Inference".
+
+
+##### Run demo locally
+
+To run the demo locally you need to do the following steps:
+1. Install 🐸 TTS following the instructions available [here](https://tts.readthedocs.io/en/dev/installation.html#installation).
+2. Install the Gradio demo requirements with the command `python3 -m pip install -r TTS/demos/xtts_ft_demo/requirements.txt`
+3. Run the Gradio demo using the command `python3 TTS/demos/xtts_ft_demo/xtts_demo.py`
+4. Follow the steps presented in the [tutorial video](https://www.youtube.com/watch?v=8tpDiiouGxc&feature=youtu.be) to be able to fine-tune and test the fine-tuned model.
+
+
+If you are not able to access the video, here is what you need to do:
+
+1. On the first Tab (1 - Data processing) select the audio file or files and wait for the upload.
+2. Click on the button "Step 1 - Create dataset" and then wait until the dataset processing is done.
+3. Go to the second Tab (2 - Fine-tuning XTTS Encoder) and press the button "Step 2 - Run the training" and then wait until the training is finished. It will take some time.
+4. Go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded.
+5. Now you can run inference with the model by clicking on the button "Step 4 - Inference".
+
+#### Advanced training
+
+A recipe for `XTTS_v2` GPT encoder training using the `LJSpeech` dataset is available at https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech/xtts_v1/train_gpt_xtts.py
+
+You need to change the fields of the `BaseDatasetConfig` to match your dataset and then update the `GPTArgs` and `GPTTrainerConfig` fields as needed. By default, it will use the same parameters that the XTTS v1.1 model was trained with. To speed up model convergence, it will also download the XTTS v1.1 checkpoint and load it by default.
+
+After training you can do inference following the code below.
+
+```python
+import os
+import torch
+import torchaudio
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+
+# Add here the xtts_config path
+CONFIG_PATH = "recipes/ljspeech/xtts_v1/run/training/GPT_XTTS_LJSpeech_FT-October-23-2023_10+36AM-653f2e75/config.json"
+# Add here the vocab file that you have used to train the model
+TOKENIZER_PATH = "recipes/ljspeech/xtts_v1/run/training/XTTS_v2_original_model_files/vocab.json"
+# Add here the checkpoint that you want to do inference with
+XTTS_CHECKPOINT = "recipes/ljspeech/xtts_v1/run/training/GPT_XTTS_LJSpeech_FT/best_model.pth"
+# Add here the speaker reference
+SPEAKER_REFERENCE = "LjSpeech_reference.wav"
+
+# output wav path
+OUTPUT_WAV_PATH = "xtts-ft.wav"
+
+print("Loading model...")
+config = XttsConfig()
+config.load_json(CONFIG_PATH)
+model = Xtts.init_from_config(config)
+model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENIZER_PATH, use_deepspeed=False)
+model.cuda()
+
+print("Computing speaker latents...")
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
+
+print("Inference...")
+out = model.inference(
+    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+    "en",
+    gpt_cond_latent,
+    speaker_embedding,
+    temperature=0.7, # Add custom parameters here
+)
+torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000)
+```
+
+
+
+## References and Acknowledgements
 - VallE: https://arxiv.org/abs/2301.02111
 - Tortoise Repo: https://github.com/neonbjb/tortoise-tts
 - Faster implementation: https://github.com/152334H/tortoise-tts-fast
 - Univnet: https://arxiv.org/abs/2106.07889
 - Latent Diffusion:https://arxiv.org/abs/2112.10752
 - DALL-E: https://arxiv.org/abs/2102.12092
+- Perceiver: https://arxiv.org/abs/2103.03206
 
 ## XttsConfig
diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb
index 9acc9929fc..0ec5f167b4 100644
--- a/notebooks/ExtractTTSpectrogram.ipynb
+++ b/notebooks/ExtractTTSpectrogram.ipynb
@@ -13,23 +13,28 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
-    "import sys\n",
-    "import torch\n",
     "import importlib\n",
+    "import os\n",
+    "import pickle\n",
+    "\n",
     "import numpy as np\n",
-    "from tqdm import tqdm\n",
-    "from torch.utils.data import DataLoader\n",
     "import soundfile as sf\n",
-    "import pickle\n",
+    "import torch\n",
+    "from matplotlib import pylab as plt\n",
+    "from torch.utils.data 
import DataLoader\n", + "from tqdm import tqdm\n", + "\n", + "from TTS.config import load_config\n", + "from TTS.tts.configs.shared_configs import BaseDatasetConfig\n", + "from TTS.tts.datasets import load_tts_samples\n", "from TTS.tts.datasets.dataset import TTSDataset\n", "from TTS.tts.layers.losses import L1LossMasked\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.config import load_config\n", - "from TTS.tts.utils.visual import plot_spectrogram\n", - "from TTS.tts.utils.helpers import sequence_mask\n", "from TTS.tts.models import setup_model\n", - "from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n", + "from TTS.tts.utils.helpers import sequence_mask\n", + "from TTS.tts.utils.text.tokenizer import TTSTokenizer\n", + "from TTS.tts.utils.visual import plot_spectrogram\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.utils.audio.numpy_transforms import quantize\n", "\n", "%matplotlib inline\n", "\n", @@ -49,11 +54,9 @@ " file_name = wav_file.split('.')[0]\n", " os.makedirs(os.path.join(out_path, \"quant\"), exist_ok=True)\n", " os.makedirs(os.path.join(out_path, \"mel\"), exist_ok=True)\n", - " os.makedirs(os.path.join(out_path, \"wav_gl\"), exist_ok=True)\n", " wavq_path = os.path.join(out_path, \"quant\", file_name)\n", " mel_path = os.path.join(out_path, \"mel\", file_name)\n", - " wav_path = os.path.join(out_path, \"wav_gl\", file_name)\n", - " return file_name, wavq_path, mel_path, wav_path" + " return file_name, wavq_path, mel_path" ] }, { @@ -65,14 +68,14 @@ "# Paths and configurations\n", "OUT_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/specs2/\"\n", "DATA_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/\"\n", + "PHONEME_CACHE_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/phoneme_cache\"\n", "DATASET = \"ljspeech\"\n", "METADATA_FILE = \"metadata.csv\"\n", "CONFIG_PATH = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/config.json\"\n", "MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth\"\n", "BATCH_SIZE = 32\n", "\n", - "QUANTIZED_WAV = False\n", - "QUANTIZE_BIT = None\n", + "QUANTIZE_BITS = 0 # if non-zero, quantize wav files with the given number of bits\n", "DRY_RUN = False # if False, does not generate output files, only computes loss and visuals.\n", "\n", "# Check CUDA availability\n", @@ -80,10 +83,10 @@ "print(\" > CUDA enabled: \", use_cuda)\n", "\n", "# Load the configuration\n", + "dataset_config = BaseDatasetConfig(formatter=DATASET, meta_file_train=METADATA_FILE, path=DATA_PATH)\n", "C = load_config(CONFIG_PATH)\n", "C.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! 
disable to align mel specs with the wav files\n",
-    "ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)\n",
-    "print(C['r'])"
+    "ap = AudioProcessor(**C.audio)"
   ]
  },
 {
@@ -92,12 +95,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# If the vocabulary was passed, replace the default\n",
-    "if 'characters' in C and C['characters']:\n",
-    "    symbols, phonemes = make_symbols(**C.characters)\n",
+    "# Initialize the tokenizer\n",
+    "tokenizer, C = TTSTokenizer.init_from_config(C)\n",
     "\n",
     "# Load the model\n",
-    "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",
     "# TODO: multiple speakers\n",
     "model = setup_model(C)\n",
     "model.load_checkpoint(C, MODEL_FILE, eval=True)"
   ]
  },
 {
@@ -109,42 +110,21 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Load the preprocessor based on the dataset\n",
-    "preprocessor = importlib.import_module(\"TTS.tts.datasets.formatters\")\n",
-    "preprocessor = getattr(preprocessor, DATASET.lower())\n",
-    "meta_data = preprocessor(DATA_PATH, METADATA_FILE)\n",
+    "# Load data instances\n",
+    "meta_data_train, meta_data_eval = load_tts_samples(dataset_config)\n",
+    "meta_data = meta_data_train + meta_data_eval\n",
+    "\n",
     "dataset = TTSDataset(\n",
-    "    C,\n",
-    "    C.text_cleaner,\n",
-    "    False,\n",
-    "    ap,\n",
-    "    meta_data,\n",
-    "    characters=C.get('characters', None),\n",
-    "    use_phonemes=C.use_phonemes,\n",
-    "    phoneme_cache_path=C.phoneme_cache_path,\n",
-    "    enable_eos_bos=C.enable_eos_bos_chars,\n",
+    "    outputs_per_step=C[\"r\"],\n",
+    "    compute_linear_spec=False,\n",
+    "    ap=ap,\n",
+    "    samples=meta_data,\n",
+    "    tokenizer=tokenizer,\n",
+    "    phoneme_cache_path=PHONEME_CACHE_PATH,\n",
     ")\n",
     "loader = DataLoader(\n",
     "    dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False\n",
-    ")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Initialize lists for storing results\n",
-    "file_idxs = []\n",
-    "metadata = []\n",
-    "losses = []\n",
-    "postnet_losses = []\n",
-    "criterion = L1LossMasked(seq_len_norm=C.seq_len_norm)\n",
-    "\n",
-    "# Create log file\n",
-    "log_file_path = os.path.join(OUT_PATH, \"log.txt\")\n",
-    "log_file = open(log_file_path, \"w\")"
+    ")"
   ]
  },
 {
@@ -160,26 +140,33 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Initialize lists for storing results\n",
+    "file_idxs = []\n",
+    "metadata = []\n",
+    "losses = []\n",
+    "postnet_losses = []\n",
+    "criterion = L1LossMasked(seq_len_norm=C.seq_len_norm)\n",
+    "\n",
     "# Start processing with a progress bar\n",
-    "with torch.no_grad():\n",
+    "log_file_path = os.path.join(OUT_PATH, \"log.txt\")\n",
+    "with torch.no_grad(), open(log_file_path, \"w\") as log_file:\n",
     "    for data in tqdm(loader, desc=\"Processing\"):\n",
     "        try:\n",
-    "            # setup input data\n",
-    "            text_input, text_lengths, _, linear_input, mel_input, mel_lengths, stop_targets, item_idx = data\n",
-    "\n",
     "            # dispatch data to GPU\n",
     "            if use_cuda:\n",
-    "                text_input = text_input.cuda()\n",
-    "                text_lengths = text_lengths.cuda()\n",
-    "                mel_input = mel_input.cuda()\n",
-    "                mel_lengths = mel_lengths.cuda()\n",
+    "                data[\"token_id\"] = data[\"token_id\"].cuda()\n",
+    "                data[\"token_id_lengths\"] = data[\"token_id_lengths\"].cuda()\n",
+    "                data[\"mel\"] = data[\"mel\"].cuda()\n",
+    "                data[\"mel_lengths\"] = data[\"mel_lengths\"].cuda()\n",
     "\n",
-    "            mask = sequence_mask(text_lengths)\n",
-    "            mel_outputs, postnet_outputs, alignments, stop_tokens = model.forward(text_input, text_lengths, mel_input)\n",
+    "            mask = sequence_mask(data[\"token_id_lengths\"])\n",
+    "            outputs = model.forward(data[\"token_id\"], data[\"token_id_lengths\"], data[\"mel\"])\n",
+    "            mel_outputs = outputs[\"decoder_outputs\"]\n",
+    "            postnet_outputs = outputs[\"model_outputs\"]\n",
     "\n",
     "            # compute loss\n",
-    "            loss = criterion(mel_outputs, mel_input, mel_lengths)\n",
-    "            loss_postnet = criterion(postnet_outputs, mel_input, mel_lengths)\n",
+    "            loss = criterion(mel_outputs, data[\"mel\"], data[\"mel_lengths\"])\n",
+    "            loss_postnet = criterion(postnet_outputs, data[\"mel\"], data[\"mel_lengths\"])\n",
     "            losses.append(loss.item())\n",
     "            postnet_losses.append(loss_postnet.item())\n",
     "\n",
@@ -193,28 +180,27 @@
     "            postnet_outputs = torch.stack(mel_specs)\n",
     "        elif C.model == \"Tacotron2\":\n",
     "            postnet_outputs = postnet_outputs.detach().cpu().numpy()\n",
-    "        alignments = alignments.detach().cpu().numpy()\n",
+    "        alignments = outputs[\"alignments\"].detach().cpu().numpy()\n",
     "\n",
     "        if not DRY_RUN:\n",
-    "            for idx in range(text_input.shape[0]):\n",
-    "                wav_file_path = item_idx[idx]\n",
+    "            for idx in range(data[\"token_id\"].shape[0]):\n",
+    "                wav_file_path = data[\"item_idxs\"][idx]\n",
     "                wav = ap.load_wav(wav_file_path)\n",
-    "                file_name, wavq_path, mel_path, wav_path = set_filename(wav_file_path, OUT_PATH)\n",
+    "                file_name, wavq_path, mel_path = set_filename(wav_file_path, OUT_PATH)\n",
    "                file_idxs.append(file_name)\n",
     "\n",
     "                # quantize and save wav\n",
-    "                if QUANTIZED_WAV:\n",
-    "                    wavq = ap.quantize(wav)\n",
+    "                if QUANTIZE_BITS > 0:\n",
+    "                    wavq = quantize(wav, QUANTIZE_BITS)\n",
     "                    np.save(wavq_path, wavq)\n",
     "\n",
     "                # save TTS mel\n",
     "                mel = postnet_outputs[idx]\n",
-    "                mel_length = mel_lengths[idx]\n",
+    "                mel_length = data[\"mel_lengths\"][idx]\n",
     "                mel = mel[:mel_length, :].T\n",
     "                np.save(mel_path, mel)\n",
     "\n",
     "                metadata.append([wav_file_path, mel_path])\n",
-    "\n",
     "        except Exception as e:\n",
     "            log_file.write(f\"Error processing data: {str(e)}\\n\")\n",
     "\n",
@@ -224,35 +210,20 @@
     "    log_file.write(f\"Mean Loss: {mean_loss}\\n\")\n",
     "    log_file.write(f\"Mean Postnet Loss: {mean_postnet_loss}\\n\")\n",
     "\n",
-    "# Close the log file\n",
-    "log_file.close()\n",
-    "\n",
     "# For wavernn\n",
     "if not DRY_RUN:\n",
     "    pickle.dump(file_idxs, open(os.path.join(OUT_PATH, \"dataset_ids.pkl\"), \"wb\"))\n",
     "\n",
     "# For pwgan\n",
     "with open(os.path.join(OUT_PATH, \"metadata.txt\"), \"w\") as f:\n",
-    "    for data in metadata:\n",
-    "        f.write(f\"{data[0]}|{data[1]+'.npy'}\\n\")\n",
+    "    for wav_file_path, mel_path in metadata:\n",
+    "        f.write(f\"{wav_file_path}|{mel_path + '.npy'}\\n\")\n",
     "\n",
     "# Print mean losses\n",
     "print(f\"Mean Loss: {mean_loss}\")\n",
     "print(f\"Mean Postnet Loss: {mean_postnet_loss}\")"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# for pwgan\n",
-    "with open(os.path.join(OUT_PATH, \"metadata.txt\"), \"w\") as f:\n",
-    "    for data in metadata:\n",
-    "        f.write(f\"{data[0]}|{data[1]+'.npy'}\\n\")"
-   ]
-  },
 {
   "cell_type": "markdown",
   "metadata": {},
@@ -267,7 +238,7 @@
    "outputs": [],
    "source": [
     "idx = 1\n",
-    "ap.melspectrogram(ap.load_wav(item_idx[idx])).shape"
+    "ap.melspectrogram(ap.load_wav(data[\"item_idxs\"][idx])).shape"
   ]
  },
 {
@@ -276,10 +247,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import soundfile as sf\n",
-    "wav, sr = sf.read(item_idx[idx])\n",
-    "mel_postnet = postnet_outputs[idx][:mel_lengths[idx], :]\n",
-    "mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n",
+    "wav, sr = sf.read(data[\"item_idxs\"][idx])\n",
+    "mel_postnet = postnet_outputs[idx][:data[\"mel_lengths\"][idx], :]\n",
+    "mel_decoder = mel_outputs[idx][:data[\"mel_lengths\"][idx], :].detach().cpu().numpy()\n",
     "mel_truth = ap.melspectrogram(wav)\n",
     "print(mel_truth.shape)"
   ]
  },
@@ -291,7 +261,7 @@
    "outputs": [],
    "source": [
     "# plot posnet output\n",
-    "print(mel_postnet[:mel_lengths[idx], :].shape)\n",
+    "print(mel_postnet[:data[\"mel_lengths\"][idx], :].shape)\n",
     "plot_spectrogram(mel_postnet, ap)"
   ]
  },
@@ -324,10 +294,9 @@
    "outputs": [],
    "source": [
     "# postnet, decoder diff\n",
-    "from matplotlib import pylab as plt\n",
     "mel_diff = mel_decoder - mel_postnet\n",
     "plt.figure(figsize=(16, 10))\n",
-    "plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n",
+    "plt.imshow(abs(mel_diff[:data[\"mel_lengths\"][idx],:]).T,aspect=\"auto\", origin=\"lower\")\n",
     "plt.colorbar()\n",
     "plt.tight_layout()"
   ]
  },
@@ -339,10 +308,9 @@
    "outputs": [],
    "source": [
     "# PLOT GT SPECTROGRAM diff\n",
-    "from matplotlib import pylab as plt\n",
     "mel_diff2 = mel_truth.T - mel_decoder\n",
     "plt.figure(figsize=(16, 10))\n",
-    "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
+    "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\")\n",
     "plt.colorbar()\n",
     "plt.tight_layout()"
   ]
  },
@@ -354,21 +322,13 @@
    "outputs": [],
    "source": [
     "# PLOT GT SPECTROGRAM diff\n",
-    "from matplotlib import pylab as plt\n",
     "mel = postnet_outputs[idx]\n",
     "mel_diff2 = mel_truth.T - mel[:mel_truth.shape[1]]\n",
     "plt.figure(figsize=(16, 10))\n",
-    "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n",
+    "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\")\n",
     "plt.colorbar()\n",
     "plt.tight_layout()"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {
diff --git a/recipes/ljspeech/xtts_v1/train_gpt_xtts.py b/recipes/ljspeech/xtts_v1/train_gpt_xtts.py
new file mode 100644
index 0000000000..7d8f4064c5
--- /dev/null
+++ b/recipes/ljspeech/xtts_v1/train_gpt_xtts.py
@@ -0,0 +1,176 @@
+import os
+
+from trainer import Trainer, TrainerArgs
+
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+from TTS.utils.manage import ModelManager
+
+# Logging parameters
+RUN_NAME = "GPT_XTTS_LJSpeech_FT"
+PROJECT_NAME = "XTTS_trainer"
+DASHBOARD_LOGGER = "tensorboard"
+LOGGER_URI = None
+
+# Set here the path where the checkpoints will be saved. Default: ./run/training/
+OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run", "training")
+
+# Training Parameters
+OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False
+START_WITH_EVAL = True # if True it will start with evaluation
+BATCH_SIZE = 3 # set here the batch size
+GRAD_ACUMM_STEPS = 84 # set here the grad accumulation steps
+# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly.
+
+# Define here the dataset that you want to use for fine-tuning.
+config_dataset = BaseDatasetConfig(
+    formatter="ljspeech",
+    dataset_name="ljspeech",
+    path="/raid/datasets/LJSpeech-1.1_24khz/",
+    meta_file_train="/raid/datasets/LJSpeech-1.1_24khz/metadata.csv",
+    language="en",
+)
+
+# Add here the configs of the datasets
+DATASETS_CONFIG_LIST = [config_dataset]
+
+# Define the path where XTTS v1.1.1 files will be downloaded
+CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v1.1_original_model_files/")
+os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
+
+
+# DVAE files
+DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/dvae.pth"
+MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/mel_stats.pth"
+
+# Set the path to the downloaded files
+DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, DVAE_CHECKPOINT_LINK.split("/")[-1])
+MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, MEL_NORM_LINK.split("/")[-1])
+
+# download DVAE files if needed
+if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
+    print(" > Downloading DVAE files!")
+    ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+
+
+# Download XTTS v1.1 checkpoint if needed
+TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json"
+XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth"
+
+# XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
+TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, TOKENIZER_FILE_LINK.split("/")[-1]) # vocab.json file
+XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, XTTS_CHECKPOINT_LINK.split("/")[-1]) # model.pth file
+
+# download XTTS v1.1 files if needed
+if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
+    print(" > Downloading XTTS v1.1 files!")
+    ModelManager._download_model_files(
+        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+    )
+
+
+# Training sentences generations
+SPEAKER_REFERENCE = [
+    "./tests/data/ljspeech/wavs/LJ001-0002.wav" # speaker reference to be used in training test sentences
+]
+LANGUAGE = config_dataset.language
+
+
+def main():
+    # init args and config
+    model_args = GPTArgs(
+        max_conditioning_length=132300, # 6 secs
+        min_conditioning_length=66150, # 3 secs
+        debug_loading_failures=False,
+        max_wav_length=255995, # ~11.6 seconds
+        max_text_length=200,
+        mel_norm_file=MEL_NORM_FILE,
+        dvae_checkpoint=DVAE_CHECKPOINT,
+        # tokenizer_file="/raid/datasets/xtts_models/vocab.json", # vocab path of the model that you want to fine-tune
+        # xtts_checkpoint="https://huggingface.co/coqui/XTTS-v1/resolve/hifigan/model.pth",
+        xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune
+        tokenizer_file=TOKENIZER_FILE,
+        gpt_num_audio_tokens=8194,
+        gpt_start_audio_token=8192,
+        gpt_stop_audio_token=8193,
+    )
+    # define audio config
+    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+    # training parameters config
+    config = GPTTrainerConfig(
+        output_path=OUT_PATH,
+        model_args=model_args,
+        run_name=RUN_NAME,
+        project_name=PROJECT_NAME,
+        run_description="""
+            GPT XTTS training
+            """,
+        dashboard_logger=DASHBOARD_LOGGER,
+        logger_uri=LOGGER_URI,
+        audio=audio_config,
+        batch_size=BATCH_SIZE,
+        batch_group_size=48,
+        eval_batch_size=BATCH_SIZE,
+        num_loader_workers=8,
+        eval_split_max_size=256,
+        print_step=50,
+        plot_step=100,
+        log_model_step=1000,
+        save_step=10000,
+        save_n_checkpoints=1,
+        save_checkpoints=True,
+        # target_loss="loss",
+        print_eval=False,
+        # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
+        optimizer="AdamW",
+        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+        lr=5e-06, # learning rate
+        lr_scheduler="MultiStepLR",
+        # it was adjusted accordingly for the new step scheme
+        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+        test_sentences=[
+            {
+                "text": "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+                "speaker_wav": SPEAKER_REFERENCE,
+                "language": LANGUAGE,
+            },
+            {
+                "text": "This cake is great. It's so delicious and moist.",
+                "speaker_wav": SPEAKER_REFERENCE,
+                "language": LANGUAGE,
+            },
+        ],
+    )
+
+    # init the model from config
+    model = GPTTrainer.init_from_config(config)
+
+    # load training samples
+    train_samples, eval_samples = load_tts_samples(
+        DATASETS_CONFIG_LIST,
+        eval_split=True,
+        eval_split_max_size=config.eval_split_max_size,
+        eval_split_size=config.eval_split_size,
+    )
+
+    # init the trainer and 🚀
+    trainer = Trainer(
+        TrainerArgs(
+            restore_path=None, # xtts checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it using the Trainer restore_path parameter
+            skip_train_epoch=False,
+            start_with_eval=START_WITH_EVAL,
+            grad_accum_steps=GRAD_ACUMM_STEPS,
+        ),
+        config,
+        output_path=OUT_PATH,
+        model=model,
+        train_samples=train_samples,
+        eval_samples=eval_samples,
+    )
+    trainer.fit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recipes/ljspeech/xtts_v2/train_gpt_xtts.py b/recipes/ljspeech/xtts_v2/train_gpt_xtts.py
new file mode 100644
index 0000000000..626917381a
--- /dev/null
+++ b/recipes/ljspeech/xtts_v2/train_gpt_xtts.py
@@ -0,0 +1,176 @@
+import os
+
+from trainer import Trainer, TrainerArgs
+
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+from TTS.utils.manage import ModelManager
+
+# Logging parameters
+RUN_NAME = "GPT_XTTS_v2.0_LJSpeech_FT"
+PROJECT_NAME = "XTTS_trainer"
+DASHBOARD_LOGGER = "tensorboard"
+LOGGER_URI = None
+
+# Set here the path where the checkpoints will be saved. Default: ./run/training/
+OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run", "training")
+
+# Training Parameters
+OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False
+START_WITH_EVAL = True # if True it will start with evaluation
+BATCH_SIZE = 3 # set here the batch size
+GRAD_ACUMM_STEPS = 84 # set here the grad accumulation steps
+# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly.
+
+# Define here the dataset that you want to use for fine-tuning.
+config_dataset = BaseDatasetConfig(
+    formatter="ljspeech",
+    dataset_name="ljspeech",
+    path="/raid/datasets/LJSpeech-1.1_24khz/",
+    meta_file_train="/raid/datasets/LJSpeech-1.1_24khz/metadata.csv",
+    language="en",
+)
+
+# Add here the configs of the datasets
+DATASETS_CONFIG_LIST = [config_dataset]
+
+# Define the path where XTTS v2.0.1 files will be downloaded
+CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
+os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
+
+
+# DVAE files
+DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
+MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
+
+# Set the path to the downloaded files
+DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
+MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
+
+# download DVAE files if needed
+if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
+    print(" > Downloading DVAE files!")
+    ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+
+
+# Download XTTS v2.0 checkpoint if needed
+TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
+XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
+
+# XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
+TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK)) # vocab.json file
+XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK)) # model.pth file
+
+# download XTTS v2.0 files if needed
+if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
+    print(" > Downloading XTTS v2.0 files!")
+    ModelManager._download_model_files(
+        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+    )
+
+
+# Training sentences generations
+SPEAKER_REFERENCE = [
+    "./tests/data/ljspeech/wavs/LJ001-0002.wav" # speaker reference to be used in training test sentences
+]
+LANGUAGE = config_dataset.language
+
+
+def main():
+    # init args and config
+    model_args = GPTArgs(
+        max_conditioning_length=132300, # 6 secs
+        min_conditioning_length=66150, # 3 secs
+        debug_loading_failures=False,
+        max_wav_length=255995, # ~11.6 seconds
+        max_text_length=200,
+        mel_norm_file=MEL_NORM_FILE,
+        dvae_checkpoint=DVAE_CHECKPOINT,
+        xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune
+        tokenizer_file=TOKENIZER_FILE,
+        gpt_num_audio_tokens=1026,
+        gpt_start_audio_token=1024,
+        gpt_stop_audio_token=1025,
+        gpt_use_masking_gt_prompt_approach=True,
+        gpt_use_perceiver_resampler=True,
+    )
+    # define audio config
+    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+    # training parameters config
+    config = GPTTrainerConfig(
+        output_path=OUT_PATH,
+        model_args=model_args,
+        run_name=RUN_NAME,
+        project_name=PROJECT_NAME,
+        run_description="""
+            GPT XTTS training
+            """,
+        dashboard_logger=DASHBOARD_LOGGER,
+        logger_uri=LOGGER_URI,
+        audio=audio_config,
+        batch_size=BATCH_SIZE,
+        batch_group_size=48,
+        eval_batch_size=BATCH_SIZE,
+        num_loader_workers=8,
+        eval_split_max_size=256,
+        print_step=50,
+        plot_step=100,
+        log_model_step=1000,
+        save_step=10000,
+        save_n_checkpoints=1,
+        save_checkpoints=True,
+        # target_loss="loss",
+        print_eval=False,
+        # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
+        optimizer="AdamW",
+        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+        lr=5e-06, # learning rate
+        lr_scheduler="MultiStepLR",
+        # it was adjusted accordingly for the new step scheme
+        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+        test_sentences=[
+            {
+                "text": "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+                "speaker_wav": SPEAKER_REFERENCE,
+                "language": LANGUAGE,
+            },
+            {
+                "text": "This cake is great. It's so delicious and moist.",
+                "speaker_wav": SPEAKER_REFERENCE,
+                "language": LANGUAGE,
+            },
+        ],
+    )
+
+    # init the model from config
+    model = GPTTrainer.init_from_config(config)
+
+    # load training samples
+    train_samples, eval_samples = load_tts_samples(
+        DATASETS_CONFIG_LIST,
+        eval_split=True,
+        eval_split_max_size=config.eval_split_max_size,
+        eval_split_size=config.eval_split_size,
+    )
+
+    # init the trainer and 🚀
+    trainer = Trainer(
+        TrainerArgs(
+            restore_path=None, # xtts checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it using the Trainer restore_path parameter
+            skip_train_epoch=False,
+            start_with_eval=START_WITH_EVAL,
+            grad_accum_steps=GRAD_ACUMM_STEPS,
+        ),
+        config,
+        output_path=OUT_PATH,
+        model=model,
+        train_samples=train_samples,
+        eval_samples=eval_samples,
+    )
+    trainer.fit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
index a57fed850d..448b6d80fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,39 +1,41 @@
 # core deps
 numpy==1.22.0;python_version<="3.10"
-numpy==1.24.3;python_version>"3.10"
-cython==0.29.30
+numpy>=1.24.3;python_version>"3.10"
+cython>=0.29.30
 scipy>=1.11.2
-torch>=1.7
+torch>=2.1
 torchaudio
-soundfile==0.12.*
-librosa==0.10.*
-scikit-learn==1.3.0
+soundfile>=0.12.0
+librosa>=0.10.0
+scikit-learn>=1.3.0
 numba==0.55.1;python_version<"3.9"
-numba==0.57.0;python_version>="3.9"
-inflect==5.6.*
-tqdm==4.64.*
-anyascii==0.3.*
-pyyaml==6.*
-fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail
-aiohttp==3.8.*
-packaging==23.1
+numba>=0.57.0;python_version>="3.9"
+inflect>=5.6.0
+tqdm>=4.64.1
+anyascii>=0.3.0
+pyyaml>=6.0
+fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
+aiohttp>=3.8.1
+packaging>=23.1
 mutagen==1.47.0
 # deps for examples
-flask==2.*
+flask>=2.0.1
 # deps for inference
-pysbd==0.3.4
+pysbd>=0.3.4
 # deps for notebooks
-umap-learn==0.5.*
+umap-learn>=0.5.1
 pandas>=1.4,<2.0
 # deps for training
-matplotlib==3.7.*
+matplotlib>=3.7.0
 # coqui stack
-trainer
+trainer>=0.0.32
 # config management
 coqpit>=0.0.16
 # chinese g2p deps
 jieba
 pypinyin
+# korean
+hangul_romanize
 # gruut+supported langs
 gruut[de,es,fr]==2.2.3
 # deps for korean
@@ -45,10 +47,11 @@ bangla
 bnnumerizer
 bnunicodenormalizer
 #deps for tortoise
-k_diffusion
-einops==0.6.*
-transformers==4.33.*
+einops>=0.6.0
+transformers>=4.33.0
 #deps for bark
-encodec==0.1.*
+encodec>=0.1.1
 # deps for XTTS
-unidecode==1.3.*
+unidecode>=1.3.2
+num2words
+spacy[ja]>=3
\ No newline at end of file
diff --git a/tests/api_tests/test_synthesize_api.py b/tests/api_tests/test_synthesize_api.py
index 084f81d489..e7b4f12048 100644
--- a/tests/api_tests/test_synthesize_api.py
+++ b/tests/api_tests/test_synthesize_api.py
@@ -22,7 +22,4 @@ def test_synthesize():
     )
 
     # test 
pipe_out command - run_cli( - 'tts --text "test." --pipe_out ' - f'--out_path "{output_path}" | aplay' - ) + run_cli(f'tts --text "test." --pipe_out --out_path "{output_path}" | aplay') diff --git a/tests/aux_tests/test_embedding_manager.py b/tests/aux_tests/test_embedding_manager.py index 7392150163..e3acd62bee 100644 --- a/tests/aux_tests/test_embedding_manager.py +++ b/tests/aux_tests/test_embedding_manager.py @@ -3,11 +3,11 @@ import numpy as np import torch +from trainer.io import save_checkpoint from tests import get_tests_input_path from TTS.config import load_config from TTS.encoder.utils.generic_utils import setup_encoder_model -from TTS.encoder.utils.io import save_checkpoint from TTS.tts.utils.managers import EmbeddingManager from TTS.utils.audio import AudioProcessor @@ -31,7 +31,7 @@ def test_speaker_embedding(): # create a dummy speaker encoder model = setup_encoder_model(config) - save_checkpoint(model, None, None, get_tests_input_path(), 0) + save_checkpoint(config, model, None, None, 0, 0, get_tests_input_path()) # load audio processor and speaker encoder manager = EmbeddingManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path) diff --git a/tests/aux_tests/test_speaker_manager.py b/tests/aux_tests/test_speaker_manager.py index 397f9c81f6..402fbca459 100644 --- a/tests/aux_tests/test_speaker_manager.py +++ b/tests/aux_tests/test_speaker_manager.py @@ -3,11 +3,11 @@ import numpy as np import torch +from trainer.io import save_checkpoint from tests import get_tests_input_path from TTS.config import load_config from TTS.encoder.utils.generic_utils import setup_encoder_model -from TTS.encoder.utils.io import save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor @@ -30,7 +30,7 @@ def test_speaker_embedding(): # create a dummy speaker encoder model = setup_encoder_model(config) - save_checkpoint(model, None, None, get_tests_input_path(), 0) + save_checkpoint(config, model, None, None, 0, 0, get_tests_input_path()) # load audio processor and speaker encoder ap = AudioProcessor(**config.audio) diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index 40e830178c..ce4fc751c2 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -1,10 +1,11 @@ import os import unittest +from trainer.io import save_checkpoint + from tests import get_tests_input_path from TTS.config import load_config from TTS.tts.models import setup_model -from TTS.utils.io import save_checkpoint from TTS.utils.synthesizer import Synthesizer diff --git a/tests/inputs/xtts_vocab.json b/tests/inputs/xtts_vocab.json new file mode 100644 index 0000000000..a3c6dcec77 --- /dev/null +++ b/tests/inputs/xtts_vocab.json @@ -0,0 +1,12669 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "special": true, + "content": "[STOP]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 1, + "special": true, + "content": "[UNK]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 2, + "special": true, + "content": "[SPACE]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 259, + "special": true, + "content": "[en]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 260, + "special": true, + "content": 
"[de]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 261, + "special": true, + "content": "[START]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 262, + "special": true, + "content": "[fr]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 284, + "special": true, + "content": "[es]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 285, + "special": true, + "content": "[it]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 286, + "special": true, + "content": "[pt]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 294, + "special": true, + "content": "[pl]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 295, + "special": true, + "content": "[tr]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 267, + "special": true, + "content": "[ru]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 293, + "special": true, + "content": "[cs]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 297, + "special": true, + "content": "[nl]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 5022, + "special": true, + "content": "[ar]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 5023, + "special": true, + "content": "[zh-cn]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 5412, + "special": true, + "content": "[ja]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": null, + "decoder": null, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "[UNK]", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "vocab": { + "[STOP]": 0, + "[UNK]": 1, + "[SPACE]": 2, + "!": 3, + "'": 4, + "(": 5, + ")": 6, + ",": 7, + "-": 8, + ".": 9, + "/": 10, + ":": 11, + ";": 12, + "?": 13, + "a": 14, + "b": 15, + "c": 16, + "d": 17, + "e": 18, + "f": 19, + "g": 20, + "h": 21, + "i": 22, + "j": 23, + "k": 24, + "l": 25, + "m": 26, + "n": 27, + "o": 28, + "p": 29, + "q": 30, + "r": 31, + "s": 32, + "t": 33, + "u": 34, + "v": 35, + "w": 36, + "x": 37, + "y": 38, + "z": 39, + "th": 40, + "in": 41, + "the": 42, + "an": 43, + "er": 44, + "ou": 45, + "re": 46, + "on": 47, + "at": 48, + "ed": 49, + "en": 50, + "to": 51, + "ing": 52, + "and": 53, + "is": 54, + "as": 55, + "al": 56, + "or": 57, + "of": 58, + "ar": 59, + "it": 60, + "es": 61, + "he": 62, + "st": 63, + "le": 64, + "om": 65, + "se": 66, + "be": 67, + "ad": 68, + "ow": 69, + "ly": 70, + "ch": 71, + "wh": 72, + "that": 73, + "you": 74, + "li": 75, + "ve": 76, + "ac": 77, + "ti": 78, + "ld": 79, + "me": 80, + "was": 81, + "gh": 82, + "id": 83, + "ll": 84, + "wi": 85, + "ent": 86, + "for": 87, + "ay": 88, + "ro": 89, + "ver": 90, + "ic": 91, + "her": 92, + "ke": 93, + "his": 94, + "no": 95, + "ut": 96, + "un": 97, + "ir": 98, + "lo": 99, + "we": 100, + "ri": 101, + "ha": 102, + "with": 103, + "ght": 104, + "out": 
105, + "im": 106, + "ion": 107, + "all": 108, + "ab": 109, + "one": 110, + "ne": 111, + "ge": 112, + "ould": 113, + "ter": 114, + "mo": 115, + "had": 116, + "ce": 117, + "she": 118, + "go": 119, + "sh": 120, + "ur": 121, + "am": 122, + "so": 123, + "pe": 124, + "my": 125, + "de": 126, + "are": 127, + "but": 128, + "ome": 129, + "fr": 130, + "ther": 131, + "fe": 132, + "su": 133, + "do": 134, + "con": 135, + "te": 136, + "ain": 137, + "ere": 138, + "po": 139, + "if": 140, + "they": 141, + "us": 142, + "ag": 143, + "tr": 144, + "now": 145, + "oun": 146, + "this": 147, + "have": 148, + "not": 149, + "sa": 150, + "il": 151, + "up": 152, + "thing": 153, + "from": 154, + "ap": 155, + "him": 156, + "ack": 157, + "ation": 158, + "ant": 159, + "our": 160, + "op": 161, + "like": 162, + "ust": 163, + "ess": 164, + "bo": 165, + "ok": 166, + "ul": 167, + "ind": 168, + "ex": 169, + "com": 170, + "some": 171, + "there": 172, + "ers": 173, + "co": 174, + "res": 175, + "man": 176, + "ard": 177, + "pl": 178, + "wor": 179, + "way": 180, + "tion": 181, + "fo": 182, + "ca": 183, + "were": 184, + "by": 185, + "ate": 186, + "pro": 187, + "ted": 188, + "ound": 189, + "own": 190, + "would": 191, + "ts": 192, + "what": 193, + "qu": 194, + "ally": 195, + "ight": 196, + "ck": 197, + "gr": 198, + "when": 199, + "ven": 200, + "can": 201, + "ough": 202, + "ine": 203, + "end": 204, + "per": 205, + "ous": 206, + "od": 207, + "ide": 208, + "know": 209, + "ty": 210, + "very": 211, + "si": 212, + "ak": 213, + "who": 214, + "about": 215, + "ill": 216, + "them": 217, + "est": 218, + "red": 219, + "ye": 220, + "could": 221, + "ong": 222, + "your": 223, + "their": 224, + "em": 225, + "just": 226, + "other": 227, + "into": 228, + "any": 229, + "whi": 230, + "um": 231, + "tw": 232, + "ast": 233, + "der": 234, + "did": 235, + "ie": 236, + "been": 237, + "ace": 238, + "ink": 239, + "ity": 240, + "back": 241, + "ting": 242, + "br": 243, + "more": 244, + "ake": 245, + "pp": 246, + "then": 247, + "sp": 248, + "el": 249, + "use": 250, + "bl": 251, + "said": 252, + "over": 253, + "get": 254, + "ß": 255, + "ä": 256, + "ö": 257, + "ü": 258, + "[en]": 259, + "[de]": 260, + "[START]": 261, + "[fr]": 262, + "œ": 263, + "ï": 264, + "ê": 265, + "â": 266, + "[ru]": 267, + "ÿ": 268, + "è": 269, + "à": 270, + "ë": 271, + "ù": 272, + "î": 273, + "ç": 274, + "æ": 275, + "ô": 276, + "û": 277, + "á": 278, + "é": 279, + "í": 280, + "ó": 281, + "ú": 282, + "ñ": 283, + "[es]": 284, + "[it]": 285, + "[pt]": 286, + "ń": 287, + "ś": 288, + "ę": 289, + "ą": 290, + "ż": 291, + "ć": 292, + "[cs]": 293, + "[pl]": 294, + "[tr]": 295, + "ã": 296, + "[nl]": 297, + "ş": 298, + "ğ": 299, + "ı": 300, + "ò": 301, + "ì": 302, + "¿": 303, + "…": 304, + "i̇": 305, + "õ": 306, + "\"": 307, + "´": 308, + "ø": 309, + "č": 310, + "ō": 311, + "š": 312, + "ž": 313, + "̇": 314, + "ei": 315, + "ich": 316, + "ein": 317, + "au": 318, + "sch": 319, + "und": 320, + "die": 321, + "da": 322, + "den": 323, + "gen": 324, + "zu": 325, + "hr": 326, + "ten": 327, + "mi": 328, + "sie": 329, + "das": 330, + "eine": 331, + "icht": 332, + "ber": 333, + "ach": 334, + "auf": 335, + "lich": 336, + "nicht": 337, + "mm": 338, + "ben": 339, + "war": 340, + "mit": 341, + "sich": 342, + "ig": 343, + "aus": 344, + "ist": 345, + "wie": 346, + "och": 347, + "ung": 348, + "ann": 349, + "ür": 350, + "hn": 351, + "ihr": 352, + "sen": 353, + "tz": 354, + "dem": 355, + "eit": 356, + "hat": 357, + "wir": 358, + "von": 359, + "wei": 360, + "ier": 361, + "ra": 362, + "einen": 363, + "vor": 364, + "als": 365, + 
"wo": 366, + "rei": 367, + "ste": 368, + "lie": 369, + "auch": 370, + "du": 371, + "des": 372, + "ko": 373, + "über": 374, + "bei": 375, + "hen": 376, + "hm": 377, + "lei": 378, + "aber": 379, + "wen": 380, + "hl": 381, + "ger": 382, + "nach": 383, + "ft": 384, + "imm": 385, + "je": 386, + "schen": 387, + "wer": 388, + "ser": 389, + "än": 390, + "sein": 391, + "ol": 392, + "cht": 393, + "für": 394, + "kl": 395, + "ff": 396, + "einem": 397, + "nen": 398, + "ja": 399, + "noch": 400, + "hatte": 401, + "pf": 402, + "hin": 403, + "di": 404, + "chen": 405, + "rü": 406, + "iel": 407, + "sel": 408, + "dass": 409, + "ihn": 410, + "mir": 411, + "schl": 412, + "ön": 413, + "gan": 414, + "gt": 415, + "einer": 416, + "sten": 417, + "mich": 418, + "wenn": 419, + "ell": 420, + "gte": 421, + "mal": 422, + "gel": 423, + "ken": 424, + "nur": 425, + "mmen": 426, + "fü": 427, + "ern": 428, + "ör": 429, + "unter": 430, + "ander": 431, + "dur": 432, + "uch": 433, + "ta": 434, + "men": 435, + "mach": 436, + "doch": 437, + "durch": 438, + "os": 439, + "gl": 440, + "hal": 441, + "ihre": 442, + "wä": 443, + "immer": 444, + "ihm": 445, + "kann": 446, + "ort": 447, + "dann": 448, + "lan": 449, + "tzt": 450, + "oder": 451, + "hren": 452, + "et": 453, + "kön": 454, + "ick": 455, + "fa": 456, + "wieder": 457, + "daß": 458, + "mein": 459, + "fen": 460, + "ganz": 461, + "diese": 462, + "ster": 463, + "dar": 464, + "wa": 465, + "ges": 466, + "na": 467, + "fl": 468, + "igen": 469, + "sche": 470, + "ungen": 471, + "mehr": 472, + "ßen": 473, + "ot": 474, + "kon": 475, + "gew": 476, + "haben": 477, + "geh": 478, + "ät": 479, + "sind": 480, + "dr": 481, + "wel": 482, + "uns": 483, + "vo": 484, + "ma": 485, + "ute": 486, + "schon": 487, + "bes": 488, + "gesch": 489, + "bt": 490, + "che": 491, + "son": 492, + "ob": 493, + "la": 494, + "rück": 495, + "seine": 496, + "kr": 497, + "fre": 498, + "eil": 499, + "zum": 500, + "hier": 501, + "kt": 502, + "ige": 503, + "spr": 504, + "leben": 505, + "bst": 506, + "zeit": 507, + "gro": 508, + "denn": 509, + "ho": 510, + "scha": 511, + "bar": 512, + "alle": 513, + "gegen": 514, + "wür": 515, + "mü": 516, + "ze": 517, + "werden": 518, + "jetzt": 519, + "kommen": 520, + "nie": 521, + "sei": 522, + "heit": 523, + "soll": 524, + "glei": 525, + "meine": 526, + "woll": 527, + "ner": 528, + "habe": 529, + "wur": 530, + "lichen": 531, + "assen": 532, + "nte": 533, + "sehen": 534, + "wird": 535, + "bis": 536, + "gar": 537, + "ien": 538, + "mus": 539, + "uß": 540, + "är": 541, + "stell": 542, + "keit": 543, + "zwei": 544, + "selbst": 545, + "sta": 546, + "pa": 547, + "sagte": 548, + "tet": 549, + "kam": 550, + "ssen": 551, + "viel": 552, + "ug": 553, + "zen": 554, + "hei": 555, + "mann": 556, + "will": 557, + "geb": 558, + "waren": 559, + "ück": 560, + "äch": 561, + "mer": 562, + "ru": 563, + "hau": 564, + "eigen": 565, + "ang": 566, + "weg": 567, + "blick": 568, + "fra": 569, + "alles": 570, + "ka": 571, + "augen": 572, + "fin": 573, + "liche": 574, + "unser": 575, + "dern": 576, + "herr": 577, + "nun": 578, + "vie": 579, + "chte": 580, + "wohl": 581, + "fall": 582, + "ht": 583, + "ün": 584, + "etwas": 585, + "stand": 586, + "äu": 587, + "mö": 588, + "tel": 589, + "rie": 590, + "dich": 591, + "dies": 592, + "hand": 593, + "bin": 594, + "ffen": 595, + "nichts": 596, + "dan": 597, + "hne": 598, + "ihnen": 599, + "esen": 600, + "dieser": 601, + "frau": 602, + "art": 603, + "dir": 604, + "isch": 605, + "erst": 606, + "gleich": 607, + "komm": 608, + "hör": 609, + "ße": 610, + "dig": 611, + "sehr": 612, + 
"zei": 613, + "sam": 614, + "aum": 615, + "hät": 616, + "ingen": 617, + "gut": 618, + "mut": 619, + "cken": 620, + "konnte": 621, + "stimm": 622, + "zur": 623, + "itz": 624, + "weil": 625, + "würde": 626, + "fä": 627, + "können": 628, + "keine": 629, + "fer": 630, + "ischen": 631, + "voll": 632, + "eines": 633, + "setz": 634, + "zie": 635, + "del": 636, + "tete": 637, + "seiner": 638, + "ieren": 639, + "gest": 640, + "zurück": 641, + "wurde": 642, + "schn": 643, + "pr": 644, + "ließ": 645, + "tra": 646, + "mä": 647, + "gend": 648, + "fol": 649, + "ik": 650, + "schla": 651, + "schaft": 652, + "ater": 653, + "weiß": 654, + "seinen": 655, + "lassen": 656, + "lu": 657, + "unden": 658, + "teil": 659, + "neu": 660, + "iert": 661, + "menschen": 662, + "hmen": 663, + "str": 664, + "gi": 665, + "sah": 666, + "ihren": 667, + "eln": 668, + "weiter": 669, + "gehen": 670, + "iger": 671, + "macht": 672, + "tag": 673, + "also": 674, + "halten": 675, + "nis": 676, + "acht": 677, + "geben": 678, + "og": 679, + "nat": 680, + "mar": 681, + "det": 682, + "ohne": 683, + "haus": 684, + "tro": 685, + "ange": 686, + "lau": 687, + "spiel": 688, + "tre": 689, + "schr": 690, + "inn": 691, + "los": 692, + "machen": 693, + "hätte": 694, + "beg": 695, + "wirk": 696, + "alt": 697, + "glich": 698, + "tes": 699, + "richt": 700, + "freund": 701, + "ihrer": 702, + "fel": 703, + "bel": 704, + "sol": 705, + "einmal": 706, + "eben": 707, + "hol": 708, + "hän": 709, + "tern": 710, + "hö": 711, + "schw": 712, + "recht": 713, + "wahr": 714, + "seinem": 715, + "stehen": 716, + "hlen": 717, + "ins": 718, + "ging": 719, + "wollte": 720, + "wissen": 721, + "ungs": 722, + "ald": 723, + "ass": 724, + "jahr": 725, + "mor": 726, + "welt": 727, + "under": 728, + "zusa": 729, + "kopf": 730, + "lang": 731, + "hinter": 732, + "atz": 733, + "stra": 734, + "angen": 735, + "ank": 736, + "ade": 737, + "glau": 738, + "fach": 739, + "hatten": 740, + "fort": 741, + "eicht": 742, + "iff": 743, + "ler": 744, + "mei": 745, + "diesem": 746, + "kein": 747, + "frei": 748, + "führ": 749, + "vom": 750, + "β": 751, + "ai": 752, + "ait": 753, + "que": 754, + "les": 755, + "av": 756, + "ais": 757, + "oi": 758, + "eu": 759, + "lle": 760, + "par": 761, + "ans": 762, + "ment": 763, + "ét": 764, + "une": 765, + "pas": 766, + "qui": 767, + "elle": 768, + "dé": 769, + "pour": 770, + "dans": 771, + "ré": 772, + "tou": 773, + "vous": 774, + "vi": 775, + "ouv": 776, + "mon": 777, + "sur": 778, + "ci": 779, + "plu": 780, + "ère": 781, + "mais": 782, + "ois": 783, + "plus": 784, + "ée": 785, + "aient": 786, + "mp": 787, + "lui": 788, + "ave": 789, + "était": 790, + "ses": 791, + "tout": 792, + "oir": 793, + "avait": 794, + "és": 795, + "mes": 796, + "nous": 797, + "eux": 798, + "bi": 799, + "ons": 800, + "pu": 801, + "ces": 802, + "tu": 803, + "leur": 804, + "don": 805, + "eur": 806, + "ette": 807, + "aire": 808, + "avec": 809, + "dit": 810, + "té": 811, + "ille": 812, + "comme": 813, + "cr": 814, + "ux": 815, + "ès": 816, + "aux": 817, + "jour": 818, + "ils": 819, + "bien": 820, + "cou": 821, + "quel": 822, + "peu": 823, + "cette": 824, + "cu": 825, + "mê": 826, + "fait": 827, + "gu": 828, + "être": 829, + "ité": 830, + "ens": 831, + "ni": 832, + "lé": 833, + "dis": 834, + "ble": 835, + "né": 836, + "puis": 837, + "même": 838, + "ques": 839, + "fi": 840, + "age": 841, + "moi": 842, + "ence": 843, + "ont": 844, + "main": 845, + "ors": 846, + "aut": 847, + "ance": 848, + "mé": 849, + "sans": 850, + "sé": 851, + "lon": 852, + "hom": 853, + "car": 854, + "able": 855, + 
"cher": 856, + "deux": 857, + "enf": 858, + "où": 859, + "ph": 860, + "ure": 861, + "temp": 862, + "pos": 863, + "rent": 864, + "pé": 865, + "faire": 866, + "pi": 867, + "tres": 868, + "ça": 869, + "endre": 870, + "bon": 871, + "sou": 872, + "int": 873, + "pré": 874, + "sent": 875, + "tant": 876, + "cer": 877, + "là": 878, + "lais": 879, + "près": 880, + "bre": 881, + "cour": 882, + "pet": 883, + "comp": 884, + "lait": 885, + "trouv": 886, + "entre": 887, + "sont": 888, + "dev": 889, + "nu": 890, + "temps": 891, + "dou": 892, + "rait": 893, + "bou": 894, + "quand": 895, + "jours": 896, + "avoir": 897, + "été": 898, + "ale": 899, + "pre": 900, + "fois": 901, + "orte": 902, + "vé": 903, + "non": 904, + "tous": 905, + "jus": 906, + "coup": 907, + "homme": 908, + "ête": 909, + "aussi": 910, + "urs": 911, + "seu": 912, + "ord": 913, + "min": 914, + "gé": 915, + "core": 916, + "va": 917, + "vre": 918, + "encore": 919, + "sem": 920, + "ite": 921, + "autre": 922, + "pris": 923, + "peut": 924, + "ue": 925, + "ante": 926, + "gn": 927, + "rép": 928, + "hu": 929, + "sion": 930, + "votre": 931, + "dire": 932, + "ez": 933, + "fem": 934, + "leurs": 935, + "met": 936, + "cri": 937, + "mis": 938, + "tour": 939, + "rai": 940, + "jam": 941, + "regar": 942, + "rien": 943, + "vers": 944, + "suis": 945, + "pouv": 946, + "vis": 947, + "grand": 948, + "ants": 949, + "cor": 950, + "rer": 951, + "cé": 952, + "tent": 953, + "pres": 954, + "vou": 955, + "alors": 956, + "sieur": 957, + "aine": 958, + "quoi": 959, + "fon": 960, + "endant": 961, + "arri": 962, + "eure": 963, + "après": 964, + "donc": 965, + "itu": 966, + "lè": 967, + "sait": 968, + "toi": 969, + "cha": 970, + "ail": 971, + "asse": 972, + "imp": 973, + "voy": 974, + "conn": 975, + "pla": 976, + "petit": 977, + "avant": 978, + "nom": 979, + "tin": 980, + "dont": 981, + "sous": 982, + "emp": 983, + "person": 984, + "elles": 985, + "beau": 986, + "parti": 987, + "cho": 988, + "prit": 989, + "toujours": 990, + "rais": 991, + "jamais": 992, + "trav": 993, + "tions": 994, + "très": 995, + "voi": 996, + "ren": 997, + "yeux": 998, + "voir": 999, + "premi": 1000, + "gne": 1001, + "heure": 1002, + "rou": 1003, + "eff": 1004, + "notre": 1005, + "ments": 1006, + "ton": 1007, + "fais": 1008, + "cela": 1009, + "répon": 1010, + "cons": 1011, + "air": 1012, + "ôt": 1013, + "pendant": 1014, + "ici": 1015, + "toute": 1016, + "jet": 1017, + "port": 1018, + "étaient": 1019, + "pen": 1020, + "hé": 1021, + "autres": 1022, + "père": 1023, + "oc": 1024, + "quelques": 1025, + "ique": 1026, + "lis": 1027, + "femme": 1028, + "jou": 1029, + "teur": 1030, + "monde": 1031, + "nes": 1032, + "dre": 1033, + "aff": 1034, + "rap": 1035, + "part": 1036, + "lement": 1037, + "cla": 1038, + "fut": 1039, + "quelque": 1040, + "prendre": 1041, + "rê": 1042, + "aille": 1043, + "sais": 1044, + "ches": 1045, + "let": 1046, + "char": 1047, + "ères": 1048, + "ents": 1049, + "moins": 1050, + "eau": 1051, + "aî": 1052, + "jeu": 1053, + "heur": 1054, + "ées": 1055, + "tri": 1056, + "point": 1057, + "mom": 1058, + "vent": 1059, + "nouv": 1060, + "gran": 1061, + "trois": 1062, + "sant": 1063, + "toutes": 1064, + "contre": 1065, + "èrent": 1066, + "chez": 1067, + "avez": 1068, + "ût": 1069, + "att": 1070, + "pau": 1071, + "porte": 1072, + "ouver": 1073, + "lit": 1074, + "prés": 1075, + "chose": 1076, + "vit": 1077, + "monsieur": 1078, + "hab": 1079, + "tête": 1080, + "ju": 1081, + "tement": 1082, + "ction": 1083, + "vrai": 1084, + "lar": 1085, + "cet": 1086, + "regard": 1087, + "lant": 1088, + "som": 1089, 
+ "moment": 1090, + "illes": 1091, + "ple": 1092, + "ps": 1093, + "mère": 1094, + "cl": 1095, + "sour": 1096, + "ys": 1097, + "trop": 1098, + "enne": 1099, + "jusqu": 1100, + "avaient": 1101, + "avais": 1102, + "jeune": 1103, + "depuis": 1104, + "personne": 1105, + "fit": 1106, + "cert": 1107, + "jo": 1108, + "oui": 1109, + "rest": 1110, + "semb": 1111, + "cap": 1112, + "mat": 1113, + "mu": 1114, + "long": 1115, + "fran": 1116, + "faut": 1117, + "iti": 1118, + "bli": 1119, + "chev": 1120, + "pri": 1121, + "ente": 1122, + "ainsi": 1123, + "cham": 1124, + "lors": 1125, + "cas": 1126, + "ili": 1127, + "bé": 1128, + "nos": 1129, + "sui": 1130, + "rit": 1131, + "cro": 1132, + "gue": 1133, + "ía": 1134, + "por": 1135, + "las": 1136, + "ón": 1137, + "una": 1138, + "aba": 1139, + "dos": 1140, + "era": 1141, + "mb": 1142, + "para": 1143, + "ás": 1144, + "mos": 1145, + "ando": 1146, + "como": 1147, + "más": 1148, + "ción": 1149, + "tan": 1150, + "dad": 1151, + "ado": 1152, + "fu": 1153, + "cia": 1154, + "mente": 1155, + "sus": 1156, + "tar": 1157, + "za": 1158, + "ba": 1159, + "pero": 1160, + "sin": 1161, + "lla": 1162, + "án": 1163, + "ia": 1164, + "ran": 1165, + "ga": 1166, + "yo": 1167, + "tos": 1168, + "cos": 1169, + "ya": 1170, + "ones": 1171, + "había": 1172, + "hi": 1173, + "esta": 1174, + "mas": 1175, + "tor": 1176, + "aban": 1177, + "dor": 1178, + "ían": 1179, + "tas": 1180, + "én": 1181, + "endo": 1182, + "aque": 1183, + "ero": 1184, + "io": 1185, + "qué": 1186, + "cab": 1187, + "tal": 1188, + "señ": 1189, + "ora": 1190, + "todo": 1191, + "sal": 1192, + "cuando": 1193, + "gun": 1194, + "bu": 1195, + "ras": 1196, + "esto": 1197, + "pare": 1198, + "él": 1199, + "tras": 1200, + "jos": 1201, + "mien": 1202, + "pue": 1203, + "cre": 1204, + "pon": 1205, + "día": 1206, + "tros": 1207, + "sab": 1208, + "sobre": 1209, + "ese": 1210, + "mbre": 1211, + "eron": 1212, + "añ": 1213, + "ido": 1214, + "porque": 1215, + "ella": 1216, + "cen": 1217, + "muy": 1218, + "cal": 1219, + "este": 1220, + "has": 1221, + "có": 1222, + "gra": 1223, + "ros": 1224, + "aquel": 1225, + "dijo": 1226, + "cía": 1227, + "zo": 1228, + "ciones": 1229, + "mbi": 1230, + "elo": 1231, + "tó": 1232, + "ina": 1233, + "todos": 1234, + "tien": 1235, + "estaba": 1236, + "deci": 1237, + "cio": 1238, + "ño": 1239, + "lor": 1240, + "nues": 1241, + "medi": 1242, + "len": 1243, + "vida": 1244, + "ali": 1245, + "pues": 1246, + "ales": 1247, + "vol": 1248, + "mí": 1249, + "rar": 1250, + "cion": 1251, + "hasta": 1252, + "señor": 1253, + "cono": 1254, + "ah": 1255, + "dios": 1256, + "esa": 1257, + "ún": 1258, + "var": 1259, + "san": 1260, + "gui": 1261, + "otros": 1262, + "tado": 1263, + "buen": 1264, + "ña": 1265, + "tiemp": 1266, + "hacer": 1267, + "jer": 1268, + "vu": 1269, + "ana": 1270, + "así": 1271, + "antes": 1272, + "vez": 1273, + "miento": 1274, + "jar": 1275, + "lab": 1276, + "casa": 1277, + "eso": 1278, + "ego": 1279, + "dió": 1280, + "está": 1281, + "encia": 1282, + "eli": 1283, + "ías": 1284, + "tiempo": 1285, + "zar": 1286, + "van": 1287, + "mun": 1288, + "erta": 1289, + "tambi": 1290, + "sí": 1291, + "aun": 1292, + "mismo": 1293, + "entes": 1294, + "mano": 1295, + "ele": 1296, + "nada": 1297, + "segu": 1298, + "mej": 1299, + "erra": 1300, + "tir": 1301, + "uno": 1302, + "donde": 1303, + "toda": 1304, + "desde": 1305, + "también": 1306, + "cuer": 1307, + "hombre": 1308, + "otro": 1309, + "lib": 1310, + "trar": 1311, + "cual": 1312, + "hay": 1313, + "cada": 1314, + "taba": 1315, + "mento": 1316, + "tenía": 1317, + "quer": 1318, + 
"eran": 1319, + "siemp": 1320, + "siempre": 1321, + "erto": 1322, + "quí": 1323, + "gos": 1324, + "pués": 1325, + "ellos": 1326, + "después": 1327, + "nue": 1328, + "llo": 1329, + "inter": 1330, + "cómo": 1331, + "ahora": 1332, + "uste": 1333, + "traba": 1334, + "lado": 1335, + "ino": 1336, + "poco": 1337, + "erte": 1338, + "mujer": 1339, + "quier": 1340, + "algun": 1341, + "fue": 1342, + "ojos": 1343, + "enton": 1344, + "vos": 1345, + "esper": 1346, + "much": 1347, + "otra": 1348, + "az": 1349, + "eza": 1350, + "aquí": 1351, + "cias": 1352, + "gua": 1353, + "mucho": 1354, + "decir": 1355, + "esti": 1356, + "idad": 1357, + "algo": 1358, + "ocu": 1359, + "entonces": 1360, + "dido": 1361, + "entos": 1362, + "gri": 1363, + "dado": 1364, + "ios": 1365, + "dose": 1366, + "usted": 1367, + "quien": 1368, + "ami": 1369, + "unto": 1370, + "mejor": 1371, + "bas": 1372, + "solo": 1373, + "pregun": 1374, + "tur": 1375, + "alg": 1376, + "todas": 1377, + "parte": 1378, + "emb": 1379, + "cto": 1380, + "mundo": 1381, + "tiene": 1382, + "tante": 1383, + "palab": 1384, + "tran": 1385, + "aquella": 1386, + "cios": 1387, + "aunque": 1388, + "cuen": 1389, + "tener": 1390, + "fun": 1391, + "respon": 1392, + "allí": 1393, + "xi": 1394, + "han": 1395, + "pens": 1396, + "contra": 1397, + "tura": 1398, + "val": 1399, + "dio": 1400, + "tanto": 1401, + "camin": 1402, + "mó": 1403, + "esp": 1404, + "ada": 1405, + "ío": 1406, + "hacia": 1407, + "dej": 1408, + "estar": 1409, + "ión": 1410, + "gas": 1411, + "vas": 1412, + "noche": 1413, + "ér": 1414, + "años": 1415, + "padre": 1416, + "gus": 1417, + "ár": 1418, + "sino": 1419, + "manos": 1420, + "cido": 1421, + "estu": 1422, + "hubi": 1423, + "vir": 1424, + "bri": 1425, + "raz": 1426, + "chi": 1427, + "puede": 1428, + "menos": 1429, + "habi": 1430, + "homb": 1431, + "neces": 1432, + "may": 1433, + "eros": 1434, + "ría": 1435, + "hecho": 1436, + "escu": 1437, + "lti": 1438, + "ándo": 1439, + "bus": 1440, + "cosas": 1441, + "tú": 1442, + "espa": 1443, + "reci": 1444, + "ctor": 1445, + "prim": 1446, + "dia": 1447, + "dese": 1448, + "mientras": 1449, + "hor": 1450, + "fuer": 1451, + "ida": 1452, + "posi": 1453, + "lante": 1454, + "ano": 1455, + "estas": 1456, + "pli": 1457, + "luego": 1458, + "sión": 1459, + "cin": 1460, + "tierra": 1461, + "guar": 1462, + "cado": 1463, + "encon": 1464, + "pren": 1465, + "mayor": 1466, + "fal": 1467, + "ð": 1468, + "ħ": 1469, + "ň": 1470, + "ə": 1471, + "θ": 1472, + "’": 1473, + "“": 1474, + "”": 1475, + "zi": 1476, + "gli": 1477, + "tto": 1478, + "ono": 1479, + "nel": 1480, + "tti": 1481, + "della": 1482, + "zione": 1483, + "tta": 1484, + "tà": 1485, + "uo": 1486, + "come": 1487, + "alla": 1488, + "oni": 1489, + "ggi": 1490, + "ssi": 1491, + "più": 1492, + "ini": 1493, + "bb": 1494, + "sto": 1495, + "sono": 1496, + "eri": 1497, + "sse": 1498, + "sc": 1499, + "sul": 1500, + "vano": 1501, + "sti": 1502, + "suo": 1503, + "cchi": 1504, + "zza": 1505, + "anche": 1506, + "tte": 1507, + "sci": 1508, + "col": 1509, + "sso": 1510, + "ssa": 1511, + "dei": 1512, + "aveva": 1513, + "zz": 1514, + "amo": 1515, + "gno": 1516, + "sua": 1517, + "ria": 1518, + "sì": 1519, + "ché": 1520, + "dal": 1521, + "ona": 1522, + "spe": 1523, + "gni": 1524, + "tt": 1525, + "delle": 1526, + "questo": 1527, + "nella": 1528, + "dere": 1529, + "anno": 1530, + "dell": 1531, + "uni": 1532, + "bbe": 1533, + "anti": 1534, + "ene": 1535, + "gio": 1536, + "uto": 1537, + "qual": 1538, + "glia": 1539, + "quando": 1540, + "tutto": 1541, + "glio": 1542, + "zioni": 1543, + "cam": 
1544, + "esso": 1545, + "ss": 1546, + "mol": 1547, + "loro": 1548, + "perché": 1549, + "cosa": 1550, + "due": 1551, + "poi": 1552, + "sco": 1553, + "cco": 1554, + "gna": 1555, + "tem": 1556, + "prima": 1557, + "così": 1558, + "essere": 1559, + "ani": 1560, + "bra": 1561, + "rio": 1562, + "anco": 1563, + "cui": 1564, + "spi": 1565, + "via": 1566, + "gior": 1567, + "bile": 1568, + "ggio": 1569, + "mai": 1570, + "tare": 1571, + "indi": 1572, + "rebbe": 1573, + "senza": 1574, + "zio": 1575, + "tutti": 1576, + "stato": 1577, + "zia": 1578, + "dalla": 1579, + "mia": 1580, + "vita": 1581, + "quella": 1582, + "qua": 1583, + "dove": 1584, + "allo": 1585, + "sempre": 1586, + "zzo": 1587, + "sia": 1588, + "dopo": 1589, + "porta": 1590, + "ccia": 1591, + "erano": 1592, + "anni": 1593, + "chia": 1594, + "enza": 1595, + "propri": 1596, + "anda": 1597, + "cca": 1598, + "occhi": 1599, + "questa": 1600, + "ffi": 1601, + "ron": 1602, + "mio": 1603, + "ris": 1604, + "ogni": 1605, + "rin": 1606, + "far": 1607, + "menti": 1608, + "ancora": 1609, + "fatto": 1610, + "mani": 1611, + "senti": 1612, + "pra": 1613, + "tempo": 1614, + "essi": 1615, + "bbi": 1616, + "lare": 1617, + "pers": 1618, + "sor": 1619, + "anza": 1620, + "pie": 1621, + "verso": 1622, + "altro": 1623, + "tato": 1624, + "cato": 1625, + "ato": 1626, + "volta": 1627, + "cc": 1628, + "fare": 1629, + "ciò": 1630, + "bili": 1631, + "nuo": 1632, + "quello": 1633, + "colo": 1634, + "ppo": 1635, + "trova": 1636, + "ore": 1637, + "rono": 1638, + "molto": 1639, + "almente": 1640, + "sca": 1641, + "vole": 1642, + "tali": 1643, + "sulla": 1644, + "sce": 1645, + "meno": 1646, + "anto": 1647, + "pun": 1648, + "stu": 1649, + "capi": 1650, + "giu": 1651, + "mini": 1652, + "pia": 1653, + "lavo": 1654, + "vero": 1655, + "rsi": 1656, + "altri": 1657, + "scia": 1658, + "suoi": 1659, + "glie": 1660, + "sotto": 1661, + "bene": 1662, + "scri": 1663, + "tale": 1664, + "degli": 1665, + "alc": 1666, + "uomo": 1667, + "pel": 1668, + "pote": 1669, + "essa": 1670, + "scu": 1671, + "signo": 1672, + "stro": 1673, + "uti": 1674, + "sione": 1675, + "gre": 1676, + "fini": 1677, + "lun": 1678, + "esi": 1679, + "passa": 1680, + "rà": 1681, + "mentre": 1682, + "hanno": 1683, + "usci": 1684, + "gia": 1685, + "già": 1686, + "mina": 1687, + "tica": 1688, + "giorno": 1689, + "esse": 1690, + "modo": 1691, + "spa": 1692, + "proprio": 1693, + "ori": 1694, + "contro": 1695, + "stru": 1696, + "diven": 1697, + "disse": 1698, + "rato": 1699, + "noi": 1700, + "vere": 1701, + "può": 1702, + "dice": 1703, + "cci": 1704, + "secon": 1705, + "ccio": 1706, + "qualche": 1707, + "tutta": 1708, + "gg": 1709, + "mondo": 1710, + "forma": 1711, + "mma": 1712, + "pensa": 1713, + "deva": 1714, + "fosse": 1715, + "sopra": 1716, + "tamente": 1717, + "ness": 1718, + "quanto": 1719, + "raga": 1720, + "unque": 1721, + "care": 1722, + "stre": 1723, + "grande": 1724, + "picco": 1725, + "guarda": 1726, + "nell": 1727, + "possi": 1728, + "presen": 1729, + "rò": 1730, + "paro": 1731, + "tua": 1732, + "vin": 1733, + "ane": 1734, + "stesso": 1735, + "dav": 1736, + "nei": 1737, + "nelle": 1738, + "ghi": 1739, + "pio": 1740, + "lato": 1741, + "sid": 1742, + "fine": 1743, + "fuo": 1744, + "quasi": 1745, + "ulti": 1746, + "ito": 1747, + "sue": 1748, + "fil": 1749, + "allora": 1750, + "veni": 1751, + "tano": 1752, + "ello": 1753, + "ão": 1754, + "não": 1755, + "uma": 1756, + "ela": 1757, + "lh": 1758, + "ção": 1759, + "cê": 1760, + "inha": 1761, + "você": 1762, + "ec": 1763, + "dade": 1764, + "ao": 1765, + "ram": 1766, + 
"vel": 1767, + "ém": 1768, + "pode": 1769, + "estava": 1770, + "isso": 1771, + "mui": 1772, + "faz": 1773, + "ões": 1774, + "pes": 1775, + "ix": 1776, + "sim": 1777, + "olh": 1778, + "isa": 1779, + "ên": 1780, + "tinha": 1781, + "meu": 1782, + "são": 1783, + "minha": 1784, + "muito": 1785, + "foi": 1786, + "bem": 1787, + "diz": 1788, + "parec": 1789, + "ço": 1790, + "pesso": 1791, + "pois": 1792, + "mesmo": 1793, + "ções": 1794, + "seus": 1795, + "até": 1796, + "ência": 1797, + "lhe": 1798, + "tiv": 1799, + "mã": 1800, + "só": 1801, + "tão": 1802, + "tudo": 1803, + "então": 1804, + "inda": 1805, + "bal": 1806, + "indo": 1807, + "ndo": 1808, + "já": 1809, + "vam": 1810, + "eito": 1811, + "depois": 1812, + "mel": 1813, + "lha": 1814, + "ainda": 1815, + "fazer": 1816, + "pou": 1817, + "pergun": 1818, + "deix": 1819, + "tamb": 1820, + "ala": 1821, + "pelo": 1822, + "também": 1823, + "fica": 1824, + "prec": 1825, + "eles": 1826, + "havia": 1827, + "lá": 1828, + "nas": 1829, + "gem": 1830, + "mem": 1831, + "ós": 1832, + "deu": 1833, + "eiro": 1834, + "..": 1835, + "assim": 1836, + "ior": 1837, + "har": 1838, + "aqui": 1839, + "cul": 1840, + "sar": 1841, + "outra": 1842, + "olhos": 1843, + "ima": 1844, + "mim": 1845, + "ago": 1846, + "pessoas": 1847, + "eram": 1848, + "eira": 1849, + "pela": 1850, + "coisa": 1851, + "mão": 1852, + "conh": 1853, + "agora": 1854, + "iam": 1855, + "há": 1856, + "suas": 1857, + "guém": 1858, + "cabe": 1859, + "nem": 1860, + "ível": 1861, + "consegu": 1862, + "trabal": 1863, + "lev": 1864, + "lem": 1865, + "vai": 1866, + "tei": 1867, + "pró": 1868, + "quem": 1869, + "onde": 1870, + "cabeça": 1871, + "nunca": 1872, + "mentos": 1873, + "hum": 1874, + "dele": 1875, + "verdade": 1876, + "tá": 1877, + "hos": 1878, + "algum": 1879, + "dizer": 1880, + "penas": 1881, + "nós": 1882, + "enquanto": 1883, + "outro": 1884, + "lho": 1885, + "melhor": 1886, + "primei": 1887, + "iu": 1888, + "apenas": 1889, + "estou": 1890, + "conte": 1891, + "homem": 1892, + "dois": 1893, + "ças": 1894, + "pouco": 1895, + "senhor": 1896, + "tando": 1897, + "espera": 1898, + "pai": 1899, + "rios": 1900, + "baix": 1901, + "ase": 1902, + "isas": 1903, + "hora": 1904, + "ficar": 1905, + "seja": 1906, + "ân": 1907, + "clar": 1908, + "inc": 1909, + "fos": 1910, + "ouvi": 1911, + "vem": 1912, + "tava": 1913, + "ário": 1914, + "sos": 1915, + "inho": 1916, + "rando": 1917, + "ês": 1918, + "coisas": 1919, + "aconte": 1920, + "lher": 1921, + "anos": 1922, + "talvez": 1923, + "estão": 1924, + "liv": 1925, + "outros": 1926, + "qualquer": 1927, + "gou": 1928, + "lí": 1929, + "tivesse": 1930, + "rado": 1931, + "precisa": 1932, + "mãe": 1933, + "dela": 1934, + "entra": 1935, + "maior": 1936, + "noite": 1937, + "tiva": 1938, + "pala": 1939, + "ração": 1940, + "deus": 1941, + "sas": 1942, + "inte": 1943, + "fei": 1944, + "palav": 1945, + "trás": 1946, + "cidade": 1947, + "lugar": 1948, + "vezes": 1949, + "encontra": 1950, + "tru": 1951, + "eci": 1952, + "ın": 1953, + "bir": 1954, + "yor": 1955, + "ek": 1956, + "dı": 1957, + "ey": 1958, + "tı": 1959, + "mı": 1960, + "iz": 1961, + "ır": 1962, + "gö": 1963, + "sı": 1964, + "bil": 1965, + "lı": 1966, + "üz": 1967, + "iç": 1968, + "iy": 1969, + "ım": 1970, + "uz": 1971, + "cak": 1972, + "iş": 1973, + "ını": 1974, + "iyor": 1975, + "baş": 1976, + "dü": 1977, + "değ": 1978, + "kar": 1979, + "ev": 1980, + "öy": 1981, + "bun": 1982, + "yap": 1983, + "sun": 1984, + "gör": 1985, + "yı": 1986, + "ki": 1987, + "ara": 1988, + "alı": 1989, + "onu": 1990, + "çı": 1991, + "şey": 
1992, + "sın": 1993, + "kı": 1994, + "kad": 1995, + "ağ": 1996, + "değil": 1997, + "ük": 1998, + "çok": 1999, + "şı": 2000, + "ül": 2001, + "için": 2002, + "eye": 2003, + "oldu": 2004, + "mış": 2005, + "kal": 2006, + "mek": 2007, + "öyle": 2008, + "yordu": 2009, + "yüz": 2010, + "miş": 2011, + "mak": 2012, + "ola": 2013, + "yan": 2014, + "cek": 2015, + "yorum": 2016, + "bak": 2017, + "üm": 2018, + "ları": 2019, + "oğ": 2020, + "kadar": 2021, + "arı": 2022, + "ında": 2023, + "gün": 2024, + "yok": 2025, + "yer": 2026, + "dım": 2027, + "daha": 2028, + "ına": 2029, + "dim": 2030, + "bilir": 2031, + "iki": 2032, + "siz": 2033, + "diğ": 2034, + "bü": 2035, + "düş": 2036, + "üç": 2037, + "unu": 2038, + "aman": 2039, + "fak": 2040, + "ede": 2041, + "sonra": 2042, + "hiç": 2043, + "aki": 2044, + "ğı": 2045, + "bul": 2046, + "maz": 2047, + "anla": 2048, + "bura": 2049, + "geç": 2050, + "maya": 2051, + "konu": 2052, + "din": 2053, + "tek": 2054, + "zaman": 2055, + "eler": 2056, + "öz": 2057, + "dır": 2058, + "gibi": 2059, + "şa": 2060, + "leri": 2061, + "kim": 2062, + "ku": 2063, + "fakat": 2064, + "yar": 2065, + "göz": 2066, + "cı": 2067, + "yorsun": 2068, + "bek": 2069, + "inde": 2070, + "pek": 2071, + "bunu": 2072, + "lik": 2073, + "iler": 2074, + "edi": 2075, + "öl": 2076, + "sür": 2077, + "sır": 2078, + "çık": 2079, + "sıl": 2080, + "alar": 2081, + "kes": 2082, + "yak": 2083, + "çek": 2084, + "yıl": 2085, + "ecek": 2086, + "ız": 2087, + "git": 2088, + "kap": 2089, + "ama": 2090, + "ıl": 2091, + "ların": 2092, + "biz": 2093, + "tır": 2094, + "oy": 2095, + "ancak": 2096, + "doğ": 2097, + "bana": 2098, + "şim": 2099, + "başla": 2100, + "lü": 2101, + "madı": 2102, + "beni": 2103, + "yük": 2104, + "lık": 2105, + "beş": 2106, + "nasıl": 2107, + "tık": 2108, + "tür": 2109, + "daki": 2110, + "ceğ": 2111, + "zı": 2112, + "iyi": 2113, + "dok": 2114, + "benim": 2115, + "cağ": 2116, + "yen": 2117, + "şu": 2118, + "mez": 2119, + "düşün": 2120, + "kendi": 2121, + "şimdi": 2122, + "yol": 2123, + "yu": 2124, + "iste": 2125, + "sek": 2126, + "mam": 2127, + "söyle": 2128, + "dik": 2129, + "kur": 2130, + "olduğ": 2131, + "sını": 2132, + "biliyor": 2133, + "kan": 2134, + "yal": 2135, + "meye": 2136, + "muş": 2137, + "kaç": 2138, + "iye": 2139, + "tü": 2140, + "ef": 2141, + "tım": 2142, + "evet": 2143, + "yet": 2144, + "burada": 2145, + "tim": 2146, + "biraz": 2147, + "kor": 2148, + "doğru": 2149, + "inin": 2150, + "kız": 2151, + "diye": 2152, + "dör": 2153, + "etti": 2154, + "onun": 2155, + "isti": 2156, + "ği": 2157, + "sana": 2158, + "üş": 2159, + "arka": 2160, + "hayır": 2161, + "karşı": 2162, + "ile": 2163, + "hak": 2164, + "ıyor": 2165, + "neden": 2166, + "sev": 2167, + "sız": 2168, + "çocu": 2169, + "çalı": 2170, + "olur": 2171, + "bır": 2172, + "gir": 2173, + "ise": 2174, + "ih": 2175, + "kır": 2176, + "dön": 2177, + "böyle": 2178, + "seni": 2179, + "!\"": 2180, + "dört": 2181, + "söy": 2182, + "oş": 2183, + "musun": 2184, + "laş": 2185, + "ip": 2186, + "kay": 2187, + "hem": 2188, + "büyük": 2189, + "aç": 2190, + "bırak": 2191, + "misin": 2192, + "söz": 2193, + "değiş": 2194, + "ünü": 2195, + "gül": 2196, + "kö": 2197, + "karı": 2198, + "tamam": 2199, + "olu": 2200, + "yeni": 2201, + "lam": 2202, + "mıştı": 2203, + "yaş": 2204, + "iniz": 2205, + "kadın": 2206, + "bunun": 2207, + "mey": 2208, + "altı": 2209, + "yi": 2210, + "inden": 2211, + "senin": 2212, + "yat": 2213, + "top": 2214, + "isi": 2215, + "dün": 2216, + "hiçbir": 2217, + "yon": 2218, + "dın": 2219, + "tün": 2220, + "başka": 2221, + "hep": 2222, + 
"irmi": 2223, + "devam": 2224, + "olacak": 2225, + "artık": 2226, + "durum": 2227, + "imiz": 2228, + "üzel": 2229, + "lerini": 2230, + "sağ": 2231, + "gerek": 2232, + "yirmi": 2233, + "şek": 2234, + "bağ": 2235, + "lara": 2236, + "yür": 2237, + "ması": 2238, + "katı": 2239, + "dedi": 2240, + "gü": 2241, + "sorun": 2242, + "üne": 2243, + "mız": 2244, + "yapı": 2245, + "mil": 2246, + "ğını": 2247, + "tara": 2248, + "vardı": 2249, + "konuş": 2250, + "arak": 2251, + "larak": 2252, + "çocuk": 2253, + "bütün": 2254, + "ley": 2255, + "dür": 2256, + "güzel": 2257, + "ayı": 2258, + "yapa": 2259, + "nı": 2260, + "ayr": 2261, + "öne": 2262, + "yordum": 2263, + "ban": 2264, + "i̇ş": 2265, + "dum": 2266, + "yorlar": 2267, + "larını": 2268, + "çıkar": 2269, + "zan": 2270, + "seç": 2271, + "liyor": 2272, + "tak": 2273, + "şık": 2274, + "tekrar": 2275, + "aş": 2276, + "eş": 2277, + "mişti": 2278, + "kin": 2279, + "imi": 2280, + "eğ": 2281, + "gidi": 2282, + "leş": 2283, + "başladı": 2284, + "gide": 2285, + "otur": 2286, + "dde": 2287, + "ından": 2288, + "üzer": 2289, + "ının": 2290, + "nız": 2291, + "uy": 2292, + "yedi": 2293, + "kat": 2294, + "olarak": 2295, + "ladı": 2296, + "yalnız": 2297, + "bah": 2298, + "iyet": 2299, + "sak": 2300, + "açık": 2301, + "sında": 2302, + "...": 2303, + "insan": 2304, + "aynı": 2305, + "eder": 2306, + "istan": 2307, + "uzun": 2308, + "geri": 2309, + "erek": 2310, + "olan": 2311, + "gerçek": 2312, + "alan": 2313, + "dış": 2314, + "alık": 2315, + "fark": 2316, + "üst": 2317, + "sade": 2318, + "kiş": 2319, + "ldı": 2320, + "zor": 2321, + "etir": 2322, + "herkes": 2323, + "ömer": 2324, + "unda": 2325, + "haf": 2326, + "buna": 2327, + "ydı": 2328, + "peki": 2329, + "adam": 2330, + "haz": 2331, + "sına": 2332, + "kapı": 2333, + "görüş": 2334, + "sadece": 2335, + "aldı": 2336, + "geldi": 2337, + "rz": 2338, + "sz": 2339, + "cz": 2340, + "ię": 2341, + "dz": 2342, + "ał": 2343, + "się": 2344, + "rze": 2345, + "że": 2346, + "wy": 2347, + "rzy": 2348, + "ła": 2349, + "ło": 2350, + "ny": 2351, + "dzie": 2352, + "dzi": 2353, + "czy": 2354, + "cie": 2355, + "prze": 2356, + "dy": 2357, + "kie": 2358, + "ry": 2359, + "ją": 2360, + "ów": 2361, + "przy": 2362, + "mie": 2363, + "szy": 2364, + "cze": 2365, + "bie": 2366, + "cy": 2367, + "nia": 2368, + "ści": 2369, + "sze": 2370, + "jest": 2371, + "ży": 2372, + "ną": 2373, + "któ": 2374, + "ała": 2375, + "mnie": 2376, + "ły": 2377, + "cza": 2378, + "jak": 2379, + "roz": 2380, + "ró": 2381, + "zna": 2382, + "łu": 2383, + "ść": 2384, + "wia": 2385, + "wszy": 2386, + "spo": 2387, + "gdy": 2388, + "wał": 2389, + "wię": 2390, + "łem": 2391, + "ję": 2392, + "sk": 2393, + "rę": 2394, + "dob": 2395, + "już": 2396, + "bę": 2397, + "ałem": 2398, + "sza": 2399, + "pod": 2400, + "dla": 2401, + "pan": 2402, + "nę": 2403, + "może": 2404, + "śli": 2405, + "ało": 2406, + "lko": 2407, + "nych": 2408, + "powie": 2409, + "cię": 2410, + "tylko": 2411, + "naj": 2412, + "tego": 2413, + "ski": 2414, + "nego": 2415, + "wszyst": 2416, + "szcze": 2417, + "jed": 2418, + "jej": 2419, + "two": 2420, + "ąd": 2421, + "śmy": 2422, + "czę": 2423, + "wać": 2424, + "jego": 2425, + "ża": 2426, + "sy": 2427, + "praw": 2428, + "tym": 2429, + "który": 2430, + "ały": 2431, + "trze": 2432, + "niej": 2433, + "nym": 2434, + "gło": 2435, + "jąc": 2436, + "mówi": 2437, + "ska": 2438, + "nej": 2439, + "słu": 2440, + "wła": 2441, + "będzie": 2442, + "dę": 2443, + "pó": 2444, + "bez": 2445, + "nic": 2446, + "pła": 2447, + "ście": 2448, + "są": 2449, + "trzy": 2450, + "kiem": 2451, + "był": 
2452, + "mog": 2453, + "robi": 2454, + "tam": 2455, + "mię": 2456, + "zy": 2457, + "pew": 2458, + "myś": 2459, + "przed": 2460, + "sko": 2461, + "które": 2462, + "lę": 2463, + "wsze": 2464, + "ąc": 2465, + "było": 2466, + "sobie": 2467, + "py": 2468, + "cią": 2469, + "jeszcze": 2470, + "tę": 2471, + "czas": 2472, + "szę": 2473, + "gł": 2474, + "kę": 2475, + "czu": 2476, + "przez": 2477, + "sło": 2478, + "wz": 2479, + "kto": 2480, + "ków": 2481, + "czo": 2482, + "liśmy": 2483, + "więc": 2484, + "rą": 2485, + "wó": 2486, + "rza": 2487, + "ności": 2488, + "wet": 2489, + "nął": 2490, + "śmie": 2491, + "nawet": 2492, + "musi": 2493, + "swo": 2494, + "tej": 2495, + "wą": 2496, + "wu": 2497, + "wią": 2498, + "niu": 2499, + "czą": 2500, + "dzo": 2501, + "skie": 2502, + "jeśli": 2503, + "czego": 2504, + "chy": 2505, + "dł": 2506, + "tych": 2507, + "bym": 2508, + "żo": 2509, + "eś": 2510, + "sią": 2511, + "kiedy": 2512, + "wró": 2513, + "dze": 2514, + "dro": 2515, + "rów": 2516, + "pani": 2517, + "kul": 2518, + "nad": 2519, + "chwi": 2520, + "nim": 2521, + "być": 2522, + "chodzi": 2523, + "nio": 2524, + "dobrze": 2525, + "teraz": 2526, + "wokul": 2527, + "coś": 2528, + "kł": 2529, + "pier": 2530, + "gdzie": 2531, + "dzy": 2532, + "pię": 2533, + "dź": 2534, + "ką": 2535, + "gó": 2536, + "zda": 2537, + "chce": 2538, + "stę": 2539, + "świa": 2540, + "wszystko": 2541, + "peł": 2542, + "wiem": 2543, + "wiel": 2544, + "każ": 2545, + "rzu": 2546, + "sły": 2547, + "jedna": 2548, + "myśl": 2549, + "mój": 2550, + "jestem": 2551, + "óż": 2552, + "miej": 2553, + "moż": 2554, + "kła": 2555, + "resz": 2556, + "dłu": 2557, + "stwo": 2558, + "nię": 2559, + "masz": 2560, + "żeby": 2561, + "niem": 2562, + "jakie": 2563, + "sty": 2564, + "nią": 2565, + "wej": 2566, + "oj": 2567, + "sła": 2568, + "ność": 2569, + "zło": 2570, + "szczę": 2571, + "lej": 2572, + "wego": 2573, + "cał": 2574, + "dział": 2575, + "kich": 2576, + "dza": 2577, + "dzię": 2578, + "oczy": 2579, + "zosta": 2580, + "czło": 2581, + "nam": 2582, + "kil": 2583, + "szu": 2584, + "wę": 2585, + "miał": 2586, + "strze": 2587, + "cej": 2588, + "ej": 2589, + "znaj": 2590, + "dać": 2591, + "miejs": 2592, + "kró": 2593, + "kry": 2594, + "bardzo": 2595, + "śnie": 2596, + "lą": 2597, + "gie": 2598, + "ciebie": 2599, + "dni": 2600, + "potrze": 2601, + "wokulski": 2602, + "uwa": 2603, + "umie": 2604, + "jednak": 2605, + "kra": 2606, + "wróci": 2607, + "człowie": 2608, + "czyć": 2609, + "była": 2610, + "żeli": 2611, + "mę": 2612, + "cę": 2613, + "zrobi": 2614, + "mogę": 2615, + "prowa": 2616, + "rem": 2617, + "niech": 2618, + "cznie": 2619, + "kro": 2620, + "tą": 2621, + "chci": 2622, + "bro": 2623, + "dzieć": 2624, + "szą": 2625, + "pad": 2626, + "trz": 2627, + "jem": 2628, + "tów": 2629, + "dru": 2630, + "taj": 2631, + "rzekł": 2632, + "niego": 2633, + "takie": 2634, + "wała": 2635, + "towa": 2636, + "kapła": 2637, + "widzi": 2638, + "podob": 2639, + "dzę": 2640, + "tał": 2641, + "stęp": 2642, + "bą": 2643, + "poko": 2644, + "wem": 2645, + "gę": 2646, + "aby": 2647, + "albo": 2648, + "spra": 2649, + "zno": 2650, + "smo": 2651, + "jesz": 2652, + "księ": 2653, + "jesteś": 2654, + "poz": 2655, + "nigdy": 2656, + "ksią": 2657, + "cóż": 2658, + "ws": 2659, + "pow": 2660, + "tka": 2661, + "świe": 2662, + "szka": 2663, + "samo": 2664, + "sł": 2665, + "rzę": 2666, + "nale": 2667, + "chcesz": 2668, + "nik": 2669, + "pę": 2670, + "chyba": 2671, + "ciąg": 2672, + "jący": 2673, + "woj": 2674, + "nasze": 2675, + "mniej": 2676, + "więcej": 2677, + "zwy": 2678, + "osta": 2679, + 
"waż": 2680, + "śmier": 2681, + "wier": 2682, + "dzą": 2683, + "zaś": 2684, + "gdyby": 2685, + "jaki": 2686, + "wol": 2687, + "win": 2688, + "dą": 2689, + "ścia": 2690, + "rozma": 2691, + "wal": 2692, + "panie": 2693, + "star": 2694, + "kaz": 2695, + "jeżeli": 2696, + "wra": 2697, + "koń": 2698, + "siebie": 2699, + "znowu": 2700, + "czem": 2701, + "stwa": 2702, + "isto": 2703, + "pół": 2704, + "dał": 2705, + "kobie": 2706, + "ałam": 2707, + "wych": 2708, + "cesa": 2709, + "nich": 2710, + "zawsze": 2711, + "dzić": 2712, + "też": 2713, + "lepie": 2714, + "proszę": 2715, + "kre": 2716, + "twa": 2717, + "łą": 2718, + "chu": 2719, + "cą": 2720, + "prz": 2721, + "łe": 2722, + "szedł": 2723, + "odpowie": 2724, + "myśli": 2725, + "świą": 2726, + "ź": 2727, + "ł": 2728, + "&": 2729, + "=": 2730, + "ă": 2731, + "đ": 2732, + "ţ": 2733, + "–": 2734, + "‘": 2735, + "ij": 2736, + "aa": 2737, + "een": 2738, + "het": 2739, + "aar": 2740, + "oor": 2741, + "ijn": 2742, + "dat": 2743, + "oe": 2744, + "ijk": 2745, + "aan": 2746, + "voor": 2747, + "iet": 2748, + "zijn": 2749, + "niet": 2750, + "oo": 2751, + "moet": 2752, + "heb": 2753, + "uit": 2754, + "wij": 2755, + "aat": 2756, + "lijk": 2757, + "sl": 2758, + "daar": 2759, + "deze": 2760, + "worden": 2761, + "moeten": 2762, + "onder": 2763, + "hebben": 2764, + "ook": 2765, + "ct": 2766, + "nog": 2767, + "aal": 2768, + "eer": 2769, + "bij": 2770, + "mijn": 2771, + "kom": 2772, + "atie": 2773, + "eft": 2774, + "kel": 2775, + "rij": 2776, + "heid": 2777, + "af": 2778, + "stel": 2779, + "maar": 2780, + "wee": 2781, + "heeft": 2782, + "waar": 2783, + "eren": 2784, + "wat": 2785, + "wil": 2786, + "aag": 2787, + "bet": 2788, + "hij": 2789, + "kun": 2790, + "uw": 2791, + "dt": 2792, + "door": 2793, + "tij": 2794, + "ond": 2795, + "geen": 2796, + "gev": 2797, + "veel": 2798, + "naar": 2799, + "aten": 2800, + "kunnen": 2801, + "echt": 2802, + "goe": 2803, + "twee": 2804, + "delijk": 2805, + "uur": 2806, + "toe": 2807, + "meer": 2808, + "onze": 2809, + "tijd": 2810, + "hoe": 2811, + "tot": 2812, + "zou": 2813, + "aak": 2814, + "amen": 2815, + "woor": 2816, + "wordt": 2817, + "gelijk": 2818, + "gaan": 2819, + "ker": 2820, + "eld": 2821, + "hou": 2822, + "zel": 2823, + "tegen": 2824, + "komen": 2825, + "werk": 2826, + "goed": 2827, + "zal": 2828, + "zij": 2829, + "slag": 2830, + "zien": 2831, + "echter": 2832, + "itie": 2833, + "tie": 2834, + "elijk": 2835, + "ische": 2836, + "belan": 2837, + "haar": 2838, + "vr": 2839, + "grijk": 2840, + "doen": 2841, + "land": 2842, + "belangrijk": 2843, + "open": 2844, + "ctie": 2845, + "zelf": 2846, + "mij": 2847, + "iteit": 2848, + "stem": 2849, + "mee": 2850, + "aren": 2851, + "dien": 2852, + "gaat": 2853, + "prob": 2854, + "moe": 2855, + "ullen": 2856, + "zich": 2857, + "daarom": 2858, + "orm": 2859, + "staat": 2860, + "zit": 2861, + "dui": 2862, + "dus": 2863, + "ds": 2864, + "verslag": 2865, + "kelijk": 2866, + "proble": 2867, + "schap": 2868, + "gd": 2869, + "hun": 2870, + "erd": 2871, + "zet": 2872, + "staan": 2873, + "maal": 2874, + "inder": 2875, + "eid": 2876, + "kken": 2877, + "ged": 2878, + "zullen": 2879, + "mensen": 2880, + "jaar": 2881, + "regel": 2882, + "ieder": 2883, + "volgen": 2884, + "geven": 2885, + "even": 2886, + "blij": 2887, + "ië": 2888, + "uwe": 2889, + "maken": 2890, + "oek": 2891, + "nieuwe": 2892, + "baar": 2893, + "andere": 2894, + "ruik": 2895, + "agen": 2896, + "ouw": 2897, + "willen": 2898, + "aakt": 2899, + "hoo": 2900, + "anden": 2901, + "lig": 2902, + "samen": 2903, + "zeer": 2904, + "duidelijk": 
2905, + "antwoor": 2906, + "heel": 2907, + "punt": 2908, + "houden": 2909, + "vraag": 2910, + "gele": 2911, + "eens": 2912, + "besch": 2913, + "omen": 2914, + "erg": 2915, + "doel": 2916, + "dag": 2917, + "uren": 2918, + "ings": 2919, + "oren": 2920, + "delen": 2921, + "steun": 2922, + "innen": 2923, + "pol": 2924, + "oon": 2925, + "sn": 2926, + "zonder": 2927, + "nodig": 2928, + "alleen": 2929, + "mid": 2930, + "ragen": 2931, + "iets": 2932, + "versch": 2933, + "gebruik": 2934, + "rouw": 2935, + "stellen": 2936, + "menten": 2937, + "eerste": 2938, + "laat": 2939, + "groot": 2940, + "ood": 2941, + "toch": 2942, + "laten": 2943, + "aard": 2944, + "sle": 2945, + "deel": 2946, + "plaat": 2947, + "ree": 2948, + "betre": 2949, + "lid": 2950, + "uiten": 2951, + "racht": 2952, + "beleid": 2953, + "stie": 2954, + "staten": 2955, + "ggen": 2956, + "reken": 2957, + "alen": 2958, + "ming": 2959, + "mogelijk": 2960, + "grote": 2961, + "altijd": 2962, + "enkel": 2963, + "wik": 2964, + "politie": 2965, + "elk": 2966, + "handel": 2967, + "kwe": 2968, + "maat": 2969, + "elen": 2970, + "vrij": 2971, + "jes": 2972, + "aam": 2973, + "huis": 2974, + "weer": 2975, + "lidstaten": 2976, + "king": 2977, + "kle": 2978, + "bed": 2979, + "geval": 2980, + "wikkel": 2981, + "kwestie": 2982, + "stee": 2983, + "hel": 2984, + "komst": 2985, + "iden": 2986, + "eerd": 2987, + "tweede": 2988, + "probleem": 2989, + "ussen": 2990, + "snel": 2991, + "tig": 2992, + "ult": 2993, + "nemen": 2994, + "commis": 2995, + "verschil": 2996, + "zoek": 2997, + "krij": 2998, + "graag": 2999, + "denk": 3000, + "landen": 3001, + "reden": 3002, + "besl": 3003, + "oeg": 3004, + "beter": 3005, + "heden": 3006, + "mag": 3007, + "boven": 3008, + "cont": 3009, + "fd": 3010, + "hele": 3011, + "vier": 3012, + "gez": 3013, + "kw": 3014, + "aas": 3015, + "ontwikkel": 3016, + "drie": 3017, + "vaak": 3018, + "plaats": 3019, + "gang": 3020, + "ijf": 3021, + "natuur": 3022, + "tussen": 3023, + "bat": 3024, + "komt": 3025, + "wacht": 3026, + "aad": 3027, + "achter": 3028, + "gebie": 3029, + "verk": 3030, + "ligt": 3031, + "nieuw": 3032, + "vand": 3033, + "ý": 3034, + "ď": 3035, + "ě": 3036, + "ř": 3037, + "ť": 3038, + "ů": 3039, + "„": 3040, + "ní": 3041, + "ně": 3042, + "ře": 3043, + "ná": 3044, + "vě": 3045, + "vá": 3046, + "rá": 3047, + "vy": 3048, + "mě": 3049, + "ři": 3050, + "ří": 3051, + "že": 3052, + "jí": 3053, + "vý": 3054, + "ji": 3055, + "dě": 3056, + "če": 3057, + "tě": 3058, + "ky": 3059, + "še": 3060, + "ké": 3061, + "ší": 3062, + "pře": 3063, + "ví": 3064, + "ný": 3065, + "ži": 3066, + "má": 3067, + "cí": 3068, + "zá": 3069, + "ské": 3070, + "dá": 3071, + "byl": 3072, + "tí": 3073, + "pří": 3074, + "při": 3075, + "či": 3076, + "vní": 3077, + "ča": 3078, + "dí": 3079, + "dní": 3080, + "ká": 3081, + "nou": 3082, + "vět": 3083, + "pě": 3084, + "kou": 3085, + "ých": 3086, + "bě": 3087, + "prá": 3088, + "jako": 3089, + "ží": 3090, + "zí": 3091, + "jsou": 3092, + "jsem": 3093, + "lní": 3094, + "cké": 3095, + "vat": 3096, + "před": 3097, + "hla": 3098, + "stá": 3099, + "čí": 3100, + "ši": 3101, + "kla": 3102, + "ště": 3103, + "lou": 3104, + "mů": 3105, + "chá": 3106, + "pů": 3107, + "také": 3108, + "dů": 3109, + "nost": 3110, + "tře": 3111, + "sku": 3112, + "vše": 3113, + "tní": 3114, + "byla": 3115, + "ční": 3116, + "jeho": 3117, + "bý": 3118, + "vání": 3119, + "ných": 3120, + "tři": 3121, + "vz": 3122, + "stře": 3123, + "dva": 3124, + "hle": 3125, + "čá": 3126, + "nosti": 3127, + "vš": 3128, + "hra": 3129, + "jen": 3130, + "slo": 3131, + "však": 
3132, + "kdy": 3133, + "bylo": 3134, + "bude": 3135, + "jší": 3136, + "vých": 3137, + "ním": 3138, + "sm": 3139, + "koli": 3140, + "rů": 3141, + "může": 3142, + "není": 3143, + "hod": 3144, + "bí": 3145, + "tý": 3146, + "stě": 3147, + "uje": 3148, + "sá": 3149, + "pět": 3150, + "krá": 3151, + "tom": 3152, + "ství": 3153, + "vně": 3154, + "sed": 3155, + "své": 3156, + "pí": 3157, + "musí": 3158, + "už": 3159, + "tím": 3160, + "jící": 3161, + "jedno": 3162, + "čas": 3163, + "čty": 3164, + "ský": 3165, + "evro": 3166, + "toho": 3167, + "hy": 3168, + "kter": 3169, + "rní": 3170, + "stí": 3171, + "svě": 3172, + "pak": 3173, + "všech": 3174, + "ků": 3175, + "ng": 3176, + "ád": 3177, + "chází": 3178, + "být": 3179, + "první": 3180, + "mno": 3181, + "ského": 3182, + "pá": 3183, + "nebo": 3184, + "kem": 3185, + "sla": 3186, + "ného": 3187, + "zde": 3188, + "další": 3189, + "řa": 3190, + "čtyři": 3191, + "hrá": 3192, + "druh": 3193, + "lně": 3194, + "vla": 3195, + "ských": 3196, + "ško": 3197, + "půso": 3198, + "proto": 3199, + "vů": 3200, + "ská": 3201, + "šest": 3202, + "dně": 3203, + "ještě": 3204, + "mezi": 3205, + "několi": 3206, + "již": 3207, + "čně": 3208, + "slu": 3209, + "zná": 3210, + "sedm": 3211, + "vlá": 3212, + "osm": 3213, + "byly": 3214, + "vám": 3215, + "cký": 3216, + "tech": 3217, + "ději": 3218, + "velmi": 3219, + "leži": 3220, + "vala": 3221, + "lý": 3222, + "tvo": 3223, + "spole": 3224, + "stup": 3225, + "mož": 3226, + "evrop": 3227, + "stal": 3228, + "jde": 3229, + "rodi": 3230, + "její": 3231, + "poli": 3232, + "devět": 3233, + "sme": 3234, + "až": 3235, + "této": 3236, + "tento": 3237, + "kaž": 3238, + "nula": 3239, + "bych": 3240, + "moc": 3241, + "stou": 3242, + "kdo": 3243, + "zd": 3244, + "praco": 3245, + "tomu": 3246, + "ným": 3247, + "živo": 3248, + "zem": 3249, + "násle": 3250, + "sky": 3251, + "jich": 3252, + "měl": 3253, + "děla": 3254, + "jsme": 3255, + "nice": 3256, + "stej": 3257, + "stní": 3258, + "náro": 3259, + "nit": 3260, + "později": 3261, + "tako": 3262, + "nce": 3263, + "čer": 3264, + "ším": 3265, + "něco": 3266, + "vál": 3267, + "řej": 3268, + "krát": 3269, + "ální": 3270, + "asi": 3271, + "které": 3272, + "stav": 3273, + "mají": 3274, + "mys": 3275, + "době": 3276, + "sně": 3277, + "zku": 3278, + "tů": 3279, + "chod": 3280, + "spě": 3281, + "jejich": 3282, + "součas": 3283, + "vali": 3284, + "kte": 3285, + "prů": 3286, + "zení": 3287, + "pat": 3288, + "potře": 3289, + "dnes": 3290, + "zemí": 3291, + "znam": 3292, + "mám": 3293, + "tedy": 3294, + "hlavní": 3295, + "použí": 3296, + "bní": 3297, + "vede": 3298, + "lep": 3299, + "jek": 3300, + "prav": 3301, + "politi": 3302, + "dne": 3303, + "čení": 3304, + "než": 3305, + "děl": 3306, + "čo": 3307, + "cích": 3308, + "sté": 3309, + "dlou": 3310, + "několik": 3311, + "vyu": 3312, + "ckých": 3313, + "nové": 3314, + "čin": 3315, + "dělá": 3316, + "ký": 3317, + "obla": 3318, + "podle": 3319, + "důleži": 3320, + "poku": 3321, + "kone": 3322, + "dý": 3323, + "dvě": 3324, + "žád": 3325, + "nout": 3326, + "tku": 3327, + "tvr": 3328, + "ckého": 3329, + "rov": 3330, + "tele": 3331, + "psa": 3332, + "svět": 3333, + "tivní": 3334, + "dosta": 3335, + "šel": 3336, + "druhé": 3337, + "skou": 3338, + "žo": 3339, + "jedná": 3340, + "význam": 3341, + "problé": 3342, + "publi": 3343, + "ván": 3344, + "odpo": 3345, + "podpo": 3346, + "dle": 3347, + "jaké": 3348, + "šení": 3349, + "vím": 3350, + "během": 3351, + "nachází": 3352, + "slou": 3353, + "pouze": 3354, + "otá": 3355, + "plo": 3356, + "tové": 3357, + "větši": 3358, + 
"komi": 3359, + "vají": 3360, + "tyto": 3361, + "zápa": 3362, + "změ": 3363, + "moh": 3364, + "více": 3365, + "společ": 3366, + "auto": 3367, + "proti": 3368, + "dět": 3369, + "cháze": 3370, + "žel": 3371, + "«": 3372, + "»": 3373, + "а": 3374, + "б": 3375, + "в": 3376, + "г": 3377, + "д": 3378, + "е": 3379, + "ж": 3380, + "з": 3381, + "и": 3382, + "й": 3383, + "к": 3384, + "л": 3385, + "м": 3386, + "н": 3387, + "о": 3388, + "п": 3389, + "р": 3390, + "с": 3391, + "т": 3392, + "у": 3393, + "ф": 3394, + "х": 3395, + "ц": 3396, + "ч": 3397, + "ш": 3398, + "щ": 3399, + "ъ": 3400, + "ы": 3401, + "ь": 3402, + "э": 3403, + "ю": 3404, + "я": 3405, + "ё": 3406, + "‑": 3407, + "−": 3408, + "ст": 3409, + "ен": 3410, + "но": 3411, + "на": 3412, + "пр": 3413, + "то": 3414, + "по": 3415, + "ра": 3416, + "го": 3417, + "ко": 3418, + "не": 3419, + "во": 3420, + "ва": 3421, + "ет": 3422, + "ер": 3423, + "ни": 3424, + "ел": 3425, + "ит": 3426, + "ны": 3427, + "за": 3428, + "ро": 3429, + "ени": 3430, + "ка": 3431, + "ли": 3432, + "ем": 3433, + "да": 3434, + "об": 3435, + "ла": 3436, + "до": 3437, + "ся": 3438, + "ть": 3439, + "от": 3440, + "ло": 3441, + "ль": 3442, + "ед": 3443, + "со": 3444, + "ми": 3445, + "ре": 3446, + "мо": 3447, + "ци": 3448, + "про": 3449, + "та": 3450, + "это": 3451, + "ки": 3452, + "ру": 3453, + "при": 3454, + "ти": 3455, + "се": 3456, + "ста": 3457, + "вы": 3458, + "мы": 3459, + "ви": 3460, + "бы": 3461, + "ма": 3462, + "ес": 3463, + "ля": 3464, + "сти": 3465, + "ле": 3466, + "что": 3467, + "ме": 3468, + "ри": 3469, + "ча": 3470, + "од": 3471, + "ей": 3472, + "ель": 3473, + "ения": 3474, + "га": 3475, + "ну": 3476, + "си": 3477, + "па": 3478, + "раз": 3479, + "бо": 3480, + "сто": 3481, + "су": 3482, + "са": 3483, + "ду": 3484, + "его": 3485, + "ест": 3486, + "ин": 3487, + "ить": 3488, + "из": 3489, + "же": 3490, + "му": 3491, + "пер": 3492, + "под": 3493, + "ение": 3494, + "сь": 3495, + "ку": 3496, + "пред": 3497, + "ного": 3498, + "ных": 3499, + "вер": 3500, + "те": 3501, + "ной": 3502, + "ции": 3503, + "де": 3504, + "ры": 3505, + "дел": 3506, + "лю": 3507, + "ве": 3508, + "он": 3509, + "мен": 3510, + "ги": 3511, + "ня": 3512, + "бу": 3513, + "пра": 3514, + "все": 3515, + "ется": 3516, + "сть": 3517, + "жа": 3518, + "дол": 3519, + "жи": 3520, + "бе": 3521, + "кон": 3522, + "сл": 3523, + "ши": 3524, + "ди": 3525, + "ств": 3526, + "ско": 3527, + "ные": 3528, + "чи": 3529, + "ют": 3530, + "дер": 3531, + "стра": 3532, + "ты": 3533, + "ход": 3534, + "щи": 3535, + "зо": 3536, + "зна": 3537, + "ности": 3538, + "чес": 3539, + "вля": 3540, + "вать": 3541, + "ор": 3542, + "пол": 3543, + "вет": 3544, + "так": 3545, + "ша": 3546, + "ту": 3547, + "сво": 3548, + "пре": 3549, + "она": 3550, + "итель": 3551, + "ный": 3552, + "сло": 3553, + "как": 3554, + "вл": 3555, + "ность": 3556, + "хо": 3557, + "мож": 3558, + "пе": 3559, + "для": 3560, + "ния": 3561, + "ное": 3562, + "рас": 3563, + "долж": 3564, + "дар": 3565, + "тель": 3566, + "ска": 3567, + "пу": 3568, + "ство": 3569, + "кото": 3570, + "раб": 3571, + "ее": 3572, + "род": 3573, + "эти": 3574, + "соб": 3575, + "ору": 3576, + "жен": 3577, + "ным": 3578, + "ити": 3579, + "ние": 3580, + "ком": 3581, + "дет": 3582, + "сту": 3583, + "гу": 3584, + "пи": 3585, + "меж": 3586, + "ению": 3587, + "тер": 3588, + "работ": 3589, + "воз": 3590, + "ция": 3591, + "кой": 3592, + "щест": 3593, + "гра": 3594, + "зи": 3595, + "ря": 3596, + "между": 3597, + "ства": 3598, + "вс": 3599, + "ело": 3600, + "ше": 3601, + "мер": 3602, + "ба": 3603, + "зы": 3604, + "лу": 
3605, + "аль": 3606, + "дей": 3607, + "гла": 3608, + "народ": 3609, + "кти": 3610, + "предста": 3611, + "лся": 3612, + "явля": 3613, + "ски": 3614, + "нов": 3615, + "един": 3616, + "ров": 3617, + "ис": 3618, + "нима": 3619, + "рем": 3620, + "ходи": 3621, + "также": 3622, + "дру": 3623, + "ать": 3624, + "след": 3625, + "гово": 3626, + "ная": 3627, + "ющи": 3628, + "ень": 3629, + "которы": 3630, + "хот": 3631, + "ву": 3632, + "их": 3633, + "ему": 3634, + "чит": 3635, + "важ": 3636, + "орга": 3637, + "чески": 3638, + "ще": 3639, + "ке": 3640, + "ха": 3641, + "пос": 3642, + "том": 3643, + "боль": 3644, + "мне": 3645, + "пас": 3646, + "объ": 3647, + "прав": 3648, + "конф": 3649, + "слу": 3650, + "поддер": 3651, + "стви": 3652, + "наш": 3653, + "лько": 3654, + "стоя": 3655, + "ную": 3656, + "лем": 3657, + "енных": 3658, + "кра": 3659, + "ды": 3660, + "международ": 3661, + "гда": 3662, + "необ": 3663, + "госу": 3664, + "ству": 3665, + "ении": 3666, + "государ": 3667, + "кто": 3668, + "им": 3669, + "чест": 3670, + "рет": 3671, + "вопро": 3672, + "лен": 3673, + "ели": 3674, + "рова": 3675, + "ций": 3676, + "нам": 3677, + "этой": 3678, + "жения": 3679, + "необходи": 3680, + "меня": 3681, + "было": 3682, + "сили": 3683, + "фи": 3684, + "вя": 3685, + "шь": 3686, + "этого": 3687, + "они": 3688, + "органи": 3689, + "безо": 3690, + "проб": 3691, + "име": 3692, + "реш": 3693, + "би": 3694, + "безопас": 3695, + "ются": 3696, + "оста": 3697, + "енно": 3698, + "год": 3699, + "ела": 3700, + "представ": 3701, + "ться": 3702, + "слово": 3703, + "организа": 3704, + "должны": 3705, + "этом": 3706, + "бла": 3707, + "че": 3708, + "чу": 3709, + "благо": 3710, + "этому": 3711, + "врем": 3712, + "спе": 3713, + "ном": 3714, + "ений": 3715, + "спо": 3716, + "нас": 3717, + "нет": 3718, + "зу": 3719, + "вед": 3720, + "еще": 3721, + "сказа": 3722, + "сей": 3723, + "ерен": 3724, + "дан": 3725, + "сам": 3726, + "еля": 3727, + "ран": 3728, + "зыва": 3729, + "является": 3730, + "будет": 3731, + "ктив": 3732, + "тре": 3733, + "деле": 3734, + "мот": 3735, + "конферен": 3736, + "лась": 3737, + "час": 3738, + "сторо": 3739, + "кого": 3740, + "ез": 3741, + "ней": 3742, + "ос": 3743, + "лись": 3744, + "разору": 3745, + "пере": 3746, + "сси": 3747, + "ными": 3748, + "проц": 3749, + "голо": 3750, + "чело": 3751, + "боле": 3752, + "челове": 3753, + "сер": 3754, + "пл": 3755, + "чет": 3756, + "стран": 3757, + "пя": 3758, + "был": 3759, + "кла": 3760, + "тов": 3761, + "жд": 3762, + "дела": 3763, + "ера": 3764, + "уже": 3765, + "совет": 3766, + "ген": 3767, + "безопасности": 3768, + "ца": 3769, + "седа": 3770, + "поз": 3771, + "ответ": 3772, + "проблем": 3773, + "нако": 3774, + "тем": 3775, + "доста": 3776, + "пы": 3777, + "ща": 3778, + "вой": 3779, + "сущест": 3780, + "необходимо": 3781, + "быть": 3782, + "может": 3783, + "дем": 3784, + "чтобы": 3785, + "ек": 3786, + "чер": 3787, + "усили": 3788, + "рес": 3789, + "руд": 3790, + "единенных": 3791, + "доб": 3792, + "дости": 3793, + "ствен": 3794, + "ядер": 3795, + "годня": 3796, + "каза": 3797, + "сегодня": 3798, + "сейчас": 3799, + "только": 3800, + "вод": 3801, + "есь": 3802, + "много": 3803, + "буду": 3804, + "ев": 3805, + "есть": 3806, + "три": 3807, + "общест": 3808, + "явл": 3809, + "высту": 3810, + "ред": 3811, + "счит": 3812, + "сит": 3813, + "делега": 3814, + "лож": 3815, + "этот": 3816, + "фор": 3817, + "клю": 3818, + "возмож": 3819, + "вания": 3820, + "бли": 3821, + "или": 3822, + "вз": 3823, + "наций": 3824, + "ского": 3825, + "приня": 3826, + "пла": 3827, + "оч": 3828, + 
"иться": 3829, + "сте": 3830, + "наши": 3831, + "которые": 3832, + "ар": 3833, + "имеет": 3834, + "сот": 3835, + "знач": 3836, + "перь": 3837, + "следу": 3838, + "ены": 3839, + "таки": 3840, + "объединенных": 3841, + "стро": 3842, + "теперь": 3843, + "бле": 3844, + "благодар": 3845, + "разв": 3846, + "ан": 3847, + "жива": 3848, + "очень": 3849, + "ят": 3850, + "без": 3851, + "обес": 3852, + "гро": 3853, + "лось": 3854, + "сы": 3855, + "организации": 3856, + "член": 3857, + "того": 3858, + "ональ": 3859, + "жда": 3860, + "всех": 3861, + "свя": 3862, + "более": 3863, + "сов": 3864, + "когда": 3865, + "вот": 3866, + "кре": 3867, + "кры": 3868, + "поэтому": 3869, + "воль": 3870, + "ой": 3871, + "генера": 3872, + "чем": 3873, + "лы": 3874, + "полити": 3875, + "вен": 3876, + "конференции": 3877, + "процес": 3878, + "бя": 3879, + "ите": 3880, + "отно": 3881, + "развити": 3882, + "аф": 3883, + "ющ": 3884, + "вно": 3885, + "мир": 3886, + "нии": 3887, + "кая": 3888, + "ас": 3889, + "ительно": 3890, + "вто": 3891, + "ением": 3892, + "генераль": 3893, + "прот": 3894, + "всем": 3895, + "самбле": 3896, + "ассамбле": 3897, + "ом": 3898, + "зд": 3899, + "смот": 3900, + "реги": 3901, + "чего": 3902, + "однако": 3903, + "усилия": 3904, + "действи": 3905, + "чно": 3906, + "уча": 3907, + "образ": 3908, + "вос": 3909, + "эта": 3910, + "перего": 3911, + "говор": 3912, + "вам": 3913, + "моло": 3914, + "время": 3915, + "дь": 3916, + "хотел": 3917, + "гру": 3918, + "заявл": 3919, + "предоста": 3920, + "поль": 3921, + "нее": 3922, + "резо": 3923, + "перегово": 3924, + "резолю": 3925, + "крет": 3926, + "поддерж": 3927, + "обеспе": 3928, + "него": 3929, + "представит": 3930, + "наде": 3931, + "кри": 3932, + "чь": 3933, + "проек": 3934, + "лет": 3935, + "други": 3936, + "_": 3937, + "،": 3938, + "؛": 3939, + "؟": 3940, + "ء": 3941, + "آ": 3942, + "أ": 3943, + "ؤ": 3944, + "إ": 3945, + "ئ": 3946, + "ا": 3947, + "ب": 3948, + "ة": 3949, + "ت": 3950, + "ث": 3951, + "ج": 3952, + "ح": 3953, + "خ": 3954, + "د": 3955, + "ذ": 3956, + "ر": 3957, + "ز": 3958, + "س": 3959, + "ش": 3960, + "ص": 3961, + "ض": 3962, + "ط": 3963, + "ظ": 3964, + "ع": 3965, + "غ": 3966, + "ـ": 3967, + "ف": 3968, + "ق": 3969, + "ك": 3970, + "ل": 3971, + "م": 3972, + "ن": 3973, + "ه": 3974, + "و": 3975, + "ى": 3976, + "ي": 3977, + "ً": 3978, + "ٌ": 3979, + "ٍ": 3980, + "َ": 3981, + "ُ": 3982, + "ِ": 3983, + "ّ": 3984, + "ْ": 3985, + "ٰ": 3986, + "چ": 3987, + "ڨ": 3988, + "ک": 3989, + "ھ": 3990, + "ی": 3991, + "ۖ": 3992, + "ۗ": 3993, + "ۘ": 3994, + "ۚ": 3995, + "ۛ": 3996, + "—": 3997, + "☭": 3998, + "ﺃ": 3999, + "ﻻ": 4000, + "ال": 4001, + "َا": 4002, + "وَ": 4003, + "َّ": 4004, + "ِي": 4005, + "أَ": 4006, + "لَ": 4007, + "نَ": 4008, + "الْ": 4009, + "هُ": 4010, + "ُو": 4011, + "ما": 4012, + "نْ": 4013, + "من": 4014, + "عَ": 4015, + "نا": 4016, + "لا": 4017, + "مَ": 4018, + "تَ": 4019, + "فَ": 4020, + "أن": 4021, + "لي": 4022, + "مِ": 4023, + "ان": 4024, + "في": 4025, + "رَ": 4026, + "يَ": 4027, + "هِ": 4028, + "مْ": 4029, + "قَ": 4030, + "بِ": 4031, + "لى": 4032, + "ين": 4033, + "إِ": 4034, + "لِ": 4035, + "وا": 4036, + "كَ": 4037, + "ها": 4038, + "ًا": 4039, + "مُ": 4040, + "ون": 4041, + "الم": 4042, + "بَ": 4043, + "يا": 4044, + "ذا": 4045, + "سا": 4046, + "الل": 4047, + "مي": 4048, + "يْ": 4049, + "را": 4050, + "ري": 4051, + "لك": 4052, + "مَا": 4053, + "نَّ": 4054, + "لم": 4055, + "إن": 4056, + "ست": 4057, + "وم": 4058, + "َّا": 4059, + "لَا": 4060, + "هم": 4061, + "ِّ": 4062, + "كُ": 4063, + "كان": 4064, + "سَ": 4065, + "با": 4066, + "دي": 4067, + 
"حَ": 4068, + "عْ": 4069, + "بي": 4070, + "الأ": 4071, + "ول": 4072, + "فِي": 4073, + "رِ": 4074, + "دا": 4075, + "مِنْ": 4076, + "ُونَ": 4077, + "وْ": 4078, + "هَا": 4079, + "ُّ": 4080, + "الس": 4081, + "الَ": 4082, + "ني": 4083, + "لْ": 4084, + "تُ": 4085, + "هل": 4086, + "رة": 4087, + "دَ": 4088, + "سْ": 4089, + "تِ": 4090, + "نَا": 4091, + "رْ": 4092, + "اللَّ": 4093, + "سامي": 4094, + "كن": 4095, + "كل": 4096, + "هَ": 4097, + "عَلَ": 4098, + "على": 4099, + "مع": 4100, + "إلى": 4101, + "قد": 4102, + "الر": 4103, + "ُوا": 4104, + "ير": 4105, + "عن": 4106, + "يُ": 4107, + "نِ": 4108, + "بْ": 4109, + "الح": 4110, + "هُمْ": 4111, + "قا": 4112, + "ذه": 4113, + "الت": 4114, + "ِينَ": 4115, + "جَ": 4116, + "هذا": 4117, + "عد": 4118, + "الع": 4119, + "دْ": 4120, + "قَالَ": 4121, + "رُ": 4122, + "يم": 4123, + "ية": 4124, + "نُ": 4125, + "خَ": 4126, + "رب": 4127, + "الك": 4128, + "وَا": 4129, + "أنا": 4130, + "ةِ": 4131, + "الن": 4132, + "حد": 4133, + "عِ": 4134, + "تا": 4135, + "هو": 4136, + "فا": 4137, + "عا": 4138, + "الش": 4139, + "لُ": 4140, + "يت": 4141, + "ذَا": 4142, + "يع": 4143, + "الذ": 4144, + "حْ": 4145, + "الص": 4146, + "إِنَّ": 4147, + "جا": 4148, + "علي": 4149, + "كَا": 4150, + "بُ": 4151, + "تع": 4152, + "وق": 4153, + "مل": 4154, + "لَّ": 4155, + "يد": 4156, + "أخ": 4157, + "رف": 4158, + "تي": 4159, + "الِ": 4160, + "ّا": 4161, + "ذلك": 4162, + "أَنْ": 4163, + "سِ": 4164, + "توم": 4165, + "مر": 4166, + "مَنْ": 4167, + "بل": 4168, + "الق": 4169, + "الله": 4170, + "ِيَ": 4171, + "كم": 4172, + "ذَ": 4173, + "عل": 4174, + "حب": 4175, + "سي": 4176, + "عُ": 4177, + "الج": 4178, + "الد": 4179, + "شَ": 4180, + "تك": 4181, + "فْ": 4182, + "صَ": 4183, + "لل": 4184, + "دِ": 4185, + "بر": 4186, + "فِ": 4187, + "ته": 4188, + "أع": 4189, + "تْ": 4190, + "قْ": 4191, + "الْأَ": 4192, + "ئِ": 4193, + "عَنْ": 4194, + "ور": 4195, + "حا": 4196, + "الَّ": 4197, + "مت": 4198, + "فر": 4199, + "دُ": 4200, + "هنا": 4201, + "وَأَ": 4202, + "تب": 4203, + "ةُ": 4204, + "أي": 4205, + "سب": 4206, + "ريد": 4207, + "وج": 4208, + "كُمْ": 4209, + "حِ": 4210, + "كْ": 4211, + "در": 4212, + "َاء": 4213, + "هذه": 4214, + "الط": 4215, + "الْمُ": 4216, + "دة": 4217, + "قل": 4218, + "غَ": 4219, + "يوم": 4220, + "الَّذ": 4221, + "كر": 4222, + "تر": 4223, + "كِ": 4224, + "كي": 4225, + "عَلَى": 4226, + "رَب": 4227, + "عة": 4228, + "قُ": 4229, + "جْ": 4230, + "فض": 4231, + "لة": 4232, + "هْ": 4233, + "رَا": 4234, + "وَلَ": 4235, + "الْمَ": 4236, + "أَنَّ": 4237, + "يَا": 4238, + "أُ": 4239, + "شي": 4240, + "اللَّهُ": 4241, + "لَى": 4242, + "قِ": 4243, + "أت": 4244, + "عَلَيْ": 4245, + "اللَّهِ": 4246, + "الب": 4247, + "ضَ": 4248, + "ةً": 4249, + "قي": 4250, + "ار": 4251, + "بد": 4252, + "خْ": 4253, + "سْتَ": 4254, + "طَ": 4255, + "قَدْ": 4256, + "ذهب": 4257, + "أم": 4258, + "ماذا": 4259, + "وَإِ": 4260, + "ةٌ": 4261, + "ونَ": 4262, + "ليلى": 4263, + "ولا": 4264, + "حُ": 4265, + "هي": 4266, + "صل": 4267, + "الخ": 4268, + "ود": 4269, + "ليس": 4270, + "لدي": 4271, + "قال": 4272, + "كَانَ": 4273, + "مَّ": 4274, + "حي": 4275, + "تم": 4276, + "لن": 4277, + "وَلَا": 4278, + "بع": 4279, + "يمكن": 4280, + "سُ": 4281, + "ةَ": 4282, + "حت": 4283, + "رًا": 4284, + "كا": 4285, + "شا": 4286, + "هِمْ": 4287, + "لَهُ": 4288, + "زَ": 4289, + "داً": 4290, + "مس": 4291, + "كث": 4292, + "الْعَ": 4293, + "جِ": 4294, + "صْ": 4295, + "فَا": 4296, + "له": 4297, + "وي": 4298, + "عَا": 4299, + "هُوَ": 4300, + "بِي": 4301, + "بَا": 4302, + "أس": 4303, + "ثَ": 4304, + "لِي": 4305, + "رض": 4306, + "الرَّ": 4307, + "لِكَ": 4308, + "تَّ": 4309, + "فُ": 
4310, + "قة": 4311, + "فعل": 4312, + "مِن": 4313, + "الآ": 4314, + "ثُ": 4315, + "سم": 4316, + "مَّا": 4317, + "بِهِ": 4318, + "تق": 4319, + "خر": 4320, + "لقد": 4321, + "خل": 4322, + "شر": 4323, + "أنت": 4324, + "لَّا": 4325, + "سن": 4326, + "السَّ": 4327, + "الذي": 4328, + "سَا": 4329, + "وما": 4330, + "زل": 4331, + "وب": 4332, + "أْ": 4333, + "إذا": 4334, + "رِي": 4335, + "حة": 4336, + "نِي": 4337, + "الْحَ": 4338, + "وَقَالَ": 4339, + "به": 4340, + "ةٍ": 4341, + "سأ": 4342, + "رٌ": 4343, + "بال": 4344, + "مة": 4345, + "شْ": 4346, + "وت": 4347, + "عند": 4348, + "فس": 4349, + "بَعْ": 4350, + "هر": 4351, + "قط": 4352, + "أح": 4353, + "إنه": 4354, + "وع": 4355, + "فت": 4356, + "غا": 4357, + "هناك": 4358, + "بت": 4359, + "مِنَ": 4360, + "سر": 4361, + "ذَلِكَ": 4362, + "رس": 4363, + "حدث": 4364, + "غْ": 4365, + "ِّي": 4366, + "الإ": 4367, + "وَيَ": 4368, + "جل": 4369, + "است": 4370, + "قِي": 4371, + "عب": 4372, + "وس": 4373, + "يش": 4374, + "الَّذِينَ": 4375, + "تاب": 4376, + "دِي": 4377, + "جب": 4378, + "كون": 4379, + "بن": 4380, + "الث": 4381, + "لَيْ": 4382, + "بعد": 4383, + "وَالْ": 4384, + "فَأَ": 4385, + "عم": 4386, + "هُم": 4387, + "تن": 4388, + "ذْ": 4389, + "أص": 4390, + "أين": 4391, + "رَبِّ": 4392, + "الذين": 4393, + "إِن": 4394, + "بين": 4395, + "جُ": 4396, + "عَلَيْهِ": 4397, + "حَا": 4398, + "لو": 4399, + "ستط": 4400, + "ظر": 4401, + "لَمْ": 4402, + "ءِ": 4403, + "كُل": 4404, + "طل": 4405, + "تَا": 4406, + "ضُ": 4407, + "كنت": 4408, + "لًا": 4409, + "مٌ": 4410, + "قبل": 4411, + "ــ": 4412, + "ذِ": 4413, + "قَوْ": 4414, + "صِ": 4415, + "مًا": 4416, + "كانت": 4417, + "صا": 4418, + "يق": 4419, + "الف": 4420, + "النا": 4421, + "مٍ": 4422, + "إِنْ": 4423, + "النَّ": 4424, + "جد": 4425, + "وَمَا": 4426, + "تت": 4427, + "بح": 4428, + "مكان": 4429, + "كيف": 4430, + "ّة": 4431, + "الا": 4432, + "جَا": 4433, + "أو": 4434, + "ساعد": 4435, + "ضِ": 4436, + "إلا": 4437, + "راً": 4438, + "قَا": 4439, + "رأ": 4440, + "عت": 4441, + "أحد": 4442, + "هد": 4443, + "ضا": 4444, + "طر": 4445, + "أق": 4446, + "ماء": 4447, + "دَّ": 4448, + "البا": 4449, + "مُو": 4450, + "أَوْ": 4451, + "طا": 4452, + "قُو": 4453, + "خِ": 4454, + "تل": 4455, + "ستطيع": 4456, + "دَا": 4457, + "النَّا": 4458, + "إلَى": 4459, + "وَتَ": 4460, + "هَذَا": 4461, + "بة": 4462, + "عليك": 4463, + "جر": 4464, + "المن": 4465, + "زا": 4466, + "رٍ": 4467, + "دع": 4468, + "ًّا": 4469, + "سة": 4470, + "ثُمَّ": 4471, + "شيء": 4472, + "الغ": 4473, + "تح": 4474, + "رُونَ": 4475, + "اليوم": 4476, + "مِي": 4477, + "نُوا": 4478, + "أر": 4479, + "تُمْ": 4480, + "عر": 4481, + "يف": 4482, + "أب": 4483, + "دًا": 4484, + "صَا": 4485, + "التَّ": 4486, + "أريد": 4487, + "الز": 4488, + "يَوْ": 4489, + "إلي": 4490, + "جي": 4491, + "يَعْ": 4492, + "فضل": 4493, + "الإن": 4494, + "أنه": 4495, + "1": 4496, + "2": 4497, + "3": 4498, + "4": 4499, + "5": 4500, + "·": 4501, + "×": 4502, + "̃": 4503, + "̌": 4504, + "ε": 4505, + "λ": 4506, + "μ": 4507, + "•": 4508, + "‧": 4509, + "─": 4510, + "□": 4511, + "、": 4512, + "。": 4513, + "〈": 4514, + "〉": 4515, + "《": 4516, + "》": 4517, + "「": 4518, + "」": 4519, + "『": 4520, + "』": 4521, + "ア": 4522, + "オ": 4523, + "カ": 4524, + "チ": 4525, + "ド": 4526, + "ベ": 4527, + "ャ": 4528, + "ヤ": 4529, + "ン": 4530, + "・": 4531, + "ー": 4532, + "ㄟ": 4533, + "!": 4534, + "(": 4535, + ")": 4536, + ",": 4537, + "-": 4538, + "/": 4539, + ":": 4540, + ";": 4541, + "?": 4542, + "p": 4543, + "i4": 4544, + "zh": 4545, + "i2": 4546, + "ng1": 4547, + "u4": 4548, + "i1": 4549, + "ng2": 4550, + "u3": 4551, + "de5": 4552, + "e4": 4553, + "i3": 
4554, + "ng4": 4555, + "an4": 4556, + "shi4": 4557, + "an2": 4558, + "u2": 4559, + "u1": 4560, + "ng3": 4561, + "a1": 4562, + "an1": 4563, + "e2": 4564, + "a4": 4565, + "ei4": 4566, + "ong1": 4567, + "ai4": 4568, + "ao4": 4569, + "ang1": 4570, + "an3": 4571, + "wei4": 4572, + "uo2": 4573, + "n1": 4574, + "en2": 4575, + "ao3": 4576, + "e1": 4577, + "qi": 4578, + "eng2": 4579, + "zho": 4580, + "ang3": 4581, + "ang4": 4582, + "ang2": 4583, + "uo4": 4584, + "ge4": 4585, + "yi1": 4586, + "guo2": 4587, + "a3": 4588, + "he2": 4589, + "e3": 4590, + "yi2": 4591, + "di4": 4592, + "zhong1": 4593, + "bu4": 4594, + "ai2": 4595, + "n2": 4596, + "zai4": 4597, + "shi2": 4598, + "eng1": 4599, + "ren2": 4600, + "ong2": 4601, + "xian4": 4602, + "xu": 4603, + "n4": 4604, + "li4": 4605, + "en4": 4606, + "yu2": 4607, + "ei2": 4608, + "yi2ge4": 4609, + "ou4": 4610, + "ei3": 4611, + "ui4": 4612, + "a2": 4613, + "you3": 4614, + "ao1": 4615, + "da4": 4616, + "cheng2": 4617, + "en1": 4618, + "eng4": 4619, + "yi4": 4620, + "si1": 4621, + "zhi4": 4622, + "jia1": 4623, + "yuan2": 4624, + "ta1": 4625, + "de5yi2ge4": 4626, + "ke1": 4627, + "shu3": 4628, + "xi1": 4629, + "ji2": 4630, + "ao2": 4631, + "ou3": 4632, + "ong4": 4633, + "xia4": 4634, + "ai1": 4635, + "gong1": 4636, + "zhi1": 4637, + "en3": 4638, + "wei2": 4639, + "xue2": 4640, + "qu1": 4641, + "zhou1": 4642, + "er3": 4643, + "ming2": 4644, + "zhong3": 4645, + "li3": 4646, + "wu4": 4647, + "yi3": 4648, + "uo1": 4649, + "e5": 4650, + "ji4": 4651, + "xing2": 4652, + "jian4": 4653, + "hua4": 4654, + "yu3": 4655, + "uo3": 4656, + "ji1": 4657, + "ai3": 4658, + "zuo4": 4659, + "hou4": 4660, + "hui4": 4661, + "ei1": 4662, + "nian2": 4663, + "qi2": 4664, + "dao4": 4665, + "sheng1": 4666, + "de2": 4667, + "dai4": 4668, + "uan2": 4669, + "zhe4": 4670, + "zheng4": 4671, + "ben3": 4672, + "shang4": 4673, + "zhu3": 4674, + "bei4": 4675, + "ye4": 4676, + "chu1": 4677, + "zhan4": 4678, + "le5": 4679, + "lai2": 4680, + "shi3": 4681, + "nan2": 4682, + "ren4": 4683, + "you2": 4684, + "ke4": 4685, + "ba1": 4686, + "fu4": 4687, + "dui4": 4688, + "ya4": 4689, + "mei3": 4690, + "zi4": 4691, + "xin1": 4692, + "jing1": 4693, + "zhu": 4694, + "n3": 4695, + "yong4": 4696, + "mu4": 4697, + "jiao4": 4698, + "ye3": 4699, + "jin4": 4700, + "bian4": 4701, + "lu4": 4702, + "qi1": 4703, + "she4": 4704, + "xiang1": 4705, + "ong3": 4706, + "shu4": 4707, + "dong4": 4708, + "suo3": 4709, + "guan1": 4710, + "san1": 4711, + "te4": 4712, + "duo1": 4713, + "fu2": 4714, + "min2": 4715, + "la1": 4716, + "zhi2": 4717, + "zhen4": 4718, + "ou1": 4719, + "wu3": 4720, + "ma3": 4721, + "i5": 4722, + "zi5": 4723, + "ju4": 4724, + "er4": 4725, + "yao4": 4726, + "xia4de5yi2ge4": 4727, + "si4": 4728, + "tu2": 4729, + "shan1": 4730, + "zui4": 4731, + "yin1": 4732, + "er2": 4733, + "tong2": 4734, + "dong1": 4735, + "yu4": 4736, + "yan2": 4737, + "qian2": 4738, + "shu3xia4de5yi2ge4": 4739, + "jun1": 4740, + "ke3": 4741, + "wen2": 4742, + "fa3": 4743, + "luo2": 4744, + "zhu4": 4745, + "xi4": 4746, + "kou3": 4747, + "bei3": 4748, + "jian1": 4749, + "fa1": 4750, + "dian4": 4751, + "jiang1": 4752, + "wei4yu2": 4753, + "xiang4": 4754, + "zhi3": 4755, + "eng3": 4756, + "fang1": 4757, + "lan2": 4758, + "shu": 4759, + "ri4": 4760, + "lian2": 4761, + "shou3": 4762, + "qiu2": 4763, + "jin1": 4764, + "huo4": 4765, + "shu3xia4de5yi2ge4zhong3": 4766, + "fen1": 4767, + "nei4": 4768, + "gai1": 4769, + "mei3guo2": 4770, + "un2": 4771, + "ge2": 4772, + "bao3": 4773, + "qing1": 4774, + "gao1": 4775, + "tai2": 4776, + "xiao3": 4777, + 
"jie2": 4778, + "tian1": 4779, + "chang2": 4780, + "quan2": 4781, + "lie4": 4782, + "hai3": 4783, + "fei1": 4784, + "ti3": 4785, + "jue2": 4786, + "ou2": 4787, + "ci3": 4788, + "zu2": 4789, + "ni2": 4790, + "biao3": 4791, + "zhong1guo2": 4792, + "du4": 4793, + "yue4": 4794, + "xing4": 4795, + "sheng4": 4796, + "che1": 4797, + "dan1": 4798, + "jie1": 4799, + "lin2": 4800, + "ping2": 4801, + "fu3": 4802, + "gu3": 4803, + "jie4": 4804, + "v3": 4805, + "sheng3": 4806, + "na4": 4807, + "yuan4": 4808, + "zhang3": 4809, + "guan3": 4810, + "dao3": 4811, + "zu3": 4812, + "ding4": 4813, + "dian3": 4814, + "ceng2": 4815, + "ren2kou3": 4816, + "tai4": 4817, + "tong1": 4818, + "guo4": 4819, + "neng2": 4820, + "chang3": 4821, + "hua2": 4822, + "liu2": 4823, + "ying1": 4824, + "xiao4": 4825, + "ci4": 4826, + "bian4hua4": 4827, + "liang3": 4828, + "gong4": 4829, + "zhong4": 4830, + "de5yi1": 4831, + "se4": 4832, + "kai1": 4833, + "wang2": 4834, + "jiu4": 4835, + "shi1": 4836, + "shou4": 4837, + "mei2": 4838, + "feng1": 4839, + "ze2": 4840, + "tu2shi4": 4841, + "ti2": 4842, + "qi4": 4843, + "jiu3": 4844, + "shen1": 4845, + "zhe3": 4846, + "ren2kou3bian4hua4": 4847, + "ren2kou3bian4hua4tu2shi4": 4848, + "di4qu1": 4849, + "yang2": 4850, + "men5": 4851, + "long2": 4852, + "bing4": 4853, + "chan3": 4854, + "zhu1": 4855, + "wei3": 4856, + "wai4": 4857, + "xing1": 4858, + "bo1": 4859, + "bi3": 4860, + "tang2": 4861, + "hua1": 4862, + "bo2": 4863, + "shui3": 4864, + "shu1": 4865, + "dou1": 4866, + "sai4": 4867, + "chao2": 4868, + "bi4": 4869, + "ling2": 4870, + "lei4": 4871, + "da4xue2": 4872, + "fen4": 4873, + "shu3de5": 4874, + "mu3": 4875, + "jiao1": 4876, + "dang1": 4877, + "cheng1": 4878, + "tong3": 4879, + "nv3": 4880, + "qi3": 4881, + "yan3": 4882, + "mian4": 4883, + "luo4": 4884, + "jing4": 4885, + "ge1": 4886, + "ru4": 4887, + "dan4": 4888, + "ri4ben3": 4889, + "pu3": 4890, + "yun4": 4891, + "huang2": 4892, + "wo3": 4893, + "lv": 4894, + "hai2": 4895, + "shi4yi1": 4896, + "xie1": 4897, + "ying3": 4898, + "wu2": 4899, + "shen2": 4900, + "wang3": 4901, + "guang3": 4902, + "liu4": 4903, + "su4": 4904, + "shi4zhen4": 4905, + "can1": 4906, + "cao3": 4907, + "xia2": 4908, + "ka3": 4909, + "da2": 4910, + "hu4": 4911, + "ban4": 4912, + "dang3": 4913, + "hu2": 4914, + "zong3": 4915, + "deng3": 4916, + "de5yi2ge4shi4zhen4": 4917, + "chuan2": 4918, + "mo4": 4919, + "zhang1": 4920, + "ban1": 4921, + "mo2": 4922, + "cha2": 4923, + "ce4": 4924, + "zhu3yao4": 4925, + "tou2": 4926, + "ju2": 4927, + "shi4wei4yu2": 4928, + "sa4": 4929, + "un1": 4930, + "ke3yi3": 4931, + "du1": 4932, + "han4": 4933, + "liang4": 4934, + "sha1": 4935, + "jia3": 4936, + "zi1": 4937, + "lv4": 4938, + "fu1": 4939, + "xian1": 4940, + "xu4": 4941, + "guang1": 4942, + "meng2": 4943, + "bao4": 4944, + "you4": 4945, + "rong2": 4946, + "zhi1yi1": 4947, + "wei1": 4948, + "mao2": 4949, + "guo2jia1": 4950, + "cong2": 4951, + "gou4": 4952, + "tie3": 4953, + "zhen1": 4954, + "du2": 4955, + "bian1": 4956, + "ci2": 4957, + "qu3": 4958, + "fan4": 4959, + "xiang3": 4960, + "men2": 4961, + "ju1": 4962, + "hong2": 4963, + "zi3": 4964, + "ta1men5": 4965, + "ji3": 4966, + "zong1": 4967, + "zhou1de5yi2ge4shi4zhen4": 4968, + "tuan2": 4969, + "jing3": 4970, + "gong1si1": 4971, + "xie4": 4972, + "li2": 4973, + "li4shi3": 4974, + "bao1": 4975, + "gang3": 4976, + "gui1": 4977, + "zheng1": 4978, + "zhi2wu4": 4979, + "ta1de5": 4980, + "pin3": 4981, + "zhuan1": 4982, + "chong2": 4983, + "shi3yong4": 4984, + "wa3": 4985, + "shuo1": 4986, + "chuan1": 4987, + "lei2": 4988, + 
"wan1": 4989, + "huo2": 4990, + "su1": 4991, + "zao3": 4992, + "gai3": 4993, + "qu4": 4994, + "gu4": 4995, + "xi2": 4996, + "hang2": 4997, + "ying4": 4998, + "cun1": 4999, + "gen1": 5000, + "ying2": 5001, + "ting2": 5002, + "cheng2shi4": 5003, + "jiang3": 5004, + "ling3": 5005, + "lun2": 5006, + "bu4fen4": 5007, + "deng1": 5008, + "xuan3": 5009, + "dong4wu4": 5010, + "de2guo2": 5011, + "xian3": 5012, + "fan3": 5013, + "zhe5": 5014, + "han2": 5015, + "hao4": 5016, + "mi4": 5017, + "ran2": 5018, + "qin1": 5019, + "tiao2": 5020, + "zhan3": 5021, + "[ar]": 5022, + "[zh-cn]": 5023, + "shi": 5026, + "tsu": 5027, + "teki": 5028, + "nai": 5029, + "aru": 5030, + "uu": 5031, + "kai": 5032, + "shite": 5033, + "mono": 5034, + "koto": 5035, + "kara": 5036, + "shita": 5037, + "suru": 5038, + "masu": 5039, + "tai": 5040, + "ware": 5041, + "shin": 5042, + "oku": 5043, + "yuu": 5044, + "iru": 5045, + "jiko": 5046, + "desu": 5047, + "rare": 5048, + "shou": 5049, + "sha": 5050, + "sekai": 5051, + "kyou": 5052, + "mashita": 5053, + "nara": 5054, + "kei": 5055, + "ita": 5056, + "ari": 5057, + "itsu": 5058, + "kono": 5059, + "naka": 5060, + "chou": 5061, + "sore": 5062, + "naru": 5063, + "gaku": 5064, + "reba": 5065, + "hito": 5066, + "sai": 5067, + "nan": 5068, + "dai": 5069, + "tsuku": 5070, + "shiki": 5071, + "sare": 5072, + "naku": 5073, + "jun": 5074, + "kaku": 5075, + "zai": 5076, + "wata": 5077, + "shuu": 5078, + "ii": 5079, + "kare": 5080, + "shii": 5081, + "made": 5082, + "sho": 5083, + "kereba": 5084, + "shika": 5085, + "ichi": 5086, + "deki": 5087, + "nin": 5088, + "wareware": 5089, + "nakereba": 5090, + "oite": 5091, + "yaku": 5092, + "mujun": 5093, + "yoku": 5094, + "butsu": 5095, + "omo": 5096, + "gae": 5097, + "naranai": 5098, + "tachi": 5099, + "chuu": 5100, + "kangae": 5101, + "toki": 5102, + "koro": 5103, + "mujunteki": 5104, + "naga": 5105, + "jin": 5106, + "shima": 5107, + "iku": 5108, + "imasu": 5109, + "hon": 5110, + "kae": 5111, + "kore": 5112, + "kita": 5113, + "datta": 5114, + "jitsu": 5115, + "mae": 5116, + "toku": 5117, + "douitsu": 5118, + "ritsu": 5119, + "kyuu": 5120, + "hyou": 5121, + "rareta": 5122, + "keisei": 5123, + "kkan": 5124, + "rareru": 5125, + "mou": 5126, + "doko": 5127, + "ryou": 5128, + "dake": 5129, + "nakatta": 5130, + "soko": 5131, + "tabe": 5132, + "hana": 5133, + "fuku": 5134, + "yasu": 5135, + "wataku": 5136, + "yama": 5137, + "kyo": 5138, + "genzai": 5139, + "boku": 5140, + "ata": 5141, + "kawa": 5142, + "masen": 5143, + "juu": 5144, + "natte": 5145, + "watakushi": 5146, + "yotte": 5147, + "hai": 5148, + "jishin": 5149, + "rete": 5150, + "oka": 5151, + "kagaku": 5152, + "natta": 5153, + "karu": 5154, + "nari": 5155, + "mata": 5156, + "kuru": 5157, + "gai": 5158, + "kari": 5159, + "shakai": 5160, + "koui": 5161, + "yori": 5162, + "setsu": 5163, + "reru": 5164, + "tokoro": 5165, + "jutsu": 5166, + "saku": 5167, + "ttai": 5168, + "ningen": 5169, + "tame": 5170, + "kankyou": 5171, + "ooku": 5172, + "watashi": 5173, + "tsukuru": 5174, + "sugi": 5175, + "jibun": 5176, + "shitsu": 5177, + "keru": 5178, + "kishi": 5179, + "shikashi": 5180, + "moto": 5181, + "mari": 5182, + "itte": 5183, + "deshita": 5184, + "nde": 5185, + "arimasu": 5186, + "koe": 5187, + "zettai": 5188, + "kkanteki": 5189, + "rekishi": 5190, + "dekiru": 5191, + "tsuka": 5192, + "itta": 5193, + "kobutsu": 5194, + "miru": 5195, + "shoku": 5196, + "shimasu": 5197, + "gijutsu": 5198, + "gyou": 5199, + "joushiki": 5200, + "atta": 5201, + "hodo": 5202, + "koko": 5203, + "tsukurareta": 5204, + "zoku": 5205, 
+ "hitei": 5206, + "koku": 5207, + "rekishiteki": 5208, + "kete": 5209, + "kako": 5210, + "nagara": 5211, + "kakaru": 5212, + "shutai": 5213, + "haji": 5214, + "taku": 5215, + "douitsuteki": 5216, + "mete": 5217, + "tsuu": 5218, + "sarete": 5219, + "genjitsu": 5220, + "bai": 5221, + "nawa": 5222, + "jikan": 5223, + "waru": 5224, + "rt": 5225, + "atsu": 5226, + "soku": 5227, + "kouiteki": 5228, + "kata": 5229, + "tetsu": 5230, + "gawa": 5231, + "kedo": 5232, + "reta": 5233, + "sayou": 5234, + "tteru": 5235, + "tori": 5236, + "kimi": 5237, + "mura": 5238, + "sareru": 5239, + "machi": 5240, + "kya": 5241, + "osa": 5242, + "konna": 5243, + "aku": 5244, + "sareta": 5245, + "ipp": 5246, + "shiku": 5247, + "uchi": 5248, + "hitotsu": 5249, + "hatara": 5250, + "tachiba": 5251, + "shiro": 5252, + "katachi": 5253, + "tomo": 5254, + "ete": 5255, + "meru": 5256, + "nichi": 5257, + "dare": 5258, + "katta": 5259, + "eru": 5260, + "suki": 5261, + "ooki": 5262, + "maru": 5263, + "moku": 5264, + "oko": 5265, + "kangaerareru": 5266, + "oto": 5267, + "tanni": 5268, + "tada": 5269, + "taiteki": 5270, + "motte": 5271, + "kinou": 5272, + "shinai": 5273, + "kki": 5274, + "tari": 5275, + "ranai": 5276, + "kkou": 5277, + "mirai": 5278, + "ppon": 5279, + "goto": 5280, + "hitsu": 5281, + "teru": 5282, + "mochi": 5283, + "katsu": 5284, + "nyuu": 5285, + "zuka": 5286, + "tsuite": 5287, + "nomi": 5288, + "sugu": 5289, + "kuda": 5290, + "tetsugaku": 5291, + "ika": 5292, + "ronri": 5293, + "oki": 5294, + "nippon": 5295, + "shimashita": 5296, + "chishiki": 5297, + "chokkanteki": 5298, + "suko": 5299, + "kuu": 5300, + "arou": 5301, + "katte": 5302, + "kuri": 5303, + "inai": 5304, + "hyougen": 5305, + "ishiki": 5306, + "doku": 5307, + "atte": 5308, + "atara": 5309, + "wari": 5310, + "kao": 5311, + "seisan": 5312, + "hanashi": 5313, + "kake": 5314, + "naji": 5315, + "sunawa": 5316, + "sunawachi": 5317, + "ugo": 5318, + "suu": 5319, + "bara": 5320, + "hiro": 5321, + "iwa": 5322, + "betsu": 5323, + "yoi": 5324, + "seru": 5325, + "shiteru": 5326, + "rarete": 5327, + "toshi": 5328, + "seki": 5329, + "tairitsu": 5330, + "wakara": 5331, + "tokyo": 5332, + "kka": 5333, + "kyoku": 5334, + "iro": 5335, + "mite": 5336, + "saki": 5337, + "kanji": 5338, + "mita": 5339, + "sube": 5340, + "ryoku": 5341, + "matta": 5342, + "kudasai": 5343, + "omoi": 5344, + "wareru": 5345, + "hitsuyou": 5346, + "kashi": 5347, + "renai": 5348, + "kankei": 5349, + "gatte": 5350, + "ochi": 5351, + "motsu": 5352, + "sonzai": 5353, + "taishite": 5354, + "ame": 5355, + "seimei": 5356, + "kano": 5357, + "giri": 5358, + "kangaeru": 5359, + "yue": 5360, + "asa": 5361, + "onaji": 5362, + "yoru": 5363, + "niku": 5364, + "osaka": 5365, + "sukoshi": 5366, + "tama": 5367, + "kanojo": 5368, + "kite": 5369, + "mondai": 5370, + "amari": 5371, + "eki": 5372, + "kojin": 5373, + "haya": 5374, + "dete": 5375, + "atarashii": 5376, + "awa": 5377, + "gakkou": 5378, + "tsuzu": 5379, + "shukan": 5380, + "imashita": 5381, + "atae": 5382, + "darou": 5383, + "hataraku": 5384, + "gata": 5385, + "dachi": 5386, + "matsu": 5387, + "arimasen": 5388, + "seibutsu": 5389, + "mitsu": 5390, + "heya": 5391, + "yasui": 5392, + "deni": 5393, + "noko": 5394, + "haha": 5395, + "domo": 5396, + "kami": 5397, + "sudeni": 5398, + "nao": 5399, + "raku": 5400, + "ike": 5401, + "meta": 5402, + "kodomo": 5403, + "soshite": 5404, + "game": 5405, + "bakari": 5406, + "tote": 5407, + "hatsu": 5408, + "mise": 5409, + "mokuteki": 5410, + "dakara": 5411, + "[ja]": 5412 + }, + "merges": [ + "t h", + "i n", + "th e", 
+ "a n", + "e r", + "o u", + "r e", + "o n", + "a t", + "e d", + "e n", + "t o", + "in g", + "an d", + "i s", + "a s", + "a l", + "o r", + "o f", + "a r", + "i t", + "e s", + "h e", + "s t", + "l e", + "o m", + "s e", + "b e", + "a d", + "o w", + "l y", + "c h", + "w h", + "th at", + "y ou", + "l i", + "v e", + "a c", + "t i", + "l d", + "m e", + "w as", + "g h", + "i d", + "l l", + "w i", + "en t", + "f or", + "a y", + "r o", + "v er", + "i c", + "h er", + "k e", + "h is", + "n o", + "u t", + "u n", + "i r", + "l o", + "w e", + "r i", + "h a", + "wi th", + "gh t", + "ou t", + "i m", + "i on", + "al l", + "a b", + "on e", + "n e", + "g e", + "ou ld", + "t er", + "m o", + "h ad", + "c e", + "s he", + "g o", + "s h", + "u r", + "a m", + "s o", + "p e", + "m y", + "d e", + "a re", + "b ut", + "om e", + "f r", + "the r", + "f e", + "s u", + "d o", + "c on", + "t e", + "a in", + "er e", + "p o", + "i f", + "the y", + "u s", + "a g", + "t r", + "n ow", + "ou n", + "th is", + "ha ve", + "no t", + "s a", + "i l", + "u p", + "th ing", + "fr om", + "a p", + "h im", + "ac k", + "at ion", + "an t", + "ou r", + "o p", + "li ke", + "u st", + "es s", + "b o", + "o k", + "u l", + "in d", + "e x", + "c om", + "s ome", + "the re", + "er s", + "c o", + "re s", + "m an", + "ar d", + "p l", + "w or", + "w ay", + "ti on", + "f o", + "c a", + "w ere", + "b y", + "at e", + "p ro", + "t ed", + "oun d", + "ow n", + "w ould", + "t s", + "wh at", + "q u", + "al ly", + "i ght", + "c k", + "g r", + "wh en", + "v en", + "c an", + "ou gh", + "in e", + "en d", + "p er", + "ou s", + "o d", + "id e", + "k now", + "t y", + "ver y", + "s i", + "a k", + "wh o", + "ab out", + "i ll", + "the m", + "es t", + "re d", + "y e", + "c ould", + "on g", + "you r", + "the ir", + "e m", + "j ust", + "o ther", + "in to", + "an y", + "wh i", + "u m", + "t w", + "as t", + "d er", + "d id", + "i e", + "be en", + "ac e", + "in k", + "it y", + "b ack", + "t ing", + "b r", + "mo re", + "a ke", + "p p", + "the n", + "s p", + "e l", + "u se", + "b l", + "sa id", + "o ver", + "ge t", + "e n", + "e r", + "c h", + "e i", + "i e", + "u n", + "i ch", + "ei n", + "s t", + "a n", + "t e", + "g e", + "a u", + "i n", + "s ch", + "d er", + "un d", + "d ie", + "d a", + "e s", + "a l", + "d en", + "a r", + "g en", + "z u", + "d e", + "h r", + "o n", + "t en", + "e l", + "o r", + "m i", + "s ie", + "da s", + "a t", + "b e", + "ein e", + "ich t", + "b er", + "l e", + "a ch", + "v er", + "s e", + "au f", + "w i", + "s o", + "t er", + "l ich", + "c k", + "u r", + "n icht", + "m m", + "b en", + "a s", + "w ar", + "r e", + "mi t", + "s ich", + "i g", + "l l", + "au s", + "i st", + "w ie", + "o ch", + "un g", + "an n", + "ü r", + "h n", + "i hr", + "s a", + "s en", + "t z", + "de m", + "ei t", + "u m", + "h at", + "wi r", + "v on", + "h a", + "s p", + "w ei", + "i er", + "r o", + "h er", + "r a", + "ein en", + "n e", + "v or", + "al s", + "an d", + "al l", + "w as", + "w o", + "r ei", + "st e", + "l ie", + "au ch", + "d u", + "d es", + "k o", + "ü ber", + "a m", + "b ei", + "h en", + "h m", + "l ei", + "a ber", + "w en", + "h l", + "g er", + "i m", + "u t", + "n ach", + "h e", + "i s", + "b r", + "f t", + "en t", + "i mm", + "j e", + "sch en", + "w er", + "s er", + "a b", + "ä n", + "m e", + "s ein", + "i t", + "o l", + "ch t", + "f ür", + "k l", + "f f", + "eine m", + "n en", + "w e", + "j a", + "u s", + "n och", + "hat te", + "t r", + "p f", + "h in", + "d i", + "ch en", + "b l", + "m an", + "r ü", + "ie l", + "s el", + "das s", + "i hn", + "mi r", + "sch l", + "ö 
n", + "g an", + "g t", + "ein er", + "st en", + "m ich", + "wen n", + "el l", + "g te", + "in d", + "m al", + "ge l", + "k en", + "n ur", + "mm en", + "f ü", + "er n", + "ö r", + "un ter", + "f r", + "an der", + "g r", + "i l", + "d ur", + "u ch", + "f e", + "t a", + "m en", + "m ach", + "d och", + "t i", + "dur ch", + "o s", + "g l", + "h al", + "ihr e", + "w ä", + "imm er", + "i hm", + "k ann", + "or t", + "d ann", + "l an", + "tz t", + "o der", + "hr en", + "e t", + "k ön", + "i ck", + "f a", + "in g", + "i r", + "wie der", + "da ß", + "m ein", + "f en", + "gan z", + "die se", + "st er", + "da r", + "w a", + "ge s", + "n a", + "f l", + "i gen", + "sch e", + "un gen", + "me hr", + "ß en", + "o t", + "k on", + "ge w", + "ha ben", + "ge h", + "ä t", + "s ind", + "d r", + "w el", + "un s", + "v o", + "m a", + "u te", + "sch on", + "b es", + "ge sch", + "b t", + "ch e", + "s on", + "o b", + "l a", + "p p", + "rü ck", + "s eine", + "k r", + "f re", + "ei l", + "zu m", + "u l", + "h ier", + "k t", + "i ge", + "sp r", + "k e", + "le ben", + "b st", + "z eit", + "i on", + "g ro", + "den n", + "h o", + "sch a", + "b ar", + "al le", + "ge gen", + "w ür", + "m ü", + "z e", + "wer den", + "je tzt", + "ko mmen", + "n ie", + "s ei", + "h eit", + "so ll", + "g lei", + "m eine", + "wo ll", + "n er", + "ha be", + "w ur", + "lich en", + "p er", + "as sen", + "n te", + "se hen", + "wir d", + "b is", + "g ar", + "i en", + "m us", + "u ß", + "ä r", + "st ell", + "k eit", + "z wei", + "sel bst", + "st a", + "p a", + "sa gte", + "te t", + "k am", + "s sen", + "v iel", + "u g", + "z en", + "h ei", + "m ann", + "wi ll", + "ge b", + "war en", + "ü ck", + "ä ch", + "m er", + "r u", + "w or", + "h au", + "ei gen", + "an g", + "we g", + "bl ick", + "f ra", + "all es", + "k a", + "au gen", + "f in", + "lich e", + "t o", + "un ser", + "der n", + "her r", + "n un", + "v ie", + "ch te", + "wo hl", + "f all", + "h t", + "ü n", + "et was", + "st and", + "en d", + "ä u", + "e m", + "m ö", + "te l", + "r ie", + "d ich", + "die s", + "h and", + "b in", + "ff en", + "nicht s", + "d an", + "p l", + "hn e", + "ihn en", + "es en", + "die ser", + "fr au", + "an t", + "ar t", + "di r", + "i sch", + "er st", + "glei ch", + "ko mm", + "h ör", + "ß e", + "d ig", + "se hr", + "z ei", + "sa m", + "au m", + "h ät", + "in gen", + "g ut", + "b o", + "m ut", + "ck en", + "kon nte", + "st imm", + "p ro", + "zu r", + "i tz", + "wei l", + "wür de", + "f ä", + "kön nen", + "k eine", + "f er", + "i schen", + "vo ll", + "ein es", + "se tz", + "z ie", + "de l", + "te te", + "sein er", + "ier en", + "ge st", + "zu rück", + "wur de", + "sch n", + "p r", + "lie ß", + "t ra", + "m ä", + "gen d", + "f ol", + "i k", + "schl a", + "scha ft", + "at er", + "wei ß", + "s einen", + "l assen", + "l u", + "und en", + "t eil", + "ne u", + "ier t", + "men schen", + "hm en", + "st r", + "g i", + "sa h", + "ihr en", + "el n", + "wei ter", + "ge hen", + "ig er", + "mach t", + "ta g", + "al so", + "hal ten", + "n is", + "ach t", + "ge ben", + "f or", + "o g", + "n at", + "m ar", + "de t", + "o hne", + "h aus", + "t ro", + "an ge", + "l au", + "sp iel", + "t re", + "sch r", + "in n", + "s u", + "l os", + "mach en", + "hät te", + "be g", + "wir k", + "al t", + "g lich", + "te s", + "r icht", + "fre und", + "m o", + "ihr er", + "f el", + "b el", + "so l", + "ein mal", + "e ben", + "h ol", + "h än", + "q u", + "ter n", + "h ö", + "sch w", + "re cht", + "wa hr", + "s einem", + "ste hen", + "hl en", + "in s", + "g ing", + "woll te", + "wi ssen", + "ung s", + "al d", + 
"as s", + "ja hr", + "m or", + "wel t", + "un der", + "zu sa", + "at ion", + "ko pf", + "lan g", + "hin ter", + "at z", + "st ra", + "an gen", + "an k", + "a de", + "gl au", + "f ach", + "hat ten", + "l o", + "f ort", + "ei cht", + "i ff", + "l er", + "m ei", + "diese m", + "k ein", + "f rei", + "fü hr", + "vo m", + "e s", + "e n", + "a i", + "o u", + "o n", + "l e", + "d e", + "r e", + "q u", + "a n", + "e r", + "en t", + "e t", + "l a", + "n e", + "i l", + "a r", + "i s", + "ai t", + "t e", + "a u", + "i n", + "qu e", + "i t", + "u r", + "s e", + "l es", + "c h", + "c e", + "m e", + "o r", + "ou r", + "a s", + "p r", + "a v", + "o m", + "ai s", + "u n", + "an t", + "ou s", + "t r", + "t i", + "l u", + "o i", + "e u", + "l le", + "s i", + "p ar", + "d es", + "an s", + "m ent", + "é t", + "es t", + "j e", + "u ne", + "a l", + "p as", + "t re", + "qu i", + "d u", + "r i", + "c on", + "s on", + "c om", + "e lle", + "d é", + "p our", + "d ans", + "l i", + "s a", + "r é", + "t ou", + "v ous", + "d i", + "v i", + "a g", + "a m", + "a t", + "ou v", + "a p", + "ti on", + "m on", + "s ur", + "c i", + "o s", + "p lu", + "s u", + "en d", + "a b", + "è re", + "ai n", + "m ais", + "o is", + "r es", + "plu s", + "é e", + "ai ent", + "m p", + "ch e", + "lu i", + "av e", + "ét ait", + "m a", + "s es", + "tou t", + "i r", + "v o", + "a c", + "s er", + "an d", + "f f", + "oi r", + "g r", + "av ait", + "é s", + "m es", + "n ous", + "eu x", + "b i", + "t er", + "c o", + "on s", + "p u", + "c es", + "g e", + "t u", + "le ur", + "pr o", + "d on", + "e ur", + "et te", + "ai re", + "ave c", + "d it", + "t é", + "i e", + "u s", + "il le", + "p er", + "com me", + "c r", + "or t", + "m i", + "e x", + "u x", + "v er", + "m o", + "è s", + "v e", + "au x", + "r a", + "j our", + "il s", + "bi en", + "c ou", + "p e", + "que l", + "p eu", + "c ette", + "t es", + "p o", + "in s", + "c u", + "m ê", + "s o", + "f ait", + "g u", + "m ar", + "ê tre", + "l o", + "it é", + "f r", + "a tion", + "en s", + "b r", + "n i", + "l é", + "d is", + "b le", + "m an", + "n é", + "pu is", + "mê me", + "qu es", + "f i", + "e l", + "ag e", + "g ar", + "m oi", + "en ce", + "on t", + "m ain", + "or s", + "au t", + "an ce", + "v en", + "m é", + "s ans", + "e m", + "s é", + "l on", + "h om", + "r o", + "u t", + "c ar", + "ab le", + "i m", + "de r", + "ch er", + "n o", + "vi e", + "au s", + "b e", + "de ux", + "en f", + "o ù", + "t en", + "p h", + "u re", + "te mp", + "p os", + "r ent", + "p é", + "f aire", + "p i", + "tr es", + "ç a", + "an g", + "end re", + "f or", + "p a", + "b on", + "s ou", + "in t", + "pr é", + "s ent", + "t ant", + "n er", + "c er", + "l à", + "l ais", + "pr ès", + "b re", + "c our", + "p et", + "i on", + "i ne", + "com p", + "l ait", + "tr ouv", + "t a", + "ent re", + "son t", + "de v", + "n u", + "temp s", + "d ou", + "r ait", + "b ou", + "qu and", + "jour s", + "l an", + "er s", + "av oir", + "ét é", + "a le", + "p re", + "f ois", + "or te", + "v é", + "m er", + "n on", + "t ous", + "j us", + "cou p", + "t s", + "hom me", + "ê te", + "a d", + "aus si", + "ur s", + "se u", + "or d", + "o b", + "m in", + "g é", + "co re", + "v a", + "v re", + "en core", + "se m", + "i te", + "au tre", + "pr is", + "peu t", + "u e", + "an te", + "m al", + "g n", + "ré p", + "h u", + "si on", + "vo tre", + "di re", + "e z", + "f em", + "leur s", + "m et", + "f in", + "c ri", + "m is", + "t our", + "r ai", + "j am", + "re gar", + "ri en", + "ver s", + "su is", + "p ouv", + "o p", + "v is", + "gr and", + "ant s", + "c or", + "re r", + "ar d", 
+ "c é", + "t ent", + "pr es", + "v ou", + "f a", + "al ors", + "si eur", + "ai ne", + "le r", + "qu oi", + "f on", + "end ant", + "ar ri", + "eu re", + "a près", + "don c", + "it u", + "l è", + "s ait", + "t oi", + "ch a", + "ai l", + "as se", + "i mp", + "vo y", + "con n", + "p la", + "pet it", + "av ant", + "n om", + "t in", + "don t", + "d a", + "s ous", + "e mp", + "per son", + "el les", + "be au", + "par ti", + "ch o", + "pr it", + "tou jours", + "m en", + "r ais", + "jam ais", + "tr av", + "tion s", + "tr ès", + "v oi", + "r en", + "y eux", + "f er", + "v oir", + "pre mi", + "c a", + "g ne", + "h eure", + "r ou", + "e ff", + "no tre", + "ment s", + "t on", + "f ais", + "ce la", + "i er", + "rép on", + "con s", + "ai r", + "ô t", + "p endant", + "i ci", + "tou te", + "j et", + "p ort", + "ét aient", + "p en", + "h é", + "au tres", + "p ère", + "o c", + "quel ques", + "i que", + "l is", + "fem me", + "j ou", + "te ur", + "mon de", + "u se", + "n es", + "d re", + "a ff", + "r ap", + "par t", + "le ment", + "c la", + "f ut", + "quel que", + "pr endre", + "r ê", + "ai lle", + "s ais", + "ch es", + "le t", + "ch ar", + "è res", + "ent s", + "b er", + "g er", + "mo ins", + "e au", + "a î", + "j eu", + "h eur", + "é es", + "tr i", + "po int", + "m om", + "v ent", + "n ouv", + "gr an", + "tr ois", + "s ant", + "tout es", + "con tre", + "è rent", + "che z", + "ave z", + "û t", + "a lle", + "at t", + "p au", + "p orte", + "ouv er", + "b ar", + "l it", + "f ort", + "o t", + "as s", + "pr és", + "cho se", + "v it", + "mon sieur", + "h ab", + "t ête", + "j u", + "te ment", + "c tion", + "v rai", + "la r", + "c et", + "regar d", + "l ant", + "de m", + "s om", + "mom ent", + "il les", + "p le", + "p s", + "b es", + "m ère", + "c l", + "s our", + "y s", + "tr op", + "en ne", + "jus qu", + "av aient", + "av ais", + "jeu ne", + "de puis", + "person ne", + "f it", + "cer t", + "j o", + "g es", + "ou i", + "r est", + "sem b", + "c ap", + "m at", + "m u", + "lon g", + "fr an", + "f aut", + "it i", + "b li", + "che v", + "pr i", + "ent e", + "ain si", + "ch am", + "l ors", + "c as", + "d o", + "il i", + "b é", + "n os", + "an ge", + "su i", + "r it", + "cr o", + "gu e", + "d e", + "e n", + "e s", + "o s", + "l a", + "e r", + "q u", + "a r", + "a n", + "o n", + "qu e", + "a s", + "o r", + "e l", + "d o", + "a l", + "c i", + "u n", + "r e", + "a b", + "i n", + "t e", + "t o", + "s e", + "d i", + "t r", + "d a", + "c on", + "t a", + "s u", + "m i", + "c o", + "t i", + "l e", + "l os", + "n o", + "l o", + "í a", + "c u", + "c a", + "s i", + "v i", + "m e", + "p or", + "m o", + "p ar", + "r a", + "r i", + "la s", + "c h", + "r o", + "m a", + "p er", + "ó n", + "m en", + "de s", + "un a", + "m p", + "s o", + "ab a", + "p u", + "d os", + "t u", + "g u", + "er a", + "de l", + "h a", + "m u", + "l i", + "en t", + "m b", + "h ab", + "es t", + "g o", + "p a", + "r es", + "par a", + "p o", + "á s", + "m os", + "tr a", + "t en", + "an do", + "p i", + "qu i", + "b i", + "m an", + "co mo", + "v e", + "m ás", + "j o", + "ci ón", + "i s", + "t an", + "v o", + "da d", + "c e", + "a do", + "v er", + "f u", + "ci a", + "c er", + "p e", + "c as", + "c ar", + "men te", + "n i", + "su s", + "t ar", + "n a", + "f i", + "t er", + "z a", + "p ro", + "tr o", + "s a", + "l u", + "b a", + "per o", + "s er", + "c es", + "d as", + "d u", + "s in", + "e mp", + "m ar", + "l la", + "e x", + "á n", + "c or", + "i a", + "v a", + "r an", + "ch o", + "g a", + "y o", + "t os", + "c os", + "mi s", + "l es", + "t es", + "v en", + "h o", + "y a", 
+ "en te", + "on es", + "hab ía", + "n u", + "u s", + "p as", + "h i", + "n os", + "es ta", + "la n", + "m as", + "t or", + "l le", + "h e", + "s on", + "b re", + "p re", + "ab an", + "d or", + "í an", + "i r", + "t as", + "é n", + "r u", + "en do", + "a que", + "er o", + "i o", + "qu é", + "m in", + "c ab", + "j a", + "de r", + "t al", + "é s", + "se ñ", + "or a", + "to do", + "la r", + "d on", + "g ar", + "s al", + "p r", + "cu ando", + "j e", + "h u", + "g un", + "b u", + "g i", + "d ar", + "n e", + "r as", + "de n", + "es to", + "par e", + "p en", + "é l", + "tr as", + "c an", + "b o", + "j os", + "mi en", + "pu e", + "c re", + "co mp", + "p on", + "d ía", + "tr os", + "s ab", + "so bre", + "es e", + "mb re", + "er on", + "a ñ", + "m or", + "f or", + "i do", + "por que", + "el la", + "p ri", + "g ran", + "f a", + "c en", + "di s", + "c ri", + "mu y", + "ch a", + "c al", + "es te", + "h as", + "c ó", + "g ra", + "r os", + "p os", + "o b", + "al l", + "aque l", + "j u", + "p res", + "m er", + "di jo", + "c ía", + "ent re", + "z o", + "ci ones", + "bi en", + "mb i", + "el o", + "t ó", + "in a", + "to dos", + "g en", + "ti en", + "est aba", + "de ci", + "ci o", + "h er", + "ñ o", + "l or", + "nu es", + "me di", + "l en", + "vi da", + "f e", + "al i", + "m on", + "c la", + "d re", + "pu es", + "al es", + "vo l", + "m í", + "r ar", + "b le", + "ci on", + "has ta", + "señ or", + "con o", + "a h", + "di os", + "s en", + "es a", + "ú n", + "v ar", + "s an", + "gu i", + "a c", + "o tros", + "ta do", + "bu en", + "ñ a", + "ti emp", + "ha cer", + "j er", + "f er", + "v u", + "f in", + "an a", + "as í", + "an tes", + "t in", + "ve z", + "mien to", + "j ar", + "la b", + "ch e", + "cas a", + "d r", + "es o", + "e go", + "di ó", + "an te", + "est á", + "m al", + "en cia", + "el i", + "í as", + "tiemp o", + "z ar", + "v an", + "m un", + "er ta", + "ta mbi", + "s í", + "b ar", + "a un", + "al e", + "mis mo", + "ent es", + "vi s", + "man o", + "el e", + "na da", + "se gu", + "me j", + "er ra", + "ab le", + "b e", + "ti r", + "un o", + "don de", + "to da", + "des de", + "r en", + "tambi én", + "cu er", + "per son", + "ho mbre", + "o tro", + "li b", + "tr ar", + "cu al", + "ha y", + "a u", + "ca da", + "t aba", + "i mp", + "men to", + "ten ía", + "qu er", + "er an", + "si emp", + "siemp re", + "er to", + "qu í", + "g os", + "pu és", + "el los", + "des pués", + "nu e", + "g an", + "l lo", + "in ter", + "có mo", + "tr i", + "ah ora", + "us te", + "tr aba", + "la do", + "in o", + "po co", + "er te", + "mu jer", + "i m", + "qui er", + "al gun", + "fu e", + "o jos", + "ent on", + "v os", + "es per", + "mu ch", + "o tra", + "a z", + "a d", + "in g", + "e za", + "a quí", + "ci as", + "gu a", + "mu cho", + "deci r", + "es ti", + "i dad", + "al go", + "e z", + "o cu", + "enton ces", + "di do", + "ent os", + "g ri", + "da do", + "i os", + "so l", + "dos e", + "uste d", + "qui en", + "a mi", + "un to", + "f r", + "mi r", + "mej or", + "b as", + "so lo", + "pre gun", + "tu r", + "al g", + "p la", + "to das", + "par te", + "e mb", + "c to", + "mun do", + "tien e", + "tan te", + "pa lab", + "tr an", + "aque lla", + "ci os", + "aun que", + "a y", + "cu en", + "ten er", + "f un", + "res pon", + "all í", + "x i", + "h an", + "pen s", + "con tra", + "tu ra", + "v al", + "di o", + "tr es", + "t re", + "tan to", + "ca min", + "m ó", + "es p", + "a da", + "í o", + "in s", + "ha cia", + "de j", + "est ar", + "i ón", + "g as", + "b er", + "v as", + "no che", + "é r", + "añ os", + "pa dre", + "gu s", + "á r", + "sin o", + "man 
os", + "ci do", + "es tu", + "a de", + "hu bi", + "vi r", + "b ri", + "ra z", + "ch i", + "pue de", + "men os", + "hab i", + "ho mb", + "ne ces", + "ma y", + "er os", + "r ía", + "he cho", + "es cu", + "l ti", + "án do", + "b us", + "cos as", + "t ú", + "es pa", + "re ci", + "c tor", + "pri m", + "di a", + "de se", + "mien tras", + "h or", + "fu er", + "i da", + "pos i", + "lan te", + "t on", + "an o", + "est as", + "p li", + "ch ar", + "lu ego", + "si ón", + "ci n", + "ti erra", + "m es", + "gu ar", + "ca do", + "en con", + "pr en", + "may or", + "f al", + "e r", + "o n", + "a n", + "t o", + "d i", + "r e", + "l a", + "i n", + "e n", + "a l", + "t a", + "c h", + "e l", + "r i", + "c o", + "t i", + "t e", + "s i", + "r a", + "u n", + "l e", + "l i", + "ch e", + "r o", + "c i", + "c a", + "s e", + "q u", + "m a", + "p o", + "s o", + "i l", + "d o", + "e s", + "v a", + "p er", + "l o", + "c on", + "d el", + "p a", + "m o", + "s a", + "p i", + "d a", + "m i", + "g i", + "s u", + "d e", + "v i", + "z i", + "m e", + "g li", + "n o", + "m en", + "v o", + "t u", + "n on", + "v e", + "t to", + "s t", + "on e", + "an o", + "ch i", + "er a", + "er e", + "f a", + "c e", + "z a", + "un a", + "b i", + "p re", + "s ta", + "o r", + "a r", + "f i", + "on o", + "t ra", + "n a", + "n el", + "n e", + "p ro", + "t ro", + "al e", + "v er", + "n i", + "c u", + "t ti", + "men te", + "del la", + "t er", + "zi one", + "g u", + "p e", + "t ta", + "an do", + "t à", + "al i", + "u o", + "qu el", + "co m", + "s en", + "co me", + "b a", + "al la", + "p ri", + "d u", + "qu es", + "l u", + "on i", + "g gi", + "pa r", + "s si", + "v en", + "in a", + "g a", + "pi ù", + "ci a", + "i m", + "co r", + "m an", + "in o", + "in i", + "t en", + "r an", + "b b", + "g o", + "s to", + "t re", + "a ve", + "a v", + "s ono", + "er i", + "a c", + "s se", + "er o", + "h a", + "s c", + "su l", + "f or", + "v ano", + "po r", + "s ti", + "su o", + "c chi", + "t an", + "z za", + "an che", + "p u", + "i o", + "t te", + "vo l", + "es s", + "s ci", + "co l", + "r u", + "p en", + "f u", + "al l", + "s so", + "s te", + "se m", + "s sa", + "d en", + "a d", + "t ri", + "de i", + "in e", + "ave va", + "men to", + "z z", + "a mo", + "g no", + "f o", + "un o", + "su a", + "g en", + "ri a", + "g e", + "st ra", + "s ì", + "c er", + "ch é", + "b u", + "a p", + "c en", + "d al", + "on a", + "s pe", + "g ni", + "b o", + "t t", + "del le", + "ques to", + "nel la", + "f f", + "d ere", + "an no", + "del l", + "un i", + "bb e", + "an ti", + "g ra", + "s p", + "en e", + "gi o", + "u to", + "qu al", + "gli a", + "qu ando", + "tu tto", + "c an", + "gli o", + "zi oni", + "ca m", + "h o", + "es so", + "s s", + "mo l", + "a t", + "lo ro", + "per ché", + "co sa", + "du e", + "po i", + "ca r", + "s co", + "ci o", + "to r", + "c co", + "c re", + "a m", + "g na", + "te m", + "pri ma", + "lu i", + "co sì", + "qu e", + "gu ar", + "ess ere", + "an i", + "con o", + "b ra", + "al le", + "m on", + "ri o", + "an co", + "cu i", + "s pi", + "vi a", + "g ran", + "gi or", + "a i", + "bi le", + "u l", + "ggi o", + "f e", + "an te", + "ma i", + "ta re", + "in ter", + "in di", + "re bbe", + "sen za", + "so lo", + "zi o", + "e d", + "en te", + "tu tti", + "sta to", + "zi a", + "d alla", + "tu ra", + "mi a", + "vi ta", + "quel la", + "qu a", + "ma r", + "do ve", + "g h", + "al lo", + "sem pre", + "zz o", + "si a", + "mo r", + "do po", + "por ta", + "d re", + "c cia", + "er ano", + "an ni", + "di o", + "chi a", + "en za", + "pro pri", + "qu i", + "m u", + "m b", + "an da", + "c ca", + "o 
cchi", + "ques ta", + "f fi", + "le i", + "par te", + "d on", + "r on", + "mi o", + "tan to", + "ri s", + "o gni", + "di s", + "r in", + "fa r", + "men ti", + "t el", + "anco ra", + "f ra", + "fa tto", + "man i", + "sen ti", + "p ra", + "tem po", + "es si", + "b bi", + "f in", + "a re", + "la re", + "per s", + "f on", + "b el", + "so r", + "d er", + "pre n", + "an za", + "di re", + "pi e", + "o ra", + "ver so", + "se gu", + "al tro", + "ta to", + "ca to", + "a to", + "vol ta", + "c c", + "fa re", + "pa re", + "ci ò", + "li b", + "bi li", + "n uo", + "s er", + "quel lo", + "co lo", + "p po", + "ca sa", + "tro va", + "o re", + "f er", + "r ono", + "d es", + "mol to", + "al mente", + "s ca", + "vo le", + "t ali", + "sul la", + "s ce", + "men o", + "an to", + "p un", + "s tu", + "ca pi", + "so l", + "gi u", + "m ini", + "m ano", + "z e", + "pi a", + "par ti", + "s al", + "la vo", + "ver o", + "r si", + "al tri", + "es ti", + "s cia", + "suo i", + "gli e", + "so tto", + "b ene", + "sc ri", + "t ale", + "de gli", + "n u", + "al c", + "uo mo", + "p el", + "f re", + "po te", + "es sa", + "s cu", + "si gno", + "el e", + "st ro", + "u ti", + "di a", + "si one", + "g re", + "f ini", + "ar ri", + "l un", + "c ri", + "e si", + "pa ssa", + "r à", + "men tre", + "an d", + "h anno", + "el o", + "u sci", + "gi a", + "gi à", + "di e", + "m ina", + "b e", + "ti ca", + "gior no", + "t in", + "es se", + "mo do", + "c al", + "s pa", + "propri o", + "l en", + "o ri", + "con tro", + "st ru", + "di ven", + "di sse", + "ra to", + "no i", + "v ere", + "pu ò", + "di ce", + "s an", + "es a", + "c ci", + "se con", + "re n", + "c cio", + "qual che", + "tu tta", + "g g", + "mon do", + "for ma", + "p li", + "m ma", + "pen sa", + "de va", + "tu r", + "fo sse", + "so pra", + "ta mente", + "n ess", + "qu anto", + "ra ga", + "un que", + "ca re", + "st re", + "gran de", + "pi cco", + "guar da", + "b en", + "nel l", + "a ff", + "po ssi", + "pre sen", + "r ò", + "pa ro", + "tu a", + "v in", + "an e", + "a s", + "ste sso", + "da v", + "ne i", + "nel le", + "gh i", + "pi o", + "ta r", + "an a", + "la to", + "si d", + "f ine", + "f uo", + "m er", + "z o", + "qua si", + "ul ti", + "i to", + "su e", + "si e", + "f il", + "allo ra", + "m in", + "ven i", + "t ano", + "el lo", + "d e", + "r a", + "e s", + "d o", + "e n", + "q u", + "c o", + "a s", + "o s", + "e r", + "a r", + "s e", + "qu e", + "a n", + "i n", + "i s", + "t o", + "ã o", + "t e", + "d a", + "m a", + "e l", + "t a", + "o r", + "i a", + "r e", + "e m", + "a l", + "co m", + "p a", + "o u", + "c a", + "u m", + "r o", + "v a", + "t i", + "s o", + "m en", + "n ão", + "h a", + "co n", + "m e", + "r i", + "pa ra", + "p o", + "d i", + "s a", + "v o", + "u ma", + "c i", + "n a", + "p or", + "n o", + "g u", + "s u", + "h o", + "an do", + "t ra", + "e i", + "v i", + "e u", + "i m", + "do s", + "el e", + "r es", + "m o", + "en t", + "f i", + "l a", + "e ra", + "l e", + "de s", + "el a", + "men te", + "l h", + "p er", + "l i", + "ç ão", + "m as", + "t er", + "m u", + "es t", + "v e", + "g o", + "l o", + "u s", + "ma is", + "v er", + "c ê", + "in ha", + "vo cê", + "f a", + "t u", + "c u", + "p ar", + "com o", + "p ro", + "s i", + "m os", + "e c", + "p re", + "d as", + "ç a", + "es ta", + "s er", + "u n", + "da de", + "d is", + "f o", + "e x", + "c h", + "i r", + "ra n", + "t ar", + "en te", + "g a", + "t r", + "p e", + "t os", + "b o", + "c ia", + "p en", + "c ar", + "s en", + "su a", + "se m", + "c as", + "f or", + "to u", + "n os", + "te m", + "r ia", + "m es", + "se u", + "co r", + 
"o n", + "a o", + "p os", + "ra m", + "v el", + "é m", + "t en", + "po de", + "t es", + "esta va", + "c e", + "b a", + "qu ando", + "m i", + "qu er", + "men to", + "se gu", + "t as", + "is so", + "mu i", + "g ar", + "t ro", + "d u", + "fa z", + "õ es", + "p es", + "an to", + "l u", + "p i", + "i x", + "ve z", + "s im", + "j a", + "p r", + "m in", + "b e", + "ra s", + "m an", + "p res", + "est á", + "c er", + "b re", + "p as", + "d ia", + "m b", + "dis se", + "n i", + "r os", + "es se", + "v ia", + "o lh", + "is a", + "an te", + "ê n", + "z a", + "qu i", + "b i", + "t inha", + "me u", + "s ão", + "m inha", + "a c", + "ri o", + "m ar", + "a t", + "p el", + "mui to", + "ta l", + "to r", + "fo i", + "h or", + "j o", + "b em", + "g i", + "f al", + "vo l", + "po n", + "di z", + "l ar", + "gu n", + "m or", + "r u", + "par ec", + "ç o", + "do r", + "pes so", + "n e", + "f er", + "b er", + "p u", + "po is", + "in a", + "es p", + "d ar", + "en do", + "de n", + "so bre", + "co s", + "p ri", + "al i", + "mes mo", + "ç ões", + "g ra", + "se us", + "me i", + "b ra", + "vi da", + "an tes", + "b ri", + "at é", + "ên cia", + "lh e", + "ti v", + "m ã", + "al g", + "qu anto", + "s ó", + "g os", + "de r", + "t ão", + "tu do", + "ent ão", + "r ou", + "es s", + "in da", + "b al", + "in do", + "ci o", + "n do", + "j á", + "va m", + "re i", + "l es", + "ei to", + "v is", + "tem po", + "de pois", + "c ha", + "m el", + "ch e", + "l ha", + "a inda", + "faz er", + "con tra", + "p ou", + "per gun", + "de ix", + "ta mb", + "ra r", + "al a", + "v en", + "t in", + "pel o", + "tamb ém", + "fi ca", + "pre c", + "el es", + "tra n", + "ha via", + "l á", + "to dos", + "j u", + "qu al", + "c an", + "ta do", + "cas a", + "es sa", + "n as", + "g em", + "m em", + "se i", + "na da", + "sen ti", + "c ri", + "ó s", + "de u", + "ei ro", + ". 
.", + "f un", + "as sim", + "s ou", + "ent re", + "com e", + "i or", + "h ar", + "f e", + "por que", + "s or", + "f in", + "ta mente", + "a qui", + "cu l", + "t ó", + "for ma", + "s ar", + "ou tra", + "olh os", + "i ma", + "m im", + "a go", + "in s", + "co u", + "g ran", + "v al", + "pesso as", + "era m", + "ei ra", + "a que", + "com p", + "de i", + "p ela", + "co isa", + "m ão", + "con h", + "ca da", + "ago ra", + "ia m", + "h á", + "con s", + "su as", + "gu ém", + "o b", + "l an", + "es ti", + "á s", + "la do", + "in ter", + "ca be", + "por ta", + "n em", + "í vel", + "r is", + "j e", + "n un", + "sem pre", + "con segu", + "h as", + "tra bal", + "f u", + "le v", + "l em", + "l as", + "va i", + "tr os", + "t ante", + "te i", + "pr ó", + "que m", + "tu ra", + "on de", + "cabe ça", + "nun ca", + "men tos", + "h um", + "de le", + "ver dade", + "t á", + "h os", + "el i", + "ent es", + "m er", + "alg um", + "diz er", + "s in", + "pen as", + "n ós", + "en quanto", + "ou tro", + "l ho", + "es te", + "mel hor", + "est ar", + "g an", + "b ar", + "pri mei", + "a u", + "i u", + "pen sa", + "a penas", + "p ra", + "es tou", + "con te", + "res pon", + "ho mem", + "do is", + "a do", + "c al", + "a b", + "l os", + "ç as", + "pou co", + "sen hor", + "t ando", + "esp era", + "pa i", + "ri os", + "no i", + "i da", + "ba ix", + "as e", + "is as", + "f r", + "ho ra", + "mu ndo", + "pas sa", + "fi car", + "to do", + "se ja", + "al mente", + "â n", + "c lar", + "a d", + "in c", + "f os", + "lo n", + "g ri", + "ou vi", + "v em", + "g e", + "ta va", + "á rio", + "mo n", + "s os", + "in ho", + "ma l", + "t an", + "t re", + "gran de", + "ran do", + "b u", + "v ou", + "ê s", + "co isas", + "a conte", + "lh er", + "g en", + "ci on", + "an os", + "i do", + "tal vez", + "est ão", + "li v", + "sa b", + "su r", + "ou tros", + "c re", + "qual quer", + "g ou", + "t ri", + "l í", + "tiv esse", + "ra do", + "prec isa", + "mã e", + "su s", + "t anto", + "de la", + "men os", + "s al", + "en tra", + "p é", + "ma ior", + "noi te", + "ti va", + "p ala", + "so n", + "ra ção", + "de us", + "s as", + "un i", + "l or", + "u l", + "in te", + "f ei", + "an o", + "par ti", + "pala v", + "tr ás", + "par te", + "b el", + "ci dade", + "lu gar", + "v os", + "vez es", + "do u", + "en contra", + "tr u", + "e ci", + "a r", + "e r", + "a n", + "e n", + "i n", + "i r", + "o r", + "d e", + "a k", + "ı n", + "a l", + "d i", + "d a", + "b u", + "b ir", + "y or", + "i l", + "e k", + "y a", + "m a", + "l a", + "e l", + "u n", + "k a", + "l ar", + "i m", + "d ı", + "e t", + "o n", + "d u", + "o l", + "e y", + "t ı", + "m i", + "h a", + "b a", + "l er", + "ü n", + "m ı", + "i z", + "l e", + "ı r", + "m e", + "i s", + "n e", + "o k", + "t a", + "s a", + "u m", + "r a", + "g ö", + "i k", + "s ı", + "d en", + "e s", + "b il", + "t i", + "l ı", + "ü z", + "i ç", + "ü r", + "g i", + "u r", + "t e", + "b en", + "d an", + "i y", + "ı m", + "u z", + "v e", + "c ak", + "a y", + "c e", + "i ş", + "ın ı", + "i yor", + "ba ş", + "d ü", + "a t", + "a m", + "g el", + "de ğ", + "k ar", + "i ̇", + "m u", + "e v", + "ö y", + "bu n", + "v ar", + "ya p", + "s en", + "an a", + "s un", + "in i", + "gö r", + "y ı", + "k i", + "l i", + "ar a", + "al ı", + "on u", + "ç ı", + "ş ey", + "s ın", + "k ı", + "ka d", + "s e", + "t an", + "a ğ", + "değ il", + "s in", + "ü k", + "a z", + "ç ok", + "s on", + "ş ı", + "b i", + "ü l", + "t u", + "v er", + "iç in", + "g e", + "k en", + "ey e", + "ol du", + "mı ş", + "y e", + "k al", + "m ek", + "l an", + "öy le", + "yor du", + "er i", + 
"y üz", + "mi ş", + "b e", + "m ak", + "o la", + "in e", + "y an", + "h er", + "c ek", + "yor um", + "b ak", + "ü m", + "ö n", + "lar ı", + "o ğ", + "d er", + "kad ar", + "h al", + "ar ı", + "s t", + "s an", + "ın da", + "du r", + "g ün", + "v a", + "y ok", + "y er", + "dı m", + "k o", + "da ha", + "l u", + "ın a", + "di m", + "e m", + "bil ir", + "ik i", + "s iz", + "s i", + "n a", + "di ğ", + "s u", + "b ü", + "ha y", + "s or", + "dü ş", + "ü ç", + "un u", + "ö r", + "d ir", + "m ü", + "c a", + "am an", + "f ak", + "a da", + "e de", + "son ra", + "h iç", + "ak i", + "ğ ı", + "bu l", + "r u", + "ma z", + "an la", + "bu ra", + "ge ç", + "ma ya", + "l en", + "k onu", + "c i", + "c u", + "d in", + "t ek", + "z aman", + "el er", + "ö z", + "dı r", + "gi bi", + "o t", + "ş a", + "g er", + "ler i", + "k im", + "k u", + "fak at", + "y ar", + "gö z", + "c ı", + "yor sun", + "b ek", + "in de", + "r o", + "p ek", + "bun u", + "l ik", + "m an", + "il er", + "e di", + "ö l", + "s ür", + "b in", + "s ır", + "çı k", + "sı l", + "al ar", + "k es", + "y ak", + "ç ek", + "yı l", + "e cek", + "ı z", + "gi t", + "ka p", + "a ma", + "ı l", + "lar ın", + "b iz", + "tı r", + "o y", + "an cak", + "d oğ", + "ç a", + "b ana", + "ş im", + "baş la", + "l ü", + "ma dı", + "ben i", + "t ir", + "y ük", + "lı k", + "be ş", + "b el", + "b er", + "m er", + "na sıl", + "tı k", + "k e", + "t ür", + "a v", + ". .", + "d aki", + "p ar", + "t er", + "ce ğ", + "t en", + "z ı", + "iy i", + "d ok", + "ben im", + "c ağ", + "n er", + "y en", + "ş u", + "me z", + "düş ün", + "ken di", + "şim di", + "y ol", + "y u", + "de v", + "is te", + "s ek", + "ma m", + "s öyle", + "di k", + "t o", + "k ur", + "oldu ğ", + "s ını", + "t ar", + "bil iyor", + "k an", + "y al", + "m eye", + "mu ş", + "f a", + "ka ç", + "bil e", + "iy e", + "t ü", + "e f", + "tı m", + "ev et", + "ç o", + "y et", + "g en", + "bura da", + "t im", + "bir az", + "es i", + "k or", + "doğ ru", + "in in", + "kı z", + "di ye", + "d ör", + "et ti", + "on un", + "is ti", + "ğ i", + "h e", + "s ana", + "ü ş", + "ar ka", + "hay ır", + "kar şı", + "h ar", + "il e", + "h ak", + "ı yor", + "ne den", + "s ev", + "sı z", + "ço cu", + "me m", + "ç alı", + "ol ur", + "b ır", + "g ir", + "is e", + "i h", + "c an", + "k ır", + "d ön", + "b öyle", + "sen i", + "! 
\"", + "al t", + "dör t", + "s öy", + "o ş", + "mu sun", + "la ş", + "h an", + "i p", + "ka y", + "h em", + "bü yük", + "a ç", + "bır ak", + "mi sin", + "s öz", + "u l", + "değ iş", + "ün ü", + "g ül", + "k ö", + "kar ı", + "ta mam", + "ol u", + "r ar", + "yen i", + "la m", + "mış tı", + "ya ş", + "al a", + "in iz", + "kad ın", + "bun un", + "m ey", + "al tı", + "y i", + "s o", + "in den", + "sen in", + "ya t", + "to p", + "s er", + "is i", + "d ün", + "s es", + "hiç bir", + "y on", + "d ın", + "t ün", + "baş ka", + "a s", + "he p", + "i t", + "ir mi", + "dev am", + "ola cak", + "ar tık", + "r e", + "dur um", + "im iz", + "üz el", + "ler ini", + "sa ğ", + "p ro", + "ger ek", + "y irmi", + "ş ek", + "ba ğ", + "me di", + "lar a", + "a h", + "t ur", + "y ür", + "ma sı", + "ka tı", + "de di", + "g ü", + "sor un", + "el i", + "ün e", + "mı z", + "yap ı", + "m il", + "ğ ını", + "t ara", + "m en", + "ha t", + "var dı", + "m et", + "konu ş", + "ar ak", + "lar ak", + "çocu k", + "bü tün", + "l ey", + "d ür", + "g üzel", + "ay ı", + "yap a", + "n ı", + "ay r", + "ö ne", + "yordu m", + "b an", + "i̇ ş", + "du m", + "un a", + "on a", + "yor lar", + "lar ını", + "çı kar", + "z an", + "se ç", + "l iyor", + "t ak", + "şı k", + "tek rar", + "a ş", + "e ş", + "miş ti", + "f ar", + "k in", + "im i", + "i f", + "e ğ", + "gi di", + "le ş", + "başla dı", + "gi de", + "ot ur", + "d de", + "ın dan", + "üz er", + "ın ın", + "n ız", + "u y", + "ye di", + "ka t", + "o larak", + "la dı", + "yal nız", + "ba h", + "iy et", + "m al", + "s ak", + "a çık", + "sın da", + ".. .", + "in san", + "ay nı", + "e der", + "is tan", + "uz un", + "sa h", + "d o", + "g eri", + "er ek", + "ol an", + "ger çek", + "f en", + "al an", + "dı ş", + "alı k", + "far k", + "ü st", + "sa de", + "r i", + "k iş", + "l dı", + "z or", + "et ir", + "her kes", + "s al", + "ö mer", + "s el", + "un da", + "ha f", + "bun a", + "y dı", + "pek i", + "ada m", + "ha z", + "sın a", + "kap ı", + "gör üş", + "sade ce", + "al dı", + "gel di", + "i e", + "n ie", + "n a", + "r z", + "s z", + "c z", + "p o", + "s t", + "c h", + "i ę", + "d z", + "n i", + "a ł", + "r a", + "j e", + "r o", + "d o", + "s ię", + "z a", + "g o", + "e m", + "w i", + "c i", + "rz e", + "k o", + "l e", + "l i", + "w a", + "t o", + "k a", + "m i", + "ż e", + "t a", + "w ie", + "b y", + "m o", + "w y", + "rz y", + "ł a", + "j a", + "n o", + "ł o", + "w o", + "p a", + "m a", + "t e", + "t y", + "n y", + "k i", + "d a", + "n e", + "dz ie", + "dz i", + "cz y", + "c ie", + "m y", + "p rze", + "d y", + "o d", + "l a", + "k ie", + "r y", + "st a", + "j ą", + "ó w", + "c e", + "p rzy", + "c o", + "k u", + "m ie", + "sz y", + "cz e", + "r e", + "b a", + "s i", + "b ie", + "m u", + "w e", + "c y", + "ni a", + "ś ci", + "sz e", + "je st", + "k t", + "s a", + "b o", + "t u", + "ż y", + "n ą", + "b i", + "r u", + "a le", + "kt ó", + "p ra", + "ał a", + "m nie", + "p ie", + "ł y", + "cz a", + "ja k", + "ro z", + "r ó", + "l u", + "z na", + "g a", + "ra z", + "ł u", + "ta k", + "j u", + "p i", + "ś ć", + "s o", + "wi a", + "m ó", + "ch o", + "w szy", + "p e", + "s po", + "c a", + "g dy", + "w ał", + "w ię", + "d e", + "b e", + "p ro", + "ł em", + "j ę", + "s k", + "z e", + "l o", + "g i", + "r ę", + "do b", + "d u", + "ju ż", + "st o", + "b ę", + "ał em", + "sz a", + "m e", + "po d", + "d la", + "pa n", + "n ę", + "z o", + "mo że", + "ś li", + "s ie", + "ał o", + "t em", + "l ko", + "ny ch", + "po wie", + "c ię", + "s u", + "ty lko", + "i n", + "b u", + "na j", + "ch a", + "te go", + "p u", + "s ki", + 
"ne go", + "wszy st", + "sz cze", + "je d", + "je j", + "t wo", + "ą d", + "ś my", + "cz ę", + "wa ć", + "je go", + "ż a", + "i m", + "s y", + "pra w", + "ty m", + "któ ry", + "ał y", + "t rze", + "nie j", + "s e", + "ny m", + "i ch", + "o b", + ". .", + "g ło", + "ją c", + "mó wi", + "s ka", + "o n", + "ne j", + "s łu", + "w ła", + "bę dzie", + "d ę", + "p ó", + "be z", + "ni c", + "p ła", + "ś cie", + "mi a", + "s ą", + "t rzy", + "kie m", + "by ł", + "mo g", + "ro bi", + "ta m", + "c u", + "te n", + "m ię", + "z y", + "pe w", + "ci a", + "my ś", + "prze d", + "s ko", + "n u", + "któ re", + "a l", + "l ę", + "w sze", + "ą c", + "by ło", + "so bie", + "p y", + "ci ą", + "ba r", + "je szcze", + "h a", + "t ę", + "b ra", + "cza s", + "sz ę", + "g ł", + "k ę", + "ma r", + "cz u", + "prze z", + "f i", + "s ło", + "w z", + "k to", + "k ów", + "cz o", + "li śmy", + "st ra", + "wię c", + "r ą", + "ma m", + "w ó", + "rz a", + "g ro", + "no ści", + "f a", + "we t", + "ną ł", + "ś mie", + "na wet", + "mu si", + "s wo", + "te j", + "w ą", + "w u", + "wi ą", + "ni u", + "cz ą", + "b li", + "dz o", + "s kie", + "n em", + "je śli", + "cze go", + "ch y", + "d ł", + "ty ch", + "by m", + "ż o", + "e ś", + "si ą", + "kie dy", + "na s", + "w ró", + "dz e", + "d ro", + "t ra", + "r ów", + "pa ni", + "z ie", + "ku l", + "na d", + "ch wi", + "ni m", + "t ro", + "by ć", + "cho dzi", + "ni o", + "dob rze", + "te raz", + "wo kul", + "co ś", + "k ł", + "pie r", + "h e", + "g dzie", + "dz y", + "p ię", + "d ź", + "k ą", + "g ó", + "z da", + "ch ce", + "st ę", + "o r", + "ś wia", + "wszyst ko", + "st ro", + "pe ł", + "wie m", + "wie l", + "ka ż", + "ki m", + "rz u", + "s ły", + "jed na", + "z u", + "myś l", + "mó j", + "g u", + "wa r", + "jest em", + "ó ż", + "mie j", + "mo ż", + "k ła", + "re sz", + "d łu", + "st wo", + "n ię", + "ma sz", + "że by", + "nie m", + "ja kie", + "st y", + "ni ą", + "we j", + "o j", + "g ra", + "s ła", + "no ść", + "z ło", + "sz czę", + ".. 
.", + "r i", + "le j", + "we go", + "c ał", + "dzi ał", + "ki ch", + "dz a", + "dz ię", + "o czy", + "zo sta", + "cz ło", + "na m", + "ki l", + "o na", + "sz u", + "w ę", + "pa r", + "mi ał", + "st rze", + "ce j", + "e j", + "zna j", + "da ć", + "miej s", + "k ró", + "k ry", + "bar dzo", + "si a", + "z i", + "ś nie", + "l ą", + "g ie", + "cie bie", + "d ni", + "st u", + "po trze", + "wokul ski", + "u wa", + "u mie", + "jedna k", + "k ra", + "wró ci", + "czło wie", + "czy ć", + "by ła", + "że li", + "m ę", + "c ę", + "z robi", + "mog ę", + "pro wa", + "r em", + "nie ch", + "cz nie", + "k ro", + "t ą", + "ch ci", + "b ro", + "dzie ć", + "sz ą", + "pa d", + "t rz", + "t ru", + "je m", + "a ni", + "t ów", + "a r", + "d ru", + "ta j", + "rze kł", + "sa m", + "st e", + "nie go", + "ta kie", + "w ała", + "to wa", + "ka pła", + "wi dzi", + "po dob", + "dz ę", + "t ał", + "stę p", + "b ą", + "po ko", + "w em", + "g ę", + "a by", + "g e", + "al bo", + "s pra", + "z no", + "de n", + "s mo", + "je sz", + "k się", + "jest eś", + "po z", + "ni gdy", + "k sią", + "c óż", + "w s", + "po w", + "t ka", + "ś wie", + "sz ka", + "sa mo", + "s ł", + "rz ę", + "na le", + "chce sz", + "ni k", + "p ę", + "chy ba", + "cią g", + "ją cy", + "wo j", + "na sze", + "mnie j", + "wię cej", + "z wy", + "o sta", + "f e", + "wa ż", + "h o", + "se r", + "śmie r", + "wie r", + "dz ą", + "za ś", + "gdy by", + "ja ki", + "wo l", + "wi n", + "d ą", + "ści a", + "roz ma", + "wa l", + "pa nie", + "sta r", + "ka z", + "je żeli", + "d em", + "w ra", + "ko ń", + "sie bie", + "zno wu", + "p ró", + "cz em", + "st wa", + "i sto", + "pó ł", + "d ał", + "ko bie", + "ała m", + "wy ch", + "ce sa", + "ni ch", + "za wsze", + "dzi ć", + "te ż", + "le pie", + "pro szę", + "k re", + "t wa", + "o t", + "ł ą", + "ch u", + "c ą", + "p rz", + "ł e", + "sze dł", + "od powie", + "my śli", + "ś wią", + "e n", + "e r", + "d e", + "a n", + "e t", + "i j", + "i n", + "e l", + "a a", + "s t", + "o r", + "g e", + "i s", + "a t", + "i e", + "c h", + "o n", + "e en", + "h et", + "i t", + "v er", + "aa r", + "a l", + "o or", + "g en", + "v an", + "o p", + "d en", + "h e", + "o m", + "t e", + "w e", + "i k", + "r e", + "z e", + "ij n", + "d at", + "b e", + "d er", + "in g", + "o e", + "ij k", + "a an", + "ch t", + "v oor", + "l e", + "i et", + "r o", + "m o", + "k en", + "z ijn", + "m en", + "i g", + "j e", + "n iet", + "a r", + "o o", + "i d", + "u n", + "i l", + "s ch", + "mo et", + "st e", + "u r", + "o l", + "he b", + "u it", + "g el", + "w ij", + "a s", + "m e", + "t en", + "w or", + "o u", + "v en", + "l en", + "aa t", + "d it", + "m et", + "r a", + "b en", + "s p", + "o ver", + "d ie", + "n o", + "w er", + "l ijk", + "f t", + "s l", + "an d", + "v e", + "t er", + "i er", + "i en", + "t o", + "d aar", + "g r", + "b el", + "de ze", + "d u", + "a g", + "k an", + "wor den", + "in gen", + "moet en", + "n en", + "on der", + "heb ben", + "r u", + "oo k", + "s en", + "c t", + "k t", + "no g", + "aa l", + "w as", + "u l", + "e er", + "b ij", + "m ijn", + "p ro", + "v ol", + "d o", + "k om", + "at ie", + "e ft", + "k el", + "al s", + "r ij", + "he id", + "a f", + "st el", + "m aar", + "a p", + "we e", + "a d", + "he eft", + "w aar", + "i cht", + "d an", + "er en", + "n e", + "w el", + "w at", + "w il", + "a cht", + "aa g", + "ge b", + "c on", + "z o", + "k e", + "b et", + "h ij", + "d ig", + "k un", + "u w", + "d t", + "d oor", + "t ij", + "a m", + "an g", + "on d", + "er s", + "is ch", + "ge en", + "i ge", + "ge v", + "ve el", + "n u", + "m a", + "on s", + "o f", 
+ "b l", + "n aar", + "g ro", + "p l", + "an der", + "at en", + "kun nen", + "e cht", + "h ier", + "g oe", + "an t", + "u s", + "t wee", + "on t", + "de lijk", + "el e", + "u ur", + "al le", + "t oe", + "me er", + "i st", + "n a", + "n ie", + "on ze", + "l o", + "i m", + "p en", + "h ad", + "tij d", + "h oe", + "to t", + "z ou", + "a k", + "aa k", + "a men", + "d r", + "w oor", + "s e", + "wor dt", + "o t", + "gel ijk", + "g aan", + "i c", + "g er", + "k er", + "el d", + "e m", + "h ou", + "de l", + "z en", + "z el", + "te gen", + "b o", + "kom en", + "c om", + "i gen", + "e it", + "wer k", + "goe d", + "z al", + "z ij", + "sl ag", + "e s", + "z ien", + "a st", + "echt er", + "it ie", + "t ie", + "el ijk", + "m is", + "isch e", + "bel an", + "h aar", + "i ch", + "b er", + "h an", + "v r", + "al e", + "c i", + "gr ijk", + "in d", + "do en", + "l and", + "belan grijk", + "p un", + "op en", + "ct ie", + "zel f", + "m ij", + "it eit", + "ste m", + "me e", + "ar en", + "al l", + "b r", + "re cht", + "d ien", + "h u", + "g aat", + "pro b", + "m oe", + "p er", + "a u", + "ul len", + "z ich", + "daar om", + "or m", + "k l", + "v o", + "en t", + "st aat", + "z it", + "du i", + "n at", + "du s", + "d s", + "ver slag", + "kel ijk", + "prob le", + "w et", + "ge m", + "c r", + "i on", + "p r", + "sch ap", + "g d", + "h un", + "z a", + "er d", + "z et", + "st aan", + "st r", + "m aal", + "in der", + "e id", + "st en", + "p ar", + "k ken", + "ge d", + "z ullen", + "re s", + "men sen", + "j aar", + "re gel", + "ie der", + "vol gen", + "ge ven", + "e ven", + "l u", + "bl ij", + "i ë", + "k o", + "u we", + "m an", + "ma ken", + "l ie", + "g a", + "oe k", + "nie uwe", + "b aar", + "h o", + "h er", + "in ter", + "ander e", + "ru ik", + "s u", + "a gen", + "or t", + "m er", + "ou w", + "st er", + "wil len", + "aa kt", + "h oo", + "an den", + "f f", + "l ig", + "t re", + "s amen", + "ze er", + "dui delijk", + "ant woor", + "he el", + "men t", + "pun t", + "hou den", + "we g", + "vr aag", + "gel e", + "een s", + "be sch", + "om en", + "er g", + "do el", + "d ag", + "sp e", + "ur en", + "ing s", + "or en", + "l ang", + "de len", + "m ar", + "ste un", + "in nen", + "p ol", + "o on", + "i de", + "s n", + "s ie", + "r icht", + "z onder", + "no dig", + "all een", + "m id", + "ra gen", + "iet s", + "ver sch", + "geb ruik", + "st u", + "ro uw", + "stel len", + "be g", + "men ten", + "v in", + "eer ste", + "l aat", + "gro ot", + "oo d", + "to ch", + "l aten", + "aar d", + "s le", + "de el", + "st and", + "pl aat", + "re e", + "bet re", + "d i", + "l id", + "uit en", + "ra cht", + "bel eid", + "g et", + "ar t", + "st ie", + "st aten", + "g gen", + "re ken", + "e in", + "al en", + "m ing", + "mo gelijk", + "gro te", + "al tijd", + "z or", + "en kel", + "w ik", + "pol itie", + "e igen", + "el k", + "han del", + "g t", + "k we", + "m aat", + "el en", + "i p", + "v rij", + "s om", + "je s", + "aa m", + "hu is", + "v al", + "we er", + "lid staten", + "k ing", + "k le", + "be d", + "gev al", + "stel l", + "a i", + "wik kel", + "kwe stie", + "t al", + "ste e", + "a b", + "h el", + "kom st", + "p as", + "s s", + "it u", + "i den", + "eer d", + "m in", + "c e", + "p o", + "twee de", + "proble em", + "w aren", + "us sen", + "sn el", + "t ig", + "ge w", + "j u", + "ul t", + "ne men", + "com mis", + "versch il", + "k on", + "z oek", + "k rij", + "gr aag", + "den k", + "l anden", + "re den", + "be sl", + "oe g", + "bet er", + "he den", + "m ag", + "p e", + "bo ven", + "a c", + "con t", + "f d", + "h ele", + "k r", + "v ier", + "w 
in", + "ge z", + "k w", + "m il", + "v or", + "he m", + "ra m", + "aa s", + "ont wikkel", + "dr ie", + "v aak", + "plaat s", + "l a", + "g ang", + "ij f", + "f in", + "nat uur", + "t ussen", + "u g", + "in e", + "d a", + "b at", + "kom t", + "w acht", + "aa d", + "u t", + "é n", + "acht er", + "geb ie", + "ver k", + "lig t", + "c es", + "nie uw", + "van d", + "s t", + "n í", + "j e", + "p o", + "c h", + "r o", + "n a", + "s e", + "t o", + "n e", + "l e", + "k o", + "l a", + "d o", + "r a", + "n o", + "t e", + "h o", + "n ě", + "v a", + "l i", + "l o", + "ř e", + "c e", + "d e", + "v e", + "b y", + "n i", + "s k", + "t a", + "n á", + "z a", + "p ro", + "v o", + "v ě", + "m e", + "v á", + "s o", + "k a", + "r á", + "v y", + "z e", + "m i", + "p a", + "t i", + "st a", + "m ě", + "n é", + "ř i", + "ř í", + "m o", + "ž e", + "m a", + "j í", + "v ý", + "j i", + "d ě", + "r e", + "d a", + "k u", + "j a", + "c i", + "r u", + "č e", + "o b", + "t ě", + "m u", + "k y", + "d i", + "š e", + "k é", + "š í", + "t u", + "v i", + "p ře", + "v í", + "s i", + "n ý", + "o d", + "so u", + "v é", + "n y", + "r i", + "d y", + "b u", + "b o", + "t y", + "l á", + "l u", + "n u", + "ž i", + "m á", + "st i", + "c í", + "z á", + "p ra", + "sk é", + "m í", + "c o", + "d u", + "d á", + "by l", + "st o", + "s a", + "t í", + "je d", + "p ří", + "p ři", + "t é", + "s í", + "č i", + "v ní", + "č a", + "d í", + "z i", + "st u", + "p e", + "b a", + "d ní", + "ro z", + "va l", + "l í", + "s po", + "k á", + "b e", + "p i", + "no u", + "ta k", + "st e", + "r y", + "l é", + "vě t", + "se m", + "p ě", + "ko n", + "ne j", + "l y", + "ko u", + "ý ch", + "b ě", + "p r", + "f i", + "p rá", + "a le", + "ja ko", + "po d", + "ž í", + "z í", + "j sou", + "j sem", + "ch o", + "l ní", + "c ké", + "t á", + "m y", + "a k", + "h u", + "va t", + "pře d", + "h la", + "k e", + "st á", + "č í", + "š i", + "s le", + "k la", + "š tě", + "lo u", + "m ů", + "z na", + "ch á", + "o r", + "p ů", + "h a", + "b i", + "ta ké", + "d ů", + "no st", + "t ře", + "te r", + "p u", + "i n", + "v r", + "ve l", + "sk u", + "v še", + "t ní", + "do b", + "by la", + "č ní", + "ja k", + "v u", + "je ho", + "b ý", + "vá ní", + "ný ch", + "po u", + "te n", + "t ři", + "v z", + "st ře", + "d va", + "h le", + "č á", + "no sti", + "c k", + "v š", + "vo u", + "s u", + "h e", + "h ra", + "je n", + "s y", + "da l", + "po z", + "s lo", + "te l", + "d ru", + "de n", + "vš ak", + "g i", + "k dy", + "by lo", + "bu de", + "st ra", + "j ší", + "m é", + "me n", + "vý ch", + "ní m", + "s m", + "ko li", + "r ů", + "t ra", + "mů že", + "ne ní", + "ho d", + "b í", + "do u", + "sk a", + "t ý", + "st ě", + "u je", + "s á", + "pě t", + "ne s", + "k rá", + "to m", + "st ví", + "v ně", + "se d", + "s vé", + "p í", + "z o", + "mu sí", + "u ž", + "tí m", + "jí cí", + "jed no", + "t r", + "ča s", + "e v", + "č ty", + "sk ý", + "ni c", + "ev ro", + "to ho", + "h y", + "k ter", + "r ní", + "st í", + "s vě", + "pa k", + "vše ch", + "k ů", + "n g", + "á d", + "chá zí", + "a ni", + "a r", + "jed na", + "bý t", + "t ro", + "k ra", + "pr vní", + "m no", + "ské ho", + "p á", + "p la", + "le m", + "ne bo", + "ke m", + "st ro", + "s la", + "né ho", + "z de", + "dal ší", + "ř a", + "čty ři", + "h rá", + "dru h", + "l ně", + "v la", + "sk ých", + "š ko", + "pů so", + "pro to", + "v ů", + "sk á", + "ve n", + "še st", + "d ně", + "je ště", + "me zi", + "te k", + "s ko", + "ch a", + "ně koli", + "be z", + "g ra", + "ji ž", + "č ně", + "j á", + "s lu", + "z ná", + "ve r", + "sed m", + "k ro", + "ta m", + "a 
no", + "v lá", + "o sm", + "byl y", + "vá m", + "ck ý", + "te ch", + "dě ji", + "vel mi", + "le ži", + "va la", + "l ý", + "t vo", + "spo le", + "ch u", + "stu p", + "mo ž", + "evro p", + "g e", + "sta l", + "j de", + "ch y", + "ro di", + "je jí", + "po li", + "de vět", + "s me", + "a ž", + "té to", + "re m", + "d é", + "f or", + "u ni", + "f o", + "ten to", + "a u", + "ka ž", + "nu la", + "na d", + "by ch", + "mo c", + "sto u", + "e x", + "le n", + "k do", + "z d", + "pra co", + "to mu", + "ný m", + "ži vo", + "ze m", + "f e", + "f u", + "ná sle", + "j o", + "sk y", + "ji ch", + "h á", + "mě l", + "dě la", + "j sme", + "p re", + "ni ce", + "ste j", + "ne m", + "st ní", + "he m", + "ná ro", + "z u", + "b li", + "ni t", + "pa r", + "a l", + "poz ději", + "ta ko", + "n ce", + "če r", + "ší m", + "ně co", + "vá l", + "ře j", + "krá t", + "á lní", + "u r", + ". .", + "a si", + "kter é", + "sta v", + "ma jí", + "my s", + "do bě", + "s ně", + "ce n", + "z y", + "z ku", + "t ů", + "ch od", + "s pě", + "je jich", + "sou čas", + "d r", + "va li", + "ri e", + "k te", + "pr ů", + "ze ní", + "pa t", + "a n", + "po tře", + "de m", + "d nes", + "ze mí", + "sa mo", + "zna m", + "b ra", + "má m", + "te dy", + "g o", + "hla vní", + "pou ží", + "b ní", + "ve de", + "le p", + "je k", + "pra v", + "poli ti", + "d ne", + "je m", + "le t", + "če ní", + "pro b", + "ne ž", + "dě l", + "fi l", + "č o", + "cí ch", + "st é", + "d lou", + "h i", + "a by", + "to u", + "několi k", + "d la", + "vy u", + "vi t", + "ho u", + "ck ých", + "no vé", + "či n", + "st y", + "dě lá", + "k ý", + "ob la", + "pod le", + "ra n", + "dů leži", + "ta to", + "po ku", + "ko ne", + "d ý", + "d vě", + "ž ád", + "nou t", + "t ku", + "t vr", + "cké ho", + "ro v", + "r é", + "te le", + "p sa", + "s vět", + "ti vní", + "do sta", + "te m", + "še l", + "druh é", + "s kou", + "ž o", + "jed ná", + "vý znam", + "prob lé", + "pu bli", + "vá n", + "od po", + "pod po", + "d le", + "ja ké", + "še ní", + "ví m", + "bě hem", + "na chází", + "s lou", + "pou ze", + "o tá", + "p lo", + "to vé", + "vět ši", + "ko mi", + "va jí", + "ty to", + "zá pa", + "z mě", + "mo h", + "ví ce", + "spole č", + "au to", + "pro ti", + "st ru", + "dě t", + "chá ze", + "že l", + "с т", + "е н", + "н о", + "н а", + "п р", + "т о", + "п о", + "р а", + "г о", + "к о", + "н е", + "в о", + "в а", + "е т", + "е р", + "н и", + "е л", + "и т", + "н ы", + "з а", + "р о", + "ен и", + "к а", + "л и", + "е м", + "д а", + "о б", + "л а", + "д о", + "с я", + "т ь", + "о т", + "л о", + "л ь", + "е д", + "с о", + "м и", + "р е", + "м о", + "ц и", + "пр о", + "т а", + "э то", + "к и", + "р у", + "пр и", + "т и", + "с е", + "ст а", + "в ы", + "м ы", + "в и", + "б ы", + "м а", + "е с", + "л я", + "ст и", + "л е", + "ч то", + "м е", + "р и", + "ч а", + "о д", + "е й", + "ел ь", + "ени я", + "г а", + "н у", + "с и", + "п а", + "ра з", + "б о", + "ст о", + "с у", + "с а", + "д у", + "е го", + "е ст", + "и н", + "ит ь", + "и з", + "ж е", + "м у", + "п ер", + "по д", + "ени е", + "с ь", + "к у", + "пр ед", + "но го", + "ны х", + "в ер", + "т е", + "но й", + "ци и", + "д е", + "р ы", + "д ел", + "л ю", + "в е", + "о н", + "м ен", + "г и", + "н я", + "б у", + "пр а", + "в се", + "ет ся", + "ст ь", + "ж а", + "до л", + "ж и", + "б е", + "ко н", + "с л", + "ш и", + "д и", + "ст в", + "с ко", + "ны е", + "ч и", + "ю т", + "д ер", + "ст ра", + "т ы", + "х од", + "щ и", + "з о", + "з на", + "но сти", + "ч ес", + "в ля", + "ва ть", + "о р", + "по л", + "в ет", + "та к", + "ш а", + "т у", + "с во", + "пр е", + 
"о на", + "ит ель", + "ны й", + "с ло", + "ка к", + "в л", + "но сть", + "х о", + "мо ж", + "п е", + "д ля", + "ни я", + "но е", + "ра с", + "дол ж", + "да р", + "т ель", + "с ка", + "п у", + "ст во", + "ко то", + "ра б", + "е е", + "ро д", + "э ти", + "с об", + "о ру", + "ж ен", + "ны м", + "ит и", + "ни е", + "ко м", + "д ет", + "ст у", + "г у", + "п и", + "ме ж", + "ени ю", + "т ер", + "раб от", + "во з", + "ци я", + "ко й", + "щ ест", + "г ра", + "з и", + "р я", + "меж ду", + "ст ва", + "в с", + "ел о", + "ш е", + "м ер", + "б а", + "з ы", + "л у", + "а ль", + "д ей", + "г ла", + "на род", + "к ти", + "пред ста", + "л ся", + "я вля", + "с ки", + "но в", + "ед ин", + "ро в", + "и с", + "ни ма", + "р ем", + "ход и", + "так же", + "д ру", + "а ть", + "сл ед", + "го во", + "на я", + "ю щи", + "ен ь", + "кото ры", + "х от", + "в у", + "и х", + "ем у", + "ч ит", + "ва ж", + "ор га", + "чес ки", + "щ е", + "к е", + "х а", + "по с", + "то м", + "бо ль", + "м не", + "па с", + "об ъ", + "пра в", + "кон ф", + "сл у", + "под дер", + "ст ви", + "на ш", + "ль ко", + "сто я", + "ну ю", + "л ем", + "ен ных", + "к ра", + "д ы", + "между народ", + "г да", + "не об", + "го су", + "ств у", + "ени и", + "госу дар", + "к то", + "и м", + "ч ест", + "р ет", + "во про", + "л ен", + "ел и", + "ро ва", + "ци й", + "на м", + "это й", + "ж ения", + "необ ходи", + "мен я", + "бы ло", + "си ли", + "ф и", + "в я", + "ш ь", + "это го", + "о ни", + "орга ни", + "бе зо", + "пр об", + "и ме", + "ре ш", + "б и", + "безо пас", + "ют ся", + "о ста", + "ен но", + "го д", + "ел а", + "предста в", + "ть ся", + "сло во", + "органи за", + "долж ны", + "это м", + "б ла", + "ч е", + "ч у", + "бла го", + "это му", + "в рем", + "с пе", + "но м", + "ени й", + "с по", + "на с", + "не т", + "з у", + "в ед", + "е ще", + "ска за", + "се й", + "ер ен", + "да н", + "са м", + "ел я", + "ра н", + "зы ва", + "явля ется", + "бу дет", + "кти в", + "т ре", + "дел е", + "м от", + "конф ерен", + "ла сь", + "ча с", + "сто ро", + "ко го", + "е з", + "не й", + "о с", + "ли сь", + "раз ору", + "пер е", + "с си", + "ны ми", + "про ц", + "го ло", + "ч ело", + "бо ле", + "чело ве", + "с ер", + "п л", + "ч ет", + "стра н", + "п я", + "бы л", + "к ла", + "то в", + "ж д", + "дел а", + "е ра", + "у же", + "со вет", + "г ен", + "безопас ности", + "ц а", + "се да", + "по з", + "от вет", + "проб лем", + "на ко", + "т ем", + "до ста", + "п ы", + "щ а", + "во й", + "су щест", + "необходи мо", + "бы ть", + "мож ет", + "д ем", + "что бы", + "е к", + "ч ер", + "у сили", + "ре с", + "ру д", + "един енных", + "д об", + "до сти", + "ств ен", + "я дер", + "год ня", + "ка за", + "се годня", + "сей час", + "то лько", + "во д", + "ес ь", + "м ного", + "бу ду", + "е в", + "ест ь", + "т ри", + "об щест", + ". 
.", + "я вл", + "вы сту", + "р ед", + "с чит", + "с ит", + "деле га", + "ло ж", + "это т", + "ф ор", + "к лю", + "воз мож", + "ва ния", + "б ли", + "и ли", + "в з", + "на ций", + "ско го", + "при ня", + "п ла", + "о ч", + "ить ся", + "ст е", + "на ши", + "которы е", + "а р", + "име ет", + "с от", + "зна ч", + "пер ь", + "след у", + "ен ы", + "та ки", + "объ единенных", + "ст ро", + "те перь", + "б ле", + "благо дар", + "раз в", + "а н", + "жи ва", + "оч ень", + "я т", + "бе з", + "об ес", + "г ро", + "ло сь", + "с ы", + "организа ции", + "ч лен", + "то го", + "она ль", + "ж да", + "все х", + "с вя", + "боле е", + "со в", + "ко гда", + "во т", + "к ре", + "к ры", + "по этому", + "во ль", + "о й", + "ген ера", + "ч ем", + "л ы", + "пол ити", + "в ен", + "конферен ции", + "проц ес", + "б я", + "ит е", + "от но", + "разв ити", + "а ф", + "ю щ", + "в но", + "ми р", + "ни и", + "ка я", + "а с", + "итель но", + "в то", + "ени ем", + "генера ль", + "пр от", + "вс ем", + "сам бле", + "ас самбле", + "о м", + "з д", + "с мот", + "ре ги", + "ч его", + "од нако", + "усили я", + "дей стви", + "ч но", + "у ча", + "об раз", + "во с", + "э та", + "пер его", + "гово р", + "ва м", + "мо ло", + "врем я", + "д ь", + "хот ел", + "г ру", + "за явл", + "пре доста", + "по ль", + "не е", + "ре зо", + "перего во", + "резо лю", + "к рет", + "поддер ж", + "обес пе", + "не го", + "представ ит", + "на де", + "к ри", + "ч ь", + "про ек", + "л ет", + "дру ги", + "ا ل", + "َ ا", + "و َ", + "ّ َ", + "ِ ي", + "أ َ", + "ل َ", + "ن َ", + "ال ْ", + "ه ُ", + "ُ و", + "م ا", + "ن ْ", + "م ن", + "ع َ", + "ن ا", + "ل ا", + "م َ", + "ت َ", + "ف َ", + "أ ن", + "ل ي", + "م ِ", + "ا ن", + "ف ي", + "ر َ", + "ي َ", + "ه ِ", + "م ْ", + "ق َ", + "ب ِ", + "ل ى", + "ي ن", + "إ ِ", + "ل ِ", + "و ا", + "ك َ", + "ه ا", + "ً ا", + "م ُ", + "و ن", + "ال م", + "ب َ", + "ي ا", + "ذ ا", + "س ا", + "ال ل", + "م ي", + "ي ْ", + "ر ا", + "ر ي", + "ل ك", + "م َا", + "ن َّ", + "ل م", + "إ ن", + "س ت", + "و م", + "ّ َا", + "ل َا", + "ه م", + "ّ ِ", + "ك ُ", + "ك ان", + "س َ", + "ب ا", + "د ي", + "ح َ", + "ع ْ", + "ب ي", + "ال أ", + "و ل", + "ف ِي", + "ر ِ", + "د ا", + "مِ نْ", + "ُو نَ", + "و ْ", + "ه َا", + "ّ ُ", + "ال س", + "ال َ", + "ن ي", + "ل ْ", + "ت ُ", + "ه ل", + "ر ة", + "د َ", + "س ْ", + "ت ِ", + "ن َا", + "ر ْ", + "الل َّ", + "سا مي", + "ك ن", + "ك ل", + "ه َ", + "عَ لَ", + "ع لى", + "م ع", + "إ لى", + "ق د", + "ال ر", + "ُو ا", + "ي ر", + "ع ن", + "ي ُ", + "ن ِ", + "ب ْ", + "ال ح", + "هُ مْ", + "ق ا", + "ذ ه", + "ال ت", + "ِي نَ", + "ج َ", + "ه ذا", + "ع د", + "ال ع", + "د ْ", + "قَ الَ", + "ر ُ", + "ي م", + "ي ة", + "ن ُ", + "خ َ", + "ر ب", + "ال ك", + "و َا", + "أ نا", + "ة ِ", + "ال ن", + "ح د", + "ع ِ", + "ت ا", + "ه و", + "ف ا", + "ع ا", + "ال ش", + "ل ُ", + "ي ت", + "ذ َا", + "ي ع", + "ال ذ", + "ح ْ", + "ال ص", + "إِ نَّ", + "ج ا", + "ع لي", + "ك َا", + "ب ُ", + "ت ع", + "و ق", + "م ل", + "ل َّ", + "ي د", + "أ خ", + "ر ف", + "ت ي", + "ال ِ", + "ّ ا", + "ذ لك", + "أَ نْ", + "س ِ", + "ت وم", + "م ر", + "مَ نْ", + "ب ل", + "ال ق", + "الل ه", + "ِي َ", + "ك م", + "ذ َ", + "ع ل", + "ح ب", + "س ي", + "ع ُ", + "ال ج", + "ال د", + "ش َ", + "ت ك", + "ف ْ", + "ص َ", + "ل ل", + "د ِ", + "ب ر", + "ف ِ", + "ت ه", + "أ ع", + "ت ْ", + "ق ْ", + "الْ أَ", + "ئ ِ", + "عَ نْ", + "و ر", + "ح ا", + "ال َّ", + "م ت", + "ف ر", + "د ُ", + "ه نا", + "وَ أَ", + "ت ب", + "ة ُ", + "أ ي", + "س ب", + "ري د", + "و ج", + "كُ مْ", + "ح ِ", + "ك ْ", + "د ر", + "َا ء", + "ه ذه", + "ال ط", + "الْ مُ", + "د ة", + "ق ل", + "غ َ", + "ي وم", + "الَّ ذ", + "ك ر", + "ت ر", + "ك 
ِ", + "ك ي", + "عَلَ ى", + "رَ ب", + "ع ة", + "ق ُ", + "ج ْ", + "ف ض", + "ل ة", + "ه ْ", + "ر َا", + "وَ لَ", + "الْ مَ", + "أَ نَّ", + "ي َا", + "أ ُ", + "ش ي", + "اللَّ هُ", + "لَ ى", + "ق ِ", + "أ ت", + "عَلَ يْ", + "اللَّ هِ", + "ال ب", + "ض َ", + "ة ً", + "ق ي", + "ا ر", + "ب د", + "خ ْ", + "سْ تَ", + "ط َ", + "قَ دْ", + "ذه ب", + "أ م", + "ما ذا", + "وَ إِ", + "ة ٌ", + "و نَ", + "لي لى", + "و لا", + "ح ُ", + "ه ي", + "ص ل", + "ال خ", + "و د", + "لي س", + "ل دي", + "ق ال", + "كَا نَ", + "م َّ", + "ح ي", + "ت م", + "ل ن", + "وَ لَا", + "ب ع", + "يم كن", + "س ُ", + "ة َ", + "ح ت", + "ر ًا", + "ك ا", + "ش ا", + "هِ مْ", + "لَ هُ", + "ز َ", + "دا ً", + "م س", + "ك ث", + "الْ عَ", + "ج ِ", + "ص ْ", + "ف َا", + "ل ه", + "و ي", + "ع َا", + "هُ وَ", + "ب ِي", + "ب َا", + "أ س", + "ث َ", + "ل ِي", + "ر ض", + "الر َّ", + "لِ كَ", + "ت َّ", + "ف ُ", + "ق ة", + "ف عل", + "مِ ن", + "ال آ", + "ث ُ", + "س م", + "م َّا", + "بِ هِ", + "ت ق", + "خ ر", + "ل قد", + "خ ل", + "ش ر", + "أن ت", + "ل َّا", + "س ن", + "الس َّ", + "الذ ي", + "س َا", + "و ما", + "ز ل", + "و ب", + "أ ْ", + "إ ذا", + "ر ِي", + "ح ة", + "ن ِي", + "الْ حَ", + "وَ قَالَ", + "ب ه", + "ة ٍ", + "س أ", + "ر ٌ", + "ب ال", + "م ة", + "ش ْ", + "و ت", + "عن د", + "ف س", + "بَ عْ", + "ه ر", + "ق ط", + "أ ح", + "إن ه", + "و ع", + "ف ت", + "غ ا", + "هنا ك", + "ب ت", + "مِ نَ", + "س ر", + "ذَ لِكَ", + "ر س", + "حد ث", + "غ ْ", + "ّ ِي", + "ال إ", + "وَ يَ", + "ج ل", + "ا ست", + "ق ِي", + "ع ب", + "و س", + "ي ش", + "الَّذ ِينَ", + "تا ب", + "د ِي", + "ج ب", + "ك ون", + "ب ن", + "ال ث", + "لَ يْ", + "ب عد", + "وَ الْ", + "فَ أَ", + "ع م", + "هُ م", + "ت ن", + "ذ ْ", + "أ ص", + "أ ين", + "رَب ِّ", + "الذ ين", + "إِ ن", + "ب ين", + "ج ُ", + "عَلَيْ هِ", + "ح َا", + "ل و", + "ست ط", + "ظ ر", + "لَ مْ", + "ء ِ", + "كُ ل", + "ط ل", + "ت َا", + "ض ُ", + "كن ت", + "ل ًا", + "م ٌ", + "ق بل", + "ـ ـ", + "ذ ِ", + "قَ وْ", + "ص ِ", + "م ًا", + "كان ت", + "ص ا", + "ي ق", + "ال ف", + "ال نا", + "م ٍ", + "إِ نْ", + "ال نَّ", + "ج د", + "وَ مَا", + "ت ت", + "ب ح", + "م كان", + "كي ف", + "ّ ة", + "ال ا", + "ج َا", + "أ و", + "سا عد", + "ض ِ", + "إ لا", + "را ً", + "ق َا", + "ر أ", + "ع ت", + "أ حد", + "ه د", + "ض ا", + "ط ر", + "أ ق", + "ما ء", + "د َّ", + "ال با", + "م ُو", + "أَ وْ", + "ط ا", + "ق ُو", + "خ ِ", + "ت ل", + "ستط يع", + "د َا", + "الن َّا", + "إ لَى", + "وَ تَ", + "هَ ذَا", + "ب ة", + "علي ك", + "ج ر", + "ال من", + "ز ا", + "ر ٍ", + "د ع", + "ّ ًا", + "س ة", + "ثُ مَّ", + "شي ء", + "ال غ", + "ت ح", + "ر ُونَ", + "ال يوم", + "م ِي", + "ن ُوا", + "أ ر", + "تُ مْ", + "ع ر", + "ي ف", + "أ ب", + "د ًا", + "ص َا", + "الت َّ", + "أ ريد", + "ال ز", + "يَ وْ", + "إ لي", + "ج ي", + "يَ عْ", + "فض ل", + "ال إن", + "أن ه", + "n g", + "i 4", + "a n", + "s h", + "z h", + "i 2", + "ng 1", + "u 4", + "i 1", + "ng 2", + "d e", + "j i", + "a o", + "x i", + "u 3", + "de 5", + "e 4", + "i 3", + "ng 4", + "an 4", + "e n", + "u o", + "sh i4", + "an 2", + "u 2", + "c h", + "u 1", + "ng 3", + "a 1", + "an 1", + "e 2", + "a 4", + "e i4", + "o ng1", + "a i4", + "ao 4", + "h u", + "a ng1", + "l i", + "y o", + "an 3", + "w ei4", + "uo 2", + "n 1", + "en 2", + "ao 3", + "e 1", + "y u", + "q i", + "e ng2", + "zh o", + "a ng3", + "a ng4", + "a ng2", + "uo 4", + "m i", + "g e4", + "y i1", + "g uo2", + "e r", + "b i", + "a 3", + "h e2", + "e 3", + "y i2", + "d i4", + "zh ong1", + "b u4", + "g u", + "a i2", + "n 2", + "z ai4", + "sh i2", + "e ng1", + "r en2", + "o ng2", + "xi an4", + "y i", + "x u", + "n 4", + "l i4", + "en 4", + "y u2", + "e i2", + "yi2 ge4", + "o u4", + "e i3", 
+ "d i", + "u i4", + "a 2", + "yo u3", + "ao 1", + "d a4", + "ch eng2", + "en 1", + "e ng4", + "y i4", + "s i1", + "zh i4", + "ji a1", + "yu an2", + "n i", + "t a1", + "de5 yi2ge4", + "k e1", + "sh u3", + "x i1", + "j i2", + "ao 2", + "t i", + "o u3", + "o ng4", + "xi a4", + "a i1", + "g ong1", + "zh i1", + "en 3", + "w ei2", + "j u", + "xu e2", + "q u1", + "zho u1", + "er 3", + "mi ng2", + "zho ng3", + "l i3", + "w u4", + "y i3", + "uo 1", + "e 5", + "j i4", + "xi ng2", + "ji an4", + "hu a4", + "y u3", + "uo 3", + "j i1", + "a i3", + "z uo4", + "h ou4", + "hu i4", + "e i1", + "ni an2", + "q i2", + "p i", + "d ao4", + "sh eng1", + "de 2", + "d ai4", + "u an2", + "zh e4", + "zh eng4", + "b en3", + "sh ang4", + "zh u3", + "b ei4", + "y e4", + "ch u1", + "zh an4", + "l e5", + "l ai2", + "sh i3", + "n an2", + "r en4", + "yo u2", + "k e4", + "b a1", + "f u4", + "d ui4", + "y a4", + "m ei3", + "z i4", + "xi n1", + "ji ng1", + "zh u", + "n 3", + "yo ng4", + "m u4", + "ji ao4", + "y e3", + "ji n4", + "bi an4", + "l u4", + "q i1", + "sh e4", + "xi ang1", + "o ng3", + "sh u4", + "d ong4", + "s uo3", + "gu an1", + "s an1", + "b o", + "t e4", + "d uo1", + "f u2", + "mi n2", + "l a1", + "zh i2", + "zh en4", + "o u1", + "w u3", + "m a3", + "i 5", + "z i5", + "j u4", + "er 4", + "y ao4", + "xia4 de5yi2ge4", + "s i4", + "t u2", + "sh an1", + "z ui4", + "ch u", + "yi n1", + "er 2", + "t ong2", + "d ong1", + "y u4", + "y an2", + "qi an2", + "shu3 xia4de5yi2ge4", + "ju n1", + "k e3", + "w en2", + "f a3", + "l uo2", + "zh u4", + "x i4", + "k ou3", + "b ei3", + "ji an1", + "f a1", + "di an4", + "ji ang1", + "wei4 yu2", + "xi ang4", + "zh i3", + "e ng3", + "f ang1", + "l an2", + "sh u", + "r i4", + "li an2", + "sh ou3", + "m o", + "qi u2", + "ji n1", + "h uo4", + "shu3xia4de5yi2ge4 zhong3", + "f en1", + "n ei4", + "g ai1", + "mei3 guo2", + "u n2", + "g e2", + "b ao3", + "qi ng1", + "g ao1", + "t ai2", + "d u", + "xi ao3", + "ji e2", + "ti an1", + "ch ang2", + "q uan2", + "li e4", + "h ai3", + "f ei1", + "t i3", + "ju e2", + "o u2", + "c i3", + "z u2", + "n i2", + "bi ao3", + "zhong1 guo2", + "d u4", + "yu e4", + "xi ng4", + "sh eng4", + "ch e1", + "d an1", + "ji e1", + "li n2", + "pi ng2", + "f u3", + "g u3", + "ji e4", + "w o", + "v 3", + "sh eng3", + "n a4", + "yu an4", + "zh ang3", + "gu an3", + "d ao3", + "z u3", + "di ng4", + "di an3", + "c eng2", + "ren2 kou3", + "t ai4", + "t ong1", + "g uo4", + "n eng2", + "ch ang3", + "hu a2", + "li u2", + "yi ng1", + "xi ao4", + "c i4", + "bian4 hua4", + "li ang3", + "g ong4", + "zho ng4", + "de5 yi1", + "s e4", + "k ai1", + "w ang2", + "ji u4", + "sh i1", + "sh ou4", + "m ei2", + "k u", + "s u", + "f eng1", + "z e2", + "tu2 shi4", + "t i2", + "q i4", + "ji u3", + "sh en1", + "zh e3", + "ren2kou3 bian4hua4", + "ren2kou3bian4hua4 tu2shi4", + "di4 qu1", + "y ang2", + "m en", + "men 5", + "l ong2", + "bi ng4", + "ch an3", + "zh u1", + "w ei3", + "w ai4", + "xi ng1", + "bo 1", + "b i3", + "t ang2", + "hu a1", + "bo 2", + "shu i3", + "sh u1", + "d ou1", + "s ai4", + "ch ao2", + "b i4", + "li ng2", + "l ei4", + "da4 xue2", + "f en4", + "shu3 de5", + "m u3", + "ji ao1", + "d ang1", + "ch eng1", + "t ong3", + "n v3", + "q i3", + "y an3", + "mi an4", + "l uo4", + "ji ng4", + "g e1", + "r u4", + "d an4", + "ri4 ben3", + "p u3", + "yu n4", + "hu ang2", + "wo 3", + "l v", + "h ai2", + "shi4 yi1", + "xi e1", + "yi ng3", + "w u2", + "sh en2", + "w ang3", + "gu ang3", + "li u4", + "s u4", + "shi4 zhen4", + "c an1", + "c ao3", + "xi a2", + "k a3", + "d a2", + "h u4", + "b an4", + "d 
ang3", + "h u2", + "z ong3", + "de ng3", + "de5yi2ge4 shi4zhen4", + "ch uan2", + "mo 4", + "zh ang1", + "b an1", + "mo 2", + "ch a2", + "c e4", + "zhu3 yao4", + "t ou2", + "j u2", + "shi4 wei4yu2", + "s a4", + "u n1", + "ke3 yi3", + "d u1", + "h an4", + "li ang4", + "sh a1", + "ji a3", + "z i1", + "lv 4", + "f u1", + "xi an1", + "x u4", + "gu ang1", + "m eng2", + "b ao4", + "yo u4", + "r ong2", + "zhi1 yi1", + "w ei1", + "m ao2", + "guo2 jia1", + "c ong2", + "g ou4", + "ti e3", + "zh en1", + "d u2", + "bi an1", + "c i2", + "q u3", + "f an4", + "xi ang3", + "m en2", + "j u1", + "h ong2", + "z i3", + "ta1 men5", + "ji 3", + "z ong1", + "zhou1 de5yi2ge4shi4zhen4", + "t uan2", + "ji ng3", + "gong1 si1", + "xi e4", + "l i2", + "li4 shi3", + "b ao1", + "g ang3", + "gu i1", + "zh eng1", + "zhi2 wu4", + "ta1 de5", + "pi n3", + "zhu an1", + "ch ong2", + "shi3 yong4", + "w a3", + "sh uo1", + "chu an1", + "l ei2", + "w an1", + "h uo2", + "q u", + "s u1", + "z ao3", + "g ai3", + "q u4", + "g u4", + "l u", + "x i2", + "h ang2", + "yi ng4", + "c un1", + "g en1", + "yi ng2", + "ti ng2", + "cheng2 shi4", + "ji ang3", + "li ng3", + "l un2", + "bu4 fen4", + "de ng1", + "xu an3", + "dong4 wu4", + "de2 guo2", + "xi an3", + "f an3", + "zh e5", + "h an2", + "h ao4", + "m i4", + "r an2", + "qi n1", + "ti ao2", + "zh an3", + "h i", + "k a", + "n o", + "t e", + "s u", + "s hi", + "t a", + "t o", + "n a", + "w a", + "o u", + "r u", + "n i", + "k u", + "k i", + "g a", + "d e", + "k o", + "m a", + "r e", + "r a", + "m o", + "t su", + "w o", + "e n", + "r i", + "s a", + "d a", + "s e", + "j i", + "h a", + "c hi", + "k e", + "te ki", + "m i", + "y ou", + "s h", + "s o", + "y o", + "y a", + "na i", + "t te", + "a ru", + "b a", + "u u", + "t ta", + "ka i", + "ka n", + "shi te", + "m e", + "d o", + "mo no", + "se i", + "r o", + "ko to", + "ka ra", + "shi ta", + "b u", + "m u", + "c h", + "su ru", + "k ou", + "g o", + "ma su", + "ta i", + "f u", + "k en", + "i u", + "g en", + "wa re", + "shi n", + "z u", + "a i", + "o n", + "o ku", + "g i", + "d ou", + "n e", + "y uu", + "i ru", + "i te", + "ji ko", + "de su", + "j u", + "ra re", + "sh u", + "b e", + "sh ou", + "s ha", + "se kai", + "s ou", + "k you", + "ma shita", + "s en", + "na ra", + "sa n", + "ke i", + "i ta", + "a ri", + "i tsu", + "ko no", + "j ou", + "na ka", + "ch ou", + "so re", + "g u", + "na ru", + "ga ku", + "re ba", + "g e", + "h o", + "i n", + "hi to", + "sa i", + "na n", + "da i", + "tsu ku", + "shi ki", + "sa re", + "na ku", + "p p", + "bu n", + "ju n", + "so no", + "ka ku", + "z ai", + "b i", + "to u", + "wa ta", + "sh uu", + "i i", + "te i", + "ka re", + "y u", + "shi i", + "ma de", + "sh o", + "a n", + "ke reba", + "shi ka", + "i chi", + "ha n", + "de ki", + "ni n", + "ware ware", + "na kereba", + "o ite", + "h ou", + "ya ku", + "ra i", + "mu jun", + "l e", + "yo ku", + "bu tsu", + "o o", + "ko n", + "o mo", + "ga e", + "nara nai", + "ta chi", + "z en", + "ch uu", + "kan gae", + "ta ra", + "to ki", + "ko ro", + "mujun teki", + "z e", + "na ga", + "ji n", + "shi ma", + "te n", + "i ki", + "i ku", + "no u", + "i masu", + "r ou", + "h on", + "ka e", + "t to", + "ko re", + "ta n", + "ki ta", + "i s", + "da tta", + "ji tsu", + "ma e", + "i e", + "me i", + "da n", + "h e", + "to ku", + "dou itsu", + "ri tsu", + "k yuu", + "h you", + "rare ta", + "kei sei", + "k kan", + "rare ru", + "m ou", + "do ko", + "r you", + "da ke", + "naka tta", + "so ko", + "ta be", + "e r", + "ha na", + "c o", + "fu ku", + "p a", + "so n", + "ya su", + "ch o", + "wata ku", + "ya ma", 
+ "z a", + "k yo", + "gen zai", + "b oku", + "a ta", + "j a", + "ka wa", + "ma sen", + "j uu", + "ro n", + "b o", + "na tte", + "wataku shi", + "yo tte", + "ma i", + "g ou", + "ha i", + "mo n", + "ba n", + "ji shin", + "c a", + "re te", + "n en", + "o ka", + "ka gaku", + "na tta", + "p o", + "ka ru", + "na ri", + "m en", + "ma ta", + "e i", + "ku ru", + "ga i", + "ka ri", + "sha kai", + "kou i", + "yo ri", + "se tsu", + "j o", + "re ru", + "to koro", + "ju tsu", + "i on", + "sa ku", + "tta i", + "c ha", + "nin gen", + "n u", + "c e", + "ta me", + "kan kyou", + "de n", + "o oku", + "i ma", + "wata shi", + "tsuku ru", + "su gi", + "b en", + "ji bun", + "shi tsu", + "ke ru", + "ki n", + "ki shi", + "shika shi", + "mo to", + "ma ri", + "i tte", + "de shita", + "n de", + "ari masu", + "te r", + "z ou", + "ko e", + "ze ttai", + "kkan teki", + "h en", + "re kishi", + "deki ru", + "tsu ka", + "l a", + "i tta", + "o i", + "ko butsu", + "mi ru", + "sh oku", + "shi masu", + "gi jutsu", + "g you", + "jou shiki", + "a tta", + "ho do", + "ko ko", + "tsuku rareta", + "z oku", + "hi tei", + "ko ku", + "rekishi teki", + "ke te", + "o ri", + "i mi", + "ka ko", + "naga ra", + "ka karu", + "shu tai", + "ha ji", + "ma n", + "ta ku", + "ra n", + "douitsu teki", + "z o", + "me te", + "re i", + "tsu u", + "sare te", + "gen jitsu", + "p e", + "s t", + "ba i", + "na wa", + "ji kan", + "wa ru", + "r t", + "a tsu", + "so ku", + "koui teki", + "a ra", + "u ma", + "a no", + "i de", + "ka ta", + "te tsu", + "ga wa", + "ke do", + "re ta", + "mi n", + "sa you", + "tte ru", + "to ri", + "p u", + "ki mi", + "b ou", + "mu ra", + "sare ru", + "ma chi", + "k ya", + "o sa", + "kon na", + "a ku", + "a l", + "sare ta", + "i pp", + "shi ku", + "u chi", + "hito tsu", + "ha tara", + "tachi ba", + "shi ro", + "ka tachi", + "to mo", + "e te", + "me ru", + "ni chi", + "da re", + "ka tta", + "e ru", + "su ki", + "a ge", + "oo ki", + "ma ru", + "mo ku", + "o ko", + "kangae rareru", + "o to", + "tan ni", + "ta da", + "tai teki", + "mo tte", + "ki nou", + "shi nai", + "k ki", + "u e", + "ta ri", + "l i", + "ra nai", + "k kou", + "mi rai", + "pp on", + "go to", + "hi n", + "hi tsu", + "te ru", + "mo chi", + "ka tsu", + "re n", + "n yuu", + "su i", + "zu ka", + "tsu ite", + "no mi", + "su gu", + "ku da", + "tetsu gaku", + "i ka", + "ron ri", + "o ki", + "ni ppon", + "p er", + "shi mashita", + "chi shiki", + "cho kkanteki", + "su ko", + "t ion", + "ku u", + "a na", + "a rou", + "ka tte", + "ku ri", + "i nai", + "hyou gen", + "i shiki", + "do ku", + "a tte", + "a tara", + "to n", + "wa ri", + "ka o", + "sei san", + "hana shi", + "s i", + "ka ke", + "na ji", + "su nawa", + "sunawa chi", + "u go", + "su u", + "ba ra", + "le v", + "hi ro", + "i wa", + "be tsu", + "yo i", + "se ru", + "shite ru", + "rare te", + "to shi", + "se ki", + "tai ritsu", + "wa kara", + "to kyo", + "k ka", + "k yoku", + "u n", + "i ro", + "mi te", + "sa ki", + "kan ji", + "mi ta", + "su be", + "r yoku", + "ma tta", + "kuda sai", + "omo i", + "ta no", + "ware ru", + "co m", + "hitsu you", + "ka shi", + "re nai", + "kan kei", + "a to", + "ga tte", + "o chi", + "mo tsu", + "in g", + "son zai", + "l l", + "o re", + "tai shite", + "a me", + "sei mei", + "ka no", + "gi ri", + "kangae ru", + "yu e", + "a sa", + "o naji", + "yo ru", + "ni ku", + "osa ka", + "suko shi", + "c k", + "ta ma", + "kano jo", + "ki te", + "mon dai", + "a mari", + "e ki", + "ko jin", + "ha ya", + "i t", + "de te", + "atara shii", + "a wa", + "ga kkou", + "tsu zu", + "shu kan", + "i mashita", + "mi na", + 
"ata e", + "da rou", + "hatara ku", + "ga ta", + "da chi", + "ma tsu", + "ari masen", + "sei butsu", + "mi tsu", + "he ya", + "yasu i", + "d i", + "de ni", + "no ko", + "ha ha", + "do mo", + "ka mi", + "su deni", + "na o", + "ra ku", + "i ke", + "a ki", + "me ta", + "l o", + "ko domo", + "so shite", + "ga me", + "ba kari", + "to te", + "ha tsu", + "mi se", + "moku teki", + "da kara" + ] + } +} \ No newline at end of file diff --git a/tests/text_tests/test_punctuation.py b/tests/text_tests/test_punctuation.py index 141c10e48f..bb7b11edce 100644 --- a/tests/text_tests/test_punctuation.py +++ b/tests/text_tests/test_punctuation.py @@ -11,6 +11,11 @@ def setUp(self): ("This, is my text ... to be striped !! from text", "This is my text to be striped from text"), ("This, is my text ... to be striped from text?", "This is my text to be striped from text"), ("This, is my text to be striped from text", "This is my text to be striped from text"), + (".", ""), + (" . ", ""), + ("!!! Attention !!!", "Attention"), + ("!!! Attention !!! This is just a ... test.", "Attention This is just a test"), + ("!!! Attention! This is just a ... test.", "Attention This is just a test"), ] def test_get_set_puncs(self): diff --git a/tests/tts_tests2/test_delightful_tts_layers.py b/tests/tts_tests2/test_delightful_tts_layers.py index 073bb1eb5a..b9951fc208 100644 --- a/tests/tts_tests2/test_delightful_tts_layers.py +++ b/tests/tts_tests2/test_delightful_tts_layers.py @@ -28,7 +28,7 @@ def test_acoustic_model(): dummy_tokens = torch.rand((1, 41)).long().to(device) - dummy_text_lens = torch.tensor([41]).to(device) + dummy_text_lens = torch.tensor([41]).long().to(device) dummy_spec = torch.rand((1, 100, 207)).to(device) dummy_spec_lens = torch.tensor([207]).to(device) dummy_pitch = torch.rand((1, 1, 207)).long().to(device) @@ -38,6 +38,7 @@ def test_acoustic_model(): args.num_mels = 100 acoustic_model = AcousticModel(args=args, tokenizer=tokenizer, speaker_manager=None).to(device) + acoustic_model = acoustic_model.train() output = acoustic_model( tokens=dummy_tokens, @@ -51,16 +52,12 @@ def test_acoustic_model(): speaker_idx=None, ) assert list(output["model_outputs"].shape) == [1, 207, 100] - output["model_outputs"].sum().backward() + # output["model_outputs"].sum().backward() def test_hifi_decoder(): dummy_input = torch.rand((1, 207, 100)).to(device) - dummy_text_lens = torch.tensor([41]).to(device) - dummy_spec = torch.rand((1, 100, 207)).to(device) dummy_spec_lens = torch.tensor([207]).to(device) - dummy_pitch = torch.rand((1, 1, 207)).long().to(device) - dummy_energy = torch.rand((1, 1, 207)).long().to(device) waveform_decoder = HifiganGenerator( 100, @@ -77,6 +74,7 @@ def test_hifi_decoder(): conv_post_weight_norm=False, conv_post_bias=False, ).to(device) + waveform_decoder = waveform_decoder.train() vocoder_input_slices, slice_ids = rand_segments( # pylint: disable=unused-variable x=dummy_input.transpose(1, 2), @@ -88,4 +86,4 @@ def test_hifi_decoder(): outputs = waveform_decoder(x=vocoder_input_slices.detach()) assert list(outputs.shape) == [1, 1, 8192] - outputs.sum().backward() + # outputs.sum().backward() diff --git a/tests/vocoder_tests/test_vocoder_losses.py b/tests/vocoder_tests/test_vocoder_losses.py index 2a35aa2e37..95501c2d39 100644 --- a/tests/vocoder_tests/test_vocoder_losses.py +++ b/tests/vocoder_tests/test_vocoder_losses.py @@ -5,6 +5,7 @@ from tests import get_tests_input_path, get_tests_output_path, get_tests_path from TTS.config import BaseAudioConfig from TTS.utils.audio import AudioProcessor 
+from TTS.utils.audio.numpy_transforms import stft
 from TTS.vocoder.layers.losses import MelganFeatureLoss, MultiScaleSTFTLoss, STFTLoss, TorchSTFT

 TESTS_PATH = get_tests_path()
@@ -21,7 +22,7 @@ def test_torch_stft():
     torch_stft = TorchSTFT(ap.fft_size, ap.hop_length, ap.win_length)
     # librosa stft
     wav = ap.load_wav(WAV_FILE)
-    M_librosa = abs(ap._stft(wav))  # pylint: disable=protected-access
+    M_librosa = abs(stft(y=wav, fft_size=ap.fft_size, hop_length=ap.hop_length, win_length=ap.win_length))
     # torch stft
     wav = torch.from_numpy(wav[None, :]).float()
     M_torch = torch_stft(wav)
diff --git a/tests/xtts_tests/test_xtts_gpt_train.py b/tests/xtts_tests/test_xtts_gpt_train.py
new file mode 100644
index 0000000000..b8b9a4e388
--- /dev/null
+++ b/tests/xtts_tests/test_xtts_gpt_train.py
@@ -0,0 +1,163 @@
+import os
+import shutil
+
+import torch
+from trainer import Trainer, TrainerArgs
+
+from tests import get_tests_output_path
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.layers.xtts.dvae import DiscreteVAE
+from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+
+config_dataset = BaseDatasetConfig(
+    formatter="ljspeech",
+    dataset_name="ljspeech",
+    path="tests/data/ljspeech/",
+    meta_file_train="metadata.csv",
+    meta_file_val="metadata.csv",
+    language="en",
+)
+
+DATASETS_CONFIG_LIST = [config_dataset]
+
+# Logging parameters
+RUN_NAME = "GPT_XTTS_LJSpeech_FT"
+PROJECT_NAME = "XTTS_trainer"
+DASHBOARD_LOGGER = "tensorboard"
+LOGGER_URI = None
+
+# Set here the path where the checkpoints will be saved. Default: ./run/training/
+OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests")
+os.makedirs(OUT_PATH, exist_ok=True)
+
+# Create a DVAE checkpoint and mel_norms file at test time.
+# DVAE parameters: training needs the DVAE to extract the DVAE tokens, so you must provide the paths for this model.
+DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth")  # DVAE checkpoint
+MEL_NORM_FILE = os.path.join(
+    OUT_PATH, "mel_stats.pth"
+)  # Mel spectrogram norms, required for DVAE mel spectrogram extraction
+dvae = DiscreteVAE(
+    channels=80,
+    normalization=None,
+    positional_dims=1,
+    num_tokens=8192,
+    codebook_dim=512,
+    hidden_dim=512,
+    num_resnet_blocks=3,
+    kernel_size=3,
+    num_layers=2,
+    use_transposed_convs=False,
+)
+torch.save(dvae.state_dict(), DVAE_CHECKPOINT)
+mel_stats = torch.ones(80)  # dummy all-ones mel norms are enough for this smoke test
+torch.save(mel_stats, MEL_NORM_FILE)
+
+
+# XTTS transfer-learning parameters: you need to provide the path of the XTTS model checkpoint that you want to fine-tune.
+TOKENIZER_FILE = "tests/inputs/xtts_vocab.json"  # vocab.json file
+XTTS_CHECKPOINT = None  # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth"  # model.pth file
+
+
+# Training sentences generation
+SPEAKER_REFERENCE = [
+    "tests/data/ljspeech/wavs/LJ001-0002.wav"
+]  # speaker reference to be used in training test sentences
+LANGUAGE = config_dataset.language
+
+
+# Training Parameters
+OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-GPU training, please make it False
+START_WITH_EVAL = False  # if True it will start with evaluation
+BATCH_SIZE = 2  # set here the batch size
+GRAD_ACUMM_STEPS = 1  # set here the grad accumulation steps
+# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE, but then set GRAD_ACUMM_STEPS accordingly.
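+# For example (our own illustration, not part of the original recipe): BATCH_SIZE = 252 with
+# GRAD_ACUMM_STEPS = 1 and BATCH_SIZE = 126 with GRAD_ACUMM_STEPS = 2 both give an effective batch of
+# 252 samples per optimizer step; the tiny values above (2 * 1 = 2) just keep this smoke test cheap.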
+
+
+# init args and config
+model_args = GPTArgs(
+    max_conditioning_length=132300,  # 6 secs (6 * 22050 samples at 22050 Hz)
+    min_conditioning_length=66150,  # 3 secs (3 * 22050 samples)
+    debug_loading_failures=False,
+    max_wav_length=255995,  # ~11.6 seconds
+    max_text_length=200,
+    mel_norm_file=MEL_NORM_FILE,
+    dvae_checkpoint=DVAE_CHECKPOINT,
+    xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
+    tokenizer_file=TOKENIZER_FILE,
+    gpt_num_audio_tokens=8194,  # 8192 DVAE codebook tokens plus the two specials below
+    gpt_start_audio_token=8192,
+    gpt_stop_audio_token=8193,
+)
+audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+config = GPTTrainerConfig(
+    epochs=1,
+    output_path=OUT_PATH,
+    model_args=model_args,
+    run_name=RUN_NAME,
+    project_name=PROJECT_NAME,
+    run_description="""
+        GPT XTTS training
+        """,
+    dashboard_logger=DASHBOARD_LOGGER,
+    logger_uri=LOGGER_URI,
+    audio=audio_config,
+    batch_size=BATCH_SIZE,
+    batch_group_size=48,
+    eval_batch_size=BATCH_SIZE,
+    num_loader_workers=8,
+    eval_split_max_size=256,
+    print_step=50,
+    plot_step=100,
+    log_model_step=1000,
+    save_step=10000,
+    save_n_checkpoints=1,
+    save_checkpoints=True,
+    # target_loss="loss",
+    print_eval=False,
+    # Optimizer values follow Tortoise: a PyTorch AdamW implementation modified to not apply weight decay to non-weight parameters.
+    optimizer="AdamW",
+    optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+    optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+    lr=5e-06,  # learning rate
+    lr_scheduler="MultiStepLR",
+    # it was adjusted accordingly for the new step scheme
+    lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+    test_sentences=[
+        {
+            "text": "This cake is great. It's so delicious and moist.",
+            "speaker_wav": SPEAKER_REFERENCE,
+            "language": LANGUAGE,
+        },
+    ],
+)
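+# For reference (our description, not a comment from the original file): the "ljspeech" formatter used
+# by load_tts_samples below expects pipe-separated metadata.csv lines of the form
+# "file_id|transcription|normalized transcription", with audio resolved under tests/data/ljspeech/wavs/.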
+
+# init the model from config
+model = GPTTrainer.init_from_config(config)
+
+# load training samples
+train_samples, eval_samples = load_tts_samples(
+    DATASETS_CONFIG_LIST,
+    eval_split=True,
+    eval_split_max_size=config.eval_split_max_size,
+    eval_split_size=config.eval_split_size,
+)
+
+# init the trainer and 🚀
+trainer = Trainer(
+    TrainerArgs(
+        restore_path=None,  # the XTTS checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with the Trainer restore_path parameter
+        skip_train_epoch=False,
+        start_with_eval=True,
+        grad_accum_steps=GRAD_ACUMM_STEPS,
+    ),
+    config,
+    output_path=OUT_PATH,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+)
+trainer.fit()
+
+# remove the output path
+shutil.rmtree(OUT_PATH)
diff --git a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py b/tests/xtts_tests/test_xtts_v2-0_gpt_train.py
new file mode 100644
index 0000000000..6663433c12
--- /dev/null
+++ b/tests/xtts_tests/test_xtts_v2-0_gpt_train.py
@@ -0,0 +1,163 @@
+import os
+import shutil
+
+import torch
+from trainer import Trainer, TrainerArgs
+
+from tests import get_tests_output_path
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.layers.xtts.dvae import DiscreteVAE
+from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+
+config_dataset = BaseDatasetConfig(
+    formatter="ljspeech",
+    dataset_name="ljspeech",
+    path="tests/data/ljspeech/",
+    meta_file_train="metadata.csv",
+    meta_file_val="metadata.csv",
+    language="en",
+)
+
+DATASETS_CONFIG_LIST = [config_dataset]
+
+# Logging parameters
+RUN_NAME = "GPT_XTTS_LJSpeech_FT"
+PROJECT_NAME = "XTTS_trainer"
+DASHBOARD_LOGGER = "tensorboard"
+LOGGER_URI = None
+
+OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests")
+os.makedirs(OUT_PATH, exist_ok=True)
+
+# Create a DVAE checkpoint and mel_norms file at test time.
+# DVAE parameters: training needs the DVAE to extract the DVAE tokens, so you must provide the paths for this model.
+DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth")  # DVAE checkpoint
+# Mel spectrogram norms, required for DVAE mel spectrogram extraction
+MEL_NORM_FILE = os.path.join(OUT_PATH, "mel_stats.pth")
+dvae = DiscreteVAE(
+    channels=80,
+    normalization=None,
+    positional_dims=1,
+    num_tokens=8192,
+    codebook_dim=512,
+    hidden_dim=512,
+    num_resnet_blocks=3,
+    kernel_size=3,
+    num_layers=2,
+    use_transposed_convs=False,
+)
+torch.save(dvae.state_dict(), DVAE_CHECKPOINT)
+mel_stats = torch.ones(80)  # dummy all-ones mel norms are enough for this smoke test
+torch.save(mel_stats, MEL_NORM_FILE)
+
+
+# XTTS transfer-learning parameters: you need to provide the path of the XTTS model checkpoint that you want to fine-tune.
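+# (Note: XTTS_CHECKPOINT is left as None in this test, so presumably no pretrained weights are restored
+# and the GPT component trains from a fresh initialization; a real fine-tuning run would point this at
+# a downloaded model.pth.)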
+TOKENIZER_FILE = "tests/inputs/xtts_vocab.json"  # vocab.json file
+XTTS_CHECKPOINT = None  # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth"  # model.pth file
+
+
+# Training sentences generation
+SPEAKER_REFERENCE = [
+    "tests/data/ljspeech/wavs/LJ001-0002.wav"
+]  # speaker reference to be used in training test sentences
+LANGUAGE = config_dataset.language
+
+
+# Training Parameters
+OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-GPU training, please make it False
+START_WITH_EVAL = False  # if True it will start with evaluation
+BATCH_SIZE = 2  # set here the batch size
+GRAD_ACUMM_STEPS = 1  # set here the grad accumulation steps
+# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE, but then set GRAD_ACUMM_STEPS accordingly.
+
+
+# init args and config
+model_args = GPTArgs(
+    max_conditioning_length=132300,  # 6 secs (6 * 22050 samples at 22050 Hz)
+    min_conditioning_length=66150,  # 3 secs (3 * 22050 samples)
+    debug_loading_failures=False,
+    max_wav_length=255995,  # ~11.6 seconds
+    max_text_length=200,
+    mel_norm_file=MEL_NORM_FILE,
+    dvae_checkpoint=DVAE_CHECKPOINT,
+    xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
+    tokenizer_file=TOKENIZER_FILE,
+    gpt_num_audio_tokens=8194,  # 8192 DVAE codebook tokens plus the two specials below
+    gpt_start_audio_token=8192,
+    gpt_stop_audio_token=8193,
+    gpt_use_masking_gt_prompt_approach=True,
+    gpt_use_perceiver_resampler=True,
+)
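+# (Our observation: aside from run metadata, these model_args differ from the v1.1 test above only in
+# the two extra flags, gpt_use_masking_gt_prompt_approach and gpt_use_perceiver_resampler, which
+# presumably switch on the XTTS v2 conditioning scheme.)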
+
+audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+
+config = GPTTrainerConfig(
+    epochs=1,
+    output_path=OUT_PATH,
+    model_args=model_args,
+    run_name=RUN_NAME,
+    project_name=PROJECT_NAME,
+    run_description="GPT XTTS training",
+    dashboard_logger=DASHBOARD_LOGGER,
+    logger_uri=LOGGER_URI,
+    audio=audio_config,
+    batch_size=BATCH_SIZE,
+    batch_group_size=48,
+    eval_batch_size=BATCH_SIZE,
+    num_loader_workers=8,
+    eval_split_max_size=256,
+    print_step=50,
+    plot_step=100,
+    log_model_step=1000,
+    save_step=10000,
+    save_n_checkpoints=1,
+    save_checkpoints=True,
+    # target_loss="loss",
+    print_eval=False,
+    # Optimizer values follow Tortoise: a PyTorch AdamW implementation modified to not apply weight decay to non-weight parameters.
+    optimizer="AdamW",
+    optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+    optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+    lr=5e-06,  # learning rate
+    lr_scheduler="MultiStepLR",
+    # it was adjusted accordingly for the new step scheme
+    lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+    test_sentences=[
+        {
+            "text": "This cake is great. It's so delicious and moist.",
+            "speaker_wav": SPEAKER_REFERENCE,
+            "language": LANGUAGE,
+        },
+    ],
+)
+
+# init the model from config
+model = GPTTrainer.init_from_config(config)
+
+# load training samples
+train_samples, eval_samples = load_tts_samples(
+    DATASETS_CONFIG_LIST,
+    eval_split=True,
+    eval_split_max_size=config.eval_split_max_size,
+    eval_split_size=config.eval_split_size,
+)
+
+# init the trainer and 🚀
+trainer = Trainer(
+    TrainerArgs(
+        restore_path=None,  # the XTTS checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with the Trainer restore_path parameter
+        skip_train_epoch=False,
+        start_with_eval=True,
+        grad_accum_steps=GRAD_ACUMM_STEPS,
+    ),
+    config,
+    output_path=OUT_PATH,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+)
+trainer.fit()
+
+# remove the output path
+shutil.rmtree(OUT_PATH)
diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py
index db144f1ccf..8fa56e287a 100644
--- a/tests/zoo_tests/test_models.py
+++ b/tests/zoo_tests/test_models.py
@@ -14,8 +14,8 @@
 MODELS_WITH_SEP_TESTS = [
     "tts_models/multilingual/multi-dataset/bark",
     "tts_models/en/multi-dataset/tortoise-v2",
-    "tts_models/multilingual/multi-dataset/xtts_v1",
     "tts_models/multilingual/multi-dataset/xtts_v1.1",
+    "tts_models/multilingual/multi-dataset/xtts_v2",
 ]


@@ -82,14 +82,14 @@ def test_xtts():
     if use_gpu:
         run_cli(
             "yes | "
-            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 "
+            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
             f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True '
             f'--speaker_wav "{speaker_wav}" --language_idx "en"'
         )
     else:
         run_cli(
             "yes | "
-            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 "
+            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
             f'--text "This is an example." --out_path "{output_path}" --progress_bar False '
             f'--speaker_wav "{speaker_wav}" --language_idx "en"'
         )
@@ -99,8 +99,64 @@
 def test_xtts_streaming():
     """Testing the new inference_stream method"""
     from TTS.tts.configs.xtts_config import XttsConfig
     from TTS.tts.models.xtts import Xtts
+
+    speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
+    speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav")
+    speaker_wav.append(speaker_wav_2)
+    model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1.1")
+    config = XttsConfig()
+    config.load_json(os.path.join(model_path, "config.json"))
+    model = Xtts.init_from_config(config)
+    model.load_checkpoint(config, checkpoint_dir=model_path)
+    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+
+    print("Computing speaker latents...")
+    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
+
+    print("Inference...")
+    chunks = model.inference_stream(
+        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+        "en",
+        gpt_cond_latent,
+        speaker_embedding,
+    )
+    wav_chuncks = []
+    for i, chunk in enumerate(chunks):
+        if i == 0:
+            assert chunk.shape[-1] > 5000
+        wav_chuncks.append(chunk)
+    assert len(wav_chuncks) > 1
+
+
+def test_xtts_v2():
+    """XTTS is too big to run on GitHub Actions, so we test it locally."""
+    output_path = os.path.join(get_tests_output_path(), "output.wav")
     speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
-    model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1")
+    speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav")
+    use_gpu = torch.cuda.is_available()
+    if use_gpu:
+        run_cli(
+            "yes | "
+            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 "
+            f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True '
+            f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"'
+        )
+    else:
+        run_cli(
+            "yes | "
+            f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 "
+            f'--text "This is an example." --out_path "{output_path}" --progress_bar False '
+            f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"'
+        )
+
+
+def test_xtts_v2_streaming():
+    """Testing the new inference_stream method"""
+    from TTS.tts.configs.xtts_config import XttsConfig
+    from TTS.tts.models.xtts import Xtts
+
+    speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
+    model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
     config = XttsConfig()
     config.load_json(os.path.join(model_path, "config.json"))
     model = Xtts.init_from_config(config)
@@ -108,14 +164,14 @@ def test_xtts_streaming():
     model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

     print("Computing speaker latents...")
-    gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
+    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)

     print("Inference...")
     chunks = model.inference_stream(
         "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
         "en",
         gpt_cond_latent,
-        speaker_embedding
+        speaker_embedding,
     )
     wav_chuncks = []
     for i, chunk in enumerate(chunks):
@@ -123,6 +179,34 @@
             assert chunk.shape[-1] > 5000
         wav_chuncks.append(chunk)
     assert len(wav_chuncks) > 1
+    normal_len = sum([len(chunk) for chunk in wav_chuncks])
+
+    chunks = model.inference_stream(
+        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+        "en",
+        gpt_cond_latent,
+        speaker_embedding,
+        speed=1.5,
+    )
+    wav_chuncks = []
+    for i, chunk in enumerate(chunks):
+        wav_chuncks.append(chunk)
+    fast_len = sum([len(chunk) for chunk in wav_chuncks])
+
+    chunks = model.inference_stream(
+        "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
+        "en",
+        gpt_cond_latent,
+        speaker_embedding,
+        speed=0.66,
+    )
+    wav_chuncks = []
+    for i, chunk in enumerate(chunks):
+        wav_chuncks.append(chunk)
+    slow_len = sum([len(chunk) for chunk in wav_chuncks])
+
+    # higher speed yields fewer output samples for the same text, so: fast < normal < slow
+    assert slow_len > normal_len
+    assert normal_len > fast_len


 def test_tortoise():