Skip to content

Commit

Permalink
Merge pull request #3162 from lissyx/update-r0.8
Browse files Browse the repository at this point in the history
Update r0.8
  • Loading branch information
lissyx authored Jul 15, 2020
2 parents 19fb172 + 820350f commit 32e185f
Show file tree
Hide file tree
Showing 67 changed files with 428 additions and 285 deletions.
89 changes: 89 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# These environment variables must be set in CircleCI UI
#
# DOCKERHUB_REPO - docker hub repo, format: <username>/<repo>
# DOCKER_USER - login info for docker hub
# DOCKER_PASS
#
version: 2
# Two-job pipeline: "build" produces the training Docker image and caches it;
# "deploy" restores it from the cache and pushes it to Docker Hub.
jobs:
  build:
    docker:
      - image: docker:stable-git
    working_directory: /dockerflow
    steps:
      - checkout
      - setup_remote_docker

      - run:
          name: os-release
          command: |
            cat /etc/os-release
      - run:
          name: install make
          command: |
            apk add make
      - run:
          name: Create a Dockerfile.train
          # Renders Dockerfile.train from its template, pinned to the exact
          # repo/commit CircleCI is building.
          command: |
            make Dockerfile.train \
              DEEPSPEECH_REPO="https://github.com/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME" \
              DEEPSPEECH_SHA=$CIRCLE_SHA1
      - run:
          name: Build Docker image
          command: docker build -t app:build -f Dockerfile.train .

      # save the built docker container into CircleCI's cache. This is
      # required since Workflows do not have the same remote docker instance.
      - run:
          name: docker save app:build
          command: mkdir -p /cache; docker save -o /cache/docker.tar "app:build"
      - save_cache:
          # {{epoch}} makes every build write a fresh cache entry; restore_cache
          # below matches on the v1-<branch> prefix.
          key: v1-{{ .Branch }}-{{epoch}}
          paths:
            - /cache/docker.tar

  deploy:
    docker:
      - image: docker:18.02.0-ce
    steps:
      - setup_remote_docker
      - restore_cache:
          key: v1-{{.Branch}}
      - run:
          name: Restore Docker image cache
          command: docker load -i /cache/docker.tar

      - run:
          name: Deploy to Dockerhub
          command: |
            echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin
            # deploy master
            if [ "${CIRCLE_BRANCH}" == "master" ]; then
              docker tag app:build ${DOCKERHUB_REPO}:latest
              docker push ${DOCKERHUB_REPO}:latest
            elif [ ! -z "${CIRCLE_TAG}" ]; then
              # deploy a release tag...
              echo "${DOCKERHUB_REPO}:${CIRCLE_TAG}"
              docker tag app:build "${DOCKERHUB_REPO}:${CIRCLE_TAG}"
              docker images
              docker push "${DOCKERHUB_REPO}:${CIRCLE_TAG}"
            fi
workflows:
  version: 2
  build-deploy:
    jobs:
      - build:
          filters:
            tags:
              only: /.*/

      - deploy:
          requires:
            - build
          filters:
            tags:
              only: /.*/
3 changes: 3 additions & 0 deletions Dockerfile.build.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1
RUN curl -LO "https://github.com/bazelbuild/bazel/releases/download/2.0.0/bazel_2.0.0-linux-x86_64.deb"
RUN dpkg -i bazel_*.deb

# Try and free some space
RUN rm -rf /var/lib/apt/lists/*

# << END Install base software

# >> START Configure Tensorflow Build
Expand Down
3 changes: 3 additions & 0 deletions Dockerfile.train.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ RUN apt-get purge -y python3-xdg
# Install dependencies for audio augmentation
RUN apt-get install -y --no-install-recommends libopus0 libsndfile1

# Try and free some space
RUN rm -rf /var/lib/apt/lists/*

WORKDIR /
RUN git lfs install
RUN git clone $DEEPSPEECH_REPO
Expand Down
19 changes: 18 additions & 1 deletion bin/build_sdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import progressbar

from deepspeech_training.util.audio import (
AUDIO_TYPE_PCM,
AUDIO_TYPE_OPUS,
AUDIO_TYPE_WAV,
change_audio_types,
Expand All @@ -17,17 +18,28 @@
DirectSDBWriter,
samples_from_sources,
)
from deepspeech_training.util.augmentations import (
parse_augmentations,
apply_sample_augmentations,
SampleAugmentation
)

AUDIO_TYPE_LOOKUP = {"wav": AUDIO_TYPE_WAV, "opus": AUDIO_TYPE_OPUS}


def build_sdb():
audio_type = AUDIO_TYPE_LOOKUP[CLI_ARGS.audio_type]
augmentations = parse_augmentations(CLI_ARGS.augment)
if any(not isinstance(a, SampleAugmentation) for a in augmentations):
print("Warning: Some of the augmentations cannot be applied by this command.")
with DirectSDBWriter(
CLI_ARGS.target, audio_type=audio_type, labeled=not CLI_ARGS.unlabeled
) as sdb_writer:
samples = samples_from_sources(CLI_ARGS.sources, labeled=not CLI_ARGS.unlabeled)
bar = progressbar.ProgressBar(max_value=len(samples), widgets=SIMPLE_BAR)
num_samples = len(samples)
if augmentations:
samples = apply_sample_augmentations(samples, audio_type=AUDIO_TYPE_PCM, augmentations=augmentations)
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
for sample in bar(
change_audio_types(samples, audio_type=audio_type, bitrate=CLI_ARGS.bitrate, processes=CLI_ARGS.workers)
):
Expand Down Expand Up @@ -67,6 +79,11 @@ def handle_args():
help="If to build an SDB with unlabeled (audio only) samples - "
"typically used for building noise augmentation corpora",
)
parser.add_argument(
"--augment",
action='append',
help="Add an augmentation operation",
)
return parser.parse_args()


Expand Down
91 changes: 44 additions & 47 deletions bin/import_cv2.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
Use "python3 import_cv2.py -h" for help
"""
import csv
import itertools
import os
import subprocess
import unicodedata
Expand All @@ -24,27 +23,39 @@
get_validate_label,
print_import_report,
)
from deepspeech_training.util.text import Alphabet
from ds_ctcdecoder import Alphabet

FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
MAX_SECS = 10
PARAMS = None
FILTER_OBJ = None


def _preprocess_data(tsv_dir, audio_dir, filter_obj, space_after_every_character=False):
exclude = []
for dataset in ["test", "dev", "train", "validated", "other"]:
set_samples = _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_every_character)
if dataset in ["test", "dev"]:
exclude += set_samples
if dataset == "validated":
_maybe_convert_set("train-all", tsv_dir, audio_dir, filter_obj, space_after_every_character,
rows=set_samples, exclude=exclude)
class LabelFilter:
    """Cleans and validates a transcript label before import.

    Combines optional ASCII normalization, a project-supplied validation
    function, and an optional alphabet check into a single ``filter`` call.
    """

    def __init__(self, normalize, alphabet, validate_fun):
        # Store configuration only; the work happens per-label in filter().
        self.normalize = normalize
        self.alphabet = alphabet
        self.validate_fun = validate_fun

    def filter(self, label):
        """Return the cleaned label, or None when it must be dropped."""
        if self.normalize:
            decomposed = unicodedata.normalize("NFKD", label.strip())
            # Strip any character that has no ASCII representation.
            label = decomposed.encode("ascii", "ignore").decode("ascii", "ignore")
        label = self.validate_fun(label)
        if label and self.alphabet and not self.alphabet.CanEncode(label):
            return None
        return label


def one_sample(args):
def init_worker(params):
    """Multiprocessing pool initializer: build this worker's label filter.

    Runs once in each worker process and stores the result in the
    module-level FILTER_OBJ so one_sample() can reach it without pickling.
    """
    global FILTER_OBJ  # pylint: disable=global-statement
    validate_fun = get_validate_label(params)
    if params.filter_alphabet:
        alphabet = Alphabet(params.filter_alphabet)
    else:
        alphabet = None
    FILTER_OBJ = LabelFilter(params.normalize, alphabet, validate_fun)


def one_sample(sample):
""" Take an audio file, and optionally convert it to 16kHz WAV """
sample, filter_obj = args
mp3_filename = sample[0]
if not os.path.splitext(mp3_filename.lower())[1] == ".mp3":
mp3_filename += ".mp3"
Expand All @@ -60,7 +71,7 @@ def one_sample(args):
["soxi", "-s", wav_filename], stderr=subprocess.STDOUT
)
)
label = filter_obj.filter(sample[1])
label = FILTER_OBJ.filter(sample[1])
rows = []
counter = get_counter()
if file_size == -1:
Expand Down Expand Up @@ -110,10 +121,9 @@ def _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_ever
num_samples = len(samples)

print("Importing mp3 files...")
pool = Pool()
pool = Pool(initializer=init_worker, initargs=(PARAMS,))
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
samples_with_context = itertools.zip_longest(samples, [], fillvalue=filter_obj)
for i, processed in enumerate(pool.imap_unordered(one_sample, samples_with_context), start=1):
for i, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1):
counter += processed[0]
rows += processed[1]
bar.update(i)
Expand Down Expand Up @@ -155,6 +165,17 @@ def _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_ever
return rows


def _preprocess_data(tsv_dir, audio_dir, space_after_every_character=False):
    """Convert every CommonVoice split, then derive "train-all" from "validated".

    Samples already used by the held-out "test"/"dev" splits are collected and
    excluded from the combined "train-all" set so they cannot leak into training.
    """
    exclude = []
    for dataset in ("test", "dev", "train", "validated", "other"):
        set_samples = _maybe_convert_set(
            dataset, tsv_dir, audio_dir, space_after_every_character
        )
        if dataset in ("test", "dev"):
            exclude.extend(set_samples)
        if dataset == "validated":
            _maybe_convert_set(
                "train-all",
                tsv_dir,
                audio_dir,
                space_after_every_character,
                rows=set_samples,
                exclude=exclude,
            )


def _maybe_convert_wav(mp3_filename, wav_filename):
if not os.path.exists(wav_filename):
transformer = sox.Transformer()
Expand All @@ -164,28 +185,8 @@ def _maybe_convert_wav(mp3_filename, wav_filename):
except sox.core.SoxError:
pass

class LabelFilter:
def __init__(self, normalize, alphabet, validate_fun):
self.normalize = normalize
self.alphabet = alphabet
self.validate_fun = validate_fun

def filter(self, label):
if self.normalize:
label = (
unicodedata.normalize("NFKD", label.strip())
.encode("ascii", "ignore")
.decode("ascii", "ignore")
)
label = self.validate_fun(label)
if self.alphabet and label:
try:
self.alphabet.encode(label)
except KeyError:
label = None
return label

def main():
def parse_args():
parser = get_importers_parser(description="Import CommonVoice v2.0 corpora")
parser.add_argument("tsv_dir", help="Directory containing tsv files")
parser.add_argument(
Expand All @@ -206,18 +207,14 @@ def main():
action="store_true",
help="To help transcript join by white space",
)
return parser.parse_args()

params = parser.parse_args()
validate_label = get_validate_label(params)

audio_dir = (
params.audio_dir if params.audio_dir else os.path.join(params.tsv_dir, "clips")
)
alphabet = Alphabet(params.filter_alphabet) if params.filter_alphabet else None
def main():
    """Entry point: resolve the audio clips directory and run the import."""
    if PARAMS.audio_dir:
        audio_dir = PARAMS.audio_dir
    else:
        # Default CommonVoice layout keeps audio under <tsv_dir>/clips.
        audio_dir = os.path.join(PARAMS.tsv_dir, "clips")
    _preprocess_data(PARAMS.tsv_dir, audio_dir, PARAMS.space_after_every_character)

filter_obj = LabelFilter(params.normalize, alphabet, validate_label)
_preprocess_data(params.tsv_dir, audio_dir, filter_obj,
params.space_after_every_character)

if __name__ == "__main__":
PARAMS = parse_args()
main()
11 changes: 4 additions & 7 deletions bin/import_lingua_libre.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
get_validate_label,
print_import_report,
)
from deepspeech_training.util.text import Alphabet
from ds_ctcdecoder import Alphabet

FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
Expand Down Expand Up @@ -198,7 +198,7 @@ def handle_args():
"--iso639-3", type=str, required=True, help="ISO639-3 language code"
)
parser.add_argument(
"--english-name", type=str, required=True, help="Enligh name of the language"
"--english-name", type=str, required=True, help="English name of the language"
)
parser.add_argument(
"--filter_alphabet",
Expand Down Expand Up @@ -242,11 +242,8 @@ def label_filter(label):
.decode("ascii", "ignore")
)
label = validate_label(label)
if ALPHABET and label:
try:
ALPHABET.encode(label)
except KeyError:
label = None
if ALPHABET and label and not ALPHABET.CanEncode(label):
label = None
return label

ARCHIVE_NAME = ARCHIVE_NAME.format(
Expand Down
9 changes: 3 additions & 6 deletions bin/import_m-ailabs.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
get_validate_label,
print_import_report,
)
from deepspeech_training.util.text import Alphabet
from ds_ctcdecoder import Alphabet

FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
Expand Down Expand Up @@ -215,11 +215,8 @@ def label_filter(label):
.decode("ascii", "ignore")
)
label = validate_label(label)
if ALPHABET and label:
try:
ALPHABET.encode(label)
except KeyError:
label = None
if ALPHABET and label and not ALPHABET.CanEncode(label):
label = None
return label

ARCHIVE_DIR_NAME = ARCHIVE_DIR_NAME.format(language=CLI_ARGS.language)
Expand Down
Loading

0 comments on commit 32e185f

Please sign in to comment.