Skip to content

Commit

Permalink
Merge pull request #3162 from lissyx/update-r0.8
Browse files Browse the repository at this point in the history
Update r0.8
  • Loading branch information
lissyx authored Jul 15, 2020
2 parents 19fb172 + 820350f commit 32e185f
Show file tree
Hide file tree
Showing 67 changed files with 428 additions and 285 deletions.
89 changes: 89 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# These environment variables must be set in CircleCI UI
#
# DOCKERHUB_REPO - docker hub repo, format: <username>/<repo>
# DOCKER_USER - login info for docker hub
# DOCKER_PASS
#
version: 2
# Two-job pipeline: "build" produces the training Docker image and caches it;
# "deploy" restores it from the cache and pushes it to Docker Hub.
jobs:
  build:
    docker:
      - image: docker:stable-git
    working_directory: /dockerflow
    steps:
      - checkout
      - setup_remote_docker

      - run:
          name: os-release
          command: |
            cat /etc/os-release
      - run:
          name: install make
          command: |
            apk add make
      - run:
          name: Create a Dockerfile.train
          # Renders Dockerfile.train from its template, pinned to the exact
          # repo/commit CircleCI is building.
          command: |
            make Dockerfile.train \
              DEEPSPEECH_REPO="https://github.com/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME" \
              DEEPSPEECH_SHA=$CIRCLE_SHA1
      - run:
          name: Build Docker image
          command: docker build -t app:build -f Dockerfile.train .

      # save the built docker container into CircleCI's cache. This is
      # required since Workflows do not have the same remote docker instance.
      - run:
          name: docker save app:build
          command: mkdir -p /cache; docker save -o /cache/docker.tar "app:build"
      - save_cache:
          # {{epoch}} makes every build write a fresh cache entry; restore_cache
          # below matches on the v1-<branch> prefix.
          key: v1-{{ .Branch }}-{{epoch}}
          paths:
            - /cache/docker.tar

  deploy:
    docker:
      - image: docker:18.02.0-ce
    steps:
      - setup_remote_docker
      - restore_cache:
          key: v1-{{.Branch}}
      - run:
          name: Restore Docker image cache
          command: docker load -i /cache/docker.tar

      - run:
          name: Deploy to Dockerhub
          command: |
            echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin
            # deploy master
            if [ "${CIRCLE_BRANCH}" == "master" ]; then
              docker tag app:build ${DOCKERHUB_REPO}:latest
              docker push ${DOCKERHUB_REPO}:latest
            elif [ ! -z "${CIRCLE_TAG}" ]; then
              # deploy a release tag...
              echo "${DOCKERHUB_REPO}:${CIRCLE_TAG}"
              docker tag app:build "${DOCKERHUB_REPO}:${CIRCLE_TAG}"
              docker images
              docker push "${DOCKERHUB_REPO}:${CIRCLE_TAG}"
            fi
workflows:
  version: 2
  build-deploy:
    jobs:
      - build:
          filters:
            tags:
              only: /.*/

      - deploy:
          requires:
            - build
          filters:
            tags:
              only: /.*/
3 changes: 3 additions & 0 deletions Dockerfile.build.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1
RUN curl -LO "https://github.com/bazelbuild/bazel/releases/download/2.0.0/bazel_2.0.0-linux-x86_64.deb"
RUN dpkg -i bazel_*.deb

# Try and free some space
RUN rm -rf /var/lib/apt/lists/*

# << END Install base software

# >> START Configure Tensorflow Build
Expand Down
3 changes: 3 additions & 0 deletions Dockerfile.train.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ RUN apt-get purge -y python3-xdg
# Install dependencies for audio augmentation
RUN apt-get install -y --no-install-recommends libopus0 libsndfile1

# Try and free some space
RUN rm -rf /var/lib/apt/lists/*

WORKDIR /
RUN git lfs install
RUN git clone $DEEPSPEECH_REPO
Expand Down
19 changes: 18 additions & 1 deletion bin/build_sdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import progressbar

from deepspeech_training.util.audio import (
AUDIO_TYPE_PCM,
AUDIO_TYPE_OPUS,
AUDIO_TYPE_WAV,
change_audio_types,
Expand All @@ -17,17 +18,28 @@
DirectSDBWriter,
samples_from_sources,
)
from deepspeech_training.util.augmentations import (
parse_augmentations,
apply_sample_augmentations,
SampleAugmentation
)

AUDIO_TYPE_LOOKUP = {"wav": AUDIO_TYPE_WAV, "opus": AUDIO_TYPE_OPUS}


def build_sdb():
audio_type = AUDIO_TYPE_LOOKUP[CLI_ARGS.audio_type]
augmentations = parse_augmentations(CLI_ARGS.augment)
if any(not isinstance(a, SampleAugmentation) for a in augmentations):
print("Warning: Some of the augmentations cannot be applied by this command.")
with DirectSDBWriter(
CLI_ARGS.target, audio_type=audio_type, labeled=not CLI_ARGS.unlabeled
) as sdb_writer:
samples = samples_from_sources(CLI_ARGS.sources, labeled=not CLI_ARGS.unlabeled)
bar = progressbar.ProgressBar(max_value=len(samples), widgets=SIMPLE_BAR)
num_samples = len(samples)
if augmentations:
samples = apply_sample_augmentations(samples, audio_type=AUDIO_TYPE_PCM, augmentations=augmentations)
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
for sample in bar(
change_audio_types(samples, audio_type=audio_type, bitrate=CLI_ARGS.bitrate, processes=CLI_ARGS.workers)
):
Expand Down Expand Up @@ -67,6 +79,11 @@ def handle_args():
help="If to build an SDB with unlabeled (audio only) samples - "
"typically used for building noise augmentation corpora",
)
parser.add_argument(
"--augment",
action='append',
help="Add an augmentation operation",
)
return parser.parse_args()


Expand Down
91 changes: 44 additions & 47 deletions bin/import_cv2.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
Use "python3 import_cv2.py -h" for help
"""
import csv
import itertools
import os
import subprocess
import unicodedata
Expand All @@ -24,27 +23,39 @@
get_validate_label,
print_import_report,
)
from deepspeech_training.util.text import Alphabet
from ds_ctcdecoder import Alphabet

FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
MAX_SECS = 10
PARAMS = None
FILTER_OBJ = None


def _preprocess_data(tsv_dir, audio_dir, filter_obj, space_after_every_character=False):
exclude = []
for dataset in ["test", "dev", "train", "validated", "other"]:
set_samples = _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_every_character)
if dataset in ["test", "dev"]:
exclude += set_samples
if dataset == "validated":
_maybe_convert_set("train-all", tsv_dir, audio_dir, filter_obj, space_after_every_character,
rows=set_samples, exclude=exclude)
class LabelFilter:
    """Cleans and validates a transcript label before import.

    Combines optional ASCII normalization, a project-supplied validation
    function, and an optional alphabet check into a single ``filter`` call.
    """

    def __init__(self, normalize, alphabet, validate_fun):
        # Store configuration only; the work happens per-label in filter().
        self.normalize = normalize
        self.alphabet = alphabet
        self.validate_fun = validate_fun

    def filter(self, label):
        """Return the cleaned label, or None when it must be dropped."""
        if self.normalize:
            decomposed = unicodedata.normalize("NFKD", label.strip())
            # Strip any character that has no ASCII representation.
            label = decomposed.encode("ascii", "ignore").decode("ascii", "ignore")
        label = self.validate_fun(label)
        if label and self.alphabet and not self.alphabet.CanEncode(label):
            return None
        return label


def one_sample(args):
def init_worker(params):
    """Multiprocessing pool initializer: build this worker's label filter.

    Runs once in each worker process and stores the result in the
    module-level FILTER_OBJ so one_sample() can reach it without pickling.
    """
    global FILTER_OBJ  # pylint: disable=global-statement
    validate_fun = get_validate_label(params)
    if params.filter_alphabet:
        alphabet = Alphabet(params.filter_alphabet)
    else:
        alphabet = None
    FILTER_OBJ = LabelFilter(params.normalize, alphabet, validate_fun)


def one_sample(sample):
""" Take an audio file, and optionally convert it to 16kHz WAV """
sample, filter_obj = args
mp3_filename = sample[0]
if not os.path.splitext(mp3_filename.lower())[1] == ".mp3":
mp3_filename += ".mp3"
Expand All @@ -60,7 +71,7 @@ def one_sample(args):
["soxi", "-s", wav_filename], stderr=subprocess.STDOUT
)
)
label = filter_obj.filter(sample[1])
label = FILTER_OBJ.filter(sample[1])
rows = []
counter = get_counter()
if file_size == -1:
Expand Down Expand Up @@ -110,10 +121,9 @@ def _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_ever
num_samples = len(samples)

print("Importing mp3 files...")
pool = Pool()
pool = Pool(initializer=init_worker, initargs=(PARAMS,))
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
samples_with_context = itertools.zip_longest(samples, [], fillvalue=filter_obj)
for i, processed in enumerate(pool.imap_unordered(one_sample, samples_with_context), start=1):
for i, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1):
counter += processed[0]
rows += processed[1]
bar.update(i)
Expand Down Expand Up @@ -155,6 +165,17 @@ def _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_ever
return rows


def _preprocess_data(tsv_dir, audio_dir, space_after_every_character=False):
    """Convert every CommonVoice split, then derive "train-all" from "validated".

    Samples already used by the held-out "test"/"dev" splits are collected and
    excluded from the combined "train-all" set so they cannot leak into training.
    """
    exclude = []
    for dataset in ("test", "dev", "train", "validated", "other"):
        set_samples = _maybe_convert_set(
            dataset, tsv_dir, audio_dir, space_after_every_character
        )
        if dataset in ("test", "dev"):
            exclude.extend(set_samples)
        if dataset == "validated":
            _maybe_convert_set(
                "train-all",
                tsv_dir,
                audio_dir,
                space_after_every_character,
                rows=set_samples,
                exclude=exclude,
            )


def _maybe_convert_wav(mp3_filename, wav_filename):
if not os.path.exists(wav_filename):
transformer = sox.Transformer()
Expand All @@ -164,28 +185,8 @@ def _maybe_convert_wav(mp3_filename, wav_filename):
except sox.core.SoxError:
pass

class LabelFilter:
def __init__(self, normalize, alphabet, validate_fun):
self.normalize = normalize
self.alphabet = alphabet
self.validate_fun = validate_fun

def filter(self, label):
if self.normalize:
label = (
unicodedata.normalize("NFKD", label.strip())
.encode("ascii", "ignore")
.decode("ascii", "ignore")
)
label = self.validate_fun(label)
if self.alphabet and label:
try:
self.alphabet.encode(label)
except KeyError:
label = None
return label

def main():
def parse_args():
parser = get_importers_parser(description="Import CommonVoice v2.0 corpora")
parser.add_argument("tsv_dir", help="Directory containing tsv files")
parser.add_argument(
Expand All @@ -206,18 +207,14 @@ def main():
action="store_true",
help="To help transcript join by white space",
)
return parser.parse_args()

params = parser.parse_args()
validate_label = get_validate_label(params)

audio_dir = (
params.audio_dir if params.audio_dir else os.path.join(params.tsv_dir, "clips")
)
alphabet = Alphabet(params.filter_alphabet) if params.filter_alphabet else None
def main():
    """Entry point: resolve the audio clips directory and run the import."""
    if PARAMS.audio_dir:
        audio_dir = PARAMS.audio_dir
    else:
        # Default CommonVoice layout keeps audio under <tsv_dir>/clips.
        audio_dir = os.path.join(PARAMS.tsv_dir, "clips")
    _preprocess_data(PARAMS.tsv_dir, audio_dir, PARAMS.space_after_every_character)

filter_obj = LabelFilter(params.normalize, alphabet, validate_label)
_preprocess_data(params.tsv_dir, audio_dir, filter_obj,
params.space_after_every_character)

if __name__ == "__main__":
PARAMS = parse_args()
main()
11 changes: 4 additions & 7 deletions bin/import_lingua_libre.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
get_validate_label,
print_import_report,
)
from deepspeech_training.util.text import Alphabet
from ds_ctcdecoder import Alphabet

FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
Expand Down Expand Up @@ -198,7 +198,7 @@ def handle_args():
"--iso639-3", type=str, required=True, help="ISO639-3 language code"
)
parser.add_argument(
"--english-name", type=str, required=True, help="Enligh name of the language"
"--english-name", type=str, required=True, help="English name of the language"
)
parser.add_argument(
"--filter_alphabet",
Expand Down Expand Up @@ -242,11 +242,8 @@ def label_filter(label):
.decode("ascii", "ignore")
)
label = validate_label(label)
if ALPHABET and label:
try:
ALPHABET.encode(label)
except KeyError:
label = None
if ALPHABET and label and not ALPHABET.CanEncode(label):
label = None
return label

ARCHIVE_NAME = ARCHIVE_NAME.format(
Expand Down
9 changes: 3 additions & 6 deletions bin/import_m-ailabs.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
get_validate_label,
print_import_report,
)
from deepspeech_training.util.text import Alphabet
from ds_ctcdecoder import Alphabet

FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
Expand Down Expand Up @@ -215,11 +215,8 @@ def label_filter(label):
.decode("ascii", "ignore")
)
label = validate_label(label)
if ALPHABET and label:
try:
ALPHABET.encode(label)
except KeyError:
label = None
if ALPHABET and label and not ALPHABET.CanEncode(label):
label = None
return label

ARCHIVE_DIR_NAME = ARCHIVE_DIR_NAME.format(language=CLI_ARGS.language)
Expand Down
Loading

0 comments on commit 32e185f

Please sign in to comment.