s3prl · leo19941227 · Sep 23, 2021 · Sep 20, 2021 · Sep 21, 2021 · leo19941227
diff --git a/generate_librimix.sh → generate_librimix_sd.sh b/generate_librimix.sh → generate_librimix_sd.sh
@@ -59,7 +59,6 @@ function wham() {
 LibriSpeech_dev_clean &
 LibriSpeech_test_clean &
 LibriSpeech_clean100 &
-LibriSpeech_clean360 &
 wham &
 
 wait
@@ -79,5 +78,5 @@ for n_src in 2; do
     --n_src $n_src \
     --freqs 16k \
     --modes max \
-    --types mix_clean mix_both
+    --types mix_both
 done
diff --git a/generate_librimix_ss.sh b/generate_librimix_ss.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# Download LibriSpeech subsets and WHAM noise into a storage directory, then
+# generate LibriMix mixtures from them.
+# Usage: generate_librimix_ss.sh STORAGE_DIR
+set -eu  # Exit on error
+
+# Fail with a usage message when the storage directory is missing or empty.
+# An empty value would otherwise make every derived path resolve against /.
+storage_dir="${1:?usage: $0 STORAGE_DIR}"
+# Locations the archives unpack to, all directly under the storage directory.
+librispeech_dir="$storage_dir/LibriSpeech"
+wham_dir="$storage_dir/wham_noise"
+# Passed as --librimix_outdir to the generation script.
+librimix_outdir="$storage_dir/"
+
+# Download and unpack the LibriSpeech dev-clean archive into $storage_dir.
+# Globals: librispeech_dir (read), storage_dir (read).
+# Does nothing when $librispeech_dir/dev-clean already exists.
+function LibriSpeech_dev_clean() {
+	# Expansions are quoted so that a storage path containing spaces or glob
+	# characters reaches each tool as a single argument.
+	if ! test -e "$librispeech_dir/dev-clean"; then
+		echo "Download LibriSpeech/dev-clean into $storage_dir"
+		# If downloading stalls for more than 20s, relaunch from previous state.
+		wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/dev-clean.tar.gz -P "$storage_dir"
+		tar -xzf "$storage_dir/dev-clean.tar.gz" -C "$storage_dir"
+		# Remove the archive once it has been extracted.
+		rm -rf "$storage_dir/dev-clean.tar.gz"
+	fi
+}
+
+# Download and unpack the LibriSpeech test-clean archive into $storage_dir.
+# Globals: librispeech_dir (read), storage_dir (read).
+# Does nothing when $librispeech_dir/test-clean already exists.
+function LibriSpeech_test_clean() {
+	# Expansions are quoted so that a storage path containing spaces or glob
+	# characters reaches each tool as a single argument.
+	if ! test -e "$librispeech_dir/test-clean"; then
+		echo "Download LibriSpeech/test-clean into $storage_dir"
+		# If downloading stalls for more than 20s, relaunch from previous state.
+		wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/test-clean.tar.gz -P "$storage_dir"
+		tar -xzf "$storage_dir/test-clean.tar.gz" -C "$storage_dir"
+		# Remove the archive once it has been extracted.
+		rm -rf "$storage_dir/test-clean.tar.gz"
+	fi
+}
+
+# Download and unpack the LibriSpeech train-clean-100 archive into $storage_dir.
+# Globals: librispeech_dir (read), storage_dir (read).
+# Does nothing when $librispeech_dir/train-clean-100 already exists.
+function LibriSpeech_clean100() {
+	# Expansions are quoted so that a storage path containing spaces or glob
+	# characters reaches each tool as a single argument.
+	if ! test -e "$librispeech_dir/train-clean-100"; then
+		echo "Download LibriSpeech/train-clean-100 into $storage_dir"
+		# If downloading stalls for more than 20s, relaunch from previous state.
+		wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-100.tar.gz -P "$storage_dir"
+		tar -xzf "$storage_dir/train-clean-100.tar.gz" -C "$storage_dir"
+		# Remove the archive once it has been extracted.
+		rm -rf "$storage_dir/train-clean-100.tar.gz"
+	fi
+}
+
+# Download and unpack the LibriSpeech train-clean-360 archive into $storage_dir.
+# Globals: librispeech_dir (read), storage_dir (read).
+# Does nothing when $librispeech_dir/train-clean-360 already exists.
+# NOTE(review): no call to this function is visible in this script; it looks
+# like it is kept only for manual use. Confirm before removing.
+function LibriSpeech_clean360() {
+	# Expansions are quoted so that a storage path containing spaces or glob
+	# characters reaches each tool as a single argument.
+	if ! test -e "$librispeech_dir/train-clean-360"; then
+		echo "Download LibriSpeech/train-clean-360 into $storage_dir"
+		# If downloading stalls for more than 20s, relaunch from previous state.
+		wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-360.tar.gz -P "$storage_dir"
+		tar -xzf "$storage_dir/train-clean-360.tar.gz" -C "$storage_dir"
+		# Remove the archive once it has been extracted.
+		rm -rf "$storage_dir/train-clean-360.tar.gz"
+	fi
+}
+
+# Download and unpack the WHAM noise archive into $storage_dir.
+# Globals: wham_dir (read), storage_dir (read).
+# Does nothing when $wham_dir already exists.
+function wham() {
+	# Expansions are quoted so that a storage path containing spaces or glob
+	# characters reaches each tool as a single argument.
+	if ! test -e "$wham_dir"; then
+		echo "Download wham_noise into $storage_dir"
+		# If downloading stalls for more than 20s, relaunch from previous state.
+		wget -c --tries=0 --read-timeout=20 https://storage.googleapis.com/whisper-public/wham_noise.zip -P "$storage_dir"
+		# -q keeps unzip quiet, -n never overwrites files that already exist.
+		unzip -qn "$storage_dir/wham_noise.zip" -d "$storage_dir"
+		# Remove the archive once it has been extracted.
+		rm -rf "$storage_dir/wham_noise.zip"
+	fi
+}
+
+# Start the required downloads in parallel and remember the PID of each job.
+# LibriSpeech_clean360 is not launched here.
+pids=()
+LibriSpeech_dev_clean &
+pids+=("$!")
+LibriSpeech_test_clean &
+pids+=("$!")
+LibriSpeech_clean100 &
+pids+=("$!")
+wham &
+pids+=("$!")
+
+# A bare wait returns 0 even when a background job failed, which would let the
+# script continue with missing data. Waiting on each PID returns the exit
+# status of that job, so set -e stops the script when a download fails.
+for pid in "${pids[@]}"; do
+	wait "$pid"
+done
+
+# Path to python
+python_path=python
+
+# NOTE: the script and metadata paths below are relative, so this must be run
+# from a directory that contains scripts/ and metadata/.
+
+# If you wish to rerun this script in the future please comment this line out.
+"$python_path" scripts/augment_train_noise.py --wham_dir "$wham_dir"
+
+# Generate the mixtures for each source count; only 2 sources are built here.
+# Every expansion is quoted so paths containing spaces stay single arguments.
+for n_src in 2; do
+  metadata_dir="metadata/Libri${n_src}Mix"
+  "$python_path" scripts/create_librimix_from_metadata.py \
+    --librispeech_dir "$librispeech_dir" \
+    --wham_dir "$wham_dir" \
+    --metadata_dir "$metadata_dir" \
+    --librimix_outdir "$librimix_outdir" \
+    --n_src "$n_src" \
+    --freqs 16k \
+    --modes min \
+    --types mix_clean
+done
diff --git a/scripts/create_librimix_from_metadata.py b/scripts/create_librimix_from_metadata.py
@@ -63,8 +63,10 @@ def create_librimix(librispeech_dir, wham_dir, out_dir, metadata_dir,
                     freqs, n_src, modes, types):
     """ Generate sources mixtures and saves them in out_dir"""
     # Get metadata files
+    print("[Warning] - train-clean-360 is ignored in create_librimix_from_metadata.py for less data preparation time."\
+        " Please note that in S3PRL we only use the train-clean-100 for downstream tasks.")
     md_filename_list = [file for file in os.listdir(metadata_dir)
-                        if 'info' not in file]
+                        if 'info' not in file and '360' not in file]
     # Create all parts of librimix
     for md_filename in md_filename_list:
         csv_path = os.path.join(metadata_dir, md_filename)
@@ -102,11 +104,7 @@ def process_metadata_file(csv_path, freqs, n_src, librispeech_dir, wham_dir,
             print(f"Creating mixtures and sources from {csv_path} "
                   f"in {dir_path}")
             # Create subdir
-            if types == ['mix_clean']:
-                subdirs = [f's{i + 1}' for i in range(n_src)] + ['mix_clean']
-            else:
-                subdirs = [f's{i + 1}' for i in range(n_src)] + types + [
-                    'noise']
+            subdirs = [f's{i + 1}' for i in range(n_src)] + types + ['noise']
             # Create directories accordingly
             for subdir in subdirs:
                 os.makedirs(os.path.join(dir_path, subdir))