Skip to content

Commit

Permalink
Merge pull request #1 from s3prl/less-data-preparation-time
Browse files Browse the repository at this point in the history
less data preparation time
  • Loading branch information
leo19941227 authored Sep 23, 2021
2 parents 7effc1a + 4c6a5b6 commit c985b5b
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 8 deletions.
3 changes: 1 addition & 2 deletions generate_librimix.sh → generate_librimix_sd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ function wham() {
LibriSpeech_dev_clean &
LibriSpeech_test_clean &
LibriSpeech_clean100 &
LibriSpeech_clean360 &
wham &

wait
Expand All @@ -79,5 +78,5 @@ for n_src in 2; do
--n_src $n_src \
--freqs 16k \
--modes max \
--types mix_clean mix_both
--types mix_both
done
82 changes: 82 additions & 0 deletions generate_librimix_ss.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/bin/bash
# Download the LibriSpeech and WHAM! noise data needed for LibriMix and
# generate the mixtures.
#
# Usage: generate_librimix_ss.sh <storage_dir>
#   storage_dir - directory under which the corpora are downloaded and the
#                 generated mixtures are written.
set -eu  # Exit on error (-e) and on use of an unset variable (-u)

# Fail with a usage hint instead of a bare "unbound variable" error, and
# reject an empty value so the derived paths never resolve against "/".
storage_dir="${1:?usage: ${0##*/} <storage_dir>}"
librispeech_dir="$storage_dir/LibriSpeech"
wham_dir="$storage_dir/wham_noise"
librimix_outdir="$storage_dir/"

# Download and unpack the LibriSpeech dev-clean archive unless it is
# already present.
# Globals:
#   librispeech_dir (read) - existence of "$librispeech_dir/dev-clean"
#                            decides whether any work is done
#   storage_dir (read)     - download directory and extraction target
# All expansions are quoted so a storage path containing spaces works;
# unquoted, `test -e` would error out and the download would always rerun.
function LibriSpeech_dev_clean() {
  if ! test -e "$librispeech_dir/dev-clean"; then
    echo "Download LibriSpeech/dev-clean into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/dev-clean.tar.gz -P "$storage_dir"
    tar -xzf "$storage_dir/dev-clean.tar.gz" -C "$storage_dir"
    # The archive is no longer needed once extracted.
    rm -rf -- "$storage_dir/dev-clean.tar.gz"
  fi
}

# Download and unpack the LibriSpeech test-clean archive unless it is
# already present.
# Globals:
#   librispeech_dir (read) - existence of "$librispeech_dir/test-clean"
#                            decides whether any work is done
#   storage_dir (read)     - download directory and extraction target
# All expansions are quoted so a storage path containing spaces works;
# unquoted, `test -e` would error out and the download would always rerun.
function LibriSpeech_test_clean() {
  if ! test -e "$librispeech_dir/test-clean"; then
    echo "Download LibriSpeech/test-clean into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/test-clean.tar.gz -P "$storage_dir"
    tar -xzf "$storage_dir/test-clean.tar.gz" -C "$storage_dir"
    # The archive is no longer needed once extracted.
    rm -rf -- "$storage_dir/test-clean.tar.gz"
  fi
}

# Download and unpack the LibriSpeech train-clean-100 archive unless it is
# already present.
# Globals:
#   librispeech_dir (read) - existence of "$librispeech_dir/train-clean-100"
#                            decides whether any work is done
#   storage_dir (read)     - download directory and extraction target
# All expansions are quoted so a storage path containing spaces works;
# unquoted, `test -e` would error out and the download would always rerun.
function LibriSpeech_clean100() {
  if ! test -e "$librispeech_dir/train-clean-100"; then
    echo "Download LibriSpeech/train-clean-100 into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-100.tar.gz -P "$storage_dir"
    tar -xzf "$storage_dir/train-clean-100.tar.gz" -C "$storage_dir"
    # The archive is no longer needed once extracted.
    rm -rf -- "$storage_dir/train-clean-100.tar.gz"
  fi
}

# Download and unpack the LibriSpeech train-clean-360 archive unless it is
# already present.
# Globals:
#   librispeech_dir (read) - existence of "$librispeech_dir/train-clean-360"
#                            decides whether any work is done
#   storage_dir (read)     - download directory and extraction target
# All expansions are quoted so a storage path containing spaces works;
# unquoted, `test -e` would error out and the download would always rerun.
function LibriSpeech_clean360() {
  if ! test -e "$librispeech_dir/train-clean-360"; then
    echo "Download LibriSpeech/train-clean-360 into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 http://www.openslr.org/resources/12/train-clean-360.tar.gz -P "$storage_dir"
    tar -xzf "$storage_dir/train-clean-360.tar.gz" -C "$storage_dir"
    # The archive is no longer needed once extracted.
    rm -rf -- "$storage_dir/train-clean-360.tar.gz"
  fi
}

# Download and unpack the wham_noise archive unless it is already present.
# Globals:
#   wham_dir (read)    - existence of this path decides whether any work
#                        is done
#   storage_dir (read) - download directory and extraction target
# All expansions are quoted so a storage path containing spaces works;
# unquoted, `test -e` would error out and the download would always rerun.
function wham() {
  if ! test -e "$wham_dir"; then
    echo "Download wham_noise into $storage_dir"
    # If downloading stalls for more than 20s, relaunch from previous state.
    wget -c --tries=0 --read-timeout=20 https://storage.googleapis.com/whisper-public/wham_noise.zip -P "$storage_dir"
    # -q: quiet, -n: never overwrite files that already exist.
    unzip -qn "$storage_dir/wham_noise.zip" -d "$storage_dir"
    # The archive is no longer needed once extracted.
    rm -rf -- "$storage_dir/wham_noise.zip"
  fi
}

# Fetch the required corpora in parallel. LibriSpeech_clean360 is not
# launched here, so train-clean-360 is never downloaded by this script.
download_pids=()
LibriSpeech_dev_clean &
download_pids+=("$!")
LibriSpeech_test_clean &
download_pids+=("$!")
LibriSpeech_clean100 &
download_pids+=("$!")
wham &
download_pids+=("$!")

# A bare `wait` returns 0 even when a background job failed, so `set -e`
# would never notice a broken download. Wait on every PID individually,
# let all jobs finish, then abort if any of them reported an error.
download_failed=0
for download_pid in "${download_pids[@]}"; do
  wait "$download_pid" || download_failed=1
done
if [ "$download_failed" -ne 0 ]; then
  echo "One or more downloads failed; aborting." >&2
  exit 1
fi

# Path to python
python_path=python

# Both scripts below are referenced by relative path, so this presumably has
# to be run from the repository root — confirm before invoking from elsewhere.

# If you wish to rerun this script in the future please comment this line out.
"$python_path" scripts/augment_train_noise.py --wham_dir "$wham_dir"

# Generate the 2-speaker, 16 kHz, "min"-mode, clean (noise-free) mixtures.
# Path arguments are quoted so a storage directory with spaces is passed
# to Python as a single argument.
for n_src in 2; do
  metadata_dir="metadata/Libri${n_src}Mix"
  "$python_path" scripts/create_librimix_from_metadata.py \
    --librispeech_dir "$librispeech_dir" \
    --wham_dir "$wham_dir" \
    --metadata_dir "$metadata_dir" \
    --librimix_outdir "$librimix_outdir" \
    --n_src "$n_src" \
    --freqs 16k \
    --modes min \
    --types mix_clean
done
10 changes: 4 additions & 6 deletions scripts/create_librimix_from_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ def create_librimix(librispeech_dir, wham_dir, out_dir, metadata_dir,
freqs, n_src, modes, types):
""" Generate sources mixtures and saves them in out_dir"""
# Get metadata files
print("[Warning] - train-clean-360 is ignored in create_librimix_from_metadata.py for less data preparation time."\
" Please note that in S3PRL we only use the train-clean-100 for downstream tasks.")
md_filename_list = [file for file in os.listdir(metadata_dir)
if 'info' not in file]
if 'info' not in file and '360' not in file]
# Create all parts of librimix
for md_filename in md_filename_list:
csv_path = os.path.join(metadata_dir, md_filename)
Expand Down Expand Up @@ -102,11 +104,7 @@ def process_metadata_file(csv_path, freqs, n_src, librispeech_dir, wham_dir,
print(f"Creating mixtures and sources from {csv_path} "
f"in {dir_path}")
# Create subdir
if types == ['mix_clean']:
subdirs = [f's{i + 1}' for i in range(n_src)] + ['mix_clean']
else:
subdirs = [f's{i + 1}' for i in range(n_src)] + types + [
'noise']
subdirs = [f's{i + 1}' for i in range(n_src)] + types + ['noise']
# Create directories accordingly
for subdir in subdirs:
os.makedirs(os.path.join(dir_path, subdir))
Expand Down

0 comments on commit c985b5b

Please sign in to comment.