-
Notifications
You must be signed in to change notification settings - Fork 126
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
1,170 additions
and
259 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
exp_dir: exp/RedimnetB2-emb192-fbank72-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-SGD-epoch120 | ||
gpus: "[0,1]" | ||
num_avg: 10 | ||
enable_amp: False # whether to enable automatic mixed precision training | ||
|
||
seed: 42 | ||
num_epochs: 120 | ||
save_epoch_interval: 5 # save model every 5 epochs | ||
log_batch_interval: 100 # log every 100 batches | ||
|
||
dataloader_args: | ||
batch_size: 256 | ||
num_workers: 4 | ||
pin_memory: false | ||
prefetch_factor: 4 | ||
drop_last: true | ||
|
||
dataset_args: | ||
# the number of samples traversed within one epoch; if the value equals 0, | ||
# the number of utterances in the dataset is used as sample_num_per_epoch. | ||
sample_num_per_epoch: 0 | ||
shuffle: True | ||
shuffle_args: | ||
shuffle_size: 2500 | ||
filter: True | ||
filter_args: | ||
min_num_frames: 100 | ||
max_num_frames: 800 | ||
resample_rate: 16000 | ||
speed_perturb: True | ||
num_frms: 200 | ||
aug_prob: 0.6 # prob to add reverb & noise aug per sample | ||
fbank_args: | ||
num_mel_bins: 72 | ||
frame_shift: 10 | ||
frame_length: 25 | ||
dither: 1.0 | ||
spec_aug: False | ||
spec_aug_args: | ||
num_t_mask: 1 | ||
num_f_mask: 1 | ||
max_t: 10 | ||
max_f: 8 | ||
prob: 0.6 | ||
|
||
model: ReDimNetB2 | ||
model_init: null | ||
model_args: | ||
feat_dim: 72 | ||
embed_dim: 192 | ||
pooling_func: "ASTP" # TSTP, ASTP, MQMHASTP | ||
two_emb_layer: False | ||
|
||
|
||
projection_args: | ||
project_type: "arc_margin" # add_margin, arc_margin, sphere, sphereface2, softmax, arc_margin_intertopk_subcenter | ||
scale: 32.0 | ||
easy_margin: False | ||
|
||
|
||
margin_scheduler: MarginScheduler | ||
margin_update: | ||
initial_margin: 0.0 | ||
final_margin: 0.2 | ||
increase_start_epoch: 20 | ||
fix_start_epoch: 40 | ||
update_margin: True | ||
increase_type: "exp" # exp, linear | ||
update_margin: true | ||
|
||
loss: CrossEntropyLoss | ||
loss_args: {} | ||
|
||
optimizer: SGD | ||
optimizer_args: | ||
momentum: 0.9 | ||
nesterov: True | ||
weight_decay: 2.0e-05 | ||
|
||
scheduler: ExponentialDecrease | ||
scheduler_args: | ||
initial_lr: 0.1 | ||
final_lr: 0.00005 | ||
warm_up_epoch: 6 | ||
warm_from_zero: True | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,7 @@ | ||
This is a **WeSpeaker** speaker diarization recipe on the Voxconverse 2020 dataset. It focuses on an ``in the wild`` scenario: the data was collected from YouTube videos with a semi-automatic pipeline and released for the diarization track of the VoxSRC 2020 Challenge. See https://www.robots.ox.ac.uk/~vgg/data/voxconverse/ for more detailed information. | ||
|
||
Two recipes are provided, **v1** and **v2**. Their only difference is that in **v2** the Fbank extraction, embedding extraction and clustering modules are split into separate stages. We recommend that newcomers follow the **v2** recipe and run it stage by stage. | ||
|
||
🔥 UPDATE 2024.08.20: | ||
* silero-vad v5.1 is used in place of v3.1 | ||
* umap dimensionality reduction + hdbscan clustering is also supported in v2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
#!/bin/bash | ||
# Copyright (c) 2022-2023 Xu Xiang | ||
# 2022 Zhengyang Chen ([email protected]) | ||
# 2024 Hongji Wang ([email protected]) | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
|
@@ -18,8 +19,9 @@ | |
|
||
stage=-1 | ||
stop_stage=-1 | ||
sad_type="oracle" | ||
partition="dev" | ||
sad_type="oracle" # oracle/system | ||
partition="dev" # dev/test | ||
cluster_type="spectral" # spectral/umap | ||
|
||
# do cmn on the sub-segment or on the vad segment | ||
subseg_cmn=true | ||
|
@@ -36,11 +38,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then | |
wget -c https://github.com/usnistgov/SCTK/archive/refs/tags/v2.4.12.zip -O external_tools/SCTK-v2.4.12.zip | ||
unzip -o external_tools/SCTK-v2.4.12.zip -d external_tools | ||
|
||
# [2] Download voice activity detection model pretrained by Silero Team | ||
wget -c https://github.com/snakers4/silero-vad/archive/refs/tags/v3.1.zip -O external_tools/silero-vad-v3.1.zip | ||
unzip -o external_tools/silero-vad-v3.1.zip -d external_tools | ||
|
||
# [3] Download ResNet34 speaker model pretrained by WeSpeaker Team | ||
# [2] Download ResNet34 speaker model pretrained by WeSpeaker Team | ||
mkdir -p pretrained_models | ||
|
||
wget -c https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/voxceleb/voxceleb_resnet34_LM.onnx -O pretrained_models/voxceleb_resnet34_LM.onnx | ||
|
@@ -101,7 +99,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then | |
if [[ "x${sad_type}" == "xsystem" ]]; then | ||
# System SAD: applying 'silero' VAD | ||
python3 wespeaker/diar/make_system_sad.py \ | ||
--repo-path external_tools/silero-vad-3.1 \ | ||
--scp data/${partition}/wav.scp \ | ||
--min-duration $min_duration > data/${partition}/system_sad | ||
fi | ||
|
@@ -144,24 +141,24 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then | |
fi | ||
|
||
|
||
# Applying spectral clustering algorithm | ||
# Applying spectral or umap+hdbscan clustering algorithm | ||
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then | ||
|
||
[ -f "exp/spectral_cluster/${partition}_${sad_type}_sad_labels" ] && rm exp/spectral_cluster/${partition}_${sad_type}_sad_labels | ||
[ -f "exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels" ] && rm exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels | ||
|
||
echo "Doing spectral clustering and store the result in exp/spectral_cluster/${partition}_${sad_type}_sad_labels" | ||
echo "Doing ${cluster_type} clustering and store the result in exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels" | ||
echo "..." | ||
python3 wespeaker/diar/spectral_clusterer.py \ | ||
python3 wespeaker/diar/${cluster_type}_clusterer.py \ | ||
--scp exp/${partition}_${sad_type}_sad_embedding/emb.scp \ | ||
--output exp/spectral_cluster/${partition}_${sad_type}_sad_labels | ||
--output exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels | ||
fi | ||
|
||
|
||
# Convert labels to RTTMs | ||
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then | ||
python3 wespeaker/diar/make_rttm.py \ | ||
--labels exp/spectral_cluster/${partition}_${sad_type}_sad_labels \ | ||
--channel 1 > exp/spectral_cluster/${partition}_${sad_type}_sad_rttm | ||
--labels exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_labels \ | ||
--channel 1 > exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_rttm | ||
fi | ||
|
||
|
||
|
@@ -173,18 +170,18 @@ if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then | |
perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \ | ||
-c 0.25 \ | ||
-r <(cat ${ref_dir}/${partition}/*.rttm) \ | ||
-s exp/spectral_cluster/${partition}_${sad_type}_sad_rttm 2>&1 | tee exp/spectral_cluster/${partition}_${sad_type}_sad_res | ||
-s exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_rttm 2>&1 | tee exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_res | ||
|
||
if [ ${get_each_file_res} -eq 1 ];then | ||
single_file_res_dir=exp/spectral_cluster/${partition}_${sad_type}_single_file_res | ||
single_file_res_dir=exp/${cluster_type}_cluster/${partition}_${sad_type}_single_file_res | ||
mkdir -p $single_file_res_dir | ||
echo -e "\nGet the DER results for each file and the results will be stored underd ${single_file_res_dir}\n..." | ||
|
||
awk '{print $2}' exp/spectral_cluster/${partition}_${sad_type}_sad_rttm | sort -u | while read file_name; do | ||
awk '{print $2}' exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_rttm | sort -u | while read file_name; do | ||
perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \ | ||
-c 0.25 \ | ||
-r <(cat ${ref_dir}/${partition}/${file_name}.rttm) \ | ||
-s <(grep "${file_name}" exp/spectral_cluster/${partition}_${sad_type}_sad_rttm) > ${single_file_res_dir}/${partition}_${file_name}_res | ||
-s <(grep "${file_name}" exp/${cluster_type}_cluster/${partition}_${sad_type}_sad_rttm) > ${single_file_res_dir}/${partition}_${file_name}_res | ||
done | ||
echo "Done!" | ||
fi | ||
|
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.