diff --git a/_sources/python_api/wespeaker.frontend.rst.txt b/_sources/python_api/wespeaker.frontend.rst.txt index ee09415d..be9f9cfe 100644 --- a/_sources/python_api/wespeaker.frontend.rst.txt +++ b/_sources/python_api/wespeaker.frontend.rst.txt @@ -13,3 +13,4 @@ Submodules :maxdepth: 2 wespeaker.frontend.s3prl + wespeaker.frontend.whisper_encoder diff --git a/_sources/python_api/wespeaker.frontend.whisper_encoder.rst.txt b/_sources/python_api/wespeaker.frontend.whisper_encoder.rst.txt new file mode 100644 index 00000000..7fa7bee4 --- /dev/null +++ b/_sources/python_api/wespeaker.frontend.whisper_encoder.rst.txt @@ -0,0 +1,7 @@ +wespeaker.frontend.whisper\_encoder module +========================================== + +.. automodule:: wespeaker.frontend.whisper_encoder + :members: + :undoc-members: + :show-inheritance: diff --git a/_sources/python_api/wespeaker.models.rst.txt b/_sources/python_api/wespeaker.models.rst.txt index 6a11b6cb..298f7c59 100644 --- a/_sources/python_api/wespeaker.models.rst.txt +++ b/_sources/python_api/wespeaker.models.rst.txt @@ -25,3 +25,4 @@ Submodules wespeaker.models.resnet wespeaker.models.speaker_model wespeaker.models.tdnn + wespeaker.models.whisper_PMFA diff --git a/_sources/python_api/wespeaker.models.whisper_PMFA.rst.txt b/_sources/python_api/wespeaker.models.whisper_PMFA.rst.txt new file mode 100644 index 00000000..d51acc31 --- /dev/null +++ b/_sources/python_api/wespeaker.models.whisper_PMFA.rst.txt @@ -0,0 +1,7 @@ +wespeaker.models.whisper\_PMFA module +===================================== + +.. automodule:: wespeaker.models.whisper_PMFA + :members: + :undoc-members: + :show-inheritance: diff --git a/objects.inv b/objects.inv index b65cb70d..9eef2f1c 100644 Binary files a/objects.inv and b/objects.inv differ diff --git a/python_api/wespeaker.frontend.html b/python_api/wespeaker.frontend.html index 5db22bf5..47a76b54 100644 --- a/python_api/wespeaker.frontend.html +++ b/python_api/wespeaker.frontend.html @@ -100,6 +100,7 @@

Submodules diff --git a/python_api/wespeaker.frontend.s3prl.html b/python_api/wespeaker.frontend.s3prl.html index 5b84e842..033a8a10 100644 --- a/python_api/wespeaker.frontend.s3prl.html +++ b/python_api/wespeaker.frontend.s3prl.html @@ -22,7 +22,7 @@ - + @@ -103,7 +103,7 @@

wespeaker.frontend.s3prl module - +
diff --git a/python_api/wespeaker.frontend.whisper_encoder.html b/python_api/wespeaker.frontend.whisper_encoder.html new file mode 100644 index 00000000..56c43ecb --- /dev/null +++ b/python_api/wespeaker.frontend.whisper_encoder.html @@ -0,0 +1,132 @@ + + + + + + + wespeaker.frontend.whisper_encoder module — wespeaker 1.2.0 documentation + + + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

wespeaker.frontend.whisper_encoder module

+
+ + +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/python_api/wespeaker.models.html b/python_api/wespeaker.models.html index e627cedd..4cb1da9d 100644 --- a/python_api/wespeaker.models.html +++ b/python_api/wespeaker.models.html @@ -23,7 +23,7 @@ - + @@ -112,6 +112,7 @@

Submoduleswespeaker.models.resnet module
  • wespeaker.models.speaker_model module
  • wespeaker.models.tdnn module
  • +
  • wespeaker.models.whisper_PMFA module
  • @@ -121,7 +122,7 @@

    Submodules - + diff --git a/python_api/wespeaker.models.tdnn.html b/python_api/wespeaker.models.tdnn.html index 71d34fb2..82df22f8 100644 --- a/python_api/wespeaker.models.tdnn.html +++ b/python_api/wespeaker.models.tdnn.html @@ -22,7 +22,7 @@ - + @@ -103,7 +103,7 @@

    wespeaker.models.tdnn module - +
    diff --git a/python_api/wespeaker.models.whisper_PMFA.html b/python_api/wespeaker.models.whisper_PMFA.html new file mode 100644 index 00000000..f1e869ed --- /dev/null +++ b/python_api/wespeaker.models.whisper_PMFA.html @@ -0,0 +1,132 @@ + + + + + + + wespeaker.models.whisper_PMFA module — wespeaker 1.2.0 documentation + + + + + + + + + + + + + + + + + + + + +
    + + +
    + +
    +
    +
    + +
    +
    +
    +
    + +
    +

    wespeaker.models.whisper_PMFA module

    +
    + + +
    +
    + +
    +
    +
    +
    + + + + \ No newline at end of file diff --git a/python_api/wespeaker.utils.html b/python_api/wespeaker.utils.html index ded22fd0..fca4cc10 100644 --- a/python_api/wespeaker.utils.html +++ b/python_api/wespeaker.utils.html @@ -23,7 +23,7 @@ - + @@ -116,7 +116,7 @@

    Submodules - + diff --git a/searchindex.js b/searchindex.js index b05d499e..78fc6f9b 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["README", "contribute", "index", "paper", "pretrained", "python_api/modules", "python_api/wespeaker", "python_api/wespeaker.cli", "python_api/wespeaker.cli.hub", "python_api/wespeaker.cli.speaker", "python_api/wespeaker.cli.utils", "python_api/wespeaker.diar", "python_api/wespeaker.diar.extract_emb", "python_api/wespeaker.diar.make_fbank", "python_api/wespeaker.diar.make_oracle_sad", "python_api/wespeaker.diar.make_rttm", "python_api/wespeaker.diar.make_system_sad", "python_api/wespeaker.diar.spectral_clusterer", "python_api/wespeaker.diar.umap_clusterer", "python_api/wespeaker.frontend", "python_api/wespeaker.frontend.s3prl", "python_api/wespeaker.models", "python_api/wespeaker.models.campplus", "python_api/wespeaker.models.convert_repvgg", "python_api/wespeaker.models.ecapa_tdnn", "python_api/wespeaker.models.eres2net", "python_api/wespeaker.models.gemini_dfresnet", "python_api/wespeaker.models.pooling_layers", "python_api/wespeaker.models.projections", "python_api/wespeaker.models.redimnet", "python_api/wespeaker.models.repvgg", "python_api/wespeaker.models.res2net", "python_api/wespeaker.models.resnet", "python_api/wespeaker.models.speaker_model", "python_api/wespeaker.models.tdnn", "python_api/wespeaker.utils", "python_api/wespeaker.utils.checkpoint", "python_api/wespeaker.utils.embedding_processing", "python_api/wespeaker.utils.executor", "python_api/wespeaker.utils.executor_deprecated", "python_api/wespeaker.utils.file_utils", "python_api/wespeaker.utils.schedulers", "python_api/wespeaker.utils.score_metrics", "python_api/wespeaker.utils.utils", "python_package", "reference", "requirements", "runtime", "speaker_recognition_papers", "train", "vox", "vox_ssl", "voxconverse_diar"], "filenames": ["README.md", "contribute.md", "index.rst", "paper.md", "pretrained.md", "python_api/modules.rst", "python_api/wespeaker.rst", "python_api/wespeaker.cli.rst", "python_api/wespeaker.cli.hub.rst", "python_api/wespeaker.cli.speaker.rst", "python_api/wespeaker.cli.utils.rst", "python_api/wespeaker.diar.rst", "python_api/wespeaker.diar.extract_emb.rst", "python_api/wespeaker.diar.make_fbank.rst", "python_api/wespeaker.diar.make_oracle_sad.rst", "python_api/wespeaker.diar.make_rttm.rst", "python_api/wespeaker.diar.make_system_sad.rst", "python_api/wespeaker.diar.spectral_clusterer.rst", "python_api/wespeaker.diar.umap_clusterer.rst", "python_api/wespeaker.frontend.rst", "python_api/wespeaker.frontend.s3prl.rst", "python_api/wespeaker.models.rst", "python_api/wespeaker.models.campplus.rst", "python_api/wespeaker.models.convert_repvgg.rst", "python_api/wespeaker.models.ecapa_tdnn.rst", "python_api/wespeaker.models.eres2net.rst", "python_api/wespeaker.models.gemini_dfresnet.rst", "python_api/wespeaker.models.pooling_layers.rst", "python_api/wespeaker.models.projections.rst", "python_api/wespeaker.models.redimnet.rst", "python_api/wespeaker.models.repvgg.rst", "python_api/wespeaker.models.res2net.rst", "python_api/wespeaker.models.resnet.rst", "python_api/wespeaker.models.speaker_model.rst", "python_api/wespeaker.models.tdnn.rst", "python_api/wespeaker.utils.rst", "python_api/wespeaker.utils.checkpoint.rst", "python_api/wespeaker.utils.embedding_processing.rst", "python_api/wespeaker.utils.executor.rst", "python_api/wespeaker.utils.executor_deprecated.rst", "python_api/wespeaker.utils.file_utils.rst", "python_api/wespeaker.utils.schedulers.rst", "python_api/wespeaker.utils.score_metrics.rst", "python_api/wespeaker.utils.utils.rst", "python_package.md", "reference.rst", "requirements.txt", "runtime.md", "speaker_recognition_papers.md", "train.rst", "vox.md", "vox_ssl.md", "voxconverse_diar.md"], "titles": ["Basic Documents for WeSpeaker", "Contributing to Wespeaker", "Welcome to Wespeaker\u2019s documentation!", "Wespeaker Papers", "Pretrained Models in Wespeaker", "Python API Reference", "wespeaker package", "wespeaker.cli package", "wespeaker.cli.hub module", "wespeaker.cli.speaker module", "wespeaker.cli.utils module", "wespeaker.diar package", "wespeaker.diar.extract_emb module", "wespeaker.diar.make_fbank module", "wespeaker.diar.make_oracle_sad module", "wespeaker.diar.make_rttm module", "wespeaker.diar.make_system_sad module", "wespeaker.diar.spectral_clusterer module", "wespeaker.diar.umap_clusterer module", "wespeaker.frontend package", "wespeaker.frontend.s3prl module", "wespeaker.models package", "wespeaker.models.campplus module", "wespeaker.models.convert_repvgg module", "wespeaker.models.ecapa_tdnn module", "wespeaker.models.eres2net module", "wespeaker.models.gemini_dfresnet module", "wespeaker.models.pooling_layers module", "wespeaker.models.projections module", "wespeaker.models.redimnet module", "wespeaker.models.repvgg module", "wespeaker.models.res2net module", "wespeaker.models.resnet module", "wespeaker.models.speaker_model module", "wespeaker.models.tdnn module", "wespeaker.utils package", "wespeaker.utils.checkpoint module", "wespeaker.utils.embedding_processing module", "wespeaker.utils.executor module", "wespeaker.utils.executor_deprecated module", "wespeaker.utils.file_utils module", "wespeaker.utils.schedulers module", "wespeaker.utils.score_metrics module", "wespeaker.utils.utils module", "Python Package", "Reference", "<no title>", "Runtime for Wespeaker", "Speaker Recognition Papers", "How to train models?", "SV Tutorial on VoxCeleb v2 (Supervised)", "SV Tutorial on VoxCeleb v3 (Self-Supervised)", "Diarization Tutorial on VoxConverse v2"], "terms": {"thi": [0, 47, 50, 51, 52], "directori": [0, 44, 47, 50, 51], "includ": [0, 1, 51], "sv": [0, 2, 4, 49], "tutori": [0, 2, 49], "voxceleb": [0, 2, 4, 44, 47, 48, 49, 52], "v2": [0, 2, 47, 49, 51], "supervis": [0, 2, 48, 49], "v3": [0, 2, 49, 52], "self": [0, 2, 48, 49], "dino": [0, 51], "diariz": [0, 2, 4, 44, 49], "voxconvers": [0, 2, 4, 49], "suggest": [0, 1, 47, 50, 51, 52], "paper": [0, 2, 45, 50], "speaker": [0, 2, 3, 4, 7, 44, 45, 47], "embed": [0, 3, 4, 44, 47, 48], "learn": [0, 3, 48, 51], "provid": [0, 1, 4, 44, 50, 51, 52], "pretrain": [0, 2, 52], "model": [0, 1, 2, 6, 52], "off": 0, "shelf": 0, "usag": [0, 2], "from": [0, 4, 44, 47, 48, 50, 51, 52], "command": [0, 2, 47, 50, 51], "line": [0, 2, 50, 51, 52], "python": [0, 1, 2, 4, 45, 47, 50, 51], "code": [0, 2, 4], "how": [0, 1, 2, 47], "contribut": [0, 2], "ncmmsc": 0, "represent": 0, "slide": 0, "chines": [0, 44, 48], "handbook": 0, "introduct": [0, 51], "video": 0, "your": [1, 4, 47, 50, 51, 52], "interest": 1, "our": [1, 4, 47, 50, 51, 52], "commun": 1, "i": [1, 2, 4, 47, 50, 51], "open": [1, 50, 51, 52], "everyon": 1, "welcom": 1, "all": [1, 50, 51], "kind": [1, 47, 50, 51, 52], "matter": [1, 48], "small": [1, 50, 51], "larg": [1, 4, 48, 51], "There": 1, "ar": [1, 44, 50, 51, 52], "sever": [1, 47, 50, 51], "wai": [1, 47, 50, 51], "can": [1, 4, 44, 47, 50, 51, 52], "project": [1, 21, 47, 51], "identifi": 1, "ani": [1, 50, 51, 52], "bug": 1, "add": [1, 47, 51], "new": [1, 52], "implement": 1, "featur": [1, 47, 51], "howev": 1, "rememb": 1, "aren": 1, "t": [1, 44, 47], "just": [1, 50, 51], "about": [1, 4, 52], "we": [1, 4, 44, 47, 50, 51, 52], "believ": 1, "power": 1, "support": [1, 2], "thu": [1, 50, 51, 52], "answer": 1, "queri": 1, "assist": [1, 51], "other": [1, 44, 47, 50, 51], "enhanc": 1, "document": 1, "highli": 1, "regard": [1, 52], "benefici": 1, "final": [1, 47, 50, 51, 52], "one": [1, 47, 50, 51, 52], "most": [1, 51], "impact": 1, "u": [1, 47, 52], "rais": 1, "awar": 1, "talk": 1, "blog": 1, "post": 1, "highlight": 1, "": [1, 4, 47, 50, 51, 52], "drive": 1, "incred": 1, "If": [1, 44, 47, 50, 51, 52], "encount": 1, "have": [1, 47, 50, 51], "pleas": [1, 4, 44, 47, 50, 51, 52], "check": [1, 47, 50, 51, 52], "page": [1, 2], "first": [1, 47], "see": [1, 4], "someon": 1, "els": [1, 47, 50, 51], "ha": [1, 47, 50], "alreadi": 1, "file": [1, 4, 44, 47, 50, 51, 52], "much": 1, "relev": 1, "inform": [1, 52], "possibl": 1, "In": [1, 47, 48, 50, 51], "gener": [1, 47, 48, 51], "adher": 1, "googl": 1, "c": [1, 50, 51, 52], "when": [1, 47, 50, 51, 52], "submit": 1, "make": [1, 52], "sure": [1, 44, 47], "been": 1, "rebas": 1, "top": 1, "latest": [1, 47, 52], "commit": 1, "master": [1, 47, 52], "branch": 1, "ensur": [1, 51], "properli": 1, "format": [1, 4, 44, 50, 51, 52], "detail": [1, 44, 50, 51, 52], "descript": [1, 48], "chang": [1, 47], "explain": 1, "why": 1, "made": 1, "did": 1, "fix": [1, 47], "an": [1, 2, 44, 47, 51], "refer": [1, 2, 4, 51], "submiss": 1, "member": 1, "requir": [1, 4, 47, 51], "To": [1, 4, 48], "process": [1, 50, 51, 52], "smooth": 1, "keep": 1, "concis": 1, "involv": 1, "multipl": 1, "unrel": 1, "consid": 1, "split": [1, 52], "separ": [1, 50, 51, 52], "respond": 1, "comment": 1, "within": [1, 52], "reason": [1, 50, 51], "time": [1, 47, 50, 51, 52], "frame": [1, 47], "isn": 1, "clear": 1, "disagre": 1, "feel": [1, 50, 51, 52], "free": [1, 50, 51, 52], "ask": [1, 50, 51, 52], "clarif": 1, "discuss": 1, "take": [1, 47, 50, 51], "read": [1, 51, 52], "guidelin": 1, "great": 1, "tool": [1, 50, 51, 52], "research": [2, 3], "product": [2, 3, 47], "orient": [2, 3], "verif": [2, 48, 51], "recognit": [2, 45], "toolkit": [2, 3, 52], "packag": [2, 5], "instal": 2, "program": 2, "train": [2, 4, 44, 48, 52], "licens": 2, "onnx": [2, 47, 52], "infer": [2, 47, 50, 51], "demo": 2, "list": [2, 44, 50, 51], "runtim": [2, 4, 50, 51], "platform": 2, "onnxruntim": [2, 4], "server": 2, "tensorrt": 2, "gpu": [2, 50, 51, 52], "api": [2, 45], "issu": [2, 50, 51, 52], "report": 2, "style": [2, 44], "guid": 2, "pull": 2, "request": [2, 47], "review": 2, "thank": [2, 50, 51, 52], "you": [2, 4, 44, 47, 50, 51, 52], "index": [2, 52], "modul": [2, 7, 11, 19, 21, 35, 51, 52], "search": 2, "A": [3, 48], "accept": [3, 47], "icassp": 3, "2023": 3, "baselin": [3, 52], "voxsrc2023": [3, 52], "besid": [4, 50, 51], "relat": [4, 51], "task": [4, 44, 52], "util": [4, 6, 7], "mani": [4, 47, 50, 51], "which": [4, 44, 47, 50, 51, 52], "voic": [4, 52], "convers": 4, "text": [4, 48, 52], "speech": [4, 47, 52], "adapt": [4, 50], "asr": 4, "target": [4, 47, 50, 51], "extract": [4, 44, 47], "For": [4, 47, 48, 50, 51, 52], "user": [4, 50], "who": 4, "would": [4, 52], "like": [4, 47, 50, 51, 52], "verifi": 4, "perform": [4, 44, 47, 50, 51], "abov": [4, 44, 47, 50, 51], "without": 4, "troubl": 4, "learner": 4, "two": [4, 44, 47, 50, 51, 52], "type": [4, 52], "checkpoint": [4, 35, 47, 50, 51], "suffix": 4, "pt": [4, 44, 47, 50, 51], "save": [4, 44, 50, 52], "reproduc": 4, "publish": 4, "result": [4, 44, 47, 50, 51], "us": [4, 44, 47, 48, 50, 51, 52], "continu": [4, 50, 51], "export": 4, "The": [4, 47, 48, 50, 51, 52], "wenet": [4, 44, 47, 50, 51], "follow": [4, 44, 47, 50, 51, 52], "correspond": [4, 47, 51], "dataset": [4, 48, 50, 51, 52], "exampl": [4, 47, 50, 51, 52], "creativ": 4, "common": 4, "attribut": 4, "4": 4, "0": [4, 44, 50, 51, 52], "intern": 4, "sinc": [4, 47], "http": [4, 44, 47, 52], "mm": 4, "kaist": 4, "ac": [4, 52], "kr": 4, "pytorch": [4, 44, 50, 51], "directli": 4, "run": [4, 47, 50, 51, 52], "sh": [4, 47, 50, 51, 52], "recip": [4, 50, 51, 52], "As": 4, "toi": 4, "download": [4, 44], "onnx_path": 4, "wav_path": [4, 50, 51, 52], "path": [4, 44, 47, 50, 51, 52], "wave": 4, "16k": 4, "bin": [4, 47, 50, 51], "infer_onnx": 4, "py": [4, 47, 50, 51, 52], "easili": [4, 50, 51], "applic": [4, 47], "found": [4, 44, 47, 50, 51], "lm": [4, 50, 52], "mean": [4, 47, 52], "further": [4, 50], "fine": [4, 50], "tune": [4, 50], "margin": [4, 48], "could": [4, 50, 51, 52], "better": [4, 50, 52], "long": [4, 47, 50, 51], "audio": [4, 44, 47, 52], "e": [4, 44, 47, 50, 51], "g": [4, 47, 50, 51, 52], "3": 4, "languag": [4, 44, 48], "en": 4, "resnet34": [4, 47, 51, 52], "resnet34_lm": [4, 44], "resnet152_lm": 4, "resnet221_lm": [4, 44], "resnet293_lm": 4, "cam": 4, "_lm": 4, "ecapa512": 4, "ecapa512_lm": 4, "ecapa1024": 4, "ecapa1024_lm": 4, "gemini_dfresnet114_lm": 4, "cnceleb": [4, 44, 48], "cn": [4, 48], "wespeak": [5, 44, 45, 50, 51, 52], "subpackag": 5, "cli": 6, "submodul": 6, "diar": [6, 52], "frontend": 6, "hub": 7, "get_arg": [7, 10], "sourc": [10, 52], "extract_emb": [11, 52], "make_fbank": [11, 52], "make_oracle_sad": [11, 52], "make_rttm": [11, 52], "make_system_sad": [11, 52], "spectral_cluster": [11, 52], "umap_cluster": 11, "s3prl": 19, "campplu": [21, 44], "convert_repvgg": [21, 50], "ecapa_tdnn": 21, "eres2net": [21, 44], "gemini_dfresnet": 21, "pooling_lay": 21, "redimnet": 21, "repvgg": [21, 50], "res2net": 21, "resnet": [21, 47, 50, 51], "speaker_model": [21, 47], "tdnn": [21, 48], "embedding_process": 35, "executor": 35, "executor_deprec": 35, "file_util": 35, "schedul": 35, "score_metr": 35, "pip": 44, "git": [44, 52], "github": [44, 47, 50, 51, 52], "com": [44, 47, 52], "e2": [44, 47], "develop": 44, "clone": [44, 52], "cd": [44, 47, 50, 51, 52], "audio_fil": [44, 47], "wav": [44, 47, 50, 51, 52], "output_fil": [44, 50, 51], "txt": [44, 47], "embedding_kaldi": 44, "wav_scp": [44, 47, 51], "scp": [44, 47, 50, 51, 52], "similar": [44, 47, 50, 51], "audio_file2": 44, "audio2": 44, "devic": [44, 47, 52], "cuda": [44, 47, 52], "window": [44, 47], "linux": [44, 47], "mp": 44, "metal": 44, "shader": 44, "maco": [44, 47], "specifi": [44, 47, 50, 51, 52], "paramet": [44, 47, 50, 51], "h": 44, "five": 44, "now": [44, 47], "output": [44, 47, 52], "_": [44, 47, 50, 51, 52], "kaldi": [44, 47, 50, 51], "ark": [44, 47, 50, 51], "comput": [44, 47, 52], "rang": [44, 47], "1": 44, "appli": [44, 51], "input": [44, 47], "l": [44, 52], "english": 44, "p": [44, 47, 50, 52], "avg_model": [44, 47, 50, 51], "config": [44, 47, 50, 51], "yaml": [44, 47, 50, 51], "should": [44, 47, 50, 51, 52], "contain": [44, 50, 51, 52], "set": [44, 47, 52], "cpu": [44, 47, 52], "campplus_cn_common_200k": 44, "damo": 44, "res2net_cn_common_200k": 44, "file2": 44, "specif": 44, "each": [44, 50, 51, 52], "kei": [44, 51], "resample_r": [44, 51], "resampl": 44, "rate": [44, 47, 50, 51, 52], "default": [44, 50, 51, 52], "16000": [44, 47], "vad": 44, "true": [44, 47, 52], "differ": [44, 47, 50, 51, 52], "warn": 44, "want": [44, 47, 50, 51, 52], "link": 44, "renam": 44, "By": [44, 50, 51], "option": [44, 47, 52], "either": 44, "ones": [44, 47], "yourself": 44, "import": [44, 47, 51], "load_model": 44, "tensor": 44, "alloc": 44, "set_devic": 44, "extract_embed": 44, "utt_nam": 44, "extract_embedding_list": 44, "compute_similar": 44, "audio1": 44, "diar_result": 44, "give_this_utt_a_nam": 44, "regist": 44, "recogn": 44, "spk1": 44, "spk1_audio1": 44, "spk2": 44, "spk2_audio1": 44, "spk3": 44, "spk3_audio1": 44, "spk1_audio2": 44, "jinja2": 46, "nbsphinx": 46, "sphinx": 46, "recommonmark": 46, "markdown": 46, "tabl": 46, "rtd": 46, "theme": 46, "x86": 47, "android": 47, "come": 47, "ncnn": 47, "experi": 47, "blob": 47, "exp": [47, 50, 51, 52], "dir": [47, 50, 52], "onnx_dir": 47, "export_onnx": 47, "output_model": 47, "finish": [47, 50, 51, 52], "find": [47, 50, 51], "cmake": 47, "14": 47, "gcc": 47, "5": 47, "mkdir": [47, 50, 52], "donnx": 47, "ON": 47, "don": 47, "dgpu": 47, "note": [47, 50, 51, 52], "need": 47, "usr": 47, "local": [47, 50, 51, 52], "11": 47, "ld_library_path": 47, "lib64": 47, "rtf": 47, "real": [47, 51], "factor": 47, "shown": [47, 50], "consol": 47, "written": 47, "glog_logtostderr": 47, "glog_v": 47, "your_test_wav_scp": 47, "your_model_dir": 47, "embed_out": 47, "your_embedding_txt": 47, "extract_emb_main": 47, "speaker_model_path": 47, "embedding_s": 47, "256": 47, "samples_per_chunk": 47, "32000": 47, "sampl": [47, 51], "per": [47, 52], "chunk": [47, 51], "durat": [47, 48, 50, 52], "whole": [47, 50, 51, 52], "sentenc": [47, 51], "averag": [47, 50, 51], "calcul": [47, 50, 51], "asv_main": 47, "enroll_wav": 47, "wav1_path": 47, "test_wav": 47, "wav2_path": 47, "threshold": 47, "show": 47, "convert": [47, 50, 51, 52], "deploi": 47, "them": 47, "triton": 47, "onli": [47, 51, 52], "instead": 47, "mai": 47, "well": [47, 52], "after": [47, 50, 51, 52], "get": [47, 50, 51, 52], "under": [47, 50, 51, 52], "xxx": 47, "folder": 47, "ll": 47, "go": [47, 50, 51, 52], "exp_dir": [47, 50, 51], "python3": [47, 52], "minu": [47, 52], "vector": [47, 48], "simpli": [47, 52], "mean_vec": 47, "npy": 47, "vox2_dev": [47, 50, 51], "skip": [47, 51], "part": 47, "section": [47, 52], "repositori": 47, "let": 47, "22": 47, "03": 47, "therefor": 47, "here": [47, 50, 51, 52], "docker": 47, "move": 47, "v": 47, "nvcr": 47, "io": [47, 50, 51], "nvidia": 47, "py3": 47, "shape": 47, "bxtxf": 47, "batchsiz": 47, "sequence_length": 47, "feature_s": 47, "trtexec": 47, "saveengin": 47, "b1_b128_s3000_fp16": 47, "trt": 47, "minshap": 47, "feat": 47, "1x200x80": 47, "optshap": 47, "64x200x80": 47, "maxshap": 47, "128x3000x80": 47, "fp16": 47, "maximum": 47, "sequenc": 47, "length": [47, 52], "3000": 47, "minimum": 47, "200": 47, "stride": 47, "10m": 47, "02": 47, "second": [47, 52], "30": 47, "respect": 47, "extractor": 47, "notic": 47, "number": [47, 50, 51], "depend": [47, 48], "ve": 47, "ad": [47, 51], "pratic": 47, "affect": 47, "accuraci": 47, "improv": [47, 50, 51], "same": [47, 50, 51], "know": 47, "idea": 47, "try": [47, 52], "below": 47, "script": [47, 50, 51], "torchaudio": 47, "complianc": 47, "torch": 47, "audio_dur_in_second": 47, "feat_dim": 47, "80": 47, "dont": 47, "sample_r": 47, "waveform": [47, 48], "unsqueez": 47, "feat_tensor": 47, "fbank": 47, "num_mel_bin": 47, "frame_shift": [47, 51, 52], "10": [47, 51, 52], "frame_length": [47, 51], "25": [47, 51, 52], "energy_floor": 47, "window_typ": 47, "ham": 47, "htk_compat": 47, "use_energi": 47, "fals": [47, 52], "dither": 47, "print": [47, 50, 51, 52], "198": 47, "Then": [47, 50, 51], "actual": 47, "That": 47, "segment": [47, 50, 51, 52], "edit": 47, "model_repo": 47, "pbtxt": 47, "replac": 47, "default_model_filenam": 47, "name": [47, 50, 51, 52], "put": [47, 50, 51], "And": [47, 48, 51], "ecapa": [47, 48, 50, 51], "dim": 47, "192": 47, "also": [47, 50, 51, 52], "backend": 47, "kind_gpu": 47, "kind_cpu": 47, "dockerfil": 47, "Be": 47, "version": [47, 52], "f": [47, 50, 51, 52], "network": [47, 48], "host": 47, "pwd": [47, 52], "w": 47, "shm": 47, "size": [47, 50], "1g": 47, "ulimit": 47, "memlock": 47, "8000": 47, "8001": 47, "8002": 47, "stack": 47, "67108864": 47, "ti": 47, "tritonserv": 47, "port": 47, "grpc": 47, "wespeaker_cli": 47, "data": 47, "url": 47, "ip": 47, "wavscp": 47, "raid": 47, "dgxsa": 47, "slyne": 47, "vox1": [47, 50, 51], "output_directori": 47, "direcotri": 47, "someth": [47, 51], "xvector_000": 47, "xvextor_000": 47, "xvector_001": 47, "cat": [47, 52], "xvector_": 47, "xvector": 47, "conf": [47, 50, 51], "trials_dir": 47, "trial": [47, 50, 51], "eval_scp_path": 47, "x86_gpu": 47, "cal_mean": 47, "cal_mean_dir": 47, "p_target": 47, "01": 47, "c_miss": 47, "c_fa": 47, "vox1_o_clean": [47, 50, 51], "vox1_e_clean": [47, 50, 51], "vox1_h_clean": [47, 50, 51], "tee": [47, 52], "vox1_cos_result": [47, 50, 51], "resnet_b1_b128_s200_fp16": 47, "resnet_avg_model": 47, "128x200x80": 47, "ecapa_b1_b128_s200_fp16": 47, "ecapa_avg_model": 47, "t4": 47, "throughput": 47, "bz": 47, "64": 47, "utter": [47, 48, 51], "39": 47, "7842": 47, "2546": 47, "52": 47, "958": 47, "3389": 47, "generate_input": 47, "perf_analyz": 47, "m": 47, "b": [47, 51], "concurr": 47, "1000": [47, 50, 51], "json": 47, "localhost": 47, "conccur": 47, "avg": 47, "latenc": 47, "p99": 47, "2033": 47, "98": 47, "111": 47, "400": [47, 52], "2010": 47, "208": 47, "2647": 47, "75": [47, 52], "2726": 47, "147": 47, "172": 47, "scale": [48, 50, 51], "identif": 48, "voxceleb2": [48, 50, 51], "deep": 48, "wild": 48, "celeb": 48, "challeng": 48, "multi": [48, 50, 51], "genr": 48, "architectur": 48, "design": 48, "x": [48, 52], "robust": 48, "dnn": 48, "But": 48, "system": [48, 50, 51, 52], "2019": 48, "r": [48, 52], "rawnet": 48, "advanc": 48, "end": [48, 50, 51], "neural": 48, "raw": [48, 50, 51], "independ": 48, "With": 48, "sincnet": 48, "emphas": 48, "channel": [48, 52], "attent": 48, "propag": 48, "aggreg": 48, "base": [48, 50, 51], "optim": [48, 50, 51], "object": 48, "classif": 48, "loss": [48, 50, 51], "explor": 48, "encod": 48, "layer": 48, "function": [48, 50, 51], "angular": 48, "softmax": 48, "short": [48, 51, 52], "ensembl": 48, "addit": [48, 51], "toward": 48, "more": [48, 51, 52], "discrimin": 48, "triplet": 48, "On": 48, "pool": 48, "method": [48, 51], "statist": [48, 50], "resolut": 48, "head": [48, 51, 52], "level": 48, "novel": 48, "learnabl": 48, "dictionari": 48, "augment": [48, 50, 51], "adversari": 48, "prototyp": 48, "momentum": 48, "contrast": [48, 51], "gate": 48, "meet": [50, 51, 52], "problem": [50, 51, 52], "through": [50, 51, 52], "feedback": [50, 51, 52], "simpl": [50, 51], "manual": [50, 51], "understand": [50, 51, 52], "bash": [50, 51, 52], "stop_stag": [50, 51, 52], "le": [50, 51, 52], "ge": [50, 51, 52], "echo": [50, 51, 52], "prepar": [50, 51], "prepare_data": [50, 51], "fi": [50, 51, 52], "voxceleb1": [50, 51], "musan": [50, 51], "rir": [50, 51], "nois": [50, 51], "reverber": [50, 51], "It": [50, 51], "start": [50, 51], "becaus": [50, 51], "recommand": [50, 51], "archiv": [50, 51, 52], "own": [50, 51], "download_data": [50, 51], "meta": [50, 51], "utt2spk": [50, 51], "spk2utt": [50, 51], "dev": [50, 51, 52], "record": [50, 51, 52], "blank": [50, 51, 52], "column": [50, 51, 52], "wav_id": [50, 51, 52], "id10001": [50, 51], "1zciwhmdeo4": [50, 51], "00001": [50, 51], "voxceleb1_wav_v2": [50, 51], "00002": [50, 51], "spk_id": [50, 51, 52], "belong": [50, 51], "00003": [50, 51], "id10002": [50, 51], "0_laien": [50, 51], "q44": [50, 51], "6wo410qoeuo": [50, 51], "three": [50, 51, 52], "enroll_wav_id": [50, 51], "test_wav_id": [50, 51], "label": [50, 51], "y8hivobuel": [50, 51], "id10943": [50, 51], "vncvj7ylwpu": [50, 51], "00005": [50, 51], "nontarget": [50, 51], "7w0ibewc9qw": [50, 51], "00004": [50, 51], "id10999": [50, 51], "g5r2": [50, 51], "hl7yx8": [50, 51], "00008": [50, 51], "covert": [50, 51], "test": [50, 51, 52], "data_typ": [50, 51], "dset": [50, 51], "do": [50, 51, 52], "shard": [50, 51], "make_shard_list": [50, 51], "num_utts_per_shard": [50, 51], "num_thread": [50, 51], "16": [50, 51], "prefix": [50, 51], "shuffl": [50, 51], "make_raw_list": [50, 51], "done": [50, 51, 52], "lmdb": [50, 51], "make_lmdb": [50, 51], "million": [50, 51], "frequent": [50, 51], "caus": [50, 51], "bottleneck": [50, 51], "restor": [50, 51], "some": [50, 51], "binari": [50, 51], "store": [50, 51, 52], "fastli": [50, 51], "random": [50, 51], "access": [50, 51], "num_gpu": [50, 51], "awk": [50, 51, 52], "nf": [50, 51, 52], "torchrun": [50, 51], "standalon": [50, 51], "nnode": [50, 51], "nproc_per_nod": [50, 51], "num_avg": [50, 51], "train_data": [50, 51], "train_label": 50, "reverb_data": [50, 51], "noise_data": [50, 51], "nn": [50, 51], "step": [50, 51, 52], "mode": [50, 51], "ddp": [50, 51], "id": [50, 51, 52], "variabl": [50, 51], "idx": [50, 51], "initi": [50, 51], "randomli": [50, 51], "pre": [50, 51, 52], "weight": [50, 51], "model_init": [50, 51], "param": [50, 51], "resum": [50, 51], "termin": [50, 51], "epoch": [50, 51], "accident": [50, 51], "peopl": [50, 51], "out": [50, 51], "memori": [50, 51], "your_exp": [50, 51], "n": [50, 51, 52], "structur": [50, 51], "configur": [50, 51], "et": [50, 51], "al": [50, 51], "average_model": 50, "dst_model": [50, 51], "src_path": [50, 51], "num": [50, 51, 52], "model_path": [50, 51], "load": 50, "convert_model": 50, "extract_vox": [50, 51], "nj": [50, 51, 52], "last": [50, 51], "ensambl": [50, 51], "strategi": [50, 51], "forward": 50, "paradigm": 50, "somet": 50, "wrong": [50, 51, 52], "happen": [50, 51], "log": [50, 51], "stop": [50, 51], "trail": [50, 51], "cosin": [50, 51], "pair": [50, 51], "At": [50, 51], "equal": [50, 51], "error": [50, 51, 52], "eer": [50, 51], "mindcf": [50, 51], "trial_xx": [50, 51], "norm": 50, "score_norm": 50, "score_norm_method": 50, "cohort_set": 50, "top_n": 50, "adapta": 50, "normal": [50, 51, 52], "vox1_": 50, "_result": 50, "asnorm": [50, 51], "snorm": 50, "algorithm": [50, 51], "neg": 50, "cohort": 50, "best": [50, 51], "export_jit": [50, 51], "avg_": 50, "average_num": 50, "zip": [50, 51, 52], "libtorch": [50, 51], "lm_exp_dir": 50, "cp": 50, "model_0": 50, "lm_config": 50, "anoth": 50, "few": 50, "increas": 50, "enlarg": 50, "thei": 51, "current": 51, "commonli": 51, "framework": 51, "readm": 51, "md": [51, 52], "simclr": 51, "moco": 51, "basic": 51, "processs": 51, "exactli": 51, "befor": 51, "crucial": 51, "strongli": 51, "recommend": [51, 52], "incorpor": 51, "dure": 51, "ssl": 51, "train_dino": 51, "train_contrast": 51, "biggest": 51, "compar": 51, "organ": 51, "cannot": 51, "construct": 51, "assumpt": 51, "crop": 51, "suerpervis": 51, "dataload": 51, "defin": 51, "next": 51, "briefli": 51, "introduc": [51, 52], "firstli": 51, "storag": 51, "global": 51, "datalist": 51, "processor": 51, "url_open": 51, "tar_file_and_group": 51, "elif": 51, "parse_raw": 51, "parse_feat": 51, "shuffle_arg": 51, "composit": 51, "form": 51, "posit": 51, "fbank_arg": 51, "chunk_info_arg": 51, "chunk_len": 51, "ssl_processor": 51, "random_chunk_for_dino": 51, "veri": 51, "reverb": 51, "aug_prob": 51, "reverb_lmdb_fil": 51, "noise_lmdb_fil": 51, "lmdbdata": 51, "add_reverb_nois": 51, "notabl": 51, "facilit": 51, "effortless": 51, "divers": 51, "pipelin": 51, "both": 51, "effici": 51, "eas": 51, "extens": [51, 52], "order": 51, "compat": 51, "exist": 51, "wrapper": 51, "definit": 51, "so": 51, "average_dino_model": 51, "average_contrastive_model": 51, "student": 51, "necessari": 51, "remov": [51, 52], "subsequ": 51, "unlik": 51, "theoret": 51, "typic": 52, "downstream": 52, "learnt": 52, "2020": 52, "v1": 52, "Their": 52, "newcom": 52, "external_tool": 52, "wget": 52, "usnistgov": 52, "sctk": 52, "ref": 52, "tag": 52, "12": 52, "o": 52, "unzip": 52, "d": 52, "activ": 52, "detect": 52, "silero": 52, "team": 52, "snakers4": 52, "pretrained_model": 52, "1256283475": 52, "co": 52, "ap": 52, "shanghai": 52, "myqcloud": 52, "voxceleb_resnet34_lm": 52, "metric": 52, "silenc": 52, "extern": 52, "annot": 52, "joonson": 52, "voxconverse_mast": 52, "voxsrc": 52, "23": 52, "valid": 52, "look": 52, "jaesunghuh": 52, "recurs": 52, "certif": 52, "www": 52, "robot": 52, "ox": 52, "uk": 52, "vgg": 52, "voxconverse_dev_wav": 52, "creat": 52, "substr": 52, "voxconverse_test_wav": 52, "seem": 52, "repo": 52, "ground": 52, "truth": 52, "abjxc": 52, "afjiv": 52, "min": 52, "min_dur": 52, "255": 52, "sad_typ": 52, "xoracl": 52, "oracl": 52, "handl": 52, "overlap": 52, "too": 52, "region": 52, "while": 52, "utt": 52, "partit": 52, "oracle_sad": 52, "xsystem": 52, "system_sad": 52, "info": 52, "where": 52, "less": 52, "than": 52, "ignor": 52, "_sad_fbank": 52, "rm": 52, "_sad": 52, "store_dir": 52, "subseg_cmn": 52, "24": 52, "cepstral": 52, "cmn": 52, "sub": 52, "job": 52, "accord": 52, "core": 52, "_sad_embed": 52, "batch_siz": 52, "96": 52, "window_sec": 52, "period_sec": 52, "fashion": 52, "everi": 52, "contigu": 52, "decid": 52, "spectral_clust": 52, "_sad_label": 52, "emb": 52, "subseg_id": 52, "00000400": 52, "00007040": 52, "00000000": 52, "00000150": 52, "00000075": 52, "00000225": 52, "00000300": 52, "00000375": 52, "_sad_rttm": 52, "rich": 52, "transcript": 52, "mark": 52, "space": 52, "delimit": 52, "turn": 52, "ten": 52, "field": 52, "alwai": 52, "basenam": 52, "onset": 52, "begin": 52, "orthographi": 52, "na": 52, "uniqu": 52, "scope": 52, "confid": 52, "score": 52, "probabl": 52, "correct": 52, "signal": 52, "lookahead": 52, "instanc": 52, "640": 52, "680": 52, "55": 52, "960": 52, "ref_dir": 52, "perl": 52, "src": 52, "eval": 52, "pl": 52, "_sad_r": 52, "get_each_file_r": 52, "eq": 52, "single_file_res_dir": 52, "_single_file_r": 52, "nget": 52, "underd": 52, "sort": 52, "file_nam": 52, "grep": 52, "_re": 52, "sum": 52, "percentag": 52, "assign": 52, "alarm": 52, "nonspeech": 52, "incorrectli": 52, "miss": 52, "consult": 52, "nist": 52, "rt": 52, "09": 52, "plan": 52, "overal": 52, "singl": 52}, "objects": {"wespeaker": [[7, 0, 0, "-", "cli"]], "wespeaker.cli": [[10, 0, 0, "-", "utils"]], "wespeaker.cli.utils": [[10, 1, 1, "", "get_args"]]}, "objtypes": {"0": "py:module", "1": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "function", "Python function"]}, "titleterms": {"basic": 0, "document": [0, 2], "wespeak": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 47], "todo": 0, "list": [0, 4], "possibl": 0, "contribut": 1, "issu": 1, "report": 1, "code": 1, "style": 1, "guid": 1, "pull": 1, "request": 1, "review": 1, "thank": 1, "you": 1, "welcom": 2, "": 2, "content": [2, 45, 49], "indic": 2, "tabl": 2, "paper": [3, 48], "pretrain": [4, 44], "model": [4, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 44, 47, 49, 50, 51], "licens": 4, "onnx": 4, "infer": 4, "demo": 4, "modelscop": 4, "huggingfac": 4, "python": [5, 44], "api": 5, "refer": [5, 45], "packag": [6, 7, 11, 19, 21, 35, 44], "subpackag": 6, "cli": [7, 8, 9, 10], "submodul": [7, 11, 19, 21, 35], "hub": 8, "modul": [8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40, 41, 42, 43], "speaker": [9, 48, 50, 51, 52], "util": [10, 35, 36, 37, 38, 39, 40, 41, 42, 43], "diar": [11, 12, 13, 14, 15, 16, 17, 18], "extract_emb": 12, "make_fbank": 13, "make_oracle_sad": 14, "make_rttm": 15, "make_system_sad": 16, "spectral_cluster": 17, "umap_cluster": 18, "frontend": [19, 20], "s3prl": 20, "campplu": 22, "convert_repvgg": 23, "ecapa_tdnn": 24, "eres2net": 25, "gemini_dfresnet": 26, "pooling_lay": 27, "project": 28, "redimnet": 29, "repvgg": 30, "res2net": 31, "resnet": 32, "speaker_model": 33, "tdnn": 34, "checkpoint": 36, "embedding_process": 37, "executor": 38, "executor_deprec": 39, "file_util": 40, "schedul": 41, "score_metr": 42, "instal": 44, "command": 44, "line": 44, "usag": 44, "support": [44, 47], "program": 44, "runtim": 47, "platform": 47, "onnxruntim": 47, "server": 47, "tensorrt": 47, "gpu": 47, "introduct": 47, "step": 47, "0": 47, "train": [47, 49, 50, 51], "1": [47, 50, 51, 52], "export": [47, 50, 51], "engin": 47, "construct": 47, "repo": 47, "2": [47, 50, 51, 52], "build": 47, "start": 47, "3": [47, 50, 51, 52], "client": 47, "4": [47, 50, 51, 52], "test": 47, "score": [47, 50, 51], "perf": 47, "pipelin": 47, "recognit": 48, "how": 49, "sv": [50, 51], "tutori": [50, 51, 52], "voxceleb": [50, 51], "v2": [50, 52], "supervis": [50, 51], "first": [50, 51, 52], "experi": [50, 51, 52], "stage": [50, 51, 52], "download": [50, 51, 52], "data": [50, 51, 52], "reformat": [50, 51, 52], "neural": [50, 51], "network": [50, 51], "embed": [50, 51, 52], "extract": [50, 51, 52], "5": [50, 51, 52], "evalu": [50, 51, 52], "set": [50, 51], "6": [50, 51, 52], "7": [50, 52], "option": [50, 51], "8": [50, 52], "larg": 50, "margin": 50, "finetun": 50, "v3": 51, "self": 51, "diariz": 52, "voxconvers": 52, "prerequisit": 52, "prepar": 52, "appli": 52, "sad": 52, "i": 52, "e": 52, "vad": 52, "fbank": 52, "featur": 52, "slide": 52, "window": 52, "spectral": 52, "cluster": 52, "label": 52, "rttm": 52, "result": 52, "der": 52}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "nbsphinx": 4, "sphinx.ext.viewcode": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"Basic Documents for WeSpeaker": [[0, "basic-documents-for-wespeaker"]], "ToDo List (possible)": [[0, "todo-list-possible"]], "Contributing to Wespeaker": [[1, "contributing-to-wespeaker"]], "Issue Reporting": [[1, "issue-reporting"]], "Coding Style Guide": [[1, "coding-style-guide"]], "Pull Requests": [[1, "pull-requests"]], "Code Reviews": [[1, "code-reviews"]], "Thank You": [[1, "thank-you"]], "Welcome to Wespeaker\u2019s documentation!": [[2, "welcome-to-wespeaker-s-documentation"]], "Contents:": [[2, null], [45, null], [49, null]], "Indices and tables": [[2, "indices-and-tables"]], "Wespeaker Papers": [[3, "wespeaker-papers"]], "Pretrained Models in Wespeaker": [[4, "pretrained-models-in-wespeaker"]], "Model License": [[4, "model-license"]], "Onnx Inference Demo": [[4, "onnx-inference-demo"]], "Model List": [[4, "model-list"]], "modelscope": [[4, "modelscope"]], "huggingface": [[4, "huggingface"]], "Python API Reference": [[5, "python-api-reference"]], "wespeaker package": [[6, "wespeaker-package"]], "Subpackages": [[6, "subpackages"]], "wespeaker.cli package": [[7, "module-wespeaker.cli"]], "Submodules": [[7, "submodules"], [11, "submodules"], [19, "submodules"], [21, "submodules"], [35, "submodules"]], "wespeaker.cli.hub module": [[8, "wespeaker-cli-hub-module"]], "wespeaker.cli.speaker module": [[9, "wespeaker-cli-speaker-module"]], "wespeaker.cli.utils module": [[10, "module-wespeaker.cli.utils"]], "wespeaker.diar package": [[11, "wespeaker-diar-package"]], "wespeaker.diar.extract_emb module": [[12, "wespeaker-diar-extract-emb-module"]], "wespeaker.diar.make_fbank module": [[13, "wespeaker-diar-make-fbank-module"]], "wespeaker.diar.make_oracle_sad module": [[14, "wespeaker-diar-make-oracle-sad-module"]], "wespeaker.diar.make_rttm module": [[15, "wespeaker-diar-make-rttm-module"]], "wespeaker.diar.make_system_sad module": [[16, "wespeaker-diar-make-system-sad-module"]], "wespeaker.diar.spectral_clusterer module": [[17, "wespeaker-diar-spectral-clusterer-module"]], "wespeaker.diar.umap_clusterer module": [[18, "wespeaker-diar-umap-clusterer-module"]], "wespeaker.frontend package": [[19, "wespeaker-frontend-package"]], "wespeaker.frontend.s3prl module": [[20, "wespeaker-frontend-s3prl-module"]], "wespeaker.models package": [[21, "wespeaker-models-package"]], "wespeaker.models.campplus module": [[22, "wespeaker-models-campplus-module"]], "wespeaker.models.convert_repvgg module": [[23, "wespeaker-models-convert-repvgg-module"]], "wespeaker.models.ecapa_tdnn module": [[24, "wespeaker-models-ecapa-tdnn-module"]], "wespeaker.models.eres2net module": [[25, "wespeaker-models-eres2net-module"]], "wespeaker.models.gemini_dfresnet module": [[26, "wespeaker-models-gemini-dfresnet-module"]], "wespeaker.models.pooling_layers module": [[27, "wespeaker-models-pooling-layers-module"]], "wespeaker.models.projections module": [[28, "wespeaker-models-projections-module"]], "wespeaker.models.redimnet module": [[29, "wespeaker-models-redimnet-module"]], "wespeaker.models.repvgg module": [[30, "wespeaker-models-repvgg-module"]], "wespeaker.models.res2net module": [[31, "wespeaker-models-res2net-module"]], "wespeaker.models.resnet module": [[32, "wespeaker-models-resnet-module"]], "wespeaker.models.speaker_model module": [[33, "wespeaker-models-speaker-model-module"]], "wespeaker.models.tdnn module": [[34, "wespeaker-models-tdnn-module"]], "wespeaker.utils package": [[35, "wespeaker-utils-package"]], "wespeaker.utils.checkpoint module": [[36, "wespeaker-utils-checkpoint-module"]], "wespeaker.utils.embedding_processing module": [[37, "wespeaker-utils-embedding-processing-module"]], "wespeaker.utils.executor module": [[38, "wespeaker-utils-executor-module"]], "wespeaker.utils.executor_deprecated module": [[39, "wespeaker-utils-executor-deprecated-module"]], "wespeaker.utils.file_utils module": [[40, "wespeaker-utils-file-utils-module"]], "wespeaker.utils.schedulers module": [[41, "wespeaker-utils-schedulers-module"]], "wespeaker.utils.score_metrics module": [[42, "wespeaker-utils-score-metrics-module"]], "wespeaker.utils.utils module": [[43, "wespeaker-utils-utils-module"]], "Python Package": [[44, "python-package"]], "Install": [[44, "install"]], "Command Line Usage": [[44, "command-line-usage"]], "Pretrained model support": [[44, "pretrained-model-support"]], "Python Programming Usage": [[44, "python-programming-usage"]], "Reference": [[45, "reference"]], "Runtime for Wespeaker": [[47, "runtime-for-wespeaker"]], "Platforms Supported": [[47, "platforms-supported"]], "Onnxruntime": [[47, "onnxruntime"]], "Server (tensorrt gpu)": [[47, "server-tensorrt-gpu"]], "Introduction": [[47, "introduction"]], "Step 0. Train a model": [[47, "step-0-train-a-model"]], "Step 1. Export model": [[47, "step-1-export-model"]], "Export to Tensorrt Engine": [[47, "export-to-tensorrt-engine"]], "Construct Model Repo": [[47, "construct-model-repo"]], "Step 2. Build server and start server": [[47, "step-2-build-server-and-start-server"]], "Step 3. Build client and start client": [[47, "step-3-build-client-and-start-client"]], "Step 4. Test score": [[47, "step-4-test-score"]], "Perf": [[47, "perf"]], "Pipeline Perf": [[47, "pipeline-perf"]], "Speaker Recognition Papers": [[48, "speaker-recognition-papers"]], "How to train models?": [[49, "how-to-train-models"]], "SV Tutorial on VoxCeleb v2 (Supervised)": [[50, "sv-tutorial-on-voxceleb-v2-supervised"]], "First Experiment": [[50, "first-experiment"], [51, "first-experiment"], [52, "first-experiment"]], "Stage 1: Download Data": [[50, "stage-1-download-data"], [51, "stage-1-download-data"]], "Stage 2: Reformat the Data": [[50, "stage-2-reformat-the-data"], [51, "stage-2-reformat-the-data"]], "Stage 3: Neural Network training": [[50, "stage-3-neural-network-training"], [51, "stage-3-neural-network-training"]], "Stage 4: Speaker Embedding Extraction": [[50, "stage-4-speaker-embedding-extraction"], [51, "stage-4-speaker-embedding-extraction"]], "Stage 5: Scoring the Evaluation Set": [[50, "stage-5-scoring-the-evaluation-set"], [51, "stage-5-scoring-the-evaluation-set"]], "Stage 6: Scoring the Evaluation Set": [[50, "stage-6-scoring-the-evaluation-set"]], "Stage 7(Optional): Export the trained model": [[50, "stage-7-optional-export-the-trained-model"]], "Stage 8(Optional): Large Margin Finetuning": [[50, "stage-8-optional-large-margin-finetuning"]], "SV Tutorial on VoxCeleb v3 (Self-Supervised)": [[51, "sv-tutorial-on-voxceleb-v3-self-supervised"]], "Stage 6(Optional): Export the trained model": [[51, "stage-6-optional-export-the-trained-model"]], "Diarization Tutorial on VoxConverse v2": [[52, "diarization-tutorial-on-voxconverse-v2"]], "Stage 1: Download Prerequisites": [[52, "stage-1-download-prerequisites"]], "Stage 2: Download and Prepare Data": [[52, "stage-2-download-and-prepare-data"]], "Stage 3: Apply SAD (i.e., VAD)": [[52, "stage-3-apply-sad-i-e-vad"]], "Stage 4: Extract Fbank Features": [[52, "stage-4-extract-fbank-features"]], "Stage 5: Extract Sliding-window Speaker Embeddings": [[52, "stage-5-extract-sliding-window-speaker-embeddings"]], "Stage 6: Apply Spectral Clustering": [[52, "stage-6-apply-spectral-clustering"]], "Stage 7: Reformat Clustering Labels into RTTMs": [[52, "stage-7-reformat-clustering-labels-into-rttms"]], "Stage 8: Evaluate the Result (DER)": [[52, "stage-8-evaluate-the-result-der"]]}, "indexentries": {"module": [[7, "module-wespeaker.cli"], [10, "module-wespeaker.cli.utils"]], "wespeaker.cli": [[7, "module-wespeaker.cli"]], "get_args() (in module wespeaker.cli.utils)": [[10, "wespeaker.cli.utils.get_args"]], "wespeaker.cli.utils": [[10, "module-wespeaker.cli.utils"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["README", "contribute", "index", "paper", "pretrained", "python_api/modules", "python_api/wespeaker", "python_api/wespeaker.cli", "python_api/wespeaker.cli.hub", "python_api/wespeaker.cli.speaker", "python_api/wespeaker.cli.utils", "python_api/wespeaker.diar", "python_api/wespeaker.diar.extract_emb", "python_api/wespeaker.diar.make_fbank", "python_api/wespeaker.diar.make_oracle_sad", "python_api/wespeaker.diar.make_rttm", "python_api/wespeaker.diar.make_system_sad", "python_api/wespeaker.diar.spectral_clusterer", "python_api/wespeaker.diar.umap_clusterer", "python_api/wespeaker.frontend", "python_api/wespeaker.frontend.s3prl", "python_api/wespeaker.frontend.whisper_encoder", "python_api/wespeaker.models", "python_api/wespeaker.models.campplus", "python_api/wespeaker.models.convert_repvgg", "python_api/wespeaker.models.ecapa_tdnn", "python_api/wespeaker.models.eres2net", "python_api/wespeaker.models.gemini_dfresnet", "python_api/wespeaker.models.pooling_layers", "python_api/wespeaker.models.projections", "python_api/wespeaker.models.redimnet", "python_api/wespeaker.models.repvgg", "python_api/wespeaker.models.res2net", "python_api/wespeaker.models.resnet", "python_api/wespeaker.models.speaker_model", "python_api/wespeaker.models.tdnn", "python_api/wespeaker.models.whisper_PMFA", "python_api/wespeaker.utils", "python_api/wespeaker.utils.checkpoint", "python_api/wespeaker.utils.embedding_processing", "python_api/wespeaker.utils.executor", "python_api/wespeaker.utils.executor_deprecated", "python_api/wespeaker.utils.file_utils", "python_api/wespeaker.utils.schedulers", "python_api/wespeaker.utils.score_metrics", "python_api/wespeaker.utils.utils", "python_package", "reference", "requirements", "runtime", "speaker_recognition_papers", "train", "vox", "vox_ssl", "voxconverse_diar"], "filenames": ["README.md", "contribute.md", "index.rst", "paper.md", "pretrained.md", "python_api/modules.rst", "python_api/wespeaker.rst", "python_api/wespeaker.cli.rst", "python_api/wespeaker.cli.hub.rst", "python_api/wespeaker.cli.speaker.rst", "python_api/wespeaker.cli.utils.rst", "python_api/wespeaker.diar.rst", "python_api/wespeaker.diar.extract_emb.rst", "python_api/wespeaker.diar.make_fbank.rst", "python_api/wespeaker.diar.make_oracle_sad.rst", "python_api/wespeaker.diar.make_rttm.rst", "python_api/wespeaker.diar.make_system_sad.rst", "python_api/wespeaker.diar.spectral_clusterer.rst", "python_api/wespeaker.diar.umap_clusterer.rst", "python_api/wespeaker.frontend.rst", "python_api/wespeaker.frontend.s3prl.rst", "python_api/wespeaker.frontend.whisper_encoder.rst", "python_api/wespeaker.models.rst", "python_api/wespeaker.models.campplus.rst", "python_api/wespeaker.models.convert_repvgg.rst", "python_api/wespeaker.models.ecapa_tdnn.rst", "python_api/wespeaker.models.eres2net.rst", "python_api/wespeaker.models.gemini_dfresnet.rst", "python_api/wespeaker.models.pooling_layers.rst", "python_api/wespeaker.models.projections.rst", "python_api/wespeaker.models.redimnet.rst", "python_api/wespeaker.models.repvgg.rst", "python_api/wespeaker.models.res2net.rst", "python_api/wespeaker.models.resnet.rst", "python_api/wespeaker.models.speaker_model.rst", "python_api/wespeaker.models.tdnn.rst", "python_api/wespeaker.models.whisper_PMFA.rst", "python_api/wespeaker.utils.rst", "python_api/wespeaker.utils.checkpoint.rst", "python_api/wespeaker.utils.embedding_processing.rst", "python_api/wespeaker.utils.executor.rst", "python_api/wespeaker.utils.executor_deprecated.rst", "python_api/wespeaker.utils.file_utils.rst", "python_api/wespeaker.utils.schedulers.rst", "python_api/wespeaker.utils.score_metrics.rst", "python_api/wespeaker.utils.utils.rst", "python_package.md", "reference.rst", "requirements.txt", "runtime.md", "speaker_recognition_papers.md", "train.rst", "vox.md", "vox_ssl.md", "voxconverse_diar.md"], "titles": ["Basic Documents for WeSpeaker", "Contributing to Wespeaker", "Welcome to Wespeaker\u2019s documentation!", "Wespeaker Papers", "Pretrained Models in Wespeaker", "Python API Reference", "wespeaker package", "wespeaker.cli package", "wespeaker.cli.hub module", "wespeaker.cli.speaker module", "wespeaker.cli.utils module", "wespeaker.diar package", "wespeaker.diar.extract_emb module", "wespeaker.diar.make_fbank module", "wespeaker.diar.make_oracle_sad module", "wespeaker.diar.make_rttm module", "wespeaker.diar.make_system_sad module", "wespeaker.diar.spectral_clusterer module", "wespeaker.diar.umap_clusterer module", "wespeaker.frontend package", "wespeaker.frontend.s3prl module", "wespeaker.frontend.whisper_encoder module", "wespeaker.models package", "wespeaker.models.campplus module", "wespeaker.models.convert_repvgg module", "wespeaker.models.ecapa_tdnn module", "wespeaker.models.eres2net module", "wespeaker.models.gemini_dfresnet module", "wespeaker.models.pooling_layers module", "wespeaker.models.projections module", "wespeaker.models.redimnet module", "wespeaker.models.repvgg module", "wespeaker.models.res2net module", "wespeaker.models.resnet module", "wespeaker.models.speaker_model module", "wespeaker.models.tdnn module", "wespeaker.models.whisper_PMFA module", "wespeaker.utils package", "wespeaker.utils.checkpoint module", "wespeaker.utils.embedding_processing module", "wespeaker.utils.executor module", "wespeaker.utils.executor_deprecated module", "wespeaker.utils.file_utils module", "wespeaker.utils.schedulers module", "wespeaker.utils.score_metrics module", "wespeaker.utils.utils module", "Python Package", "Reference", "<no title>", "Runtime for Wespeaker", "Speaker Recognition Papers", "How to train models?", "SV Tutorial on VoxCeleb v2 (Supervised)", "SV Tutorial on VoxCeleb v3 (Self-Supervised)", "Diarization Tutorial on VoxConverse v2"], "terms": {"thi": [0, 49, 52, 53, 54], "directori": [0, 46, 49, 52, 53], "includ": [0, 1, 53], "sv": [0, 2, 4, 51], "tutori": [0, 2, 51], "voxceleb": [0, 2, 4, 46, 49, 50, 51, 54], "v2": [0, 2, 49, 51, 53], "supervis": [0, 2, 50, 51], "v3": [0, 2, 51, 54], "self": [0, 2, 50, 51], "dino": [0, 53], "diariz": [0, 2, 4, 46, 51], "voxconvers": [0, 2, 4, 51], "suggest": [0, 1, 49, 52, 53, 54], "paper": [0, 2, 47, 52], "speaker": [0, 2, 3, 4, 7, 46, 47, 49], "embed": [0, 3, 4, 46, 49, 50], "learn": [0, 3, 50, 53], "provid": [0, 1, 4, 46, 52, 53, 54], "pretrain": [0, 2, 54], "model": [0, 1, 2, 6, 54], "off": 0, "shelf": 0, "usag": [0, 2], "from": [0, 4, 46, 49, 50, 52, 53, 54], "command": [0, 2, 49, 52, 53], "line": [0, 2, 52, 53, 54], "python": [0, 1, 2, 4, 47, 49, 52, 53], "code": [0, 2, 4], "how": [0, 1, 2, 49], "contribut": [0, 2], "ncmmsc": 0, "represent": 0, "slide": 0, "chines": [0, 46, 50], "handbook": 0, "introduct": [0, 53], "video": 0, "your": [1, 4, 49, 52, 53, 54], "interest": 1, "our": [1, 4, 49, 52, 53, 54], "commun": 1, "i": [1, 2, 4, 49, 52, 53], "open": [1, 52, 53, 54], "everyon": 1, "welcom": 1, "all": [1, 52, 53], "kind": [1, 49, 52, 53, 54], "matter": [1, 50], "small": [1, 52, 53], "larg": [1, 4, 50, 53], "There": 1, "ar": [1, 46, 52, 53, 54], "sever": [1, 49, 52, 53], "wai": [1, 49, 52, 53], "can": [1, 4, 46, 49, 52, 53, 54], "project": [1, 22, 49, 53], "identifi": 1, "ani": [1, 52, 53, 54], "bug": 1, "add": [1, 49, 53], "new": [1, 54], "implement": 1, "featur": [1, 49, 53], "howev": 1, "rememb": 1, "aren": 1, "t": [1, 46, 49], "just": [1, 52, 53], "about": [1, 4, 54], "we": [1, 4, 46, 49, 52, 53, 54], "believ": 1, "power": 1, "support": [1, 2], "thu": [1, 52, 53, 54], "answer": 1, "queri": 1, "assist": [1, 53], "other": [1, 46, 49, 52, 53], "enhanc": 1, "document": 1, "highli": 1, "regard": [1, 54], "benefici": 1, "final": [1, 49, 52, 53, 54], "one": [1, 49, 52, 53, 54], "most": [1, 53], "impact": 1, "u": [1, 49, 54], "rais": 1, "awar": 1, "talk": 1, "blog": 1, "post": 1, "highlight": 1, "": [1, 4, 49, 52, 53, 54], "drive": 1, "incred": 1, "If": [1, 46, 49, 52, 53, 54], "encount": 1, "have": [1, 49, 52, 53], "pleas": [1, 4, 46, 49, 52, 53, 54], "check": [1, 49, 52, 53, 54], "page": [1, 2], "first": [1, 49], "see": [1, 4], "someon": 1, "els": [1, 49, 52, 53], "ha": [1, 49, 52], "alreadi": 1, "file": [1, 4, 46, 49, 52, 53, 54], "much": 1, "relev": 1, "inform": [1, 54], "possibl": 1, "In": [1, 49, 50, 52, 53], "gener": [1, 49, 50, 53], "adher": 1, "googl": 1, "c": [1, 52, 53, 54], "when": [1, 49, 52, 53, 54], "submit": 1, "make": [1, 54], "sure": [1, 46, 49], "been": 1, "rebas": 1, "top": 1, "latest": [1, 49, 54], "commit": 1, "master": [1, 49, 54], "branch": 1, "ensur": [1, 53], "properli": 1, "format": [1, 4, 46, 52, 53, 54], "detail": [1, 46, 52, 53, 54], "descript": [1, 50], "chang": [1, 49], "explain": 1, "why": 1, "made": 1, "did": 1, "fix": [1, 49], "an": [1, 2, 46, 49, 53], "refer": [1, 2, 4, 53], "submiss": 1, "member": 1, "requir": [1, 4, 49, 53], "To": [1, 4, 50], "process": [1, 52, 53, 54], "smooth": 1, "keep": 1, "concis": 1, "involv": 1, "multipl": 1, "unrel": 1, "consid": 1, "split": [1, 54], "separ": [1, 52, 53, 54], "respond": 1, "comment": 1, "within": [1, 54], "reason": [1, 52, 53], "time": [1, 49, 52, 53, 54], "frame": [1, 49], "isn": 1, "clear": 1, "disagre": 1, "feel": [1, 52, 53, 54], "free": [1, 52, 53, 54], "ask": [1, 52, 53, 54], "clarif": 1, "discuss": 1, "take": [1, 49, 52, 53], "read": [1, 53, 54], "guidelin": 1, "great": 1, "tool": [1, 52, 53, 54], "research": [2, 3], "product": [2, 3, 49], "orient": [2, 3], "verif": [2, 50, 53], "recognit": [2, 47], "toolkit": [2, 3, 54], "packag": [2, 5], "instal": 2, "program": 2, "train": [2, 4, 46, 50, 54], "licens": 2, "onnx": [2, 49, 54], "infer": [2, 49, 52, 53], "demo": 2, "list": [2, 46, 52, 53], "runtim": [2, 4, 52, 53], "platform": 2, "onnxruntim": [2, 4], "server": 2, "tensorrt": 2, "gpu": [2, 52, 53, 54], "api": [2, 47], "issu": [2, 52, 53, 54], "report": 2, "style": [2, 46], "guid": 2, "pull": 2, "request": [2, 49], "review": 2, "thank": [2, 52, 53, 54], "you": [2, 4, 46, 49, 52, 53, 54], "index": [2, 54], "modul": [2, 7, 11, 19, 22, 37, 53, 54], "search": 2, "A": [3, 50], "accept": [3, 49], "icassp": 3, "2023": 3, "baselin": [3, 54], "voxsrc2023": [3, 54], "besid": [4, 52, 53], "relat": [4, 53], "task": [4, 46, 54], "util": [4, 6, 7], "mani": [4, 49, 52, 53], "which": [4, 46, 49, 52, 53, 54], "voic": [4, 54], "convers": 4, "text": [4, 50, 54], "speech": [4, 49, 54], "adapt": [4, 52], "asr": 4, "target": [4, 49, 52, 53], "extract": [4, 46, 49], "For": [4, 49, 50, 52, 53, 54], "user": [4, 52], "who": 4, "would": [4, 54], "like": [4, 49, 52, 53, 54], "verifi": 4, "perform": [4, 46, 49, 52, 53], "abov": [4, 46, 49, 52, 53], "without": 4, "troubl": 4, "learner": 4, "two": [4, 46, 49, 52, 53, 54], "type": [4, 54], "checkpoint": [4, 37, 49, 52, 53], "suffix": 4, "pt": [4, 46, 49, 52, 53], "save": [4, 46, 52, 54], "reproduc": 4, "publish": 4, "result": [4, 46, 49, 52, 53], "us": [4, 46, 49, 50, 52, 53, 54], "continu": [4, 52, 53], "export": 4, "The": [4, 49, 50, 52, 53, 54], "wenet": [4, 46, 49, 52, 53], "follow": [4, 46, 49, 52, 53, 54], "correspond": [4, 49, 53], "dataset": [4, 50, 52, 53, 54], "exampl": [4, 49, 52, 53, 54], "creativ": 4, "common": 4, "attribut": 4, "4": 4, "0": [4, 46, 52, 53, 54], "intern": 4, "sinc": [4, 49], "http": [4, 46, 49, 54], "mm": 4, "kaist": 4, "ac": [4, 54], "kr": 4, "pytorch": [4, 46, 52, 53], "directli": 4, "run": [4, 49, 52, 53, 54], "sh": [4, 49, 52, 53, 54], "recip": [4, 52, 53, 54], "As": 4, "toi": 4, "download": [4, 46], "onnx_path": 4, "wav_path": [4, 52, 53, 54], "path": [4, 46, 49, 52, 53, 54], "wave": 4, "16k": 4, "bin": [4, 49, 52, 53], "infer_onnx": 4, "py": [4, 49, 52, 53, 54], "easili": [4, 52, 53], "applic": [4, 49], "found": [4, 46, 49, 52, 53], "lm": [4, 52, 54], "mean": [4, 49, 54], "further": [4, 52], "fine": [4, 52], "tune": [4, 52], "margin": [4, 50], "could": [4, 52, 53, 54], "better": [4, 52, 54], "long": [4, 49, 52, 53], "audio": [4, 46, 49, 54], "e": [4, 46, 49, 52, 53], "g": [4, 49, 52, 53, 54], "3": 4, "languag": [4, 46, 50], "en": 4, "resnet34": [4, 49, 53, 54], "resnet34_lm": [4, 46], "resnet152_lm": 4, "resnet221_lm": [4, 46], "resnet293_lm": 4, "cam": 4, "_lm": 4, "ecapa512": 4, "ecapa512_lm": 4, "ecapa1024": 4, "ecapa1024_lm": 4, "gemini_dfresnet114_lm": 4, "cnceleb": [4, 46, 50], "cn": [4, 50], "wespeak": [5, 46, 47, 52, 53, 54], "subpackag": 5, "cli": 6, "submodul": 6, "diar": [6, 54], "frontend": 6, "hub": 7, "get_arg": [7, 10], "sourc": [10, 54], "extract_emb": [11, 54], "make_fbank": [11, 54], "make_oracle_sad": [11, 54], "make_rttm": [11, 54], "make_system_sad": [11, 54], "spectral_cluster": [11, 54], "umap_cluster": 11, "s3prl": 19, "whisper_encod": 19, "campplu": [22, 46], "convert_repvgg": [22, 52], "ecapa_tdnn": 22, "eres2net": [22, 46], "gemini_dfresnet": 22, "pooling_lay": 22, "redimnet": 22, "repvgg": [22, 52], "res2net": 22, "resnet": [22, 49, 52, 53], "speaker_model": [22, 49], "tdnn": [22, 50], "whisper_pmfa": 22, "embedding_process": 37, "executor": 37, "executor_deprec": 37, "file_util": 37, "schedul": 37, "score_metr": 37, "pip": 46, "git": [46, 54], "github": [46, 49, 52, 53, 54], "com": [46, 49, 54], "e2": [46, 49], "develop": 46, "clone": [46, 54], "cd": [46, 49, 52, 53, 54], "audio_fil": [46, 49], "wav": [46, 49, 52, 53, 54], "output_fil": [46, 52, 53], "txt": [46, 49], "embedding_kaldi": 46, "wav_scp": [46, 49, 53], "scp": [46, 49, 52, 53, 54], "similar": [46, 49, 52, 53], "audio_file2": 46, "audio2": 46, "devic": [46, 49, 54], "cuda": [46, 49, 54], "window": [46, 49], "linux": [46, 49], "mp": 46, "metal": 46, "shader": 46, "maco": [46, 49], "specifi": [46, 49, 52, 53, 54], "paramet": [46, 49, 52, 53], "h": 46, "five": 46, "now": [46, 49], "output": [46, 49, 54], "_": [46, 49, 52, 53, 54], "kaldi": [46, 49, 52, 53], "ark": [46, 49, 52, 53], "comput": [46, 49, 54], "rang": [46, 49], "1": 46, "appli": [46, 53], "input": [46, 49], "l": [46, 54], "english": 46, "p": [46, 49, 52, 54], "avg_model": [46, 49, 52, 53], "config": [46, 49, 52, 53], "yaml": [46, 49, 52, 53], "should": [46, 49, 52, 53, 54], "contain": [46, 52, 53, 54], "set": [46, 49, 54], "cpu": [46, 49, 54], "campplus_cn_common_200k": 46, "damo": 46, "res2net_cn_common_200k": 46, "file2": 46, "specif": 46, "each": [46, 52, 53, 54], "kei": [46, 53], "resample_r": [46, 53], "resampl": 46, "rate": [46, 49, 52, 53, 54], "default": [46, 52, 53, 54], "16000": [46, 49], "vad": 46, "true": [46, 49, 54], "differ": [46, 49, 52, 53, 54], "warn": 46, "want": [46, 49, 52, 53, 54], "link": 46, "renam": 46, "By": [46, 52, 53], "option": [46, 49, 54], "either": 46, "ones": [46, 49], "yourself": 46, "import": [46, 49, 53], "load_model": 46, "tensor": 46, "alloc": 46, "set_devic": 46, "extract_embed": 46, "utt_nam": 46, "extract_embedding_list": 46, "compute_similar": 46, "audio1": 46, "diar_result": 46, "give_this_utt_a_nam": 46, "regist": 46, "recogn": 46, "spk1": 46, "spk1_audio1": 46, "spk2": 46, "spk2_audio1": 46, "spk3": 46, "spk3_audio1": 46, "spk1_audio2": 46, "jinja2": 48, "nbsphinx": 48, "sphinx": 48, "recommonmark": 48, "markdown": 48, "tabl": 48, "rtd": 48, "theme": 48, "x86": 49, "android": 49, "come": 49, "ncnn": 49, "experi": 49, "blob": 49, "exp": [49, 52, 53, 54], "dir": [49, 52, 54], "onnx_dir": 49, "export_onnx": 49, "output_model": 49, "finish": [49, 52, 53, 54], "find": [49, 52, 53], "cmake": 49, "14": 49, "gcc": 49, "5": 49, "mkdir": [49, 52, 54], "donnx": 49, "ON": 49, "don": 49, "dgpu": 49, "note": [49, 52, 53, 54], "need": 49, "usr": 49, "local": [49, 52, 53, 54], "11": 49, "ld_library_path": 49, "lib64": 49, "rtf": 49, "real": [49, 53], "factor": 49, "shown": [49, 52], "consol": 49, "written": 49, "glog_logtostderr": 49, "glog_v": 49, "your_test_wav_scp": 49, "your_model_dir": 49, "embed_out": 49, "your_embedding_txt": 49, "extract_emb_main": 49, "speaker_model_path": 49, "embedding_s": 49, "256": 49, "samples_per_chunk": 49, "32000": 49, "sampl": [49, 53], "per": [49, 54], "chunk": [49, 53], "durat": [49, 50, 52, 54], "whole": [49, 52, 53, 54], "sentenc": [49, 53], "averag": [49, 52, 53], "calcul": [49, 52, 53], "asv_main": 49, "enroll_wav": 49, "wav1_path": 49, "test_wav": 49, "wav2_path": 49, "threshold": 49, "show": 49, "convert": [49, 52, 53, 54], "deploi": 49, "them": 49, "triton": 49, "onli": [49, 53, 54], "instead": 49, "mai": 49, "well": [49, 54], "after": [49, 52, 53, 54], "get": [49, 52, 53, 54], "under": [49, 52, 53, 54], "xxx": 49, "folder": 49, "ll": 49, "go": [49, 52, 53, 54], "exp_dir": [49, 52, 53], "python3": [49, 54], "minu": [49, 54], "vector": [49, 50], "simpli": [49, 54], "mean_vec": 49, "npy": 49, "vox2_dev": [49, 52, 53], "skip": [49, 53], "part": 49, "section": [49, 54], "repositori": 49, "let": 49, "22": 49, "03": 49, "therefor": 49, "here": [49, 52, 53, 54], "docker": 49, "move": 49, "v": 49, "nvcr": 49, "io": [49, 52, 53], "nvidia": 49, "py3": 49, "shape": 49, "bxtxf": 49, "batchsiz": 49, "sequence_length": 49, "feature_s": 49, "trtexec": 49, "saveengin": 49, "b1_b128_s3000_fp16": 49, "trt": 49, "minshap": 49, "feat": 49, "1x200x80": 49, "optshap": 49, "64x200x80": 49, "maxshap": 49, "128x3000x80": 49, "fp16": 49, "maximum": 49, "sequenc": 49, "length": [49, 54], "3000": 49, "minimum": 49, "200": 49, "stride": 49, "10m": 49, "02": 49, "second": [49, 54], "30": 49, "respect": 49, "extractor": 49, "notic": 49, "number": [49, 52, 53], "depend": [49, 50], "ve": 49, "ad": [49, 53], "pratic": 49, "affect": 49, "accuraci": 49, "improv": [49, 52, 53], "same": [49, 52, 53], "know": 49, "idea": 49, "try": [49, 54], "below": 49, "script": [49, 52, 53], "torchaudio": 49, "complianc": 49, "torch": 49, "audio_dur_in_second": 49, "feat_dim": 49, "80": 49, "dont": 49, "sample_r": 49, "waveform": [49, 50], "unsqueez": 49, "feat_tensor": 49, "fbank": 49, "num_mel_bin": 49, "frame_shift": [49, 53, 54], "10": [49, 53, 54], "frame_length": [49, 53], "25": [49, 53, 54], "energy_floor": 49, "window_typ": 49, "ham": 49, "htk_compat": 49, "use_energi": 49, "fals": [49, 54], "dither": 49, "print": [49, 52, 53, 54], "198": 49, "Then": [49, 52, 53], "actual": 49, "That": 49, "segment": [49, 52, 53, 54], "edit": 49, "model_repo": 49, "pbtxt": 49, "replac": 49, "default_model_filenam": 49, "name": [49, 52, 53, 54], "put": [49, 52, 53], "And": [49, 50, 53], "ecapa": [49, 50, 52, 53], "dim": 49, "192": 49, "also": [49, 52, 53, 54], "backend": 49, "kind_gpu": 49, "kind_cpu": 49, "dockerfil": 49, "Be": 49, "version": [49, 54], "f": [49, 52, 53, 54], "network": [49, 50], "host": 49, "pwd": [49, 54], "w": 49, "shm": 49, "size": [49, 52], "1g": 49, "ulimit": 49, "memlock": 49, "8000": 49, "8001": 49, "8002": 49, "stack": 49, "67108864": 49, "ti": 49, "tritonserv": 49, "port": 49, "grpc": 49, "wespeaker_cli": 49, "data": 49, "url": 49, "ip": 49, "wavscp": 49, "raid": 49, "dgxsa": 49, "slyne": 49, "vox1": [49, 52, 53], "output_directori": 49, "direcotri": 49, "someth": [49, 53], "xvector_000": 49, "xvextor_000": 49, "xvector_001": 49, "cat": [49, 54], "xvector_": 49, "xvector": 49, "conf": [49, 52, 53], "trials_dir": 49, "trial": [49, 52, 53], "eval_scp_path": 49, "x86_gpu": 49, "cal_mean": 49, "cal_mean_dir": 49, "p_target": 49, "01": 49, "c_miss": 49, "c_fa": 49, "vox1_o_clean": [49, 52, 53], "vox1_e_clean": [49, 52, 53], "vox1_h_clean": [49, 52, 53], "tee": [49, 54], "vox1_cos_result": [49, 52, 53], "resnet_b1_b128_s200_fp16": 49, "resnet_avg_model": 49, "128x200x80": 49, "ecapa_b1_b128_s200_fp16": 49, "ecapa_avg_model": 49, "t4": 49, "throughput": 49, "bz": 49, "64": 49, "utter": [49, 50, 53], "39": 49, "7842": 49, "2546": 49, "52": 49, "958": 49, "3389": 49, "generate_input": 49, "perf_analyz": 49, "m": 49, "b": [49, 53], "concurr": 49, "1000": [49, 52, 53], "json": 49, "localhost": 49, "conccur": 49, "avg": 49, "latenc": 49, "p99": 49, "2033": 49, "98": 49, "111": 49, "400": [49, 54], "2010": 49, "208": 49, "2647": 49, "75": [49, 54], "2726": 49, "147": 49, "172": 49, "scale": [50, 52, 53], "identif": 50, "voxceleb2": [50, 52, 53], "deep": 50, "wild": 50, "celeb": 50, "challeng": 50, "multi": [50, 52, 53], "genr": 50, "architectur": 50, "design": 50, "x": [50, 54], "robust": 50, "dnn": 50, "But": 50, "system": [50, 52, 53, 54], "2019": 50, "r": [50, 54], "rawnet": 50, "advanc": 50, "end": [50, 52, 53], "neural": 50, "raw": [50, 52, 53], "independ": 50, "With": 50, "sincnet": 50, "emphas": 50, "channel": [50, 54], "attent": 50, "propag": 50, "aggreg": 50, "base": [50, 52, 53], "optim": [50, 52, 53], "object": 50, "classif": 50, "loss": [50, 52, 53], "explor": 50, "encod": 50, "layer": 50, "function": [50, 52, 53], "angular": 50, "softmax": 50, "short": [50, 53, 54], "ensembl": 50, "addit": [50, 53], "toward": 50, "more": [50, 53, 54], "discrimin": 50, "triplet": 50, "On": 50, "pool": 50, "method": [50, 53], "statist": [50, 52], "resolut": 50, "head": [50, 53, 54], "level": 50, "novel": 50, "learnabl": 50, "dictionari": 50, "augment": [50, 52, 53], "adversari": 50, "prototyp": 50, "momentum": 50, "contrast": [50, 53], "gate": 50, "meet": [52, 53, 54], "problem": [52, 53, 54], "through": [52, 53, 54], "feedback": [52, 53, 54], "simpl": [52, 53], "manual": [52, 53], "understand": [52, 53, 54], "bash": [52, 53, 54], "stop_stag": [52, 53, 54], "le": [52, 53, 54], "ge": [52, 53, 54], "echo": [52, 53, 54], "prepar": [52, 53], "prepare_data": [52, 53], "fi": [52, 53, 54], "voxceleb1": [52, 53], "musan": [52, 53], "rir": [52, 53], "nois": [52, 53], "reverber": [52, 53], "It": [52, 53], "start": [52, 53], "becaus": [52, 53], "recommand": [52, 53], "archiv": [52, 53, 54], "own": [52, 53], "download_data": [52, 53], "meta": [52, 53], "utt2spk": [52, 53], "spk2utt": [52, 53], "dev": [52, 53, 54], "record": [52, 53, 54], "blank": [52, 53, 54], "column": [52, 53, 54], "wav_id": [52, 53, 54], "id10001": [52, 53], "1zciwhmdeo4": [52, 53], "00001": [52, 53], "voxceleb1_wav_v2": [52, 53], "00002": [52, 53], "spk_id": [52, 53, 54], "belong": [52, 53], "00003": [52, 53], "id10002": [52, 53], "0_laien": [52, 53], "q44": [52, 53], "6wo410qoeuo": [52, 53], "three": [52, 53, 54], "enroll_wav_id": [52, 53], "test_wav_id": [52, 53], "label": [52, 53], "y8hivobuel": [52, 53], "id10943": [52, 53], "vncvj7ylwpu": [52, 53], "00005": [52, 53], "nontarget": [52, 53], "7w0ibewc9qw": [52, 53], "00004": [52, 53], "id10999": [52, 53], "g5r2": [52, 53], "hl7yx8": [52, 53], "00008": [52, 53], "covert": [52, 53], "test": [52, 53, 54], "data_typ": [52, 53], "dset": [52, 53], "do": [52, 53, 54], "shard": [52, 53], "make_shard_list": [52, 53], "num_utts_per_shard": [52, 53], "num_thread": [52, 53], "16": [52, 53], "prefix": [52, 53], "shuffl": [52, 53], "make_raw_list": [52, 53], "done": [52, 53, 54], "lmdb": [52, 53], "make_lmdb": [52, 53], "million": [52, 53], "frequent": [52, 53], "caus": [52, 53], "bottleneck": [52, 53], "restor": [52, 53], "some": [52, 53], "binari": [52, 53], "store": [52, 53, 54], "fastli": [52, 53], "random": [52, 53], "access": [52, 53], "num_gpu": [52, 53], "awk": [52, 53, 54], "nf": [52, 53, 54], "torchrun": [52, 53], "standalon": [52, 53], "nnode": [52, 53], "nproc_per_nod": [52, 53], "num_avg": [52, 53], "train_data": [52, 53], "train_label": 52, "reverb_data": [52, 53], "noise_data": [52, 53], "nn": [52, 53], "step": [52, 53, 54], "mode": [52, 53], "ddp": [52, 53], "id": [52, 53, 54], "variabl": [52, 53], "idx": [52, 53], "initi": [52, 53], "randomli": [52, 53], "pre": [52, 53, 54], "weight": [52, 53], "model_init": [52, 53], "param": [52, 53], "resum": [52, 53], "termin": [52, 53], "epoch": [52, 53], "accident": [52, 53], "peopl": [52, 53], "out": [52, 53], "memori": [52, 53], "your_exp": [52, 53], "n": [52, 53, 54], "structur": [52, 53], "configur": [52, 53], "et": [52, 53], "al": [52, 53], "average_model": 52, "dst_model": [52, 53], "src_path": [52, 53], "num": [52, 53, 54], "model_path": [52, 53], "load": 52, "convert_model": 52, "extract_vox": [52, 53], "nj": [52, 53, 54], "last": [52, 53], "ensambl": [52, 53], "strategi": [52, 53], "forward": 52, "paradigm": 52, "somet": 52, "wrong": [52, 53, 54], "happen": [52, 53], "log": [52, 53], "stop": [52, 53], "trail": [52, 53], "cosin": [52, 53], "pair": [52, 53], "At": [52, 53], "equal": [52, 53], "error": [52, 53, 54], "eer": [52, 53], "mindcf": [52, 53], "trial_xx": [52, 53], "norm": 52, "score_norm": 52, "score_norm_method": 52, "cohort_set": 52, "top_n": 52, "adapta": 52, "normal": [52, 53, 54], "vox1_": 52, "_result": 52, "asnorm": [52, 53], "snorm": 52, "algorithm": [52, 53], "neg": 52, "cohort": 52, "best": [52, 53], "export_jit": [52, 53], "avg_": 52, "average_num": 52, "zip": [52, 53, 54], "libtorch": [52, 53], "lm_exp_dir": 52, "cp": 52, "model_0": 52, "lm_config": 52, "anoth": 52, "few": 52, "increas": 52, "enlarg": 52, "thei": 53, "current": 53, "commonli": 53, "framework": 53, "readm": 53, "md": [53, 54], "simclr": 53, "moco": 53, "basic": 53, "processs": 53, "exactli": 53, "befor": 53, "crucial": 53, "strongli": 53, "recommend": [53, 54], "incorpor": 53, "dure": 53, "ssl": 53, "train_dino": 53, "train_contrast": 53, "biggest": 53, "compar": 53, "organ": 53, "cannot": 53, "construct": 53, "assumpt": 53, "crop": 53, "suerpervis": 53, "dataload": 53, "defin": 53, "next": 53, "briefli": 53, "introduc": [53, 54], "firstli": 53, "storag": 53, "global": 53, "datalist": 53, "processor": 53, "url_open": 53, "tar_file_and_group": 53, "elif": 53, "parse_raw": 53, "parse_feat": 53, "shuffle_arg": 53, "composit": 53, "form": 53, "posit": 53, "fbank_arg": 53, "chunk_info_arg": 53, "chunk_len": 53, "ssl_processor": 53, "random_chunk_for_dino": 53, "veri": 53, "reverb": 53, "aug_prob": 53, "reverb_lmdb_fil": 53, "noise_lmdb_fil": 53, "lmdbdata": 53, "add_reverb_nois": 53, "notabl": 53, "facilit": 53, "effortless": 53, "divers": 53, "pipelin": 53, "both": 53, "effici": 53, "eas": 53, "extens": [53, 54], "order": 53, "compat": 53, "exist": 53, "wrapper": 53, "definit": 53, "so": 53, "average_dino_model": 53, "average_contrastive_model": 53, "student": 53, "necessari": 53, "remov": [53, 54], "subsequ": 53, "unlik": 53, "theoret": 53, "typic": 54, "downstream": 54, "learnt": 54, "2020": 54, "v1": 54, "Their": 54, "newcom": 54, "external_tool": 54, "wget": 54, "usnistgov": 54, "sctk": 54, "ref": 54, "tag": 54, "12": 54, "o": 54, "unzip": 54, "d": 54, "activ": 54, "detect": 54, "silero": 54, "team": 54, "snakers4": 54, "pretrained_model": 54, "1256283475": 54, "co": 54, "ap": 54, "shanghai": 54, "myqcloud": 54, "voxceleb_resnet34_lm": 54, "metric": 54, "silenc": 54, "extern": 54, "annot": 54, "joonson": 54, "voxconverse_mast": 54, "voxsrc": 54, "23": 54, "valid": 54, "look": 54, "jaesunghuh": 54, "recurs": 54, "certif": 54, "www": 54, "robot": 54, "ox": 54, "uk": 54, "vgg": 54, "voxconverse_dev_wav": 54, "creat": 54, "substr": 54, "voxconverse_test_wav": 54, "seem": 54, "repo": 54, "ground": 54, "truth": 54, "abjxc": 54, "afjiv": 54, "min": 54, "min_dur": 54, "255": 54, "sad_typ": 54, "xoracl": 54, "oracl": 54, "handl": 54, "overlap": 54, "too": 54, "region": 54, "while": 54, "utt": 54, "partit": 54, "oracle_sad": 54, "xsystem": 54, "system_sad": 54, "info": 54, "where": 54, "less": 54, "than": 54, "ignor": 54, "_sad_fbank": 54, "rm": 54, "_sad": 54, "store_dir": 54, "subseg_cmn": 54, "24": 54, "cepstral": 54, "cmn": 54, "sub": 54, "job": 54, "accord": 54, "core": 54, "_sad_embed": 54, "batch_siz": 54, "96": 54, "window_sec": 54, "period_sec": 54, "fashion": 54, "everi": 54, "contigu": 54, "decid": 54, "spectral_clust": 54, "_sad_label": 54, "emb": 54, "subseg_id": 54, "00000400": 54, "00007040": 54, "00000000": 54, "00000150": 54, "00000075": 54, "00000225": 54, "00000300": 54, "00000375": 54, "_sad_rttm": 54, "rich": 54, "transcript": 54, "mark": 54, "space": 54, "delimit": 54, "turn": 54, "ten": 54, "field": 54, "alwai": 54, "basenam": 54, "onset": 54, "begin": 54, "orthographi": 54, "na": 54, "uniqu": 54, "scope": 54, "confid": 54, "score": 54, "probabl": 54, "correct": 54, "signal": 54, "lookahead": 54, "instanc": 54, "640": 54, "680": 54, "55": 54, "960": 54, "ref_dir": 54, "perl": 54, "src": 54, "eval": 54, "pl": 54, "_sad_r": 54, "get_each_file_r": 54, "eq": 54, "single_file_res_dir": 54, "_single_file_r": 54, "nget": 54, "underd": 54, "sort": 54, "file_nam": 54, "grep": 54, "_re": 54, "sum": 54, "percentag": 54, "assign": 54, "alarm": 54, "nonspeech": 54, "incorrectli": 54, "miss": 54, "consult": 54, "nist": 54, "rt": 54, "09": 54, "plan": 54, "overal": 54, "singl": 54}, "objects": {"wespeaker": [[7, 0, 0, "-", "cli"]], "wespeaker.cli": [[10, 0, 0, "-", "utils"]], "wespeaker.cli.utils": [[10, 1, 1, "", "get_args"]]}, "objtypes": {"0": "py:module", "1": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "function", "Python function"]}, "titleterms": {"basic": 0, "document": [0, 2], "wespeak": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 49], "todo": 0, "list": [0, 4], "possibl": 0, "contribut": 1, "issu": 1, "report": 1, "code": 1, "style": 1, "guid": 1, "pull": 1, "request": 1, "review": 1, "thank": 1, "you": 1, "welcom": 2, "": 2, "content": [2, 47, 51], "indic": 2, "tabl": 2, "paper": [3, 50], "pretrain": [4, 46], "model": [4, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 46, 49, 51, 52, 53], "licens": 4, "onnx": 4, "infer": 4, "demo": 4, "modelscop": 4, "huggingfac": 4, "python": [5, 46], "api": 5, "refer": [5, 47], "packag": [6, 7, 11, 19, 22, 37, 46], "subpackag": 6, "cli": [7, 8, 9, 10], "submodul": [7, 11, 19, 22, 37], "hub": 8, "modul": [8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45], "speaker": [9, 50, 52, 53, 54], "util": [10, 37, 38, 39, 40, 41, 42, 43, 44, 45], "diar": [11, 12, 13, 14, 15, 16, 17, 18], "extract_emb": 12, "make_fbank": 13, "make_oracle_sad": 14, "make_rttm": 15, "make_system_sad": 16, "spectral_cluster": 17, "umap_cluster": 18, "frontend": [19, 20, 21], "s3prl": 20, "whisper_encod": 21, "campplu": 23, "convert_repvgg": 24, "ecapa_tdnn": 25, "eres2net": 26, "gemini_dfresnet": 27, "pooling_lay": 28, "project": 29, "redimnet": 30, "repvgg": 31, "res2net": 32, "resnet": 33, "speaker_model": 34, "tdnn": 35, "whisper_pmfa": 36, "checkpoint": 38, "embedding_process": 39, "executor": 40, "executor_deprec": 41, "file_util": 42, "schedul": 43, "score_metr": 44, "instal": 46, "command": 46, "line": 46, "usag": 46, "support": [46, 49], "program": 46, "runtim": 49, "platform": 49, "onnxruntim": 49, "server": 49, "tensorrt": 49, "gpu": 49, "introduct": 49, "step": 49, "0": 49, "train": [49, 51, 52, 53], "1": [49, 52, 53, 54], "export": [49, 52, 53], "engin": 49, "construct": 49, "repo": 49, "2": [49, 52, 53, 54], "build": 49, "start": 49, "3": [49, 52, 53, 54], "client": 49, "4": [49, 52, 53, 54], "test": 49, "score": [49, 52, 53], "perf": 49, "pipelin": 49, "recognit": 50, "how": 51, "sv": [52, 53], "tutori": [52, 53, 54], "voxceleb": [52, 53], "v2": [52, 54], "supervis": [52, 53], "first": [52, 53, 54], "experi": [52, 53, 54], "stage": [52, 53, 54], "download": [52, 53, 54], "data": [52, 53, 54], "reformat": [52, 53, 54], "neural": [52, 53], "network": [52, 53], "embed": [52, 53, 54], "extract": [52, 53, 54], "5": [52, 53, 54], "evalu": [52, 53, 54], "set": [52, 53], "6": [52, 53, 54], "7": [52, 54], "option": [52, 53], "8": [52, 54], "larg": 52, "margin": 52, "finetun": 52, "v3": 53, "self": 53, "diariz": 54, "voxconvers": 54, "prerequisit": 54, "prepar": 54, "appli": 54, "sad": 54, "i": 54, "e": 54, "vad": 54, "fbank": 54, "featur": 54, "slide": 54, "window": 54, "spectral": 54, "cluster": 54, "label": 54, "rttm": 54, "result": 54, "der": 54}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "nbsphinx": 4, "sphinx.ext.viewcode": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"Basic Documents for WeSpeaker": [[0, "basic-documents-for-wespeaker"]], "ToDo List (possible)": [[0, "todo-list-possible"]], "Contributing to Wespeaker": [[1, "contributing-to-wespeaker"]], "Issue Reporting": [[1, "issue-reporting"]], "Coding Style Guide": [[1, "coding-style-guide"]], "Pull Requests": [[1, "pull-requests"]], "Code Reviews": [[1, "code-reviews"]], "Thank You": [[1, "thank-you"]], "Welcome to Wespeaker\u2019s documentation!": [[2, "welcome-to-wespeaker-s-documentation"]], "Contents:": [[2, null], [47, null], [51, null]], "Indices and tables": [[2, "indices-and-tables"]], "Wespeaker Papers": [[3, "wespeaker-papers"]], "Pretrained Models in Wespeaker": [[4, "pretrained-models-in-wespeaker"]], "Model License": [[4, "model-license"]], "Onnx Inference Demo": [[4, "onnx-inference-demo"]], "Model List": [[4, "model-list"]], "modelscope": [[4, "modelscope"]], "huggingface": [[4, "huggingface"]], "Python API Reference": [[5, "python-api-reference"]], "wespeaker package": [[6, "wespeaker-package"]], "Subpackages": [[6, "subpackages"]], "wespeaker.cli package": [[7, "module-wespeaker.cli"]], "Submodules": [[7, "submodules"], [11, "submodules"], [19, "submodules"], [22, "submodules"], [37, "submodules"]], "wespeaker.cli.hub module": [[8, "wespeaker-cli-hub-module"]], "wespeaker.cli.speaker module": [[9, "wespeaker-cli-speaker-module"]], "wespeaker.cli.utils module": [[10, "module-wespeaker.cli.utils"]], "wespeaker.diar package": [[11, "wespeaker-diar-package"]], "wespeaker.diar.extract_emb module": [[12, "wespeaker-diar-extract-emb-module"]], "wespeaker.diar.make_fbank module": [[13, "wespeaker-diar-make-fbank-module"]], "wespeaker.diar.make_oracle_sad module": [[14, "wespeaker-diar-make-oracle-sad-module"]], "wespeaker.diar.make_rttm module": [[15, "wespeaker-diar-make-rttm-module"]], "wespeaker.diar.make_system_sad module": [[16, "wespeaker-diar-make-system-sad-module"]], "wespeaker.diar.spectral_clusterer module": [[17, "wespeaker-diar-spectral-clusterer-module"]], "wespeaker.diar.umap_clusterer module": [[18, "wespeaker-diar-umap-clusterer-module"]], "wespeaker.frontend package": [[19, "wespeaker-frontend-package"]], "wespeaker.frontend.s3prl module": [[20, "wespeaker-frontend-s3prl-module"]], "wespeaker.frontend.whisper_encoder module": [[21, "wespeaker-frontend-whisper-encoder-module"]], "wespeaker.models package": [[22, "wespeaker-models-package"]], "wespeaker.models.campplus module": [[23, "wespeaker-models-campplus-module"]], "wespeaker.models.convert_repvgg module": [[24, "wespeaker-models-convert-repvgg-module"]], "wespeaker.models.ecapa_tdnn module": [[25, "wespeaker-models-ecapa-tdnn-module"]], "wespeaker.models.eres2net module": [[26, "wespeaker-models-eres2net-module"]], "wespeaker.models.gemini_dfresnet module": [[27, "wespeaker-models-gemini-dfresnet-module"]], "wespeaker.models.pooling_layers module": [[28, "wespeaker-models-pooling-layers-module"]], "wespeaker.models.projections module": [[29, "wespeaker-models-projections-module"]], "wespeaker.models.redimnet module": [[30, "wespeaker-models-redimnet-module"]], "wespeaker.models.repvgg module": [[31, "wespeaker-models-repvgg-module"]], "wespeaker.models.res2net module": [[32, "wespeaker-models-res2net-module"]], "wespeaker.models.resnet module": [[33, "wespeaker-models-resnet-module"]], "wespeaker.models.speaker_model module": [[34, "wespeaker-models-speaker-model-module"]], "wespeaker.models.tdnn module": [[35, "wespeaker-models-tdnn-module"]], "wespeaker.models.whisper_PMFA module": [[36, "wespeaker-models-whisper-pmfa-module"]], "wespeaker.utils package": [[37, "wespeaker-utils-package"]], "wespeaker.utils.checkpoint module": [[38, "wespeaker-utils-checkpoint-module"]], "wespeaker.utils.embedding_processing module": [[39, "wespeaker-utils-embedding-processing-module"]], "wespeaker.utils.executor module": [[40, "wespeaker-utils-executor-module"]], "wespeaker.utils.executor_deprecated module": [[41, "wespeaker-utils-executor-deprecated-module"]], "wespeaker.utils.file_utils module": [[42, "wespeaker-utils-file-utils-module"]], "wespeaker.utils.schedulers module": [[43, "wespeaker-utils-schedulers-module"]], "wespeaker.utils.score_metrics module": [[44, "wespeaker-utils-score-metrics-module"]], "wespeaker.utils.utils module": [[45, "wespeaker-utils-utils-module"]], "Python Package": [[46, "python-package"]], "Install": [[46, "install"]], "Command Line Usage": [[46, "command-line-usage"]], "Pretrained model support": [[46, "pretrained-model-support"]], "Python Programming Usage": [[46, "python-programming-usage"]], "Reference": [[47, "reference"]], "Runtime for Wespeaker": [[49, "runtime-for-wespeaker"]], "Platforms Supported": [[49, "platforms-supported"]], "Onnxruntime": [[49, "onnxruntime"]], "Server (tensorrt gpu)": [[49, "server-tensorrt-gpu"]], "Introduction": [[49, "introduction"]], "Step 0. Train a model": [[49, "step-0-train-a-model"]], "Step 1. Export model": [[49, "step-1-export-model"]], "Export to Tensorrt Engine": [[49, "export-to-tensorrt-engine"]], "Construct Model Repo": [[49, "construct-model-repo"]], "Step 2. Build server and start server": [[49, "step-2-build-server-and-start-server"]], "Step 3. Build client and start client": [[49, "step-3-build-client-and-start-client"]], "Step 4. Test score": [[49, "step-4-test-score"]], "Perf": [[49, "perf"]], "Pipeline Perf": [[49, "pipeline-perf"]], "Speaker Recognition Papers": [[50, "speaker-recognition-papers"]], "How to train models?": [[51, "how-to-train-models"]], "SV Tutorial on VoxCeleb v2 (Supervised)": [[52, "sv-tutorial-on-voxceleb-v2-supervised"]], "First Experiment": [[52, "first-experiment"], [53, "first-experiment"], [54, "first-experiment"]], "Stage 1: Download Data": [[52, "stage-1-download-data"], [53, "stage-1-download-data"]], "Stage 2: Reformat the Data": [[52, "stage-2-reformat-the-data"], [53, "stage-2-reformat-the-data"]], "Stage 3: Neural Network training": [[52, "stage-3-neural-network-training"], [53, "stage-3-neural-network-training"]], "Stage 4: Speaker Embedding Extraction": [[52, "stage-4-speaker-embedding-extraction"], [53, "stage-4-speaker-embedding-extraction"]], "Stage 5: Scoring the Evaluation Set": [[52, "stage-5-scoring-the-evaluation-set"], [53, "stage-5-scoring-the-evaluation-set"]], "Stage 6: Scoring the Evaluation Set": [[52, "stage-6-scoring-the-evaluation-set"]], "Stage 7(Optional): Export the trained model": [[52, "stage-7-optional-export-the-trained-model"]], "Stage 8(Optional): Large Margin Finetuning": [[52, "stage-8-optional-large-margin-finetuning"]], "SV Tutorial on VoxCeleb v3 (Self-Supervised)": [[53, "sv-tutorial-on-voxceleb-v3-self-supervised"]], "Stage 6(Optional): Export the trained model": [[53, "stage-6-optional-export-the-trained-model"]], "Diarization Tutorial on VoxConverse v2": [[54, "diarization-tutorial-on-voxconverse-v2"]], "Stage 1: Download Prerequisites": [[54, "stage-1-download-prerequisites"]], "Stage 2: Download and Prepare Data": [[54, "stage-2-download-and-prepare-data"]], "Stage 3: Apply SAD (i.e., VAD)": [[54, "stage-3-apply-sad-i-e-vad"]], "Stage 4: Extract Fbank Features": [[54, "stage-4-extract-fbank-features"]], "Stage 5: Extract Sliding-window Speaker Embeddings": [[54, "stage-5-extract-sliding-window-speaker-embeddings"]], "Stage 6: Apply Spectral Clustering": [[54, "stage-6-apply-spectral-clustering"]], "Stage 7: Reformat Clustering Labels into RTTMs": [[54, "stage-7-reformat-clustering-labels-into-rttms"]], "Stage 8: Evaluate the Result (DER)": [[54, "stage-8-evaluate-the-result-der"]]}, "indexentries": {"module": [[7, "module-wespeaker.cli"], [10, "module-wespeaker.cli.utils"]], "wespeaker.cli": [[7, "module-wespeaker.cli"]], "get_args() (in module wespeaker.cli.utils)": [[10, "wespeaker.cli.utils.get_args"]], "wespeaker.cli.utils": [[10, "module-wespeaker.cli.utils"]]}}) \ No newline at end of file