niobures committed on
Commit
9da9ecf
·
verified ·
1 Parent(s): 3a22dee

Wav2vec 2.0 (de, en, pl, ru, multi)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. distil-wav2vec2-onnx/.gitattributes +35 -0
  3. distil-wav2vec2-onnx/README.md +25 -0
  4. distil-wav2vec2-onnx/onnx/distil-wav2vec2_fp16.onnx +3 -0
  5. distil-wav2vec2-onnx/onnx/distil-wav2vec2_fp32.onnx +3 -0
  6. distil-wav2vec2-onnx/onnx/distil-wav2vec2_int8.onnx +3 -0
  7. distil-wav2vec2-onnx/source.txt +1 -0
  8. wav2vec2-alignment/.gitattributes +35 -0
  9. wav2vec2-alignment/README.md +44 -0
  10. wav2vec2-alignment/languages.txt +57 -0
  11. wav2vec2-alignment/source.txt +1 -0
  12. wav2vec2-alignment/wav2vec2-lv60-espeak-fp16.onnx +3 -0
  13. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/.gitattributes +35 -0
  14. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/README.md +89 -0
  15. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/config.json +111 -0
  16. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model.onnx +3 -0
  17. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_bnb4.onnx +3 -0
  18. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_fp16.onnx +3 -0
  19. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_int8.onnx +3 -0
  20. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_q4.onnx +3 -0
  21. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_q4f16.onnx +3 -0
  22. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_quantized.onnx +3 -0
  23. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_uint8.onnx +3 -0
  24. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/preprocessor_config.json +9 -0
  25. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/quantize_config.json +18 -0
  26. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/source.txt +1 -0
  27. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/special_tokens_map.json +6 -0
  28. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/tokenizer.json +110 -0
  29. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/tokenizer_config.json +49 -0
  30. wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/vocab.json +43 -0
  31. wav2vec2-end-of-speech-detection/.gitattributes +35 -0
  32. wav2vec2-end-of-speech-detection/5sec_audio.wav +3 -0
  33. wav2vec2-end-of-speech-detection/README.md +119 -0
  34. wav2vec2-end-of-speech-detection/eos-model-onnx/config.json +125 -0
  35. wav2vec2-end-of-speech-detection/eos-model-onnx/model.onnx +3 -0
  36. wav2vec2-end-of-speech-detection/eos-model-onnx/preprocessor_config.json +9 -0
  37. wav2vec2-end-of-speech-detection/eos-model-onnx/special_tokens_map.json +1 -0
  38. wav2vec2-end-of-speech-detection/eos-model-onnx/tokenizer_config.json +1 -0
  39. wav2vec2-end-of-speech-detection/eos-model-onnx/vocab.json +1 -0
  40. wav2vec2-end-of-speech-detection/inference.py +80 -0
  41. wav2vec2-end-of-speech-detection/languages.txt +5 -0
  42. wav2vec2-end-of-speech-detection/segments/segment_0.wav +0 -0
  43. wav2vec2-end-of-speech-detection/segments/segment_1.wav +0 -0
  44. wav2vec2-end-of-speech-detection/segments/segment_2.wav +0 -0
  45. wav2vec2-end-of-speech-detection/segments/segment_3.wav +0 -0
  46. wav2vec2-end-of-speech-detection/segments/segment_4.wav +0 -0
  47. wav2vec2-end-of-speech-detection/segments/segment_5.wav +0 -0
  48. wav2vec2-end-of-speech-detection/segments/segment_6.wav +0 -0
  49. wav2vec2-end-of-speech-detection/source.txt +1 -0
  50. wav2vec2-large-xlsr-53-german-cv9/.gitattributes +28 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wav2vec2-end-of-speech-detection/5sec_audio.wav filter=lfs diff=lfs merge=lfs -text
37
+ wav2vec2-lv-60-espeak-cv-ft-js/Simple[[:space:]]and[[:space:]]Effective[[:space:]]Zero-shot[[:space:]]Cross-lingual[[:space:]]Phoneme[[:space:]]Recognition.pdf filter=lfs diff=lfs merge=lfs -text
38
+ wav2vec2-lv-60-espeak-cv-ft-ONNX/Simple[[:space:]]and[[:space:]]Effective[[:space:]]Zero-shot[[:space:]]Cross-lingual[[:space:]]Phoneme[[:space:]]Recognition.pdf filter=lfs diff=lfs merge=lfs -text
distil-wav2vec2-onnx/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
distil-wav2vec2-onnx/README.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ base_model:
4
+ - OthmaneJ/distil-wav2vec2
5
+ datasets:
6
+ - librispeech_asr
7
+ tags:
8
+ - onnx
9
+ - speech
10
+ - audio
11
+ - automatic-speech-recognition
12
+ license: apache-2.0
13
+ ---
14
+ # Distil-wav2vec2 ONNX
15
+
16
+ This repository hosts ONNX exports of the Distil-wav2vec2 model.
17
+
18
+ ## Contents
19
+
20
+ - `onnx/distil-wav2vec2_fp32.onnx`, `onnx/distil-wav2vec2_fp16.onnx`, `onnx/distil-wav2vec2_int8.onnx`
21
+
22
+ ## Upstream
23
+
24
+ Original project:
25
+ https://huggingface.co/OthmaneJ/distil-wav2vec2
distil-wav2vec2-onnx/onnx/distil-wav2vec2_fp16.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26048304a2f60cf74d0ba3e58d53b0cc01ef24b3fda90ec3eeacff28dcd6443e
3
+ size 104613131
distil-wav2vec2-onnx/onnx/distil-wav2vec2_fp32.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51df0aae6a9314a4fdde8e664334d2529fa983a46e9a6c60facb9532c86e8532
3
+ size 207542006
distil-wav2vec2-onnx/onnx/distil-wav2vec2_int8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e994a38dcf2c4c24740a6afead518c6b7ea9add9c52d6c455a773b868b145e48
3
+ size 52161794
distil-wav2vec2-onnx/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/KevinAHM/distil-wav2vec2-onnx
wav2vec2-alignment/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
wav2vec2-alignment/README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ - multilingual
5
+ license: apache-2.0
6
+ tags:
7
+ - onnx
8
+ - audio
9
+ - automatic-speech-recognition
10
+ - phoneme-recognition
11
+ - wav2vec2
12
+ base_model: facebook/wav2vec2-lv-60-espeak-cv-ft
13
+ ---
14
+
15
+ # Wav2Vec2-LV-60-Espeak-CV-FT (ONNX)
16
+
17
+ This is an **ONNX export** of the [facebook/wav2vec2-lv-60-espeak-cv-ft](https://huggingface.co/facebook/wav2vec2-lv-60-espeak-cv-ft) model.
18
+
19
+ It is designed for client-side inference in the **UltrClick ContentPro** application to perform forced alignment of lyrics to audio.
20
+
21
+ ## Model Details
22
+
23
+ - **Original Model**: `facebook/wav2vec2-lv-60-espeak-cv-ft`
24
+ - **Format**: ONNX (Open Neural Network Exchange)
25
+ - **Precision**: FP16 (Float16)
26
+ - **Output**: IPA Phoneme logits (392 vocab size)
27
+ - **Sample Rate**: 16kHz
28
+
29
+ ## Usage
30
+
31
+ This model is intended to be used with the ONNX Runtime (e.g., via `ort` in Rust or `onnxruntime` in Python).
32
+
33
+ ### Input
34
+ - **Name**: `audio`
35
+ - **Shape**: `[batch_size, samples]`
36
+ - **Type**: Float32 tensor
37
+
38
+ ### Output
39
+ - **Name**: `logits`
40
+ - **Shape**: `[batch_size, frames, 392]` (392 is the vocab size)
41
+
42
+ ## License
43
+
44
+ This model is a derivative of the original `facebook/wav2vec2-lv-60-espeak-cv-ft` model and retains the **Apache 2.0** license.
wav2vec2-alignment/languages.txt ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Abkhaz
2
+ Arabic
3
+ Assamese
4
+ Breton
5
+ Catalan
6
+ Hakha Chin
7
+ Czech
8
+ Chuvash
9
+ Welsh
10
+ German
11
+ Divehi
12
+ Greek
13
+ English
14
+ Esperanto
15
+ Spanish
16
+ Estonian
17
+ Basque
18
+ Persian
19
+ Finnish
20
+ French
21
+ Western Frisian
22
+ Irish
23
+ Hindi
24
+ Upper Sorbian
25
+ Hungarian
26
+ Interlingua
27
+ Indonesian
28
+ Italian
29
+ Japanese
30
+ Georgian
31
+ Kabyle
32
+ Kyrgyz
33
+ Ganda
34
+ Lithuanian
35
+ Latvian
36
+ Mongolian
37
+ Maltese
38
+ Dutch
39
+ Oriya
40
+ Panjabi
41
+ Polish
42
+ Portuguese
43
+ Romansh
44
+ Romanian
45
+ Russian
46
+ Kinyarwanda
47
+ Yakut
48
+ Slovenian
49
+ Swedish
50
+ Tamil
51
+ Thai
52
+ Turkish
53
+ Tatar
54
+ Ukrainian
55
+ Vietnamese
56
+ Votic
57
+ Chinese
wav2vec2-alignment/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/Hochien/wav2vec2-alignment
wav2vec2-alignment/wav2vec2-lv60-espeak-fp16.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a65177b47f8b304dc11d962ce91a6943f54e55dca69caecaaa35beab80f49925
3
+ size 632239986
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/README.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: pl
3
+ tags:
4
+ - audio
5
+ - automatic-speech-recognition
6
+ - voxpopuli
7
+ license: cc-by-nc-4.0
8
+ library_name: transformers.js
9
+ base_model:
10
+ - facebook/wav2vec2-base-10k-voxpopuli-ft-pl
11
+ pipeline_tag: automatic-speech-recognition
12
+ ---
13
+
14
+
15
+
16
+ # wav2vec2-base-10k-voxpopuli-ft-pl (ONNX)
17
+
18
+
19
+ This is an ONNX version of [facebook/wav2vec2-base-10k-voxpopuli-ft-pl](https://huggingface.co/facebook/wav2vec2-base-10k-voxpopuli-ft-pl). It was automatically converted and uploaded using [this Hugging Face Space](https://huggingface.co/spaces/onnx-community/convert-to-onnx).
20
+
21
+
22
+ ## Usage with Transformers.js
23
+
24
+
25
+ See the pipeline documentation for `automatic-speech-recognition`: https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.AutomaticSpeechRecognitionPipeline
26
+
27
+
28
+ ---
29
+
30
+
31
+ # Wav2Vec2-Base-VoxPopuli-Finetuned
32
+
33
+ [Facebook's Wav2Vec2](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) base model pretrained on the 10K unlabeled subset of the [VoxPopuli corpus](https://arxiv.org/abs/2101.00390) and fine-tuned on the transcribed data in Polish (pl) (refer to Table 1 of the paper for more information).
34
+
35
+ **Paper**: *[VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation
36
+ Learning, Semi-Supervised Learning and Interpretation](https://arxiv.org/abs/2101.00390)*
37
+
38
+ **Authors**: *Changhan Wang, Morgane Riviere, Ann Lee, Anne Wu, Chaitanya Talnikar, Daniel Haziza, Mary Williamson, Juan Pino, Emmanuel Dupoux* from *Facebook AI*
39
+
40
+ See the official website for more information, [here](https://github.com/facebookresearch/voxpopuli/)
41
+
42
+
43
+ # Usage for inference
44
+
45
+ The following example shows how to run inference with the model on a sample of the [Common Voice dataset](https://commonvoice.mozilla.org/en/datasets):
46
+
47
+ ```python
48
+ #!/usr/bin/env python3
49
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
50
+ from datasets import load_dataset
51
+ import torchaudio
52
+ import torch
53
+
54
+ # resample audio
55
+
56
+ # load model & processor
57
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-10k-voxpopuli-ft-pl")
58
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-10k-voxpopuli-ft-pl")
59
+
60
+ # load dataset
61
+ ds = load_dataset("common_voice", "pl", split="validation[:1%]")
62
+
63
+ # common voice does not match target sampling rate
64
+ common_voice_sample_rate = 48000
65
+ target_sample_rate = 16000
66
+
67
+ resampler = torchaudio.transforms.Resample(common_voice_sample_rate, target_sample_rate)
68
+
69
+
70
+ # define mapping fn to read in sound file and resample
71
+ def map_to_array(batch):
72
+ speech, _ = torchaudio.load(batch["path"])
73
+ speech = resampler(speech)
74
+ batch["speech"] = speech[0]
75
+ return batch
76
+
77
+
78
+ # load all audio files
79
+ ds = ds.map(map_to_array)
80
+
81
+ # run inference on the first 5 data samples
82
+ inputs = processor(ds[:5]["speech"], sampling_rate=target_sample_rate, return_tensors="pt", padding=True)
83
+
84
+ # inference
85
+ logits = model(**inputs).logits
86
+ predicted_ids = torch.argmax(logits, axis=-1)
87
+
88
+ print(processor.batch_decode(predicted_ids))
89
+ ```
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/config.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "_name_or_path": "facebook/wav2vec2-base-10k-voxpopuli-ft-pl",
4
+ "activation_dropout": 0.1,
5
+ "adapter_attn_dim": null,
6
+ "adapter_kernel_size": 3,
7
+ "adapter_stride": 2,
8
+ "add_adapter": false,
9
+ "apply_spec_augment": true,
10
+ "architectures": [
11
+ "Wav2Vec2ForCTC"
12
+ ],
13
+ "attention_dropout": 0.1,
14
+ "bos_token_id": 0,
15
+ "classifier_proj_size": 256,
16
+ "codevector_dim": 256,
17
+ "contrastive_logits_temperature": 0.1,
18
+ "conv_bias": false,
19
+ "conv_dim": [
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512,
26
+ 512
27
+ ],
28
+ "conv_kernel": [
29
+ 10,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 3,
34
+ 2,
35
+ 2
36
+ ],
37
+ "conv_stride": [
38
+ 5,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2,
44
+ 2
45
+ ],
46
+ "ctc_loss_reduction": "sum",
47
+ "ctc_zero_infinity": false,
48
+ "diversity_loss_weight": 0.1,
49
+ "do_stable_layer_norm": false,
50
+ "eos_token_id": 2,
51
+ "feat_extract_activation": "gelu",
52
+ "feat_extract_dropout": 0.0,
53
+ "feat_extract_norm": "group",
54
+ "feat_proj_dropout": 0.1,
55
+ "feat_quantizer_dropout": 0.0,
56
+ "final_dropout": 0.1,
57
+ "gradient_checkpointing": false,
58
+ "hidden_act": "gelu",
59
+ "hidden_dropout": 0.1,
60
+ "hidden_dropout_prob": 0.1,
61
+ "hidden_size": 768,
62
+ "initializer_range": 0.02,
63
+ "intermediate_size": 3072,
64
+ "layer_norm_eps": 1e-05,
65
+ "layerdrop": 0.1,
66
+ "mask_feature_length": 10,
67
+ "mask_feature_min_masks": 0,
68
+ "mask_feature_prob": 0.0,
69
+ "mask_time_length": 10,
70
+ "mask_time_min_masks": 2,
71
+ "mask_time_prob": 0.05,
72
+ "model_type": "wav2vec2",
73
+ "num_adapter_layers": 3,
74
+ "num_attention_heads": 12,
75
+ "num_codevector_groups": 2,
76
+ "num_codevectors_per_group": 320,
77
+ "num_conv_pos_embedding_groups": 16,
78
+ "num_conv_pos_embeddings": 128,
79
+ "num_feat_extract_layers": 7,
80
+ "num_hidden_layers": 12,
81
+ "num_negatives": 100,
82
+ "output_hidden_size": 768,
83
+ "pad_token_id": 1,
84
+ "proj_codevector_dim": 256,
85
+ "tdnn_dilation": [
86
+ 1,
87
+ 2,
88
+ 3,
89
+ 1,
90
+ 1
91
+ ],
92
+ "tdnn_dim": [
93
+ 512,
94
+ 512,
95
+ 512,
96
+ 512,
97
+ 1500
98
+ ],
99
+ "tdnn_kernel": [
100
+ 5,
101
+ 3,
102
+ 3,
103
+ 1,
104
+ 1
105
+ ],
106
+ "torch_dtype": "float32",
107
+ "transformers_version": "4.49.0",
108
+ "use_weighted_layer_sum": false,
109
+ "vocab_size": 41,
110
+ "xvector_output_dim": 512
111
+ }
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9309e96f5bb53ba957a7646cc9c5a3d61bd47d930a6998d1863e0ec081c3b199
3
+ size 377939575
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_bnb4.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5a5d1dd62965a54abd99edf07f0bf99bd01811b4fe3fcbec94579ddbae8c016
3
+ size 84528286
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_fp16.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4724fe007a6aabb2f2a8d63318d28e533491e29b3877752f5e80da92b34c15e4
3
+ size 189132785
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_int8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f066fac7d0662feec0e64002a4a969032a02ee5a4048bb1959d324801ead168
3
+ size 95219724
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_q4.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9265d2e19425b2240ac9c251e963d4a2fcda05741dd775076f4098b5e806b797
3
+ size 89862702
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_q4f16.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e104f81133126cb900fdc017e1daecf45e3663668ab9666d1c40cb8ce3057745
3
+ size 66439769
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb33c88ee1fc2010aebbb9a8eef0034638a1d418e3bb57cab01ba70c1a520ad3
3
+ size 95219762
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_uint8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb33c88ee1fc2010aebbb9a8eef0034638a1d418e3bb57cab01ba70c1a520ad3
3
+ size 95219762
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/quantize_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "modes": [
3
+ "fp16",
4
+ "q8",
5
+ "int8",
6
+ "uint8",
7
+ "q4",
8
+ "q4f16",
9
+ "bnb4"
10
+ ],
11
+ "per_channel": false,
12
+ "reduce_range": false,
13
+ "block_size": null,
14
+ "is_symmetric": true,
15
+ "accuracy_level": null,
16
+ "quant_type": 1,
17
+ "op_block_list": null
18
+ }
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/onnx-community/wav2vec2-base-10k-voxpopuli-ft-pl-ONNX
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/tokenizer.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 1,
8
+ "content": "<s>",
9
+ "single_word": false,
10
+ "lstrip": true,
11
+ "rstrip": true,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 0,
17
+ "content": "<pad>",
18
+ "single_word": false,
19
+ "lstrip": true,
20
+ "rstrip": true,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "</s>",
27
+ "single_word": false,
28
+ "lstrip": true,
29
+ "rstrip": true,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<unk>",
36
+ "single_word": false,
37
+ "lstrip": true,
38
+ "rstrip": true,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": {
44
+ "type": "Replace",
45
+ "pattern": {
46
+ "String": " "
47
+ },
48
+ "content": "|"
49
+ },
50
+ "pre_tokenizer": {
51
+ "type": "Split",
52
+ "pattern": {
53
+ "Regex": ""
54
+ },
55
+ "behavior": "Isolated",
56
+ "invert": false
57
+ },
58
+ "post_processor": null,
59
+ "decoder": {
60
+ "type": "CTC",
61
+ "pad_token": "<pad>",
62
+ "word_delimiter_token": "|",
63
+ "cleanup": true
64
+ },
65
+ "model": {
66
+ "vocab": {
67
+ "<s>": 1,
68
+ "<pad>": 0,
69
+ "</s>": 2,
70
+ "<unk>": 3,
71
+ "|": 4,
72
+ "e": 5,
73
+ "a": 6,
74
+ "i": 7,
75
+ "o": 8,
76
+ "n": 9,
77
+ "z": 10,
78
+ "r": 11,
79
+ "w": 12,
80
+ "s": 13,
81
+ "t": 14,
82
+ "c": 15,
83
+ "y": 16,
84
+ "p": 17,
85
+ "d": 18,
86
+ "k": 19,
87
+ "m": 20,
88
+ "j": 21,
89
+ "u": 22,
90
+ "l": 23,
91
+ "b": 24,
92
+ "g": 25,
93
+ "\u0142": 26,
94
+ "h": 27,
95
+ "\u0105": 28,
96
+ "\u0119": 29,
97
+ "\u017c": 30,
98
+ "\u00f3": 31,
99
+ "\u015b": 32,
100
+ "\u0107": 33,
101
+ "f": 34,
102
+ "\u0144": 35,
103
+ "\u017a": 36,
104
+ "v": 37,
105
+ "x": 38,
106
+ "q": 39,
107
+ "1": 40
108
+ }
109
+ }
110
+ }
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/tokenizer_config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": true,
22
+ "normalized": false,
23
+ "rstrip": true,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": true,
30
+ "normalized": false,
31
+ "rstrip": true,
32
+ "single_word": false,
33
+ "special": false
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "do_lower_case": false,
39
+ "eos_token": "</s>",
40
+ "extra_special_tokens": {},
41
+ "model_max_length": 1000000000000000019884624838656,
42
+ "pad_token": "<pad>",
43
+ "processor_class": "Wav2Vec2Processor",
44
+ "replace_word_delimiter_char": " ",
45
+ "target_lang": null,
46
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
47
+ "unk_token": "<unk>",
48
+ "word_delimiter_token": "|"
49
+ }
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/vocab.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "1": 40,
3
+ "</s>": 2,
4
+ "<pad>": 0,
5
+ "<s>": 1,
6
+ "<unk>": 3,
7
+ "a": 6,
8
+ "b": 24,
9
+ "c": 15,
10
+ "d": 18,
11
+ "e": 5,
12
+ "f": 34,
13
+ "g": 25,
14
+ "h": 27,
15
+ "i": 7,
16
+ "j": 21,
17
+ "k": 19,
18
+ "l": 23,
19
+ "m": 20,
20
+ "n": 9,
21
+ "o": 8,
22
+ "p": 17,
23
+ "q": 39,
24
+ "r": 11,
25
+ "s": 13,
26
+ "t": 14,
27
+ "u": 22,
28
+ "v": 37,
29
+ "w": 12,
30
+ "x": 38,
31
+ "y": 16,
32
+ "z": 10,
33
+ "|": 4,
34
+ "ó": 31,
35
+ "ą": 28,
36
+ "ć": 33,
37
+ "ę": 29,
38
+ "ł": 26,
39
+ "ń": 35,
40
+ "ś": 32,
41
+ "ź": 36,
42
+ "ż": 30
43
+ }
wav2vec2-end-of-speech-detection/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
wav2vec2-end-of-speech-detection/5sec_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ddddf2ffec49c3bf14f91967b7f66cecebd6c913fdd5e8a610bf45744eb4716
3
+ size 311930
wav2vec2-end-of-speech-detection/README.md ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - mozilla-foundation/common_voice_16_0
5
+ language:
6
+ - en
7
+ - de
8
+ - pl
9
+ - fr
10
+ - it
11
+ base_model:
12
+ - facebook/wav2vec2-base-960h
13
+ pipeline_tag: audio-classification
14
+ ---
15
+
16
+ # End of Speech Detection with Wav2Vec 2.0
17
+
18
+ The End-of-Speech model is based on the open-source Wav2Vec 2.0 model from Meta AI. It uses convolutional feature encoders, which translate chunks of raw audio input into latent speech representations, and a transformer that captures the information throughout this sequence of representations. This helps the model recognize different pitch declines, as well as final lengthening (and the following pause), in the intonation and therefore determine when an end-of-speech event occurs - the same way we humans do.
19
+
20
+ # Training Data
21
+
22
+ The training data is constructed from the Common Voice 16.0 English audio dataset by the Mozilla Foundation. It is released under the permissive CC0 1.0 license.
23
+
24
+ In order to train the Wav2Vec 2.0 model for end-of-speech detection, we needed a large enough dataset that consists of both end-of-speech and not-end-of-speech samples. Since there weren’t any open-source datasets that contained such ready-made samples, we needed to construct one. The Common Voice dataset consists of audio samples that contain only one spoken sentence each.
25
+
26
+ Unfortunately, there is additional noisy/empty audio at the beginning and end of the audio samples. To remove it and keep only the audio that corresponds to the spoken sentence, we need the timestamps of the sentence, or better yet, word-level timestamps. These are obtained with the help of WhisperX. This way we know when the sentence starts and finishes and can remove anything before and after.
27
+
28
+ After cleaning the samples, we went through random samples to validate the correctness of the procedure. Afterwards, we labeled the last 700/704 ms of each audio sample as an end-of-speech event and everything before that as not end of speech.
29
+
30
+ Finally, we added overlapping segments to the dataset by moving the 700/704 ms window in both directions.
31
+
32
+ # Input
33
+
34
+ The model is trained on 700 ms and 704 ms (11 x 64 ms) inputs of raw audio. The sample rate is 16 kHz. During experiments, different lengths were tested (300 ms, 500 ms and 1 s), and 700/704 ms proved to be the middle ground between good enough performance and the shortest chunk.
35
+
36
+ # Output
37
+
38
+ The model classifies each audio input into 2 classes - eos (id: 0) and not_eos (id: 1).
39
+
40
+ # Usage
41
+
42
+ ```python
43
+ from transformers import Wav2Vec2Processor, AutoConfig
44
+ import onnxruntime as rt
45
+ import torch
46
+ import torch.nn.functional as F
47
+ import numpy as np
48
+ import os
49
+ import torchaudio
50
+
51
+
52
class EndOfSpeechDetection:
    """End-of-speech classifier that runs a Wav2Vec 2.0 ONNX export.

    Holds three collaborators: a ``Wav2Vec2Processor`` that turns raw audio
    into model inputs, the model config (used for ``id2label``), and an
    ONNX Runtime ``InferenceSession`` that executes ``model.onnx``.
    """

    # Sample rate (Hz) announced to the processor; file audio is resampled to it.
    TARGET_SAMPLE_RATE = 16000
    # GPU execution providers tried, in this order, when ``use_gpu=True``.
    GPU_PROVIDERS = ("ROCMExecutionProvider", "CUDAExecutionProvider")

    # Quoted so the annotations are not evaluated when the class is created.
    processor: "Wav2Vec2Processor"
    config: "AutoConfig"
    session: "rt.InferenceSession"

    def load_model(self, path, use_gpu=False):
        """Load processor, config and ONNX session from the directory ``path``.

        Args:
            path: Directory containing the processor/config files and
                ``model.onnx``.
            use_gpu: When true, use whichever of ``GPU_PROVIDERS`` ONNX Runtime
                reports as available, with the CPU provider as fallback.

        Returns:
            Tuple ``(processor, config, session)``. The same objects are also
            stored on ``self`` so the instance is ready for ``predict``.
        """
        processor = Wav2Vec2Processor.from_pretrained(path)
        config = AutoConfig.from_pretrained(path)

        sess_options = rt.SessionOptions()
        sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL

        if use_gpu:
            # Only request providers this onnxruntime build actually offers,
            # and always keep CPU last as a fallback.
            available = rt.get_available_providers()
            providers = [name for name in self.GPU_PROVIDERS if name in available]
            providers.append("CPUExecutionProvider")
        else:
            providers = ["CPUExecutionProvider"]

        session = rt.InferenceSession(
            os.path.join(path, "model.onnx"), sess_options, providers=providers
        )

        # Keep the instance usable even if the caller ignores the return value.
        self.processor, self.config, self.session = processor, config, session
        return processor, config, session

    def predict(self, segment, file_type="pcm"):
        """Classify one audio file and return a probability per label.

        Args:
            segment: Path to the audio file.
            file_type: ``"pcm"`` reads the file as raw float32 samples; any
                other value loads it with ``torchaudio.load`` and uses the
                first channel.

        Returns:
            Dict mapping each label name from ``config.id2label`` to its
            softmax probability for the first (only) item of the batch.

        Raises:
            ValueError: If the loaded audio contains no samples.
        """
        if file_type == "pcm":
            # Raw PCM carries no header, so the sample rate cannot be checked.
            speech_array = np.memmap(segment, dtype="float32", mode="r").astype(
                np.float32
            )
        else:
            waveform, sample_rate = torchaudio.load(segment)
            if sample_rate != self.TARGET_SAMPLE_RATE:
                # The processor is told TARGET_SAMPLE_RATE below, so the data
                # must really be at that rate.
                waveform = torchaudio.functional.resample(
                    waveform, sample_rate, self.TARGET_SAMPLE_RATE
                )
            speech_array = waveform[0].numpy()

        if speech_array.size == 0:
            raise ValueError(f"No audio samples found in {segment!r}")

        features = self.processor(
            speech_array,
            sampling_rate=self.TARGET_SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )
        input_values = features.input_values
        outputs = self.session.run(
            [self.session.get_outputs()[-1].name],
            {self.session.get_inputs()[-1].name: input_values.detach().cpu().numpy()},
        )[0]
        probabilities = F.softmax(torch.tensor(outputs), dim=1)[0]

        return {
            self.config.id2label[i]: probabilities[i].item()
            for i in range(len(probabilities))
        }
97
+
98
+
99
if __name__ == "__main__":
    # Build the detector, attach the loaded pieces, then score one raw PCM file.
    eos = EndOfSpeechDetection()
    loaded_processor, loaded_config, loaded_session = eos.load_model("eos-model-onnx")
    eos.processor = loaded_processor
    eos.config = loaded_config
    eos.session = loaded_session

    scores = eos.predict("some.pcm", file_type="pcm")
    print(scores)
103
+
104
+ ```
105
+
106
+ # Latency (& Memory) Optimization
107
+ - Knowledge Distillation
108
+ - Onnx format weights
109
+ - The weights are converted to the ONNX format (in order to optimize CPU & GPU performance)
110
+ - As tested on an AMD Instinct MI100 GPU - sub 10ms inference per 704ms audio chunk
111
+
112
+ # Evaluation
113
+
114
+ Accuracy is 0.95 on 8120 test samples.
115
+
116
+ | classes | precision | recall | f1-score | support |
117
+ |---|---|---|---|---|
118
+ | eos | 0.94 | 0.95 | 0.95 | 4060 |
119
+ | not_eos | 0.95 | 0.94 | 0.95 | 4060 |
wav2vec2-end-of-speech-detection/eos-model-onnx/config.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "eos-det/model_07/checkpoint-2283",
3
+ "activation_dropout": 0.0,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForSequenceClassification"
11
+ ],
12
+ "attention_dropout": 0.1,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 256,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": false,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "sum",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": false,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_norm": "group",
52
+ "feat_proj_dropout": 0.1,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "freeze_feat_extract_train": true,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.1,
58
+ "hidden_size": 768,
59
+ "id2label": {
60
+ "0": "eos",
61
+ "1": "not_eos"
62
+ },
63
+ "initializer_range": 0.02,
64
+ "intermediate_size": 3072,
65
+ "label2id": {
66
+ "eos": 0,
67
+ "not_eos": 1
68
+ },
69
+ "layer_norm_eps": 1e-05,
70
+ "layerdrop": 0.0,
71
+ "mask_channel_length": 10,
72
+ "mask_channel_min_space": 1,
73
+ "mask_channel_other": 0.0,
74
+ "mask_channel_prob": 0.0,
75
+ "mask_channel_selection": "static",
76
+ "mask_feature_length": 10,
77
+ "mask_feature_min_masks": 0,
78
+ "mask_feature_prob": 0.0,
79
+ "mask_time_length": 10,
80
+ "mask_time_min_masks": 2,
81
+ "mask_time_min_space": 1,
82
+ "mask_time_other": 0.0,
83
+ "mask_time_prob": 0.05,
84
+ "mask_time_selection": "static",
85
+ "model_type": "wav2vec2",
86
+ "no_mask_channel_overlap": false,
87
+ "no_mask_time_overlap": false,
88
+ "num_adapter_layers": 3,
89
+ "num_attention_heads": 12,
90
+ "num_codevector_groups": 2,
91
+ "num_codevectors_per_group": 320,
92
+ "num_conv_pos_embedding_groups": 16,
93
+ "num_conv_pos_embeddings": 128,
94
+ "num_feat_extract_layers": 7,
95
+ "num_hidden_layers": 12,
96
+ "num_negatives": 100,
97
+ "output_hidden_size": 768,
98
+ "pad_token_id": 0,
99
+ "proj_codevector_dim": 256,
100
+ "tdnn_dilation": [
101
+ 1,
102
+ 2,
103
+ 3,
104
+ 1,
105
+ 1
106
+ ],
107
+ "tdnn_dim": [
108
+ 512,
109
+ 512,
110
+ 512,
111
+ 512,
112
+ 1500
113
+ ],
114
+ "tdnn_kernel": [
115
+ 5,
116
+ 3,
117
+ 3,
118
+ 1,
119
+ 1
120
+ ],
121
+ "transformers_version": "4.38.2",
122
+ "use_weighted_layer_sum": false,
123
+ "vocab_size": 32,
124
+ "xvector_output_dim": 512
125
+ }
wav2vec2-end-of-speech-detection/eos-model-onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d526a75c63ea501292f463f6de28c209fee2ccf733ada042155c01c1c5bc31a9
3
+ size 378578988
wav2vec2-end-of-speech-detection/eos-model-onnx/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
wav2vec2-end-of-speech-detection/eos-model-onnx/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
wav2vec2-end-of-speech-detection/eos-model-onnx/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "return_attention_mask": false, "do_normalize": true}
wav2vec2-end-of-speech-detection/eos-model-onnx/vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "E": 5, "T": 6, "A": 7, "O": 8, "N": 9, "I": 10, "H": 11, "S": 12, "R": 13, "D": 14, "L": 15, "U": 16, "M": 17, "W": 18, "C": 19, "F": 20, "G": 21, "Y": 22, "P": 23, "B": 24, "V": 25, "K": 26, "'": 27, "X": 28, "J": 29, "Q": 30, "Z": 31}
wav2vec2-end-of-speech-detection/inference.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import Wav2Vec2Processor, AutoConfig
2
+ import onnxruntime as rt
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import numpy as np
6
+ import os
7
+ import torchaudio
8
+ import soundfile as sf
9
+
10
+
11
class EndOfSpeechDetection:
    """End-of-speech classifier that runs a Wav2Vec 2.0 ONNX export.

    Holds three collaborators: a ``Wav2Vec2Processor`` that turns raw audio
    into model inputs, the model config (used for ``id2label``), and an
    ONNX Runtime ``InferenceSession`` that executes ``model.onnx``.
    """

    # Sample rate (Hz) announced to the processor; file audio is resampled to it.
    TARGET_SAMPLE_RATE = 16000
    # GPU execution providers tried, in this order, when ``use_gpu=True``.
    GPU_PROVIDERS = ("ROCMExecutionProvider", "CUDAExecutionProvider")

    # Quoted so the annotations are not evaluated when the class is created.
    processor: "Wav2Vec2Processor"
    config: "AutoConfig"
    session: "rt.InferenceSession"

    def load_model(self, path, use_gpu=False):
        """Load processor, config and ONNX session from the directory ``path``.

        Args:
            path: Directory containing the processor/config files and
                ``model.onnx``.
            use_gpu: When true, use whichever of ``GPU_PROVIDERS`` ONNX Runtime
                reports as available, with the CPU provider as fallback.

        Returns:
            Tuple ``(processor, config, session)``. The same objects are also
            stored on ``self`` so the instance is ready for ``predict``.
        """
        processor = Wav2Vec2Processor.from_pretrained(path)
        config = AutoConfig.from_pretrained(path)

        sess_options = rt.SessionOptions()
        sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL

        if use_gpu:
            # Only request providers this onnxruntime build actually offers,
            # and always keep CPU last as a fallback.
            available = rt.get_available_providers()
            providers = [name for name in self.GPU_PROVIDERS if name in available]
            providers.append("CPUExecutionProvider")
        else:
            providers = ["CPUExecutionProvider"]

        session = rt.InferenceSession(
            os.path.join(path, "model.onnx"), sess_options, providers=providers
        )

        # Keep the instance usable even if the caller ignores the return value.
        self.processor, self.config, self.session = processor, config, session
        return processor, config, session

    def predict(self, segment, file_type="pcm"):
        """Classify one audio file and return a probability per label.

        Args:
            segment: Path to the audio file.
            file_type: ``"pcm"`` reads the file as raw float32 samples; any
                other value loads it with ``torchaudio.load`` and uses the
                first channel.

        Returns:
            Dict mapping each label name from ``config.id2label`` to its
            softmax probability for the first (only) item of the batch.

        Raises:
            ValueError: If the loaded audio contains no samples.
        """
        if file_type == "pcm":
            # Raw PCM carries no header, so the sample rate cannot be checked.
            speech_array = np.memmap(segment, dtype="float32", mode="r").astype(
                np.float32
            )
        else:
            waveform, sample_rate = torchaudio.load(segment)
            if sample_rate != self.TARGET_SAMPLE_RATE:
                # The processor is told TARGET_SAMPLE_RATE below, so the data
                # must really be at that rate.
                waveform = torchaudio.functional.resample(
                    waveform, sample_rate, self.TARGET_SAMPLE_RATE
                )
            speech_array = waveform[0].numpy()

        if speech_array.size == 0:
            raise ValueError(f"No audio samples found in {segment!r}")

        features = self.processor(
            speech_array,
            sampling_rate=self.TARGET_SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )
        input_values = features.input_values
        outputs = self.session.run(
            [self.session.get_outputs()[-1].name],
            {self.session.get_inputs()[-1].name: input_values.detach().cpu().numpy()},
        )[0]
        probabilities = F.softmax(torch.tensor(outputs), dim=1)[0]

        return {
            self.config.id2label[i]: probabilities[i].item()
            for i in range(len(probabilities))
        }
56
+
57
+
58
if __name__ == "__main__":
    # Build the detector and attach the pieces returned by load_model.
    eos = EndOfSpeechDetection()
    eos.processor, eos.config, eos.session = eos.load_model("eos-model-onnx")

    # Read the demo clip and keep only its first channel as a 1-D array.
    waveform, sr = torchaudio.load("5sec_audio.wav")
    samples = waveform[0].numpy()

    # Number of samples in 700 ms at the clip's own sample rate.
    chunk_size = 700 * sr // 1000

    # Consecutive, non-overlapping chunks; slicing clamps at the end of the
    # array, so the final chunk is simply whatever samples remain.
    chunks = [
        samples[start : start + chunk_size]
        for start in range(0, len(samples), chunk_size)
    ]

    if not os.path.exists("segments"):
        os.makedirs("segments")

    # Write each chunk to disk, then classify it through the WAV code path.
    for index, chunk in enumerate(chunks):
        chunk_path = f"segments/segment_{index}.wav"
        sf.write(chunk_path, chunk, sr)
        print(eos.predict(chunk_path, file_type="wav"))
wav2vec2-end-of-speech-detection/languages.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ English
2
+ German
3
+ Polish
4
+ French
5
+ Italian
wav2vec2-end-of-speech-detection/segments/segment_0.wav ADDED
Binary file (22.4 kB). View file
 
wav2vec2-end-of-speech-detection/segments/segment_1.wav ADDED
Binary file (22.4 kB). View file
 
wav2vec2-end-of-speech-detection/segments/segment_2.wav ADDED
Binary file (22.4 kB). View file
 
wav2vec2-end-of-speech-detection/segments/segment_3.wav ADDED
Binary file (22.4 kB). View file
 
wav2vec2-end-of-speech-detection/segments/segment_4.wav ADDED
Binary file (22.4 kB). View file
 
wav2vec2-end-of-speech-detection/segments/segment_5.wav ADDED
Binary file (22.4 kB). View file
 
wav2vec2-end-of-speech-detection/segments/segment_6.wav ADDED
Binary file (21.6 kB). View file
 
wav2vec2-end-of-speech-detection/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/telnyx/wav2vec2-end-of-speech-detection
wav2vec2-large-xlsr-53-german-cv9/.gitattributes ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.onnx filter=lfs diff=lfs merge=lfs -text
13
+ *.ot filter=lfs diff=lfs merge=lfs -text
14
+ *.parquet filter=lfs diff=lfs merge=lfs -text
15
+ *.pb filter=lfs diff=lfs merge=lfs -text
16
+ *.pt filter=lfs diff=lfs merge=lfs -text
17
+ *.pth filter=lfs diff=lfs merge=lfs -text
18
+ *.rar filter=lfs diff=lfs merge=lfs -text
19
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
21
+ *.tflite filter=lfs diff=lfs merge=lfs -text
22
+ *.tgz filter=lfs diff=lfs merge=lfs -text
23
+ *.wasm filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ model.safetensors filter=lfs diff=lfs merge=lfs -text