Wav2vec 2.0 (de, en, pl, ru, multi)
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +3 -0
- distil-wav2vec2-onnx/.gitattributes +35 -0
- distil-wav2vec2-onnx/README.md +25 -0
- distil-wav2vec2-onnx/onnx/distil-wav2vec2_fp16.onnx +3 -0
- distil-wav2vec2-onnx/onnx/distil-wav2vec2_fp32.onnx +3 -0
- distil-wav2vec2-onnx/onnx/distil-wav2vec2_int8.onnx +3 -0
- distil-wav2vec2-onnx/source.txt +1 -0
- wav2vec2-alignment/.gitattributes +35 -0
- wav2vec2-alignment/README.md +44 -0
- wav2vec2-alignment/languages.txt +57 -0
- wav2vec2-alignment/source.txt +1 -0
- wav2vec2-alignment/wav2vec2-lv60-espeak-fp16.onnx +3 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/.gitattributes +35 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/README.md +89 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/config.json +111 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model.onnx +3 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_bnb4.onnx +3 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_fp16.onnx +3 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_int8.onnx +3 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_q4.onnx +3 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_q4f16.onnx +3 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_quantized.onnx +3 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_uint8.onnx +3 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/preprocessor_config.json +9 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/quantize_config.json +18 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/source.txt +1 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/special_tokens_map.json +6 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/tokenizer.json +110 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/tokenizer_config.json +49 -0
- wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/vocab.json +43 -0
- wav2vec2-end-of-speech-detection/.gitattributes +35 -0
- wav2vec2-end-of-speech-detection/5sec_audio.wav +3 -0
- wav2vec2-end-of-speech-detection/README.md +119 -0
- wav2vec2-end-of-speech-detection/eos-model-onnx/config.json +125 -0
- wav2vec2-end-of-speech-detection/eos-model-onnx/model.onnx +3 -0
- wav2vec2-end-of-speech-detection/eos-model-onnx/preprocessor_config.json +9 -0
- wav2vec2-end-of-speech-detection/eos-model-onnx/special_tokens_map.json +1 -0
- wav2vec2-end-of-speech-detection/eos-model-onnx/tokenizer_config.json +1 -0
- wav2vec2-end-of-speech-detection/eos-model-onnx/vocab.json +1 -0
- wav2vec2-end-of-speech-detection/inference.py +80 -0
- wav2vec2-end-of-speech-detection/languages.txt +5 -0
- wav2vec2-end-of-speech-detection/segments/segment_0.wav +0 -0
- wav2vec2-end-of-speech-detection/segments/segment_1.wav +0 -0
- wav2vec2-end-of-speech-detection/segments/segment_2.wav +0 -0
- wav2vec2-end-of-speech-detection/segments/segment_3.wav +0 -0
- wav2vec2-end-of-speech-detection/segments/segment_4.wav +0 -0
- wav2vec2-end-of-speech-detection/segments/segment_5.wav +0 -0
- wav2vec2-end-of-speech-detection/segments/segment_6.wav +0 -0
- wav2vec2-end-of-speech-detection/source.txt +1 -0
- wav2vec2-large-xlsr-53-german-cv9/.gitattributes +28 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
wav2vec2-end-of-speech-detection/5sec_audio.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
wav2vec2-lv-60-espeak-cv-ft-js/Simple[[:space:]]and[[:space:]]Effective[[:space:]]Zero-shot[[:space:]]Cross-lingual[[:space:]]Phoneme[[:space:]]Recognition.pdf filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
wav2vec2-lv-60-espeak-cv-ft-ONNX/Simple[[:space:]]and[[:space:]]Effective[[:space:]]Zero-shot[[:space:]]Cross-lingual[[:space:]]Phoneme[[:space:]]Recognition.pdf filter=lfs diff=lfs merge=lfs -text
|
distil-wav2vec2-onnx/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
distil-wav2vec2-onnx/README.md
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language: en
|
| 3 |
+
base_model:
|
| 4 |
+
- OthmaneJ/distil-wav2vec2
|
| 5 |
+
datasets:
|
| 6 |
+
- librispeech_asr
|
| 7 |
+
tags:
|
| 8 |
+
- onnx
|
| 9 |
+
- speech
|
| 10 |
+
- audio
|
| 11 |
+
- automatic-speech-recognition
|
| 12 |
+
license: apache-2.0
|
| 13 |
+
---
|
| 14 |
+
# Distil-wav2vec2 ONNX
|
| 15 |
+
|
| 16 |
+
This repository hosts ONNX exports of the Distil-wav2vec2 model.
|
| 17 |
+
|
| 18 |
+
## Contents
|
| 19 |
+
|
| 20 |
+
- `onnx/distil-wav2vec2_fp32.onnx`, `onnx/distil-wav2vec2_fp16.onnx`, `onnx/distil-wav2vec2_int8.onnx`
|
| 21 |
+
|
| 22 |
+
## Upstream
|
| 23 |
+
|
| 24 |
+
Original project:
|
| 25 |
+
https://huggingface.co/OthmaneJ/distil-wav2vec2
|
distil-wav2vec2-onnx/onnx/distil-wav2vec2_fp16.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:26048304a2f60cf74d0ba3e58d53b0cc01ef24b3fda90ec3eeacff28dcd6443e
|
| 3 |
+
size 104613131
|
distil-wav2vec2-onnx/onnx/distil-wav2vec2_fp32.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51df0aae6a9314a4fdde8e664334d2529fa983a46e9a6c60facb9532c86e8532
|
| 3 |
+
size 207542006
|
distil-wav2vec2-onnx/onnx/distil-wav2vec2_int8.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e994a38dcf2c4c24740a6afead518c6b7ea9add9c52d6c455a773b868b145e48
|
| 3 |
+
size 52161794
|
distil-wav2vec2-onnx/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/KevinAHM/distil-wav2vec2-onnx
|
wav2vec2-alignment/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
wav2vec2-alignment/README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
- multilingual
|
| 5 |
+
license: apache-2.0
|
| 6 |
+
tags:
|
| 7 |
+
- onnx
|
| 8 |
+
- audio
|
| 9 |
+
- automatic-speech-recognition
|
| 10 |
+
- phoneme-recognition
|
| 11 |
+
- wav2vec2
|
| 12 |
+
base_model: facebook/wav2vec2-lv-60-espeak-cv-ft
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# Wav2Vec2-LV-60-Espeak-CV-FT (ONNX)
|
| 16 |
+
|
| 17 |
+
This is an **ONNX export** of the [facebook/wav2vec2-lv-60-espeak-cv-ft](https://huggingface.co/facebook/wav2vec2-lv-60-espeak-cv-ft) model.
|
| 18 |
+
|
| 19 |
+
It is designed for client-side inference in the **UltrClick ContentPro** application to perform forced alignment of lyrics to audio.
|
| 20 |
+
|
| 21 |
+
## Model Details
|
| 22 |
+
|
| 23 |
+
- **Original Model**: `facebook/wav2vec2-lv-60-espeak-cv-ft`
|
| 24 |
+
- **Format**: ONNX (Open Neural Network Exchange)
|
| 25 |
+
- **Precision**: FP16 (Float16)
|
| 26 |
+
- **Output**: IPA Phoneme logits (392 vocab size)
|
| 27 |
+
- **Sample Rate**: 16kHz
|
| 28 |
+
|
| 29 |
+
## Usage
|
| 30 |
+
|
| 31 |
+
This model is intended to be used with the ONNX Runtime (e.g., via `ort` in Rust or `onnxruntime` in Python).
|
| 32 |
+
|
| 33 |
+
### Input
|
| 34 |
+
- **Name**: `audio`
|
| 35 |
+
- **Shape**: `[batch_size, samples]`
|
| 36 |
+
- **Type**: Float32 tensor
|
| 37 |
+
|
| 38 |
+
### Output
|
| 39 |
+
- **Name**: `logits`
|
| 40 |
+
- **Shape**: `[batch_size, frames, 392]` (392 is the vocab size)
|
| 41 |
+
|
| 42 |
+
## License
|
| 43 |
+
|
| 44 |
+
This model is a derivative of the original `facebook/wav2vec2-lv-60-espeak-cv-ft` model and retains the **Apache 2.0** license.
|
wav2vec2-alignment/languages.txt
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Abkhaz
|
| 2 |
+
Arabic
|
| 3 |
+
Assamese
|
| 4 |
+
Breton
|
| 5 |
+
Catalan
|
| 6 |
+
Hakha Chin
|
| 7 |
+
Czech
|
| 8 |
+
Chuvash
|
| 9 |
+
Welsh
|
| 10 |
+
German
|
| 11 |
+
Divehi
|
| 12 |
+
Greek
|
| 13 |
+
English
|
| 14 |
+
Esperanto
|
| 15 |
+
Spanish
|
| 16 |
+
Estonian
|
| 17 |
+
Basque
|
| 18 |
+
Persian
|
| 19 |
+
Finnish
|
| 20 |
+
French
|
| 21 |
+
Western Frisian
|
| 22 |
+
Irish
|
| 23 |
+
Hindi
|
| 24 |
+
Upper Sorbian
|
| 25 |
+
Hungarian
|
| 26 |
+
Interlingua
|
| 27 |
+
Indonesian
|
| 28 |
+
Italian
|
| 29 |
+
Japanese
|
| 30 |
+
Georgian
|
| 31 |
+
Kabyle
|
| 32 |
+
Kyrgyz
|
| 33 |
+
Ganda
|
| 34 |
+
Lithuanian
|
| 35 |
+
Latvian
|
| 36 |
+
Mongolian
|
| 37 |
+
Maltese
|
| 38 |
+
Dutch
|
| 39 |
+
Oriya
|
| 40 |
+
Panjabi
|
| 41 |
+
Polish
|
| 42 |
+
Portuguese
|
| 43 |
+
Romansh
|
| 44 |
+
Romanian
|
| 45 |
+
Russian
|
| 46 |
+
Kinyarwanda
|
| 47 |
+
Yakut
|
| 48 |
+
Slovenian
|
| 49 |
+
Swedish
|
| 50 |
+
Tamil
|
| 51 |
+
Thai
|
| 52 |
+
Turkish
|
| 53 |
+
Tatar
|
| 54 |
+
Ukrainian
|
| 55 |
+
Vietnamese
|
| 56 |
+
Votic
|
| 57 |
+
Chinese
|
wav2vec2-alignment/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/Hochien/wav2vec2-alignment
|
wav2vec2-alignment/wav2vec2-lv60-espeak-fp16.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a65177b47f8b304dc11d962ce91a6943f54e55dca69caecaaa35beab80f49925
|
| 3 |
+
size 632239986
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/README.md
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language: pl
|
| 3 |
+
tags:
|
| 4 |
+
- audio
|
| 5 |
+
- automatic-speech-recognition
|
| 6 |
+
- voxpopuli
|
| 7 |
+
license: cc-by-nc-4.0
|
| 8 |
+
library_name: transformers.js
|
| 9 |
+
base_model:
|
| 10 |
+
- facebook/wav2vec2-base-10k-voxpopuli-ft-pl
|
| 11 |
+
pipeline_tag: automatic-speech-recognition
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# wav2vec2-base-10k-voxpopuli-ft-pl (ONNX)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
This is an ONNX version of [facebook/wav2vec2-base-10k-voxpopuli-ft-pl](https://huggingface.co/facebook/wav2vec2-base-10k-voxpopuli-ft-pl). It was automatically converted and uploaded using [this Hugging Face Space](https://huggingface.co/spaces/onnx-community/convert-to-onnx).
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
## Usage with Transformers.js
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
See the pipeline documentation for `automatic-speech-recognition`: https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.AutomaticSpeechRecognitionPipeline
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
---
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# Wav2Vec2-Base-VoxPopuli-Finetuned
|
| 32 |
+
|
| 33 |
+
[Facebook's Wav2Vec2](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) base model pretrained on the 10K unlabeled subset of the [VoxPopuli corpus](https://arxiv.org/abs/2101.00390) and fine-tuned on the transcribed Polish (pl) data (refer to Table 1 of the paper for more information).
|
| 34 |
+
|
| 35 |
+
**Paper**: *[VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation
|
| 36 |
+
Learning, Semi-Supervised Learning and Interpretation](https://arxiv.org/abs/2101.00390)*
|
| 37 |
+
|
| 38 |
+
**Authors**: *Changhan Wang, Morgane Riviere, Ann Lee, Anne Wu, Chaitanya Talnikar, Daniel Haziza, Mary Williamson, Juan Pino, Emmanuel Dupoux* from *Facebook AI*
|
| 39 |
+
|
| 40 |
+
See the [official repository](https://github.com/facebookresearch/voxpopuli/) for more information.
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# Usage for inference
|
| 44 |
+
|
| 45 |
+
The following example shows how to run inference with the model on a sample of the [Common Voice dataset](https://commonvoice.mozilla.org/en/datasets):
|
| 46 |
+
|
| 47 |
+
```python
|
| 48 |
+
#!/usr/bin/env python3
|
| 49 |
+
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
| 50 |
+
from datasets import load_dataset
|
| 51 |
+
import torchaudio
|
| 52 |
+
import torch
|
| 53 |
+
|
| 54 |
+
# resample audio
|
| 55 |
+
|
| 56 |
+
# load model & processor
|
| 57 |
+
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-10k-voxpopuli-ft-pl")
|
| 58 |
+
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-10k-voxpopuli-ft-pl")
|
| 59 |
+
|
| 60 |
+
# load dataset
|
| 61 |
+
ds = load_dataset("common_voice", "pl", split="validation[:1%]")
|
| 62 |
+
|
| 63 |
+
# common voice does not match target sampling rate
|
| 64 |
+
common_voice_sample_rate = 48000
|
| 65 |
+
target_sample_rate = 16000
|
| 66 |
+
|
| 67 |
+
resampler = torchaudio.transforms.Resample(common_voice_sample_rate, target_sample_rate)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# define mapping fn to read in sound file and resample
|
| 71 |
+
def map_to_array(batch):
|
| 72 |
+
speech, _ = torchaudio.load(batch["path"])
|
| 73 |
+
speech = resampler(speech)
|
| 74 |
+
batch["speech"] = speech[0]
|
| 75 |
+
return batch
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# load all audio files
|
| 79 |
+
ds = ds.map(map_to_array)
|
| 80 |
+
|
| 81 |
+
# run inference on the first 5 data samples
|
| 82 |
+
inputs = processor(ds[:5]["speech"], sampling_rate=target_sample_rate, return_tensors="pt", padding=True)
|
| 83 |
+
|
| 84 |
+
# inference
|
| 85 |
+
logits = model(**inputs).logits
|
| 86 |
+
predicted_ids = torch.argmax(logits, axis=-1)
|
| 87 |
+
|
| 88 |
+
print(processor.batch_decode(predicted_ids))
|
| 89 |
+
```
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/config.json
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_attn_implementation_autoset": true,
|
| 3 |
+
"_name_or_path": "facebook/wav2vec2-base-10k-voxpopuli-ft-pl",
|
| 4 |
+
"activation_dropout": 0.1,
|
| 5 |
+
"adapter_attn_dim": null,
|
| 6 |
+
"adapter_kernel_size": 3,
|
| 7 |
+
"adapter_stride": 2,
|
| 8 |
+
"add_adapter": false,
|
| 9 |
+
"apply_spec_augment": true,
|
| 10 |
+
"architectures": [
|
| 11 |
+
"Wav2Vec2ForCTC"
|
| 12 |
+
],
|
| 13 |
+
"attention_dropout": 0.1,
|
| 14 |
+
"bos_token_id": 0,
|
| 15 |
+
"classifier_proj_size": 256,
|
| 16 |
+
"codevector_dim": 256,
|
| 17 |
+
"contrastive_logits_temperature": 0.1,
|
| 18 |
+
"conv_bias": false,
|
| 19 |
+
"conv_dim": [
|
| 20 |
+
512,
|
| 21 |
+
512,
|
| 22 |
+
512,
|
| 23 |
+
512,
|
| 24 |
+
512,
|
| 25 |
+
512,
|
| 26 |
+
512
|
| 27 |
+
],
|
| 28 |
+
"conv_kernel": [
|
| 29 |
+
10,
|
| 30 |
+
3,
|
| 31 |
+
3,
|
| 32 |
+
3,
|
| 33 |
+
3,
|
| 34 |
+
2,
|
| 35 |
+
2
|
| 36 |
+
],
|
| 37 |
+
"conv_stride": [
|
| 38 |
+
5,
|
| 39 |
+
2,
|
| 40 |
+
2,
|
| 41 |
+
2,
|
| 42 |
+
2,
|
| 43 |
+
2,
|
| 44 |
+
2
|
| 45 |
+
],
|
| 46 |
+
"ctc_loss_reduction": "sum",
|
| 47 |
+
"ctc_zero_infinity": false,
|
| 48 |
+
"diversity_loss_weight": 0.1,
|
| 49 |
+
"do_stable_layer_norm": false,
|
| 50 |
+
"eos_token_id": 2,
|
| 51 |
+
"feat_extract_activation": "gelu",
|
| 52 |
+
"feat_extract_dropout": 0.0,
|
| 53 |
+
"feat_extract_norm": "group",
|
| 54 |
+
"feat_proj_dropout": 0.1,
|
| 55 |
+
"feat_quantizer_dropout": 0.0,
|
| 56 |
+
"final_dropout": 0.1,
|
| 57 |
+
"gradient_checkpointing": false,
|
| 58 |
+
"hidden_act": "gelu",
|
| 59 |
+
"hidden_dropout": 0.1,
|
| 60 |
+
"hidden_dropout_prob": 0.1,
|
| 61 |
+
"hidden_size": 768,
|
| 62 |
+
"initializer_range": 0.02,
|
| 63 |
+
"intermediate_size": 3072,
|
| 64 |
+
"layer_norm_eps": 1e-05,
|
| 65 |
+
"layerdrop": 0.1,
|
| 66 |
+
"mask_feature_length": 10,
|
| 67 |
+
"mask_feature_min_masks": 0,
|
| 68 |
+
"mask_feature_prob": 0.0,
|
| 69 |
+
"mask_time_length": 10,
|
| 70 |
+
"mask_time_min_masks": 2,
|
| 71 |
+
"mask_time_prob": 0.05,
|
| 72 |
+
"model_type": "wav2vec2",
|
| 73 |
+
"num_adapter_layers": 3,
|
| 74 |
+
"num_attention_heads": 12,
|
| 75 |
+
"num_codevector_groups": 2,
|
| 76 |
+
"num_codevectors_per_group": 320,
|
| 77 |
+
"num_conv_pos_embedding_groups": 16,
|
| 78 |
+
"num_conv_pos_embeddings": 128,
|
| 79 |
+
"num_feat_extract_layers": 7,
|
| 80 |
+
"num_hidden_layers": 12,
|
| 81 |
+
"num_negatives": 100,
|
| 82 |
+
"output_hidden_size": 768,
|
| 83 |
+
"pad_token_id": 1,
|
| 84 |
+
"proj_codevector_dim": 256,
|
| 85 |
+
"tdnn_dilation": [
|
| 86 |
+
1,
|
| 87 |
+
2,
|
| 88 |
+
3,
|
| 89 |
+
1,
|
| 90 |
+
1
|
| 91 |
+
],
|
| 92 |
+
"tdnn_dim": [
|
| 93 |
+
512,
|
| 94 |
+
512,
|
| 95 |
+
512,
|
| 96 |
+
512,
|
| 97 |
+
1500
|
| 98 |
+
],
|
| 99 |
+
"tdnn_kernel": [
|
| 100 |
+
5,
|
| 101 |
+
3,
|
| 102 |
+
3,
|
| 103 |
+
1,
|
| 104 |
+
1
|
| 105 |
+
],
|
| 106 |
+
"torch_dtype": "float32",
|
| 107 |
+
"transformers_version": "4.49.0",
|
| 108 |
+
"use_weighted_layer_sum": false,
|
| 109 |
+
"vocab_size": 41,
|
| 110 |
+
"xvector_output_dim": 512
|
| 111 |
+
}
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9309e96f5bb53ba957a7646cc9c5a3d61bd47d930a6998d1863e0ec081c3b199
|
| 3 |
+
size 377939575
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_bnb4.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b5a5d1dd62965a54abd99edf07f0bf99bd01811b4fe3fcbec94579ddbae8c016
|
| 3 |
+
size 84528286
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_fp16.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4724fe007a6aabb2f2a8d63318d28e533491e29b3877752f5e80da92b34c15e4
|
| 3 |
+
size 189132785
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_int8.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8f066fac7d0662feec0e64002a4a969032a02ee5a4048bb1959d324801ead168
|
| 3 |
+
size 95219724
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_q4.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9265d2e19425b2240ac9c251e963d4a2fcda05741dd775076f4098b5e806b797
|
| 3 |
+
size 89862702
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_q4f16.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e104f81133126cb900fdc017e1daecf45e3663668ab9666d1c40cb8ce3057745
|
| 3 |
+
size 66439769
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_quantized.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb33c88ee1fc2010aebbb9a8eef0034638a1d418e3bb57cab01ba70c1a520ad3
|
| 3 |
+
size 95219762
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/onnx/model_uint8.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb33c88ee1fc2010aebbb9a8eef0034638a1d418e3bb57cab01ba70c1a520ad3
|
| 3 |
+
size 95219762
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/preprocessor_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_normalize": true,
|
| 3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
| 4 |
+
"feature_size": 1,
|
| 5 |
+
"padding_side": "right",
|
| 6 |
+
"padding_value": 0,
|
| 7 |
+
"return_attention_mask": false,
|
| 8 |
+
"sampling_rate": 16000
|
| 9 |
+
}
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/quantize_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"modes": [
|
| 3 |
+
"fp16",
|
| 4 |
+
"q8",
|
| 5 |
+
"int8",
|
| 6 |
+
"uint8",
|
| 7 |
+
"q4",
|
| 8 |
+
"q4f16",
|
| 9 |
+
"bnb4"
|
| 10 |
+
],
|
| 11 |
+
"per_channel": false,
|
| 12 |
+
"reduce_range": false,
|
| 13 |
+
"block_size": null,
|
| 14 |
+
"is_symmetric": true,
|
| 15 |
+
"accuracy_level": null,
|
| 16 |
+
"quant_type": 1,
|
| 17 |
+
"op_block_list": null
|
| 18 |
+
}
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/onnx-community/wav2vec2-base-10k-voxpopuli-ft-pl-ONNX
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<s>",
|
| 3 |
+
"eos_token": "</s>",
|
| 4 |
+
"pad_token": "<pad>",
|
| 5 |
+
"unk_token": "<unk>"
|
| 6 |
+
}
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/tokenizer.json
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "1.0",
|
| 3 |
+
"truncation": null,
|
| 4 |
+
"padding": null,
|
| 5 |
+
"added_tokens": [
|
| 6 |
+
{
|
| 7 |
+
"id": 1,
|
| 8 |
+
"content": "<s>",
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"lstrip": true,
|
| 11 |
+
"rstrip": true,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"special": true
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"id": 0,
|
| 17 |
+
"content": "<pad>",
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"lstrip": true,
|
| 20 |
+
"rstrip": true,
|
| 21 |
+
"normalized": false,
|
| 22 |
+
"special": true
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"id": 2,
|
| 26 |
+
"content": "</s>",
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"lstrip": true,
|
| 29 |
+
"rstrip": true,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"special": true
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"id": 3,
|
| 35 |
+
"content": "<unk>",
|
| 36 |
+
"single_word": false,
|
| 37 |
+
"lstrip": true,
|
| 38 |
+
"rstrip": true,
|
| 39 |
+
"normalized": false,
|
| 40 |
+
"special": true
|
| 41 |
+
}
|
| 42 |
+
],
|
| 43 |
+
"normalizer": {
|
| 44 |
+
"type": "Replace",
|
| 45 |
+
"pattern": {
|
| 46 |
+
"String": " "
|
| 47 |
+
},
|
| 48 |
+
"content": "|"
|
| 49 |
+
},
|
| 50 |
+
"pre_tokenizer": {
|
| 51 |
+
"type": "Split",
|
| 52 |
+
"pattern": {
|
| 53 |
+
"Regex": ""
|
| 54 |
+
},
|
| 55 |
+
"behavior": "Isolated",
|
| 56 |
+
"invert": false
|
| 57 |
+
},
|
| 58 |
+
"post_processor": null,
|
| 59 |
+
"decoder": {
|
| 60 |
+
"type": "CTC",
|
| 61 |
+
"pad_token": "<pad>",
|
| 62 |
+
"word_delimiter_token": "|",
|
| 63 |
+
"cleanup": true
|
| 64 |
+
},
|
| 65 |
+
"model": {
|
| 66 |
+
"vocab": {
|
| 67 |
+
"<s>": 1,
|
| 68 |
+
"<pad>": 0,
|
| 69 |
+
"</s>": 2,
|
| 70 |
+
"<unk>": 3,
|
| 71 |
+
"|": 4,
|
| 72 |
+
"e": 5,
|
| 73 |
+
"a": 6,
|
| 74 |
+
"i": 7,
|
| 75 |
+
"o": 8,
|
| 76 |
+
"n": 9,
|
| 77 |
+
"z": 10,
|
| 78 |
+
"r": 11,
|
| 79 |
+
"w": 12,
|
| 80 |
+
"s": 13,
|
| 81 |
+
"t": 14,
|
| 82 |
+
"c": 15,
|
| 83 |
+
"y": 16,
|
| 84 |
+
"p": 17,
|
| 85 |
+
"d": 18,
|
| 86 |
+
"k": 19,
|
| 87 |
+
"m": 20,
|
| 88 |
+
"j": 21,
|
| 89 |
+
"u": 22,
|
| 90 |
+
"l": 23,
|
| 91 |
+
"b": 24,
|
| 92 |
+
"g": 25,
|
| 93 |
+
"\u0142": 26,
|
| 94 |
+
"h": 27,
|
| 95 |
+
"\u0105": 28,
|
| 96 |
+
"\u0119": 29,
|
| 97 |
+
"\u017c": 30,
|
| 98 |
+
"\u00f3": 31,
|
| 99 |
+
"\u015b": 32,
|
| 100 |
+
"\u0107": 33,
|
| 101 |
+
"f": 34,
|
| 102 |
+
"\u0144": 35,
|
| 103 |
+
"\u017a": 36,
|
| 104 |
+
"v": 37,
|
| 105 |
+
"x": 38,
|
| 106 |
+
"q": 39,
|
| 107 |
+
"1": 40
|
| 108 |
+
}
|
| 109 |
+
}
|
| 110 |
+
}
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/tokenizer_config.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<pad>",
|
| 5 |
+
"lstrip": true,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": true,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": false
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<s>",
|
| 13 |
+
"lstrip": true,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": true,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": false
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "</s>",
|
| 21 |
+
"lstrip": true,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": true,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": false
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<unk>",
|
| 29 |
+
"lstrip": true,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": true,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": false
|
| 34 |
+
}
|
| 35 |
+
},
|
| 36 |
+
"bos_token": "<s>",
|
| 37 |
+
"clean_up_tokenization_spaces": false,
|
| 38 |
+
"do_lower_case": false,
|
| 39 |
+
"eos_token": "</s>",
|
| 40 |
+
"extra_special_tokens": {},
|
| 41 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 42 |
+
"pad_token": "<pad>",
|
| 43 |
+
"processor_class": "Wav2Vec2Processor",
|
| 44 |
+
"replace_word_delimiter_char": " ",
|
| 45 |
+
"target_lang": null,
|
| 46 |
+
"tokenizer_class": "Wav2Vec2CTCTokenizer",
|
| 47 |
+
"unk_token": "<unk>",
|
| 48 |
+
"word_delimiter_token": "|"
|
| 49 |
+
}
|
wav2vec2-base-10k-voxpopuli-ft-pl-ONNX/vocab.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": 40,
|
| 3 |
+
"</s>": 2,
|
| 4 |
+
"<pad>": 0,
|
| 5 |
+
"<s>": 1,
|
| 6 |
+
"<unk>": 3,
|
| 7 |
+
"a": 6,
|
| 8 |
+
"b": 24,
|
| 9 |
+
"c": 15,
|
| 10 |
+
"d": 18,
|
| 11 |
+
"e": 5,
|
| 12 |
+
"f": 34,
|
| 13 |
+
"g": 25,
|
| 14 |
+
"h": 27,
|
| 15 |
+
"i": 7,
|
| 16 |
+
"j": 21,
|
| 17 |
+
"k": 19,
|
| 18 |
+
"l": 23,
|
| 19 |
+
"m": 20,
|
| 20 |
+
"n": 9,
|
| 21 |
+
"o": 8,
|
| 22 |
+
"p": 17,
|
| 23 |
+
"q": 39,
|
| 24 |
+
"r": 11,
|
| 25 |
+
"s": 13,
|
| 26 |
+
"t": 14,
|
| 27 |
+
"u": 22,
|
| 28 |
+
"v": 37,
|
| 29 |
+
"w": 12,
|
| 30 |
+
"x": 38,
|
| 31 |
+
"y": 16,
|
| 32 |
+
"z": 10,
|
| 33 |
+
"|": 4,
|
| 34 |
+
"ó": 31,
|
| 35 |
+
"ą": 28,
|
| 36 |
+
"ć": 33,
|
| 37 |
+
"ę": 29,
|
| 38 |
+
"ł": 26,
|
| 39 |
+
"ń": 35,
|
| 40 |
+
"ś": 32,
|
| 41 |
+
"ź": 36,
|
| 42 |
+
"ż": 30
|
| 43 |
+
}
|
wav2vec2-end-of-speech-detection/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
wav2vec2-end-of-speech-detection/5sec_audio.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ddddf2ffec49c3bf14f91967b7f66cecebd6c913fdd5e8a610bf45744eb4716
|
| 3 |
+
size 311930
|
wav2vec2-end-of-speech-detection/README.md
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
datasets:
|
| 4 |
+
- mozilla-foundation/common_voice_16_0
|
| 5 |
+
language:
|
| 6 |
+
- en
|
| 7 |
+
- de
|
| 8 |
+
- pl
|
| 9 |
+
- fr
|
| 10 |
+
- it
|
| 11 |
+
base_model:
|
| 12 |
+
- facebook/wav2vec2-base-960h
|
| 13 |
+
pipeline_tag: audio-classification
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
# End of Speech Detection with Wav2Vec 2.0
|
| 17 |
+
|
| 18 |
+
The End-of-Speech model is based on the open-source Wav2Vec 2.0 model from Meta AI. It uses convolutional feature encoders, which translate chunks of raw audio input into latent speech representations, and a transformer to capture the information throughout this sequence of representations. This helps the model distinguish different pitch declines, as well as final lengthening (and the following pause) in the intonation, and therefore detect when an end-of-speech event occurs - the same way we humans do.
|
| 19 |
+
|
| 20 |
+
# Training Data
|
| 21 |
+
|
| 22 |
+
The training data is constructed from the Common Voice 16.0 English audio dataset by the Mozilla Foundation. It is released under the permissive CC0 1.0 license.
|
| 23 |
+
|
| 24 |
+
In order to train the wav2vec 2.0 model for end-of-speech detection, we needed a large enough dataset that consists of both end-of-speech and not-end-of-speech samples. Since there weren’t any open-source datasets that contained such ready-made samples, we needed to construct one. The Common Voice dataset consists of audio samples that contain only one spoken sentence each.
|
| 25 |
+
|
| 26 |
+
Unfortunately, there is additional noisy/empty audio in the beginning and end of the audio samples. To remove those and capture only the audio that corresponds to the spoken sentence, we would need the timestamp of the sentence, or better yet, the word level timestamps. This is achieved with the help of whisperX. This way we capture when the sentence starts and finishes and remove anything before and after.
|
| 27 |
+
|
| 28 |
+
After cleaning the samples, we inspected random samples to validate the correctness of the procedure. Afterwards, we labeled the last 700/704ms of each audio sample as an end-of-speech event and everything before that as not end of speech.
|
| 29 |
+
|
| 30 |
+
Finally, in addition, we added overlapping segments to the dataset by moving the 700/704ms window in both directions.
|
| 31 |
+
|
| 32 |
+
# Input
|
| 33 |
+
|
| 34 |
+
The model is trained on 700ms and 704ms (11x64ms) inputs of raw audio. The sample rate is 16kHz. During experiments, different lengths were tested (300ms, 500ms and 1 sec), and 700/704ms proved to be the middle ground between good enough performance and the shortest possible chunk.
|
| 35 |
+
|
| 36 |
+
# Output
|
| 37 |
+
|
| 38 |
+
The model classifies each audio input into 2 classes - eos (id: 0) and not_eos (id: 1).
|
| 39 |
+
|
| 40 |
+
# Usage
|
| 41 |
+
|
| 42 |
+
```python
|
| 43 |
+
from transformers import Wav2Vec2Processor, AutoConfig
|
| 44 |
+
import onnxruntime as rt
|
| 45 |
+
import torch
|
| 46 |
+
import torch.nn.functional as F
|
| 47 |
+
import numpy as np
|
| 48 |
+
import os
|
| 49 |
+
import torchaudio
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class EndOfSpeechDetection:
    """ONNX Runtime wrapper around the Wav2Vec 2.0 end-of-speech classifier.

    ``load_model`` reads the processor, the model config and ``model.onnx``
    from a directory; ``predict`` classifies one audio segment and returns the
    softmax probability of every label listed in ``config.id2label``.
    """

    # Sample rate (Hz) announced to the processor. Wave input recorded at any
    # other rate is resampled to this value before feature extraction.
    SAMPLE_RATE = 16000

    processor: "Wav2Vec2Processor"
    config: "AutoConfig"
    session: "rt.InferenceSession"

    def load_model(self, path, use_gpu=False):
        """Load processor, config and ONNX session from the directory ``path``.

        Args:
            path: Directory holding the processor/config files and ``model.onnx``.
            use_gpu: When true, prefer a GPU execution provider (ROCm, then
                CUDA) if onnxruntime reports it as available; the CPU provider
                is always kept as the final fallback.

        Returns:
            The tuple ``(processor, config, session)``. The same objects are
            also stored on the instance so ``predict`` can be called directly.
        """
        processor = Wav2Vec2Processor.from_pretrained(path)
        config = AutoConfig.from_pretrained(path)

        sess_options = rt.SessionOptions()
        sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL

        providers = ["CPUExecutionProvider"]
        if use_gpu:
            # Only request GPU providers this onnxruntime build actually offers.
            available = rt.get_available_providers()
            gpu_providers = [
                name
                for name in ("ROCMExecutionProvider", "CUDAExecutionProvider")
                if name in available
            ]
            providers = gpu_providers + providers

        session = rt.InferenceSession(
            os.path.join(path, "model.onnx"), sess_options, providers=providers
        )

        # Keep the loaded objects on the instance; returning them as well keeps
        # existing callers that unpack the tuple working.
        self.processor, self.config, self.session = processor, config, session
        return processor, config, session

    def predict(self, segment, file_type="pcm"):
        """Classify one audio segment.

        Args:
            segment: Path of the audio file to classify.
            file_type: ``"pcm"`` reads the file as raw float32 samples; any
                other value loads it with ``torchaudio.load`` and uses the
                first channel.

        Returns:
            Dict mapping each label in ``config.id2label`` to its softmax
            probability for this segment.
        """
        if file_type == "pcm":
            # Raw samples: the whole file is interpreted as float32 values.
            speech_array = np.fromfile(segment, dtype=np.float32)
        else:
            waveform, sample_rate = torchaudio.load(segment)
            if sample_rate != self.SAMPLE_RATE:
                # The processor is told the audio is SAMPLE_RATE Hz, so make
                # that true instead of silently misreading other rates.
                waveform = torchaudio.functional.resample(
                    waveform, sample_rate, self.SAMPLE_RATE
                )
            speech_array = waveform[0].numpy()

        features = self.processor(
            speech_array,
            sampling_rate=self.SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )
        input_values = features.input_values.detach().cpu().numpy()

        # The last declared graph input/output are used, as in the original code.
        output_name = self.session.get_outputs()[-1].name
        input_name = self.session.get_inputs()[-1].name
        logits = self.session.run([output_name], {input_name: input_values})[0]

        probabilities = F.softmax(torch.tensor(logits), dim=1)[0]
        return {
            self.config.id2label[i]: probabilities[i].item()
            for i in range(len(probabilities))
        }
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
if __name__ == "__main__":
    # Minimal demo: load the exported model, then classify one raw PCM file.
    detector = EndOfSpeechDetection()
    loaded = detector.load_model("eos-model-onnx")
    detector.processor, detector.config, detector.session = loaded
    prediction = detector.predict("some.pcm", file_type="pcm")
    print(prediction)
|
| 103 |
+
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
# Latency (& Memory) Optimization
|
| 107 |
+
- Knowledge Distillation
|
| 108 |
+
- ONNX format weights
|
| 109 |
+
- The weights are converted to the ONNX format (in order to optimize CPU and GPU performance)
|
| 110 |
+
- As tested on an AMD Instinct MI100 GPU - sub 10ms inference per 704ms audio chunk
|
| 111 |
+
|
| 112 |
+
# Evaluation
|
| 113 |
+
|
| 114 |
+
Accuracy is 0.95, measured on 8120 test samples.
|
| 115 |
+
|
| 116 |
+
| classes | precision | recall | f1-score | support |
|
| 117 |
+
|---|---|---|---|---|
|
| 118 |
+
| eos | 0.94 | 0.95 | 0.95 | 4060 |
|
| 119 |
+
| not_eos | 0.95 | 0.94 | 0.95 | 4060 |
|
wav2vec2-end-of-speech-detection/eos-model-onnx/config.json
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "eos-det/model_07/checkpoint-2283",
|
| 3 |
+
"activation_dropout": 0.0,
|
| 4 |
+
"adapter_attn_dim": null,
|
| 5 |
+
"adapter_kernel_size": 3,
|
| 6 |
+
"adapter_stride": 2,
|
| 7 |
+
"add_adapter": false,
|
| 8 |
+
"apply_spec_augment": true,
|
| 9 |
+
"architectures": [
|
| 10 |
+
"Wav2Vec2ForSequenceClassification"
|
| 11 |
+
],
|
| 12 |
+
"attention_dropout": 0.1,
|
| 13 |
+
"bos_token_id": 1,
|
| 14 |
+
"classifier_proj_size": 256,
|
| 15 |
+
"codevector_dim": 256,
|
| 16 |
+
"contrastive_logits_temperature": 0.1,
|
| 17 |
+
"conv_bias": false,
|
| 18 |
+
"conv_dim": [
|
| 19 |
+
512,
|
| 20 |
+
512,
|
| 21 |
+
512,
|
| 22 |
+
512,
|
| 23 |
+
512,
|
| 24 |
+
512,
|
| 25 |
+
512
|
| 26 |
+
],
|
| 27 |
+
"conv_kernel": [
|
| 28 |
+
10,
|
| 29 |
+
3,
|
| 30 |
+
3,
|
| 31 |
+
3,
|
| 32 |
+
3,
|
| 33 |
+
2,
|
| 34 |
+
2
|
| 35 |
+
],
|
| 36 |
+
"conv_stride": [
|
| 37 |
+
5,
|
| 38 |
+
2,
|
| 39 |
+
2,
|
| 40 |
+
2,
|
| 41 |
+
2,
|
| 42 |
+
2,
|
| 43 |
+
2
|
| 44 |
+
],
|
| 45 |
+
"ctc_loss_reduction": "sum",
|
| 46 |
+
"ctc_zero_infinity": false,
|
| 47 |
+
"diversity_loss_weight": 0.1,
|
| 48 |
+
"do_stable_layer_norm": false,
|
| 49 |
+
"eos_token_id": 2,
|
| 50 |
+
"feat_extract_activation": "gelu",
|
| 51 |
+
"feat_extract_norm": "group",
|
| 52 |
+
"feat_proj_dropout": 0.1,
|
| 53 |
+
"feat_quantizer_dropout": 0.0,
|
| 54 |
+
"final_dropout": 0.0,
|
| 55 |
+
"freeze_feat_extract_train": true,
|
| 56 |
+
"hidden_act": "gelu",
|
| 57 |
+
"hidden_dropout": 0.1,
|
| 58 |
+
"hidden_size": 768,
|
| 59 |
+
"id2label": {
|
| 60 |
+
"0": "eos",
|
| 61 |
+
"1": "not_eos"
|
| 62 |
+
},
|
| 63 |
+
"initializer_range": 0.02,
|
| 64 |
+
"intermediate_size": 3072,
|
| 65 |
+
"label2id": {
|
| 66 |
+
"eos": 0,
|
| 67 |
+
"not_eos": 1
|
| 68 |
+
},
|
| 69 |
+
"layer_norm_eps": 1e-05,
|
| 70 |
+
"layerdrop": 0.0,
|
| 71 |
+
"mask_channel_length": 10,
|
| 72 |
+
"mask_channel_min_space": 1,
|
| 73 |
+
"mask_channel_other": 0.0,
|
| 74 |
+
"mask_channel_prob": 0.0,
|
| 75 |
+
"mask_channel_selection": "static",
|
| 76 |
+
"mask_feature_length": 10,
|
| 77 |
+
"mask_feature_min_masks": 0,
|
| 78 |
+
"mask_feature_prob": 0.0,
|
| 79 |
+
"mask_time_length": 10,
|
| 80 |
+
"mask_time_min_masks": 2,
|
| 81 |
+
"mask_time_min_space": 1,
|
| 82 |
+
"mask_time_other": 0.0,
|
| 83 |
+
"mask_time_prob": 0.05,
|
| 84 |
+
"mask_time_selection": "static",
|
| 85 |
+
"model_type": "wav2vec2",
|
| 86 |
+
"no_mask_channel_overlap": false,
|
| 87 |
+
"no_mask_time_overlap": false,
|
| 88 |
+
"num_adapter_layers": 3,
|
| 89 |
+
"num_attention_heads": 12,
|
| 90 |
+
"num_codevector_groups": 2,
|
| 91 |
+
"num_codevectors_per_group": 320,
|
| 92 |
+
"num_conv_pos_embedding_groups": 16,
|
| 93 |
+
"num_conv_pos_embeddings": 128,
|
| 94 |
+
"num_feat_extract_layers": 7,
|
| 95 |
+
"num_hidden_layers": 12,
|
| 96 |
+
"num_negatives": 100,
|
| 97 |
+
"output_hidden_size": 768,
|
| 98 |
+
"pad_token_id": 0,
|
| 99 |
+
"proj_codevector_dim": 256,
|
| 100 |
+
"tdnn_dilation": [
|
| 101 |
+
1,
|
| 102 |
+
2,
|
| 103 |
+
3,
|
| 104 |
+
1,
|
| 105 |
+
1
|
| 106 |
+
],
|
| 107 |
+
"tdnn_dim": [
|
| 108 |
+
512,
|
| 109 |
+
512,
|
| 110 |
+
512,
|
| 111 |
+
512,
|
| 112 |
+
1500
|
| 113 |
+
],
|
| 114 |
+
"tdnn_kernel": [
|
| 115 |
+
5,
|
| 116 |
+
3,
|
| 117 |
+
3,
|
| 118 |
+
1,
|
| 119 |
+
1
|
| 120 |
+
],
|
| 121 |
+
"transformers_version": "4.38.2",
|
| 122 |
+
"use_weighted_layer_sum": false,
|
| 123 |
+
"vocab_size": 32,
|
| 124 |
+
"xvector_output_dim": 512
|
| 125 |
+
}
|
wav2vec2-end-of-speech-detection/eos-model-onnx/model.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d526a75c63ea501292f463f6de28c209fee2ccf733ada042155c01c1c5bc31a9
|
| 3 |
+
size 378578988
|
wav2vec2-end-of-speech-detection/eos-model-onnx/preprocessor_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_normalize": true,
|
| 3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
| 4 |
+
"feature_size": 1,
|
| 5 |
+
"padding_side": "right",
|
| 6 |
+
"padding_value": 0.0,
|
| 7 |
+
"return_attention_mask": false,
|
| 8 |
+
"sampling_rate": 16000
|
| 9 |
+
}
|
wav2vec2-end-of-speech-detection/eos-model-onnx/special_tokens_map.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
|
wav2vec2-end-of-speech-detection/eos-model-onnx/tokenizer_config.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "return_attention_mask": false, "do_normalize": true}
|
wav2vec2-end-of-speech-detection/eos-model-onnx/vocab.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "E": 5, "T": 6, "A": 7, "O": 8, "N": 9, "I": 10, "H": 11, "S": 12, "R": 13, "D": 14, "L": 15, "U": 16, "M": 17, "W": 18, "C": 19, "F": 20, "G": 21, "Y": 22, "P": 23, "B": 24, "V": 25, "K": 26, "'": 27, "X": 28, "J": 29, "Q": 30, "Z": 31}
|
wav2vec2-end-of-speech-detection/inference.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import Wav2Vec2Processor, AutoConfig
|
| 2 |
+
import onnxruntime as rt
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
import numpy as np
|
| 6 |
+
import os
|
| 7 |
+
import torchaudio
|
| 8 |
+
import soundfile as sf
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class EndOfSpeechDetection:
    """ONNX Runtime wrapper around the Wav2Vec 2.0 end-of-speech classifier.

    ``load_model`` reads the processor, the model config and ``model.onnx``
    from a directory; ``predict`` classifies one audio segment and returns the
    softmax probability of every label listed in ``config.id2label``.
    """

    # Sample rate (Hz) announced to the processor. Wave input recorded at any
    # other rate is resampled to this value before feature extraction.
    SAMPLE_RATE = 16000

    processor: "Wav2Vec2Processor"
    config: "AutoConfig"
    session: "rt.InferenceSession"

    def load_model(self, path, use_gpu=False):
        """Load processor, config and ONNX session from the directory ``path``.

        Args:
            path: Directory holding the processor/config files and ``model.onnx``.
            use_gpu: When true, prefer a GPU execution provider (ROCm, then
                CUDA) if onnxruntime reports it as available; the CPU provider
                is always kept as the final fallback.

        Returns:
            The tuple ``(processor, config, session)``. The same objects are
            also stored on the instance so ``predict`` can be called directly.
        """
        processor = Wav2Vec2Processor.from_pretrained(path)
        config = AutoConfig.from_pretrained(path)

        sess_options = rt.SessionOptions()
        sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL

        providers = ["CPUExecutionProvider"]
        if use_gpu:
            # Only request GPU providers this onnxruntime build actually offers.
            available = rt.get_available_providers()
            gpu_providers = [
                name
                for name in ("ROCMExecutionProvider", "CUDAExecutionProvider")
                if name in available
            ]
            providers = gpu_providers + providers

        session = rt.InferenceSession(
            os.path.join(path, "model.onnx"), sess_options, providers=providers
        )

        # Keep the loaded objects on the instance; returning them as well keeps
        # existing callers that unpack the tuple working.
        self.processor, self.config, self.session = processor, config, session
        return processor, config, session

    def predict(self, segment, file_type="pcm"):
        """Classify one audio segment.

        Args:
            segment: Path of the audio file to classify.
            file_type: ``"pcm"`` reads the file as raw float32 samples; any
                other value loads it with ``torchaudio.load`` and uses the
                first channel.

        Returns:
            Dict mapping each label in ``config.id2label`` to its softmax
            probability for this segment.
        """
        if file_type == "pcm":
            # Raw samples: the whole file is interpreted as float32 values.
            speech_array = np.fromfile(segment, dtype=np.float32)
        else:
            waveform, sample_rate = torchaudio.load(segment)
            if sample_rate != self.SAMPLE_RATE:
                # The processor is told the audio is SAMPLE_RATE Hz, so make
                # that true instead of silently misreading other rates.
                waveform = torchaudio.functional.resample(
                    waveform, sample_rate, self.SAMPLE_RATE
                )
            speech_array = waveform[0].numpy()

        features = self.processor(
            speech_array,
            sampling_rate=self.SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )
        input_values = features.input_values.detach().cpu().numpy()

        # The last declared graph input/output are used, as in the original code.
        output_name = self.session.get_outputs()[-1].name
        input_name = self.session.get_inputs()[-1].name
        logits = self.session.run([output_name], {input_name: input_values})[0]

        probabilities = F.softmax(torch.tensor(logits), dim=1)[0]
        return {
            self.config.id2label[i]: probabilities[i].item()
            for i in range(len(probabilities))
        }
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
if __name__ == "__main__":
    # Demo: cut a recording into consecutive 700 ms chunks, write each chunk
    # to disk and print the end-of-speech probabilities for it.
    eos = EndOfSpeechDetection()
    eos.processor, eos.config, eos.session = eos.load_model("eos-model-onnx")

    audio_file = "5sec_audio.wav"
    waveform, sr = torchaudio.load(audio_file)
    audio = waveform[0].numpy()  # first channel only
    segment_len = 700 * sr // 1000  # samples per 700 ms at the file's own rate

    # Slicing clamps at the end of the array, so the final chunk is simply
    # shorter when the recording is not a multiple of segment_len.
    segments = [
        audio[start : start + segment_len]
        for start in range(0, len(audio), segment_len)
    ]

    if not os.path.exists("segments"):
        os.makedirs("segments")
    for index, chunk in enumerate(segments):
        sf.write(f"segments/segment_{index}.wav", chunk, sr)
        print(eos.predict(f"segments/segment_{index}.wav", file_type="wav"))
|
wav2vec2-end-of-speech-detection/languages.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
English
|
| 2 |
+
German
|
| 3 |
+
Polish
|
| 4 |
+
French
|
| 5 |
+
Italian
|
wav2vec2-end-of-speech-detection/segments/segment_0.wav
ADDED
|
Binary file (22.4 kB). View file
|
|
|
wav2vec2-end-of-speech-detection/segments/segment_1.wav
ADDED
|
Binary file (22.4 kB). View file
|
|
|
wav2vec2-end-of-speech-detection/segments/segment_2.wav
ADDED
|
Binary file (22.4 kB). View file
|
|
|
wav2vec2-end-of-speech-detection/segments/segment_3.wav
ADDED
|
Binary file (22.4 kB). View file
|
|
|
wav2vec2-end-of-speech-detection/segments/segment_4.wav
ADDED
|
Binary file (22.4 kB). View file
|
|
|
wav2vec2-end-of-speech-detection/segments/segment_5.wav
ADDED
|
Binary file (22.4 kB). View file
|
|
|
wav2vec2-end-of-speech-detection/segments/segment_6.wav
ADDED
|
Binary file (21.6 kB). View file
|
|
|
wav2vec2-end-of-speech-detection/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/telnyx/wav2vec2-end-of-speech-detection
|
wav2vec2-large-xlsr-53-german-cv9/.gitattributes
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
model.safetensors filter=lfs diff=lfs merge=lfs -text
|