Update README.md
## How to Use the Model:
To train, fine-tune, or play with the model, you will need to install [NVIDIA NeMo](https://github.com/NVIDIA/NeMo).

```bash
pip install -U nemo_toolkit['asr']
```
The model is available for use in the NeMo toolkit [2], and can be used as a pre-trained checkpoint for inference.

### Automatically load the model

```python
import torch
import nemo.collections.asr as nemo_asr
vad_model = nemo_asr.models.EncDecFrameClassificationModel.from_pretrained(model_name="nvidia/frame_vad_multilingual_marblenet_v2.0")

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vad_model = vad_model.to(device)
vad_model.eval()
```

### Inference with PyTorch
First, let's get a sample audio file:
```bash
wget https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
```
Then run the following:

```python
import librosa

# Load the audio
input_signal = librosa.load("2086-149220-0033.wav", sr=16000, mono=True)[0]
input_signal = torch.tensor(input_signal).unsqueeze(0).float()
input_signal_length = torch.tensor([input_signal.shape[1]]).long()

# Perform inference
with torch.no_grad():
    torch_outputs = vad_model(
        input_signal=input_signal.to(device),
        input_signal_length=input_signal_length.to(device)
    ).cpu()

# Check output dimensions
B, T, C = torch_outputs.shape
assert C == 2, "Output channels should be 2"
```
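
The outputs have shape `[batch, frames, 2]`, i.e. one two-class score per audio frame. As a minimal sketch of interpreting them, assuming the second channel is the speech class and each frame corresponds to a 20 ms shift (matching `shift_length_in_sec=0.02` used in the RTTM example below), you could estimate how much of the clip is speech:

```python
# Sketch only: assumes channel index 1 is the "speech" class and a 20 ms frame shift.
frame_shift_sec = 0.02

speech_probs = torch.softmax(torch_outputs, dim=-1)[0, :, 1]  # per-frame speech probability
speech_frames = int((speech_probs > 0.5).sum())               # simple 0.5 threshold
print(f"~{speech_frames * frame_shift_sec:.2f}s of speech detected "
      f"in {speech_probs.shape[0] * frame_shift_sec:.2f}s of audio")
```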

### Export to ONNX

```python
import onnx
from nemo.core import typecheck
typecheck.set_typecheck_enabled(False)

# Output file path for ONNX export
ONNX_EXPORT_PATH = "frame_vad_multilingual_marblenet_v2.0.onnx"

# Move everything to CPU
vad_model = vad_model.cpu()
input_signal = input_signal.cpu()
input_signal_length = input_signal_length.cpu()

# Preprocess input signal
processed_signal, processed_signal_length = vad_model.preprocessor(
    input_signal=input_signal,
    length=input_signal_length
)

# Define input example for ONNX export
inputs = {
    "processed_signal": processed_signal,
    "processed_signal_length": processed_signal_length
}

# Export
torch.onnx.export(
    model=vad_model,
    args=inputs,
    f=ONNX_EXPORT_PATH,
    input_names=list(inputs.keys()),
    output_names=["output"],
    dynamic_axes={
        "processed_signal": {0: "batch_size", 2: "sequence_length"},
        "processed_signal_length": {0: "batch_size"},
        "output": {0: "batch_size", 1: "sequence_length"}
    }
)

# Validate ONNX model
onnx.checker.check_model(onnx.load(ONNX_EXPORT_PATH))
```

### Inference with ONNX Runtime
```python
import onnxruntime

# Load the ONNX model
session = onnxruntime.InferenceSession(
    ONNX_EXPORT_PATH,
    providers=["CPUExecutionProvider"]
)

# Prepare input for ONNX Runtime
ort_inputs = {
    input.name: inputs[input.name].numpy()
    for input in session.get_inputs()
}

# Run inference
onnx_outputs = session.run(None, ort_inputs)[0]

# Compare with PyTorch output
for torch_out, onnx_out in zip(torch_outputs, onnx_outputs):
    torch.testing.assert_close(torch_out, torch.from_numpy(onnx_out), atol=1e-3, rtol=1e-3)
print("✅ PyTorch and ONNX Runtime outputs match!")
```

### RTTM Output from Frame-Level Speech Predictions

To generate RTTM (Rich Transcription Time Marked) files from audio using the pretrained model:
```bash
python <NEMO_ROOT>/examples/asr/speech_classification/frame_vad_infer.py \
    --config-path="../conf/vad" \
    --config-name="frame_vad_infer_postprocess.yaml" \
    vad.model_path="nvidia/frame_vad_multilingual_marblenet_v2.0" \
    vad.parameters.shift_length_in_sec=0.02 \
    prepare_manifest.auto_split=True \
    prepare_manifest.split_duration=7200 \
    input_manifest=<Path of manifest file of evaluation data, where audio files should have unique names> \
    out_manifest_filepath=<Path of output manifest file>
```
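
The `input_manifest` passed to the command above is a JSON-lines file with one entry per audio file. As a rough sketch (the field names `audio_filepath`, `offset`, and `duration` are assumptions based on common NeMo ASR manifests, not confirmed by this README; check `frame_vad_infer.py` for the exact schema), it could be built like this:

```python
# Hypothetical manifest builder; field names are assumptions (see note above).
import json
import librosa

audio_files = ["2086-149220-0033.wav"]  # audio files should have unique names

with open("vad_input_manifest.json", "w") as f:
    for path in audio_files:
        duration = librosa.get_duration(path=path)  # requires librosa >= 0.10
        f.write(json.dumps({"audio_filepath": path, "offset": 0, "duration": duration}) + "\n")
```

You would then pass `input_manifest=vad_input_manifest.json` (and an output path) to the command above.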

## Software Integration:
**Runtime Engine(s):**
* NeMo-2.0.0 <br>