HY-2012 committed
Commit 1ebaeb9 · verified · 1 Parent(s): 1ff1404

first commit

This view is limited to 50 files because the commit contains too many changes.
Files changed (50):
  1. .gitattributes +11 -0
  2. README.md +118 -3
  3. ax_model/.gitattributes +2 -0
  4. ax_model/auto.npy +3 -0
  5. ax_model/chn_jpn_yue_eng_ko_spectok.bpe.model +3 -0
  6. ax_model/event_emo.npy +3 -0
  7. ax_model/sensevoice.axmodel +3 -0
  8. ax_model/sensevoice/am.mvn +8 -0
  9. ax_model/sensevoice/config.yaml +97 -0
  10. ax_model/vad/am.mvn +8 -0
  11. ax_model/vad/config.yaml +56 -0
  12. ax_model/withitn.npy +3 -0
  13. ax_speech_translate_demo.py +347 -0
  14. libmelotts/install/libonnxruntime.so +3 -0
  15. libmelotts/install/libonnxruntime.so.1.14.0 +3 -0
  16. libmelotts/install/libonnxruntime_providers_shared.so +0 -0
  17. libmelotts/install/melotts +3 -0
  18. libmelotts/models/decoder-en.axmodel +3 -0
  19. libmelotts/models/decoder-zh.axmodel +3 -0
  20. libmelotts/models/encoder-en.onnx +3 -0
  21. libmelotts/models/encoder-zh.onnx +3 -0
  22. libmelotts/models/g-en.bin +3 -0
  23. libmelotts/models/g-jp.bin +3 -0
  24. libmelotts/models/g-zh_mix_en.bin +3 -0
  25. libmelotts/models/lexicon.txt +0 -0
  26. libmelotts/models/tokens.txt +112 -0
  27. libtranslate/libax_translate.so +3 -0
  28. libtranslate/libsentencepiece.so.0 +3 -0
  29. libtranslate/opus-mt-en-zh.axmodel +3 -0
  30. libtranslate/opus-mt-en-zh/.gitattributes +9 -0
  31. libtranslate/opus-mt-en-zh/README.md +96 -0
  32. libtranslate/opus-mt-en-zh/config.json +61 -0
  33. libtranslate/opus-mt-en-zh/generation_config.json +16 -0
  34. libtranslate/opus-mt-en-zh/metadata.json +1 -0
  35. libtranslate/opus-mt-en-zh/source.spm +3 -0
  36. libtranslate/opus-mt-en-zh/target.spm +3 -0
  37. libtranslate/opus-mt-en-zh/tokenizer_config.json +1 -0
  38. libtranslate/opus-mt-en-zh/vocab.json +0 -0
  39. libtranslate/test_translate +0 -0
  40. model.py +942 -0
  41. requirements.txt +5 -0
  42. utils/__init__.py +0 -0
  43. utils/ax_model_bin.py +241 -0
  44. utils/ax_vad_bin.py +156 -0
  45. utils/ctc_alignment.py +76 -0
  46. utils/frontend.py +433 -0
  47. utils/infer_utils.py +312 -0
  48. utils/utils/__init__.py +0 -0
  49. utils/utils/e2e_vad.py +711 -0
  50. utils/utils/frontend.py +448 -0
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ax_model/sensevoice.axmodel filter=lfs diff=lfs merge=lfs -text
+ libmelotts/install/libonnxruntime.so filter=lfs diff=lfs merge=lfs -text
+ libmelotts/install/libonnxruntime.so.1.14.0 filter=lfs diff=lfs merge=lfs -text
+ libmelotts/install/melotts filter=lfs diff=lfs merge=lfs -text
+ libmelotts/models/decoder-en.axmodel filter=lfs diff=lfs merge=lfs -text
+ libmelotts/models/decoder-zh.axmodel filter=lfs diff=lfs merge=lfs -text
+ libtranslate/libax_translate.so filter=lfs diff=lfs merge=lfs -text
+ libtranslate/libsentencepiece.so.0 filter=lfs diff=lfs merge=lfs -text
+ libtranslate/opus-mt-en-zh/source.spm filter=lfs diff=lfs merge=lfs -text
+ libtranslate/opus-mt-en-zh/target.spm filter=lfs diff=lfs merge=lfs -text
+ libtranslate/opus-mt-en-zh.axmodel filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,118 @@
- ---
- license: mit
- ---
+ ---
+ license: mit
+ language:
+ - en
+ - zh
+ pipeline_tag: Speech-Translation
+ base_model:
+ - FunAudioLLM/SenseVoiceSmall
+ - opus-mt-en-zh
+ - MeloTTS
+ tags:
+ - VAD
+ - ASR
+ - Translation
+ - TTS
+ ---
+
+ # Speech-Translation.axera
+
+ Speech-translation demo on Axera.
+
+ - [x] Python example
+ - [ ] C++ example
+
+ ## Conversion tool links
+
+ If you are interested in model conversion, you can export the axmodel files yourself from the original repos. How to convert from ONNX to axmodel:
+ - [ASR](https://github.com/AXERA-TECH/3D-Speaker-MT.axera/tree/main/model_convert)
+ - [MeloTTS](https://github.com/ml-inory/melotts.axera/tree/main/model_convert)
+
+ ## Supported platforms
+
+ - AX650N
+
+ ## Features
+
+ Speech translation (supported direction: English -> Chinese)
+
+ ## Pipeline components
+
+ The pipeline chains three stages; a minimal sketch of how they fit together follows this list.
+ - [ASR](https://github.com/AXERA-TECH/3D-Speaker-MT.axera/tree/main)
+ - [Text-Translate](https://github.com/AXERA-TECH/libtranslate.axera/tree/master): build the library files as described there and place them in libtranslate
+ - [MeloTTS](https://github.com/ml-inory/melotts.axera/tree/main/cpp): build the library files as described there and place them in libmelotts
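+
+ The sketch below is an illustrative example of how the three stages can be chained: ASR text from SenseVoice is passed to the prebuilt translation CLI, and the result to the MeloTTS CLI. The binary paths and flags mirror ax_speech_translate_demo.py in this repo; the helper function names (`translate`, `synthesize`) are made up for the example.
+
+ ```python
+ import subprocess
+
+ def translate(english_text: str) -> str:
+     # Run the prebuilt opus-mt-en-zh CLI; it prints the result after an "output: " marker
+     out = subprocess.run(
+         ["libtranslate/test_translate",
+          "--model", "libtranslate/opus-mt-en-zh.axmodel",
+          "--tokenizer_dir", "libtranslate/opus-mt-en-zh/",
+          "--text", english_text],
+         capture_output=True, text=True, check=True).stdout
+     # Keep only the translated text between "output: " and the engine teardown log
+     return out.split("output: ")[-1].split("\nAX_ENGINE_Deinit")[0].strip()
+
+ def synthesize(chinese_text: str, wav_path: str) -> None:
+     # Run the prebuilt MeloTTS CLI; it writes the synthesized speech to wav_path
+     subprocess.run(
+         ["libmelotts/install/melotts",
+          "--g", "libmelotts/models/g-zh_mix_en.bin",
+          "-e", "libmelotts/models/encoder-zh.onnx",
+          "-l", "libmelotts/models/lexicon.txt",
+          "-t", "libmelotts/models/tokens.txt",
+          "-d", "libmelotts/models/decoder-zh.axmodel",
+          "-w", wav_path,
+          "-s", chinese_text],
+         check=True)
+
+ # english_text would come from the SenseVoice ASR step (see ax_speech_translate_demo.py):
+ # synthesize(translate(english_text), "output/output.wav")
+ ```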
+
+ ## On-board deployment
+
+ - AX650N devices come with Ubuntu 22.04 preinstalled
+ - Log in to the AX650N board as root
+ - Connect the board to the Internet so that commands such as apt install and pip install work normally
+ - Verified device: AX650N DEMO Board
+
+ ## Running the Python API
+
+ Verified with Python 3.10.
+
+ 1. Add the dynamic libraries:
+ ```
+ export LD_LIBRARY_PATH=./libtranslate/:$LD_LIBRARY_PATH
+ export LD_LIBRARY_PATH=./libmelotts/install/:$LD_LIBRARY_PATH
+ ```
+ 2. Install the Python dependencies:
+ ```
+ pip3 install -r requirements.txt
+ ```
+
+ ## Run the following command on the board
+
+ Supported input audio formats: wav, mp3.
+
+ ```
+ python3 pipeline.py --audio_file wav/en.mp3 --output_dir output
+ ```
+
+ Runtime arguments:
+
+ | Argument | Description |
+ |-------|------|
+ | `--audio_file` | Path to the input audio file |
+ | `--output_dir` | Directory where results are saved |
+
+ The output is saved as a wav file; a sample run looks like this:
+ ```
+ 原始音频: wav/en.mp3
+ 原始文本: The tribal chieftain called for the boy and presented him with 50 pieces of gold.
+ 翻译文本: 部落酋长召唤了男孩 给他50块黄金
+ 生成音频: output/output.wav
+ ```
+
+ ## Latency
+
+ AX650N
+
+ RTF (real-time factor) = total inference time / audio duration. On this platform it is roughly 2.0, for example:
+ ```
+ Inference time for en.mp3: 13.04 seconds
+ - VAD + ASR processing time: 0.89 seconds
+ - Translate time: 3.95 seconds
+ - TTS time: 8.20 seconds
+ Audio duration: 7.18 seconds
+ RTF: 1.82
+ ```
+
+ References:
+ - [sensevoice.axera](https://github.com/ml-inory/sensevoice.axera/tree/main)
+ - [3D-Speaker.axera](https://github.com/AXERA-TECH/3D-Speaker.axera/tree/master)
+ - [libtranslate.axera](https://github.com/AXERA-TECH/libtranslate.axera/tree/master)
+ - [melotts.axera](https://github.com/ml-inory/melotts.axera/tree/main)
+
+ ## Technical discussion
+
+ - GitHub issues
+ - QQ group: 139953715
ax_model/.gitattributes ADDED
@@ -0,0 +1,2 @@
1
+ *.axmodel filter=lfs diff=lfs merge=lfs -text
2
+ *.npy filter=lfs diff=lfs merge=lfs -text
ax_model/auto.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d0997706b30274f7ff3b157ca90df50b7ed8ced35091a0231700355d5ee1374
3
+ size 2368
ax_model/chn_jpn_yue_eng_ko_spectok.bpe.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
3
+ size 377341
ax_model/event_emo.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d22e3df5d192fdc3e73e368a2cb576975a5a43a114a8432a91c036adf8e2263
3
+ size 4608
ax_model/sensevoice.axmodel ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b64a36fa15e75ab5e3b75f18ae87a058970cff76219407e503b54fb53dd8e38
3
+ size 262170623
ax_model/sensevoice/am.mvn ADDED
@@ -0,0 +1,8 @@
1
+ <Nnet>
2
+ <Splice> 560 560
3
+ [ 0 ]
4
+ <AddShift> 560 560
5
+ <LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 
-13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
6
+ <Rescale> 560 560
7
+ <LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 
0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
8
+ </Nnet>
ax_model/sensevoice/config.yaml ADDED
@@ -0,0 +1,97 @@
1
+ encoder: SenseVoiceEncoderSmall
2
+ encoder_conf:
3
+ output_size: 512
4
+ attention_heads: 4
5
+ linear_units: 2048
6
+ num_blocks: 50
7
+ tp_blocks: 20
8
+ dropout_rate: 0.1
9
+ positional_dropout_rate: 0.1
10
+ attention_dropout_rate: 0.1
11
+ input_layer: pe
12
+ pos_enc_class: SinusoidalPositionEncoder
13
+ normalize_before: true
14
+ kernel_size: 11
15
+ sanm_shfit: 0
16
+ selfattention_layer_type: sanm
17
+
18
+
19
+ model: SenseVoiceSmall
20
+ model_conf:
21
+ length_normalized_loss: true
22
+ sos: 1
23
+ eos: 2
24
+ ignore_id: -1
25
+
26
+ tokenizer: SentencepiecesTokenizer
27
+ tokenizer_conf:
28
+ bpemodel: null
29
+ unk_symbol: <unk>
30
+ split_with_space: true
31
+
32
+ frontend: WavFrontend
33
+ frontend_conf:
34
+ fs: 16000
35
+ window: hamming
36
+ n_mels: 80
37
+ frame_length: 25
38
+ frame_shift: 10
39
+ lfr_m: 7
40
+ lfr_n: 6
41
+ cmvn_file: null
42
+
43
+
44
+ dataset: SenseVoiceCTCDataset
45
+ dataset_conf:
46
+ index_ds: IndexDSJsonl
47
+ batch_sampler: EspnetStyleBatchSampler
48
+ data_split_num: 32
49
+ batch_type: token
50
+ batch_size: 14000
51
+ max_token_length: 2000
52
+ min_token_length: 60
53
+ max_source_length: 2000
54
+ min_source_length: 60
55
+ max_target_length: 200
56
+ min_target_length: 0
57
+ shuffle: true
58
+ num_workers: 4
59
+ sos: ${model_conf.sos}
60
+ eos: ${model_conf.eos}
61
+ IndexDSJsonl: IndexDSJsonl
62
+ retry: 20
63
+
64
+ train_conf:
65
+ accum_grad: 1
66
+ grad_clip: 5
67
+ max_epoch: 20
68
+ keep_nbest_models: 10
69
+ avg_nbest_model: 10
70
+ log_interval: 100
71
+ resume: true
72
+ validate_interval: 10000
73
+ save_checkpoint_interval: 10000
74
+
75
+ optim: adamw
76
+ optim_conf:
77
+ lr: 0.00002
78
+ scheduler: warmuplr
79
+ scheduler_conf:
80
+ warmup_steps: 25000
81
+
82
+ specaug: SpecAugLFR
83
+ specaug_conf:
84
+ apply_time_warp: false
85
+ time_warp_window: 5
86
+ time_warp_mode: bicubic
87
+ apply_freq_mask: true
88
+ freq_mask_width_range:
89
+ - 0
90
+ - 30
91
+ lfr_rate: 6
92
+ num_freq_mask: 1
93
+ apply_time_mask: true
94
+ time_mask_width_range:
95
+ - 0
96
+ - 12
97
+ num_time_mask: 1
ax_model/vad/am.mvn ADDED
@@ -0,0 +1,8 @@
1
+ <Nnet>
2
+ <Splice> 400 400
3
+ [ 0 ]
4
+ <AddShift> 400 400
5
+ <LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 
-13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
6
+ <Rescale> 400 400
7
+ <LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 
0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
8
+ </Nnet>
ax_model/vad/config.yaml ADDED
@@ -0,0 +1,56 @@
1
+ frontend: WavFrontendOnline
2
+ frontend_conf:
3
+ fs: 16000
4
+ window: hamming
5
+ n_mels: 80
6
+ frame_length: 25
7
+ frame_shift: 10
8
+ dither: 0.0
9
+ lfr_m: 5
10
+ lfr_n: 1
11
+
12
+ model: FsmnVADStreaming
13
+ model_conf:
14
+ sample_rate: 16000
15
+ detect_mode: 1
16
+ snr_mode: 0
17
+ max_end_silence_time: 800
18
+ max_start_silence_time: 3000
19
+ do_start_point_detection: True
20
+ do_end_point_detection: True
21
+ window_size_ms: 200
22
+ sil_to_speech_time_thres: 150
23
+ speech_to_sil_time_thres: 150
24
+ speech_2_noise_ratio: 1.0
25
+ do_extend: 1
26
+ lookback_time_start_point: 200
27
+ lookahead_time_end_point: 100
28
+ max_single_segment_time: 60000
29
+ snr_thres: -100.0
30
+ noise_frame_num_used_for_snr: 100
31
+ decibel_thres: -100.0
32
+ speech_noise_thres: 0.6
33
+ fe_prior_thres: 0.0001
34
+ silence_pdf_num: 1
35
+ sil_pdf_ids: [0]
36
+ speech_noise_thresh_low: -0.1
37
+ speech_noise_thresh_high: 0.3
38
+ output_frame_probs: False
39
+ frame_in_ms: 10
40
+ frame_length_ms: 25
41
+
42
+ encoder: FSMN
43
+ encoder_conf:
44
+ input_dim: 400
45
+ input_affine_dim: 140
46
+ fsmn_layers: 4
47
+ linear_dim: 250
48
+ proj_dim: 128
49
+ lorder: 20
50
+ rorder: 0
51
+ lstride: 1
52
+ rstride: 0
53
+ output_affine_dim: 140
54
+ output_dim: 248
55
+
56
+
ax_model/withitn.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39bf02586f59237894fc2918ab2db4f12ec3c084c41465718832fbd7646ea729
3
+ size 2368
ax_speech_translate_demo.py ADDED
@@ -0,0 +1,347 @@
1
+ import subprocess
2
+ import tempfile
3
+ import os
4
+ import json
5
+ import shutil
6
+ import time
7
+ import librosa
8
+ import torch
9
+ import argparse
10
+ import soundfile as sf
11
+ from pathlib import Path
12
+ import cn2an
13
+
14
+ # 导入SenseVoice相关模块
15
+ from model import SinusoidalPositionEncoder
16
+ from utils.ax_model_bin import AX_SenseVoiceSmall
17
+ from utils.ax_vad_bin import AX_Fsmn_vad
18
+ from utils.vad_utils import merge_vad
19
+ from funasr.tokenizer.sentencepiece_tokenizer import SentencepiecesTokenizer
20
+
21
+ # 配置参数
22
+ # translate 参数
23
+ TRANSLATE_EXECUTABLE = "libtranslate/test_translate"
24
+ TRANSLATE_MODEL = "libtranslate/opus-mt-en-zh.axmodel"
25
+ TRANSLATE_TOKENIZER_DIR = "libtranslate/opus-mt-en-zh/"
26
+
27
+ # tts 参数
28
+ TTS_EXECUTABLE = "libmelotts/install/melotts"
29
+ TTS_MODEL_DIR = "libmelotts/models"
30
+ TTS_MODEL_FILES = {
31
+ "g": "g-zh_mix_en.bin",
32
+ "encoder": "encoder-zh.onnx",
33
+ "lexicon": "lexicon.txt",
34
+ "tokens": "tokens.txt",
35
+ "decoder": "decoder-zh.axmodel"
36
+ }
37
+
38
+ class SpeechTranslationPipeline:
39
+ def __init__(self,
40
+ translate_exec, translate_model, translate_tokenizer,
41
+ tts_exec, tts_model_dir, tts_model_files,
42
+ asr_model_dir="ax_model", seq_len=132):
43
+ self.translate_exec = translate_exec
44
+ self.translate_model = translate_model
45
+ self.translate_tokenizer = translate_tokenizer
46
+ self.tts_exec = tts_exec
47
+ self.tts_model_dir = tts_model_dir
48
+ self.tts_model_files = tts_model_files
49
+ self.asr_model_dir = asr_model_dir
50
+ self.seq_len = seq_len
51
+
52
+ # 初始化ASR模型
53
+ self._init_asr_models()
54
+
55
+ # 验证所有必需文件存在
56
+ self._validate_files()
57
+
58
+ def _init_asr_models(self):
59
+ """初始化语音识别相关模型"""
60
+ print("Initializing SenseVoice models...")
61
+
62
+ # VAD模型
63
+ self.model_vad = AX_Fsmn_vad(self.asr_model_dir)
64
+
65
+ # 位置编码
66
+ self.embed = SinusoidalPositionEncoder()
67
+ self.position_encoding = self.embed.get_position_encoding(
68
+ torch.randn(1, self.seq_len, 560)).numpy()
69
+
70
+ # ASR模型
71
+ self.model_bin = AX_SenseVoiceSmall(self.asr_model_dir, seq_len=self.seq_len)
72
+
73
+ # Tokenizer
74
+ tokenizer_path = os.path.join(self.asr_model_dir, "chn_jpn_yue_eng_ko_spectok.bpe.model")
75
+ self.tokenizer = SentencepiecesTokenizer(bpemodel=tokenizer_path)
76
+
77
+ print("SenseVoice models initialized successfully.")
78
+
79
+ def _validate_files(self):
80
+ """验证所有必需的文件都存在"""
81
+ # 检查翻译相关文件
82
+ if not os.path.exists(self.translate_exec):
83
+ raise FileNotFoundError(f"翻译可执行文件不存在: {self.translate_exec}")
84
+ if not os.path.exists(self.translate_model):
85
+ raise FileNotFoundError(f"翻译模型不存在: {self.translate_model}")
86
+ if not os.path.exists(self.translate_tokenizer):
87
+ raise FileNotFoundError(f"翻译tokenizer目录不存在: {self.translate_tokenizer}")
88
+
89
+ # 检查TTS相关文件
90
+ if not os.path.exists(self.tts_exec):
91
+ raise FileNotFoundError(f"TTS可执行文件不存在: {self.tts_exec}")
92
+
93
+ for key, filename in self.tts_model_files.items():
94
+ filepath = os.path.join(self.tts_model_dir, filename)
95
+ if not os.path.exists(filepath):
96
+ raise FileNotFoundError(f"TTS模型文件不存在: {filepath}")
97
+
98
+ def speech_recognition(self, speech, fs):
99
+ """
100
+ 第一步:语音识别(ASR)
101
+ """
102
+ speech_lengths = len(speech)
103
+
104
+ # VAD处理
105
+ print("Running VAD...")
106
+ vad_start_time = time.time()
107
+ res_vad = self.model_vad(speech)[0]
108
+ vad_segments = merge_vad(res_vad, 15 * 1000)
109
+ vad_time_cost = time.time() - vad_start_time
110
+ print(f"VAD processing time: {vad_time_cost:.2f} seconds")
111
+ print(f"VAD segments detected: {len(vad_segments)}")
112
+
113
+ # ASR处理
114
+ print("Running ASR...")
115
+ asr_start_time = time.time()
116
+ all_results = ""
117
+
118
+ # 遍历每个VAD片段并处理
119
+ for i, segment in enumerate(vad_segments):
120
+ segment_start, segment_end = segment
121
+ start_sample = int(segment_start / 1000 * fs)
122
+ end_sample = min(int(segment_end / 1000 * fs), speech_lengths)
123
+ segment_speech = speech[start_sample:end_sample]
124
+
125
+ # 为当前片段创建临时文件
126
+ segment_filename = f"temp_segment_{i}.wav"
127
+ sf.write(segment_filename, segment_speech, fs)
128
+
129
+ # 对当前片段进行识别
130
+ try:
131
+ segment_res = self.model_bin(
132
+ segment_filename,
133
+ "auto", # 语言自动检测
134
+ True, # withitn
135
+ self.position_encoding,
136
+ tokenizer=self.tokenizer,
137
+ )
138
+
139
+ all_results += segment_res
140
+
141
+ # 清理临时文件
142
+ if os.path.exists(segment_filename):
143
+ os.remove(segment_filename)
144
+
145
+ except Exception as e:
146
+ if os.path.exists(segment_filename):
147
+ os.remove(segment_filename)
148
+ print(f"Error processing segment {i}: {e}")
149
+ continue
150
+
151
+ asr_time_cost = time.time() - asr_start_time
152
+ print(f"ASR processing time: {asr_time_cost:.2f} seconds")
153
+ print(f"ASR Result: {all_results}")
154
+
155
+ return all_results.strip()
156
+
157
+ def run_translation(self, english_text):
158
+ """
159
+ 第二步:调用翻译程序将英文翻译成中文
160
+ """
161
+ # 构建命令参数
162
+ cmd = [
163
+ self.translate_exec,
164
+ "--model", self.translate_model,
165
+ "--tokenizer_dir", self.translate_tokenizer,
166
+ "--text", f'"{english_text}"' # 添加引号处理包含空格和特殊字符的文本
167
+ ]
168
+
169
+ try:
170
+ # 执行命令
171
+ result = subprocess.run(
172
+ cmd,
173
+ capture_output=True,
174
+ text=True,
175
+ timeout=30 # 设置超时时间,单位秒
176
+ )
177
+
178
+ # 检查执行结果
179
+ if result.returncode != 0:
180
+ error_msg = f"翻译程序执行失败: {result.stderr}"
181
+ raise RuntimeError(error_msg)
182
+
183
+ # 提取翻译结果
184
+ chinese_text = result.stdout.strip()
185
+
186
+ # 清理可能的额外输出
187
+ # if "翻译结果:" in chinese_text:
188
+ # chinese_text = chinese_text.split("翻译结果:", 1)[-1].strip()
189
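+ # Keep only the text between the CLI's "output: " marker and the trailing "AX_ENGINE_Deinit" log line.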
+ chinese_text = chinese_text.split("output: ")[-1].split("\nAX_ENGINE_Deinit")[0]
190
+
191
+ print(f"翻译结果: {chinese_text}")
192
+ return chinese_text
193
+
194
+ except subprocess.TimeoutExpired:
195
+ raise RuntimeError("翻译程序执行超时")
196
+ except Exception as e:
197
+ raise e
198
+
199
+ def run_tts(self, chinese_text, output_dir, output_wav=None):
200
+ """
201
+ 第三步:调用TTS程序合成中文语音
202
+ """
203
+ output_path = os.path.join(output_dir, output_wav)
204
+ #chinese_text = chinese_text.split("output: ")[-1].split("\nAX_ENGINE_Deinit")[0]
205
+
206
+ chinese_text = cn2an.transform(chinese_text, "an2cn")
207
+
208
+ # 构建命令参数
209
+ cmd = [
210
+ self.tts_exec,
211
+ "--g", os.path.join(self.tts_model_dir, self.tts_model_files["g"]),
212
+ "-e", os.path.join(self.tts_model_dir, self.tts_model_files["encoder"]),
213
+ "-l", os.path.join(self.tts_model_dir, self.tts_model_files["lexicon"]),
214
+ "-t", os.path.join(self.tts_model_dir, self.tts_model_files["tokens"]),
215
+ "-d", os.path.join(self.tts_model_dir, self.tts_model_files["decoder"]),
216
+ "-w", output_path,
217
+ "-s", f'"{chinese_text}"'
218
+ ]
219
+
220
+ try:
221
+ # 执行命令
222
+ result = subprocess.run(
223
+ cmd,
224
+ capture_output=False,
225
+ text=True,
226
+ timeout=60 # TTS可能需要更长时间
227
+ )
228
+
229
+ # 检查执行结果
230
+ if result.returncode != 0:
231
+ error_msg = f"TTS程序执行失败: {result.stderr}"
232
+ raise RuntimeError(error_msg)
233
+
234
+ # 验证输出文件是否存在
235
+ if not os.path.exists(output_path):
236
+ raise FileNotFoundError(f"输出文件未生成: {output_path}")
237
+
238
+ return output_path
239
+
240
+ except subprocess.TimeoutExpired:
241
+ raise RuntimeError("TTS程序执行超时")
242
+ except Exception as e:
243
+ # 清理临时文件
244
+ if output_path and os.path.exists(os.path.dirname(output_path)):
245
+ shutil.rmtree(os.path.dirname(output_path))
246
+ raise e
247
+
248
+ def full_pipeline(self, speech, fs, output_dir=None,output_tts = None):
249
+ """
250
+ 完整Pipeline:语音识别 -> 翻译 -> TTS合成
251
+ """
252
+
253
+ # 第一步:语音识别
254
+ print("\n----------------------VAD+ASR----------------------------\n")
255
+ start_time = time.time() # 记录开始时间
256
+ english_text = self.speech_recognition(speech, fs)
257
+ asr_time = time.time() - start_time # 计算耗时
258
+ print(f"语音识别耗时: {asr_time:.2f} 秒")
259
+
260
+ # 第二步:翻译
261
+ print("\n---------------------translate---------------------------\n")
262
+ start_time = time.time() # 记录开始时间
263
+ chinese_text = self.run_translation(english_text)
264
+ translate_time = time.time() - start_time # 计算耗时
265
+ print(f"翻译耗时: {translate_time:.2f} 秒")
266
+
267
+ # 第三步:TTS合成
268
+ print("-------------------------TTS-------------------------------\n")
269
+ start_time = time.time() # 记录开始时间
270
+ output_path = self.run_tts(chinese_text, output_dir, output_tts)
271
+ tts_time = time.time() - start_time # 计算耗时
272
+ print(f"TTS合成耗时: {tts_time:.2f} 秒")
273
+
274
+ return {
275
+ "original_text": english_text,
276
+ "translated_text": chinese_text,
277
+ "audio_path": output_path
278
+ }
279
+
280
+ def main():
281
+ parser = argparse.ArgumentParser(description="Speech Recognition, Translation and TTS Pipeline")
282
+ parser.add_argument("--audio_file", type=str, required=True, help="Input audio file path")
283
+ parser.add_argument("--output_dir", type=str, default="./output", help="Output directory")
284
+ parser.add_argument("--output_tts", type=str, default="output.wav", help="Output directory")
285
+
286
+ args = parser.parse_args()
287
+ print("-------------------START------------------------\n")
288
+ os.makedirs(args.output_dir ,exist_ok=True)
289
+
290
+ print(f"Processing audio file: {args.audio_file}")
291
+ # 加载音频
292
+ speech, fs = librosa.load(args.audio_file, sr=None)
293
+ if fs != 16000:
294
+ print(f"Resampling audio from {fs}Hz to 16000Hz")
295
+ speech = librosa.resample(y=speech, orig_sr=fs, target_sr=16000)
296
+ fs = 16000
297
+ audio_duration = librosa.get_duration(y=speech, sr=fs)
298
+
299
+
300
+ # 初始化
301
+ pipeline = SpeechTranslationPipeline(
302
+ translate_exec=TRANSLATE_EXECUTABLE,
303
+ translate_model=TRANSLATE_MODEL,
304
+ translate_tokenizer=TRANSLATE_TOKENIZER_DIR,
305
+ tts_exec=TTS_EXECUTABLE,
306
+ tts_model_dir=TTS_MODEL_DIR,
307
+ tts_model_files=TTS_MODEL_FILES,
308
+ asr_model_dir="ax_model",
309
+ seq_len=132
310
+ )
311
+
312
+ start_time = time.time()
313
+ try:
314
+ # 运行
315
+ result = pipeline.full_pipeline(speech, fs, args.output_dir, args.output_tts)
316
+
317
+ print("\n" + "="*50)
318
+ print("speech translate 完成!")
319
+ print("="*50 + "\n")
320
+ print(f"原始音频: {args.audio_file}")
321
+ print(f"原始文本: {result['original_text']}")
322
+ print(f"翻译文本: {result['translated_text']}")
323
+ print(f"生成音频: {result['audio_path']}")
324
+
325
+ # 保存结果到文件
326
+ result_file = os.path.join(args.output_dir, "pipeline_result.txt")
327
+ with open(result_file, 'w', encoding='utf-8') as f:
328
+ f.write(f"原始音频: {args.audio_file}\n")
329
+ f.write(f"识别文本: {result['original_text']}\n")
330
+ f.write(f"翻译结果: {result['translated_text']}\n")
331
+ f.write(f"合成音频: {result['audio_path']}\n")
332
+
333
+ # print(f"\n详细结果已保存到: {result_file}")
334
+ time_cost = time.time() - start_time
335
+ rtf = time_cost / audio_duration
336
+ print(f"Inference time for {args.audio_file}: {time_cost:.2f} seconds")
337
+ print(f"Audio duration: {audio_duration:.2f} seconds")
338
+ print(f"RTF: {rtf:.2f}\n")
339
+ except Exception as e:
340
+ print(f"Pipeline执行失败: {e}")
341
+ import traceback
342
+ traceback.print_exc()
343
+
344
+ if __name__ == "__main__":
345
+ main()
346
+
347
+
libmelotts/install/libonnxruntime.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0b062d058b15bbf87bc3232b9c905a2bd49ea7e88e203a308df5c08c4c15129
3
+ size 16014680
libmelotts/install/libonnxruntime.so.1.14.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0b062d058b15bbf87bc3232b9c905a2bd49ea7e88e203a308df5c08c4c15129
3
+ size 16014680
libmelotts/install/libonnxruntime_providers_shared.so ADDED
Binary file (9.92 kB).
 
libmelotts/install/melotts ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88d12e4b441d2398ea7646afb677f2dd5819f347caa680781ce2bcfa26e81d85
3
+ size 187256
libmelotts/models/decoder-en.axmodel ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90c93c0fa978cc1c68fbac6a78707dd75b8b9069cb01a1ade6846e2435aa1eb1
3
+ size 44093802
libmelotts/models/decoder-zh.axmodel ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37ea2d8401f18dd371eec50b90bd39dcadf9684aaf3543dace8ce1a9499ef253
3
+ size 44092592
libmelotts/models/encoder-en.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cc51185fb81934c7490c5f9ac993fff7efa98ab41c08cd3753c96abcb297582
3
+ size 31488385
libmelotts/models/encoder-zh.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2b0a5bc2789faef16b4bfc56ab4905364f8163a59f2db3d071b4a14792bfee5
3
+ size 31397760
libmelotts/models/g-en.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:094bf0dbe1cd6c9408707209b2b7261b9df2cd5917d310bfac5945a15a31821a
3
+ size 1024
libmelotts/models/g-jp.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c01dd0961bbe1effca4ed378d2969d6fbd9b579133b722f6968db5cf4d22281e
3
+ size 1024
libmelotts/models/g-zh_mix_en.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c70d897674847882bd35e780aee696ddaff8d04d5c57e4f9cf37611b6821879f
3
+ size 1024
libmelotts/models/lexicon.txt ADDED
The diff for this file is too large to render.
 
libmelotts/models/tokens.txt ADDED
@@ -0,0 +1,112 @@
1
+ _ 0
2
+ AA 1
3
+ E 2
4
+ EE 3
5
+ En 4
6
+ N 5
7
+ OO 6
8
+ V 7
9
+ a 8
10
+ a: 9
11
+ aa 10
12
+ ae 11
13
+ ah 12
14
+ ai 13
15
+ an 14
16
+ ang 15
17
+ ao 16
18
+ aw 17
19
+ ay 18
20
+ b 19
21
+ by 20
22
+ c 21
23
+ ch 22
24
+ d 23
25
+ dh 24
26
+ dy 25
27
+ e 26
28
+ e: 27
29
+ eh 28
30
+ ei 29
31
+ en 30
32
+ eng 31
33
+ er 32
34
+ ey 33
35
+ f 34
36
+ g 35
37
+ gy 36
38
+ h 37
39
+ hh 38
40
+ hy 39
41
+ i 40
42
+ i0 41
43
+ i: 42
44
+ ia 43
45
+ ian 44
46
+ iang 45
47
+ iao 46
48
+ ie 47
49
+ ih 48
50
+ in 49
51
+ ing 50
52
+ iong 51
53
+ ir 52
54
+ iu 53
55
+ iy 54
56
+ j 55
57
+ jh 56
58
+ k 57
59
+ ky 58
60
+ l 59
61
+ m 60
62
+ my 61
63
+ n 62
64
+ ng 63
65
+ ny 64
66
+ o 65
67
+ o: 66
68
+ ong 67
69
+ ou 68
70
+ ow 69
71
+ oy 70
72
+ p 71
73
+ py 72
74
+ q 73
75
+ r 74
76
+ ry 75
77
+ s 76
78
+ sh 77
79
+ t 78
80
+ th 79
81
+ ts 80
82
+ ty 81
83
+ u 82
84
+ u: 83
85
+ ua 84
86
+ uai 85
87
+ uan 86
88
+ uang 87
89
+ uh 88
90
+ ui 89
91
+ un 90
92
+ uo 91
93
+ uw 92
94
+ v 93
95
+ van 94
96
+ ve 95
97
+ vn 96
98
+ w 97
99
+ x 98
100
+ y 99
101
+ z 100
102
+ zh 101
103
+ zy 102
104
+ ! 103
105
+ ? 104
106
+ … 105
107
+ , 106
108
+ . 107
109
+ ' 108
110
+ - 109
111
+ SP 110
112
+ UNK 111
libtranslate/libax_translate.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d6c5901387ed8d0a3c0611a5ad8d60281487e34d20afc83a58923ea2462b456
3
+ size 1222544
libtranslate/libsentencepiece.so.0 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e93872d229a074c73e66eeeff8988197deb35b81b09ee893e022e115f886ed4
3
+ size 1320848
libtranslate/opus-mt-en-zh.axmodel ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e262aab07f3a5478a0a0df24a92e1f88c138c4999be5b4cb18b618d996bcacff
3
+ size 217562368
libtranslate/opus-mt-en-zh/.gitattributes ADDED
@@ -0,0 +1,9 @@
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
9
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
libtranslate/opus-mt-en-zh/README.md ADDED
@@ -0,0 +1,96 @@
1
+ ---
2
+ language:
3
+ - en
4
+ - zh
5
+ tags:
6
+ - translation
7
+ license: apache-2.0
8
+ ---
9
+
10
+ ### eng-zho
11
+
12
+ * source group: English
13
+ * target group: Chinese
14
+ * OPUS readme: [eng-zho](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/models/eng-zho/README.md)
15
+
16
+ * model: transformer
17
+ * source language(s): eng
18
+ * target language(s): cjy_Hans cjy_Hant cmn cmn_Hans cmn_Hant gan lzh lzh_Hans nan wuu yue yue_Hans yue_Hant
19
+ * model: transformer
20
+ * pre-processing: normalization + SentencePiece (spm32k,spm32k)
21
+ * a sentence initial language token is required in the form of `>>id<<` (id = valid target language ID; see the illustrative example after this list)
22
+ * download original weights: [opus-2020-07-17.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-zho/opus-2020-07-17.zip)
23
+ * test set translations: [opus-2020-07-17.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-zho/opus-2020-07-17.test.txt)
24
+ * test set scores: [opus-2020-07-17.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/eng-zho/opus-2020-07-17.eval.txt)
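+
+ To illustrate the `>>id<<` convention above (an editorial sketch, not part of the upstream model card; the ID `cmn_Hans` is taken from the target constituents listed under System Info below):
+
+ ```python
+ # Hypothetical source string: a target-language token is prepended to the English input
+ src_text = ">>cmn_Hans<< The tribal chieftain called for the boy."
+ ```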
25
+
26
+ ## Benchmarks
27
+
28
+ | testset | BLEU | chr-F |
29
+ |-----------------------|-------|-------|
30
+ | Tatoeba-test.eng.zho | 31.4 | 0.268 |
31
+
32
+
33
+ ### System Info:
34
+ - hf_name: eng-zho
35
+
36
+ - source_languages: eng
37
+
38
+ - target_languages: zho
39
+
40
+ - opus_readme_url: https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/models/eng-zho/README.md
41
+
42
+ - original_repo: Tatoeba-Challenge
43
+
44
+ - tags: ['translation']
45
+
46
+ - languages: ['en', 'zh']
47
+
48
+ - src_constituents: {'eng'}
49
+
50
+ - tgt_constituents: {'cmn_Hans', 'nan', 'nan_Hani', 'gan', 'yue', 'cmn_Kana', 'yue_Hani', 'wuu_Bopo', 'cmn_Latn', 'yue_Hira', 'cmn_Hani', 'cjy_Hans', 'cmn', 'lzh_Hang', 'lzh_Hira', 'cmn_Hant', 'lzh_Bopo', 'zho', 'zho_Hans', 'zho_Hant', 'lzh_Hani', 'yue_Hang', 'wuu', 'yue_Kana', 'wuu_Latn', 'yue_Bopo', 'cjy_Hant', 'yue_Hans', 'lzh', 'cmn_Hira', 'lzh_Yiii', 'lzh_Hans', 'cmn_Bopo', 'cmn_Hang', 'hak_Hani', 'cmn_Yiii', 'yue_Hant', 'lzh_Kana', 'wuu_Hani'}
51
+
52
+ - src_multilingual: False
53
+
54
+ - tgt_multilingual: False
55
+
56
+ - prepro: normalization + SentencePiece (spm32k,spm32k)
57
+
58
+ - url_model: https://object.pouta.csc.fi/Tatoeba-MT-models/eng-zho/opus-2020-07-17.zip
59
+
60
+ - url_test_set: https://object.pouta.csc.fi/Tatoeba-MT-models/eng-zho/opus-2020-07-17.test.txt
61
+
62
+ - src_alpha3: eng
63
+
64
+ - tgt_alpha3: zho
65
+
66
+ - short_pair: en-zh
67
+
68
+ - chrF2_score: 0.268
69
+
70
+ - bleu: 31.4
71
+
72
+ - brevity_penalty: 0.8959999999999999
73
+
74
+ - ref_len: 110468.0
75
+
76
+ - src_name: English
77
+
78
+ - tgt_name: Chinese
79
+
80
+ - train_date: 2020-07-17
81
+
82
+ - src_alpha2: en
83
+
84
+ - tgt_alpha2: zh
85
+
86
+ - prefer_old: False
87
+
88
+ - long_pair: eng-zho
89
+
90
+ - helsinki_git_sha: 480fcbe0ee1bf4774bcbe6226ad9f58e63f6c535
91
+
92
+ - transformers_git_sha: 2207e5d8cb224e954a7cba69fa4ac2309e9ff30b
93
+
94
+ - port_machine: brutasse
95
+
96
+ - port_time: 2020-08-21-14:41
libtranslate/opus-mt-en-zh/config.json ADDED
@@ -0,0 +1,61 @@
1
+ {
2
+ "_name_or_path": "./",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "swish",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "MarianMTModel"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bad_words_ids": [
12
+ [
13
+ 65000
14
+ ]
15
+ ],
16
+ "bos_token_id": 0,
17
+ "classif_dropout": 0.0,
18
+ "classifier_dropout": 0.0,
19
+ "d_model": 512,
20
+ "decoder_attention_heads": 8,
21
+ "decoder_ffn_dim": 2048,
22
+ "decoder_layerdrop": 0.0,
23
+ "decoder_layers": 6,
24
+ "decoder_start_token_id": 65000,
25
+ "do_blenderbot_90_layernorm": false,
26
+ "dropout": 0.1,
27
+ "encoder_attention_heads": 8,
28
+ "encoder_ffn_dim": 2048,
29
+ "encoder_layerdrop": 0.0,
30
+ "encoder_layers": 6,
31
+ "eos_token_id": 0,
32
+ "extra_pos_embeddings": 0,
33
+ "force_bos_token_to_be_generated": false,
34
+ "forced_eos_token_id": 0,
35
+ "gradient_checkpointing": false,
36
+ "id2label": {
37
+ "0": "LABEL_0",
38
+ "1": "LABEL_1",
39
+ "2": "LABEL_2"
40
+ },
41
+ "init_std": 0.02,
42
+ "is_encoder_decoder": true,
43
+ "label2id": {
44
+ "LABEL_0": 0,
45
+ "LABEL_1": 1,
46
+ "LABEL_2": 2
47
+ },
48
+ "max_length": 512,
49
+ "max_position_embeddings": 512,
50
+ "model_type": "marian",
51
+ "normalize_before": false,
52
+ "normalize_embedding": false,
53
+ "num_beams": 4,
54
+ "num_hidden_layers": 6,
55
+ "pad_token_id": 65000,
56
+ "scale_embedding": true,
57
+ "static_position_embeddings": true,
58
+ "transformers_version": "4.9.0.dev0",
59
+ "use_cache": true,
60
+ "vocab_size": 65001
61
+ }
libtranslate/opus-mt-en-zh/generation_config.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "bad_words_ids": [
3
+ [
4
+ 65000
5
+ ]
6
+ ],
7
+ "bos_token_id": 0,
8
+ "decoder_start_token_id": 65000,
9
+ "eos_token_id": 0,
10
+ "forced_eos_token_id": 0,
11
+ "max_length": 512,
12
+ "num_beams": 4,
13
+ "pad_token_id": 65000,
14
+ "renormalize_logits": true,
15
+ "transformers_version": "4.32.0.dev0"
16
+ }
libtranslate/opus-mt-en-zh/metadata.json ADDED
@@ -0,0 +1 @@
1
+ {"hf_name":"eng-zho","source_languages":"eng","target_languages":"zho","opus_readme_url":"https:\/\/github.com\/Helsinki-NLP\/Tatoeba-Challenge\/tree\/master\/models\/eng-zho\/README.md","original_repo":"Tatoeba-Challenge","tags":["translation"],"languages":["en","zh"],"src_constituents":["eng"],"tgt_constituents":["cmn_Hans","nan","nan_Hani","gan","yue","cmn_Kana","yue_Hani","wuu_Bopo","cmn_Latn","yue_Hira","cmn_Hani","cjy_Hans","cmn","lzh_Hang","lzh_Hira","cmn_Hant","lzh_Bopo","zho","zho_Hans","zho_Hant","lzh_Hani","yue_Hang","wuu","yue_Kana","wuu_Latn","yue_Bopo","cjy_Hant","yue_Hans","lzh","cmn_Hira","lzh_Yiii","lzh_Hans","cmn_Bopo","cmn_Hang","hak_Hani","cmn_Yiii","yue_Hant","lzh_Kana","wuu_Hani"],"src_multilingual":false,"tgt_multilingual":false,"prepro":" normalization + SentencePiece (spm32k,spm32k)","url_model":"https:\/\/object.pouta.csc.fi\/Tatoeba-MT-models\/eng-zho\/opus-2020-07-17.zip","url_test_set":"https:\/\/object.pouta.csc.fi\/Tatoeba-MT-models\/eng-zho\/opus-2020-07-17.test.txt","src_alpha3":"eng","tgt_alpha3":"zho","short_pair":"en-zh","chrF2_score":0.268,"bleu":31.4,"brevity_penalty":0.896,"ref_len":110468.0,"src_name":"English","tgt_name":"Chinese","train_date":"2020-07-17","src_alpha2":"en","tgt_alpha2":"zh","prefer_old":false,"long_pair":"eng-zho","helsinki_git_sha":"480fcbe0ee1bf4774bcbe6226ad9f58e63f6c535","transformers_git_sha":"2207e5d8cb224e954a7cba69fa4ac2309e9ff30b","port_machine":"brutasse","port_time":"2020-08-21-14:41"}
libtranslate/opus-mt-en-zh/source.spm ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5775ddc9e3ff2fae91554da56468ad35ff56edaba870fea74447bc7234bfdaa8
3
+ size 806435
libtranslate/opus-mt-en-zh/target.spm ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81dc94efa84e4025ef38d25d5d07429fe41e3eb29d44003f1db6fe98487b0052
3
+ size 804600
libtranslate/opus-mt-en-zh/tokenizer_config.json ADDED
@@ -0,0 +1 @@
1
+ {"target_lang": "zho", "source_lang": "eng"}
libtranslate/opus-mt-en-zh/vocab.json ADDED
The diff for this file is too large to render.
 
libtranslate/test_translate ADDED
Binary file (82.1 kB).
 
model.py ADDED
@@ -0,0 +1,942 @@
1
+
2
+ import time
3
+ import torch
4
+ from torch import nn
5
+ import torch.nn.functional as F
6
+ from typing import Iterable, Optional
7
+
8
+ from funasr.register import tables
9
+ from funasr.models.ctc.ctc import CTC
10
+ from funasr.utils.datadir_writer import DatadirWriter
11
+ from funasr.models.paraformer.search import Hypothesis
12
+ from funasr.train_utils.device_funcs import force_gatherable
13
+ from funasr.losses.label_smoothing_loss import LabelSmoothingLoss
14
+ from funasr.metrics.compute_acc import compute_accuracy, th_accuracy
15
+ from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
16
+ from utils.ctc_alignment import ctc_forced_align
17
+
18
+ class SinusoidalPositionEncoder(torch.nn.Module):
19
+ """ """
20
+
21
+ def __init__(self, d_model=80, dropout_rate=0.1):
22
+ super().__init__()
23
+
24
+ def encode(
25
+ self, positions: torch.Tensor = None, depth: int = None, dtype: torch.dtype = torch.float32
26
+ ):
27
+ batch_size = positions.size(0)
28
+ positions = positions.type(dtype)
29
+ device = positions.device
30
+ log_timescale_increment = torch.log(torch.tensor([10000], dtype=dtype, device=device)) / (
31
+ depth / 2 - 1
32
+ )
33
+ inv_timescales = torch.exp(
34
+ torch.arange(depth / 2, device=device).type(dtype) * (-log_timescale_increment)
35
+ )
36
+ inv_timescales = torch.reshape(inv_timescales, [batch_size, -1])
37
+ scaled_time = torch.reshape(positions, [1, -1, 1]) * torch.reshape(
38
+ inv_timescales, [1, 1, -1]
39
+ )
40
+ encoding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=2)
41
+ return encoding.type(dtype)
42
+
43
+ def forward(self, x):
44
+ batch_size, timesteps, input_dim = x.size()
45
+ positions = torch.arange(1, timesteps + 1, device=x.device)[None, :]
46
+ position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
47
+
48
+ return x + position_encoding
49
+
50
+ def get_position_encoding(self, x):
51
+ batch_size, timesteps, input_dim = x.size()
52
+ positions = torch.arange(1, timesteps + 1, device=x.device)[None, :]
53
+ position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
54
+ return position_encoding
55
+
56
+
57
+ class PositionwiseFeedForward(torch.nn.Module):
58
+ """Positionwise feed forward layer.
59
+
60
+ Args:
61
+ idim (int): Input dimension.
62
+ hidden_units (int): The number of hidden units.
63
+ dropout_rate (float): Dropout rate.
64
+
65
+ """
66
+
67
+ def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()):
68
+ """Construct an PositionwiseFeedForward object."""
69
+ super(PositionwiseFeedForward, self).__init__()
70
+ self.w_1 = torch.nn.Linear(idim, hidden_units)
71
+ self.w_2 = torch.nn.Linear(hidden_units, idim)
72
+ self.dropout = torch.nn.Dropout(dropout_rate)
73
+ self.activation = activation
74
+
75
+ def forward(self, x):
76
+ """Forward function."""
77
+ return self.w_2(self.dropout(self.activation(self.w_1(x))))
78
+
79
+
80
+ class MultiHeadedAttentionSANM(nn.Module):
81
+ """Multi-Head Attention layer.
82
+
83
+ Args:
84
+ n_head (int): The number of heads.
85
+ n_feat (int): The number of features.
86
+ dropout_rate (float): Dropout rate.
87
+
88
+ """
89
+
90
+ def __init__(
91
+ self,
92
+ n_head,
93
+ in_feat,
94
+ n_feat,
95
+ dropout_rate,
96
+ kernel_size,
97
+ sanm_shfit=0,
98
+ lora_list=None,
99
+ lora_rank=8,
100
+ lora_alpha=16,
101
+ lora_dropout=0.1,
102
+ ):
103
+ """Construct an MultiHeadedAttention object."""
104
+ super().__init__()
105
+ assert n_feat % n_head == 0
106
+ # We assume d_v always equals d_k
107
+ self.d_k = n_feat // n_head
108
+ self.h = n_head
109
+ # self.linear_q = nn.Linear(n_feat, n_feat)
110
+ # self.linear_k = nn.Linear(n_feat, n_feat)
111
+ # self.linear_v = nn.Linear(n_feat, n_feat)
112
+
113
+ self.linear_out = nn.Linear(n_feat, n_feat)
114
+ self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3)
115
+ self.attn = None
116
+ self.dropout = nn.Dropout(p=dropout_rate)
117
+
118
+ self.fsmn_block = nn.Conv1d(
119
+ n_feat, n_feat, kernel_size, stride=1, padding=0, groups=n_feat, bias=False
120
+ )
121
+ # padding
122
+ left_padding = (kernel_size - 1) // 2
123
+ if sanm_shfit > 0:
124
+ left_padding = left_padding + sanm_shfit
125
+ right_padding = kernel_size - 1 - left_padding
126
+ self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0)
127
+
128
+ def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None):
129
+ b, t, d = inputs.size()
130
+ if mask is not None:
131
+ mask = torch.reshape(mask, (b, -1, 1))
132
+ if mask_shfit_chunk is not None:
133
+ mask = mask * mask_shfit_chunk
134
+ inputs = inputs * mask
135
+
136
+ x = inputs.transpose(1, 2)
137
+ x = self.pad_fn(x)
138
+ x = self.fsmn_block(x)
139
+ x = x.transpose(1, 2)
140
+ x += inputs
141
+ x = self.dropout(x)
142
+ if mask is not None:
143
+ x = x * mask
144
+ return x
145
+
146
+ def forward_qkv(self, x):
147
+ """Transform query, key and value.
148
+
149
+ Args:
150
+ query (torch.Tensor): Query tensor (#batch, time1, size).
151
+ key (torch.Tensor): Key tensor (#batch, time2, size).
152
+ value (torch.Tensor): Value tensor (#batch, time2, size).
153
+
154
+ Returns:
155
+ torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
156
+ torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
157
+ torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
158
+
159
+ """
160
+ b, t, d = x.size()
161
+ q_k_v = self.linear_q_k_v(x)
162
+ q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1)
163
+ q_h = torch.reshape(q, (b, t, self.h, self.d_k)).transpose(
164
+ 1, 2
165
+ ) # (batch, head, time1, d_k)
166
+ k_h = torch.reshape(k, (b, t, self.h, self.d_k)).transpose(
167
+ 1, 2
168
+ ) # (batch, head, time2, d_k)
169
+ v_h = torch.reshape(v, (b, t, self.h, self.d_k)).transpose(
170
+ 1, 2
171
+ ) # (batch, head, time2, d_k)
172
+
173
+ return q_h, k_h, v_h, v
174
+
175
+ def forward_attention(self, value, scores, mask, mask_att_chunk_encoder=None):
176
+ """Compute attention context vector.
177
+
178
+ Args:
179
+ value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k).
180
+ scores (torch.Tensor): Attention score (#batch, n_head, time1, time2).
181
+ mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).
182
+
183
+ Returns:
184
+ torch.Tensor: Transformed value (#batch, time1, d_model)
185
+ weighted by the attention score (#batch, time1, time2).
186
+
187
+ """
188
+ n_batch = value.size(0)
189
+ if mask is not None:
190
+ if mask_att_chunk_encoder is not None:
191
+ mask = mask * mask_att_chunk_encoder
192
+
193
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
194
+
195
+ min_value = -float(
196
+ "inf"
197
+ ) # float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min)
198
+ scores = scores.masked_fill(mask, min_value)
199
+ attn = torch.softmax(scores, dim=-1).masked_fill(
200
+ mask, 0.0
201
+ ) # (batch, head, time1, time2)
202
+ else:
203
+ attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
204
+
205
+ p_attn = self.dropout(attn)
206
+ x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
207
+ x = (
208
+ x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
209
+ ) # (batch, time1, d_model)
210
+
211
+ return self.linear_out(x) # (batch, time1, d_model)
212
+
213
+ def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
214
+ """Compute scaled dot product attention.
215
+
216
+ Args:
217
+ query (torch.Tensor): Query tensor (#batch, time1, size).
218
+ key (torch.Tensor): Key tensor (#batch, time2, size).
219
+ value (torch.Tensor): Value tensor (#batch, time2, size).
220
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
221
+ (#batch, time1, time2).
222
+
223
+ Returns:
224
+ torch.Tensor: Output tensor (#batch, time1, d_model).
225
+
226
+ """
227
+ q_h, k_h, v_h, v = self.forward_qkv(x)
228
+ fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk)
229
+ q_h = q_h * self.d_k ** (-0.5)
230
+ scores = torch.matmul(q_h, k_h.transpose(-2, -1))
231
+ att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder)
232
+ return att_outs + fsmn_memory
233
+
234
+ def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
235
+ """Compute scaled dot product attention.
236
+
237
+ Args:
238
+ query (torch.Tensor): Query tensor (#batch, time1, size).
239
+ key (torch.Tensor): Key tensor (#batch, time2, size).
240
+ value (torch.Tensor): Value tensor (#batch, time2, size).
241
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
242
+ (#batch, time1, time2).
243
+
244
+ Returns:
245
+ torch.Tensor: Output tensor (#batch, time1, d_model).
246
+
247
+ """
248
+ q_h, k_h, v_h, v = self.forward_qkv(x)
249
+ if chunk_size is not None and look_back > 0 or look_back == -1:
250
+ if cache is not None:
251
+ k_h_stride = k_h[:, :, : -(chunk_size[2]), :]
252
+ v_h_stride = v_h[:, :, : -(chunk_size[2]), :]
253
+ k_h = torch.cat((cache["k"], k_h), dim=2)
254
+ v_h = torch.cat((cache["v"], v_h), dim=2)
255
+
256
+ cache["k"] = torch.cat((cache["k"], k_h_stride), dim=2)
257
+ cache["v"] = torch.cat((cache["v"], v_h_stride), dim=2)
258
+ if look_back != -1:
259
+ cache["k"] = cache["k"][:, :, -(look_back * chunk_size[1]) :, :]
260
+ cache["v"] = cache["v"][:, :, -(look_back * chunk_size[1]) :, :]
261
+ else:
262
+ cache_tmp = {
263
+ "k": k_h[:, :, : -(chunk_size[2]), :],
264
+ "v": v_h[:, :, : -(chunk_size[2]), :],
265
+ }
266
+ cache = cache_tmp
267
+ fsmn_memory = self.forward_fsmn(v, None)
268
+ q_h = q_h * self.d_k ** (-0.5)
269
+ scores = torch.matmul(q_h, k_h.transpose(-2, -1))
270
+ att_outs = self.forward_attention(v_h, scores, None)
271
+ return att_outs + fsmn_memory, cache
272
+
273
+
274
+ class LayerNorm(nn.LayerNorm):
275
+ def __init__(self, *args, **kwargs):
276
+ super().__init__(*args, **kwargs)
277
+
278
+ def forward(self, input):
279
+ output = F.layer_norm(
280
+ input.float(),
281
+ self.normalized_shape,
282
+ self.weight.float() if self.weight is not None else None,
283
+ self.bias.float() if self.bias is not None else None,
284
+ self.eps,
285
+ )
286
+ return output.type_as(input)
287
+
288
+
289
+ def sequence_mask(lengths, maxlen=None, dtype=torch.float32, device=None):
290
+ if maxlen is None:
291
+ maxlen = lengths.max()
292
+ row_vector = torch.arange(0, maxlen, 1).to(lengths.device)
293
+ matrix = torch.unsqueeze(lengths, dim=-1)
294
+ mask = row_vector < matrix
295
+ mask = mask.detach()
296
+
297
+ return mask.type(dtype).to(device) if device is not None else mask.type(dtype)
298
+
299
+
300
+ class EncoderLayerSANM(nn.Module):
301
+ def __init__(
302
+ self,
303
+ in_size,
304
+ size,
305
+ self_attn,
306
+ feed_forward,
307
+ dropout_rate,
308
+ normalize_before=True,
309
+ concat_after=False,
310
+ stochastic_depth_rate=0.0,
311
+ ):
312
+ """Construct an EncoderLayer object."""
313
+ super(EncoderLayerSANM, self).__init__()
314
+ self.self_attn = self_attn
315
+ self.feed_forward = feed_forward
316
+ self.norm1 = LayerNorm(in_size)
317
+ self.norm2 = LayerNorm(size)
318
+ self.dropout = nn.Dropout(dropout_rate)
319
+ self.in_size = in_size
320
+ self.size = size
321
+ self.normalize_before = normalize_before
322
+ self.concat_after = concat_after
323
+ if self.concat_after:
324
+ self.concat_linear = nn.Linear(size + size, size)
325
+ self.stochastic_depth_rate = stochastic_depth_rate
326
+ self.dropout_rate = dropout_rate
327
+
328
+ def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
329
+ """Compute encoded features.
330
+
331
+ Args:
332
+ x_input (torch.Tensor): Input tensor (#batch, time, size).
333
+ mask (torch.Tensor): Mask tensor for the input (#batch, time).
334
+ cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
335
+
336
+ Returns:
337
+ torch.Tensor: Output tensor (#batch, time, size).
338
+ torch.Tensor: Mask tensor (#batch, time).
339
+
340
+ """
341
+ skip_layer = False
342
+ # with stochastic depth, residual connection `x + f(x)` becomes
343
+ # `x <- x + 1 / (1 - p) * f(x)` at training time.
344
+ stoch_layer_coeff = 1.0
345
+ if self.training and self.stochastic_depth_rate > 0:
346
+ skip_layer = torch.rand(1).item() < self.stochastic_depth_rate
347
+ stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)
348
+
349
+ if skip_layer:
350
+ if cache is not None:
351
+ x = torch.cat([cache, x], dim=1)
352
+ return x, mask
353
+
354
+ residual = x
355
+ if self.normalize_before:
356
+ x = self.norm1(x)
357
+
358
+ if self.concat_after:
359
+ x_concat = torch.cat(
360
+ (
361
+ x,
362
+ self.self_attn(
363
+ x,
364
+ mask,
365
+ mask_shfit_chunk=mask_shfit_chunk,
366
+ mask_att_chunk_encoder=mask_att_chunk_encoder,
367
+ ),
368
+ ),
369
+ dim=-1,
370
+ )
371
+ if self.in_size == self.size:
372
+ x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
373
+ else:
374
+ x = stoch_layer_coeff * self.concat_linear(x_concat)
375
+ else:
376
+ if self.in_size == self.size:
377
+ x = residual + stoch_layer_coeff * self.dropout(
378
+ self.self_attn(
379
+ x,
380
+ mask,
381
+ mask_shfit_chunk=mask_shfit_chunk,
382
+ mask_att_chunk_encoder=mask_att_chunk_encoder,
383
+ )
384
+ )
385
+ else:
386
+ x = stoch_layer_coeff * self.dropout(
387
+ self.self_attn(
388
+ x,
389
+ mask,
390
+ mask_shfit_chunk=mask_shfit_chunk,
391
+ mask_att_chunk_encoder=mask_att_chunk_encoder,
392
+ )
393
+ )
394
+ if not self.normalize_before:
395
+ x = self.norm1(x)
396
+
397
+ residual = x
398
+ if self.normalize_before:
399
+ x = self.norm2(x)
400
+ x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x))
401
+ if not self.normalize_before:
402
+ x = self.norm2(x)
403
+
404
+ return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
405
+
406
+ def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
407
+ """Compute encoded features.
408
+
409
+ Args:
410
+ x_input (torch.Tensor): Input tensor (#batch, time, size).
411
+ mask (torch.Tensor): Mask tensor for the input (#batch, time).
412
+ cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
413
+
414
+ Returns:
415
+ torch.Tensor: Output tensor (#batch, time, size).
416
+ torch.Tensor: Mask tensor (#batch, time).
417
+
418
+ """
419
+
420
+ residual = x
421
+ if self.normalize_before:
422
+ x = self.norm1(x)
423
+
424
+ if self.in_size == self.size:
425
+ attn, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back)
426
+ x = residual + attn
427
+ else:
428
+ x, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back)
429
+
430
+ if not self.normalize_before:
431
+ x = self.norm1(x)
432
+
433
+ residual = x
434
+ if self.normalize_before:
435
+ x = self.norm2(x)
436
+ x = residual + self.feed_forward(x)
437
+ if not self.normalize_before:
438
+ x = self.norm2(x)
439
+
440
+ return x, cache
441
+
442
+
443
+ @tables.register("encoder_classes", "SenseVoiceEncoderSmall")
444
+ class SenseVoiceEncoderSmall(nn.Module):
445
+ """
446
+ Author: Speech Lab of DAMO Academy, Alibaba Group
447
+ SCAMA: Streaming chunk-aware multihead attention for online end-to-end speech recognition
448
+ https://arxiv.org/abs/2006.01713
449
+ """
450
+
451
+ def __init__(
452
+ self,
453
+ input_size: int,
454
+ output_size: int = 256,
455
+ attention_heads: int = 4,
456
+ linear_units: int = 2048,
457
+ num_blocks: int = 6,
458
+ tp_blocks: int = 0,
459
+ dropout_rate: float = 0.1,
460
+ positional_dropout_rate: float = 0.1,
461
+ attention_dropout_rate: float = 0.0,
462
+ stochastic_depth_rate: float = 0.0,
463
+ input_layer: Optional[str] = "conv2d",
464
+ pos_enc_class=SinusoidalPositionEncoder,
465
+ normalize_before: bool = True,
466
+ concat_after: bool = False,
467
+ positionwise_layer_type: str = "linear",
468
+ positionwise_conv_kernel_size: int = 1,
469
+ padding_idx: int = -1,
470
+ kernel_size: int = 11,
471
+ sanm_shfit: int = 0,
472
+ selfattention_layer_type: str = "sanm",
473
+ **kwargs,
474
+ ):
475
+ super().__init__()
476
+ self._output_size = output_size
477
+
478
+ self.embed = SinusoidalPositionEncoder()
479
+
480
+ self.normalize_before = normalize_before
481
+
482
+ positionwise_layer = PositionwiseFeedForward
483
+ positionwise_layer_args = (
484
+ output_size,
485
+ linear_units,
486
+ dropout_rate,
487
+ )
488
+
489
+ encoder_selfattn_layer = MultiHeadedAttentionSANM
490
+ encoder_selfattn_layer_args0 = (
491
+ attention_heads,
492
+ input_size,
493
+ output_size,
494
+ attention_dropout_rate,
495
+ kernel_size,
496
+ sanm_shfit,
497
+ )
498
+ encoder_selfattn_layer_args = (
499
+ attention_heads,
500
+ output_size,
501
+ output_size,
502
+ attention_dropout_rate,
503
+ kernel_size,
504
+ sanm_shfit,
505
+ )
506
+
507
+ self.encoders0 = nn.ModuleList(
508
+ [
509
+ EncoderLayerSANM(
510
+ input_size,
511
+ output_size,
512
+ encoder_selfattn_layer(*encoder_selfattn_layer_args0),
513
+ positionwise_layer(*positionwise_layer_args),
514
+ dropout_rate,
515
+ )
516
+ for i in range(1)
517
+ ]
518
+ )
519
+ self.encoders = nn.ModuleList(
520
+ [
521
+ EncoderLayerSANM(
522
+ output_size,
523
+ output_size,
524
+ encoder_selfattn_layer(*encoder_selfattn_layer_args),
525
+ positionwise_layer(*positionwise_layer_args),
526
+ dropout_rate,
527
+ )
528
+ for i in range(num_blocks - 1)
529
+ ]
530
+ )
531
+
532
+ self.tp_encoders = nn.ModuleList(
533
+ [
534
+ EncoderLayerSANM(
535
+ output_size,
536
+ output_size,
537
+ encoder_selfattn_layer(*encoder_selfattn_layer_args),
538
+ positionwise_layer(*positionwise_layer_args),
539
+ dropout_rate,
540
+ )
541
+ for i in range(tp_blocks)
542
+ ]
543
+ )
544
+
545
+ self.after_norm = LayerNorm(output_size)
546
+
547
+ self.tp_norm = LayerNorm(output_size)
548
+
549
+ def output_size(self) -> int:
550
+ return self._output_size
551
+
552
+ def forward(
553
+ self,
554
+ xs_pad: torch.Tensor,
555
+ # ilens: torch.Tensor,
556
+ masks: torch.Tensor,
557
+ position_encoding: torch.Tensor
558
+ ):
559
+ """Embed positions in tensor."""
560
+ # masks = sequence_mask(ilens, device=ilens.device)[:, None, :]
561
+
562
+ xs_pad *= self.output_size() ** 0.5
563
+
564
+ # xs_pad = self.embed(xs_pad)
565
+ xs_pad += position_encoding
566
+
567
+ # forward encoder1
568
+ for layer_idx, encoder_layer in enumerate(self.encoders0):
569
+ encoder_outs = encoder_layer(xs_pad, masks)
570
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
571
+
572
+ for layer_idx, encoder_layer in enumerate(self.encoders):
573
+ encoder_outs = encoder_layer(xs_pad, masks)
574
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
575
+
576
+ xs_pad = self.after_norm(xs_pad)
577
+
578
+ # forward encoder2
579
+ olens = masks.squeeze(1).sum(1).int()
580
+
581
+ for layer_idx, encoder_layer in enumerate(self.tp_encoders):
582
+ encoder_outs = encoder_layer(xs_pad, masks)
583
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
584
+
585
+ xs_pad = self.tp_norm(xs_pad)
586
+ return xs_pad, olens
587
+
588
+
589
+ @tables.register("model_classes", "SenseVoiceSmall")
590
+ class SenseVoiceSmall(nn.Module):
591
+ """CTC-attention hybrid Encoder-Decoder model"""
592
+
593
+ def __init__(
594
+ self,
595
+ specaug: str = None,
596
+ specaug_conf: dict = None,
597
+ normalize: str = None,
598
+ normalize_conf: dict = None,
599
+ encoder: str = None,
600
+ encoder_conf: dict = None,
601
+ ctc_conf: dict = None,
602
+ input_size: int = 80,
603
+ vocab_size: int = -1,
604
+ ignore_id: int = -1,
605
+ blank_id: int = 0,
606
+ sos: int = 1,
607
+ eos: int = 2,
608
+ length_normalized_loss: bool = False,
609
+ seq_len = 68,
610
+ **kwargs,
611
+ ):
612
+
613
+ super().__init__()
614
+
615
+ if specaug is not None:
616
+ specaug_class = tables.specaug_classes.get(specaug)
617
+ specaug = specaug_class(**specaug_conf)
618
+ if normalize is not None:
619
+ normalize_class = tables.normalize_classes.get(normalize)
620
+ normalize = normalize_class(**normalize_conf)
621
+ encoder_class = tables.encoder_classes.get(encoder)
622
+ encoder = encoder_class(input_size=input_size, **encoder_conf)
623
+ encoder_output_size = encoder.output_size()
624
+
625
+ if ctc_conf is None:
626
+ ctc_conf = {}
627
+ ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf)
628
+
629
+ self.blank_id = blank_id
630
+ self.sos = sos if sos is not None else vocab_size - 1
631
+ self.eos = eos if eos is not None else vocab_size - 1
632
+ self.vocab_size = vocab_size
633
+ self.ignore_id = ignore_id
634
+ self.specaug = specaug
635
+ self.normalize = normalize
636
+ self.encoder = encoder
637
+ self.error_calculator = None
638
+
639
+ self.ctc = ctc
640
+
641
+ self.length_normalized_loss = length_normalized_loss
642
+ self.encoder_output_size = encoder_output_size
643
+
644
+ self.lid_dict = {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13}
645
+ self.lid_int_dict = {24884: 3, 24885: 4, 24888: 7, 24892: 11, 24896: 12, 24992: 13}
646
+ self.textnorm_dict = {"withitn": 14, "woitn": 15}
647
+ self.textnorm_int_dict = {25016: 14, 25017: 15}
648
+ self.embed = torch.nn.Embedding(7 + len(self.lid_dict) + len(self.textnorm_dict), input_size)
649
+ self.emo_dict = {"unk": 25009, "happy": 25001, "sad": 25002, "angry": 25003, "neutral": 25004}
650
+
651
+ self.criterion_att = LabelSmoothingLoss(
652
+ size=self.vocab_size,
653
+ padding_idx=self.ignore_id,
654
+ smoothing=kwargs.get("lsm_weight", 0.0),
655
+ normalize_length=self.length_normalized_loss,
656
+ )
657
+
658
+ self.seq_len = seq_len
659
+
660
+ @staticmethod
661
+ def from_pretrained(model:str=None, **kwargs):
662
+ from funasr import AutoModel
663
+ model, kwargs = AutoModel.build_model(model=model, trust_remote_code=True, **kwargs)
664
+
665
+ return model, kwargs
666
+
667
+ def forward(
668
+ self,
669
+ speech: torch.Tensor,
670
+ speech_lengths: torch.Tensor,
671
+ text: torch.Tensor,
672
+ text_lengths: torch.Tensor,
673
+ **kwargs,
674
+ ):
675
+ """Encoder + Decoder + Calc loss
676
+ Args:
677
+ speech: (Batch, Length, ...)
678
+ speech_lengths: (Batch, )
679
+ text: (Batch, Length)
680
+ text_lengths: (Batch,)
681
+ """
682
+ # import pdb;
683
+ # pdb.set_trace()
684
+ if len(text_lengths.size()) > 1:
685
+ text_lengths = text_lengths[:, 0]
686
+ if len(speech_lengths.size()) > 1:
687
+ speech_lengths = speech_lengths[:, 0]
688
+
689
+ batch_size = speech.shape[0]
690
+
691
+ # 1. Encoder
692
+ encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, text)
693
+
694
+ loss_ctc, cer_ctc = None, None
695
+ loss_rich, acc_rich = None, None
696
+ stats = dict()
697
+
698
+ loss_ctc, cer_ctc = self._calc_ctc_loss(
699
+ encoder_out[:, 4:, :], encoder_out_lens - 4, text[:, 4:], text_lengths - 4
700
+ )
701
+
702
+ loss_rich, acc_rich = self._calc_rich_ce_loss(
703
+ encoder_out[:, :4, :], text[:, :4]
704
+ )
705
+
706
+ loss = loss_ctc + loss_rich
707
+ # Collect total loss stats
708
+ stats["loss_ctc"] = torch.clone(loss_ctc.detach()) if loss_ctc is not None else None
709
+ stats["loss_rich"] = torch.clone(loss_rich.detach()) if loss_rich is not None else None
710
+ stats["loss"] = torch.clone(loss.detach()) if loss is not None else None
711
+ stats["acc_rich"] = acc_rich
712
+
713
+ # force_gatherable: to-device and to-tensor if scalar for DataParallel
714
+ if self.length_normalized_loss:
715
+ batch_size = int((text_lengths + 1).sum())
716
+ loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
717
+ return loss, stats, weight
718
+
719
+ def encode(
720
+ self,
721
+ speech: torch.Tensor,
722
+ speech_lengths: torch.Tensor,
723
+ text: torch.Tensor,
724
+ **kwargs,
725
+ ):
726
+ """Frontend + Encoder. Note that this method is used by asr_inference.py
727
+ Args:
728
+ speech: (Batch, Length, ...)
729
+ speech_lengths: (Batch, )
730
+ ind: int
731
+ """
732
+
733
+ # Data augmentation
734
+ if self.specaug is not None and self.training:
735
+ speech, speech_lengths = self.specaug(speech, speech_lengths)
736
+
737
+ # Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
738
+ if self.normalize is not None:
739
+ speech, speech_lengths = self.normalize(speech, speech_lengths)
740
+
741
+
742
+ lids = torch.LongTensor([[self.lid_int_dict[int(lid)] if torch.rand(1) > 0.2 and int(lid) in self.lid_int_dict else 0 ] for lid in text[:, 0]]).to(speech.device)
743
+ language_query = self.embed(lids)
744
+
745
+ styles = torch.LongTensor([[self.textnorm_int_dict[int(style)]] for style in text[:, 3]]).to(speech.device)
746
+ style_query = self.embed(styles)
747
+ speech = torch.cat((style_query, speech), dim=1)
748
+ speech_lengths += 1
749
+
750
+ event_emo_query = self.embed(torch.LongTensor([[1, 2]]).to(speech.device)).repeat(speech.size(0), 1, 1)
751
+ input_query = torch.cat((language_query, event_emo_query), dim=1)
752
+ speech = torch.cat((input_query, speech), dim=1)
753
+ speech_lengths += 3
754
+
755
+ encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths)
756
+
757
+ return encoder_out, encoder_out_lens
758
+
759
+ def _calc_ctc_loss(
760
+ self,
761
+ encoder_out: torch.Tensor,
762
+ encoder_out_lens: torch.Tensor,
763
+ ys_pad: torch.Tensor,
764
+ ys_pad_lens: torch.Tensor,
765
+ ):
766
+ # Calc CTC loss
767
+ loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens)
768
+
769
+ # Calc CER using CTC
770
+ cer_ctc = None
771
+ if not self.training and self.error_calculator is not None:
772
+ ys_hat = self.ctc.argmax(encoder_out).data
773
+ cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
774
+ return loss_ctc, cer_ctc
775
+
776
+ def _calc_rich_ce_loss(
777
+ self,
778
+ encoder_out: torch.Tensor,
779
+ ys_pad: torch.Tensor,
780
+ ):
781
+ decoder_out = self.ctc.ctc_lo(encoder_out)
782
+ # 2. Compute attention loss
783
+ loss_rich = self.criterion_att(decoder_out, ys_pad.contiguous())
784
+ acc_rich = th_accuracy(
785
+ decoder_out.view(-1, self.vocab_size),
786
+ ys_pad.contiguous(),
787
+ ignore_label=self.ignore_id,
788
+ )
789
+
790
+ return loss_rich, acc_rich
791
+
792
+
793
+ def inference(
794
+ self,
795
+ data_in,
796
+ data_lengths=None,
797
+ key: list = ["wav_file_tmp_name"],
798
+ tokenizer=None,
799
+ frontend=None,
800
+ **kwargs,
801
+ ):
802
+
803
+
804
+ meta_data = {}
805
+ if (
806
+ isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank"
807
+ ): # fbank
808
+ speech, speech_lengths = data_in, data_lengths
809
+ if len(speech.shape) < 3:
810
+ speech = speech[None, :, :]
811
+ if speech_lengths is None:
812
+ speech_lengths = speech.shape[1]
813
+ else:
814
+ # extract fbank feats
815
+ time1 = time.perf_counter()
816
+ audio_sample_list = load_audio_text_image_video(
817
+ data_in,
818
+ fs=frontend.fs,
819
+ audio_fs=kwargs.get("fs", 16000),
820
+ data_type=kwargs.get("data_type", "sound"),
821
+ tokenizer=tokenizer,
822
+ )
823
+ time2 = time.perf_counter()
824
+ meta_data["load_data"] = f"{time2 - time1:0.3f}"
825
+ speech, speech_lengths = extract_fbank(
826
+ audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend
827
+ )
828
+ time3 = time.perf_counter()
829
+ meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
830
+ meta_data["batch_data_time"] = (
831
+ speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000
832
+ )
833
+
834
+ speech = speech.to(device=kwargs["device"])
835
+ speech_lengths = speech_lengths.to(device=kwargs["device"])
836
+
837
+ language = kwargs.get("language", "auto")
838
+ language_query = self.embed(
839
+ torch.LongTensor(
840
+ [[self.lid_dict[language] if language in self.lid_dict else 0]]
841
+ ).to(speech.device)
842
+ ).repeat(speech.size(0), 1, 1)
843
+
844
+ use_itn = kwargs.get("use_itn", False)
845
+ output_timestamp = kwargs.get("output_timestamp", False)
846
+
847
+ textnorm = kwargs.get("text_norm", None)
848
+ if textnorm is None:
849
+ textnorm = "withitn" if use_itn else "woitn"
850
+ textnorm_query = self.embed(
851
+ torch.LongTensor([[self.textnorm_dict[textnorm]]]).to(speech.device)
852
+ ).repeat(speech.size(0), 1, 1)
853
+ speech = torch.cat((textnorm_query, speech), dim=1)
854
+ speech_lengths += 1
855
+
856
+ event_emo_query = self.embed(torch.LongTensor([[1, 2]]).to(speech.device)).repeat(
857
+ speech.size(0), 1, 1
858
+ )
859
+ input_query = torch.cat((language_query, event_emo_query), dim=1)
860
+ speech = torch.cat((input_query, speech), dim=1)
861
+ speech_lengths += 3
862
+
863
+ # Encoder
864
+ encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths)
865
+ if isinstance(encoder_out, tuple):
866
+ encoder_out = encoder_out[0]
867
+
868
+ # c. Passed the encoder result and the beam search
869
+ ctc_logits = self.ctc.log_softmax(encoder_out)
870
+ if kwargs.get("ban_emo_unk", False):
871
+ ctc_logits[:, :, self.emo_dict["unk"]] = -float("inf")
872
+
873
+ results = []
874
+ b, n, d = encoder_out.size()
875
+ if isinstance(key[0], (list, tuple)):
876
+ key = key[0]
877
+ if len(key) < b:
878
+ key = key * b
879
+ for i in range(b):
880
+ x = ctc_logits[i, : encoder_out_lens[i].item(), :]
881
+ yseq = x.argmax(dim=-1)
882
+ yseq = torch.unique_consecutive(yseq, dim=-1)
883
+
884
+ ibest_writer = None
885
+ if kwargs.get("output_dir") is not None:
886
+ if not hasattr(self, "writer"):
887
+ self.writer = DatadirWriter(kwargs.get("output_dir"))
888
+ ibest_writer = self.writer[f"1best_recog"]
889
+
890
+ mask = yseq != self.blank_id
891
+ token_int = yseq[mask].tolist()
892
+
893
+ # Change integer-ids to tokens
894
+ text = tokenizer.decode(token_int)
895
+ if ibest_writer is not None:
896
+ ibest_writer["text"][key[i]] = text
897
+
898
+ if output_timestamp:
899
+ from itertools import groupby
900
+ timestamp = []
901
+ tokens = tokenizer.text2tokens(text)[4:]
902
+
903
+ logits_speech = self.ctc.softmax(encoder_out)[i, 4:encoder_out_lens[i].item(), :]
904
+
905
+ pred = logits_speech.argmax(-1).cpu()
906
+ logits_speech[pred==self.blank_id, self.blank_id] = 0
907
+
908
+ align = ctc_forced_align(
909
+ logits_speech.unsqueeze(0).float(),
910
+ torch.Tensor(token_int[4:]).unsqueeze(0).long().to(logits_speech.device),
911
+ (encoder_out_lens-4).long(),
912
+ torch.tensor(len(token_int)-4).unsqueeze(0).long().to(logits_speech.device),
913
+ ignore_id=self.ignore_id,
914
+ )
915
+
916
+ pred = groupby(align[0, :encoder_out_lens[0]])
917
+ _start = 0
918
+ token_id = 0
919
+ ts_max = encoder_out_lens[i] - 4
920
+ for pred_token, pred_frame in pred:
921
+ _end = _start + len(list(pred_frame))
922
+ if pred_token != 0:
923
+ ts_left = max((_start*60-30)/1000, 0)
924
+ ts_right = min((_end*60-30)/1000, (ts_max*60-30)/1000)
925
+ timestamp.append([tokens[token_id], ts_left, ts_right])
926
+ token_id += 1
927
+ _start = _end
928
+
929
+ result_i = {"key": key[i], "text": text, "timestamp": timestamp}
930
+ results.append(result_i)
931
+ else:
932
+ result_i = {"key": key[i], "text": text}
933
+ results.append(result_i)
934
+ return results, meta_data
935
+
936
+ def export(self, **kwargs):
937
+ from export_meta import export_rebuild_model
938
+
939
+ if "max_seq_len" not in kwargs:
940
+ kwargs["max_seq_len"] = 512
941
+ models = export_rebuild_model(model=self, **kwargs)
942
+ return models
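A minimal usage sketch for the torch-side SenseVoiceSmall defined above, modelled on the upstream SenseVoice demo pattern; the model id "iic/SenseVoiceSmall" and the wav path are placeholders, not files in this repo:

# Sketch: load the torch model via funasr and run one file through inference().
from model import SenseVoiceSmall

m, kwargs = SenseVoiceSmall.from_pretrained(model="iic/SenseVoiceSmall", device="cpu")
m.eval()

res = m.inference(
    data_in="example.wav",   # placeholder path, 16 kHz mono audio
    language="auto",         # or "zh" / "en" / "yue" / "ja" / "ko"
    use_itn=True,
    **kwargs,                # carries tokenizer, frontend, device from build_model
)
# inference() returns (results, meta_data); results is a list of dicts with "key"/"text"
print(res[0][0]["text"])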
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch
+ tqdm
+ funasr==1.2.7
+ torchaudio
+ cn2an
utils/__init__.py ADDED
File without changes
utils/ax_model_bin.py ADDED
@@ -0,0 +1,241 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ import os.path
7
+ from pathlib import Path
8
+ from typing import List, Union, Tuple
9
+ import torch
10
+ import numpy as np
11
+ import axengine as axe
12
+ from funasr.utils.postprocess_utils import rich_transcription_postprocess
13
+ try:
14
+ import librosa
15
+ except ImportError:
16
+ print("Warning: librosa not found. Please install it using 'pip install librosa'.")
17
+ # Provide a fallback implementation if needed
18
+ def load_wav_fallback(path, sr=None):
19
+ import wave
20
+ import numpy as np
21
+ with wave.open(path, 'rb') as wf:
22
+ num_frames = wf.getnframes()
23
+ frames = wf.readframes(num_frames)
24
+ return np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0, wf.getframerate()
25
+
26
+ from utils.infer_utils import (
27
+ CharTokenizer,
28
+ get_logger,
29
+ read_yaml,
30
+ )
31
+ from utils.frontend import WavFrontend
32
+ from utils.ctc_alignment import ctc_forced_align
33
+
34
+ logging = get_logger()
35
+
36
+
37
+ def sequence_mask(lengths, maxlen=None, dtype=torch.float32, device=None):
38
+ if maxlen is None:
39
+ maxlen = lengths.max()
40
+ row_vector = torch.arange(0, maxlen, 1).to(lengths.device)
41
+ matrix = torch.unsqueeze(lengths, dim=-1)
42
+ mask = row_vector < matrix
43
+ mask = mask.detach()
44
+
45
+ return mask.type(dtype).to(device) if device is not None else mask.type(dtype)
46
+
47
+
48
+ class AX_SenseVoiceSmall:
49
+ """
50
+ Author: Speech Lab of DAMO Academy, Alibaba Group
51
+ Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
52
+ https://arxiv.org/abs/2206.08317
53
+ """
54
+
55
+ def __init__(
56
+ self,
57
+ model_dir: Union[str, Path] = None,
58
+ batch_size: int = 1,
59
+ seq_len: int = 68
60
+ ):
61
+
62
+ model_file = os.path.join(model_dir, "sensevoice.axmodel")
63
+ config_file = os.path.join(model_dir, "sensevoice/config.yaml")
64
+ cmvn_file = os.path.join(model_dir, "sensevoice/am.mvn")
65
+ config = read_yaml(config_file)
66
+ self.model_dir = model_dir
67
+ # token_list = os.path.join(model_dir, "tokens.json")
68
+ # with open(token_list, "r", encoding="utf-8") as f:
69
+ # token_list = json.load(f)
70
+
71
+ # self.converter = TokenIDConverter(token_list)
72
+ self.tokenizer = CharTokenizer()
73
+ config["frontend_conf"]['cmvn_file'] = cmvn_file
74
+ self.frontend = WavFrontend(**config["frontend_conf"])
75
+ # self.ort_infer = OrtInferSession(
76
+ # model_file, device_id, intra_op_num_threads=intra_op_num_threads
77
+ # )
78
+ self.session = axe.InferenceSession(model_file, providers='AxEngineExecutionProvider')
79
+ self.batch_size = batch_size
80
+ self.blank_id = 0
81
+ self.seq_len = seq_len
82
+
83
+ self.lid_dict = {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13}
84
+ self.lid_int_dict = {24884: 3, 24885: 4, 24888: 7, 24892: 11, 24896: 12, 24992: 13}
85
+ self.textnorm_dict = {"withitn": 14, "woitn": 15}
86
+ self.textnorm_int_dict = {25016: 14, 25017: 15}
87
+ self.emo_dict = {"unk": 25009, "happy": 25001, "sad": 25002, "angry": 25003, "neutral": 25004}
88
+
89
+ def __call__(self,
90
+ wav_content: Union[str, np.ndarray, List[str]],
91
+ language: str,
92
+ withitn: bool,
93
+ position_encoding: np.ndarray,
94
+ tokenizer=None,
95
+ **kwargs) -> List:
96
+ """Enhanced model inference with additional features from model.py
97
+
98
+ Args:
99
+ wav_content: Audio data or path
100
+ language: Language code for processing
101
+ withitn: Whether to use ITN (inverse text normalization)
102
+ position_encoding: Position encoding tensor
103
+ tokenizer: Tokenizer for text conversion
104
+ **kwargs: Additional arguments
105
+ """
106
+ # Start time tracking for metadata
107
+ import time
108
+ meta_data = {}
109
+ time_start = time.perf_counter()
110
+
111
+ # Load waveform data
112
+ waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
113
+ waveform_nums = len(waveform_list)
114
+ time_load = time.perf_counter()
115
+ meta_data["load_data"] = f"{time_load - time_start:0.3f}"
116
+
117
+ # Load queries from saved numpy files
118
+ language_query = np.load(os.path.join(self.model_dir, f"{language}.npy"))
119
+ textnorm_query = np.load(os.path.join(self.model_dir, "withitn.npy") if withitn
120
+ else os.path.join(self.model_dir, "woitn.npy"))
121
+ event_emo_query = np.load(os.path.join(self.model_dir, "event_emo.npy"))
122
+
123
+ # Concatenate queries to form input_query
124
+ input_query = np.concatenate((language_query, event_emo_query, textnorm_query), axis=1)
125
+
126
+ # Process features
127
+ results = ""
128
+
129
+ # Handle output_dir without using DatadirWriter (which is not available)
130
+ slice_len = self.seq_len - 4
131
+ time_pre = time.perf_counter()
132
+ meta_data["preprocess"] = f"{time_pre - time_load:0.3f}"
133
+ for beg_idx in range(0, waveform_nums, self.batch_size):
134
+ end_idx = min(waveform_nums, beg_idx + self.batch_size)
135
+ feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
136
+
137
+ time_feat = time.perf_counter()
138
+ meta_data["extract_feat"] = f"{time_feat - time_pre:0.3f}"
139
+
140
+ for i in range(int(np.ceil(feats.shape[1] / slice_len))):
141
+ sub_feats = np.concatenate([input_query, feats[:, i*slice_len : (i+1)*slice_len, :]], axis=1)
142
+ feats_len[0] = sub_feats.shape[1]
143
+
144
+
145
+ if feats_len[0] < self.seq_len:
146
+ sub_feats = np.concatenate([sub_feats, np.zeros((1, self.seq_len - feats_len[0], 560), dtype=np.float32)], axis=1)
147
+
148
+ masks = sequence_mask(torch.IntTensor([self.seq_len]), maxlen=self.seq_len, dtype=torch.float32)[:, None, :]
149
+ masks = masks.numpy()
150
+
151
+ # Run inference
152
+
153
+ ctc_logits, encoder_out_lens = self.infer(sub_feats, masks, position_encoding)
154
+ # Convert to torch tensor for processing
155
+ ctc_logits = torch.from_numpy(ctc_logits).float()
156
+
157
+ # Process results for each batch
158
+ b, _, _ = ctc_logits.size()
159
+
160
+ for j in range(b):
161
+ x = ctc_logits[j, : encoder_out_lens[j].item(), :]
162
+ yseq = x.argmax(dim=-1)
163
+ yseq = torch.unique_consecutive(yseq, dim=-1)
164
+
165
+ mask = yseq != self.blank_id
166
+ token_int = yseq[mask].tolist()[4:]  # drop the first 4 tokens: <|zh|><|ANGRY|><|Speech|><|withitn|>
167
+
168
+ # Convert tokens to text
169
+ text = tokenizer.decode(token_int) if tokenizer is not None else str(token_int)
170
+
171
+ if tokenizer is not None:
172
+ results+= text
173
+ else:
174
+ results+= token_int
175
+ return results
176
+
177
+ def load_data(self, wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
178
+ def load_wav(path: str) -> np.ndarray:
179
+ try:
180
+ # Use librosa if available
181
+ if 'librosa' in globals():
182
+ waveform, _ = librosa.load(path, sr=fs)
183
+ else:
184
+ # Use fallback implementation
185
+ waveform, native_sr = load_wav_fallback(path)
186
+ if fs is not None and native_sr != fs:
187
+ # Implement resampling if needed
188
+ print(f"Warning: Resampling from {native_sr} to {fs} is not implemented in fallback mode")
189
+ return waveform
190
+ except Exception as e:
191
+ print(f"Error loading audio file {path}: {e}")
192
+ # Return empty audio in case of error
193
+ return np.zeros(1600, dtype=np.float32)
194
+
195
+ if isinstance(wav_content, np.ndarray):
196
+ return [wav_content]
197
+
198
+ if isinstance(wav_content, str):
199
+ return [load_wav(wav_content)]
200
+
201
+ if isinstance(wav_content, list):
202
+ return [load_wav(path) for path in wav_content]
203
+
204
+ raise TypeError(f"The type of {wav_content} is not in [str, np.ndarray, list]")
205
+
206
+ def extract_feat(self, waveform_list: List[np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
207
+ feats, feats_len = [], []
208
+ for waveform in waveform_list:
209
+ speech, _ = self.frontend.fbank(waveform)
210
+
211
+ feat, feat_len = self.frontend.lfr_cmvn(speech)
212
+
213
+ feats.append(feat)
214
+ feats_len.append(feat_len)
215
+
216
+ feats = self.pad_feats(feats, np.max(feats_len))
217
+ feats_len = np.array(feats_len).astype(np.int32)
218
+ return feats, feats_len
219
+
220
+ @staticmethod
221
+ def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
222
+ def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
223
+ pad_width = ((0, max_feat_len - cur_len), (0, 0))
224
+ return np.pad(feat, pad_width, "constant", constant_values=0)
225
+
226
+ feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
227
+ feats = np.array(feat_res).astype(np.float32)
228
+ return feats
229
+
230
+ def infer(self,
231
+ feats: np.ndarray,
232
+ masks: np.ndarray,
233
+ position_encoding: np.ndarray,
234
+ ) -> Tuple[np.ndarray, np.ndarray]:
235
+ #outputs = self.ort_infer([feats, masks, position_encoding])
236
+ outputs =self.session.run(None, {
237
+ 'speech': feats,
238
+ 'masks': masks,
239
+ 'position_encoding': position_encoding
240
+ })
241
+ return outputs
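How the demo assembles the inputs for AX_SenseVoiceSmall lives in ax_speech_translate_demo.py; the sketch below is only an approximation of that wiring. It rebuilds the sinusoidal position table with the SinusoidalPositionEncoder from model.py and decodes token ids with the raw SentencePiece model shipped in ax_model/, which may differ in detail from the demo script. The wav path is a placeholder.

# Sketch: run the AXEngine SenseVoice wrapper above on one wav file (on-device, needs axengine).
import numpy as np
import torch
import sentencepiece as spm
from model import SinusoidalPositionEncoder
from utils.ax_model_bin import AX_SenseVoiceSmall

asr = AX_SenseVoiceSmall(model_dir="ax_model", batch_size=1, seq_len=68)

# 68 x 560 sinusoidal table: seq_len 68, 80-dim fbank after LFR stacking (80 * 7 = 560)
pos_enc = SinusoidalPositionEncoder().get_position_encoding(torch.zeros(1, 68, 560))
pos_enc = pos_enc.numpy().astype(np.float32)

sp = spm.SentencePieceProcessor(model_file="ax_model/chn_jpn_yue_eng_ko_spectok.bpe.model")

text = asr("example.wav", language="auto", withitn=True,
           position_encoding=pos_enc, tokenizer=sp)
print(text)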
utils/ax_vad_bin.py ADDED
@@ -0,0 +1,156 @@
1
+ # -*- encoding: utf-8 -*-
2
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
3
+ # MIT License (https://opensource.org/licenses/MIT)
4
+
5
+ import os.path
6
+ from typing import List, Tuple
7
+
8
+ import numpy as np
9
+
10
+ from utils.utils.utils import read_yaml
11
+ from utils.utils.frontend import WavFrontend
12
+ from utils.utils.e2e_vad import E2EVadModel
13
+ import axengine as axe
14
+
15
+ class AX_Fsmn_vad:
16
+ def __init__(self, model_dir, batch_size=1, max_end_sil=None):
17
+ """Initialize VAD model for inference"""
18
+
19
+ # Export model if needed
20
+ model_file = os.path.join(model_dir, "vad.axmodel")
21
+
22
+ # Load config and frontend
23
+ config_file = os.path.join(model_dir, "vad/config.yaml")
24
+ cmvn_file = os.path.join(model_dir, "vad/am.mvn")
25
+ self.config = read_yaml(config_file)
26
+ self.frontend = WavFrontend(cmvn_file=cmvn_file, **self.config["frontend_conf"])
27
+ self.session = axe.InferenceSession(model_file, providers='AxEngineExecutionProvider')
28
+ self.batch_size = batch_size
29
+ self.vad_scorer = E2EVadModel(self.config["model_conf"])
30
+ self.max_end_sil = max_end_sil if max_end_sil is not None else self.config["model_conf"]["max_end_silence_time"]
31
+
32
+ def extract_feat(self, waveform_list):
33
+ """Extract features from waveform"""
34
+ feats, feats_len = [], []
35
+ for waveform in waveform_list:
36
+ speech, _ = self.frontend.fbank(waveform)
37
+ feat, feat_len = self.frontend.lfr_cmvn(speech)
38
+ feats.append(feat)
39
+ feats_len.append(feat_len)
40
+
41
+ max_len = max(feats_len)
42
+ padded_feats = [np.pad(f, ((0, max_len - f.shape[0]), (0, 0)), 'constant') for f in feats]
43
+ feats = np.array(padded_feats).astype(np.float32)
44
+ feats_len = np.array(feats_len).astype(np.int32)
45
+ return feats, feats_len
46
+
47
+ def infer(self, feats: List) -> Tuple[np.ndarray, np.ndarray]:
48
+ """Run inference with ONNX Runtime"""
49
+ # Get all input names from the model
50
+ input_names = [input.name for input in self.session.get_inputs()]
51
+ output_names = [x.name for x in self.session.get_outputs()]
52
+
53
+ # Create input dictionary for all inputs
54
+ input_dict = {}
55
+ for i, (name, tensor) in enumerate(zip(input_names, feats)):
56
+ input_dict[name] = tensor
57
+
58
+ # Run inference with all inputs
59
+ outputs = self.session.run(output_names, input_dict)
60
+ scores, out_caches = outputs[0], outputs[1:]
61
+ return scores, out_caches
62
+
63
+ def __call__(self, wav_file, **kwargs):
64
+ """Process audio file with sliding window approach"""
65
+ # Load audio and prepare data
66
+ # waveform = self.load_wav(wav_file)
67
+ # waveform, _ = librosa.load(wav_file, sr=16000)
68
+ waveform_list = [wav_file]
69
+ waveform_nums = len(waveform_list)
70
+ is_final = kwargs.get("is_final", False)
71
+ segments = [[]] * self.batch_size
72
+
73
+ for beg_idx in range(0, waveform_nums, self.batch_size):
74
+ vad_scorer = E2EVadModel(self.config["model_conf"])
75
+ end_idx = min(waveform_nums, beg_idx + self.batch_size)
76
+ waveform = waveform_list[beg_idx:end_idx]
77
+ feats, feats_len = self.extract_feat(waveform)
78
+ waveform = np.array(waveform)
79
+ param_dict = kwargs.get("param_dict", dict())
80
+ in_cache = param_dict.get("in_cache", list())
81
+ in_cache = self.prepare_cache(in_cache)
82
+
83
+ t_offset = 0
84
+ step = int(min(feats_len.max(), 6000))
85
+ for t_offset in range(0, int(feats_len), min(step, feats_len - t_offset)):
86
+ if t_offset + step >= feats_len - 1:
87
+ step = feats_len - t_offset
88
+ is_final = True
89
+ else:
90
+ is_final = False
91
+
92
+ # Extract feature segment
93
+ feats_package = feats[:, t_offset:int(t_offset + step), :]
94
+
95
+ # Pad if it's the final segment
96
+ if is_final:
97
+ pad_length = 6000 - int(step)
98
+ feats_package = np.pad(
99
+ feats_package,
100
+ ((0, 0), (0, pad_length), (0, 0)),
101
+ mode='constant',
102
+ constant_values=0
103
+ )
104
+
105
+ # Extract corresponding waveform segment
106
+ waveform_package = waveform[
107
+ :,
108
+ t_offset * 160:min(waveform.shape[-1], (int(t_offset + step) - 1) * 160 + 400),
109
+ ]
110
+
111
+ # Pad waveform if it's the final segment
112
+ if is_final:
113
+ expected_wave_length = 6000 * 160 + 240
114
+ current_wave_length = waveform_package.shape[-1]
115
+ pad_wave_length = expected_wave_length - current_wave_length
116
+ if pad_wave_length > 0:
117
+ waveform_package = np.pad(
118
+ waveform_package,
119
+ ((0, 0), (0, pad_wave_length)),
120
+ mode='constant',
121
+ constant_values=0
122
+ )
123
+
124
+ # Run inference
125
+ inputs = [feats_package]
126
+ inputs.extend(in_cache)
127
+ scores, out_caches = self.infer(inputs)
128
+ in_cache = out_caches
129
+
130
+ # Get VAD segments for this chunk
131
+ segments_part = vad_scorer(
132
+ scores,
133
+ waveform_package,
134
+ is_final=is_final,
135
+ max_end_sil=self.max_end_sil,
136
+ online=False,
137
+ )
138
+
139
+ # Accumulate segments
140
+ if segments_part:
141
+ for batch_num in range(0, self.batch_size):
142
+ segments[batch_num] += segments_part[batch_num]
143
+
144
+ return segments
145
+
146
+ def prepare_cache(self, in_cache: list = []):
147
+ if len(in_cache) > 0:
148
+ return in_cache
149
+ fsmn_layers = 4
150
+ proj_dim = 128
151
+ lorder = 20
152
+ for i in range(fsmn_layers):
153
+ cache = np.zeros((1, proj_dim, lorder - 1, 1)).astype(np.float32)
154
+ in_cache.append(cache)
155
+ return in_cache
156
+
utils/ctc_alignment.py ADDED
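A minimal sketch of calling the VAD wrapper above. Note that __call__ expects an already-loaded 16 kHz float waveform (the librosa load is commented out inside) and returns one list of [start_ms, end_ms] segments per batch entry; model_dir is assumed to be the folder holding vad.axmodel plus the vad/ config shipped here, and the wav path is a placeholder.

# Sketch: voice-activity detection with the AXEngine FSMN-VAD wrapper above.
import librosa
from utils.ax_vad_bin import AX_Fsmn_vad

vad = AX_Fsmn_vad(model_dir="ax_model", batch_size=1)
waveform, _ = librosa.load("example.wav", sr=16000)   # placeholder wav path

segments = vad(waveform)
for start_ms, end_ms in segments[0]:
    print(f"speech from {start_ms} ms to {end_ms} ms")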
@@ -0,0 +1,76 @@
 
1
+ import torch
2
+
3
+ def ctc_forced_align(
4
+ log_probs: torch.Tensor,
5
+ targets: torch.Tensor,
6
+ input_lengths: torch.Tensor,
7
+ target_lengths: torch.Tensor,
8
+ blank: int = 0,
9
+ ignore_id: int = -1,
10
+ ) -> torch.Tensor:
11
+ """Align a CTC label sequence to an emission.
12
+
13
+ Args:
14
+ log_probs (Tensor): log probability of CTC emission output.
15
+ Tensor of shape `(B, T, C)`. where `B` is the batch size, `T` is the input length,
16
+ `C` is the number of characters in alphabet including blank.
17
+ targets (Tensor): Target sequence. Tensor of shape `(B, L)`,
18
+ where `L` is the target length.
19
+ input_lengths (Tensor):
20
+ Lengths of the inputs (max value must each be <= `T`). 1-D Tensor of shape `(B,)`.
21
+ target_lengths (Tensor):
22
+ Lengths of the targets. 1-D Tensor of shape `(B,)`.
23
+ blank_id (int, optional): The index of blank symbol in CTC emission. (Default: 0)
24
+ ignore_id (int, optional): The index of ignore symbol in CTC emission. (Default: -1)
25
+ """
26
+ targets[targets == ignore_id] = blank
27
+
28
+ batch_size, input_time_size, _ = log_probs.size()
29
+ bsz_indices = torch.arange(batch_size, device=input_lengths.device)
30
+
31
+ _t_a_r_g_e_t_s_ = torch.cat(
32
+ (
33
+ torch.stack((torch.full_like(targets, blank), targets), dim=-1).flatten(start_dim=1),
34
+ torch.full_like(targets[:, :1], blank),
35
+ ),
36
+ dim=-1,
37
+ )
38
+ diff_labels = torch.cat(
39
+ (
40
+ torch.as_tensor([[False, False]], device=targets.device).expand(batch_size, -1),
41
+ _t_a_r_g_e_t_s_[:, 2:] != _t_a_r_g_e_t_s_[:, :-2],
42
+ ),
43
+ dim=1,
44
+ )
45
+
46
+ neg_inf = torch.tensor(float("-inf"), device=log_probs.device, dtype=log_probs.dtype)
47
+ padding_num = 2
48
+ padded_t = padding_num + _t_a_r_g_e_t_s_.size(-1)
49
+ best_score = torch.full((batch_size, padded_t), neg_inf, device=log_probs.device, dtype=log_probs.dtype)
50
+ best_score[:, padding_num + 0] = log_probs[:, 0, blank]
51
+ best_score[:, padding_num + 1] = log_probs[bsz_indices, 0, _t_a_r_g_e_t_s_[:, 1]]
52
+
53
+ backpointers = torch.zeros((batch_size, input_time_size, padded_t), device=log_probs.device, dtype=targets.dtype)
54
+
55
+ for t in range(1, input_time_size):
56
+ prev = torch.stack(
57
+ (best_score[:, 2:], best_score[:, 1:-1], torch.where(diff_labels, best_score[:, :-2], neg_inf))
58
+ )
59
+ prev_max_value, prev_max_idx = prev.max(dim=0)
60
+ best_score[:, padding_num:] = log_probs[:, t].gather(-1, _t_a_r_g_e_t_s_) + prev_max_value
61
+ backpointers[:, t, padding_num:] = prev_max_idx
62
+
63
+ l1l2 = best_score.gather(
64
+ -1, torch.stack((padding_num + target_lengths * 2 - 1, padding_num + target_lengths * 2), dim=-1)
65
+ )
66
+
67
+ path = torch.zeros((batch_size, input_time_size), device=best_score.device, dtype=torch.long)
68
+ path[bsz_indices, input_lengths - 1] = padding_num + target_lengths * 2 - 1 + l1l2.argmax(dim=-1)
69
+
70
+ for t in range(input_time_size - 1, 0, -1):
71
+ target_indices = path[:, t]
72
+ prev_max_idx = backpointers[bsz_indices, t, target_indices]
73
+ path[:, t - 1] += target_indices - prev_max_idx
74
+
75
+ alignments = _t_a_r_g_e_t_s_.gather(dim=-1, index=(path - padding_num).clamp(min=0))
76
+ return alignments
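A toy, self-contained example of ctc_forced_align above: random emissions, two target labels, shapes as in the docstring ((B, T, C) and (B, L)); all values are made up for illustration.

# Sketch: toy forced alignment with the function above.
import torch
from utils.ctc_alignment import ctc_forced_align

torch.manual_seed(0)
B, T, C = 1, 6, 5                      # batch, frames, vocab size (id 0 is the CTC blank)
log_probs = torch.randn(B, T, C).log_softmax(dim=-1)
targets = torch.tensor([[2, 4]])       # label sequence to align
input_lengths = torch.tensor([T])
target_lengths = torch.tensor([2])

align = ctc_forced_align(log_probs, targets, input_lengths, target_lengths, blank=0)
print(align)   # shape (1, 6): per-frame label ids, 0 where the blank is emitted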
utils/frontend.py ADDED
@@ -0,0 +1,433 @@
1
+ # -*- encoding: utf-8 -*-
2
+ from pathlib import Path
3
+ from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
4
+ import copy
5
+
6
+ import numpy as np
7
+ import kaldi_native_fbank as knf
8
+
9
+ root_dir = Path(__file__).resolve().parent
10
+
11
+ logger_initialized = {}
12
+
13
+
14
+ class WavFrontend:
15
+ """Conventional frontend structure for ASR."""
16
+
17
+ def __init__(
18
+ self,
19
+ cmvn_file: str = None,
20
+ fs: int = 16000,
21
+ window: str = "hamming",
22
+ n_mels: int = 80,
23
+ frame_length: int = 25,
24
+ frame_shift: int = 10,
25
+ lfr_m: int = 1,
26
+ lfr_n: int = 1,
27
+ dither: float = 1.0,
28
+ **kwargs,
29
+ ) -> None:
30
+
31
+ opts = knf.FbankOptions()
32
+ opts.frame_opts.samp_freq = fs
33
+ opts.frame_opts.dither = dither
34
+ opts.frame_opts.window_type = window
35
+ opts.frame_opts.frame_shift_ms = float(frame_shift)
36
+ opts.frame_opts.frame_length_ms = float(frame_length)
37
+ opts.mel_opts.num_bins = n_mels
38
+ opts.energy_floor = 0
39
+ opts.frame_opts.snip_edges = True
40
+ opts.mel_opts.debug_mel = False
41
+ self.opts = opts
42
+
43
+ self.lfr_m = lfr_m
44
+ self.lfr_n = lfr_n
45
+ self.cmvn_file = cmvn_file
46
+
47
+ if self.cmvn_file:
48
+ self.cmvn = self.load_cmvn()
49
+ self.fbank_fn = None
50
+ self.fbank_beg_idx = 0
51
+ self.reset_status()
52
+
53
+ def fbank(self, waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
54
+ waveform = waveform * (1 << 15)
55
+ self.fbank_fn = knf.OnlineFbank(self.opts)
56
+ self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
57
+ frames = self.fbank_fn.num_frames_ready
58
+ mat = np.empty([frames, self.opts.mel_opts.num_bins])
59
+ for i in range(frames):
60
+ mat[i, :] = self.fbank_fn.get_frame(i)
61
+ feat = mat.astype(np.float32)
62
+ feat_len = np.array(mat.shape[0]).astype(np.int32)
63
+ return feat, feat_len
64
+
65
+ def fbank_online(self, waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
66
+ waveform = waveform * (1 << 15)
67
+ # self.fbank_fn = knf.OnlineFbank(self.opts)
68
+ self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
69
+ frames = self.fbank_fn.num_frames_ready
70
+ mat = np.empty([frames, self.opts.mel_opts.num_bins])
71
+ for i in range(self.fbank_beg_idx, frames):
72
+ mat[i, :] = self.fbank_fn.get_frame(i)
73
+ # self.fbank_beg_idx += (frames-self.fbank_beg_idx)
74
+ feat = mat.astype(np.float32)
75
+ feat_len = np.array(mat.shape[0]).astype(np.int32)
76
+ return feat, feat_len
77
+
78
+ def reset_status(self):
79
+ self.fbank_fn = knf.OnlineFbank(self.opts)
80
+ self.fbank_beg_idx = 0
81
+
82
+ def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
83
+ if self.lfr_m != 1 or self.lfr_n != 1:
84
+ feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
85
+
86
+ if self.cmvn_file:
87
+ feat = self.apply_cmvn(feat)
88
+
89
+ feat_len = np.array(feat.shape[0]).astype(np.int32)
90
+ return feat, feat_len
91
+
92
+ @staticmethod
93
+ def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
94
+ LFR_inputs = []
95
+
96
+ T = inputs.shape[0]
97
+ T_lfr = int(np.ceil(T / lfr_n))
98
+ left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
99
+ inputs = np.vstack((left_padding, inputs))
100
+ T = T + (lfr_m - 1) // 2
101
+ for i in range(T_lfr):
102
+ if lfr_m <= T - i * lfr_n:
103
+ LFR_inputs.append((inputs[i * lfr_n : i * lfr_n + lfr_m]).reshape(1, -1))
104
+ else:
105
+ # process last LFR frame
106
+ num_padding = lfr_m - (T - i * lfr_n)
107
+ frame = inputs[i * lfr_n :].reshape(-1)
108
+ for _ in range(num_padding):
109
+ frame = np.hstack((frame, inputs[-1]))
110
+
111
+ LFR_inputs.append(frame)
112
+ LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
113
+ return LFR_outputs
114
+
115
+ def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
116
+ """
117
+ Apply CMVN with mvn data
118
+ """
119
+ frame, dim = inputs.shape
120
+ means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
121
+ vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
122
+ inputs = (inputs + means) * vars
123
+ return inputs
124
+
125
+ def load_cmvn(
126
+ self,
127
+ ) -> np.ndarray:
128
+ with open(self.cmvn_file, "r", encoding="utf-8") as f:
129
+ lines = f.readlines()
130
+
131
+ means_list = []
132
+ vars_list = []
133
+ for i in range(len(lines)):
134
+ line_item = lines[i].split()
135
+ if line_item[0] == "<AddShift>":
136
+ line_item = lines[i + 1].split()
137
+ if line_item[0] == "<LearnRateCoef>":
138
+ add_shift_line = line_item[3 : (len(line_item) - 1)]
139
+ means_list = list(add_shift_line)
140
+ continue
141
+ elif line_item[0] == "<Rescale>":
142
+ line_item = lines[i + 1].split()
143
+ if line_item[0] == "<LearnRateCoef>":
144
+ rescale_line = line_item[3 : (len(line_item) - 1)]
145
+ vars_list = list(rescale_line)
146
+ continue
147
+
148
+ means = np.array(means_list).astype(np.float64)
149
+ vars = np.array(vars_list).astype(np.float64)
150
+ cmvn = np.array([means, vars])
151
+ return cmvn
152
+
153
+
154
+ class WavFrontendOnline(WavFrontend):
155
+ def __init__(self, **kwargs):
156
+ super().__init__(**kwargs)
157
+ # self.fbank_fn = knf.OnlineFbank(self.opts)
158
+ # add variables
159
+ self.frame_sample_length = int(
160
+ self.opts.frame_opts.frame_length_ms * self.opts.frame_opts.samp_freq / 1000
161
+ )
162
+ self.frame_shift_sample_length = int(
163
+ self.opts.frame_opts.frame_shift_ms * self.opts.frame_opts.samp_freq / 1000
164
+ )
165
+ self.waveform = None
166
+ self.reserve_waveforms = None
167
+ self.input_cache = None
168
+ self.lfr_splice_cache = []
169
+
170
+ @staticmethod
171
+ # the cache has already been concatenated onto inputs
172
+ def apply_lfr(
173
+ inputs: np.ndarray, lfr_m: int, lfr_n: int, is_final: bool = False
174
+ ) -> Tuple[np.ndarray, np.ndarray, int]:
175
+ """
176
+ Apply lfr with data
177
+ """
178
+
179
+ LFR_inputs = []
180
+ T = inputs.shape[0] # include the right context
181
+ T_lfr = int(
182
+ np.ceil((T - (lfr_m - 1) // 2) / lfr_n)
183
+ ) # minus the right context: (lfr_m - 1) // 2
184
+ splice_idx = T_lfr
185
+ for i in range(T_lfr):
186
+ if lfr_m <= T - i * lfr_n:
187
+ LFR_inputs.append((inputs[i * lfr_n : i * lfr_n + lfr_m]).reshape(1, -1))
188
+ else: # process last LFR frame
189
+ if is_final:
190
+ num_padding = lfr_m - (T - i * lfr_n)
191
+ frame = (inputs[i * lfr_n :]).reshape(-1)
192
+ for _ in range(num_padding):
193
+ frame = np.hstack((frame, inputs[-1]))
194
+ LFR_inputs.append(frame)
195
+ else:
196
+ # update splice_idx and break the circle
197
+ splice_idx = i
198
+ break
199
+ splice_idx = min(T - 1, splice_idx * lfr_n)
200
+ lfr_splice_cache = inputs[splice_idx:, :]
201
+ LFR_outputs = np.vstack(LFR_inputs)
202
+ return LFR_outputs.astype(np.float32), lfr_splice_cache, splice_idx
203
+
204
+ @staticmethod
205
+ def compute_frame_num(
206
+ sample_length: int, frame_sample_length: int, frame_shift_sample_length: int
207
+ ) -> int:
208
+ frame_num = int((sample_length - frame_sample_length) / frame_shift_sample_length + 1)
209
+ return frame_num if frame_num >= 1 and sample_length >= frame_sample_length else 0
210
+
211
+ def fbank(
212
+ self, input: np.ndarray, input_lengths: np.ndarray
213
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
214
+ self.fbank_fn = knf.OnlineFbank(self.opts)
215
+ batch_size = input.shape[0]
216
+ if self.input_cache is None:
217
+ self.input_cache = np.empty((batch_size, 0), dtype=np.float32)
218
+ input = np.concatenate((self.input_cache, input), axis=1)
219
+ frame_num = self.compute_frame_num(
220
+ input.shape[-1], self.frame_sample_length, self.frame_shift_sample_length
221
+ )
222
+ # update self.in_cache
223
+ self.input_cache = input[
224
+ :, -(input.shape[-1] - frame_num * self.frame_shift_sample_length) :
225
+ ]
226
+ waveforms = np.empty(0, dtype=np.float32)
227
+ feats_pad = np.empty(0, dtype=np.float32)
228
+ feats_lens = np.empty(0, dtype=np.int32)
229
+ if frame_num:
230
+ waveforms = []
231
+ feats = []
232
+ feats_lens = []
233
+ for i in range(batch_size):
234
+ waveform = input[i]
235
+ waveforms.append(
236
+ waveform[
237
+ : (
238
+ (frame_num - 1) * self.frame_shift_sample_length
239
+ + self.frame_sample_length
240
+ )
241
+ ]
242
+ )
243
+ waveform = waveform * (1 << 15)
244
+
245
+ self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
246
+ frames = self.fbank_fn.num_frames_ready
247
+ mat = np.empty([frames, self.opts.mel_opts.num_bins])
248
+ for i in range(frames):
249
+ mat[i, :] = self.fbank_fn.get_frame(i)
250
+ feat = mat.astype(np.float32)
251
+ feat_len = np.array(mat.shape[0]).astype(np.int32)
252
+ feats.append(feat)
253
+ feats_lens.append(feat_len)
254
+
255
+ waveforms = np.stack(waveforms)
256
+ feats_lens = np.array(feats_lens)
257
+ feats_pad = np.array(feats)
258
+ self.fbanks = feats_pad
259
+ self.fbanks_lens = copy.deepcopy(feats_lens)
260
+ return waveforms, feats_pad, feats_lens
261
+
262
+ def get_fbank(self) -> Tuple[np.ndarray, np.ndarray]:
263
+ return self.fbanks, self.fbanks_lens
264
+
265
+ def lfr_cmvn(
266
+ self, input: np.ndarray, input_lengths: np.ndarray, is_final: bool = False
267
+ ) -> Tuple[np.ndarray, np.ndarray, List[int]]:
268
+ batch_size = input.shape[0]
269
+ feats = []
270
+ feats_lens = []
271
+ lfr_splice_frame_idxs = []
272
+ for i in range(batch_size):
273
+ mat = input[i, : input_lengths[i], :]
274
+ lfr_splice_frame_idx = -1
275
+ if self.lfr_m != 1 or self.lfr_n != 1:
276
+ # update self.lfr_splice_cache in self.apply_lfr
277
+ mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(
278
+ mat, self.lfr_m, self.lfr_n, is_final
279
+ )
280
+ if self.cmvn_file is not None:
281
+ mat = self.apply_cmvn(mat)
282
+ feat_length = mat.shape[0]
283
+ feats.append(mat)
284
+ feats_lens.append(feat_length)
285
+ lfr_splice_frame_idxs.append(lfr_splice_frame_idx)
286
+
287
+ feats_lens = np.array(feats_lens)
288
+ feats_pad = np.array(feats)
289
+ return feats_pad, feats_lens, lfr_splice_frame_idxs
290
+
291
+ def extract_fbank(
292
+ self, input: np.ndarray, input_lengths: np.ndarray, is_final: bool = False
293
+ ) -> Tuple[np.ndarray, np.ndarray]:
294
+ batch_size = input.shape[0]
295
+ assert (
296
+ batch_size == 1
297
+ ), "online feature extraction currently only supports batch size 1"
298
+ waveforms, feats, feats_lengths = self.fbank(input, input_lengths) # input shape: B T D
299
+ if feats.shape[0]:
300
+ self.waveforms = (
301
+ waveforms
302
+ if self.reserve_waveforms is None
303
+ else np.concatenate((self.reserve_waveforms, waveforms), axis=1)
304
+ )
305
+ if not self.lfr_splice_cache:
306
+ for i in range(batch_size):
307
+ self.lfr_splice_cache.append(
308
+ np.expand_dims(feats[i][0, :], axis=0).repeat((self.lfr_m - 1) // 2, axis=0)
309
+ )
310
+
311
+ if feats_lengths[0] + self.lfr_splice_cache[0].shape[0] >= self.lfr_m:
312
+ lfr_splice_cache_np = np.stack(self.lfr_splice_cache) # B T D
313
+ feats = np.concatenate((lfr_splice_cache_np, feats), axis=1)
314
+ feats_lengths += lfr_splice_cache_np[0].shape[0]
315
+ frame_from_waveforms = int(
316
+ (self.waveforms.shape[1] - self.frame_sample_length)
317
+ / self.frame_shift_sample_length
318
+ + 1
319
+ )
320
+ minus_frame = (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0
321
+ feats, feats_lengths, lfr_splice_frame_idxs = self.lfr_cmvn(
322
+ feats, feats_lengths, is_final
323
+ )
324
+ if self.lfr_m == 1:
325
+ self.reserve_waveforms = None
326
+ else:
327
+ reserve_frame_idx = lfr_splice_frame_idxs[0] - minus_frame
328
+ # print('reserve_frame_idx: ' + str(reserve_frame_idx))
329
+ # print('frame_frame: ' + str(frame_from_waveforms))
330
+ self.reserve_waveforms = self.waveforms[
331
+ :,
332
+ reserve_frame_idx
333
+ * self.frame_shift_sample_length : frame_from_waveforms
334
+ * self.frame_shift_sample_length,
335
+ ]
336
+ sample_length = (
337
+ frame_from_waveforms - 1
338
+ ) * self.frame_shift_sample_length + self.frame_sample_length
339
+ self.waveforms = self.waveforms[:, :sample_length]
340
+ else:
341
+ # update self.reserve_waveforms and self.lfr_splice_cache
342
+ self.reserve_waveforms = self.waveforms[
343
+ :, : -(self.frame_sample_length - self.frame_shift_sample_length)
344
+ ]
345
+ for i in range(batch_size):
346
+ self.lfr_splice_cache[i] = np.concatenate(
347
+ (self.lfr_splice_cache[i], feats[i]), axis=0
348
+ )
349
+ return np.empty(0, dtype=np.float32), feats_lengths
350
+ else:
351
+ if is_final:
352
+ self.waveforms = (
353
+ waveforms if self.reserve_waveforms is None else self.reserve_waveforms
354
+ )
355
+ feats = np.stack(self.lfr_splice_cache)
356
+ feats_lengths = np.zeros(batch_size, dtype=np.int32) + feats.shape[1]
357
+ feats, feats_lengths, _ = self.lfr_cmvn(feats, feats_lengths, is_final)
358
+ if is_final:
359
+ self.cache_reset()
360
+ return feats, feats_lengths
361
+
362
+ def get_waveforms(self):
363
+ return self.waveforms
364
+
365
+ def cache_reset(self):
366
+ self.fbank_fn = knf.OnlineFbank(self.opts)
367
+ self.reserve_waveforms = None
368
+ self.input_cache = None
369
+ self.lfr_splice_cache = []
370
+
371
+
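The class above caches raw samples, fbank frames and LFR splices across calls so that features can be produced chunk by chunk. A minimal streaming sketch follows; the chunk size, LFR factors and cmvn path are illustrative assumptions, not values taken from this file (the real ones come from the model's config.yaml and am.mvn):

import numpy as np

# hypothetical streaming feature extraction with WavFrontendOnline (all sizes illustrative)
online_frontend = WavFrontendOnline(
    cmvn_file="ax_model/sensevoice/am.mvn",  # assumed path inside this repo
    lfr_m=7, lfr_n=6,                        # assumed LFR factors, see frontend_conf in config.yaml
)
stream = np.random.randn(1, 16000).astype(np.float32)   # 1 s of fake 16 kHz audio, shape (1, samples)
chunks = np.split(stream, 10, axis=1)                    # 100 ms chunks
for i, chunk in enumerate(chunks):
    feats, feats_len = online_frontend.extract_fbank(
        chunk, np.array([chunk.shape[1]]), is_final=(i == len(chunks) - 1)
    )
    if feats.size:   # empty while frames are still being cached for the next LFR window
        pass         # feed feats to the streaming acoustic model here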
372
+ def load_bytes(input):
373
+ middle_data = np.frombuffer(input, dtype=np.int16)
374
+ middle_data = np.asarray(middle_data)
375
+ if middle_data.dtype.kind not in "iu":
376
+ raise TypeError("'middle_data' must be an array of integers")
377
+ dtype = np.dtype("float32")
378
+ if dtype.kind != "f":
379
+ raise TypeError("'dtype' must be a floating point type")
380
+
381
+ i = np.iinfo(middle_data.dtype)
382
+ abs_max = 2 ** (i.bits - 1)
383
+ offset = i.min + abs_max
384
+ array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
385
+ return array
386
+
387
+
388
+ class SinusoidalPositionEncoderOnline:
389
+ """Streaming Positional encoding."""
390
+
391
+ def encode(self, positions: np.ndarray = None, depth: int = None, dtype: np.dtype = np.float32):
392
+ batch_size = positions.shape[0]
393
+ positions = positions.astype(dtype)
394
+ log_timescale_increment = np.log(np.array([10000], dtype=dtype)) / (depth / 2 - 1)
395
+ inv_timescales = np.exp(np.arange(depth / 2).astype(dtype) * (-log_timescale_increment))
396
+ inv_timescales = np.reshape(inv_timescales, [batch_size, -1])
397
+ scaled_time = np.reshape(positions, [1, -1, 1]) * np.reshape(inv_timescales, [1, 1, -1])
398
+ encoding = np.concatenate((np.sin(scaled_time), np.cos(scaled_time)), axis=2)
399
+ return encoding.astype(dtype)
400
+
401
+ def forward(self, x, start_idx=0):
402
+ batch_size, timesteps, input_dim = x.shape
403
+ positions = np.arange(1, timesteps + 1 + start_idx)[None, :]
404
+ position_encoding = self.encode(positions, input_dim, x.dtype)
405
+
406
+ return x + position_encoding[:, start_idx : start_idx + timesteps]
407
+
408
+
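A small sketch of driving the streaming positional encoder above: each new chunk of encoder features is offset by start_idx so the sinusoidal positions stay continuous across chunks. The 560-dim features and the chunk split are illustrative assumptions:

import numpy as np

# hypothetical chunked use of SinusoidalPositionEncoderOnline (shapes are illustrative)
pos_encoder = SinusoidalPositionEncoderOnline()
features = np.random.randn(1, 30, 560).astype(np.float32)   # (batch, frames, dim), dim must be even
start_idx = 0
for chunk in np.split(features, 3, axis=1):                  # three 10-frame chunks
    encoded = pos_encoder.forward(chunk, start_idx=start_idx)
    start_idx += chunk.shape[1]                               # keep positions continuous across chunks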
409
+ def test():
410
+ path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
411
+ import librosa
412
+
413
+ cmvn_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn"
414
+ config_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml"
415
+ from funasr.runtime.python.onnxruntime.rapid_paraformer.utils.utils import read_yaml
416
+
417
+ config = read_yaml(config_file)
418
+ waveform, _ = librosa.load(path, sr=None)
419
+ frontend = WavFrontend(
420
+ cmvn_file=cmvn_file,
421
+ **config["frontend_conf"],
422
+ )
423
+ speech, _ = frontend.fbank_online(waveform) # 1d, (sample,), numpy
424
+ feat, feat_len = frontend.lfr_cmvn(
425
+ speech
426
+ ) # 2d, (frame, 450), np.float32 -> torch, torch.from_numpy(), dtype, (1, frame, 450)
427
+
428
+ frontend.reset_status() # clear cache
429
+ return feat, feat_len
430
+
431
+
432
+ if __name__ == "__main__":
433
+ test()
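Beyond the hard-coded paths in test() above, the offline path of this frontend is simply fbank followed by lfr_cmvn, and load_bytes converts raw 16-bit PCM into the float waveform that fbank expects. A minimal sketch; the cmvn path, the PCM file name and the LFR factors are assumptions for illustration:

# hypothetical offline feature extraction with the WavFrontend defined above
frontend = WavFrontend(
    cmvn_file="ax_model/sensevoice/am.mvn",  # assumed path inside this repo
    fs=16000, n_mels=80,
    lfr_m=7, lfr_n=6,                        # illustrative; the real factors live in config.yaml
)

with open("example_16k_mono.pcm", "rb") as f:   # hypothetical 16 kHz, 16-bit mono PCM file
    waveform = load_bytes(f.read())              # int16 bytes -> float32 samples in [-1, 1)

feat, feat_len = frontend.fbank(waveform)        # (frames, 80) log-mel filterbank
feat, feat_len = frontend.lfr_cmvn(feat)         # LFR stacking + CMVN -> (frames', 80 * lfr_m)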
utils/infer_utils.py ADDED
@@ -0,0 +1,312 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ import functools
4
+ import logging
5
+ from pathlib import Path
6
+ from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
7
+
8
+ import re
9
+ import numpy as np
10
+ import yaml
11
+
12
+ import jieba
13
+ import warnings
14
+
15
+ root_dir = Path(__file__).resolve().parent
16
+
17
+ logger_initialized = {}
18
+
19
+
20
+ def pad_list(xs, pad_value, max_len=None):
21
+ n_batch = len(xs)
22
+ if max_len is None:
23
+ max_len = max(x.shape[0] for x in xs)
24
+ # pad = xs[0].new(n_batch, max_len, *xs[0].size()[1:]).fill_(pad_value)
25
+ # numpy format
26
+ pad = (np.zeros((n_batch, max_len)) + pad_value).astype(np.int32)
27
+ for i in range(n_batch):
28
+ pad[i, : xs[i].shape[0]] = xs[i]
29
+
30
+ return pad
31
+
32
+
33
+ """
34
+ def make_pad_mask(lengths, xs=None, length_dim=-1, maxlen=None):
35
+ if length_dim == 0:
36
+ raise ValueError("length_dim cannot be 0: {}".format(length_dim))
37
+
38
+ if not isinstance(lengths, list):
39
+ lengths = lengths.tolist()
40
+ bs = int(len(lengths))
41
+ if maxlen is None:
42
+ if xs is None:
43
+ maxlen = int(max(lengths))
44
+ else:
45
+ maxlen = xs.size(length_dim)
46
+ else:
47
+ assert xs is None
48
+ assert maxlen >= int(max(lengths))
49
+
50
+ seq_range = torch.arange(0, maxlen, dtype=torch.int64)
51
+ seq_range_expand = seq_range.unsqueeze(0).expand(bs, maxlen)
52
+ seq_length_expand = seq_range_expand.new(lengths).unsqueeze(-1)
53
+ mask = seq_range_expand >= seq_length_expand
54
+
55
+ if xs is not None:
56
+ assert xs.size(0) == bs, (xs.size(0), bs)
57
+
58
+ if length_dim < 0:
59
+ length_dim = xs.dim() + length_dim
60
+ # ind = (:, None, ..., None, :, , None, ..., None)
61
+ ind = tuple(
62
+ slice(None) if i in (0, length_dim) else None for i in range(xs.dim())
63
+ )
64
+ mask = mask[ind].expand_as(xs).to(xs.device)
65
+ return mask
66
+ """
67
+
68
+
69
+ class TokenIDConverter:
70
+ def __init__(
71
+ self,
72
+ token_list: Union[List, str],
73
+ ):
74
+
75
+ self.token_list = token_list
76
+ self.unk_symbol = token_list[-1]
77
+ self.token2id = {v: i for i, v in enumerate(self.token_list)}
78
+ self.unk_id = self.token2id[self.unk_symbol]
79
+
80
+ def get_num_vocabulary_size(self) -> int:
81
+ return len(self.token_list)
82
+
83
+ def ids2tokens(self, integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
84
+ if isinstance(integers, np.ndarray) and integers.ndim != 1:
85
+ raise TokenIDConverterError(f"Must be 1 dim ndarray, but got {integers.ndim}")
86
+ return [self.token_list[i] for i in integers]
87
+
88
+ def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
89
+
90
+ return [self.token2id.get(i, self.unk_id) for i in tokens]
91
+
92
+
93
+ class CharTokenizer:
94
+ def __init__(
95
+ self,
96
+ symbol_value: Union[Path, str, Iterable[str]] = None,
97
+ space_symbol: str = "<space>",
98
+ remove_non_linguistic_symbols: bool = False,
99
+ ):
100
+
101
+ self.space_symbol = space_symbol
102
+ self.non_linguistic_symbols = self.load_symbols(symbol_value)
103
+ self.remove_non_linguistic_symbols = remove_non_linguistic_symbols
104
+
105
+ @staticmethod
106
+ def load_symbols(value: Union[Path, str, Iterable[str]] = None) -> Set:
107
+ if value is None:
108
+ return set()
109
+
110
+ if not isinstance(value, (Path, str)):
111
+ return set(value)
112
+
113
+ file_path = Path(value)
114
+ if not file_path.exists():
115
+ logging.warning("%s doesn't exist.", file_path)
116
+ return set()
117
+
118
+ with file_path.open("r", encoding="utf-8") as f:
119
+ return set(line.rstrip() for line in f)
120
+
121
+ def text2tokens(self, line: Union[str, list]) -> List[str]:
122
+ tokens = []
123
+ while len(line) != 0:
124
+ for w in self.non_linguistic_symbols:
125
+ if line.startswith(w):
126
+ if not self.remove_non_linguistic_symbols:
127
+ tokens.append(line[: len(w)])
128
+ line = line[len(w) :]
129
+ break
130
+ else:
131
+ t = line[0]
132
+ if t == " ":
133
+ t = "<space>"
134
+ tokens.append(t)
135
+ line = line[1:]
136
+ return tokens
137
+
138
+ def tokens2text(self, tokens: Iterable[str]) -> str:
139
+ tokens = [t if t != self.space_symbol else " " for t in tokens]
140
+ return "".join(tokens)
141
+
142
+ def __repr__(self):
143
+ return (
144
+ f"{self.__class__.__name__}("
145
+ f'space_symbol="{self.space_symbol}"'
146
+ f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
147
+ f")"
148
+ )
149
+
150
+
151
+ class Hypothesis(NamedTuple):
152
+ """Hypothesis data type."""
153
+
154
+ yseq: np.ndarray
155
+ score: Union[float, np.ndarray] = 0
156
+ scores: Dict[str, Union[float, np.ndarray]] = dict()
157
+ states: Dict[str, Any] = dict()
158
+
159
+ def asdict(self) -> dict:
160
+ """Convert data to JSON-friendly dict."""
161
+ return self._replace(
162
+ yseq=self.yseq.tolist(),
163
+ score=float(self.score),
164
+ scores={k: float(v) for k, v in self.scores.items()},
165
+ )._asdict()
166
+
167
+
168
+ class TokenIDConverterError(Exception):
169
+ pass
170
+
171
+
172
+ class ONNXRuntimeError(Exception):
173
+ pass
174
+
175
+
176
+ def split_to_mini_sentence(words: list, word_limit: int = 20):
177
+ assert word_limit > 1
178
+ if len(words) <= word_limit:
179
+ return [words]
180
+ sentences = []
181
+ length = len(words)
182
+ sentence_len = length // word_limit
183
+ for i in range(sentence_len):
184
+ sentences.append(words[i * word_limit : (i + 1) * word_limit])
185
+ if length % word_limit > 0:
186
+ sentences.append(words[sentence_len * word_limit :])
187
+ return sentences
188
+
189
+
190
+ def code_mix_split_words(text: str):
191
+ words = []
192
+ segs = text.split()
193
+ for seg in segs:
194
+ # There is no space in seg.
195
+ current_word = ""
196
+ for c in seg:
197
+ if len(c.encode()) == 1:
198
+ # This is an ASCII char.
199
+ current_word += c
200
+ else:
201
+ # This is a Chinese char.
202
+ if len(current_word) > 0:
203
+ words.append(current_word)
204
+ current_word = ""
205
+ words.append(c)
206
+ if len(current_word) > 0:
207
+ words.append(current_word)
208
+ return words
209
+
210
+
211
+ def isEnglish(text: str):
212
+ if re.search("^[a-zA-Z']+$", text):
213
+ return True
214
+ else:
215
+ return False
216
+
217
+
218
+ def join_chinese_and_english(input_list):
219
+ line = ""
220
+ for token in input_list:
221
+ if isEnglish(token):
222
+ line = line + " " + token
223
+ else:
224
+ line = line + token
225
+
226
+ line = line.strip()
227
+ return line
228
+
229
+
230
+ def code_mix_split_words_jieba(seg_dict_file: str):
231
+ jieba.load_userdict(seg_dict_file)
232
+
233
+ def _fn(text: str):
234
+ input_list = text.split()
235
+ token_list_all = []
236
+ language_list = []
237
+ token_list_tmp = []
238
+ language_flag = None
239
+ for token in input_list:
240
+ if isEnglish(token) and language_flag == "Chinese":
241
+ token_list_all.append(token_list_tmp)
242
+ language_list.append("Chinese")
243
+ token_list_tmp = []
244
+ elif not isEnglish(token) and language_flag == "English":
245
+ token_list_all.append(token_list_tmp)
246
+ language_list.append("English")
247
+ token_list_tmp = []
248
+
249
+ token_list_tmp.append(token)
250
+
251
+ if isEnglish(token):
252
+ language_flag = "English"
253
+ else:
254
+ language_flag = "Chinese"
255
+
256
+ if token_list_tmp:
257
+ token_list_all.append(token_list_tmp)
258
+ language_list.append(language_flag)
259
+
260
+ result_list = []
261
+ for token_list_tmp, language_flag in zip(token_list_all, language_list):
262
+ if language_flag == "English":
263
+ result_list.extend(token_list_tmp)
264
+ else:
265
+ seg_list = jieba.cut(join_chinese_and_english(token_list_tmp), HMM=False)
266
+ result_list.extend(seg_list)
267
+
268
+ return result_list
269
+
270
+ return _fn
271
+
272
+
273
+ def read_yaml(yaml_path: Union[str, Path]) -> Dict:
274
+ if not Path(yaml_path).exists():
275
+ raise FileNotFoundError(f"The {yaml_path} does not exist.")
276
+
277
+ with open(str(yaml_path), "rb") as f:
278
+ data = yaml.load(f, Loader=yaml.Loader)
279
+ return data
280
+
281
+
282
+ @functools.lru_cache()
283
+ def get_logger(name="funasr_onnx"):
284
+ """Initialize and get a logger by name.
285
+ If the logger has not been initialized, this method will initialize the
286
+ logger by adding one or two handlers, otherwise the initialized logger will
287
+ be directly returned. During initialization, a StreamHandler will always be
288
+ added.
289
+ Args:
290
+ name (str): Logger name.
291
+ Returns:
292
+ logging.Logger: The expected logger.
293
+ """
294
+ logger = logging.getLogger(name)
295
+ if name in logger_initialized:
296
+ return logger
297
+
298
+ for logger_name in logger_initialized:
299
+ if name.startswith(logger_name):
300
+ return logger
301
+
302
+ formatter = logging.Formatter(
303
+ "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%Y/%m/%d %H:%M:%S"
304
+ )
305
+
306
+ sh = logging.StreamHandler()
307
+ sh.setFormatter(formatter)
308
+ logger.addHandler(sh)
309
+ logger_initialized[name] = True
310
+ logger.propagate = False
311
+ logging.basicConfig(level=logging.ERROR)
312
+ return logger
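A self-contained sketch of how the helpers in this file fit together; the toy vocabulary is an assumption for illustration, the real token list ships with the ASR model:

import numpy as np

# toy round trip through the text utilities defined above
token_list = ["hello", "world", "你", "好", "<unk>"]      # last entry is used as the unk symbol
converter = TokenIDConverter(token_list)
ids = converter.tokens2ids(["hello", "你", "missing"])     # unknown tokens fall back to the unk id
print(converter.ids2tokens(ids))                           # ['hello', '你', '<unk>']

tokenizer = CharTokenizer()
chars = tokenizer.text2tokens("hi 你好")                   # ['h', 'i', '<space>', '你', '好']
print(tokenizer.tokens2text(chars))                        # 'hi 你好'

words = code_mix_split_words("hello你好 world")            # ['hello', '你', '好', 'world']
print(split_to_mini_sentence(words, word_limit=2))         # chunks of at most two words

hyp = Hypothesis(yseq=np.array(ids), score=0.0)
print(hyp.asdict())                                        # JSON-friendly dict of the hypothesis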
utils/utils/__init__.py ADDED
File without changes
utils/utils/e2e_vad.py ADDED
@@ -0,0 +1,711 @@
1
+ # -*- encoding: utf-8 -*-
2
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
3
+ # MIT License (https://opensource.org/licenses/MIT)
4
+
5
+ from enum import Enum
6
+ from typing import List, Tuple, Dict, Any
7
+
8
+ import math
9
+ import numpy as np
10
+
11
+
12
+ class VadStateMachine(Enum):
13
+ kVadInStateStartPointNotDetected = 1
14
+ kVadInStateInSpeechSegment = 2
15
+ kVadInStateEndPointDetected = 3
16
+
17
+
18
+ class FrameState(Enum):
19
+ kFrameStateInvalid = -1
20
+ kFrameStateSpeech = 1
21
+ kFrameStateSil = 0
22
+
23
+
24
+ # frame-level voiced/unvoiced state transitions
25
+ class AudioChangeState(Enum):
26
+ kChangeStateSpeech2Speech = 0
27
+ kChangeStateSpeech2Sil = 1
28
+ kChangeStateSil2Sil = 2
29
+ kChangeStateSil2Speech = 3
30
+ kChangeStateNoBegin = 4
31
+ kChangeStateInvalid = 5
32
+
33
+
34
+ class VadDetectMode(Enum):
35
+ kVadSingleUtteranceDetectMode = 0
36
+ kVadMutipleUtteranceDetectMode = 1
37
+
38
+
39
+ class VADXOptions:
40
+ def __init__(
41
+ self,
42
+ sample_rate: int = 16000,
43
+ detect_mode: int = VadDetectMode.kVadMutipleUtteranceDetectMode.value,
44
+ snr_mode: int = 0,
45
+ max_end_silence_time: int = 800,
46
+ max_start_silence_time: int = 3000,
47
+ do_start_point_detection: bool = True,
48
+ do_end_point_detection: bool = True,
49
+ window_size_ms: int = 200,
50
+ sil_to_speech_time_thres: int = 150,
51
+ speech_to_sil_time_thres: int = 150,
52
+ speech_2_noise_ratio: float = 1.0,
53
+ do_extend: int = 1,
54
+ lookback_time_start_point: int = 200,
55
+ lookahead_time_end_point: int = 100,
56
+ max_single_segment_time: int = 60000,
57
+ nn_eval_block_size: int = 8,
58
+ dcd_block_size: int = 4,
59
+ snr_thres: float = -100.0,
60
+ noise_frame_num_used_for_snr: int = 100,
61
+ decibel_thres: float = -100.0,
62
+ speech_noise_thres: float = 0.6,
63
+ fe_prior_thres: float = 1e-4,
64
+ silence_pdf_num: int = 1,
65
+ sil_pdf_ids: List[int] = [0],
66
+ speech_noise_thresh_low: float = -0.1,
67
+ speech_noise_thresh_high: float = 0.3,
68
+ output_frame_probs: bool = False,
69
+ frame_in_ms: int = 10,
70
+ frame_length_ms: int = 25,
71
+ ):
72
+ self.sample_rate = sample_rate
73
+ self.detect_mode = detect_mode
74
+ self.snr_mode = snr_mode
75
+ self.max_end_silence_time = max_end_silence_time
76
+ self.max_start_silence_time = max_start_silence_time
77
+ self.do_start_point_detection = do_start_point_detection
78
+ self.do_end_point_detection = do_end_point_detection
79
+ self.window_size_ms = window_size_ms
80
+ self.sil_to_speech_time_thres = sil_to_speech_time_thres
81
+ self.speech_to_sil_time_thres = speech_to_sil_time_thres
82
+ self.speech_2_noise_ratio = speech_2_noise_ratio
83
+ self.do_extend = do_extend
84
+ self.lookback_time_start_point = lookback_time_start_point
85
+ self.lookahead_time_end_point = lookahead_time_end_point
86
+ self.max_single_segment_time = max_single_segment_time
87
+ self.nn_eval_block_size = nn_eval_block_size
88
+ self.dcd_block_size = dcd_block_size
89
+ self.snr_thres = snr_thres
90
+ self.noise_frame_num_used_for_snr = noise_frame_num_used_for_snr
91
+ self.decibel_thres = decibel_thres
92
+ self.speech_noise_thres = speech_noise_thres
93
+ self.fe_prior_thres = fe_prior_thres
94
+ self.silence_pdf_num = silence_pdf_num
95
+ self.sil_pdf_ids = sil_pdf_ids
96
+ self.speech_noise_thresh_low = speech_noise_thresh_low
97
+ self.speech_noise_thresh_high = speech_noise_thresh_high
98
+ self.output_frame_probs = output_frame_probs
99
+ self.frame_in_ms = frame_in_ms
100
+ self.frame_length_ms = frame_length_ms
101
+
102
+
103
+ class E2EVadSpeechBufWithDoa(object):
104
+ def __init__(self):
105
+ self.start_ms = 0
106
+ self.end_ms = 0
107
+ self.buffer = []
108
+ self.contain_seg_start_point = False
109
+ self.contain_seg_end_point = False
110
+ self.doa = 0
111
+
112
+ def Reset(self):
113
+ self.start_ms = 0
114
+ self.end_ms = 0
115
+ self.buffer = []
116
+ self.contain_seg_start_point = False
117
+ self.contain_seg_end_point = False
118
+ self.doa = 0
119
+
120
+
121
+ class E2EVadFrameProb(object):
122
+ def __init__(self):
123
+ self.noise_prob = 0.0
124
+ self.speech_prob = 0.0
125
+ self.score = 0.0
126
+ self.frame_id = 0
127
+ self.frm_state = 0
128
+
129
+
130
+ class WindowDetector(object):
131
+ def __init__(
132
+ self,
133
+ window_size_ms: int,
134
+ sil_to_speech_time: int,
135
+ speech_to_sil_time: int,
136
+ frame_size_ms: int,
137
+ ):
138
+ self.window_size_ms = window_size_ms
139
+ self.sil_to_speech_time = sil_to_speech_time
140
+ self.speech_to_sil_time = speech_to_sil_time
141
+ self.frame_size_ms = frame_size_ms
142
+
143
+ self.win_size_frame = int(window_size_ms / frame_size_ms)
144
+ self.win_sum = 0
145
+ self.win_state = [0] * self.win_size_frame  # initialize the sliding window
146
+
147
+ self.cur_win_pos = 0
148
+ self.pre_frame_state = FrameState.kFrameStateSil
149
+ self.cur_frame_state = FrameState.kFrameStateSil
150
+ self.sil_to_speech_frmcnt_thres = int(sil_to_speech_time / frame_size_ms)
151
+ self.speech_to_sil_frmcnt_thres = int(speech_to_sil_time / frame_size_ms)
152
+
153
+ self.voice_last_frame_count = 0
154
+ self.noise_last_frame_count = 0
155
+ self.hydre_frame_count = 0
156
+
157
+ def Reset(self) -> None:
158
+ self.cur_win_pos = 0
159
+ self.win_sum = 0
160
+ self.win_state = [0] * self.win_size_frame
161
+ self.pre_frame_state = FrameState.kFrameStateSil
162
+ self.cur_frame_state = FrameState.kFrameStateSil
163
+ self.voice_last_frame_count = 0
164
+ self.noise_last_frame_count = 0
165
+ self.hydre_frame_count = 0
166
+
167
+ def GetWinSize(self) -> int:
168
+ return int(self.win_size_frame)
169
+
170
+ def DetectOneFrame(self, frameState: FrameState, frame_count: int) -> AudioChangeState:
171
+ cur_frame_state = FrameState.kFrameStateSil
172
+ if frameState == FrameState.kFrameStateSpeech:
173
+ cur_frame_state = 1
174
+ elif frameState == FrameState.kFrameStateSil:
175
+ cur_frame_state = 0
176
+ else:
177
+ return AudioChangeState.kChangeStateInvalid
178
+ self.win_sum -= self.win_state[self.cur_win_pos]
179
+ self.win_sum += cur_frame_state
180
+ self.win_state[self.cur_win_pos] = cur_frame_state
181
+ self.cur_win_pos = (self.cur_win_pos + 1) % self.win_size_frame
182
+
183
+ if (
184
+ self.pre_frame_state == FrameState.kFrameStateSil
185
+ and self.win_sum >= self.sil_to_speech_frmcnt_thres
186
+ ):
187
+ self.pre_frame_state = FrameState.kFrameStateSpeech
188
+ return AudioChangeState.kChangeStateSil2Speech
189
+
190
+ if (
191
+ self.pre_frame_state == FrameState.kFrameStateSpeech
192
+ and self.win_sum <= self.speech_to_sil_frmcnt_thres
193
+ ):
194
+ self.pre_frame_state = FrameState.kFrameStateSil
195
+ return AudioChangeState.kChangeStateSpeech2Sil
196
+
197
+ if self.pre_frame_state == FrameState.kFrameStateSil:
198
+ return AudioChangeState.kChangeStateSil2Sil
199
+ if self.pre_frame_state == FrameState.kFrameStateSpeech:
200
+ return AudioChangeState.kChangeStateSpeech2Speech
201
+ return AudioChangeState.kChangeStateInvalid
202
+
203
+ def FrameSizeMs(self) -> int:
204
+ return int(self.frame_size_ms)
205
+
206
+
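On its own, the WindowDetector above already implements the frame-level hangover logic: it keeps a sliding window of recent frame decisions and reports a state change once enough speech (or silence) frames have accumulated. A toy drive, with made-up timing values:

# toy drive of the WindowDetector defined above (all timing values are illustrative)
detector = WindowDetector(window_size_ms=200, sil_to_speech_time=150,
                          speech_to_sil_time=150, frame_size_ms=10)
frame_states = [FrameState.kFrameStateSil] * 5 + [FrameState.kFrameStateSpeech] * 30
for idx, state in enumerate(frame_states):
    change = detector.DetectOneFrame(state, idx)
    if change == AudioChangeState.kChangeStateSil2Speech:
        print(f"speech onset reported around frame {idx}")  # fires once ~15 speech frames are in the window
detector.Reset()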
207
+ class E2EVadModel:
208
+ """
209
+ Author: Speech Lab of DAMO Academy, Alibaba Group
210
+ Deep-FSMN for Large Vocabulary Continuous Speech Recognition
211
+ https://arxiv.org/abs/1803.05030
212
+ """
213
+
214
+ def __init__(self, vad_post_args: Dict[str, Any]):
215
+ super(E2EVadModel, self).__init__()
216
+ self.vad_opts = VADXOptions(**vad_post_args)
217
+ self.windows_detector = WindowDetector(
218
+ self.vad_opts.window_size_ms,
219
+ self.vad_opts.sil_to_speech_time_thres,
220
+ self.vad_opts.speech_to_sil_time_thres,
221
+ self.vad_opts.frame_in_ms,
222
+ )
223
+ # self.encoder = encoder
224
+ # init variables
225
+ self.is_final = False
226
+ self.data_buf_start_frame = 0
227
+ self.frm_cnt = 0
228
+ self.latest_confirmed_speech_frame = 0
229
+ self.lastest_confirmed_silence_frame = -1
230
+ self.continous_silence_frame_count = 0
231
+ self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
232
+ self.confirmed_start_frame = -1
233
+ self.confirmed_end_frame = -1
234
+ self.number_end_time_detected = 0
235
+ self.sil_frame = 0
236
+ self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
237
+ self.noise_average_decibel = -100.0
238
+ self.pre_end_silence_detected = False
239
+ self.next_seg = True
240
+
241
+ self.output_data_buf = []
242
+ self.output_data_buf_offset = 0
243
+ self.frame_probs = []
244
+ self.max_end_sil_frame_cnt_thresh = (
245
+ self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
246
+ )
247
+ self.speech_noise_thres = self.vad_opts.speech_noise_thres
248
+ self.scores = None
249
+ self.idx_pre_chunk = 0
250
+ self.max_time_out = False
251
+ self.decibel = []
252
+ self.data_buf_size = 0
253
+ self.data_buf_all_size = 0
254
+ self.waveform = None
255
+ self.ResetDetection()
256
+
257
+ def AllResetDetection(self):
258
+ self.is_final = False
259
+ self.data_buf_start_frame = 0
260
+ self.frm_cnt = 0
261
+ self.latest_confirmed_speech_frame = 0
262
+ self.lastest_confirmed_silence_frame = -1
263
+ self.continous_silence_frame_count = 0
264
+ self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
265
+ self.confirmed_start_frame = -1
266
+ self.confirmed_end_frame = -1
267
+ self.number_end_time_detected = 0
268
+ self.sil_frame = 0
269
+ self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
270
+ self.noise_average_decibel = -100.0
271
+ self.pre_end_silence_detected = False
272
+ self.next_seg = True
273
+
274
+ self.output_data_buf = []
275
+ self.output_data_buf_offset = 0
276
+ self.frame_probs = []
277
+ self.max_end_sil_frame_cnt_thresh = (
278
+ self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
279
+ )
280
+ self.speech_noise_thres = self.vad_opts.speech_noise_thres
281
+ self.scores = None
282
+ self.idx_pre_chunk = 0
283
+ self.max_time_out = False
284
+ self.decibel = []
285
+ self.data_buf_size = 0
286
+ self.data_buf_all_size = 0
287
+ self.waveform = None
288
+ self.ResetDetection()
289
+
290
+ def ResetDetection(self):
291
+ self.continous_silence_frame_count = 0
292
+ self.latest_confirmed_speech_frame = 0
293
+ self.lastest_confirmed_silence_frame = -1
294
+ self.confirmed_start_frame = -1
295
+ self.confirmed_end_frame = -1
296
+ self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
297
+ self.windows_detector.Reset()
298
+ self.sil_frame = 0
299
+ self.frame_probs = []
300
+
301
+ def ComputeDecibel(self) -> None:
302
+ frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
303
+ frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
304
+ if self.data_buf_all_size == 0:
305
+ self.data_buf_all_size = len(self.waveform[0])
306
+ self.data_buf_size = self.data_buf_all_size
307
+ else:
308
+ self.data_buf_all_size += len(self.waveform[0])
309
+ for offset in range(
310
+ 0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length
311
+ ):
312
+ self.decibel.append(
313
+ 10
314
+ * math.log10(
315
+ np.square((self.waveform[0][offset : offset + frame_sample_length])).sum()
316
+ + 0.000001
317
+ )
318
+ )
319
+
320
+ def ComputeScores(self, scores: np.ndarray) -> None:
321
+ # scores = self.encoder(feats, in_cache) # return B * T * D
322
+ self.vad_opts.nn_eval_block_size = scores.shape[1]
323
+ self.frm_cnt += scores.shape[1] # count total frames
324
+ self.scores = scores
325
+
326
+ def PopDataBufTillFrame(self, frame_idx: int) -> None: # need check again
327
+ while self.data_buf_start_frame < frame_idx:
328
+ if self.data_buf_size >= int(
329
+ self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000
330
+ ):
331
+ self.data_buf_start_frame += 1
332
+ self.data_buf_size = self.data_buf_all_size - self.data_buf_start_frame * int(
333
+ self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000
334
+ )
335
+
336
+ def PopDataToOutputBuf(
337
+ self,
338
+ start_frm: int,
339
+ frm_cnt: int,
340
+ first_frm_is_start_point: bool,
341
+ last_frm_is_end_point: bool,
342
+ end_point_is_sent_end: bool,
343
+ ) -> None:
344
+ self.PopDataBufTillFrame(start_frm)
345
+ expected_sample_number = int(
346
+ frm_cnt * self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000
347
+ )
348
+ if last_frm_is_end_point:
349
+ extra_sample = max(
350
+ 0,
351
+ int(
352
+ self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000
353
+ - self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000
354
+ ),
355
+ )
356
+ expected_sample_number += int(extra_sample)
357
+ if end_point_is_sent_end:
358
+ expected_sample_number = max(expected_sample_number, self.data_buf_size)
359
+ if self.data_buf_size < expected_sample_number:
360
+ print("error in calling pop data_buf\n")
361
+
362
+ if len(self.output_data_buf) == 0 or first_frm_is_start_point:
363
+ self.output_data_buf.append(E2EVadSpeechBufWithDoa())
364
+ self.output_data_buf[-1].Reset()
365
+ self.output_data_buf[-1].start_ms = start_frm * self.vad_opts.frame_in_ms
366
+ self.output_data_buf[-1].end_ms = self.output_data_buf[-1].start_ms
367
+ self.output_data_buf[-1].doa = 0
368
+ cur_seg = self.output_data_buf[-1]
369
+ if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
370
+ print("warning\n")
371
+ out_pos = len(cur_seg.buffer)  # cur_seg.buffer is not actually filled in this implementation
372
+ data_to_pop = 0
373
+ if end_point_is_sent_end:
374
+ data_to_pop = expected_sample_number
375
+ else:
376
+ data_to_pop = int(
377
+ frm_cnt * self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000
378
+ )
379
+ if data_to_pop > self.data_buf_size:
380
+ print("VAD data_to_pop is bigger than self.data_buf_size!!!\n")
381
+ data_to_pop = self.data_buf_size
382
+ expected_sample_number = self.data_buf_size
383
+
384
+ cur_seg.doa = 0
385
+ for sample_cpy_out in range(0, data_to_pop):
386
+ # cur_seg.buffer[out_pos ++] = data_buf_.back();
387
+ out_pos += 1
388
+ for sample_cpy_out in range(data_to_pop, expected_sample_number):
389
+ # cur_seg.buffer[out_pos++] = data_buf_.back()
390
+ out_pos += 1
391
+ if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
392
+ print("Something wrong with the VAD algorithm\n")
393
+ self.data_buf_start_frame += frm_cnt
394
+ cur_seg.end_ms = (start_frm + frm_cnt) * self.vad_opts.frame_in_ms
395
+ if first_frm_is_start_point:
396
+ cur_seg.contain_seg_start_point = True
397
+ if last_frm_is_end_point:
398
+ cur_seg.contain_seg_end_point = True
399
+
400
+ def OnSilenceDetected(self, valid_frame: int):
401
+ self.lastest_confirmed_silence_frame = valid_frame
402
+ if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
403
+ self.PopDataBufTillFrame(valid_frame)
404
+ # silence_detected_callback_
405
+ # pass
406
+
407
+ def OnVoiceDetected(self, valid_frame: int) -> None:
408
+ self.latest_confirmed_speech_frame = valid_frame
409
+ self.PopDataToOutputBuf(valid_frame, 1, False, False, False)
410
+
411
+ def OnVoiceStart(self, start_frame: int, fake_result: bool = False) -> None:
412
+ if self.vad_opts.do_start_point_detection:
413
+ pass
414
+ if self.confirmed_start_frame != -1:
415
+ print("not reset vad properly\n")
416
+ else:
417
+ self.confirmed_start_frame = start_frame
418
+
419
+ if (
420
+ not fake_result
421
+ and self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected
422
+ ):
423
+ self.PopDataToOutputBuf(self.confirmed_start_frame, 1, True, False, False)
424
+
425
+ def OnVoiceEnd(self, end_frame: int, fake_result: bool, is_last_frame: bool) -> None:
426
+ for t in range(self.latest_confirmed_speech_frame + 1, end_frame):
427
+ self.OnVoiceDetected(t)
428
+ if self.vad_opts.do_end_point_detection:
429
+ pass
430
+ if self.confirmed_end_frame != -1:
431
+ print("not reset vad properly\n")
432
+ else:
433
+ self.confirmed_end_frame = end_frame
434
+ if not fake_result:
435
+ self.sil_frame = 0
436
+ self.PopDataToOutputBuf(self.confirmed_end_frame, 1, False, True, is_last_frame)
437
+ self.number_end_time_detected += 1
438
+
439
+ def MaybeOnVoiceEndIfLastFrame(self, is_final_frame: bool, cur_frm_idx: int) -> None:
440
+ if is_final_frame:
441
+ self.OnVoiceEnd(cur_frm_idx, False, True)
442
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
443
+
444
+ def GetLatency(self) -> int:
445
+ return int(self.LatencyFrmNumAtStartPoint() * self.vad_opts.frame_in_ms)
446
+
447
+ def LatencyFrmNumAtStartPoint(self) -> int:
448
+ vad_latency = self.windows_detector.GetWinSize()
449
+ if self.vad_opts.do_extend:
450
+ vad_latency += int(self.vad_opts.lookback_time_start_point / self.vad_opts.frame_in_ms)
451
+ return vad_latency
452
+
453
+ def GetFrameState(self, t: int) -> FrameState:
454
+ frame_state = FrameState.kFrameStateInvalid
455
+ cur_decibel = self.decibel[t]
456
+ cur_snr = cur_decibel - self.noise_average_decibel
457
+ # for each frame, calc log posterior probability of each state
458
+ if cur_decibel < self.vad_opts.decibel_thres:
459
+ frame_state = FrameState.kFrameStateSil
460
+ self.DetectOneFrame(frame_state, t, False)
461
+ return frame_state
462
+
463
+ sum_score = 0.0
464
+ noise_prob = 0.0
465
+ assert len(self.sil_pdf_ids) == self.vad_opts.silence_pdf_num
466
+ if len(self.sil_pdf_ids) > 0:
467
+ assert len(self.scores) == 1  # only batch_size = 1 is supported
468
+ sil_pdf_scores = [
469
+ self.scores[0][t - self.idx_pre_chunk][sil_pdf_id]
470
+ for sil_pdf_id in self.sil_pdf_ids
471
+ ]
472
+ sum_score = sum(sil_pdf_scores)
473
+ # added by huyuan: avoid sum_score <= 0
474
+ if sum_score <= 0.0:
475
+ sum_score = 1e-10
476
+ # import pdb
477
+ # pdb.set_trace()
478
+ noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
479
+ total_score = 1.0
480
+ sum_score = total_score - sum_score
481
+ speech_prob = math.log(sum_score)
482
+ if self.vad_opts.output_frame_probs:
483
+ frame_prob = E2EVadFrameProb()
484
+ frame_prob.noise_prob = noise_prob
485
+ frame_prob.speech_prob = speech_prob
486
+ frame_prob.score = sum_score
487
+ frame_prob.frame_id = t
488
+ self.frame_probs.append(frame_prob)
489
+ if math.exp(speech_prob) >= math.exp(noise_prob) + self.speech_noise_thres:
490
+ if cur_snr >= self.vad_opts.snr_thres and cur_decibel >= self.vad_opts.decibel_thres:
491
+ frame_state = FrameState.kFrameStateSpeech
492
+ else:
493
+ frame_state = FrameState.kFrameStateSil
494
+ else:
495
+ frame_state = FrameState.kFrameStateSil
496
+ if self.noise_average_decibel < -99.9:
497
+ self.noise_average_decibel = cur_decibel
498
+ else:
499
+ self.noise_average_decibel = (
500
+ cur_decibel
501
+ + self.noise_average_decibel * (self.vad_opts.noise_frame_num_used_for_snr - 1)
502
+ ) / self.vad_opts.noise_frame_num_used_for_snr
503
+
504
+ return frame_state
505
+
506
+ def __call__(
507
+ self,
508
+ score: np.ndarray,
509
+ waveform: np.ndarray,
510
+ is_final: bool = False,
511
+ max_end_sil: int = 800,
512
+ online: bool = False,
513
+ ):
514
+ self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
515
+ self.waveform = waveform # compute decibel for each frame
516
+ self.ComputeDecibel()
517
+ self.ComputeScores(score)
518
+ #import pdb
519
+ #pdb.set_trace()
520
+ if not is_final:
521
+ self.DetectCommonFrames()
522
+ else:
523
+ self.DetectLastFrames()
524
+ segments = []
525
+ for batch_num in range(0, score.shape[0]): # only support batch_size = 1 now
526
+ segment_batch = []
527
+ if len(self.output_data_buf) > 0:
528
+ for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
529
+ if online:
530
+ if not self.output_data_buf[i].contain_seg_start_point:
531
+ continue
532
+ if not self.next_seg and not self.output_data_buf[i].contain_seg_end_point:
533
+ continue
534
+ start_ms = self.output_data_buf[i].start_ms if self.next_seg else -1
535
+ if self.output_data_buf[i].contain_seg_end_point:
536
+ end_ms = self.output_data_buf[i].end_ms
537
+ self.next_seg = True
538
+ self.output_data_buf_offset += 1
539
+ else:
540
+ end_ms = -1
541
+ self.next_seg = False
542
+ else:
543
+ if not is_final and (
544
+ not self.output_data_buf[i].contain_seg_start_point
545
+ or not self.output_data_buf[i].contain_seg_end_point
546
+ ):
547
+ continue
548
+ start_ms = self.output_data_buf[i].start_ms
549
+ end_ms = self.output_data_buf[i].end_ms
550
+ self.output_data_buf_offset += 1
551
+ segment = [start_ms, end_ms]
552
+ segment_batch.append(segment)
553
+
554
+ if segment_batch:
555
+ segments.append(segment_batch)
556
+ if is_final:
557
+ # reset class variables and clear the dict for the next query
558
+ self.AllResetDetection()
559
+ return segments
560
+
561
+ def DetectCommonFrames(self) -> int:
562
+ if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
563
+ return 0
564
+ for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
565
+ frame_state = FrameState.kFrameStateInvalid
566
+ frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
567
+ self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
568
+ self.idx_pre_chunk += self.scores.shape[1]
569
+ return 0
570
+
571
+ def DetectLastFrames(self) -> int:
572
+ if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
573
+ return 0
574
+ for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
575
+ frame_state = FrameState.kFrameStateInvalid
576
+ frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
577
+ if i != 0:
578
+ self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
579
+ else:
580
+ self.DetectOneFrame(frame_state, self.frm_cnt - 1, True)
581
+
582
+ return 0
583
+
584
+ def DetectOneFrame(
585
+ self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool
586
+ ) -> None:
587
+ tmp_cur_frm_state = FrameState.kFrameStateInvalid
588
+ if cur_frm_state == FrameState.kFrameStateSpeech:
589
+ if math.fabs(1.0) > self.vad_opts.fe_prior_thres:
590
+ tmp_cur_frm_state = FrameState.kFrameStateSpeech
591
+ else:
592
+ tmp_cur_frm_state = FrameState.kFrameStateSil
593
+ elif cur_frm_state == FrameState.kFrameStateSil:
594
+ tmp_cur_frm_state = FrameState.kFrameStateSil
595
+ state_change = self.windows_detector.DetectOneFrame(tmp_cur_frm_state, cur_frm_idx)
596
+ frm_shift_in_ms = self.vad_opts.frame_in_ms
597
+ if AudioChangeState.kChangeStateSil2Speech == state_change:
598
+ silence_frame_count = self.continous_silence_frame_count
599
+ self.continous_silence_frame_count = 0
600
+ self.pre_end_silence_detected = False
601
+ start_frame = 0
602
+ if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
603
+ start_frame = max(
604
+ self.data_buf_start_frame, cur_frm_idx - self.LatencyFrmNumAtStartPoint()
605
+ )
606
+ self.OnVoiceStart(start_frame)
607
+ self.vad_state_machine = VadStateMachine.kVadInStateInSpeechSegment
608
+ for t in range(start_frame + 1, cur_frm_idx + 1):
609
+ self.OnVoiceDetected(t)
610
+ elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
611
+ for t in range(self.latest_confirmed_speech_frame + 1, cur_frm_idx):
612
+ self.OnVoiceDetected(t)
613
+ if (
614
+ cur_frm_idx - self.confirmed_start_frame + 1
615
+ > self.vad_opts.max_single_segment_time / frm_shift_in_ms
616
+ ):
617
+ self.OnVoiceEnd(cur_frm_idx, False, False)
618
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
619
+ elif not is_final_frame:
620
+ self.OnVoiceDetected(cur_frm_idx)
621
+ else:
622
+ self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
623
+ else:
624
+ pass
625
+ elif AudioChangeState.kChangeStateSpeech2Sil == state_change:
626
+ self.continous_silence_frame_count = 0
627
+ if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
628
+ pass
629
+ elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
630
+ if (
631
+ cur_frm_idx - self.confirmed_start_frame + 1
632
+ > self.vad_opts.max_single_segment_time / frm_shift_in_ms
633
+ ):
634
+ self.OnVoiceEnd(cur_frm_idx, False, False)
635
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
636
+ elif not is_final_frame:
637
+ self.OnVoiceDetected(cur_frm_idx)
638
+ else:
639
+ self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
640
+ else:
641
+ pass
642
+ elif AudioChangeState.kChangeStateSpeech2Speech == state_change:
643
+ self.continous_silence_frame_count = 0
644
+ if self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
645
+ if (
646
+ cur_frm_idx - self.confirmed_start_frame + 1
647
+ > self.vad_opts.max_single_segment_time / frm_shift_in_ms
648
+ ):
649
+ self.max_time_out = True
650
+ self.OnVoiceEnd(cur_frm_idx, False, False)
651
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
652
+ elif not is_final_frame:
653
+ self.OnVoiceDetected(cur_frm_idx)
654
+ else:
655
+ self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
656
+ else:
657
+ pass
658
+ elif AudioChangeState.kChangeStateSil2Sil == state_change:
659
+ self.continous_silence_frame_count += 1
660
+ if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
661
+ # silence timeout, return zero length decision
662
+ if (
663
+ (self.vad_opts.detect_mode == VadDetectMode.kVadSingleUtteranceDetectMode.value)
664
+ and (
665
+ self.continous_silence_frame_count * frm_shift_in_ms
666
+ > self.vad_opts.max_start_silence_time
667
+ )
668
+ ) or (is_final_frame and self.number_end_time_detected == 0):
669
+ for t in range(self.lastest_confirmed_silence_frame + 1, cur_frm_idx):
670
+ self.OnSilenceDetected(t)
671
+ self.OnVoiceStart(0, True)
672
+ self.OnVoiceEnd(0, True, False)
673
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
674
+ else:
675
+ if cur_frm_idx >= self.LatencyFrmNumAtStartPoint():
676
+ self.OnSilenceDetected(cur_frm_idx - self.LatencyFrmNumAtStartPoint())
677
+ elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
678
+ if (
679
+ self.continous_silence_frame_count * frm_shift_in_ms
680
+ >= self.max_end_sil_frame_cnt_thresh
681
+ ):
682
+ lookback_frame = int(self.max_end_sil_frame_cnt_thresh / frm_shift_in_ms)
683
+ if self.vad_opts.do_extend:
684
+ lookback_frame -= int(
685
+ self.vad_opts.lookahead_time_end_point / frm_shift_in_ms
686
+ )
687
+ lookback_frame -= 1
688
+ lookback_frame = max(0, lookback_frame)
689
+ self.OnVoiceEnd(cur_frm_idx - lookback_frame, False, False)
690
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
691
+ elif (
692
+ cur_frm_idx - self.confirmed_start_frame + 1
693
+ > self.vad_opts.max_single_segment_time / frm_shift_in_ms
694
+ ):
695
+ self.OnVoiceEnd(cur_frm_idx, False, False)
696
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
697
+ elif self.vad_opts.do_extend and not is_final_frame:
698
+ if self.continous_silence_frame_count <= int(
699
+ self.vad_opts.lookahead_time_end_point / frm_shift_in_ms
700
+ ):
701
+ self.OnVoiceDetected(cur_frm_idx)
702
+ else:
703
+ self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
704
+ else:
705
+ pass
706
+
707
+ if (
708
+ self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected
709
+ and self.vad_opts.detect_mode == VadDetectMode.kVadMutipleUtteranceDetectMode.value
710
+ ):
711
+ self.ResetDetection()
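How this post-processor is typically driven: a separate FSMN VAD network (not part of this file) scores each audio chunk frame by frame, and E2EVadModel turns those scores plus the raw waveform into [start_ms, end_ms] segments, where -1 marks a boundary that has not been observed yet. A hedged sketch; the scorer, the chunking and the vad_post_args values below are assumptions (in this repo they would normally be read from ax_model/vad/config.yaml):

import numpy as np

# hypothetical streaming use of E2EVadModel with an external frame scorer
vad_post_args = {                      # illustrative subset of VADXOptions keyword arguments
    "sample_rate": 16000,
    "max_end_silence_time": 800,
    "speech_noise_thres": 0.8,
}
vad = E2EVadModel(vad_post_args)

for i, chunk in enumerate(chunks):     # chunks: list of (1, samples) float32 waveform blocks (assumed)
    scores = vad_scorer(chunk)         # hypothetical FSMN VAD network, shape (1, frames, pdf_num)
    segments = vad(scores, chunk, is_final=(i == len(chunks) - 1), online=True)
    for segment_batch in segments:
        for start_ms, end_ms in segment_batch:
            print(start_ms, end_ms)    # e.g. [120, -1] then [-1, 980] as a segment opens and closes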
utils/utils/frontend.py ADDED
@@ -0,0 +1,448 @@
1
+ # -*- encoding: utf-8 -*-
2
+ from pathlib import Path
3
+ from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
4
+ import copy
5
+ from functools import lru_cache
6
+
7
+ import numpy as np
8
+ import kaldi_native_fbank as knf
9
+
10
+ root_dir = Path(__file__).resolve().parent
11
+
12
+ logger_initialized = {}
13
+
14
+
15
+ class WavFrontend:
16
+ """Conventional frontend structure for ASR."""
17
+
18
+ def __init__(
19
+ self,
20
+ cmvn_file: str = None,
21
+ fs: int = 16000,
22
+ window: str = "hamming",
23
+ n_mels: int = 80,
24
+ frame_length: int = 25,
25
+ frame_shift: int = 10,
26
+ lfr_m: int = 1,
27
+ lfr_n: int = 1,
28
+ dither: float = 1.0,
29
+ **kwargs,
30
+ ) -> None:
31
+
32
+ opts = knf.FbankOptions()
33
+ opts.frame_opts.samp_freq = fs
34
+ opts.frame_opts.dither = dither
35
+ opts.frame_opts.window_type = window
36
+ opts.frame_opts.frame_shift_ms = float(frame_shift)
37
+ opts.frame_opts.frame_length_ms = float(frame_length)
38
+ opts.mel_opts.num_bins = n_mels
39
+ opts.energy_floor = 0
40
+ opts.frame_opts.snip_edges = True
41
+ opts.mel_opts.debug_mel = False
42
+ self.opts = opts
43
+
44
+ self.lfr_m = lfr_m
45
+ self.lfr_n = lfr_n
46
+ self.cmvn_file = cmvn_file
47
+
48
+ if self.cmvn_file:
49
+ self.cmvn = load_cmvn(self.cmvn_file)
50
+ self.fbank_fn = None
51
+ self.fbank_beg_idx = 0
52
+ self.reset_status()
53
+
54
+ def fbank(self, waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
55
+ waveform = waveform * (1 << 15)
56
+ fbank_fn = knf.OnlineFbank(self.opts)
57
+ fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
58
+ frames = fbank_fn.num_frames_ready
59
+ mat = np.empty([frames, self.opts.mel_opts.num_bins])
60
+ for i in range(frames):
61
+ mat[i, :] = fbank_fn.get_frame(i)
62
+ feat = mat.astype(np.float32)
63
+ feat_len = np.array(mat.shape[0]).astype(np.int32)
64
+ return feat, feat_len
65
+
66
+ def fbank_online(self, waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
67
+ waveform = waveform * (1 << 15)
68
+ # self.fbank_fn = knf.OnlineFbank(self.opts)
69
+ self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
70
+ frames = self.fbank_fn.num_frames_ready
71
+ mat = np.empty([frames, self.opts.mel_opts.num_bins])
72
+ for i in range(self.fbank_beg_idx, frames):
73
+ mat[i, :] = self.fbank_fn.get_frame(i)
74
+ # self.fbank_beg_idx += (frames-self.fbank_beg_idx)
75
+ feat = mat.astype(np.float32)
76
+ feat_len = np.array(mat.shape[0]).astype(np.int32)
77
+ return feat, feat_len
78
+
79
+ def reset_status(self):
80
+ self.fbank_fn = knf.OnlineFbank(self.opts)
81
+ self.fbank_beg_idx = 0
82
+
83
+ def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
84
+ if self.lfr_m != 1 or self.lfr_n != 1:
85
+ feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
86
+
87
+ if self.cmvn_file:
88
+ feat = self.apply_cmvn(feat)
89
+
90
+ feat_len = np.array(feat.shape[0]).astype(np.int32)
91
+ return feat, feat_len
92
+
93
+ @staticmethod
94
+ def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
95
+ LFR_inputs = []
96
+
97
+ T = inputs.shape[0]
98
+ T_lfr = int(np.ceil(T / lfr_n))
99
+ left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
100
+ inputs = np.vstack((left_padding, inputs))
101
+ T = T + (lfr_m - 1) // 2
102
+ for i in range(T_lfr):
103
+ if lfr_m <= T - i * lfr_n:
104
+ LFR_inputs.append((inputs[i * lfr_n : i * lfr_n + lfr_m]).reshape(1, -1))
105
+ else:
106
+ # process last LFR frame
107
+ num_padding = lfr_m - (T - i * lfr_n)
108
+ frame = inputs[i * lfr_n :].reshape(-1)
109
+ for _ in range(num_padding):
110
+ frame = np.hstack((frame, inputs[-1]))
111
+
112
+ LFR_inputs.append(frame)
113
+ LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
114
+ return LFR_outputs
115
+
116
+ def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
117
+ """
118
+ Apply CMVN with mvn data
119
+ """
120
+ frame, dim = inputs.shape
121
+ means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
122
+ vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
123
+ inputs = (inputs + means) * vars
124
+ return inputs
125
+
126
+ @lru_cache()
127
+ def load_cmvn(cmvn_file: Union[str, Path]) -> np.ndarray:
128
+ """load cmvn file to numpy array.
129
+
130
+ Args:
131
+ cmvn_file (Union[str, Path]): cmvn file path.
132
+
133
+ Raises:
134
+ FileNotFoundError: cmvn file does not exist.
135
+
136
+ Returns:
137
+ np.ndarray: cmvn array. shape is (2, dim). The first row is means, the second row is vars.
138
+ """
139
+
140
+ cmvn_file = Path(cmvn_file)
141
+ if not cmvn_file.exists():
142
+ raise FileNotFoundError("cmvn file does not exist")
143
+
144
+ with open(cmvn_file, "r", encoding="utf-8") as f:
145
+ lines = f.readlines()
146
+ means_list = []
147
+ vars_list = []
148
+ for i in range(len(lines)):
149
+ line_item = lines[i].split()
150
+ if line_item[0] == "<AddShift>":
151
+ line_item = lines[i + 1].split()
152
+ if line_item[0] == "<LearnRateCoef>":
153
+ add_shift_line = line_item[3 : (len(line_item) - 1)]
154
+ means_list = list(add_shift_line)
155
+ continue
156
+ elif line_item[0] == "<Rescale>":
157
+ line_item = lines[i + 1].split()
158
+ if line_item[0] == "<LearnRateCoef>":
159
+ rescale_line = line_item[3 : (len(line_item) - 1)]
160
+ vars_list = list(rescale_line)
161
+ continue
162
+
163
+ means = np.array(means_list).astype(np.float64)
164
+ vars = np.array(vars_list).astype(np.float64)
165
+ cmvn = np.array([means, vars])
166
+ return cmvn
167
+
168
+
169
+ class WavFrontendOnline(WavFrontend):
170
+ def __init__(self, **kwargs):
171
+ super().__init__(**kwargs)
172
+ # self.fbank_fn = knf.OnlineFbank(self.opts)
173
+ # add variables
174
+ self.frame_sample_length = int(
175
+ self.opts.frame_opts.frame_length_ms * self.opts.frame_opts.samp_freq / 1000
176
+ )
177
+ self.frame_shift_sample_length = int(
178
+ self.opts.frame_opts.frame_shift_ms * self.opts.frame_opts.samp_freq / 1000
179
+ )
180
+ self.waveform = None
181
+ self.reserve_waveforms = None
182
+ self.input_cache = None
183
+ self.lfr_splice_cache = []
184
+
185
+ @staticmethod
186
+ # the cache has already been concatenated onto inputs
187
+ def apply_lfr(
188
+ inputs: np.ndarray, lfr_m: int, lfr_n: int, is_final: bool = False
189
+ ) -> Tuple[np.ndarray, np.ndarray, int]:
190
+ """
191
+ Apply lfr with data
192
+ """
193
+
194
+ LFR_inputs = []
195
+ T = inputs.shape[0] # include the right context
196
+ T_lfr = int(
197
+ np.ceil((T - (lfr_m - 1) // 2) / lfr_n)
198
+ ) # minus the right context: (lfr_m - 1) // 2
199
+ splice_idx = T_lfr
200
+ for i in range(T_lfr):
201
+ if lfr_m <= T - i * lfr_n:
202
+ LFR_inputs.append((inputs[i * lfr_n : i * lfr_n + lfr_m]).reshape(1, -1))
203
+ else: # process last LFR frame
204
+ if is_final:
205
+ num_padding = lfr_m - (T - i * lfr_n)
206
+ frame = (inputs[i * lfr_n :]).reshape(-1)
207
+ for _ in range(num_padding):
208
+ frame = np.hstack((frame, inputs[-1]))
209
+ LFR_inputs.append(frame)
210
+ else:
211
+ # update splice_idx and break out of the loop
212
+ splice_idx = i
213
+ break
214
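+ # Everything from the first unconsumed input frame onwards is returned as the
+ # cache to be prepended to the next chunk.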
+ splice_idx = min(T - 1, splice_idx * lfr_n)
215
+ lfr_splice_cache = inputs[splice_idx:, :]
216
+ LFR_outputs = np.vstack(LFR_inputs)
217
+ return LFR_outputs.astype(np.float32), lfr_splice_cache, splice_idx
218
+
219
+ @staticmethod
220
+ def compute_frame_num(
221
+ sample_length: int, frame_sample_length: int, frame_shift_sample_length: int
222
+ ) -> int:
223
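+ # Number of complete frames that fit into sample_length:
+ # floor((samples - frame_len) / frame_shift) + 1, or 0 if not even one frame fits.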
+ frame_num = int((sample_length - frame_sample_length) / frame_shift_sample_length + 1)
224
+ return frame_num if frame_num >= 1 and sample_length >= frame_sample_length else 0
225
+
226
+ def fbank(
227
+ self, input: np.ndarray, input_lengths: np.ndarray
228
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
229
+ self.fbank_fn = knf.OnlineFbank(self.opts)
230
+ batch_size = input.shape[0]
231
+ if self.input_cache is None:
232
+ self.input_cache = np.empty((batch_size, 0), dtype=np.float32)
233
+ input = np.concatenate((self.input_cache, input), axis=1)
234
+ frame_num = self.compute_frame_num(
235
+ input.shape[-1], self.frame_sample_length, self.frame_shift_sample_length
236
+ )
237
+ # update self.input_cache with the leftover samples that do not yet form a full frame
238
+ self.input_cache = input[
239
+ :, -(input.shape[-1] - frame_num * self.frame_shift_sample_length) :
240
+ ]
241
+ waveforms = np.empty(0, dtype=np.float32)
242
+ feats_pad = np.empty(0, dtype=np.float32)
243
+ feats_lens = np.empty(0, dtype=np.int32)
244
+ if frame_num:
245
+ waveforms = []
246
+ feats = []
247
+ feats_lens = []
248
+ for i in range(batch_size):
249
+ waveform = input[i]
250
+ waveforms.append(
251
+ waveform[
252
+ : (
253
+ (frame_num - 1) * self.frame_shift_sample_length
254
+ + self.frame_sample_length
255
+ )
256
+ ]
257
+ )
258
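+ # Scale the [-1, 1) float waveform by 2**15 so the fbank sees Kaldi-style
+ # int16-range samples (matching the statistics in am.mvn).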
+ waveform = waveform * (1 << 15)
259
+
260
+ self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
261
+ frames = self.fbank_fn.num_frames_ready
262
+ mat = np.empty([frames, self.opts.mel_opts.num_bins])
263
+ for frame_idx in range(frames):
264
+ mat[frame_idx, :] = self.fbank_fn.get_frame(frame_idx)
265
+ feat = mat.astype(np.float32)
266
+ feat_len = np.array(mat.shape[0]).astype(np.int32)
267
+ feats.append(feat)
268
+ feats_lens.append(feat_len)
269
+
270
+ waveforms = np.stack(waveforms)
271
+ feats_lens = np.array(feats_lens)
272
+ feats_pad = np.array(feats)
273
+ self.fbanks = feats_pad
274
+ self.fbanks_lens = copy.deepcopy(feats_lens)
275
+ return waveforms, feats_pad, feats_lens
276
+
277
+ def get_fbank(self) -> Tuple[np.ndarray, np.ndarray]:
278
+ return self.fbanks, self.fbanks_lens
279
+
280
+ def lfr_cmvn(
281
+ self, input: np.ndarray, input_lengths: np.ndarray, is_final: bool = False
282
+ ) -> Tuple[np.ndarray, np.ndarray, List[int]]:
283
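+ # Per-item LFR + CMVN; also returns, for each batch item, the frame index from
+ # which the input features were kept back as splice cache for the next chunk.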
+ batch_size = input.shape[0]
284
+ feats = []
285
+ feats_lens = []
286
+ lfr_splice_frame_idxs = []
287
+ for i in range(batch_size):
288
+ mat = input[i, : input_lengths[i], :]
289
+ lfr_splice_frame_idx = -1
290
+ if self.lfr_m != 1 or self.lfr_n != 1:
291
+ # update self.lfr_splice_cache in self.apply_lfr
292
+ mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(
293
+ mat, self.lfr_m, self.lfr_n, is_final
294
+ )
295
+ if self.cmvn_file is not None:
296
+ mat = self.apply_cmvn(mat)
297
+ feat_length = mat.shape[0]
298
+ feats.append(mat)
299
+ feats_lens.append(feat_length)
300
+ lfr_splice_frame_idxs.append(lfr_splice_frame_idx)
301
+
302
+ feats_lens = np.array(feats_lens)
303
+ feats_pad = np.array(feats)
304
+ return feats_pad, feats_lens, lfr_splice_frame_idxs
305
+
306
+ def extract_fbank(
307
+ self, input: np.ndarray, input_lengths: np.ndarray, is_final: bool = False
308
+ ) -> Tuple[np.ndarray, np.ndarray]:
309
+ batch_size = input.shape[0]
310
+ assert (
311
+ batch_size == 1
312
+ ), "we support to extract feature online only when the batch size is equal to 1 now"
313
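+ # fbank() consumes the raw chunk (leaving leftover samples in input_cache) and
+ # returns only the frames that are ready for this chunk.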
+ waveforms, feats, feats_lengths = self.fbank(input, input_lengths) # input shape: B T D
314
+ if feats.shape[0]:
315
+ self.waveforms = (
316
+ waveforms
317
+ if self.reserve_waveforms is None
318
+ else np.concatenate((self.reserve_waveforms, waveforms), axis=1)
319
+ )
320
+ if not self.lfr_splice_cache:
321
+ for i in range(batch_size):
322
+ self.lfr_splice_cache.append(
323
+ np.expand_dims(feats[i][0, :], axis=0).repeat((self.lfr_m - 1) // 2, axis=0)
324
+ )
325
+
326
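+ # Enough frames (splice cache + new) to emit at least one LFR frame: run
+ # LFR + CMVN and keep the tail as cache; otherwise just grow the caches and
+ # return an empty feature array.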
+ if feats_lengths[0] + self.lfr_splice_cache[0].shape[0] >= self.lfr_m:
327
+ lfr_splice_cache_np = np.stack(self.lfr_splice_cache) # B T D
328
+ feats = np.concatenate((lfr_splice_cache_np, feats), axis=1)
329
+ feats_lengths += lfr_splice_cache_np[0].shape[0]
330
+ frame_from_waveforms = int(
331
+ (self.waveforms.shape[1] - self.frame_sample_length)
332
+ / self.frame_shift_sample_length
333
+ + 1
334
+ )
335
+ minus_frame = (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0
336
+ feats, feats_lengths, lfr_splice_frame_idxs = self.lfr_cmvn(
337
+ feats, feats_lengths, is_final
338
+ )
339
+ if self.lfr_m == 1:
340
+ self.reserve_waveforms = None
341
+ else:
342
+ reserve_frame_idx = lfr_splice_frame_idxs[0] - minus_frame
343
+ # print('reserve_frame_idx: ' + str(reserve_frame_idx))
344
+ # print('frame_frame: ' + str(frame_from_waveforms))
345
+ self.reserve_waveforms = self.waveforms[
346
+ :,
347
+ reserve_frame_idx
348
+ * self.frame_shift_sample_length : frame_from_waveforms
349
+ * self.frame_shift_sample_length,
350
+ ]
351
+ sample_length = (
352
+ frame_from_waveforms - 1
353
+ ) * self.frame_shift_sample_length + self.frame_sample_length
354
+ self.waveforms = self.waveforms[:, :sample_length]
355
+ else:
356
+ # update self.reserve_waveforms and self.lfr_splice_cache
357
+ self.reserve_waveforms = self.waveforms[
358
+ :, : -(self.frame_sample_length - self.frame_shift_sample_length)
359
+ ]
360
+ for i in range(batch_size):
361
+ self.lfr_splice_cache[i] = np.concatenate(
362
+ (self.lfr_splice_cache[i], feats[i]), axis=0
363
+ )
364
+ return np.empty(0, dtype=np.float32), feats_lengths
365
+ else:
366
+ if is_final:
367
+ self.waveforms = (
368
+ waveforms if self.reserve_waveforms is None else self.reserve_waveforms
369
+ )
370
+ feats = np.stack(self.lfr_splice_cache)
371
+ feats_lengths = np.zeros(batch_size, dtype=np.int32) + feats.shape[1]
372
+ feats, feats_lengths, _ = self.lfr_cmvn(feats, feats_lengths, is_final)
373
+ if is_final:
374
+ self.cache_reset()
375
+ return feats, feats_lengths
376
+
377
+ def get_waveforms(self):
378
+ return self.waveforms
379
+
380
+ def cache_reset(self):
381
+ self.fbank_fn = knf.OnlineFbank(self.opts)
382
+ self.reserve_waveforms = None
383
+ self.input_cache = None
384
+ self.lfr_splice_cache = []
385
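+ # Minimal streaming usage sketch (assumes the cmvn file shipped under
+ # ax_model/sensevoice/ and a frontend_conf dict providing the fbank/LFR options;
+ # the constructor keyword names are an assumption here):
+ #
+ # frontend = WavFrontendOnline(cmvn_file="ax_model/sensevoice/am.mvn", **frontend_conf)
+ # for chunk in wav_chunks: # chunk: np.ndarray of shape (1, n_samples)
+ # feats, feat_lens = frontend.extract_fbank(chunk, np.array([chunk.shape[1]]),
+ # is_final=(chunk is wav_chunks[-1]))
+ # if feats.size:
+ # ... # feed feats to the acoustic model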
+
386
+
387
+ def load_bytes(input):
388
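+ # Interpret a raw byte buffer as int16 PCM samples and rescale them to float32 in [-1, 1).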
+ middle_data = np.frombuffer(input, dtype=np.int16)
389
+ middle_data = np.asarray(middle_data)
390
+ if middle_data.dtype.kind not in "iu":
391
+ raise TypeError("'middle_data' must be an array of integers")
392
+ dtype = np.dtype("float32")
393
+ if dtype.kind != "f":
394
+ raise TypeError("'dtype' must be a floating point type")
395
+
396
+ i = np.iinfo(middle_data.dtype)
397
+ abs_max = 2 ** (i.bits - 1)
398
+ offset = i.min + abs_max
399
+ array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
400
+ return array
401
+
402
+
403
+ class SinusoidalPositionEncoderOnline:
404
+ """Streaming Positional encoding."""
405
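+ # Standard transformer sinusoidal encoding: position p, channel k gets
+ # [sin(p * w_k), cos(p * w_k)] with w_k = 10000 ** (-k / (depth / 2 - 1));
+ # forward() shifts positions by start_idx so successive streaming chunks
+ # continue the same phase.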
+
406
+ def encode(self, positions: np.ndarray = None, depth: int = None, dtype: np.dtype = np.float32):
407
+ batch_size = positions.shape[0]
408
+ positions = positions.astype(dtype)
409
+ log_timescale_increment = np.log(np.array([10000], dtype=dtype)) / (depth / 2 - 1)
410
+ inv_timescales = np.exp(np.arange(depth / 2).astype(dtype) * (-log_timescale_increment))
411
+ inv_timescales = np.reshape(inv_timescales, [batch_size, -1])
412
+ scaled_time = np.reshape(positions, [1, -1, 1]) * np.reshape(inv_timescales, [1, 1, -1])
413
+ encoding = np.concatenate((np.sin(scaled_time), np.cos(scaled_time)), axis=2)
414
+ return encoding.astype(dtype)
415
+
416
+ def forward(self, x, start_idx=0):
417
+ batch_size, timesteps, input_dim = x.shape
418
+ positions = np.arange(1, timesteps + 1 + start_idx)[None, :]
419
+ position_encoding = self.encode(positions, input_dim, x.dtype)
420
+
421
+ return x + position_encoding[:, start_idx : start_idx + timesteps]
422
+
423
+
424
+ def test():
425
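+ # Developer smoke test apparently inherited from upstream FunASR: the model paths
+ # below are hard-coded to the original author's machine, and the frontend method
+ # names used here (fbank_online / reset_status) do not appear in the classes above,
+ # so this function needs adapting before it can run.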
+ path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
426
+ import librosa
427
+
428
+ cmvn_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn"
429
+ config_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml"
430
+ from funasr.runtime.python.onnxruntime.rapid_paraformer.utils.utils import read_yaml
431
+
432
+ config = read_yaml(config_file)
433
+ waveform, _ = librosa.load(path, sr=None)
434
+ frontend = WavFrontend(
435
+ cmvn_file=cmvn_file,
436
+ **config["frontend_conf"],
437
+ )
438
+ speech, _ = frontend.fbank_online(waveform) # 1d, (sample,), numpy
439
+ feat, feat_len = frontend.lfr_cmvn(
440
+ speech
441
+ ) # 2d, (frame, 450), np.float32 -> torch, torch.from_numpy(), dtype, (1, frame, 450)
442
+
443
+ frontend.reset_status() # clear cache
444
+ return feat, feat_len
445
+
446
+
447
+ if __name__ == "__main__":
448
+ test()