Upload folder using huggingface_hub
Browse files- README.md +3 -6
- app.js +1095 -0
- index.html +134 -18
- pcs_vocab.json +0 -0
- punct_cap_seg_en.onnx +3 -0
- punctuator.js +294 -0
- silero_vad.onnx +3 -0
- style.css +627 -17
- vad.js +157 -0
README.md
CHANGED
|
@@ -1,11 +1,8 @@
|
|
| 1 |
---
|
| 2 |
-
title: Granite Speech
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: static
|
| 7 |
pinned: false
|
| 8 |
-
license: apache-2.0
|
| 9 |
---
|
| 10 |
-
|
| 11 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Granite Speech WebGPU
|
| 3 |
+
emoji: 🗣️
|
| 4 |
+
colorFrom: blue
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: static
|
| 7 |
pinned: false
|
|
|
|
| 8 |
---
|
|
|
|
|
|
app.js
ADDED
|
@@ -0,0 +1,1095 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Granite Speech WebGPU Demo
|
| 3 |
+
* Uses ONNX Runtime Web for in-browser speech recognition
|
| 4 |
+
*/
|
| 5 |
+
|
| 6 |
+
import { PreTrainedTokenizer } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.4.2';
|
| 7 |
+
import { detect } from 'https://cdn.jsdelivr.net/npm/tinyld/+esm';
|
| 8 |
+
|
| 9 |
+
// Check if ONNX Runtime is loaded
|
| 10 |
+
// Configure ONNX Runtime Web. `ort` is a global that this module does not define itself;
// when it is missing, the problem is reported on the console and to the user.
if (typeof ort !== 'undefined') {
    const { env } = ort;

    // WASM backend: fetch binaries from the CDN, use one thread per reported
    // logical core (4 when the browser does not report a count), enable SIMD.
    env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/';
    env.wasm.numThreads = navigator.hardwareConcurrency || 4;
    env.wasm.simd = true;

    // WebGPU backend: make sure the settings object exists
    env.webgpu = env.webgpu || {};
} else {
    console.error('ONNX Runtime Web not loaded! Check if the script tag is correct.');
    alert('Failed to load ONNX Runtime. Please refresh the page.');
}
|
| 24 |
+
|
| 25 |
+
// Model paths
|
| 26 |
+
// Granite Speech ONNX models hosted on HF Hub
|
| 27 |
+
// Granite Speech ONNX model files, resolved against the model repository on the HF Hub
const HF_MODEL_BASE = 'https://huggingface.co/ibm-granite/granite-4.0-1b-speech/resolve/main/onnx';
const [ENCODER_PATH, EMBED_PATH, DECODER_PATH] = [
    'audio_encoder_q4f32.onnx',
    'embed_tokens_q4f16.onnx',
    'decoder_model_merged_q4f16.onnx',
].map((fileName) => `${HF_MODEL_BASE}/${fileName}`);
|
| 31 |
+
|
| 32 |
+
// Audio config from preprocessor_config.json
|
| 33 |
+
// --- Audio front-end (values taken from preprocessor_config.json) ---
const SAMPLE_RATE = 16000;
const N_FFT = 512;      // FFT size; the spectrum has N_FFT / 2 + 1 bins
const WIN_LENGTH = 400; // Hann window length, centred inside the N_FFT frame
const HOP_LENGTH = 160; // stride between consecutive frames, in samples
const N_MELS = 80;      // mel bands per frame

// --- Language model ---
const HIDDEN_SIZE = 2048;   // embedding width per sequence position
const VOCAB_SIZE = 100353;  // logits per sequence position
const BOS_TOKEN = 100257;
const EOS_TOKEN = 100257;   // same id as BOS; generation stops when it is predicted
const PAD_TOKEN = 100256;
const MAX_NEW_TOKENS = 256; // upper bound on generated tokens per segment
|
| 46 |
+
// Note: embedding_multiplier (12) is likely already applied in the model weights
|
| 47 |
+
|
| 48 |
+
// Prompt templates
|
| 49 |
+
// Prompt templates. PROMPT_PREFIX is the text placed before the audio;
// PROMPTS[task] is the instruction text for each selectable task.
const PROMPT_PREFIX = 'USER: ';
const PROMPTS = {
    'transcribe': 'Transcribe the speech to text\n ASSISTANT:',
    // One translation task per target language, keyed `translate_<code>`
    ...Object.fromEntries(
        [
            ['en', 'English'],
            ['fr', 'French'],
            ['de', 'German'],
            ['es', 'Spanish'],
            ['pt', 'Portuguese'],
            ['ja', 'Japanese'],
        ].map(([code, language]) => [
            `translate_${code}`,
            `Translate the speech to ${language}\n ASSISTANT:`,
        ]),
    ),
};
|
| 59 |
+
|
| 60 |
+
// State
|
| 61 |
+
// Inference sessions and tokenizer; all start out null and are assigned during model loading
let encoderSession = null;
let embedSession = null;
let decoderSession = null;
let tokenizer = null;

// Guard flag that keeps model loading from being started twice
let isModelLoading = false;

let currentAudioData = null;

// Cache of prompt embeddings, populated at init:
// `prefix` is for the "USER: " text; suffix embeddings are added keyed by prompt name.
const promptEmbeddings = {
    prefix: null,
};
|
| 73 |
+
|
| 74 |
+
// DOM Elements
|
| 75 |
+
// DOM element handles, grouped by the part of the page they belong to.

// Status line and GPU notice
const statusDot = document.getElementById('statusDot');
const statusText = document.getElementById('statusText');
const gpuInfo = document.getElementById('gpuInfo');

// Audio input (record / upload)
const inputCard = document.querySelector('.input-card');
const recordBtn = document.getElementById('recordBtn');
const audioFile = document.getElementById('audioFile');
const fileTile = document.querySelector('.file-label');

// Audio preview player
const audioPreview = document.getElementById('audioPreview');
const audioPlayer = document.getElementById('audioPlayer');
const playBtn = document.getElementById('playBtn');
const waveformCanvas = document.getElementById('waveformCanvas');
const waveformProgress = document.getElementById('waveformProgress');
const audioTime = document.getElementById('audioTime');

// Transcription controls
const transcribeSection = document.getElementById('transcribeSection');
const transcribeBtn = document.getElementById('transcribeBtn');
const promptSelect = document.getElementById('promptSelect');
const punctuationCheckbox = document.getElementById('punctuationCheckbox');

// Transcript output and its actions
const transcriptCard = document.getElementById('transcriptCard');
const outputText = document.getElementById('outputText');
const copyBtn = document.getElementById('copyBtn');
const downloadBtn = document.getElementById('downloadBtn');
const clearBtn = document.getElementById('clearBtn');

// Progress bar
const progressSection = document.getElementById('progressSection');
const progressFill = document.getElementById('progressFill');
const progressText = document.getElementById('progressText');
|
| 100 |
+
|
| 101 |
+
// Recording state
|
| 102 |
+
// Microphone recording state
let mediaRecorder = null;
let audioChunks = [];

// Abort flag for a running transcription
let transcriptionAborted = false;
|
| 105 |
+
|
| 106 |
+
// Utility functions
|
| 107 |
+
/**
 * Update the status indicator.
 * @param {string} status - State name; becomes the second CSS class of the status dot.
 * @param {string} message - Text shown in the status line.
 */
function setStatus(status, message) {
    const dotClasses = `status-dot ${status}`;
    statusDot.className = dotClasses;
    statusText.textContent = message;
}
|
| 111 |
+
|
| 112 |
+
// Punctuation is handled by punctuator.js (applyPunctuation function)
|
| 113 |
+
|
| 114 |
+
/**
 * Show or hide the progress section.
 * @param {boolean} show - Truthy to display the section as a block, falsy to hide it.
 */
function showProgress(show) {
    if (show) {
        progressSection.style.display = 'block';
    } else {
        progressSection.style.display = 'none';
    }
}
|
| 117 |
+
|
| 118 |
+
/**
 * Set the progress bar fill and its caption.
 * @param {number} progress - Fill level, written as a CSS percentage width.
 * @param {string} text - Caption shown with the bar.
 */
function updateProgress(progress, text) {
    const fillWidth = `${progress}%`;
    progressFill.style.width = fillWidth;
    progressText.textContent = text;
}
|
| 122 |
+
|
| 123 |
+
// Check WebGPU support
|
| 124 |
+
/**
 * Probe for a usable WebGPU adapter.
 *
 * On every failure path the reason is written to the GPU info line
 * (red for "unsupported" / errors, orange for "no adapter").
 *
 * @returns {Promise<boolean>} true when `navigator.gpu.requestAdapter()` yields an adapter.
 */
async function checkWebGPU() {
    // Show a failure message in the given colour; always yields false
    const reportFailure = (message, color) => {
        gpuInfo.textContent = message;
        gpuInfo.style.color = color;
        return false;
    };

    if (!navigator.gpu) {
        return reportFailure('WebGPU not supported. Use Chrome 113+ or Edge 113+', '#e74c3c');
    }

    try {
        const adapter = await navigator.gpu.requestAdapter();
        return adapter ? true : reportFailure('No WebGPU adapter available', '#f39c12');
    } catch (e) {
        console.error('WebGPU error:', e);
        return reportFailure(`WebGPU error: ${e.message || e}`, '#e74c3c');
    }
}
|
| 147 |
+
|
| 148 |
+
// Load tokenizer using transformers.js
|
| 149 |
+
/**
 * Download the tokenizer definition and its config from the model repository
 * and build a transformers.js tokenizer from them. Both files are fetched in parallel.
 *
 * @returns {Promise<PreTrainedTokenizer>} The constructed tokenizer.
 * @throws {Error} When either file cannot be fetched (non-2xx response), so that an
 *   HTTP error page is not fed to the JSON parser and reported as a parse error.
 */
async function loadTokenizer() {
    const repoBase = 'https://huggingface.co/ibm-granite/granite-4.0-1b-speech/resolve/main';

    // Fetch one JSON file from the repository, failing loudly on HTTP errors
    const fetchJson = async (fileName) => {
        const response = await fetch(`${repoBase}/${fileName}`);
        if (!response.ok) {
            throw new Error(`Failed to fetch ${fileName}: HTTP ${response.status}`);
        }
        return response.json();
    };

    const [tokenizerJson, tokenizerConfig] = await Promise.all([
        fetchJson('tokenizer.json'),
        fetchJson('tokenizer_config.json'),
    ]);
    return new PreTrainedTokenizer(tokenizerJson, tokenizerConfig);
}
|
| 156 |
+
|
| 157 |
+
// Get embeddings for token IDs (returns Float32Array)
|
| 158 |
+
/**
 * Run the embedding session on a list of token ids.
 *
 * @param {number[]} tokenIds - Token ids; sent to the session as an int64 tensor of shape [1, tokenIds.length].
 * @returns {Promise<{data: Float32Array, seqLen: number}>} A Float32Array copy of the
 *   session's `inputs_embeds` output and that output's second dimension.
 */
async function getEmbeddings(tokenIds) {
    const ids = BigInt64Array.from(tokenIds.map(BigInt));
    const input_ids = new ort.Tensor('int64', ids, [1, tokenIds.length]);
    const result = await embedSession.run({ input_ids });
    const embeds = result.inputs_embeds;
    return {
        data: new Float32Array(embeds.data),
        seqLen: embeds.dims[1],
    };
}
|
| 166 |
+
|
| 167 |
+
// Pre-compute embeddings for all prompts
|
| 168 |
+
/**
 * Tokenize and embed the prompt prefix and every entry of PROMPTS once,
 * storing the results in `promptEmbeddings` (`prefix` plus one entry per prompt key).
 *
 * Texts are tokenized without special tokens.
 *
 * @returns {Promise<void>}
 */
async function precomputePromptEmbeddings() {
    // Tokenize a text and embed its token ids
    const embedText = (text) =>
        getEmbeddings(tokenizer.encode(text, { add_special_tokens: false }));

    promptEmbeddings.prefix = await embedText(PROMPT_PREFIX);

    // Embedded one at a time.
    // NOTE(review): presumably the embed session should not be run concurrently — confirm before parallelizing.
    const prompts = Object.entries(PROMPTS);
    for (const [key, text] of prompts) {
        promptEmbeddings[key] = await embedText(text);
    }

    // Report the number of prompts only; the `prefix` entry is not a prompt
    console.log('Pre-computed embeddings for', prompts.length, 'prompts');
}
|
| 181 |
+
|
| 182 |
+
// Session options - WebGPU only (no WASM fallback)
|
| 183 |
+
// Options shared by every inference session created in this file.
// WebGPU is the only execution provider listed, so there is no WASM fallback.
const sessionOptions = {
    executionProviders: ['webgpu'],
    graphOptimizationLevel: 'basic',
    enableCpuMemArena: false,
    enableMemPattern: false,
};
|
| 189 |
+
|
| 190 |
+
// Force garbage collection pause
|
| 191 |
+
/**
 * Yield to the browser for 100 ms so it has a chance to garbage collect.
 * This only waits; it cannot force a collection.
 * @returns {Promise<void>}
 */
async function gcPause() {
    const pauseMs = 100;
    await new Promise((resolve) => {
        setTimeout(resolve, pauseMs);
    });
}
|
| 195 |
+
|
| 196 |
+
// Load ONNX model with external data support
|
| 197 |
+
/**
 * Fetch an ONNX model and create an inference session for it, attaching the
 * companion external-data file when one exists next to the model.
 *
 * The external data URL is derived by replacing `.onnx` with `.onnx_data` in
 * `modelPath`. If that URL does not return a successful response, the model is
 * loaded on its own.
 *
 * @param {string} modelPath - URL of the `.onnx` file.
 * @param {object} options - Session options passed through to `ort.InferenceSession.create`.
 * @returns {Promise<object>} The created inference session.
 * @throws {Error} When the model file itself cannot be fetched (non-2xx response);
 *   without this check the HTTP error body would be handed to ONNX Runtime as a model.
 */
async function loadModelWithExternalData(modelPath, options) {
    const dataPath = modelPath.replace('.onnx', '.onnx_data');

    const modelResponse = await fetch(modelPath);
    if (!modelResponse.ok) {
        throw new Error(`Failed to fetch model ${modelPath}: HTTP ${modelResponse.status}`);
    }
    const modelBuffer = await modelResponse.arrayBuffer();

    const dataResponse = await fetch(dataPath);
    if (!dataResponse.ok) {
        // No external data available: load the model directly
        return await ort.InferenceSession.create(modelBuffer, options);
    }
    const dataBuffer = await dataResponse.arrayBuffer();

    // The external data entry is registered under the bare file name of the data URL
    const dataFileName = dataPath.split('/').pop();

    return await ort.InferenceSession.create(modelBuffer, {
        ...options,
        externalData: [{ path: dataFileName, data: dataBuffer }],
    });
}
|
| 229 |
+
|
| 230 |
+
// Initialize ONNX Runtime and load models
|
| 231 |
+
/**
 * Load the tokenizer and the three ONNX models, updating the status line and
 * progress bar along the way, then enable the input controls.
 *
 * Re-entrancy: `isModelLoading` is set on entry and only cleared on failure, so
 * a second call is a no-op while loading is in progress and after it succeeded.
 *
 * Sessions are created with the WebGPU-only options, so the function now stops
 * before downloading anything when `checkWebGPU()` reports that WebGPU is not
 * usable (previously the check result was ignored and the failure only
 * appeared after the model download).
 *
 * Errors are not rethrown: they are logged and shown in the status line.
 *
 * @returns {Promise<void>}
 */
async function initModels() {
    if (isModelLoading) return;
    isModelLoading = true;

    setStatus('loading', 'Loading models...');
    showProgress(true);

    try {
        const hasWebGPU = await checkWebGPU();
        if (!hasWebGPU) {
            throw new Error('WebGPU is not available in this browser');
        }

        updateProgress(10, 'Initializing ONNX Runtime...');

        updateProgress(15, 'Loading tokenizer...');
        tokenizer = await loadTokenizer();

        // Models are loaded one at a time, with pauses in between to give the
        // browser a chance to release memory from the previous download.
        updateProgress(20, 'Loading encoder model...');
        encoderSession = await loadModelWithExternalData(ENCODER_PATH, sessionOptions);

        await gcPause();

        updateProgress(40, 'Loading embed tokens model...');
        embedSession = await loadModelWithExternalData(EMBED_PATH, sessionOptions);

        // Needs the tokenizer and the embed session loaded above
        updateProgress(50, 'Pre-computing prompt embeddings...');
        await precomputePromptEmbeddings();

        await gcPause();

        updateProgress(60, 'Loading decoder model...');
        decoderSession = await loadModelWithExternalData(DECODER_PATH, sessionOptions);

        updateProgress(100, 'Models loaded!');
        showProgress(false);
        setStatus('ready', 'Ready - Record or upload audio');
        enableControls(true);
    } catch (error) {
        console.error('Model loading failed:', error);
        console.error('Error stack:', error?.stack);
        const errorMsg = error?.message || error?.toString() || 'Unknown error';
        setStatus('error', `Error: ${errorMsg}`);
        showProgress(false);
        // Allow another attempt after a failure
        isModelLoading = false;
    }
}
|
| 284 |
+
|
| 285 |
+
/**
 * Enable or disable the audio input controls (record button and file picker).
 * @param {boolean} enabled - Truthy to enable both controls, falsy to disable them.
 */
function enableControls(enabled) {
    const disabled = !enabled;
    recordBtn.disabled = disabled;
    audioFile.disabled = disabled;
}
|
| 289 |
+
|
| 290 |
+
// Mel spectrogram computation
|
| 291 |
+
// Uses custom implementation matching torchaudio
|
| 292 |
+
/**
 * Compute a log10 mel spectrogram of a mono signal.
 *
 * Steps: reflect-pad the signal by N_FFT / 2 on both sides, slide a frame of
 * N_FFT samples with stride HOP_LENGTH, apply a periodic Hann window of
 * WIN_LENGTH samples centred inside the frame (the rest of the frame stays
 * zero), take the power spectrum, apply the mel filterbank and take
 * log10 with a floor of 1e-10.
 *
 * @param {Float32Array} audioData - Input samples.
 * @returns {{data: Float32Array, numFrames: number, numMels: number}} Frame-major
 *   log-mel values (`data[frame * N_MELS + mel]`) with their dimensions.
 */
function computeMelSpectrogram(audioData) {
    const numSamples = audioData.length;
    const numBins = N_FFT / 2 + 1;

    // Reflect padding: padded position -(i+1) mirrors sample i+1, and the tail
    // mirrors around the last sample; source indices are clamped for very short inputs.
    const padLength = Math.floor(N_FFT / 2);
    const paddedLength = numSamples + 2 * padLength;
    const paddedAudio = new Float32Array(paddedLength);
    paddedAudio.set(audioData, padLength);
    for (let i = 0; i < padLength; i++) {
        paddedAudio[padLength - 1 - i] = audioData[Math.min(i + 1, numSamples - 1)];
        paddedAudio[padLength + numSamples + i] = audioData[Math.max(0, numSamples - 2 - i)];
    }

    const numFrames = Math.floor((paddedLength - N_FFT) / HOP_LENGTH) + 1;
    const melFilterbank = createMelFilterbank(N_FFT, N_MELS, SAMPLE_RATE);

    // Periodic Hann window
    const hann = Float32Array.from(
        { length: WIN_LENGTH },
        (_, i) => 0.5 * (1 - Math.cos(2 * Math.PI * i / WIN_LENGTH)),
    );

    // Offset of the window inside the N_FFT frame when WIN_LENGTH < N_FFT
    const windowOffset = Math.floor((N_FFT - WIN_LENGTH) / 2);

    const melSpec = new Float32Array(numFrames * N_MELS);

    for (let frame = 0; frame < numFrames; frame++) {
        const frameStart = frame * HOP_LENGTH + windowOffset;

        // Fresh zero-filled frame; only the centred window region is written
        const windowed = new Float32Array(N_FFT);
        for (let i = 0; i < WIN_LENGTH; i++) {
            windowed[windowOffset + i] = paddedAudio[frameStart + i] * hann[i];
        }

        const powerSpec = computePowerSpectrum(windowed);

        const outBase = frame * N_MELS;
        for (let m = 0; m < N_MELS; m++) {
            const filterRow = m * numBins;
            let energy = 0;
            for (let k = 0; k < numBins; k++) {
                energy += powerSpec[k] * melFilterbank[filterRow + k];
            }
            melSpec[outBase + m] = Math.log10(Math.max(energy, 1e-10));
        }
    }

    return { data: melSpec, numFrames, numMels: N_MELS };
}
|
| 358 |
+
|
| 359 |
+
// Create mel filterbank (torchaudio HTK style)
|
| 360 |
+
/**
 * Build a triangular mel filterbank on the HTK mel scale, spanning 0 Hz to sampleRate / 2.
 *
 * @param {number} nfft - FFT size; the filterbank covers nfft / 2 + 1 frequency bins.
 * @param {number} nMels - Number of mel filters.
 * @param {number} sampleRate - Sample rate the bin frequencies are derived from.
 * @returns {Float32Array} Row-major weights: `filterbank[m * (nfft / 2 + 1) + k]`
 *   is the weight of bin k in filter m.
 */
function createMelFilterbank(nfft, nMels, sampleRate) {
    const numBins = nfft / 2 + 1;

    // HTK mel scale conversions
    const toMel = (hz) => 2595 * Math.log10(1 + hz / 700);
    const toHz = (mel) => 700 * (Math.pow(10, mel / 2595) - 1);

    const melLow = toMel(0);
    const melHigh = toMel(sampleRate / 2);

    // nMels + 2 band edges, equally spaced in mel and converted back to Hz
    const edges = Float32Array.from(
        { length: nMels + 2 },
        (_, i) => toHz(melLow + (melHigh - melLow) * i / (nMels + 1)),
    );

    // Centre frequency of every FFT bin
    const binFreqs = Float32Array.from({ length: numBins }, (_, i) => i * sampleRate / nfft);

    // Distance between neighbouring band edges
    const edgeGaps = Float32Array.from({ length: nMels + 1 }, (_, i) => edges[i + 1] - edges[i]);

    // Each filter rises from edge m to edge m + 1 and falls to edge m + 2;
    // the weight is the lower of the two slopes, clipped at zero.
    const filterbank = new Float32Array(nMels * numBins);
    for (let m = 0; m < nMels; m++) {
        const rowStart = m * numBins;
        for (let k = 0; k < numBins; k++) {
            const rising = (binFreqs[k] - edges[m]) / edgeGaps[m];
            const falling = (edges[m + 2] - binFreqs[k]) / edgeGaps[m + 1];
            filterbank[rowStart + k] = Math.max(0, Math.min(rising, falling));
        }
    }

    return filterbank;
}
|
| 403 |
+
|
| 404 |
+
// Compute power spectrum using radix-2 FFT
|
| 405 |
+
/**
 * Compute the one-sided power spectrum |X[k]|^2 for k = 0 .. floor(n / 2).
 *
 * Power-of-two lengths are delegated to the radix-2 FFT; every other length
 * (including the empty signal, which yields a single zero bin) uses a direct DFT.
 *
 * The output always has floor(n / 2) + 1 bins. The earlier sizing `n / 2 + 1`
 * was fractional for odd n and got truncated, silently dropping the last bin.
 *
 * @param {Float32Array} signal - Real-valued input samples.
 * @returns {Float32Array} Power per frequency bin.
 */
function computePowerSpectrum(signal) {
    const n = signal.length;

    // n > 0 keeps the empty signal away from the FFT, where it would produce NaN
    if (n > 0 && (n & (n - 1)) === 0) {
        return computePowerSpectrumFFT(signal);
    }

    // Direct DFT for lengths that are not a power of two
    const numBins = Math.floor(n / 2) + 1;
    const spectrum = new Float32Array(numBins);
    for (let k = 0; k < numBins; k++) {
        let real = 0;
        let imag = 0;
        for (let t = 0; t < n; t++) {
            const angle = -2 * Math.PI * k * t / n;
            real += signal[t] * Math.cos(angle);
            imag += signal[t] * Math.sin(angle);
        }
        spectrum[k] = real * real + imag * imag;
    }
    return spectrum;
}
|
| 426 |
+
|
| 427 |
+
// Radix-2 FFT for power spectrum
|
| 428 |
+
/**
 * One-sided power spectrum of a real signal via an iterative radix-2
 * Cooley-Tukey FFT (bit-reversal reordering followed by butterfly stages).
 *
 * @param {Float32Array} signal - Real-valued samples; the length is expected to be a power of two.
 * @returns {Float32Array} |X[k]|^2 for k = 0 .. n / 2 (DC through Nyquist).
 */
function computePowerSpectrumFFT(signal) {
    const n = signal.length;
    const numBits = Math.log2(n);

    const re = new Float32Array(n);
    const im = new Float32Array(n);

    // Scatter the input into bit-reversed order; the imaginary part stays zero
    for (let src = 0; src < n; src++) {
        let dst = 0;
        for (let bit = 0, rest = src; bit < numBits; bit++, rest >>= 1) {
            dst = (dst << 1) | (rest & 1);
        }
        re[dst] = signal[src];
    }

    // Butterfly stages. Butterflies within one stage touch disjoint index pairs,
    // so iterating per twiddle factor gives the same values as iterating per group.
    for (let half = 1; half * 2 <= n; half *= 2) {
        const size = half * 2;
        const step = Math.PI / half;

        for (let j = 0; j < half; j++) {
            const angle = -j * step;
            const wRe = Math.cos(angle);
            const wIm = Math.sin(angle);

            for (let group = 0; group < n; group += size) {
                const top = group + j;
                const bottom = top + half;

                const tRe = wRe * re[bottom] - wIm * im[bottom];
                const tIm = wIm * re[bottom] + wRe * im[bottom];

                re[bottom] = re[top] - tRe;
                im[bottom] = im[top] - tIm;
                re[top] = re[top] + tRe;
                im[top] = im[top] + tIm;
            }
        }
    }

    // Keep bins 0 .. n / 2 only
    return Float32Array.from({ length: n / 2 + 1 }, (_, k) => re[k] * re[k] + im[k] * im[k]);
}
|
| 478 |
+
|
| 479 |
+
// Prepare audio features for encoder
|
| 480 |
+
/**
 * Turn raw audio into the feature tensor data fed to the encoder.
 *
 * 1. Compute the log10 mel spectrogram.
 * 2. Normalize every value as max(value, peak - 8) / 4 + 1, where peak is the
 *    largest value in the whole spectrogram.
 * 3. Drop the last frame when the frame count is odd.
 * 4. Stack each pair of consecutive frames into one row of 160 features.
 *
 * @param {Float32Array} audioData - Input samples.
 * @returns {{data: Float32Array, shape: number[]}} Features and their shape [1, rows, 160].
 */
function prepareAudioFeatures(audioData) {
    const { data: logmel, numFrames: totalFrames } = computeMelSpectrogram(audioData);

    // Global peak of the log-mel values
    let peak = -Infinity;
    for (const value of logmel) {
        if (value > peak) peak = value;
    }

    const normalized = Float32Array.from(logmel, (value) => Math.max(value, peak - 8) / 4 + 1);

    // Frames are consumed in pairs, so an odd trailing frame is discarded
    const usableFrames = totalFrames % 2 === 1 ? totalFrames - 1 : totalFrames;
    const stackedFrames = usableFrames / 2;

    // Row t holds frame 2t followed by frame 2t + 1
    const features = new Float32Array(stackedFrames * 160);
    for (let t = 0; t < stackedFrames; t++) {
        const srcBase = t * 2 * N_MELS;
        const dstBase = t * 160;
        for (let j = 0; j < 2 * N_MELS; j++) {
            features[dstBase + j] = normalized[srcBase + j];
        }
    }

    return { data: features, shape: [1, stackedFrames, 160] };
}
|
| 520 |
+
|
| 521 |
+
// Transcribe a single audio segment and return the text
|
| 522 |
+
async function transcribeSegment(audioSegment, onPartialResult) {
|
| 523 |
+
// Prepare audio features
|
| 524 |
+
const audioFeatures = prepareAudioFeatures(audioSegment);
|
| 525 |
+
|
| 526 |
+
// Run encoder
|
| 527 |
+
const encoderInput = new ort.Tensor('float32', audioFeatures.data, audioFeatures.shape);
|
| 528 |
+
const encoderOutput = await encoderSession.run({ input_features: encoderInput });
|
| 529 |
+
const audioEmbeddings = encoderOutput.audio_features;
|
| 530 |
+
|
| 531 |
+
// Get pre-computed prompt embeddings
|
| 532 |
+
const prefixEmbed = promptEmbeddings.prefix;
|
| 533 |
+
const suffixEmbed = promptEmbeddings[promptSelect.value] || promptEmbeddings['transcribe'];
|
| 534 |
+
|
| 535 |
+
// Concatenate embeddings using TypedArray.set()
|
| 536 |
+
const prefixSeqLen = prefixEmbed.seqLen;
|
| 537 |
+
const audioSeqLen = audioEmbeddings.dims[1];
|
| 538 |
+
const suffixSeqLen = suffixEmbed.seqLen;
|
| 539 |
+
const totalSeqLen = prefixSeqLen + audioSeqLen + suffixSeqLen;
|
| 540 |
+
|
| 541 |
+
const combinedEmbeds = new Float32Array(totalSeqLen * HIDDEN_SIZE);
|
| 542 |
+
combinedEmbeds.set(prefixEmbed.data, 0);
|
| 543 |
+
combinedEmbeds.set(new Float32Array(audioEmbeddings.data), prefixSeqLen * HIDDEN_SIZE);
|
| 544 |
+
combinedEmbeds.set(suffixEmbed.data, (prefixSeqLen + audioSeqLen) * HIDDEN_SIZE);
|
| 545 |
+
|
| 546 |
+
// Autoregressive generation
|
| 547 |
+
let generatedTokens = [];
|
| 548 |
+
let currentEmbeds = combinedEmbeds;
|
| 549 |
+
let currentSeqLen = totalSeqLen;
|
| 550 |
+
let pastKeyValues = null;
|
| 551 |
+
const numLayers = 40;
|
| 552 |
+
let totalSeqLenSoFar = totalSeqLen;
|
| 553 |
+
|
| 554 |
+
for (let step = 0; step < MAX_NEW_TOKENS; step++) {
|
| 555 |
+
const attentionMask = new BigInt64Array(totalSeqLenSoFar).fill(1n);
|
| 556 |
+
|
| 557 |
+
const embedsTensor = new ort.Tensor('float32', currentEmbeds, [1, currentSeqLen, HIDDEN_SIZE]);
|
| 558 |
+
const maskTensor = new ort.Tensor('int64', attentionMask, [1, totalSeqLenSoFar]);
|
| 559 |
+
|
| 560 |
+
const decoderInputs = {
|
| 561 |
+
inputs_embeds: embedsTensor,
|
| 562 |
+
attention_mask: maskTensor,
|
| 563 |
+
};
|
| 564 |
+
|
| 565 |
+
if (pastKeyValues) {
|
| 566 |
+
for (let i = 0; i < numLayers; i++) {
|
| 567 |
+
decoderInputs[`past_key_values.${i}.key`] = pastKeyValues[`present.${i}.key`];
|
| 568 |
+
decoderInputs[`past_key_values.${i}.value`] = pastKeyValues[`present.${i}.value`];
|
| 569 |
+
}
|
| 570 |
+
} else {
|
| 571 |
+
const emptyPast = new Uint16Array(0);
|
| 572 |
+
for (let i = 0; i < numLayers; i++) {
|
| 573 |
+
decoderInputs[`past_key_values.${i}.key`] = new ort.Tensor('float16', emptyPast, [1, 4, 0, 128]);
|
| 574 |
+
decoderInputs[`past_key_values.${i}.value`] = new ort.Tensor('float16', emptyPast, [1, 4, 0, 128]);
|
| 575 |
+
}
|
| 576 |
+
}
|
| 577 |
+
|
| 578 |
+
const decoderOutput = await decoderSession.run(decoderInputs);
|
| 579 |
+
pastKeyValues = decoderOutput;
|
| 580 |
+
|
| 581 |
+
const logitsFloat32 = Float32Array.from(decoderOutput.logits.data);
|
| 582 |
+
|
| 583 |
+
// Extract logits for last position and find argmax
|
| 584 |
+
const logitOffset = (currentSeqLen - 1) * VOCAB_SIZE;
|
| 585 |
+
const lastLogits = logitsFloat32.subarray(logitOffset, logitOffset + VOCAB_SIZE);
|
| 586 |
+
|
| 587 |
+
let nextToken = 0, maxVal = lastLogits[0];
|
| 588 |
+
for (let i = 1; i < VOCAB_SIZE; i++) {
|
| 589 |
+
if (lastLogits[i] > maxVal) { maxVal = lastLogits[i]; nextToken = i; }
|
| 590 |
+
}
|
| 591 |
+
|
| 592 |
+
// Avoid EOS on first token - take second best
|
| 593 |
+
if (step === 0 && nextToken === EOS_TOKEN) {
|
| 594 |
+
nextToken = 0; maxVal = -Infinity;
|
| 595 |
+
for (let i = 0; i < VOCAB_SIZE; i++) {
|
| 596 |
+
if (i !== EOS_TOKEN && lastLogits[i] > maxVal) { maxVal = lastLogits[i]; nextToken = i; }
|
| 597 |
+
}
|
| 598 |
+
}
|
| 599 |
+
|
| 600 |
+
if (nextToken === EOS_TOKEN) {
|
| 601 |
+
break;
|
| 602 |
+
}
|
| 603 |
+
|
| 604 |
+
generatedTokens.push(nextToken);
|
| 605 |
+
|
| 606 |
+
// Callback for streaming updates
|
| 607 |
+
if (onPartialResult) {
|
| 608 |
+
onPartialResult(tokenizer.decode(generatedTokens));
|
| 609 |
+
}
|
| 610 |
+
|
| 611 |
+
const nextTokenTensor = new ort.Tensor('int64', BigInt64Array.from([BigInt(nextToken)]), [1, 1]);
|
| 612 |
+
const nextEmbedOutput = await embedSession.run({ input_ids: nextTokenTensor });
|
| 613 |
+
currentEmbeds = new Float32Array(nextEmbedOutput.inputs_embeds.data);
|
| 614 |
+
currentSeqLen = 1;
|
| 615 |
+
totalSeqLenSoFar += 1;
|
| 616 |
+
}
|
| 617 |
+
|
| 618 |
+
return tokenizer.decode(generatedTokens);
|
| 619 |
+
}
|
| 620 |
+
|
| 621 |
+
/**
 * Wait until the shared audio player reaches a playback position.
 *
 * The position is polled once per animation frame. The promise also settles
 * as soon as the player reports that it is paused, so callers never wait on
 * audio that is not advancing.
 *
 * @param {number} targetTime - Playback position to wait for, in seconds.
 * @returns {Promise<void>} Settles when the position is reached or playback is paused.
 */
function waitForPlaybackTime(targetTime) {
    const isDone = () => audioPlayer.paused || audioPlayer.currentTime >= targetTime;

    return new Promise((finish) => {
        (function poll() {
            if (!isDone()) {
                requestAnimationFrame(poll);
                return;
            }
            finish();
        })();
    });
}
|
| 634 |
+
|
| 635 |
+
// Run inference with segmentation and audio sync
/**
 * Transcribe the currently loaded audio one VAD segment at a time while the
 * audio plays, adding one timestamped transcript row per segment.
 *
 * Each segment is held back until the wall-clock time elapsed since playback
 * started reaches the segment's start, so text appears roughly in sync with
 * the audio. Setting `transcriptionAborted` stops the run before the next
 * segment is processed.
 *
 * Transcript rows are built as DOM nodes and filled through `textContent`,
 * so recognised text is always shown literally and never parsed as markup.
 *
 * @returns {Promise<void>} Settles when the run finished, failed or was aborted.
 */
async function transcribe() {
    if (!encoderSession || !embedSession || !decoderSession || !currentAudioData) {
        setStatus('error', 'Model or audio not ready');
        return;
    }

    // Work on the clip that was loaded when the run started, even if the
    // module-level reference is cleared or replaced while we are awaiting.
    const audioData = currentAudioData;

    setStatus('processing', 'Processing audio...');
    transcribeBtn.disabled = true;
    transcriptionAborted = false;
    outputText.textContent = '';
    transcriptCard.style.display = 'block';
    showProgress(true);

    // Append an empty transcript row and return its parts.
    const createRow = (ts) => {
        const row = document.createElement('div');
        row.className = 'transcript-row';

        const stamp = document.createElement('span');
        stamp.className = 'timestamp';
        stamp.textContent = ts;

        const body = document.createElement('span');
        body.className = 'transcript-text';

        row.append(stamp, body);
        outputText.appendChild(row);
        return { row, body };
    };

    try {
        // Get speech segments using VAD
        updateProgress(5, 'Detecting speech segments...');
        const segments = await getSpeechSegments(audioData, SAMPLE_RATE);
        console.log(`VAD found ${segments.length} segment(s)`);

        // Start audio playback immediately. play() may reject (e.g. autoplay
        // policy); transcription can still proceed without sound.
        audioPlayer.currentTime = 0;
        Promise.resolve(audioPlayer.play()).catch((error) => {
            console.warn('Audio playback could not start:', error);
        });
        playBtn.querySelector('.play-icon').style.display = 'none';
        playBtn.querySelector('.pause-icon').style.display = 'block';
        const playbackStartTime = performance.now() / 1000;

        // Process and display segments in sync with audio
        let displayedCount = 0;
        const totalSegments = segments.length;

        for (let segIdx = 0; segIdx < totalSegments; segIdx++) {
            if (transcriptionAborted) break;

            const seg = segments[segIdx];

            // Update progress bar
            updateProgress(((segIdx + 1) / totalSegments) * 100, '');

            // Wait for audio to reach this segment's start time
            const elapsed = (performance.now() / 1000) - playbackStartTime;
            const waitTime = seg.start - elapsed;
            if (waitTime > 0) {
                await new Promise((resolve) => setTimeout(resolve, waitTime * 1000));
                // The user may have stopped the run while we were waiting.
                if (transcriptionAborted) break;
            }

            setStatus('processing', `Segment ${segIdx + 1}/${totalSegments}`);

            // Extract and transcribe this segment
            const startSample = Math.floor(seg.start * SAMPLE_RATE);
            const endSample = Math.floor(seg.end * SAMPLE_RATE);
            const audioSegment = audioData.slice(startSample, endSample);
            const timestamp = formatTimestamp(seg.start);

            // The row is created lazily on the first partial result so that
            // silent segments never leave an empty row behind.
            let current = null;
            const segmentText = await transcribeSegment(audioSegment, (partial) => {
                if (!current) current = createRow(timestamp);
                current.body.textContent = partial;
                outputText.scrollTop = outputText.scrollHeight;
            });

            let finalSegmentText = segmentText.trim();
            if (!finalSegmentText) {
                if (current) current.row.remove();
                continue;
            }

            // Auto-detect language and apply punctuation if supported
            if (punctuationCheckbox.checked) {
                const detectedLang = detect(finalSegmentText);
                // The multilingual list is published on window by another
                // script; tolerate it being absent.
                const supportedLangs = ['en', ...(window.MULTILINGUAL_PUNCT_LANGS ?? [])];
                if (supportedLangs.includes(detectedLang)) {
                    // Strip existing punctuation before applying punctuation model
                    const stripped = finalSegmentText.replace(/[.,!?]/g, ' ').replace(/\s+/g, ' ').trim();
                    finalSegmentText = await applyPunctuation(stripped, detectedLang);
                    // Replace unknown tokens from punctuator with spaces
                    finalSegmentText = finalSegmentText.replace(/<unk>/gi, ' ').replace(/\s+/g, ' ').trim();
                }
            }

            if (!current) current = createRow(timestamp);
            current.body.textContent = finalSegmentText;
            displayedCount += 1;
            outputText.scrollTop = outputText.scrollHeight;
        }

        // Final output. When the run was aborted, whoever aborted it has
        // already reported that, so neither the status nor the transcript is
        // overwritten with a "finished" state.
        if (!transcriptionAborted) {
            if (displayedCount === 0) {
                outputText.innerHTML = '<span style="color: #94a3b8;">(No speech detected)</span>';
            }
            setStatus('ready', 'Transcription complete');
        }
        copyBtn.disabled = false;
        showProgress(false);
    } catch (error) {
        console.error('Transcription failed:', error);
        setStatus('error', `Error: ${error.message}`);
        showProgress(false);
    } finally {
        // Only offer another run while there is still audio to transcribe.
        transcribeBtn.disabled = !currentAudioData;
    }
}
|
| 738 |
+
|
| 739 |
+
// Audio recording
// Set to true by startRecording() once capture has begun and back to false
// by stopRecording().
let isRecording = false;

/**
 * Click handler for the record button: stops the capture in progress, or
 * starts a new one when nothing is being recorded.
 */
function toggleRecording() {
    const action = isRecording ? stopRecording : startRecording;
    action();
}
|
| 749 |
+
|
| 750 |
+
/**
 * Ask for microphone access and start capturing audio with MediaRecorder.
 *
 * When the recorder stops, the captured chunks are combined into a blob that
 * is handed to the audio player and decoded for transcription. The
 * microphone is released as soon as the recorder stops, and also when setup
 * fails after permission was granted.
 *
 * @returns {Promise<void>}
 */
async function startRecording() {
    let stream = null;
    const releaseStream = () => {
        if (stream) stream.getTracks().forEach((track) => track.stop());
    };

    try {
        stream = await navigator.mediaDevices.getUserMedia({ audio: true });

        const recorder = new MediaRecorder(stream);
        mediaRecorder = recorder;
        audioChunks = [];

        recorder.ondataavailable = (event) => {
            audioChunks.push(event.data);
        };

        recorder.onstop = async () => {
            // Nothing more is read from the device once the recorder has
            // stopped, so free it before the (possibly failing) processing.
            releaseStream();

            // Label the blob with the container the recorder actually used.
            const audioBlob = new Blob(audioChunks, { type: recorder.mimeType });

            const previousUrl = audioPlayer.src;
            audioPlayer.src = URL.createObjectURL(audioBlob);
            if (previousUrl && previousUrl.startsWith('blob:')) {
                URL.revokeObjectURL(previousUrl);
            }
            audioPreview.style.display = 'flex';
            transcribeSection.style.display = 'flex';

            await processAudioBlob(audioBlob);
            drawWaveform();
            updateAudioTime();
        };

        recorder.start();
        isRecording = true;
        setStatus('recording', 'Recording...');

        // Update button UI
        recordBtn.querySelector('.mic-icon').style.display = 'none';
        recordBtn.querySelector('.stop-icon').style.display = 'block';
        recordBtn.querySelector('span').textContent = 'Stop';
        recordBtn.classList.add('recording');
    } catch (error) {
        releaseStream();
        console.error('Recording failed:', error);
        const denied = error?.name === 'NotAllowedError' || error?.name === 'SecurityError';
        setStatus(
            'error',
            denied ? 'Microphone access denied' : `Recording failed: ${error?.message ?? error}`,
        );
    }
}
|
| 789 |
+
|
| 790 |
+
/**
 * Stop the capture in progress and put the record button back into its idle
 * state.
 *
 * The flag and the button are reset even when the recorder is already
 * inactive (it may have stopped without this function being called), so the
 * UI can never stay stuck on "Stop".
 */
function stopRecording() {
    const recorderActive = Boolean(mediaRecorder) && mediaRecorder.state !== 'inactive';
    if (!recorderActive && !isRecording) return;

    if (recorderActive) {
        mediaRecorder.stop();
    }
    isRecording = false;
    setStatus('ready', 'Recording stopped - Click Transcribe');

    // Update button UI
    recordBtn.querySelector('.mic-icon').style.display = 'block';
    recordBtn.querySelector('.stop-icon').style.display = 'none';
    recordBtn.querySelector('span').textContent = 'Record';
    recordBtn.classList.remove('recording');
}
|
| 803 |
+
|
| 804 |
+
/**
 * Decode an audio file/blob into mono Float32 samples at SAMPLE_RATE and
 * store them in `currentAudioData`.
 *
 * The temporary AudioContext used for decoding is always closed again. On
 * failure the stored samples are cleared and the Transcribe button is
 * disabled, so a later run cannot transcribe a previously loaded clip.
 *
 * @param {Blob} blob - Audio file or recorded blob to decode.
 * @returns {Promise<boolean>} true when the audio was decoded, false otherwise.
 */
async function processAudioBlob(blob) {
    let audioCtx = null;
    try {
        const arrayBuffer = await blob.arrayBuffer();
        audioCtx = new AudioContext({ sampleRate: SAMPLE_RATE });
        const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);

        // Convert to mono Float32Array (average of the first two channels)
        let audioData;
        if (audioBuffer.numberOfChannels > 1) {
            const left = audioBuffer.getChannelData(0);
            const right = audioBuffer.getChannelData(1);
            audioData = new Float32Array(left.length);
            for (let i = 0; i < left.length; i++) {
                audioData[i] = (left[i] + right[i]) / 2;
            }
        } else {
            audioData = audioBuffer.getChannelData(0);
        }

        // Resample if needed
        if (audioBuffer.sampleRate !== SAMPLE_RATE) {
            audioData = resample(audioData, audioBuffer.sampleRate, SAMPLE_RATE);
        }

        currentAudioData = audioData;
        transcribeBtn.disabled = false;
        return true;
    } catch (error) {
        console.error('Audio processing failed:', error);
        currentAudioData = null;
        transcribeBtn.disabled = true;
        setStatus('error', 'Failed to process audio');
        return false;
    } finally {
        // Each context holds audio resources; release it once decoding is done.
        if (audioCtx && audioCtx.state !== 'closed') {
            try {
                await audioCtx.close();
            } catch (closeError) {
                console.warn('Failed to close AudioContext:', closeError);
            }
        }
    }
}
|
| 837 |
+
|
| 838 |
+
/**
 * Resample audio by linear interpolation between neighbouring samples.
 *
 * @param {Float32Array} audioData - Input samples.
 * @param {number} fromRate - Sample rate of `audioData`, in Hz.
 * @param {number} toRate - Desired sample rate, in Hz.
 * @returns {Float32Array} A new array holding the resampled signal.
 * @throws {RangeError} If either rate is not a positive finite number.
 */
function resample(audioData, fromRate, toRate) {
    const isValidRate = (rate) => Number.isFinite(rate) && rate > 0;
    if (!isValidRate(fromRate) || !isValidRate(toRate)) {
        throw new RangeError(`Invalid sample rates: ${fromRate} -> ${toRate}`);
    }

    const ratio = fromRate / toRate;
    const newLength = Math.round(audioData.length / ratio);
    const result = new Float32Array(newLength);
    const lastIndex = audioData.length - 1;

    for (let i = 0; i < newLength; i++) {
        const srcIndex = i * ratio;
        const lower = Math.floor(srcIndex);
        // Clamp so the final output sample never reads past the input.
        const upper = Math.min(lower + 1, lastIndex);
        const t = srcIndex - lower;
        result[i] = audioData[lower] * (1 - t) + audioData[upper] * t;
    }

    return result;
}
|
| 854 |
+
|
| 855 |
+
/**
 * `change` handler for the file input: loads the first selected file, if any.
 *
 * @param {Event} event - Change event whose target is the file input.
 * @returns {Promise<void>}
 */
async function handleFileUpload(event) {
    const selected = event.target.files[0];
    if (selected) {
        await loadAudioFile(selected);
    }
}
|
| 861 |
+
|
| 862 |
+
/**
 * `drop` handler for the input card: loads the first dropped file when it is
 * an audio file.
 *
 * A file counts as audio when its MIME type starts with `audio/`. Browsers
 * leave `File.type` empty when they cannot determine the type, so in that
 * case a well-known audio file extension is accepted instead.
 *
 * @param {DragEvent} event - The drop event.
 * @returns {Promise<void>}
 */
async function handleFileDrop(event) {
    event.preventDefault();
    inputCard.classList.remove('drag-over');

    const audioExtension = /\.(wav|mp3|m4a|aac|ogg|oga|opus|flac|webm|weba)$/i;
    const file = event.dataTransfer.files[0];
    const looksLikeAudio = Boolean(file) && (
        file.type.startsWith('audio/')
        || (file.type === '' && audioExtension.test(file.name))
    );

    if (!looksLikeAudio) {
        setStatus('error', 'Please drop an audio file');
        return;
    }
    await loadAudioFile(file);
}
|
| 874 |
+
|
| 875 |
+
/**
 * Load an audio file into the player and decode it for transcription.
 * Shared by the upload and drag-and-drop paths.
 *
 * The object URL of the previously loaded clip is revoked so repeated loads
 * do not accumulate blob URLs. If decoding fails, the error status reported
 * by processAudioBlob() is left in place instead of being replaced by
 * "Audio loaded".
 *
 * @param {Blob} file - The audio file to load.
 * @returns {Promise<void>}
 */
async function loadAudioFile(file) {
    setStatus('processing', 'Processing audio file...');

    const previousUrl = audioPlayer.src;
    audioPlayer.src = URL.createObjectURL(file);
    if (previousUrl && previousUrl.startsWith('blob:')) {
        URL.revokeObjectURL(previousUrl);
    }
    audioPreview.style.display = 'flex';
    transcribeSection.style.display = 'flex';

    const previousData = currentAudioData;
    const outcome = await processAudioBlob(file);

    // A successful decode always stores a freshly decoded array, so an
    // unchanged or empty reference (or an explicit `false`) means it failed.
    const decoded = outcome !== false
        && currentAudioData !== null
        && currentAudioData !== previousData;
    if (!decoded) return;

    drawWaveform();
    updateAudioTime();
    setStatus('ready', 'Audio loaded - Click Transcribe');
}
|
| 889 |
+
|
| 890 |
+
/**
 * Draw the loaded audio as a bar chart on the waveform canvas.
 *
 * The canvas backing store is sized to its on-screen box times the device
 * pixel ratio. One 2px bar is drawn every 3px; each bar shows the mean
 * absolute amplitude of its slice of samples, scaled so the loudest bar
 * fills 90% of the canvas height. Does nothing when no audio is loaded.
 */
function drawWaveform() {
    if (!currentAudioData) return;

    const canvas = waveformCanvas;
    const ctx = canvas.getContext('2d');
    const dpr = window.devicePixelRatio || 1;

    // Set canvas size (assigning width/height also clears the canvas)
    const rect = canvas.getBoundingClientRect();
    canvas.width = rect.width * dpr;
    canvas.height = rect.height * dpr;
    ctx.scale(dpr, dpr);

    const width = rect.width;
    const height = rect.height;
    const centerY = height / 2;

    // Downsample audio data for visualization
    const samples = currentAudioData;
    const barCount = Math.floor(width / 3);
    if (barCount === 0) return; // canvas too narrow (or hidden): nothing to draw

    // At least one sample per bar; otherwise clips shorter than the bar
    // count would divide by zero below and every bar would become NaN.
    const samplesPerBar = Math.max(1, Math.floor(samples.length / barCount));

    // Calculate bar amplitudes and the maximum used for normalization
    const barAmplitudes = new Array(barCount);
    let maxAmp = 0.01; // floor so silent audio does not divide by zero
    for (let i = 0; i < barCount; i++) {
        let sum = 0;
        const start = i * samplesPerBar;
        for (let j = 0; j < samplesPerBar; j++) {
            // Positions past the end of the clip count as silence.
            sum += Math.abs(samples[start + j] || 0);
        }
        const amplitude = sum / samplesPerBar;
        barAmplitudes[i] = amplitude;
        if (amplitude > maxAmp) maxAmp = amplitude;
    }

    // Get color based on color scheme
    const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
    ctx.fillStyle = isDark ? '#64748b' : '#cbd5e1';

    // Draw bars normalized to fill height
    for (let i = 0; i < barCount; i++) {
        const normalized = barAmplitudes[i] / maxAmp;
        const barHeight = Math.max(2, normalized * height * 0.9);

        ctx.fillRect(i * 3, centerY - barHeight / 2, 2, barHeight);
    }
}
|
| 939 |
+
|
| 940 |
+
/**
 * Format a duration as `M:SS` (minutes are not capped at 59).
 *
 * Values that cannot be displayed meaningfully (NaN, Infinity and negative
 * numbers) are shown as `0:00` instead of text such as "Infinity:NaN".
 *
 * @param {number} seconds - Duration in seconds; fractions are dropped.
 * @returns {string} The formatted time.
 */
function formatTime(seconds) {
    const total = Number.isFinite(seconds) && seconds > 0 ? Math.floor(seconds) : 0;
    const mins = Math.floor(total / 60);
    const secs = total % 60;
    return `${mins}:${String(secs).padStart(2, '0')}`;
}
|
| 946 |
+
|
| 947 |
+
/**
 * Refresh the "elapsed / total" label and the waveform progress overlay.
 *
 * The player's own duration is used when it is a positive finite number.
 * A player can report Infinity for media whose length it cannot determine
 * (seen with some freshly recorded blobs — TODO confirm per browser); the
 * length of the decoded samples is used as the total in that case. While no
 * usable duration is known, only the decoded clip length is shown.
 */
function updateAudioTime() {
    const current = audioPlayer.currentTime || 0;
    const mediaDuration = audioPlayer.duration;
    const decodedDuration = currentAudioData ? currentAudioData.length / SAMPLE_RATE : 0;

    let duration = 0;
    if (Number.isFinite(mediaDuration) && mediaDuration > 0) {
        duration = mediaDuration;
    } else if (mediaDuration === Infinity) {
        duration = decodedDuration;
    }

    if (duration > 0) {
        // Clamp so rounding differences between the two duration sources
        // can never push the overlay past the end of the waveform.
        const fraction = Math.min(1, current / duration);
        audioTime.textContent = `${formatTime(current)} / ${formatTime(duration)}`;
        waveformProgress.style.width = `${fraction * 100}%`;
    } else {
        audioTime.textContent = formatTime(decodedDuration);
    }
}
|
| 958 |
+
|
| 959 |
+
/**
 * Click handler for the play button: toggles playback and keeps the
 * play/pause icon in step with it.
 *
 * Pausing while a transcription run is in progress also aborts that run.
 * The Transcribe button is deliberately left disabled here: transcribe()
 * re-enables it itself once its loop has actually exited, which prevents a
 * second run from starting while the first one is still awaiting a segment.
 */
function togglePlayback() {
    const showPlaying = (playing) => {
        playBtn.querySelector('.play-icon').style.display = playing ? 'none' : 'block';
        playBtn.querySelector('.pause-icon').style.display = playing ? 'block' : 'none';
    };

    if (audioPlayer.paused) {
        showPlaying(true);
        // play() returns a promise that rejects when playback cannot start;
        // put the play icon back in that case instead of leaving it stale.
        Promise.resolve(audioPlayer.play()).catch((error) => {
            console.error('Playback failed:', error);
            showPlaying(false);
        });
        return;
    }

    audioPlayer.pause();
    showPlaying(false);

    // Stop transcription if running
    if (!transcriptionAborted && transcribeBtn.disabled) {
        transcriptionAborted = true;
        showProgress(false);
        setStatus('ready', 'Transcription stopped');
    }
}
|
| 978 |
+
|
| 979 |
+
/**
 * Click handler for the waveform: jump to the clicked horizontal position.
 *
 * Nothing happens while the player has no usable duration (NaN before the
 * media is loaded, or Infinity), because assigning a non-finite value to
 * `currentTime` throws. The click position is clamped to the canvas.
 *
 * @param {MouseEvent} event - The click event on the waveform canvas.
 */
function seekAudio(event) {
    const duration = audioPlayer.duration;
    if (!Number.isFinite(duration) || duration <= 0) return;

    const rect = waveformCanvas.getBoundingClientRect();
    if (!(rect.width > 0)) return;

    const fraction = Math.min(1, Math.max(0, (event.clientX - rect.left) / rect.width));
    audioPlayer.currentTime = fraction * duration;
    updateAudioTime();
}
|
| 987 |
+
|
| 988 |
+
/**
 * Copy the transcript to the clipboard as plain text, one line per
 * transcript row in the form "<timestamp> <text>".
 *
 * The rows are read element by element because the container's plain
 * `textContent` runs every timestamp and sentence together without any
 * separator. When there are no rows, the container text is copied as is.
 *
 * @returns {Promise<void>}
 */
async function copyToClipboard() {
    const rows = outputText.querySelectorAll('.transcript-row');
    const text = rows.length > 0
        ? Array.from(rows, (row) => Array.from(row.children, (cell) => cell.textContent.trim())
            .filter(Boolean)
            .join(' ')).join('\n')
        : outputText.textContent;

    try {
        await navigator.clipboard.writeText(text);
        // Brief visual feedback via title attribute
        const originalTitle = copyBtn.title;
        copyBtn.title = 'Copied!';
        setTimeout(() => {
            copyBtn.title = originalTitle;
        }, 2000);
    } catch (error) {
        console.error('Copy failed:', error);
    }
}
|
| 1002 |
+
|
| 1003 |
+
/**
 * Download the transcript as `transcript.txt`, one line per transcript row
 * in the form "<timestamp> <text>".
 *
 * The text is taken from the row elements' `textContent`, so each row ends
 * up on its own line and characters such as `<` or `&` are written
 * literally rather than as HTML entities. Does nothing when the transcript
 * is empty.
 */
function downloadTranscript() {
    const rows = outputText.querySelectorAll('.transcript-row');
    const text = rows.length > 0
        ? Array.from(rows, (row) => Array.from(row.children, (cell) => cell.textContent.trim())
            .filter(Boolean)
            .join(' ')).join('\n')
        : outputText.textContent.trim();
    if (!text) return;

    const blob = new Blob([text], { type: 'text/plain' });
    const url = URL.createObjectURL(blob);
    const link = document.createElement('a');
    link.href = url;
    link.download = 'transcript.txt';
    // The link has to be in the document for the synthetic click to work
    // in every browser; it is removed again right away.
    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);
    URL.revokeObjectURL(url);

    // Brief visual feedback
    const originalTitle = downloadBtn.title;
    downloadBtn.title = 'Downloaded!';
    setTimeout(() => {
        downloadBtn.title = originalTitle;
    }, 2000);
}
|
| 1027 |
+
|
| 1028 |
+
/**
 * Discard the loaded audio and return the UI to its initial state.
 *
 * Besides hiding the player and the transcript this also
 *  - asks a running transcription to stop (it checks `transcriptionAborted`
 *    between segments),
 *  - releases the object URL the player was using, and
 *  - puts the play button back to its "play" icon, since playback is paused
 *    here directly rather than through togglePlayback().
 */
function clearAudio() {
    // Stop any running transcription; transcribe() resets the flag when the
    // next run starts.
    transcriptionAborted = true;

    // Stop any playback and detach the source. Removing the attribute and
    // calling load() resets the element without making it try to load an
    // empty URL.
    audioPlayer.pause();
    const previousUrl = audioPlayer.src;
    audioPlayer.removeAttribute('src');
    audioPlayer.load();
    if (previousUrl && previousUrl.startsWith('blob:')) {
        URL.revokeObjectURL(previousUrl);
    }
    playBtn.querySelector('.play-icon').style.display = 'block';
    playBtn.querySelector('.pause-icon').style.display = 'none';

    // Reset audio state
    currentAudioData = null;

    // Hide audio player and transcribe section
    audioPreview.style.display = 'none';
    transcribeSection.style.display = 'none';

    // Clear transcript
    transcriptCard.style.display = 'none';
    outputText.textContent = '';

    // Reset waveform
    waveformProgress.style.width = '0%';
    const ctx = waveformCanvas.getContext('2d');
    ctx.clearRect(0, 0, waveformCanvas.width, waveformCanvas.height);

    // Reset time display
    audioTime.textContent = '0:00';

    // Reset buttons
    transcribeBtn.disabled = true;

    // Reset file input so selecting the same file again fires `change`
    audioFile.value = '';

    // Update status
    setStatus('ready', 'Ready');
}
|
| 1061 |
+
|
| 1062 |
+
// Event wiring. Every listener of the page is registered from one table, in
// the order given: recording / upload, audio player controls, transcript
// actions, drag and drop on the input card, and finally model
// initialization once the page has loaded.
{
    // Marks / unmarks the input card as a drop target.
    const setDropHighlight = (active) => (event) => {
        event.preventDefault();
        inputCard.classList.toggle('drag-over', active);
    };

    // When playback finishes, show the play icon again and clear the overlay.
    const resetPlayerUi = () => {
        playBtn.querySelector('.play-icon').style.display = 'block';
        playBtn.querySelector('.pause-icon').style.display = 'none';
        waveformProgress.style.width = '0%';
    };

    const bindings = [
        [recordBtn, 'click', toggleRecording],
        [audioFile, 'change', handleFileUpload],

        // Audio player controls
        [playBtn, 'click', togglePlayback],
        [waveformCanvas, 'click', seekAudio],
        [audioPlayer, 'timeupdate', updateAudioTime],
        [audioPlayer, 'ended', resetPlayerUi],

        // Redraw waveform on resize
        [window, 'resize', drawWaveform],
        [transcribeBtn, 'click', transcribe],
        [copyBtn, 'click', copyToClipboard],
        [downloadBtn, 'click', downloadTranscript],
        [clearBtn, 'click', clearAudio],

        // Drag and drop on input card
        [inputCard, 'dragover', setDropHighlight(true)],
        [inputCard, 'dragleave', setDropHighlight(false)],
        [inputCard, 'drop', handleFileDrop],

        // Initialize on load
        [window, 'load', initModels],
    ];

    for (const [target, type, listener] of bindings) {
        target.addEventListener(type, listener);
    }
}
|
index.html
CHANGED
|
@@ -1,19 +1,135 @@
|
|
| 1 |
-
<!
|
| 2 |
-
<html>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
</html>
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>Granite Speech WebGPU</title>
|
| 7 |
+
<link rel="stylesheet" href="style.css?v=3">
|
| 8 |
+
</head>
|
| 9 |
+
<body>
|
| 10 |
+
<div class="container">
|
| 11 |
+
<!-- Header -->
|
| 12 |
+
<h1>Granite Speech WebGPU</h1>
|
| 13 |
+
<h2>Speech recognition and translation directly in your browser</h2>
|
| 14 |
+
|
| 15 |
+
<!-- Audio Input Card -->
|
| 16 |
+
<div class="input-card">
|
| 17 |
+
<div class="input-options">
|
| 18 |
+
<button id="recordBtn" class="input-tile" disabled>
|
| 19 |
+
<svg class="mic-icon" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
|
| 20 |
+
<path stroke-linecap="round" stroke-linejoin="round" d="M12 18.75a6 6 0 006-6v-1.5m-6 7.5a6 6 0 01-6-6v-1.5m6 7.5v3.75m-3.75 0h7.5M12 15.75a3 3 0 01-3-3V4.5a3 3 0 116 0v8.25a3 3 0 01-3 3z" />
|
| 21 |
+
</svg>
|
| 22 |
+
<svg class="stop-icon" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" style="display: none;">
|
| 23 |
+
<rect x="6" y="6" width="12" height="12" rx="1" />
|
| 24 |
+
</svg>
|
| 25 |
+
<span>Record</span>
|
| 26 |
+
</button>
|
| 27 |
+
<div class="divider"></div>
|
| 28 |
+
<label class="input-tile file-label">
|
| 29 |
+
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
|
| 30 |
+
<path stroke-linecap="round" stroke-linejoin="round" d="M3.75 9.776c.112-.017.227-.026.344-.026h15.812c.117 0 .232.009.344.026m-16.5 0a2.25 2.25 0 00-1.883 2.542l.857 6a2.25 2.25 0 002.227 1.932H19.05a2.25 2.25 0 002.227-1.932l.857-6a2.25 2.25 0 00-1.883-2.542m-16.5 0V6A2.25 2.25 0 016 3.75h3.879a1.5 1.5 0 011.06.44l2.122 2.12a1.5 1.5 0 001.06.44H18A2.25 2.25 0 0120.25 9v.776" />
|
| 31 |
+
</svg>
|
| 32 |
+
<span>Upload</span>
|
| 33 |
+
<input type="file" id="audioFile" accept="audio/*" hidden>
|
| 34 |
+
</label>
|
| 35 |
+
</div>
|
| 36 |
+
<div class="progress-bar">
|
| 37 |
+
<div class="progress-fill" id="progressFill"></div>
|
| 38 |
+
</div>
|
| 39 |
+
</div>
|
| 40 |
+
|
| 41 |
+
<!-- Status -->
|
| 42 |
+
<div class="status-section" id="statusSection">
|
| 43 |
+
<span class="status-dot" id="statusDot"></span>
|
| 44 |
+
<span id="statusText">Loading...</span>
|
| 45 |
+
</div>
|
| 46 |
+
|
| 47 |
+
<!-- Audio Player (hidden initially) -->
|
| 48 |
+
<div class="audio-player" id="audioPreview" style="display: none;">
|
| 49 |
+
<button class="play-btn" id="playBtn">
|
| 50 |
+
<svg class="play-icon" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="currentColor">
|
| 51 |
+
<path d="M8 5v14l11-7z"/>
|
| 52 |
+
</svg>
|
| 53 |
+
<svg class="pause-icon" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="currentColor" style="display: none;">
|
| 54 |
+
<path d="M6 19h4V5H6v14zm8-14v14h4V5h-4z"/>
|
| 55 |
+
</svg>
|
| 56 |
+
</button>
|
| 57 |
+
<div class="waveform-container">
|
| 58 |
+
<canvas id="waveformCanvas"></canvas>
|
| 59 |
+
<div class="waveform-progress" id="waveformProgress"></div>
|
| 60 |
+
</div>
|
| 61 |
+
<span class="audio-time" id="audioTime">0:00</span>
|
| 62 |
+
<button class="clear-btn" id="clearBtn" title="Clear">
|
| 63 |
+
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
|
| 64 |
+
<path stroke-linecap="round" stroke-linejoin="round" d="M6 18L18 6M6 6l12 12" />
|
| 65 |
+
</svg>
|
| 66 |
+
</button>
|
| 67 |
+
<audio id="audioPlayer" style="display: none;"></audio>
|
| 68 |
+
</div>
|
| 69 |
+
|
| 70 |
+
<!-- Transcribe Section -->
|
| 71 |
+
<div class="transcribe-section" id="transcribeSection" style="display: none;">
|
| 72 |
+
<div class="task-row">
|
| 73 |
+
<label for="promptSelect">Task:</label>
|
| 74 |
+
<select id="promptSelect">
|
| 75 |
+
<option value="transcribe">Transcribe</option>
|
| 76 |
+
<option value="translate_en">Translate to English</option>
|
| 77 |
+
<option value="translate_fr">Translate to French</option>
|
| 78 |
+
<option value="translate_de">Translate to German</option>
|
| 79 |
+
<option value="translate_es">Translate to Spanish</option>
|
| 80 |
+
<option value="translate_pt">Translate to Portuguese</option>
|
| 81 |
+
<option value="translate_ja">Translate to Japanese</option>
|
| 82 |
+
</select>
|
| 83 |
+
</div>
|
| 84 |
+
<label class="checkbox-row">
|
| 85 |
+
<input type="checkbox" id="punctuationCheckbox" checked>
|
| 86 |
+
<span>Add punctuation (English only)</span>
|
| 87 |
+
</label>
|
| 88 |
+
<button id="transcribeBtn" class="transcribe-btn" disabled>
|
| 89 |
+
Transcribe
|
| 90 |
+
</button>
|
| 91 |
+
</div>
|
| 92 |
+
|
| 93 |
+
<!-- Model Loading Progress -->
|
| 94 |
+
<div class="model-progress" id="progressSection" style="display: none;">
|
| 95 |
+
<span id="progressText">Loading model...</span>
|
| 96 |
+
</div>
|
| 97 |
+
|
| 98 |
+
<!-- Transcript Output -->
|
| 99 |
+
<div class="transcript-card" id="transcriptCard" style="display: none;">
|
| 100 |
+
<div class="transcript-header">
|
| 101 |
+
<span>Transcript</span>
|
| 102 |
+
<div class="transcript-actions">
|
| 103 |
+
<button id="copyBtn" class="icon-btn" title="Copy">
|
| 104 |
+
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
|
| 105 |
+
<path stroke-linecap="round" stroke-linejoin="round" d="M15.666 3.888A2.25 2.25 0 0013.5 2.25h-3c-1.03 0-1.9.693-2.166 1.638m7.332 0c.055.194.084.4.084.612v0a.75.75 0 01-.75.75H9.75a.75.75 0 01-.75-.75v0c0-.212.03-.418.084-.612m7.332 0c.646.049 1.288.11 1.927.184 1.1.128 1.907 1.077 1.907 2.185V19.5a2.25 2.25 0 01-2.25 2.25H6.75A2.25 2.25 0 014.5 19.5V6.257c0-1.108.806-2.057 1.907-2.185a48.208 48.208 0 011.927-.184" />
|
| 106 |
+
</svg>
|
| 107 |
+
</button>
|
| 108 |
+
<button id="downloadBtn" class="icon-btn" title="Download">
|
| 109 |
+
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
|
| 110 |
+
<path stroke-linecap="round" stroke-linejoin="round" d="M3 16.5v2.25A2.25 2.25 0 005.25 21h13.5A2.25 2.25 0 0021 18.75V16.5M16.5 12L12 16.5m0 0L7.5 12m4.5 4.5V3" />
|
| 111 |
+
</svg>
|
| 112 |
+
</button>
|
| 113 |
+
</div>
|
| 114 |
+
</div>
|
| 115 |
+
<div class="transcript-output" id="outputText"></div>
|
| 116 |
+
</div>
|
| 117 |
+
|
| 118 |
+
<!-- Footer -->
|
| 119 |
+
<div class="footer">
|
| 120 |
+
Made with
|
| 121 |
+
<a href="https://huggingface.co/ibm-granite/granite-4.0-1b-speech" target="_blank">Granite Speech 4.0 1B</a>
|
| 122 |
+
and
|
| 123 |
+
<a href="https://onnxruntime.ai/docs/tutorials/web/" target="_blank">ONNX Runtime Web</a>
|
| 124 |
+
<br>
|
| 125 |
+
<span class="privacy-note">Your audio and transcription never leave your device</span>
|
| 126 |
+
</div>
|
| 127 |
+
<div class="gpu-info" id="gpuInfo"></div>
|
| 128 |
+
</div>
|
| 129 |
+
|
| 130 |
+
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/ort.all.min.js"></script>
|
| 131 |
+
<script src="vad.js?v=1"></script>
|
| 132 |
+
<script src="punctuator.js?v=3"></script>
|
| 133 |
+
<script type="module" src="app.js?v=53"></script>
|
| 134 |
+
</body>
|
| 135 |
</html>
|
pcs_vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
punct_cap_seg_en.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd922d459da618cd324280889740608b76fb3e9e61d3f402291be1251f91421b
|
| 3 |
+
size 209532928
|
punctuator.js
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Punctuation and Capitalization using ONNX
|
| 3 |
+
* - English: Full punctuation + capitalization (1-800-BAD-CODE model)
|
| 4 |
+
* - Other languages (DE, FR, IT, NL, ES, PT): Punctuation only (oliverguhr multilingual model)
|
| 5 |
+
*/
|
| 6 |
+
|
| 7 |
+
// English model (punctuation + capitalization).
// All three are filled in lazily by loadEnglishPunctuator().
let pcsSession = null; // ONNX Runtime inference session
let pcsVocab = null; // piece -> id
let pcsVocabReverse = null; // id -> piece

// Multilingual model (punctuation only).
// Both are filled in lazily by loadMultilingualPunctuator().
let multilingualSession = null;
let multilingualTokenizer = null;

// Label tables and special-token ids for the English model.
// Frozen because these are shared lookup tables: an accidental write from
// another script would silently corrupt every later decode.
const PCS_CONFIG = Object.freeze({
  // Index = value predicted in `pre_preds`.
  preLabels: Object.freeze(["<NULL>", "¿"]),
  // Index = value predicted in `post_preds`.
  postLabels: Object.freeze(["<NULL>", "<ACRONYM>", ".", ",", "?"]),
  unkId: 0,
  bosId: 1,
  eosId: 2,
  padId: 3,
});

// Multilingual model label mapping (argmax index -> punctuation to append).
const MULTILINGUAL_LABELS = Object.freeze({
  0: "", // No punctuation
  1: ".", // Period
  2: ",", // Comma
  3: "?", // Question mark
  4: "-", // Hyphen
  5: ":", // Colon
});

// Languages routed to the multilingual model. Also exported on `window`,
// hence frozen so consumers cannot change routing by mutating the array.
const MULTILINGUAL_LANGS = Object.freeze(['de', 'fr', 'it', 'nl', 'es', 'pt']);
|
| 37 |
+
|
| 38 |
+
// In-flight load shared by concurrent callers so the vocab and model are
// fetched only once even when a preload and a first use overlap.
let pcsLoadPromise = null;

/**
 * Load the English punctuator: the piece vocabulary (`./pcs_vocab.json`) and
 * the ONNX model (`./punct_cap_seg_en.onnx`).
 *
 * Idempotent: returns immediately once `pcsSession` is set, and concurrent
 * calls await the same in-flight load. Module state (`pcsVocab`,
 * `pcsVocabReverse`, `pcsSession`) is only published after every step has
 * succeeded, so a failed load leaves nothing half-initialised and a later
 * call simply retries.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the vocab request fails, the vocab file has no `vocab`
 *   object, or the ONNX session cannot be created.
 */
async function loadEnglishPunctuator() {
  if (pcsSession) return;

  if (!pcsLoadPromise) {
    pcsLoadPromise = (async () => {
      console.log('Loading English punctuator model...');

      // Load vocab (piece -> id).
      const vocabResponse = await fetch('./pcs_vocab.json');
      if (!vocabResponse.ok) {
        throw new Error(`Failed to load ./pcs_vocab.json: HTTP ${vocabResponse.status}`);
      }
      const vocabData = await vocabResponse.json();
      const vocab = vocabData?.vocab;
      if (vocab === null || typeof vocab !== 'object') {
        throw new Error('Invalid ./pcs_vocab.json: missing "vocab" object');
      }

      // Load ONNX model.
      const session = await ort.InferenceSession.create('./punct_cap_seg_en.onnx', {
        executionProviders: ['wasm'],
      });

      // Publish everything together; `pcsSession` doubles as the "loaded" flag.
      pcsVocab = vocab;
      pcsVocabReverse = Object.fromEntries(
        Object.entries(vocab).map(([piece, id]) => [id, piece]),
      );
      pcsSession = session;

      console.log('English punctuator model loaded');
    })().finally(() => {
      // Clear the slot so a failed load can be retried by the next caller.
      pcsLoadPromise = null;
    });
  }

  await pcsLoadPromise;
}
|
| 62 |
+
|
| 63 |
+
// In-flight load shared by concurrent callers so the tokenizer and model are
// fetched only once.
let multilingualLoadPromise = null;

/**
 * Load the multilingual punctuator: the tokenizer (via transformers.js from
 * the jsDelivr CDN) and the ONNX model (`./punct_multilingual_q8.onnx`).
 *
 * Idempotent: returns immediately once `multilingualSession` is set, and
 * concurrent calls await the same in-flight load. The tokenizer and session
 * are independent, so they are loaded in parallel and only published once
 * both are ready; a failed load leaves module state untouched and can be
 * retried.
 *
 * NOTE(review): `punct_multilingual_q8.onnx` does not appear in the file list
 * of the upload this script ships with — confirm the model is deployed next
 * to this file, otherwise every call rejects here.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the tokenizer or the ONNX session cannot be loaded.
 */
async function loadMultilingualPunctuator() {
  if (multilingualSession) return;

  if (!multilingualLoadPromise) {
    multilingualLoadPromise = (async () => {
      console.log('Loading multilingual punctuator model...');

      // Load tokenizer from transformers.js.
      const tokenizerReady = (async () => {
        const { AutoTokenizer } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.4.2');
        return AutoTokenizer.from_pretrained('oliverguhr/fullstop-punctuation-multilingual-base');
      })();

      // Load ONNX model.
      const sessionReady = ort.InferenceSession.create('./punct_multilingual_q8.onnx', {
        executionProviders: ['wasm'],
      });

      const [tokenizer, session] = await Promise.all([tokenizerReady, sessionReady]);

      // Publish together; `multilingualSession` doubles as the "loaded" flag.
      multilingualTokenizer = tokenizer;
      multilingualSession = session;

      console.log('Multilingual punctuator model loaded');
    })().finally(() => {
      // Clear the slot so a failed load can be retried by the next caller.
      multilingualLoadPromise = null;
    });
  }

  await multilingualLoadPromise;
}
|
| 80 |
+
|
| 81 |
+
/**
 * Tokenize text for the English model with a greedy longest-match lookup
 * against `pcsVocab`.
 *
 * This is an approximation of SentencePiece tokenization (no scoring, just
 * the longest known piece at each position). The text is lower-cased,
 * surrounding whitespace is dropped, every run of whitespace becomes a single
 * word-boundary marker `▁`, and a leading `▁` is prepended for the first
 * word. Anything that matches no piece becomes one UNK per character.
 *
 * @param {string} text - Raw, unpunctuated text.
 * @param {number} [maxPieceLength=20] - Longest piece (in UTF-16 units) to try.
 * @returns {number[]} Token ids: BOS, the piece ids, EOS.
 */
function tokenizeEnglish(text, maxPieceLength = 20) {
  const normalized = text.toLowerCase().trim().replace(/\s+/g, '▁');
  const tokens = [PCS_CONFIG.bosId];

  // Prepend ▁ for first word
  let remaining = `▁${normalized}`;

  while (remaining.length > 0) {
    let consumed = 0;

    // Try longest match first
    for (let len = Math.min(remaining.length, maxPieceLength); len > 0; len--) {
      const piece = remaining.substring(0, len);
      // Own-property check: a plain `pcsVocab[piece] !== undefined` also
      // matches inherited keys such as "constructor", which would push a
      // function instead of a token id.
      if (Object.hasOwn(pcsVocab, piece)) {
        tokens.push(pcsVocab[piece]);
        consumed = len;
        break;
      }
    }

    if (consumed === 0) {
      // Unknown character: emit UNK and skip it. Skip both halves of a
      // surrogate pair so one astral character yields one UNK, not two.
      tokens.push(PCS_CONFIG.unkId);
      consumed = remaining.codePointAt(0) > 0xffff ? 2 : 1;
    }

    remaining = remaining.substring(consumed);
  }

  tokens.push(PCS_CONFIG.eosId);
  return tokens;
}
|
| 117 |
+
|
| 118 |
+
/**
 * Apply punctuation, capitalization and sentence segmentation to English text.
 *
 * Runs the English ONNX model over the tokenized text and rebuilds the string
 * piece by piece from four outputs, all indexed by token position (BOS
 * included):
 *   - `pre_preds`:  label placed before the first character of a piece
 *   - `post_preds`: 1 = acronym (period after every character),
 *                   >1 = index into `PCS_CONFIG.postLabels`, appended after
 *                   the last character of the piece
 *   - `cap_preds`:  per-character upper-case flags, laid out as
 *                   [batch, seq, width] with width 16
 *   - `seg_preds`:  truthy when a sentence ends after the piece
 *
 * @param {string} text - Raw, unpunctuated text.
 * @returns {Promise<string>} The rebuilt text, sentences joined by one space.
 */
async function applyEnglishPunctuation(text) {
  await loadEnglishPunctuator();

  // Tokenize
  const tokenIds = tokenizeEnglish(text);

  // Run inference
  const inputTensor = new ort.Tensor(
    'int64',
    BigInt64Array.from(tokenIds, (id) => BigInt(id)),
    [1, tokenIds.length],
  );
  const outputs = await pcsSession.run({ input_ids: inputTensor });

  const prePreds = outputs.pre_preds.data;
  const postPreds = outputs.post_preds.data;
  const capPreds = outputs.cap_preds.data;
  const segPreds = outputs.seg_preds.data;

  // Row width of cap_preds; read from the tensor when available so the
  // offset arithmetic follows the model instead of a hard-coded 16.
  const capWidth = outputs.cap_preds.dims?.[2] ?? 16;

  const ACRONYM_LABEL = 1;
  const sentences = [];
  let currentSentence = [];

  // Decode: skip BOS (index 0) and EOS (last index).
  for (let outputIdx = 1; outputIdx < tokenIds.length - 1; outputIdx++) {
    const token = pcsVocabReverse[tokenIds[outputIdx]] || '';
    const startsWord = token.startsWith('▁');

    // Handle word boundary
    if (startsWord && currentSentence.length > 0) {
      currentSentence.push(' ');
    }

    // int64 outputs may arrive as BigInt; `1n === 1` is false, so normalise
    // to Number before comparing, otherwise these labels never match.
    const preLabel = Number(prePreds[outputIdx]);
    const postLabel = Number(postPreds[outputIdx]);

    // Process each character in token (the ▁ marker itself is not emitted).
    const charStart = startsWord ? 1 : 0;
    for (let j = charStart; j < token.length; j++) {
      let char = token[j];

      // Pre-punctuation (e.g., inverted question mark)
      if (j === charStart && preLabel === 1) {
        currentSentence.push(PCS_CONFIG.preLabels[1]);
      }

      // Capitalization. Characters beyond the row width have no flag of their
      // own; without the bound the offset would read the next token's row.
      if (j < capWidth && capPreds[outputIdx * capWidth + j]) {
        char = char.toUpperCase();
      }

      currentSentence.push(char);

      // Post-punctuation
      if (postLabel === ACRONYM_LABEL) {
        currentSentence.push('.');
      } else if (j === token.length - 1 && postLabel > ACRONYM_LABEL) {
        currentSentence.push(PCS_CONFIG.postLabels[postLabel]);
      }
    }

    // Sentence boundary
    if (segPreds[outputIdx]) {
      sentences.push(currentSentence.join(''));
      currentSentence = [];
    }
  }

  if (currentSentence.length > 0) {
    sentences.push(currentSentence.join(''));
  }

  return sentences.join(' ');
}
|
| 189 |
+
|
| 190 |
+
/**
 * Apply punctuation (no capitalization) using the multilingual model.
 *
 * The text is tokenized with the transformers.js tokenizer, the ONNX model
 * scores every token against the punctuation labels, and the text is rebuilt
 * from the sub-word pieces. A word's punctuation is taken from its LAST
 * piece only, so a mark can never be inserted in the middle of a word.
 *
 * NOTE(review): the tokenizer truncates at 512 tokens, so text beyond that
 * limit is absent from the returned string — confirm callers only pass short
 * segments, or chunk the input before calling.
 *
 * @param {string} text - Raw, unpunctuated text.
 * @returns {Promise<string>} The text with punctuation marks inserted.
 */
async function applyMultilingualPunctuation(text) {
  await loadMultilingualPunctuator();

  // Tokenize using transformers.js tokenizer. The option is `return_tensor`
  // (singular); the misspelt plural is ignored by the tokenizer and would
  // yield Tensor objects instead of plain id arrays.
  const encoded = await multilingualTokenizer(text, {
    return_tensor: false,
    padding: false,
    truncation: true,
    max_length: 512,
  });

  // Accept plain arrays (possibly nested per batch) as well as tensor-like
  // objects exposing a typed `data` array, and normalise to number[].
  const toIdArray = (value) => (
    Array.isArray(value)
      ? value.flat(Infinity).map(Number)
      : Array.from(value.data ?? value, Number)
  );
  const inputIds = toIdArray(encoded.input_ids);
  const attentionMask = toIdArray(encoded.attention_mask);

  // Run inference
  const toInt64Tensor = (values) => new ort.Tensor(
    'int64',
    BigInt64Array.from(values, (v) => BigInt(v)),
    [1, values.length],
  );
  const outputs = await multilingualSession.run({
    input_ids: toInt64Tensor(inputIds),
    attention_mask: toInt64Tensor(attentionMask),
  });

  const { data: logits, dims: logitDims } = outputs.logits;
  // Labels per token: last dimension of the logits, falling back to the
  // six entries of MULTILINGUAL_LABELS.
  const numLabels = logitDims?.[logitDims.length - 1] ?? 6;

  // Get predictions (argmax over logits)
  const predictions = inputIds.map((_, tokenIdx) => {
    const base = tokenIdx * numLabels;
    let maxIdx = 0;
    for (let label = 1; label < numLabels; label++) {
      if (logits[base + label] > logits[base + maxIdx]) {
        maxIdx = label;
      }
    }
    return maxIdx;
  });

  // Decode tokens back to text with punctuation
  const tokens = multilingualTokenizer.model.convert_ids_to_tokens(inputIds);
  const specialTokens = new Set(['<s>', '</s>', '<pad>']);
  const result = [];

  for (let i = 0; i < tokens.length; i++) {
    const token = tokens[i];

    // Skip special tokens
    if (specialTokens.has(token)) {
      continue;
    }

    // Handle subword tokens (▁ prefix indicates start of new word)
    if (token.startsWith('▁')) {
      if (result.length > 0) {
        result.push(' ');
      }
      result.push(token.substring(1));
    } else {
      result.push(token);
    }

    // The word ends here when nothing follows, or the next token is special
    // or opens a new word. Only then is this piece's label emitted.
    const next = tokens[i + 1];
    const endsWord = next === undefined || specialTokens.has(next) || next.startsWith('▁');
    if (endsWord) {
      const punct = MULTILINGUAL_LABELS[predictions[i]];
      if (punct) {
        result.push(punct);
      }
    }
  }

  return result.join('');
}
|
| 262 |
+
|
| 263 |
+
/**
 * Main entry point: punctuate `text` with the model that matches `lang`.
 *
 * Languages listed in `MULTILINGUAL_LANGS` go to the multilingual
 * (punctuation-only) model; everything else, including a missing language,
 * goes to the English model. The language tag is matched case-insensitively
 * on its primary subtag, so `"DE"`, `"de-DE"` and `"pt_BR"` route the same
 * way as `"de"` / `"pt"`.
 *
 * Best-effort: if the chosen model fails to load or run, a warning is logged
 * and the original text is returned unchanged.
 *
 * @param {string} text - Raw transcript text.
 * @param {?string} [lang=null] - Language code of the text.
 * @returns {Promise<string>} Punctuated text, or `text` itself when it is
 *   empty/blank or when punctuation fails.
 */
async function applyPunctuation(text, lang = null) {
  if (!text || text.trim().length === 0) return text;

  const primaryLang = typeof lang === 'string'
    ? lang.trim().toLowerCase().split(/[-_]/)[0]
    : null;
  const useMultilingual = primaryLang !== null && MULTILINGUAL_LANGS.includes(primaryLang);

  try {
    return useMultilingual
      ? await applyMultilingualPunctuation(text)
      : await applyEnglishPunctuation(text);
  } catch (error) {
    console.warn(
      useMultilingual
        ? 'Multilingual punctuation failed, returning original:'
        : 'English punctuation failed, returning original:',
      error,
    );
    return text;
  }
}
|
| 285 |
+
|
| 286 |
+
/**
 * Warm-up hook called during app init: loads the English model ahead of the
 * first transcription. The multilingual model is not preloaded here.
 *
 * @returns {Promise<void>} Settles when the English model has loaded.
 */
async function loadPunctuator() {
  const englishReady = loadEnglishPunctuator();
  await englishReady;
}
|
| 290 |
+
|
| 291 |
+
// Public surface for app.js: this file is loaded as a classic script, so the
// entry points are published as properties of `window`.
Object.assign(window, {
  applyPunctuation,
  loadPunctuator,
  MULTILINGUAL_PUNCT_LANGS: MULTILINGUAL_LANGS,
});
|
silero_vad.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
|
| 3 |
+
size 2243022
|
style.css
CHANGED
|
@@ -1,28 +1,638 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
body {
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
}
|
| 5 |
|
|
|
|
| 6 |
h1 {
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
}
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
margin-bottom: 10px;
|
| 15 |
-
margin-top: 5px;
|
| 16 |
}
|
| 17 |
|
| 18 |
-
.
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
}
|
| 25 |
|
| 26 |
-
.
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
}
|
|
|
|
| 1 |
+
/* Granite Speech WebGPU - Whisper Web Style */
|
| 2 |
+
|
| 3 |
+
* {
|
| 4 |
+
box-sizing: border-box;
|
| 5 |
+
margin: 0;
|
| 6 |
+
padding: 0;
|
| 7 |
+
}
|
| 8 |
+
|
| 9 |
body {
|
| 10 |
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
| 11 |
+
background: #f8fafc;
|
| 12 |
+
color: #0f172a;
|
| 13 |
+
min-height: 100vh;
|
| 14 |
+
display: flex;
|
| 15 |
+
justify-content: center;
|
| 16 |
+
align-items: center;
|
| 17 |
+
padding: 2rem;
|
| 18 |
+
line-height: 1.5;
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
.container {
|
| 22 |
+
width: 100%;
|
| 23 |
+
max-width: 540px;
|
| 24 |
+
display: flex;
|
| 25 |
+
flex-direction: column;
|
| 26 |
+
align-items: center;
|
| 27 |
+
gap: 1.5rem;
|
| 28 |
}
|
| 29 |
|
| 30 |
+
/* Header */
|
| 31 |
h1 {
|
| 32 |
+
font-size: 3rem;
|
| 33 |
+
font-weight: 800;
|
| 34 |
+
letter-spacing: -0.025em;
|
| 35 |
+
color: #0f172a;
|
| 36 |
+
text-align: center;
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
h2 {
|
| 40 |
+
font-size: 1.25rem;
|
| 41 |
+
font-weight: 600;
|
| 42 |
+
letter-spacing: -0.015em;
|
| 43 |
+
color: #0f172a;
|
| 44 |
+
text-align: center;
|
| 45 |
+
padding: 0 1rem;
|
| 46 |
+
white-space: nowrap;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
.privacy-note {
|
| 50 |
+
font-size: 0.8125rem;
|
| 51 |
+
color: #64748b;
|
| 52 |
+
text-align: center;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
/* Input Card */
|
| 56 |
+
.input-card {
|
| 57 |
+
width: 100%;
|
| 58 |
+
background: white;
|
| 59 |
+
border-radius: 0.5rem;
|
| 60 |
+
box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.05);
|
| 61 |
+
border: 1px solid rgba(51, 65, 85, 0.1);
|
| 62 |
+
overflow: hidden;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
.input-options {
|
| 66 |
+
display: flex;
|
| 67 |
+
align-items: stretch;
|
| 68 |
+
padding: 0.5rem;
|
| 69 |
+
gap: 0.5rem;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
.input-tile {
|
| 73 |
+
display: flex;
|
| 74 |
+
align-items: center;
|
| 75 |
+
justify-content: center;
|
| 76 |
+
gap: 0.5rem;
|
| 77 |
+
padding: 0.5rem 0.75rem;
|
| 78 |
+
background: transparent;
|
| 79 |
+
border: none;
|
| 80 |
+
border-radius: 0.5rem;
|
| 81 |
+
color: #64748b;
|
| 82 |
+
font-size: 0.9375rem;
|
| 83 |
+
cursor: pointer;
|
| 84 |
+
transition: all 0.2s;
|
| 85 |
+
flex: 1;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
.input-tile:hover:not(:disabled) {
|
| 89 |
+
color: #4f46e5;
|
| 90 |
+
background: #eef2ff;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
.input-tile.recording {
|
| 94 |
+
color: #ef4444;
|
| 95 |
+
background: #fef2f2;
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
.input-tile.recording:hover {
|
| 99 |
+
color: #dc2626;
|
| 100 |
+
background: #fee2e2;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
.input-tile:disabled {
|
| 104 |
+
opacity: 0.5;
|
| 105 |
+
cursor: not-allowed;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
.input-tile svg {
|
| 109 |
+
width: 1.75rem;
|
| 110 |
+
height: 1.75rem;
|
| 111 |
+
flex-shrink: 0;
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
.file-label {
|
| 115 |
+
cursor: pointer;
|
| 116 |
+
transition: all 0.2s;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
.file-label.drag-over {
|
| 120 |
+
color: #2563eb;
|
| 121 |
+
background: #eff6ff;
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
.input-card.drag-over {
|
| 125 |
+
border: 2px dashed #2563eb;
|
| 126 |
+
background: #eff6ff;
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
.divider {
|
| 130 |
+
width: 1px;
|
| 131 |
+
background: #e2e8f0;
|
| 132 |
+
margin: 0.25rem 0;
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
/* Progress Bar */
|
| 136 |
+
.progress-bar {
|
| 137 |
+
width: 100%;
|
| 138 |
+
height: 4px;
|
| 139 |
+
background: #e5e7eb;
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
.progress-fill {
|
| 143 |
+
height: 100%;
|
| 144 |
+
background: #2563eb;
|
| 145 |
+
width: 0%;
|
| 146 |
+
transition: width 0.1s;
|
| 147 |
+
border-radius: 0 2px 2px 0;
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
/* Status Section */
|
| 151 |
+
.status-section {
|
| 152 |
+
display: flex;
|
| 153 |
+
align-items: center;
|
| 154 |
+
gap: 0.5rem;
|
| 155 |
+
font-size: 0.875rem;
|
| 156 |
+
color: #64748b;
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
.status-dot {
|
| 160 |
+
width: 8px;
|
| 161 |
+
height: 8px;
|
| 162 |
+
border-radius: 50%;
|
| 163 |
+
background: #94a3b8;
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
.status-dot.ready {
|
| 167 |
+
background: #22c55e;
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
/* Busy states: amber dot with a slow pulse. */
.status-dot.loading,
.status-dot.processing {
    background: #f59e0b;
    animation: pulse 1.5s infinite;
}

/* Recording: red dot with a faster pulse. */
.status-dot.recording {
    background: #ef4444;
    animation: pulse 0.8s infinite;
}

.status-dot.error {
    background: #ef4444;
}

@keyframes pulse {
    0%, 100% { opacity: 1; }
    50% { opacity: 0.5; }
}

/* Honour the OS "reduce motion" setting: keep the state colour, drop the
   endless blinking. */
@media (prefers-reduced-motion: reduce) {
    .status-dot.loading,
    .status-dot.processing,
    .status-dot.recording {
        animation: none;
    }
}
|
| 189 |
+
|
| 190 |
+
/* Audio Player */
|
| 191 |
+
.audio-player {
|
| 192 |
+
width: 100%;
|
| 193 |
+
display: flex;
|
| 194 |
+
align-items: center;
|
| 195 |
+
gap: 0.5rem;
|
| 196 |
+
padding: 0.25rem 0.5rem;
|
| 197 |
+
background: white;
|
| 198 |
+
border-radius: 0.5rem;
|
| 199 |
+
border: 1px solid rgba(51, 65, 85, 0.1);
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
.play-btn {
|
| 203 |
+
width: 24px;
|
| 204 |
+
height: 24px;
|
| 205 |
+
border-radius: 50%;
|
| 206 |
+
border: none;
|
| 207 |
+
background: #2563eb;
|
| 208 |
+
color: white;
|
| 209 |
+
cursor: pointer;
|
| 210 |
+
display: flex;
|
| 211 |
+
align-items: center;
|
| 212 |
+
justify-content: center;
|
| 213 |
+
flex-shrink: 0;
|
| 214 |
+
transition: background 0.2s;
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
.play-btn:hover {
|
| 218 |
+
background: #1d4ed8;
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
.play-btn svg {
|
| 222 |
+
width: 14px;
|
| 223 |
+
height: 14px;
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
.waveform-container {
|
| 227 |
+
flex: 1;
|
| 228 |
+
height: 24px;
|
| 229 |
+
position: relative;
|
| 230 |
+
cursor: pointer;
|
| 231 |
+
border-radius: 4px;
|
| 232 |
+
overflow: hidden;
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
#waveformCanvas {
|
| 236 |
+
width: 100%;
|
| 237 |
+
height: 100%;
|
| 238 |
+
display: block;
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
.waveform-progress {
|
| 242 |
+
position: absolute;
|
| 243 |
+
top: 0;
|
| 244 |
+
left: 0;
|
| 245 |
+
height: 100%;
|
| 246 |
+
width: 0%;
|
| 247 |
+
background: rgba(37, 99, 235, 0.3);
|
| 248 |
+
pointer-events: none;
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
.audio-time {
|
| 252 |
+
font-size: 0.8125rem;
|
| 253 |
+
color: #64748b;
|
| 254 |
+
font-variant-numeric: tabular-nums;
|
| 255 |
+
min-width: 3rem;
|
| 256 |
+
text-align: right;
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
.clear-btn {
|
| 260 |
+
width: 20px;
|
| 261 |
+
height: 20px;
|
| 262 |
+
border-radius: 50%;
|
| 263 |
+
border: none;
|
| 264 |
+
background: transparent;
|
| 265 |
+
color: #94a3b8;
|
| 266 |
+
cursor: pointer;
|
| 267 |
+
display: flex;
|
| 268 |
+
align-items: center;
|
| 269 |
+
justify-content: center;
|
| 270 |
+
flex-shrink: 0;
|
| 271 |
+
padding: 0;
|
| 272 |
+
transition: all 0.2s;
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
.clear-btn:hover {
|
| 276 |
+
color: #ef4444;
|
| 277 |
+
background: #fef2f2;
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
.clear-btn svg {
|
| 281 |
+
width: 14px;
|
| 282 |
+
height: 14px;
|
| 283 |
+
}
|
| 284 |
+
|
| 285 |
+
/* Transcribe Section */
|
| 286 |
+
.transcribe-section {
|
| 287 |
+
width: 100%;
|
| 288 |
+
display: flex;
|
| 289 |
+
flex-direction: column;
|
| 290 |
+
align-items: center;
|
| 291 |
+
gap: 1rem;
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
.task-row {
|
| 295 |
+
display: flex;
|
| 296 |
+
align-items: center;
|
| 297 |
+
gap: 0.75rem;
|
| 298 |
+
font-size: 0.875rem;
|
| 299 |
+
color: #64748b;
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
.task-row select {
|
| 303 |
+
padding: 0.375rem 0.75rem;
|
| 304 |
+
font-size: 0.875rem;
|
| 305 |
+
background: white;
|
| 306 |
+
color: #0f172a;
|
| 307 |
+
border: 1px solid #d1d5db;
|
| 308 |
+
border-radius: 0.5rem;
|
| 309 |
+
cursor: pointer;
|
| 310 |
+
outline: none;
|
| 311 |
}
|
| 312 |
|
| 313 |
+
.task-row select:focus {
|
| 314 |
+
border-color: #2563eb;
|
| 315 |
+
box-shadow: 0 0 0 2px rgba(37, 99, 235, 0.2);
|
|
|
|
|
|
|
| 316 |
}
|
| 317 |
|
| 318 |
+
.transcribe-btn {
|
| 319 |
+
padding: 0.625rem 2.5rem;
|
| 320 |
+
font-size: 0.9375rem;
|
| 321 |
+
font-weight: 500;
|
| 322 |
+
background: #2563eb;
|
| 323 |
+
color: white;
|
| 324 |
+
border: none;
|
| 325 |
+
border-radius: 0.5rem;
|
| 326 |
+
cursor: pointer;
|
| 327 |
+
transition: background 0.2s;
|
| 328 |
}
|
| 329 |
|
| 330 |
+
.transcribe-btn:hover:not(:disabled) {
|
| 331 |
+
background: #1d4ed8;
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
.transcribe-btn:disabled {
|
| 335 |
+
opacity: 0.5;
|
| 336 |
+
cursor: not-allowed;
|
| 337 |
+
}
|
| 338 |
+
|
| 339 |
+
.checkbox-row {
|
| 340 |
+
display: flex;
|
| 341 |
+
align-items: center;
|
| 342 |
+
gap: 0.5rem;
|
| 343 |
+
font-size: 0.875rem;
|
| 344 |
+
color: #64748b;
|
| 345 |
+
cursor: pointer;
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
.checkbox-row input[type="checkbox"] {
|
| 349 |
+
width: 1rem;
|
| 350 |
+
height: 1rem;
|
| 351 |
+
cursor: pointer;
|
| 352 |
+
accent-color: #2563eb;
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
/* Model Progress */
|
| 356 |
+
.model-progress {
|
| 357 |
+
font-size: 0.875rem;
|
| 358 |
+
color: #64748b;
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
/* Transcript Card */
|
| 362 |
+
.transcript-card {
|
| 363 |
+
width: 100%;
|
| 364 |
+
background: white;
|
| 365 |
+
border-radius: 0.5rem;
|
| 366 |
+
box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.05);
|
| 367 |
+
border: 1px solid rgba(51, 65, 85, 0.1);
|
| 368 |
+
overflow: hidden;
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
.transcript-header {
|
| 372 |
+
display: flex;
|
| 373 |
+
justify-content: space-between;
|
| 374 |
+
align-items: center;
|
| 375 |
+
padding: 0.75rem 1rem;
|
| 376 |
+
border-bottom: 1px solid #e2e8f0;
|
| 377 |
+
font-size: 0.875rem;
|
| 378 |
+
font-weight: 500;
|
| 379 |
+
color: #64748b;
|
| 380 |
+
}
|
| 381 |
+
|
| 382 |
+
.transcript-actions {
|
| 383 |
+
display: flex;
|
| 384 |
+
gap: 0.25rem;
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
.icon-btn {
|
| 388 |
+
display: flex;
|
| 389 |
+
align-items: center;
|
| 390 |
+
justify-content: center;
|
| 391 |
+
width: 32px;
|
| 392 |
+
height: 32px;
|
| 393 |
+
background: transparent;
|
| 394 |
+
border: none;
|
| 395 |
+
border-radius: 0.375rem;
|
| 396 |
+
color: #64748b;
|
| 397 |
+
cursor: pointer;
|
| 398 |
+
transition: all 0.2s;
|
| 399 |
+
}
|
| 400 |
+
|
| 401 |
+
.icon-btn:hover {
|
| 402 |
+
background: #f1f5f9;
|
| 403 |
+
color: #0f172a;
|
| 404 |
+
}
|
| 405 |
+
|
| 406 |
+
.icon-btn svg {
|
| 407 |
+
width: 18px;
|
| 408 |
+
height: 18px;
|
| 409 |
+
}
|
| 410 |
+
|
| 411 |
+
/* Scrollable transcript body; capped at 300px so long transcripts scroll
   inside the card instead of stretching the page. */
.transcript-output {
    padding: 1rem;
    min-height: 100px;
    max-height: 300px;
    overflow-y: auto;
    font-size: 0.9375rem;
    line-height: 1.7;
    color: #0f172a;
    /* `overflow-wrap` is the standard name; `word-wrap` is its legacy alias. */
    overflow-wrap: break-word;
}
|
| 421 |
+
|
| 422 |
+
.transcript-row {
|
| 423 |
+
display: flex;
|
| 424 |
+
gap: 0.75rem;
|
| 425 |
+
padding: 0.25rem 0;
|
| 426 |
+
}
|
| 427 |
+
|
| 428 |
+
.transcript-row .timestamp {
|
| 429 |
+
font-size: 0.9375rem;
|
| 430 |
+
color: #94a3b8;
|
| 431 |
+
flex-shrink: 0;
|
| 432 |
+
min-width: 3rem;
|
| 433 |
+
text-align: right;
|
| 434 |
+
font-variant-numeric: tabular-nums;
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
.transcript-row .transcript-text {
|
| 438 |
+
flex: 1;
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
/* Footer */
|
| 442 |
+
.footer {
|
| 443 |
+
text-align: center;
|
| 444 |
+
font-size: 0.875rem;
|
| 445 |
+
color: #64748b;
|
| 446 |
+
}
|
| 447 |
+
|
| 448 |
+
.footer a {
|
| 449 |
+
color: #2563eb;
|
| 450 |
+
text-decoration: none;
|
| 451 |
+
}
|
| 452 |
+
|
| 453 |
+
.footer a:hover {
|
| 454 |
+
text-decoration: underline;
|
| 455 |
+
}
|
| 456 |
+
|
| 457 |
+
.gpu-info {
|
| 458 |
+
font-size: 0.75rem;
|
| 459 |
+
font-family: 'SF Mono', Monaco, 'Courier New', monospace;
|
| 460 |
+
color: #94a3b8;
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
/* Responsive */
|
| 464 |
+
@media (max-width: 640px) {
|
| 465 |
+
body {
|
| 466 |
+
padding: 1rem;
|
| 467 |
+
}
|
| 468 |
+
|
| 469 |
+
h1 {
|
| 470 |
+
font-size: 2.25rem;
|
| 471 |
+
}
|
| 472 |
+
|
| 473 |
+
h2 {
|
| 474 |
+
font-size: 1rem;
|
| 475 |
+
}
|
| 476 |
+
|
| 477 |
+
.input-tile {
|
| 478 |
+
padding: 0.5rem;
|
| 479 |
+
font-size: 0.8125rem;
|
| 480 |
+
}
|
| 481 |
+
|
| 482 |
+
.input-tile svg {
|
| 483 |
+
width: 1.5rem;
|
| 484 |
+
height: 1.5rem;
|
| 485 |
+
}
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
/* Scrollbar */
|
| 489 |
+
.transcript-output::-webkit-scrollbar {
|
| 490 |
+
width: 6px;
|
| 491 |
+
}
|
| 492 |
+
|
| 493 |
+
.transcript-output::-webkit-scrollbar-track {
|
| 494 |
+
background: #f1f5f9;
|
| 495 |
+
}
|
| 496 |
+
|
| 497 |
+
.transcript-output::-webkit-scrollbar-thumb {
|
| 498 |
+
background: #cbd5e1;
|
| 499 |
+
border-radius: 3px;
|
| 500 |
+
}
|
| 501 |
+
|
| 502 |
+
.transcript-output::-webkit-scrollbar-thumb:hover {
|
| 503 |
+
background: #94a3b8;
|
| 504 |
+
}
|
| 505 |
+
|
| 506 |
+
/* Dark Mode */
|
| 507 |
+
@media (prefers-color-scheme: dark) {
|
| 508 |
+
body {
|
| 509 |
+
background: #0f172a;
|
| 510 |
+
color: #e2e8f0;
|
| 511 |
+
}
|
| 512 |
+
|
| 513 |
+
h1, h2 {
|
| 514 |
+
color: #f1f5f9;
|
| 515 |
+
}
|
| 516 |
+
|
| 517 |
+
.input-card,
|
| 518 |
+
.transcript-card,
|
| 519 |
+
.audio-player {
|
| 520 |
+
background: #1e293b;
|
| 521 |
+
border-color: rgba(148, 163, 184, 0.1);
|
| 522 |
+
box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.3);
|
| 523 |
+
}
|
| 524 |
+
|
| 525 |
+
.audio-time {
|
| 526 |
+
color: #94a3b8;
|
| 527 |
+
}
|
| 528 |
+
|
| 529 |
+
.input-card.drag-over {
|
| 530 |
+
background: #1e3a5f;
|
| 531 |
+
border-color: #3b82f6;
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
+
.input-tile {
|
| 535 |
+
color: #94a3b8;
|
| 536 |
+
}
|
| 537 |
+
|
| 538 |
+
.input-tile:hover:not(:disabled) {
|
| 539 |
+
color: #818cf8;
|
| 540 |
+
background: #312e81;
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
.input-tile.recording {
|
| 544 |
+
color: #f87171;
|
| 545 |
+
background: #450a0a;
|
| 546 |
+
}
|
| 547 |
+
|
| 548 |
+
.input-tile.recording:hover {
|
| 549 |
+
color: #fca5a5;
|
| 550 |
+
background: #7f1d1d;
|
| 551 |
+
}
|
| 552 |
+
|
| 553 |
+
.divider {
|
| 554 |
+
background: #334155;
|
| 555 |
+
}
|
| 556 |
+
|
| 557 |
+
.progress-bar {
|
| 558 |
+
background: #334155;
|
| 559 |
+
}
|
| 560 |
+
|
| 561 |
+
.status-section,
|
| 562 |
+
.task-row,
|
| 563 |
+
.checkbox-row,
|
| 564 |
+
.model-progress {
|
| 565 |
+
color: #94a3b8;
|
| 566 |
+
}
|
| 567 |
+
|
| 568 |
+
.task-row select {
|
| 569 |
+
background: #1e293b;
|
| 570 |
+
color: #e2e8f0;
|
| 571 |
+
border-color: #475569;
|
| 572 |
+
}
|
| 573 |
+
|
| 574 |
+
.task-row select:focus {
|
| 575 |
+
border-color: #3b82f6;
|
| 576 |
+
box-shadow: 0 0 0 2px rgba(59, 130, 246, 0.3);
|
| 577 |
+
}
|
| 578 |
+
|
| 579 |
+
.transcript-header {
|
| 580 |
+
border-color: #334155;
|
| 581 |
+
color: #94a3b8;
|
| 582 |
+
}
|
| 583 |
+
|
| 584 |
+
.transcript-output {
|
| 585 |
+
color: #e2e8f0;
|
| 586 |
+
}
|
| 587 |
+
|
| 588 |
+
.transcript-row .timestamp {
|
| 589 |
+
color: #64748b;
|
| 590 |
+
}
|
| 591 |
+
|
| 592 |
+
.icon-btn {
|
| 593 |
+
color: #94a3b8;
|
| 594 |
+
}
|
| 595 |
+
|
| 596 |
+
.icon-btn:hover {
|
| 597 |
+
background: #334155;
|
| 598 |
+
color: #f1f5f9;
|
| 599 |
+
}
|
| 600 |
+
|
| 601 |
+
.clear-btn {
|
| 602 |
+
color: #64748b;
|
| 603 |
+
}
|
| 604 |
+
|
| 605 |
+
.clear-btn:hover {
|
| 606 |
+
color: #f87171;
|
| 607 |
+
background: #450a0a;
|
| 608 |
+
}
|
| 609 |
+
|
| 610 |
+
.footer {
|
| 611 |
+
color: #64748b;
|
| 612 |
+
}
|
| 613 |
+
|
| 614 |
+
.footer a {
|
| 615 |
+
color: #60a5fa;
|
| 616 |
+
}
|
| 617 |
+
|
| 618 |
+
.privacy-note {
|
| 619 |
+
color: #64748b;
|
| 620 |
+
}
|
| 621 |
+
|
| 622 |
+
.gpu-info {
|
| 623 |
+
color: #64748b;
|
| 624 |
+
}
|
| 625 |
+
|
| 626 |
+
/* Scrollbar dark */
|
| 627 |
+
.transcript-output::-webkit-scrollbar-track {
|
| 628 |
+
background: #1e293b;
|
| 629 |
+
}
|
| 630 |
+
|
| 631 |
+
.transcript-output::-webkit-scrollbar-thumb {
|
| 632 |
+
background: #475569;
|
| 633 |
+
}
|
| 634 |
+
|
| 635 |
+
.transcript-output::-webkit-scrollbar-thumb:hover {
|
| 636 |
+
background: #64748b;
|
| 637 |
+
}
|
| 638 |
}
|
vad.js
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Silero VAD for speech detection and silence trimming
|
| 3 |
+
* Based on the approach in ibm-granite/granite-speech HF demo
|
| 4 |
+
*/
|
| 5 |
+
|
| 6 |
+
// Cached ONNX Runtime inference session for the VAD model (null until loaded).
let vadSession = null;
// Model load currently in flight, shared by overlapping loadVAD() calls
// (null when no load is running).
let vadLoadPromise = null;
const VAD_SAMPLE_RATE = 16000;
const VAD_CHUNK_SIZE = 512; // 32ms chunks at 16kHz

/**
 * Load the VAD model from './silero_vad.onnx' and cache the session in
 * `vadSession`.
 *
 * Calling this again after a successful load is a no-op. Calls that overlap
 * while the model is still loading all wait on the same load instead of each
 * creating their own inference session. If the load fails, the error is
 * propagated to every waiting caller and the next call starts a fresh attempt.
 *
 * @returns {Promise<void>} Resolves once `vadSession` is ready.
 */
async function loadVAD() {
  if (vadSession) return;

  if (vadLoadPromise === null) {
    console.log('Loading VAD model...');
    vadLoadPromise = ort.InferenceSession.create('./silero_vad.onnx', {
      executionProviders: ['wasm'],
    });
  }

  const pending = vadLoadPromise;
  try {
    const session = await pending;
    // Only the first waiter to resume publishes the session and logs.
    if (!vadSession) {
      vadSession = session;
      console.log('VAD model loaded');
    }
  } finally {
    // Drop the finished (or failed) load so a later call can retry.
    if (vadLoadPromise === pending) {
      vadLoadPromise = null;
    }
  }
}
|
| 20 |
+
|
| 21 |
+
/**
 * Get speech timestamps using Silero VAD.
 *
 * The audio is fed to the model in consecutive chunks of VAD_CHUNK_SIZE
 * samples (the last chunk is zero-padded), carrying the model's recurrent
 * state from one chunk to the next. Consecutive chunks whose speech
 * probability is >= `threshold` are joined into one segment.
 *
 * @param {Float32Array|number[]} audioData - Mono samples; must support
 *   `.length` and `.slice()`. Fed to the model as VAD_SAMPLE_RATE audio.
 * @param {number} [threshold=0.5] - Minimum probability for a chunk to count
 *   as speech.
 * @returns {Promise<Array<{start: number, end: number}>>} Segments in samples.
 *   `start` and `end` fall on chunk boundaries, except that a segment still
 *   open at the end of the audio ends at `audioData.length`.
 */
async function getSpeechTimestamps(audioData, threshold = 0.5) {
  await loadVAD();

  // Recurrent state [2, 1, 128]: zero-initialised, then replaced by the
  // model's `stateN` output after every chunk.
  let state = new Float32Array(2 * 1 * 128);

  // The sample-rate input never changes, so build the scalar tensor once
  // instead of once per chunk.
  const srTensor = new ort.Tensor(
    'int64',
    BigInt64Array.from([BigInt(VAD_SAMPLE_RATE)]),
    [],
  );

  const speechProbs = [];

  // Process in chunks
  for (let offset = 0; offset < audioData.length; offset += VAD_CHUNK_SIZE) {
    // A fresh zero-filled buffer keeps the tail of a short final chunk silent.
    const chunk = new Float32Array(VAD_CHUNK_SIZE);
    chunk.set(audioData.slice(offset, offset + VAD_CHUNK_SIZE));

    const outputs = await vadSession.run({
      input: new ort.Tensor('float32', chunk, [1, VAD_CHUNK_SIZE]),
      state: new ort.Tensor('float32', state, [2, 1, 128]),
      sr: srTensor,
    });

    speechProbs.push(outputs.output.data[0]);
    // Copy so the next iteration does not alias the output buffer.
    state = new Float32Array(outputs.stateN.data);
  }

  // Find speech segments
  const segments = [];
  let inSpeech = false;
  let speechStart = 0;

  for (let i = 0; i < speechProbs.length; i++) {
    const isSpeech = speechProbs[i] >= threshold;

    if (isSpeech && !inSpeech) {
      speechStart = i * VAD_CHUNK_SIZE;
      inSpeech = true;
    } else if (!isSpeech && inSpeech) {
      segments.push({ start: speechStart, end: i * VAD_CHUNK_SIZE });
      inSpeech = false;
    }
  }

  // Speech that runs to the end of the audio closes at the true length,
  // not at the padded chunk boundary.
  if (inSpeech) {
    segments.push({ start: speechStart, end: audioData.length });
  }

  return segments;
}
|
| 86 |
+
|
| 87 |
+
/**
 * Get speech segments with merging (like granite-speech demo).
 *
 * Runs VAD over the audio, converts the detected sample ranges to seconds and
 * folds them into a list of segments:
 *   - the very first segment starts 0.3s before the detected speech
 *     (clamped at 0);
 *   - a detection whose padded start lies less than 0.5s after the previous
 *     segment's end extends that segment;
 *   - any other detection opens a new segment that begins exactly where the
 *     previous one ended, so the returned segments leave no gaps between them.
 *
 * When no speech is detected, a single segment spanning the whole audio is
 * returned.
 *
 * @param {Float32Array|number[]} audioData - Audio samples.
 * @param {number} [sampleRate=VAD_SAMPLE_RATE] - Divisor used to turn sample
 *   indices into seconds.
 * @returns {Promise<Array<{start: number, end: number}>>} Segments in seconds.
 */
async function getSpeechSegments(audioData, sampleRate = VAD_SAMPLE_RATE) {
  const startPadSeconds = 0.3; // lead-in kept before detected speech
  const mergeGapSeconds = 0.5; // detections closer than this are joined

  const detections = await getSpeechTimestamps(audioData);
  if (detections.length === 0) {
    return [{ start: 0, end: audioData.length / sampleRate }];
  }

  return detections.reduce((merged, detection) => {
    const endSeconds = detection.end / sampleRate;
    const paddedStart = Math.max(0, detection.start / sampleRate - startPadSeconds);
    const previous = merged[merged.length - 1];

    if (previous === undefined) {
      // First detection: keep the padded lead-in.
      merged.push({ start: paddedStart, end: endSeconds });
    } else if (paddedStart - previous.end < mergeGapSeconds) {
      // Close enough to the previous segment: grow it.
      previous.end = endSeconds;
    } else {
      // New segment, anchored to the previous end to avoid gaps.
      merged.push({ start: previous.end, end: endSeconds });
    }
    return merged;
  }, []);
}
|
| 123 |
+
|
| 124 |
+
/**
 * Trim silence from audio (simple version - just trim start/end).
 *
 * Keeps everything from the first detected speech segment to the last one,
 * plus 300ms of padding on each side (clamped to the bounds of the audio).
 * Silence between speech segments is not removed. If no speech is detected
 * the input is returned unchanged.
 *
 * @param {Float32Array|number[]} audioData - Audio samples.
 * @param {number} [sampleRate=VAD_SAMPLE_RATE] - Samples per second, used for
 *   the padding length and for the logged durations.
 * @returns {Promise<Float32Array|number[]>} `audioData` itself when nothing
 *   was detected, otherwise a `slice()` of it.
 */
async function trimSilence(audioData, sampleRate = VAD_SAMPLE_RATE) {
  const speech = await getSpeechTimestamps(audioData);

  if (speech.length === 0) {
    console.log('VAD: No speech detected, returning original audio');
    return audioData;
  }

  // Add padding (300ms)
  const pad = Math.floor(0.3 * sampleRate);
  const firstSpeech = speech[0];
  const lastSpeech = speech[speech.length - 1];

  const keepFrom = Math.max(0, firstSpeech.start - pad);
  const keepTo = Math.min(audioData.length, lastSpeech.end + pad);

  // Report how much was cut from each side, in seconds with two decimals.
  const asSeconds = (sampleCount) => (sampleCount / sampleRate).toFixed(2);
  const cutHead = asSeconds(keepFrom);
  const cutTail = asSeconds(audioData.length - keepTo);
  console.log(`VAD: Trimmed ${cutHead}s from start, ${cutTail}s from end`);

  return audioData.slice(keepFrom, keepTo);
}
|
| 145 |
+
|
| 146 |
+
/**
 * Format a duration in seconds as minutes and seconds ("M:SS").
 *
 * Fractions of a second are dropped. The seconds part is zero-padded to two
 * digits; the minutes part is not padded and is not wrapped at 60
 * (3600 seconds becomes "60:00").
 *
 * @param {number} seconds - Duration in seconds.
 * @returns {string} The formatted timestamp.
 */
function formatTimestamp(seconds) {
  const minutePart = Math.floor(seconds / 60);
  const secondPart = String(Math.floor(seconds % 60)).padStart(2, '0');
  return [minutePart, secondPart].join(':');
}
|
| 152 |
+
|
| 153 |
+
// Export: publish the public helpers on `window` so other scripts on the
// page can call them.
Object.assign(window, {
  loadVAD,
  trimSilence,
  getSpeechSegments,
  formatTimestamp,
});
|