Automatic Speech Recognition
Transformers
Safetensors
Danish
qwen3_asr
text-generation
audio
speech
danish
qwen3-asr
trust-remote-code
custom-code
custom_code
Instructions to use capacit-ai/saga with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use capacit-ai/saga with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="capacit-ai/saga", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("capacit-ai/saga", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| # coding=utf-8 | |
| # Copyright 2026 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import re | |
| import numpy as np | |
| from transformers.audio_utils import AudioInput | |
| from transformers.feature_extraction_utils import BatchFeature | |
| from transformers.processing_utils import ProcessingKwargs, ProcessorMixin | |
| from transformers.tokenization_utils_base import TextInput | |
class Qwen3ASRProcessorKwargs(ProcessingKwargs, total=False):
    """Default keyword arguments used by the Qwen3ASR processor for its tokenizer and feature extractor."""

    # Text: no padding by default; when padding is requested it is applied on the left.
    # Audio: sampling rate of 16000, padded batches, attention mask returned.
    _defaults = {
        "text_kwargs": dict(padding=False, padding_side="left"),
        "audio_kwargs": dict(sampling_rate=16000, padding=True, return_attention_mask=True),
    }
def _get_feat_extract_output_lengths(input_lengths):
    """
    Computes the audio encoder output length(s) for the given input length(s).

    Each complete group of 100 input positions contributes 13 output positions. The remaining
    `input_lengths % 100` positions are reduced by three successive `(n - 1) // 2 + 1` steps.
    Only `%`, `//`, `+`, `-` and `*` are applied, so plain integers and array-like values that
    support these operators element-wise are both accepted.
    """

    def _halve(n):
        # One length-reduction step: `(n - 1) // 2 + 1`.
        return (n - 1) // 2 + 1

    full_groups = input_lengths // 100
    remainder = input_lengths % 100
    remainder_outputs = _halve(_halve(_halve(remainder)))
    return remainder_outputs + full_groups * 13
class Qwen3ASRProcessor(ProcessorMixin):
    r"""
    Constructs a Qwen3ASR processor.

    [`Qwen3ASRProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`], and [`Qwen2TokenizerFast`]. See the
    [`~Qwen3ASRProcessor.__call__`] and [`~Qwen3ASRProcessor.decode`] for more information.

    Args:
        feature_extractor ([`WhisperFeatureExtractor`], *optional*):
            The audio feature extractor.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The text tokenizer.
        chat_template (`Optional[str]`, *optional*):
            The Jinja template to use for formatting the conversation. If not provided, the default chat template is used.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(self, feature_extractor=None, tokenizer=None, chat_template=None):
        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
        # Cache the audio special tokens exposed by the tokenizer.
        self.audio_token = self.tokenizer.audio_token
        self.audio_bos_token = self.tokenizer.audio_bos_token
        self.audio_eos_token = self.tokenizer.audio_eos_token

    def __call__(
        self,
        text: TextInput = None,
        audio: AudioInput = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] to encode the text. To prepare
        the audio(s), this method forwards the `audio` and `kwargs` arguments to WhisperFeatureExtractor's
        [`~WhisperFeatureExtractor.__call__`] if `audio` is not `None`. Please refer to the docstring of the above two
        methods for more information.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            audio (`np.ndarray`, `List[np.ndarray]`):
                The audio or batch of audio to be prepared. Each audio can be a NumPy array.

        Returns:
            [`BatchFeature`]: The tokenizer outputs merged with the feature extractor outputs. The feature extractor's
            `attention_mask` is exposed as `feature_attention_mask`.

        Raises:
            ValueError: If `text` is `None`, or if `text` contains more audio tokens than audios were provided.
        """
        if text is None:
            raise ValueError("You need to specify a `text` input to process.")

        output_kwargs = self._merge_kwargs(
            Qwen3ASRProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if audio is not None:
            # Padding is always enabled and truncation always disabled, overriding any caller-supplied value.
            output_kwargs["audio_kwargs"]["padding"] = True
            output_kwargs["audio_kwargs"]["truncation"] = False
            audio_inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"])
            # Rename the mask so it does not clash with the tokenizer's `attention_mask` when both are merged.
            audio_inputs["feature_attention_mask"] = audio_inputs.pop("attention_mask")
            # One encoder output length per audio, consumed in order while expanding audio tokens.
            audio_lengths = iter(_get_feat_extract_output_lengths(audio_inputs["feature_attention_mask"].sum(-1)))
        else:
            audio_inputs = {}
            audio_lengths = iter([])

        if not isinstance(text, list):
            text = [text]

        text = self.replace_multimodal_special_tokens(text, audio_lengths)
        texts_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

        return BatchFeature(
            data={**texts_inputs, **audio_inputs},
            tensor_type=kwargs.get("return_tensors"),
        )

    def replace_multimodal_special_tokens(
        self,
        text,
        audio_lengths,
    ):
        """
        Expands every audio token in `text` to as many audio tokens as the matching audio needs.

        Audio tokens are matched with `audio_lengths` in order of appearance across all samples.

        Args:
            text (`List[str]`):
                The samples whose audio tokens should be expanded.
            audio_lengths (iterable of integer-like values):
                The number of audio tokens to emit for each audio, in order.

        Returns:
            `List[str]`: The samples with each audio token repeated according to its audio length.

        Raises:
            ValueError: If `text` contains more audio tokens than `audio_lengths` provides values.
        """
        lengths = iter(audio_lengths)
        audio_token = self.audio_token
        audio_token_pattern = re.compile(re.escape(audio_token))

        def _expand(_match):
            # A bare StopIteration escaping from here would be a confusing way to report a count mismatch.
            try:
                num_tokens = next(lengths)
            except StopIteration:
                raise ValueError(
                    "`text` contains more audio tokens than the number of `audio` inputs provided."
                ) from None
            return audio_token * int(num_tokens)

        # A single substitution pass never rescans its own output, so no temporary placeholder is needed.
        return [audio_token_pattern.sub(_expand, sample) for sample in text]

    def get_chunked_index(self, token_indices: np.ndarray, tokens_per_chunk: int) -> list[tuple[int, int]]:
        """
        Splits token index list into chunks based on token value ranges.

        Given a list of token indices, returns a list of (start, end) index tuples representing
        slices of the list where the token values fall within successive ranges of `tokens_per_chunk`.
        For example, if `tokens_per_chunk` is 1000, the function will create chunks such that:
        - the first chunk contains token values < 1000,
        - the second chunk contains values >= 1000 and < 2000, and so on.

        Parameters:
            token_indices (`np.ndarray`): A monotonically increasing list of token index values.
            tokens_per_chunk (`int`): Number of tokens per chunk (used as the chunk size threshold).

        Returns:
            `list[tuple[int, int]]`: A list of tuples, each representing the start (inclusive)
                                and end (exclusive) indices of a chunk in `token_indices`.
        """
        chunks = []
        start_idx = 0
        current_chunk = 1
        for i, token_index in enumerate(token_indices):
            # Crossing the current threshold closes the running chunk; the threshold advances by one chunk only.
            if token_index >= current_chunk * tokens_per_chunk:
                chunks.append((start_idx, i))
                start_idx = i
                current_chunk += 1
        # The final chunk always runs to the end of `token_indices` (it is `(0, 0)` for empty input).
        chunks.append((start_idx, len(token_indices)))
        return chunks

    @staticmethod
    def load_audio(path: str, target_sr: int = 16_000) -> "np.ndarray":
        """Load an audio file as a mono float32 waveform resampled to *target_sr*.

        Multi-channel audio is averaged across channels. Resampling uses plain linear
        interpolation (`np.interp`) with no anti-aliasing filter.

        Args:
            path: Path to the audio file, in any format `soundfile` can read.
            target_sr: Target sample rate in Hz. Defaults to 16 000.

        Returns:
            1-D float32 NumPy array. An empty array is returned when resampling would
            produce at most one sample.
        """
        # Imported lazily so `soundfile` is only required when this helper is used.
        import soundfile as sf

        audio, sr = sf.read(path, dtype="float32")
        audio = np.asarray(audio, dtype=np.float32)
        if audio.ndim > 1:
            # Down-mix to mono by averaging over axis 1.
            audio = audio.mean(axis=1)
        if int(sr) != target_sr:
            src_len = audio.shape[0]
            dst_len = int(round(src_len * (float(target_sr) / float(sr))))
            if dst_len <= 1:
                return np.zeros((0,), dtype=np.float32)
            src_x = np.linspace(0.0, 1.0, num=src_len, endpoint=False)
            dst_x = np.linspace(0.0, 1.0, num=dst_len, endpoint=False)
            audio = np.interp(dst_x, src_x, audio).astype(np.float32)
        return audio

    def build_prompt(self, language: str = "Danish") -> str:
        """Build the base ASR text prompt with a language tag.

        Args:
            language: Language name injected into the prompt. Defaults to
                ``"Danish"``.

        Returns:
            The full prompt string ready for continuation decoding.
        """
        messages = [
            {"role": "system", "content": ""},
            {"role": "user", "content": [{"type": "audio", "audio": ""}]},
        ]
        prompt = self.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
        )
        return prompt + f"language {language}<asr_text>"

    def apply_chat_template(self, conversations, chat_template=None, **kwargs):
        """Forward to the parent class's `apply_chat_template` with the same arguments."""
        return super().apply_chat_template(conversations, chat_template, **kwargs)

    @property
    def model_input_names(self):
        """Tokenizer and feature extractor input names plus `feature_attention_mask`, de-duplicated in order."""
        tokenizer_input_names = self.tokenizer.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        return list(
            dict.fromkeys(
                tokenizer_input_names
                + feature_extractor_input_names
                + ["feature_attention_mask"]
            )
        )
# Names exported by `from <module> import *`.
__all__ = ["Qwen3ASRProcessor"]