calettippo committed
Commit edf497c · 1 Parent(s): 65b0afc

Improve Whisper pipeline caching

Files changed (2)
  1. .gitignore +1 -0
  2. app.py +127 -33
.gitignore CHANGED
@@ -4,3 +4,4 @@ __pycache__/
 *.py[cod]
 .DS_Store
 *.log
+hf_models/
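
Note: hf_models/ is ignored because app.py (below) now stores downloaded model snapshots there whenever the HF_MODEL_CACHE_DIR environment variable is not set. A minimal sketch of how that default path is resolved, using a hypothetical model id that is not part of this commit:

import os

# Assumption for illustration only: any Hub model id would do.
model_id = "openai/whisper-large-v3"
cache_root = os.environ.get("HF_MODEL_CACHE_DIR") or os.path.join(
    os.path.dirname(__file__), "hf_models"
)
local_dir = os.path.join(cache_root, model_id.replace("/", "__"))
# e.g. <app dir>/hf_models/openai__whisper-large-v3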
app.py CHANGED
@@ -5,6 +5,7 @@ import time
 import logging
 import gc
 import io
+import threading
 from dataclasses import dataclass
 from typing import Optional, Tuple, List, Any, Dict
 from contextlib import contextmanager
@@ -18,6 +19,7 @@ from pydub import AudioSegment
 from pydub.silence import split_on_silence
 import soundfile as sf
 import noisereduce
+from huggingface_hub import snapshot_download

 load_dotenv()

@@ -25,6 +27,13 @@ load_dotenv()
 PREPROCESSING_AVAILABLE = True


+# Shared caches to keep models/pipelines in memory across requests
+PIPELINE_CACHE: Dict[Tuple[str, str, str], Tuple[Any, str, str]] = {}
+PIPELINE_CACHE_LOCK = threading.Lock()
+MODEL_PATH_CACHE: Dict[str, str] = {}
+MODEL_PATH_CACHE_LOCK = threading.Lock()
+
+
 def get_env_or_secret(key: str, default: Optional[str] = None) -> Optional[str]:
     """Get environment variable or default."""
     return os.environ.get(key, default)
@@ -51,6 +60,51 @@ class PreprocessingConfig:
     remove_silence: bool = True


+def ensure_local_model(model_id: str, hf_token: Optional[str] = None) -> str:
+    """Ensure a model snapshot is available locally and return its path."""
+
+    if os.path.isdir(model_id):
+        return model_id
+
+    with MODEL_PATH_CACHE_LOCK:
+        cached_path = MODEL_PATH_CACHE.get(model_id)
+        if cached_path and os.path.isdir(cached_path):
+            return cached_path
+
+    logger = logging.getLogger(__name__)
+
+    cache_root = get_env_or_secret("HF_MODEL_CACHE_DIR")
+    if not cache_root:
+        cache_root = os.path.join(os.path.dirname(__file__), "hf_models")
+
+    os.makedirs(cache_root, exist_ok=True)
+    local_dir = os.path.join(cache_root, model_id.replace("/", "__"))
+
+    try:
+        snapshot_download(
+            repo_id=model_id,
+            token=hf_token,
+            local_dir=local_dir,
+            local_dir_use_symlinks=False,
+            resume_download=True,
+        )
+    except Exception as download_error:
+        # If download fails but we already have weights, continue with local copy
+        if os.path.isdir(local_dir) and os.listdir(local_dir):
+            logger.warning(
+                "Unable to refresh model %s from hub (%s), using existing files",
+                model_id,
+                download_error,
+            )
+        else:
+            raise
+
+    with MODEL_PATH_CACHE_LOCK:
+        MODEL_PATH_CACHE[model_id] = local_dir
+
+    return local_dir
+
+
 def normalize_audio(audio_bytes: bytes) -> bytes:
     """
     Convert an audio chunk in bytes to the standard format for Whisper.
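
The ensure_local_model helper added above is what makes repeated loads cheap: the first call downloads (or resumes) a snapshot into the shared cache directory, later calls return the cached path from MODEL_PATH_CACHE without touching the network, and a failed refresh falls back to whatever files already exist on disk. A minimal usage sketch, assuming app.py's names are importable as a module; the model id is only an example:

from app import MODEL_PATH_CACHE, ensure_local_model

model_id = "openai/whisper-large-v3"  # assumption: any Hub ASR model id

first_path = ensure_local_model(model_id)   # downloads or resumes the snapshot
second_path = ensure_local_model(model_id)  # served from MODEL_PATH_CACHE, no network call

assert first_path == second_path
assert MODEL_PATH_CACHE[model_id] == first_path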
 
@@ -248,59 +302,95 @@ def load_asr_pipeline(
     logger.info(f" Chunk length: {chunk_length_s}s")
     logger.info(f" Return timestamps: {return_timestamps}")

+    dtype_name = str(dtype).replace("torch.", "") if dtype is not None else "auto"
+    cache_key = (model_id, device_str, dtype_name)
+
+    with PIPELINE_CACHE_LOCK:
+        cached_pipeline = PIPELINE_CACHE.get(cache_key)
+        if cached_pipeline:
+            logger.info(
+                "Reusing cached pipeline for %s on %s (%s)",
+                model_id,
+                device_str,
+                dtype_name,
+            )
+            return cached_pipeline
+
+    model_source = ensure_local_model(model_id, hf_token=hf_token)
+    logger.info(f"Using local model files from: {model_source}")
+
+    device_argument: Any = 0 if device_str == "cuda" else device_str
+
+    pipeline_kwargs = {
+        "task": "automatic-speech-recognition",
+        "model": model_source,
+        "device": device_argument,
+    }
+    if dtype is not None:
+        pipeline_kwargs["torch_dtype"] = dtype
+
     # Use ultra-simplified approach to avoid all compatibility issues
     try:
         logger.info(
             "Setting up ultra-simplified pipeline to avoid forced_decoder_ids conflicts..."
         )

-        # Create pipeline with absolute minimal configuration
-        asr = pipeline(
-            task="automatic-speech-recognition",
-            model=model_id,
-            torch_dtype=dtype,
-            device=0
-            if device_str == "cuda"
-            else ("mps" if device_str == "mps" else "cpu"),
-            token=hf_token,
-        )
+        asr = pipeline(**pipeline_kwargs)

         # Post-loading cleanup to remove any forced_decoder_ids
-        if hasattr(asr.model, "generation_config"):
-            if hasattr(asr.model.generation_config, "forced_decoder_ids"):
-                logger.info("Removing forced_decoder_ids from model generation config")
-                asr.model.generation_config.forced_decoder_ids = None
+        if hasattr(asr.model, "generation_config") and hasattr(
+            asr.model.generation_config, "forced_decoder_ids"
+        ):
+            logger.info("Removing forced_decoder_ids from model generation config")
+            asr.model.generation_config.forced_decoder_ids = None

-        # Set basic parameters after loading
         if chunk_length_s:
             logger.info(f"Setting chunk_length_s to {chunk_length_s}")

+        final_device = device_str
+        final_dtype = dtype
+        final_dtype_name = dtype_name
+
         logger.info(f"Successfully created ultra-simplified pipeline for: {model_id}")

     except Exception as e:
         logger.error(f"Ultra-simplified pipeline creation failed: {e}")
         logger.info("Falling back to absolute minimal settings...")

-        try:
-            # Fallback with absolute minimal settings
-            fallback_dtype = torch.float32
-
-            asr = pipeline(
-                task="automatic-speech-recognition",
-                model=model_id,
-                torch_dtype=fallback_dtype,
-                device="cpu",  # Force CPU for maximum compatibility
-                token=hf_token,
+        fallback_device = "cpu"
+        fallback_dtype = torch.float32
+        fallback_dtype_name = str(fallback_dtype).replace("torch.", "")
+        fallback_key = (model_id, fallback_device, fallback_dtype_name)
+
+        with PIPELINE_CACHE_LOCK:
+            cached_pipeline = PIPELINE_CACHE.get(fallback_key)
+            if cached_pipeline:
+                logger.info(
+                    "Reusing cached fallback pipeline for %s (%s)",
+                    model_id,
+                    fallback_dtype_name,
                )
+                return cached_pipeline

-            # Post-loading cleanup
-            if hasattr(asr.model, "generation_config"):
-                if hasattr(asr.model.generation_config, "forced_decoder_ids"):
-                    logger.info("Removing forced_decoder_ids from fallback model")
-                    asr.model.generation_config.forced_decoder_ids = None
+        fallback_kwargs = {
+            "task": "automatic-speech-recognition",
+            "model": model_source,
+            "device": fallback_device,
+            "torch_dtype": fallback_dtype,
+        }

-            device_str = "cpu"
-            dtype = fallback_dtype
+        try:
+            asr = pipeline(**fallback_kwargs)
+
+            if hasattr(asr.model, "generation_config") and hasattr(
+                asr.model.generation_config, "forced_decoder_ids"
+            ):
+                logger.info("Removing forced_decoder_ids from fallback model")
+                asr.model.generation_config.forced_decoder_ids = None
+
+            final_device = fallback_device
+            final_dtype = fallback_dtype
+            final_dtype_name = fallback_dtype_name
             logger.info(
                 f"Minimal fallback pipeline created with dtype: {fallback_dtype}"
             )
@@ -309,7 +399,11 @@ def load_asr_pipeline(
         logger.error(f"Minimal fallback failed: {fallback_error}")
         raise

-    return asr, device_str, str(dtype).replace("torch.", "")
+    cache_key = (model_id, final_device, final_dtype_name)
+    with PIPELINE_CACHE_LOCK:
+        PIPELINE_CACHE[cache_key] = (asr, final_device, final_dtype_name)
+
+    return asr, final_device, final_dtype_name


 @contextmanager
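
With PIPELINE_CACHE keyed on (model_id, device, dtype), a second request for the same configuration reuses the already-constructed transformers pipeline instead of reloading weights. A rough sketch of the expected behaviour; load_asr_pipeline's full signature is not shown in this diff, so the call below assumes it accepts the model id as its first argument:

from app import PIPELINE_CACHE, load_asr_pipeline

asr_first, device, dtype_name = load_asr_pipeline("openai/whisper-small")
asr_second, _, _ = load_asr_pipeline("openai/whisper-small")

# Same cache key, so the exact same pipeline object comes back the second time.
assert asr_first is asr_second
assert ("openai/whisper-small", device, dtype_name) in PIPELINE_CACHE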