Spaces:

Tonic
/

Petite-LLM-3

Running on Zero

App Files Community

Tonic committed on Jul 29

Commit

1a6008e

1 Parent(s): 55d7c97

add french description

Browse files

Files changed (1) hide show

app.py +49 -123

app.py CHANGED Viewed

@@ -12,25 +12,15 @@ import requests
 # Set torch to use float32 for better compatibility with quantized models
 torch.set_default_dtype(torch.float32)
-# Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Model configuration
 MAIN_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# Global variables for model and tokenizer
 model = None
 tokenizer = None
-# Default system prompt
 DEFAULT_SYSTEM_PROMPT = "Tu es TonicIA, un assistant francophone rigoureux et bienveillant."
-# Title and description content
 title = "# 🤖 Petite Elle L'Aime 3 - Chat Interface"
-description = "A fine-tuned version of SmolLM3-3B optimized for French and multilingual conversations. This is the int4 quantized version for efficient CPU deployment."
 presentation1 = """
 ### 🎯 Features
 - **Multilingual Support**: English, French, Italian, Portuguese, Chinese, Arabic
@@ -39,23 +29,19 @@ presentation1 = """
 - **Customizable System Prompt**: Define the assistant's personality and behavior
 - **Thinking Mode**: Enable reasoning mode with thinking tags
 """
-presentation2 = """
-### 📋 Model Information
-- **Base Model**: SmolLM3-3B
-- **Parameters**: ~3B
-- **Context Length**: 128k
-- **Languages**: English, French, Italian, Portuguese, Chinese, Arabic
-- **Device**: CPU optimized
-- **Quantization**: int4
 """
 joinus = """
-### 🚀 Quick Start
-1. Add context in the system prompt
-2. Type your message
-3. Click generate to start chatting
-4. Use advanced settings for fine-tuning
 """
 def download_chat_template():
     """Download the chat template from the main repository"""
     try:
@@ -82,25 +68,18 @@ def load_model():
     global model, tokenizer
     try:
-        # Load tokenizer from main repository to get the base configuration
         logger.info(f"Loading tokenizer from {MAIN_MODEL_ID}")
         tokenizer = AutoTokenizer.from_pretrained(MAIN_MODEL_ID, subfolder="int4")
-        # Download and set the chat template
         chat_template = download_chat_template()
         tokenizer.chat_template = chat_template
         logger.info("Chat template downloaded and set successfully")
-        # Load the int4 model from local path
         logger.info(f"Loading int4 model from {MAIN_MODEL_ID}")
-        # Configure model loading parameters for int4 quantization
         model_kwargs = {
             "device_map": "auto" if DEVICE == "cuda" else "cpu",
-            "torch_dtype": torch.float32,  # Use float32 for int4 quantized models
             "trust_remote_code": True,
-            "low_cpu_mem_usage": True,  # Help with memory management
         }
         logger.info(f"Model loading parameters: {model_kwargs}")
@@ -121,25 +100,16 @@ def load_model():
 def create_prompt(system_message, user_message, enable_thinking=True):
     """Create prompt using the model's chat template"""
     try:
-        # Prepare messages for the template
         formatted_messages = []
-        # Add system message if provided
         if system_message and system_message.strip():
             formatted_messages.append({"role": "system", "content": system_message})
-        # Add user message
-        formatted_messages.append({"role": "user", "content": user_message})
-        # Apply the chat template
         prompt = tokenizer.apply_chat_template(
             formatted_messages,
             tokenize=False,
             add_generation_prompt=True,
             enable_thinking=enable_thinking
-        )
-        # Add  /no_think to the end of prompt when thinking is disabled
         if not enable_thinking:
             prompt += " /no_think"
@@ -156,76 +126,36 @@ def generate_response(message, history, system_message, max_tokens, temperature,
     if model is None or tokenizer is None:
         return "Error: Model not loaded. Please wait for the model to load."
-    try:
-        # Create prompt using chat template
-        full_prompt = create_prompt(system_message, message, enable_thinking)
-        if not full_prompt:
-            return "Error: Failed to create prompt."
-        # Tokenize the input
-        inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
-        # Debug input tensor information
-        logger.info(f"Input tensor shapes: {[(k, v.shape, v.dtype) for k, v in inputs.items()]}")
-        # Move to device
-        if DEVICE == "cuda":
-            inputs = {k: v.cuda() for k, v in inputs.items()}
-        # Generate response
-        with torch.no_grad():
-            try:
-                output_ids = model.generate(
-                    inputs['input_ids'],
-                    max_new_tokens=max_tokens,
-                    temperature=temperature,
-                    top_p=top_p,
-                    do_sample=do_sample,
-                    attention_mask=inputs['attention_mask'],
-                    pad_token_id=tokenizer.eos_token_id,
-                    eos_token_id=tokenizer.eos_token_id
-                )
-            except RuntimeError as e:
-                if "expected scalar type" in str(e):
-                    logger.error(f"Data type mismatch error: {e}")
-                    # Try with explicit dtype conversion
-                    inputs['input_ids'] = inputs['input_ids'].to(torch.int64)
-                    inputs['attention_mask'] = inputs['attention_mask'].to(torch.int64)
-                    output_ids = model.generate(
-                        inputs['input_ids'],
-                        max_new_tokens=max_tokens,
-                        temperature=temperature,
-                        top_p=top_p,
-                        do_sample=do_sample,
-                        attention_mask=inputs['attention_mask'],
-                        pad_token_id=tokenizer.eos_token_id,
-                        eos_token_id=tokenizer.eos_token_id
-                    )
-                else:
-                    raise e
-        # Decode the response
-        response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-        # Extract only the new response (remove the input prompt)
         assistant_response = response[len(full_prompt):].strip()
-        # Clean up the response - only remove special tokens, preserve thinking tags when enabled
         assistant_response = re.sub(r'<\|im_start\|>.*?<\|im_end\|>', '', assistant_response, flags=re.DOTALL)
-        # Only remove thinking tags if thinking mode is disabled
         if not enable_thinking:
             assistant_response = re.sub(r'<think>.*?</think>', '', assistant_response, flags=re.DOTALL)
         assistant_response = assistant_response.strip()
         return assistant_response
-    except Exception as e:
-        logger.error(f"Error generating response: {e}")
-        return f"Error generating response: {str(e)}"
 def user(user_message, history):
     """Add user message to history"""
@@ -235,16 +165,12 @@ def user(user_message, history):
 def bot(history, system_prompt, max_length, temperature, top_p, advanced_checkbox, enable_thinking):
     """Generate bot response"""
-    # Get the last user message
     if not history:
-        return history
     user_message = history[-1]["content"] if history else ""
     do_sample = advanced_checkbox
     bot_message = generate_response(user_message, history, system_prompt, max_length, temperature, top_p, do_sample, enable_thinking)
-    # Add assistant response to history
     history.append({"role": "assistant", "content": bot_message})
     return history
@@ -275,40 +201,40 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=2):
             system_prompt = gr.TextArea(
-                label="📑 Context",
                 placeholder="Tu es TonicIA, un assistant francophone rigoureux et bienveillant.",
                 lines=5,
                 value=DEFAULT_SYSTEM_PROMPT
             )
             user_input = gr.TextArea(
-                label="🤷🏻‍♂️ User Input",
-                placeholder="Hi there my name is Tonic!",
                 lines=2
             )
             advanced_checkbox = gr.Checkbox(label="🧪 Advanced Settings", value=False)
             with gr.Column(visible=False) as advanced_settings:
                 max_length = gr.Slider(
-                    label="📏 Max Length",
-                    minimum=64,
-                    maximum=2048,
-                    value=512,
-                    step=64
                 )
                 temperature = gr.Slider(
-                    label="🌡️ Temperature",
                     minimum=0.01,
                     maximum=1.0,
-                    value=0.7,
                     step=0.01
                 )
                 top_p = gr.Slider(
-                    label="⚛️ Top-p (Nucleus Sampling)",
                     minimum=0.1,
                     maximum=1.0,
-                    value=0.9,
                     step=0.01
                 )
-                enable_thinking = gr.Checkbox(label="Enable Thinking Mode", value=True)
             generate_button = gr.Button(value="🤖 Petite Elle L'Aime 3")

 # Set torch to use float32 for better compatibility with quantized models
 torch.set_default_dtype(torch.float32)
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 MAIN_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 model = None
 tokenizer = None
 DEFAULT_SYSTEM_PROMPT = "Tu es TonicIA, un assistant francophone rigoureux et bienveillant."
 title = "# 🤖 Petite Elle L'Aime 3 - Chat Interface"
+description = "A fine-tuned version of SmolLM3-3B optimized for French conversations. This is the int4 quantized version for efficient CPU deployment."
 presentation1 = """
 ### 🎯 Features
 - **Multilingual Support**: English, French, Italian, Portuguese, Chinese, Arabic
 - **Customizable System Prompt**: Define the assistant's personality and behavior
 - **Thinking Mode**: Enable reasoning mode with thinking tags
 """
+presentation2 = """### 🎯 Fonctionnalités
+* **Support multilingue** : Anglais, Français, Italien, Portugais, Chinois, Arabe
+* **Quantification Int4** : Optimisé pour un déploiement sur CPU avec une réduction de mémoire d’environ 50 %
+* **Interface de chat interactive** : Conversation en temps réel avec le modèle
+* **Invite système personnalisable** : Définissez la personnalité et le comportement de l’assistant
+* **Mode Réflexion** : Activez le mode raisonnement avec des balises de réflexion
 """
 joinus = """
+## Join us :
+🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 """
 def download_chat_template():
     """Download the chat template from the main repository"""
     try:
     global model, tokenizer
     try:
         logger.info(f"Loading tokenizer from {MAIN_MODEL_ID}")
         tokenizer = AutoTokenizer.from_pretrained(MAIN_MODEL_ID, subfolder="int4")
         chat_template = download_chat_template()
         tokenizer.chat_template = chat_template
         logger.info("Chat template downloaded and set successfully")
         logger.info(f"Loading int4 model from {MAIN_MODEL_ID}")
         model_kwargs = {
             "device_map": "auto" if DEVICE == "cuda" else "cpu",
+            "torch_dtype": torch.float32,
             "trust_remote_code": True,
+            "low_cpu_mem_usage": True,
         }
         logger.info(f"Model loading parameters: {model_kwargs}")
 def create_prompt(system_message, user_message, enable_thinking=True):
     """Create prompt using the model's chat template"""
     try:
         formatted_messages = []
         if system_message and system_message.strip():
             formatted_messages.append({"role": "system", "content": system_message})
+        formatted_messages.append({"role": "user", "content": user_message})
         prompt = tokenizer.apply_chat_template(
             formatted_messages,
             tokenize=False,
             add_generation_prompt=True,
             enable_thinking=enable_thinking
+        )
         if not enable_thinking:
             prompt += " /no_think"
     if model is None or tokenizer is None:
         return "Error: Model not loaded. Please wait for the model to load."
+    full_prompt = create_prompt(system_message, message, enable_thinking)
+    if not full_prompt:
+        return "Error: Failed to create prompt."
+    inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
+    logger.info(f"Input tensor shapes: {[(k, v.shape, v.dtype) for k, v in inputs.items()]}")
+    if DEVICE == "cuda":
+        inputs = {k: v.cuda() for k, v in inputs.items()}
+    with torch.no_grad():
+        output_ids = model.generate(
+            inputs['input_ids'],
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=do_sample,
+            attention_mask=inputs['attention_mask'],
+            pad_token_id=tokenizer.eos_token_id,
+            eos_token_id=tokenizer.eos_token_id
+            )
+        response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
         assistant_response = response[len(full_prompt):].strip()
         assistant_response = re.sub(r'<\|im_start\|>.*?<\|im_end\|>', '', assistant_response, flags=re.DOTALL)
         if not enable_thinking:
             assistant_response = re.sub(r'<think>.*?</think>', '', assistant_response, flags=re.DOTALL)
         assistant_response = assistant_response.strip()
         return assistant_response
 def user(user_message, history):
     """Add user message to history"""
 def bot(history, system_prompt, max_length, temperature, top_p, advanced_checkbox, enable_thinking):
     """Generate bot response"""
     if not history:
+        return history
     user_message = history[-1]["content"] if history else ""
     do_sample = advanced_checkbox
     bot_message = generate_response(user_message, history, system_prompt, max_length, temperature, top_p, do_sample, enable_thinking)
     history.append({"role": "assistant", "content": bot_message})
     return history
     with gr.Row():
         with gr.Column(scale=2):
             system_prompt = gr.TextArea(
+                label="📑 Contexte",
                 placeholder="Tu es TonicIA, un assistant francophone rigoureux et bienveillant.",
                 lines=5,
                 value=DEFAULT_SYSTEM_PROMPT
             )
             user_input = gr.TextArea(
+                label="🤷🏻‍♂️ Message",
+                placeholder="Bonjour je m'appel Tonic!",
                 lines=2
             )
             advanced_checkbox = gr.Checkbox(label="🧪 Advanced Settings", value=False)
             with gr.Column(visible=False) as advanced_settings:
                 max_length = gr.Slider(
+                    label="📏 Longueur de la réponse",
+                    minimum=10,
+                    maximum=556,
+                    value=120,
+                    step=1
                 )
                 temperature = gr.Slider(
+                    label="🌡️ Température",
                     minimum=0.01,
                     maximum=1.0,
+                    value=0.5,
                     step=0.01
                 )
                 top_p = gr.Slider(
+                    label="⚛️ Top-p (Echantillonnage)",
                     minimum=0.1,
                     maximum=1.0,
+                    value=0.95,
                     step=0.01
                 )
+                enable_thinking = gr.Checkbox(label="Mode Réflexion", value=True)
             generate_button = gr.Button(value="🤖 Petite Elle L'Aime 3")