Tonic committed
Commit 1a6008e · 1 Parent(s): 55d7c97

add french description

Files changed (1)
  1. app.py +49 -123

app.py CHANGED
@@ -12,25 +12,15 @@ import requests
 
 # Set torch to use float32 for better compatibility with quantized models
 torch.set_default_dtype(torch.float32)
-
- # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-
- # Model configuration
 MAIN_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
- # Global variables for model and tokenizer
 model = None
 tokenizer = None
-
- # Default system prompt
 DEFAULT_SYSTEM_PROMPT = "Tu es TonicIA, un assistant francophone rigoureux et bienveillant."
-
- # Title and description content
 title = "# 🤖 Petite Elle L'Aime 3 - Chat Interface"
- description = "A fine-tuned version of SmolLM3-3B optimized for French and multilingual conversations. This is the int4 quantized version for efficient CPU deployment."
+ description = "A fine-tuned version of SmolLM3-3B optimized for French conversations. This is the int4 quantized version for efficient CPU deployment."
 presentation1 = """
 ### 🎯 Features
 - **Multilingual Support**: English, French, Italian, Portuguese, Chinese, Arabic
@@ -39,23 +29,19 @@ presentation1 = """
 - **Customizable System Prompt**: Define the assistant's personality and behavior
 - **Thinking Mode**: Enable reasoning mode with thinking tags
 """
- presentation2 = """
- ### 📋 Model Information
- - **Base Model**: SmolLM3-3B
- - **Parameters**: ~3B
- - **Context Length**: 128k
- - **Languages**: English, French, Italian, Portuguese, Chinese, Arabic
- - **Device**: CPU optimized
- - **Quantization**: int4
+ presentation2 = """### 🎯 Fonctionnalités
+ * **Support multilingue** : Anglais, Français, Italien, Portugais, Chinois, Arabe
+ * **Quantification Int4** : Optimisé pour un déploiement sur CPU avec une réduction de mémoire d’environ 50 %
+ * **Interface de chat interactive** : Conversation en temps réel avec le modèle
+ * **Invite système personnalisable** : Définissez la personnalité et le comportement de l’assistant
+ * **Mode Réflexion** : Activez le mode raisonnement avec des balises de réflexion
 """
 joinus = """
- ### 🚀 Quick Start
- 1. Add context in the system prompt
- 2. Type your message
- 3. Click generate to start chatting
- 4. Use advanced settings for fine-tuning
+ ## Join us :
+ 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 """
 
+
 def download_chat_template():
 """Download the chat template from the main repository"""
 try:
@@ -82,25 +68,18 @@ def load_model():
 global model, tokenizer
 
 try:
- # Load tokenizer from main repository to get the base configuration
 logger.info(f"Loading tokenizer from {MAIN_MODEL_ID}")
 tokenizer = AutoTokenizer.from_pretrained(MAIN_MODEL_ID, subfolder="int4")
-
- # Download and set the chat template
 chat_template = download_chat_template()
 tokenizer.chat_template = chat_template
 logger.info("Chat template downloaded and set successfully")
 
-
- # Load the int4 model from local path
 logger.info(f"Loading int4 model from {MAIN_MODEL_ID}")
-
- # Configure model loading parameters for int4 quantization
 model_kwargs = {
 "device_map": "auto" if DEVICE == "cuda" else "cpu",
- "torch_dtype": torch.float32, # Use float32 for int4 quantized models
+ "torch_dtype": torch.float32,
 "trust_remote_code": True,
- "low_cpu_mem_usage": True, # Help with memory management
+ "low_cpu_mem_usage": True,
 }
 
 logger.info(f"Model loading parameters: {model_kwargs}")
@@ -121,25 +100,16 @@ def load_model():
 def create_prompt(system_message, user_message, enable_thinking=True):
 """Create prompt using the model's chat template"""
 try:
- # Prepare messages for the template
 formatted_messages = []
-
- # Add system message if provided
 if system_message and system_message.strip():
 formatted_messages.append({"role": "system", "content": system_message})
-
- # Add user message
- formatted_messages.append({"role": "user", "content": user_message})
-
- # Apply the chat template
+ formatted_messages.append({"role": "user", "content": user_message})
 prompt = tokenizer.apply_chat_template(
 formatted_messages,
 tokenize=False,
 add_generation_prompt=True,
 enable_thinking=enable_thinking
- )
-
- # Add /no_think to the end of prompt when thinking is disabled
+ )
 if not enable_thinking:
 prompt += " /no_think"
 
@@ -156,76 +126,36 @@ def generate_response(message, history, system_message, max_tokens, temperature,
 
 if model is None or tokenizer is None:
 return "Error: Model not loaded. Please wait for the model to load."
-
- try:
- # Create prompt using chat template
- full_prompt = create_prompt(system_message, message, enable_thinking)
+ full_prompt = create_prompt(system_message, message, enable_thinking)
+
+ if not full_prompt:
+ return "Error: Failed to create prompt."
 
- if not full_prompt:
- return "Error: Failed to create prompt."
-
- # Tokenize the input
- inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
-
- # Debug input tensor information
- logger.info(f"Input tensor shapes: {[(k, v.shape, v.dtype) for k, v in inputs.items()]}")
-
- # Move to device
- if DEVICE == "cuda":
- inputs = {k: v.cuda() for k, v in inputs.items()}
-
- # Generate response
- with torch.no_grad():
- try:
- output_ids = model.generate(
- inputs['input_ids'],
- max_new_tokens=max_tokens,
- temperature=temperature,
- top_p=top_p,
- do_sample=do_sample,
- attention_mask=inputs['attention_mask'],
- pad_token_id=tokenizer.eos_token_id,
- eos_token_id=tokenizer.eos_token_id
- )
- except RuntimeError as e:
- if "expected scalar type" in str(e):
- logger.error(f"Data type mismatch error: {e}")
- # Try with explicit dtype conversion
- inputs['input_ids'] = inputs['input_ids'].to(torch.int64)
- inputs['attention_mask'] = inputs['attention_mask'].to(torch.int64)
- output_ids = model.generate(
- inputs['input_ids'],
- max_new_tokens=max_tokens,
- temperature=temperature,
- top_p=top_p,
- do_sample=do_sample,
- attention_mask=inputs['attention_mask'],
- pad_token_id=tokenizer.eos_token_id,
- eos_token_id=tokenizer.eos_token_id
- )
- else:
- raise e
-
- # Decode the response
- response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-
- # Extract only the new response (remove the input prompt)
+ inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
+ logger.info(f"Input tensor shapes: {[(k, v.shape, v.dtype) for k, v in inputs.items()]}")
+
+ if DEVICE == "cuda":
+ inputs = {k: v.cuda() for k, v in inputs.items()}
+ with torch.no_grad():
+ output_ids = model.generate(
+ inputs['input_ids'],
+ max_new_tokens=max_tokens,
+ temperature=temperature,
+ top_p=top_p,
+ do_sample=do_sample,
+ attention_mask=inputs['attention_mask'],
+ pad_token_id=tokenizer.eos_token_id,
+ eos_token_id=tokenizer.eos_token_id
+ )
+ response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
 assistant_response = response[len(full_prompt):].strip()
-
- # Clean up the response - only remove special tokens, preserve thinking tags when enabled
 assistant_response = re.sub(r'<\|im_start\|>.*?<\|im_end\|>', '', assistant_response, flags=re.DOTALL)
-
- # Only remove thinking tags if thinking mode is disabled
 if not enable_thinking:
 assistant_response = re.sub(r'<think>.*?</think>', '', assistant_response, flags=re.DOTALL)
 
 assistant_response = assistant_response.strip()
 
 return assistant_response
-
- except Exception as e:
- logger.error(f"Error generating response: {e}")
- return f"Error generating response: {str(e)}"
 
 def user(user_message, history):
 """Add user message to history"""
@@ -235,16 +165,12 @@ def user(user_message, history):
 
 def bot(history, system_prompt, max_length, temperature, top_p, advanced_checkbox, enable_thinking):
 """Generate bot response"""
- # Get the last user message
 if not history:
- return history
-
+ return history
 user_message = history[-1]["content"] if history else ""
 
 do_sample = advanced_checkbox
 bot_message = generate_response(user_message, history, system_prompt, max_length, temperature, top_p, do_sample, enable_thinking)
-
- # Add assistant response to history
 history.append({"role": "assistant", "content": bot_message})
 return history
 
@@ -275,40 +201,40 @@ with gr.Blocks() as demo:
 with gr.Row():
 with gr.Column(scale=2):
 system_prompt = gr.TextArea(
- label="📑 Context",
+ label="📑 Contexte",
 placeholder="Tu es TonicIA, un assistant francophone rigoureux et bienveillant.",
 lines=5,
 value=DEFAULT_SYSTEM_PROMPT
 )
 user_input = gr.TextArea(
- label="🤷🏻‍♂️ User Input",
- placeholder="Hi there my name is Tonic!",
+ label="🤷🏻‍♂️ Message",
+ placeholder="Bonjour je m'appel Tonic!",
 lines=2
 )
 advanced_checkbox = gr.Checkbox(label="🧪 Advanced Settings", value=False)
 with gr.Column(visible=False) as advanced_settings:
 max_length = gr.Slider(
- label="📏 Max Length",
- minimum=64,
- maximum=2048,
- value=512,
- step=64
+ label="📏 Longueur de la réponse",
+ minimum=10,
+ maximum=556,
+ value=120,
+ step=1
 )
 temperature = gr.Slider(
- label="🌡️ Temperature",
+ label="🌡️ Température",
 minimum=0.01,
 maximum=1.0,
- value=0.7,
+ value=0.5,
 step=0.01
 )
 top_p = gr.Slider(
- label="⚛️ Top-p (Nucleus Sampling)",
+ label="⚛️ Top-p (Echantillonnage)",
 minimum=0.1,
 maximum=1.0,
- value=0.9,
+ value=0.95,
 step=0.01
 )
- enable_thinking = gr.Checkbox(label="Enable Thinking Mode", value=True)
+ enable_thinking = gr.Checkbox(label="Mode Réflexion", value=True)
 
 generate_button = gr.Button(value="🤖 Petite Elle L'Aime 3")
 
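
For reference, a minimal standalone sketch of the generation flow this commit configures. It is hypothetical usage, not part of the commit: it assumes the int4 weights load via AutoModelForCausalLM from the same repo and "int4" subfolder as the tokenizer, skips the separately downloaded chat template that app.py installs, and mirrors the new UI defaults (max_new_tokens=120, temperature=0.5, top_p=0.95).

```python
# Hypothetical usage sketch (assumptions noted above); adapted from app.py in this commit.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "Tonic/petite-elle-L-aime-3-sft"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, subfolder="int4")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    subfolder="int4",           # assumption: quantized weights live in this subfolder
    torch_dtype=torch.float32,  # mirrors model_kwargs in app.py
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

messages = [
    {"role": "system", "content": "Tu es TonicIA, un assistant francophone rigoureux et bienveillant."},
    {"role": "user", "content": "Bonjour !"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=120,   # new UI default
        temperature=0.5,      # new UI default
        top_p=0.95,           # new UI default
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

# Strip the prompt from the decoded text, as generate_response() does.
text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(text[len(prompt):].strip())
```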