Vu Anh Claude committed on
Commit 742fa4d · 1 Parent(s): 05cf955

Clean up training runs and enhance model export functionality


- Remove 8 training runs without exported models to reduce repository size
- Keep only runs/20250928_060819 (SVC model with 72.47% accuracy and an exported model)
- Update train.py with --export-model flag and joblib format support
- Enhance use_this_model.py with consistent prediction interface matching inference.py
- Add model export functionality for distribution and publishing
- Optimize repository structure while preserving all distributed models

Repository cleanup:
- Removed runs: 20250928_054424, 20250928_054605, 20250928_054642, 20250928_054802
- Removed runs: 20250928_054813, 20250928_054840, 20250928_055536, 20250928_060804
- Preserved: runs/20250928_060819 (corresponds to uts2017_bank_classifier_20250928_060819.joblib)
- All exported models maintained for inference and deployment (see the loading sketch below)
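
A minimal loading sketch for the preserved export. The filename comes from this commit; the undertheseanlp/sonar_core_1 repo id is assumed from the examples in use_this_model.py and may need adjusting if the model is published elsewhere:

from huggingface_hub import hf_hub_download
import joblib

# Download and load the preserved SVC export (the 72.47% accuracy run).
# Repo id is an assumption; the filename matches the run kept by this commit.
model = joblib.load(
    hf_hub_download("undertheseanlp/sonar_core_1", "uts2017_bank_classifier_20250928_060819.joblib")
)
print(model.predict(["Tôi muốn mở tài khoản tiết kiệm"])[0])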

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>

runs/20250928_060819/models/UTS2017_Bank_SVC_feat20k_ngram1-2.joblib ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eaaffac2cb04faf77023502c3fc144ff2503a0ff9211c574c1b07424a0ad6e08
+size 1674180
runs/20250928_060819/models/model.joblib ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eaaffac2cb04faf77023502c3fc144ff2503a0ff9211c574c1b07424a0ad6e08
+size 1674180
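
Both pointers reference the same LFS object (identical oid and size), so model.joblib is a byte-for-byte copy of the named SVC artifact. A minimal sketch for loading the run artifact locally, assuming the LFS payloads have been fetched (e.g. with git lfs pull) and a scikit-learn version compatible with the training environment is installed:

import joblib

# Load the checked-in pipeline from the preserved run directory.
# The path comes from this commit; the sample text is illustrative.
clf = joblib.load("runs/20250928_060819/models/model.joblib")
print(clf.predict(["Tôi muốn kiểm tra số dư tài khoản"])[0])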
train.py CHANGED
@@ -234,6 +234,7 @@ def train_model(
     ngram_range=(1, 2),
     split_ratio=0.2,
     n_samples=None,
+    export_model=False,
 ):
     """Train a single model with specified parameters
 
@@ -360,16 +361,26 @@ def train_model(
     logging.info(f"Confusion Matrix shape: {cm.shape}")
 
     # Save the model
-    model_path = os.path.join(output_folder, "model.pkl")
+    model_path = os.path.join(output_folder, "model.joblib")
     joblib.dump(text_clf, model_path)
     logging.info(f"Model saved to {model_path}")
     print(f"Model saved to {model_path}")
 
     # Save model with config name
-    config_model_path = os.path.join(output_folder, f"{config_name}.pkl")
+    config_model_path = os.path.join(output_folder, f"{config_name}.joblib")
     joblib.dump(text_clf, config_model_path)
     logging.info(f"Model also saved as {config_model_path}")
 
+    # Export model if requested
+    if export_model:
+        # Use new format: <datasetname>_classifier_<run_id>.joblib
+        run_id = os.path.basename(run_dir)
+        export_filename = f"{dataset_name.lower()}_classifier_{run_id}.joblib"
+        export_path = os.path.join(".", export_filename)
+        joblib.dump(text_clf, export_path)
+        logging.info(f"Model exported as {export_path}")
+        print(f"Model exported for distribution: {export_filename}")
+
     # Save label mapping
     label_mapping_path = os.path.join(output_folder, "labels.txt")
     with open(label_mapping_path, "w", encoding="utf-8") as f:
@@ -506,13 +517,13 @@ def train_all_configurations(dataset="vntc", models=None, num_rows=None):
 
 
 def train_notebook(dataset="uts2017", model_name="logistic", max_features=20000, ngram_min=1, ngram_max=2,
-                   split_ratio=0.2, n_samples=None, compare=False):
+                   split_ratio=0.2, n_samples=None, compare=False, export_model=False):
     """
     Convenience function for training in Jupyter/Colab notebooks without argparse.
 
     Example usage:
         from train import train_notebook
-        train_notebook(dataset="vntc", model_name="logistic", max_features=20000)
+        train_notebook(dataset="vntc", model_name="logistic", max_features=20000, export_model=True)
     """
     if compare:
         print("Training and comparing multiple configurations...")
@@ -529,6 +540,7 @@ def train_notebook(dataset="uts2017", model_name="logistic", max_features=20000,
         ngram_range=(ngram_min, ngram_max),
         split_ratio=split_ratio,
         n_samples=n_samples,
+        export_model=export_model,
     )
 
 
@@ -594,6 +606,11 @@ def main():
         default="vntc",
         help="Dataset to use for model comparison (default: vntc)"
     )
+    parser.add_argument(
+        "--export-model",
+        action="store_true",
+        help="Export a copy of the trained model to project root for distribution/publishing"
+    )
 
     # Use parse_known_args to ignore Jupyter/Colab kernel arguments
     args, unknown = parser.parse_known_args()
@@ -629,6 +646,7 @@ def main():
         ngram_range=(args.ngram_min, args.ngram_max),
         split_ratio=args.split_ratio,
        n_samples=args.num_rows,
+        export_model=args.export_model,
     )
 
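With these changes the export path can be exercised from the CLI or from a notebook. A brief sketch; the --export-model flag and the train_notebook signature are taken from the diff above, while other CLI flags are deliberately omitted rather than guessed:

# CLI: train as usual, then drop a distributable copy in the project root
#   python train.py --export-model

# Notebook (signature from the updated train_notebook):
from train import train_notebook
train_notebook(dataset="vntc", model_name="logistic", max_features=20000, export_model=True)
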
use_this_model.py CHANGED
@@ -6,7 +6,29 @@ Shows how to download and use both VNTC and UTS2017_Bank pre-trained models.
 
 from huggingface_hub import hf_hub_download
 import joblib
-import numpy as np
+
+
+def predict_text(model, text):
+    """Make prediction on a single text (consistent with inference.py)"""
+    try:
+        probabilities = model.predict_proba([text])[0]
+
+        # Get top 3 predictions sorted by probability
+        top_indices = probabilities.argsort()[-3:][::-1]
+        top_predictions = []
+        for idx in top_indices:
+            category = model.classes_[idx]
+            prob = probabilities[idx]
+            top_predictions.append((category, prob))
+
+        # The prediction should be the top category
+        prediction = top_predictions[0][0]
+        confidence = top_predictions[0][1]
+
+        return prediction, confidence, top_predictions
+    except Exception as e:
+        print(f"Error making prediction: {e}")
+        return None, 0, []
 
 
 def load_model_from_hub(model_type="vntc"):
@@ -57,22 +79,17 @@ def predict_vntc_examples(model):
 
     for expected_category, text in examples:
         try:
-            prediction = model.predict([text])[0]
-            probabilities = model.predict_proba([text])[0]
-            confidence = np.max(probabilities)
-
-            print(f"Text: {text}")
-            print(f"Expected: {expected_category}")
-            print(f"Predicted: {prediction}")
-            print(f"Confidence: {confidence:.3f}")
-
-            # Show top 3 predictions
-            if hasattr(model, 'classes_'):
-                top_indices = np.argsort(probabilities)[-3:][::-1]
+            prediction, confidence, top_predictions = predict_text(model, text)
+
+            if prediction:
+                print(f"Text: {text}")
+                print(f"Expected: {expected_category}")
+                print(f"Predicted: {prediction}")
+                print(f"Confidence: {confidence:.3f}")
+
+                # Show top 3 predictions
                 print("Top 3 predictions:")
-                for i, idx in enumerate(top_indices, 1):
-                    category = model.classes_[idx]
-                    prob = probabilities[idx]
+                for i, (category, prob) in enumerate(top_predictions, 1):
                     print(f"  {i}. {category}: {prob:.3f}")
 
             print("-" * 60)
@@ -111,22 +128,17 @@ def predict_uts2017_examples(model):
 
     for expected_category, text in examples:
         try:
-            prediction = model.predict([text])[0]
-            probabilities = model.predict_proba([text])[0]
-            confidence = np.max(probabilities)
-
-            print(f"Text: {text}")
-            print(f"Expected: {expected_category}")
-            print(f"Predicted: {prediction}")
-            print(f"Confidence: {confidence:.3f}")
-
-            # Show top 3 predictions
-            if hasattr(model, 'classes_'):
-                top_indices = np.argsort(probabilities)[-3:][::-1]
+            prediction, confidence, top_predictions = predict_text(model, text)
+
+            if prediction:
+                print(f"Text: {text}")
+                print(f"Expected: {expected_category}")
+                print(f"Predicted: {prediction}")
+                print(f"Confidence: {confidence:.3f}")
+
+                # Show top 3 predictions
                 print("Top 3 predictions:")
-                for i, idx in enumerate(top_indices, 1):
-                    category = model.classes_[idx]
-                    prob = probabilities[idx]
+                for i, (category, prob) in enumerate(top_predictions, 1):
                     print(f"  {i}. {category}: {prob:.3f}")
 
             print("-" * 60)
@@ -154,20 +166,15 @@ def interactive_mode(model, model_type):
             if not user_input:
                 continue
 
-            prediction = model.predict([user_input])[0]
-            probabilities = model.predict_proba([user_input])[0]
-            confidence = np.max(probabilities)
+            prediction, confidence, top_predictions = predict_text(model, user_input)
 
-            print(f"Predicted category: {prediction}")
-            print(f"Confidence: {confidence:.3f}")
+            if prediction:
+                print(f"Predicted category: {prediction}")
+                print(f"Confidence: {confidence:.3f}")
 
-            # Show top 3 predictions
-            if hasattr(model, 'classes_'):
-                top_indices = np.argsort(probabilities)[-3:][::-1]
+                # Show top 3 predictions
                 print("Top 3 predictions:")
-                for i, idx in enumerate(top_indices, 1):
-                    category = model.classes_[idx]
-                    prob = probabilities[idx]
+                for i, (category, prob) in enumerate(top_predictions, 1):
                     print(f"  {i}. {category}: {prob:.3f}")
 
         except KeyboardInterrupt:
@@ -178,9 +185,9 @@ def interactive_mode(model, model_type):
 
 
 def simple_usage_examples():
-    """Show simple usage examples for both models"""
+    """Show simple usage examples for HuggingFace Hub models"""
     print("\n" + "="*60)
-    print("SIMPLE USAGE EXAMPLES")
+    print("HUGGINGFACE HUB USAGE EXAMPLES")
     print("="*60)
 
     print("Code examples:")
@@ -189,7 +196,7 @@ def simple_usage_examples():
 from huggingface_hub import hf_hub_download
 import joblib
 
-# Download and load VNTC model
+# Download and load VNTC model from HuggingFace Hub
 vntc_model = joblib.load(
     hf_hub_download("undertheseanlp/sonar_core_1", "vntc_classifier_20250927_161550.joblib")
 )
@@ -200,7 +207,7 @@ prediction = vntc_model.predict([news_text])[0]
 print(f"News category: {prediction}")
 
 # UTS2017_Bank Model (Vietnamese Banking Text Classification)
-# Download and load UTS2017_Bank model
+# Download and load UTS2017_Bank model from HuggingFace Hub
 bank_model = joblib.load(
     hf_hub_download("undertheseanlp/sonar_core_1", "uts2017_bank_classifier_20250927_161733.joblib")
 )
@@ -209,6 +216,8 @@ bank_model = joblib.load(
 bank_text = "Tôi muốn mở tài khoản tiết kiệm"
 prediction = bank_model.predict([bank_text])[0]
 print(f"Banking category: {prediction}")
+
+# For local file inference, use inference.py instead
 """)
 
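
The new predict_text helper gives every call site the same (prediction, confidence, top_predictions) contract. A short usage sketch against a Hub-loaded pipeline; the repo id and filename are the ones already used in this file, and the sample text is illustrative:

from huggingface_hub import hf_hub_download
import joblib
from use_this_model import predict_text

# Load the banking classifier and call the shared prediction interface.
bank_model = joblib.load(
    hf_hub_download("undertheseanlp/sonar_core_1", "uts2017_bank_classifier_20250927_161733.joblib")
)
prediction, confidence, top_predictions = predict_text(bank_model, "Tôi muốn mở tài khoản tiết kiệm")
print(f"Predicted: {prediction} (confidence {confidence:.3f})")
for i, (category, prob) in enumerate(top_predictions, 1):
    print(f"  {i}. {category}: {prob:.3f}")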