Clean up training runs and enhance model export functionality

- Remove 8 training runs that have no exported model, to reduce repository size
- Keep only runs/20250928_060819 (SVC model, 72.47% accuracy, which has an exported model)
- Update train.py with --export-model flag and joblib format support
- Enhance use_this_model.py with consistent prediction interface matching inference.py
- Add model export functionality for distribution and publishing
- Optimize repository structure while preserving all distributed models

Repository cleanup:
- Removed runs: 20250928_054424, 20250928_054605, 20250928_054642, 20250928_054802
- Removed runs: 20250928_054813, 20250928_054840, 20250928_055536, 20250928_060804
- Preserved: runs/20250928_060819 (corresponds to uts2017_bank_classifier_20250928_060819.joblib)
- All exported models are retained for inference and deployment

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>

Files changed (4)

runs/20250928_060819/models/UTS2017_Bank_SVC_feat20k_ngram1-2.joblib +3 -0
runs/20250928_060819/models/model.joblib +3 -0
train.py +22 -4
use_this_model.py +55 -46

runs/20250928_060819/models/UTS2017_Bank_SVC_feat20k_ngram1-2.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eaaffac2cb04faf77023502c3fc144ff2503a0ff9211c574c1b07424a0ad6e08
+size 1674180

runs/20250928_060819/models/model.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eaaffac2cb04faf77023502c3fc144ff2503a0ff9211c574c1b07424a0ad6e08
+size 1674180

train.py CHANGED Viewed

@@ -234,6 +234,7 @@ def train_model(
     ngram_range=(1, 2),
     split_ratio=0.2,
     n_samples=None,
 ):
     """Train a single model with specified parameters
@@ -360,16 +361,26 @@ def train_model(
     logging.info(f"Confusion Matrix shape: {cm.shape}")
     # Save the model
-    model_path = os.path.join(output_folder, "model.pkl")
     joblib.dump(text_clf, model_path)
     logging.info(f"Model saved to {model_path}")
     print(f"Model saved to {model_path}")
     # Save model with config name
-    config_model_path = os.path.join(output_folder, f"{config_name}.pkl")
     joblib.dump(text_clf, config_model_path)
     logging.info(f"Model also saved as {config_model_path}")
     # Save label mapping
     label_mapping_path = os.path.join(output_folder, "labels.txt")
     with open(label_mapping_path, "w", encoding="utf-8") as f:
@@ -506,13 +517,13 @@ def train_all_configurations(dataset="vntc", models=None, num_rows=None):
 def train_notebook(dataset="uts2017", model_name="logistic", max_features=20000, ngram_min=1, ngram_max=2,
-                   split_ratio=0.2, n_samples=None, compare=False):
     """
     Convenience function for training in Jupyter/Colab notebooks without argparse.
     Example usage:
         from train import train_notebook
-        train_notebook(dataset="vntc", model_name="logistic", max_features=20000)
     """
     if compare:
         print("Training and comparing multiple configurations...")
@@ -529,6 +540,7 @@ def train_notebook(dataset="uts2017", model_name="logistic", max_features=20000,
             ngram_range=(ngram_min, ngram_max),
             split_ratio=split_ratio,
             n_samples=n_samples,
         )
@@ -594,6 +606,11 @@ def main():
         default="vntc",
         help="Dataset to use for model comparison (default: vntc)"
     )
     # Use parse_known_args to ignore Jupyter/Colab kernel arguments
     args, unknown = parser.parse_known_args()
@@ -629,6 +646,7 @@ def main():
             ngram_range=(args.ngram_min, args.ngram_max),
             split_ratio=args.split_ratio,
             n_samples=args.num_rows,
         )

     ngram_range=(1, 2),
     split_ratio=0.2,
     n_samples=None,
+    export_model=False,
 ):
     """Train a single model with specified parameters
     logging.info(f"Confusion Matrix shape: {cm.shape}")
     # Save the model
+    model_path = os.path.join(output_folder, "model.joblib")
     joblib.dump(text_clf, model_path)
     logging.info(f"Model saved to {model_path}")
     print(f"Model saved to {model_path}")
     # Save model with config name
+    config_model_path = os.path.join(output_folder, f"{config_name}.joblib")
     joblib.dump(text_clf, config_model_path)
     logging.info(f"Model also saved as {config_model_path}")
+    # Export model if requested
+    if export_model:
+        # Use new format: <datasetname>_classifier_<run_id>.joblib
+        run_id = os.path.basename(run_dir)
+        export_filename = f"{dataset_name.lower()}_classifier_{run_id}.joblib"
+        export_path = os.path.join(".", export_filename)
+        joblib.dump(text_clf, export_path)
+        logging.info(f"Model exported as {export_path}")
+        print(f"Model exported for distribution: {export_filename}")
     # Save label mapping
     label_mapping_path = os.path.join(output_folder, "labels.txt")
     with open(label_mapping_path, "w", encoding="utf-8") as f:
 def train_notebook(dataset="uts2017", model_name="logistic", max_features=20000, ngram_min=1, ngram_max=2,
+                   split_ratio=0.2, n_samples=None, compare=False, export_model=False):
     """
     Convenience function for training in Jupyter/Colab notebooks without argparse.
     Example usage:
         from train import train_notebook
+        train_notebook(dataset="vntc", model_name="logistic", max_features=20000, export_model=True)
     """
     if compare:
         print("Training and comparing multiple configurations...")
             ngram_range=(ngram_min, ngram_max),
             split_ratio=split_ratio,
             n_samples=n_samples,
+            export_model=export_model,
         )
         default="vntc",
         help="Dataset to use for model comparison (default: vntc)"
     )
+    parser.add_argument(
+        "--export-model",
+        action="store_true",
+        help="Export a copy of the trained model to project root for distribution/publishing"
+    )
     # Use parse_known_args to ignore Jupyter/Colab kernel arguments
     args, unknown = parser.parse_known_args()
             ngram_range=(args.ngram_min, args.ngram_max),
             split_ratio=args.split_ratio,
             n_samples=args.num_rows,
+            export_model=args.export_model,
         )

use_this_model.py CHANGED Viewed

@@ -6,7 +6,29 @@ Shows how to download and use both VNTC and UTS2017_Bank pre-trained models.
 from huggingface_hub import hf_hub_download
 import joblib
-import numpy as np
 def load_model_from_hub(model_type="vntc"):
@@ -57,22 +79,17 @@ def predict_vntc_examples(model):
     for expected_category, text in examples:
         try:
-            prediction = model.predict([text])[0]
-            probabilities = model.predict_proba([text])[0]
-            confidence = np.max(probabilities)
-            print(f"Text: {text}")
-            print(f"Expected: {expected_category}")
-            print(f"Predicted: {prediction}")
-            print(f"Confidence: {confidence:.3f}")
-            # Show top 3 predictions
-            if hasattr(model, 'classes_'):
-                top_indices = np.argsort(probabilities)[-3:][::-1]
                 print("Top 3 predictions:")
-                for i, idx in enumerate(top_indices, 1):
-                    category = model.classes_[idx]
-                    prob = probabilities[idx]
                     print(f"  {i}. {category}: {prob:.3f}")
             print("-" * 60)
@@ -111,22 +128,17 @@ def predict_uts2017_examples(model):
     for expected_category, text in examples:
         try:
-            prediction = model.predict([text])[0]
-            probabilities = model.predict_proba([text])[0]
-            confidence = np.max(probabilities)
-            print(f"Text: {text}")
-            print(f"Expected: {expected_category}")
-            print(f"Predicted: {prediction}")
-            print(f"Confidence: {confidence:.3f}")
-            # Show top 3 predictions
-            if hasattr(model, 'classes_'):
-                top_indices = np.argsort(probabilities)[-3:][::-1]
                 print("Top 3 predictions:")
-                for i, idx in enumerate(top_indices, 1):
-                    category = model.classes_[idx]
-                    prob = probabilities[idx]
                     print(f"  {i}. {category}: {prob:.3f}")
             print("-" * 60)
@@ -154,20 +166,15 @@ def interactive_mode(model, model_type):
             if not user_input:
                 continue
-            prediction = model.predict([user_input])[0]
-            probabilities = model.predict_proba([user_input])[0]
-            confidence = np.max(probabilities)
-            print(f"Predicted category: {prediction}")
-            print(f"Confidence: {confidence:.3f}")
-            # Show top 3 predictions
-            if hasattr(model, 'classes_'):
-                top_indices = np.argsort(probabilities)[-3:][::-1]
                 print("Top 3 predictions:")
-                for i, idx in enumerate(top_indices, 1):
-                    category = model.classes_[idx]
-                    prob = probabilities[idx]
                     print(f"  {i}. {category}: {prob:.3f}")
         except KeyboardInterrupt:
@@ -178,9 +185,9 @@ def interactive_mode(model, model_type):
 def simple_usage_examples():
-    """Show simple usage examples for both models"""
     print("\n" + "="*60)
-    print("SIMPLE USAGE EXAMPLES")
     print("="*60)
     print("Code examples:")
@@ -189,7 +196,7 @@ def simple_usage_examples():
 from huggingface_hub import hf_hub_download
 import joblib
-# Download and load VNTC model
 vntc_model = joblib.load(
     hf_hub_download("undertheseanlp/sonar_core_1", "vntc_classifier_20250927_161550.joblib")
 )
@@ -200,7 +207,7 @@ prediction = vntc_model.predict([news_text])[0]
 print(f"News category: {prediction}")
 # UTS2017_Bank Model (Vietnamese Banking Text Classification)
-# Download and load UTS2017_Bank model
 bank_model = joblib.load(
     hf_hub_download("undertheseanlp/sonar_core_1", "uts2017_bank_classifier_20250927_161733.joblib")
 )
@@ -209,6 +216,8 @@ bank_model = joblib.load(
 bank_text = "Tôi muốn mở tài khoản tiết kiệm"
 prediction = bank_model.predict([bank_text])[0]
 print(f"Banking category: {prediction}")
 """)

 from huggingface_hub import hf_hub_download
 import joblib
+def predict_text(model, text):
+    """Make prediction on a single text (consistent with inference.py)"""
+    try:
+        probabilities = model.predict_proba([text])[0]
+        # Get top 3 predictions sorted by probability
+        top_indices = probabilities.argsort()[-3:][::-1]
+        top_predictions = []
+        for idx in top_indices:
+            category = model.classes_[idx]
+            prob = probabilities[idx]
+            top_predictions.append((category, prob))
+        # The prediction should be the top category
+        prediction = top_predictions[0][0]
+        confidence = top_predictions[0][1]
+        return prediction, confidence, top_predictions
+    except Exception as e:
+        print(f"Error making prediction: {e}")
+        return None, 0, []
 def load_model_from_hub(model_type="vntc"):
     for expected_category, text in examples:
         try:
+            prediction, confidence, top_predictions = predict_text(model, text)
+            if prediction:
+                print(f"Text: {text}")
+                print(f"Expected: {expected_category}")
+                print(f"Predicted: {prediction}")
+                print(f"Confidence: {confidence:.3f}")
+                # Show top 3 predictions
                 print("Top 3 predictions:")
+                for i, (category, prob) in enumerate(top_predictions, 1):
                     print(f"  {i}. {category}: {prob:.3f}")
             print("-" * 60)
     for expected_category, text in examples:
         try:
+            prediction, confidence, top_predictions = predict_text(model, text)
+            if prediction:
+                print(f"Text: {text}")
+                print(f"Expected: {expected_category}")
+                print(f"Predicted: {prediction}")
+                print(f"Confidence: {confidence:.3f}")
+                # Show top 3 predictions
                 print("Top 3 predictions:")
+                for i, (category, prob) in enumerate(top_predictions, 1):
                     print(f"  {i}. {category}: {prob:.3f}")
             print("-" * 60)
             if not user_input:
                 continue
+            prediction, confidence, top_predictions = predict_text(model, user_input)
+            if prediction:
+                print(f"Predicted category: {prediction}")
+                print(f"Confidence: {confidence:.3f}")
+                # Show top 3 predictions
                 print("Top 3 predictions:")
+                for i, (category, prob) in enumerate(top_predictions, 1):
                     print(f"  {i}. {category}: {prob:.3f}")
         except KeyboardInterrupt:
 def simple_usage_examples():
+    """Show simple usage examples for HuggingFace Hub models"""
     print("\n" + "="*60)
+    print("HUGGINGFACE HUB USAGE EXAMPLES")
     print("="*60)
     print("Code examples:")
 from huggingface_hub import hf_hub_download
 import joblib
+# Download and load VNTC model from HuggingFace Hub
 vntc_model = joblib.load(
     hf_hub_download("undertheseanlp/sonar_core_1", "vntc_classifier_20250927_161550.joblib")
 )
 print(f"News category: {prediction}")
 # UTS2017_Bank Model (Vietnamese Banking Text Classification)
+# Download and load UTS2017_Bank model from HuggingFace Hub
 bank_model = joblib.load(
     hf_hub_download("undertheseanlp/sonar_core_1", "uts2017_bank_classifier_20250927_161733.joblib")
 )
 bank_text = "Tôi muốn mở tài khoản tiết kiệm"
 prediction = bank_model.predict([bank_text])[0]
 print(f"Banking category: {prediction}")
+# For local file inference, use inference.py instead
 """)