Vu Anh Claude commited on
Commit
16e985c
·
1 Parent(s): 768e141

Transform system card to model card format and enhance documentation

Browse files

- Transform paper/Sonar Core 1 - System Card.md from technical report to Hugging Face model card format
- Update header structure to emphasize Vietnamese text classification model
- Simplify performance metrics with summary table instead of detailed per-class tables
- Streamline limitations and ethical considerations sections
- Update usage examples to be more practical and concise
- Add comprehensive Google Colab tutorial to DEVELOPERS.md
- Enhance train.py with 9 scikit-learn algorithms and comparison capabilities
- Add analyze_results.py script for training run comparison
- Update LaTeX technical report with formal academic structure
- Clean up gitignore and directory structure

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>

.gitignore CHANGED
@@ -76,4 +76,5 @@ docs/_build/
76
  site/
77
 
78
  # Claude
79
- .claude/
 
 
76
  site/
77
 
78
  # Claude
79
+ .claude/
80
+ sample_papers
DEVELOPERS.md CHANGED
Binary files a/DEVELOPERS.md and b/DEVELOPERS.md differ
 
analyze_results.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to analyze and compare training results from multiple model runs.
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import glob
9
+ from pathlib import Path
10
+
11
+ def load_metadata(run_dir):
12
+ """Load metadata from a training run directory"""
13
+ metadata_path = os.path.join(run_dir, "metadata.json")
14
+ if os.path.exists(metadata_path):
15
+ with open(metadata_path, 'r', encoding='utf-8') as f:
16
+ return json.load(f)
17
+ return None
18
+
19
+ def analyze_all_runs():
20
+ """Analyze all training runs and create comparison"""
21
+ runs_dir = Path("runs")
22
+ results = []
23
+
24
+ # Find all metadata files
25
+ for run_dir in runs_dir.glob("*/"):
26
+ if run_dir.is_dir():
27
+ metadata = load_metadata(run_dir)
28
+ if metadata:
29
+ results.append({
30
+ 'run_id': run_dir.name,
31
+ 'model': metadata.get('classifier', 'Unknown'),
32
+ 'dataset': 'VNTC' if 'VNTC' in metadata.get('config_name', '') else 'UTS2017_Bank',
33
+ 'max_features': metadata.get('max_features', 0),
34
+ 'ngram_range': metadata.get('ngram_range', [1,1]),
35
+ 'train_accuracy': metadata.get('train_accuracy', 0),
36
+ 'test_accuracy': metadata.get('test_accuracy', 0),
37
+ 'train_time': metadata.get('train_time', 0),
38
+ 'prediction_time': metadata.get('prediction_time', 0),
39
+ 'train_samples': metadata.get('train_samples', 0),
40
+ 'test_samples': metadata.get('test_samples', 0)
41
+ })
42
+
43
+ return results
44
+
45
+ def print_comparison_table(results):
46
+ """Print formatted comparison table"""
47
+ print("\n" + "="*120)
48
+ print("VIETNAMESE TEXT CLASSIFICATION - MODEL COMPARISON RESULTS")
49
+ print("="*120)
50
+
51
+ # Filter for VNTC results (news classification)
52
+ vntc_results = [r for r in results if r['dataset'] == 'VNTC']
53
+
54
+ if vntc_results:
55
+ print("\nVNTC Dataset (Vietnamese News Classification):")
56
+ print("-"*120)
57
+ print(f"{'Model':<20} {'Features':<10} {'N-gram':<10} {'Train Acc':<12} {'Test Acc':<12} {'Train Time':<12} {'Pred Time':<12}")
58
+ print("-"*120)
59
+
60
+ # Sort by test accuracy
61
+ vntc_results.sort(key=lambda x: x['test_accuracy'], reverse=True)
62
+
63
+ for result in vntc_results:
64
+ model = result['model'][:18]
65
+ features = f"{result['max_features']//1000}k" if result['max_features'] > 0 else "N/A"
66
+ ngram = f"{result['ngram_range'][0]}-{result['ngram_range'][1]}"
67
+ train_acc = f"{result['train_accuracy']:.4f}"
68
+ test_acc = f"{result['test_accuracy']:.4f}"
69
+ train_time = f"{result['train_time']:.1f}s"
70
+ pred_time = f"{result['prediction_time']:.1f}s"
71
+
72
+ print(f"{model:<20} {features:<10} {ngram:<10} {train_acc:<12} {test_acc:<12} {train_time:<12} {pred_time:<12}")
73
+
74
+ # Filter for UTS2017_Bank results
75
+ bank_results = [r for r in results if r['dataset'] == 'UTS2017_Bank']
76
+
77
+ if bank_results:
78
+ print(f"\nUTS2017_Bank Dataset (Vietnamese Banking Text Classification):")
79
+ print("-"*120)
80
+ print(f"{'Model':<20} {'Features':<10} {'N-gram':<10} {'Train Acc':<12} {'Test Acc':<12} {'Train Time':<12} {'Pred Time':<12}")
81
+ print("-"*120)
82
+
83
+ # Sort by test accuracy
84
+ bank_results.sort(key=lambda x: x['test_accuracy'], reverse=True)
85
+
86
+ for result in bank_results:
87
+ model = result['model'][:18]
88
+ features = f"{result['max_features']//1000}k" if result['max_features'] > 0 else "N/A"
89
+ ngram = f"{result['ngram_range'][0]}-{result['ngram_range'][1]}"
90
+ train_acc = f"{result['train_accuracy']:.4f}"
91
+ test_acc = f"{result['test_accuracy']:.4f}"
92
+ train_time = f"{result['train_time']:.1f}s"
93
+ pred_time = f"{result['prediction_time']:.1f}s"
94
+
95
+ print(f"{model:<20} {features:<10} {ngram:<10} {train_acc:<12} {test_acc:<12} {train_time:<12} {pred_time:<12}")
96
+
97
+ print("="*120)
98
+
99
+ if vntc_results:
100
+ best_vntc = max(vntc_results, key=lambda x: x['test_accuracy'])
101
+ print(f"\nBest VNTC model: {best_vntc['model']} with {best_vntc['test_accuracy']:.4f} test accuracy")
102
+
103
+ if bank_results:
104
+ best_bank = max(bank_results, key=lambda x: x['test_accuracy'])
105
+ print(f"Best UTS2017_Bank model: {best_bank['model']} with {best_bank['test_accuracy']:.4f} test accuracy")
106
+
107
+ def main():
108
+ """Main analysis function"""
109
+ print("Analyzing Vietnamese Text Classification Training Results...")
110
+
111
+ results = analyze_all_runs()
112
+
113
+ if not results:
114
+ print("No training results found in runs/ directory.")
115
+ return
116
+
117
+ print(f"Found {len(results)} training runs.")
118
+ print_comparison_table(results)
119
+
120
+ # Create summary statistics
121
+ vntc_results = [r for r in results if r['dataset'] == 'VNTC']
122
+ bank_results = [r for r in results if r['dataset'] == 'UTS2017_Bank']
123
+
124
+ print(f"\nSummary:")
125
+ print(f"- VNTC runs: {len(vntc_results)}")
126
+ print(f"- UTS2017_Bank runs: {len(bank_results)}")
127
+
128
+ if vntc_results:
129
+ avg_vntc_acc = sum(r['test_accuracy'] for r in vntc_results) / len(vntc_results)
130
+ print(f"- Average VNTC test accuracy: {avg_vntc_acc:.4f}")
131
+
132
+ if bank_results:
133
+ avg_bank_acc = sum(r['test_accuracy'] for r in bank_results) / len(bank_results)
134
+ print(f"- Average UTS2017_Bank test accuracy: {avg_bank_acc:.4f}")
135
+
136
+ if __name__ == "__main__":
137
+ main()
paper/.gitignore CHANGED
@@ -1,6 +1,5 @@
1
  # Ignore all files except .pdf, .tex, and .md
2
  *
3
- !*.pdf
4
  !*.tex
5
  !*.md
6
  !.gitignore
 
1
  # Ignore all files except .pdf, .tex, and .md
2
  *
 
3
  !*.tex
4
  !*.md
5
  !.gitignore
paper/Sonar Core 1 - System Card.md CHANGED
@@ -1,25 +1,31 @@
1
- <h1 align="center">Sonar Core 1 - System Card</h1>
2
 
3
- <p align="center"><b>Underthesea Team</b></p>
 
 
4
 
5
- <p align="center"><b>September 2025</b></p>
6
 
7
- # Changelog
8
 
9
- **2025-09-27**
10
 
11
- - Added support for UTS2017_Bank Vietnamese banking text classification dataset
12
- - Achieved 70.96% accuracy on 14 banking service categories
13
-
14
- **2025-09-21**
15
-
16
- - Initial release of Sonar Core 1
17
-
18
- # Abstract
19
 
20
- **Sonar Core 1** is a machine learning-based text classification model designed for Vietnamese language processing. Built on a **TF-IDF** (Term Frequency-Inverse Document Frequency) feature extraction pipeline combined with **Logistic Regression**, this model achieves **92.33% accuracy** on the VNTC (Vietnamese Text Classification) dataset across **10 news categories** and **70.96% accuracy** on the UTS2017_Bank dataset across **14 banking service categories**. The model is specifically designed for Vietnamese news article classification, banking text categorization, content categorization for Vietnamese text, and document organization and tagging. Developed as a base model to provide quick and reliable text classification support for **scikit-learn >=1.6** integration since **underthesea 8.1.0**, it employs optimized feature engineering with **20,000 max features** and bigram support, along with a hash-based caching system for efficient processing. This system card provides comprehensive documentation of the model's architecture, performance metrics, intended uses, and limitations.
 
 
 
 
 
21
 
22
- # 1. Model Details
23
 
24
  **Sonar Core 1** is a Vietnamese text classification model built on **scikit-learn >=1.6**, utilizing a TF-IDF pipeline with Logistic Regression to classify text across multiple domains including news categories and banking services. The architecture employs:
25
  - CountVectorizer with **20,000 max features** (optimized from the initial 10,000)
@@ -30,211 +36,100 @@
30
 
31
  Released on **2025-09-21**, the model achieves **92.33% test accuracy** and **95.39% training accuracy** with optimized training time of approximately **28 seconds** using the hash-based caching system. The model features a dedicated VNTCDataset class for efficient data handling and improved modular architecture.
32
 
33
- # 2. Training Data
34
-
35
- ## 2.1 VNTC Dataset - News Categories (10 classes)
36
- 1. **chinh_tri_xa_hoi** - Politics and Society
37
- 2. **doi_song** - Lifestyle
38
- 3. **khoa_hoc** - Science
39
- 4. **kinh_doanh** - Business
40
- 5. **phap_luat** - Law
41
- 6. **suc_khoe** - Health
42
- 7. **the_gioi** - World News
43
- 8. **the_thao** - Sports
44
- 9. **van_hoa** - Culture
45
- 10. **vi_tinh** - Information Technology
46
-
47
- ## 2.2 UTS2017_Bank Dataset - Banking Categories (14 classes)
48
- 1. **ACCOUNT** - Account services
49
- 2. **CARD** - Card services
50
- 3. **CUSTOMER_SUPPORT** - Customer support
51
- 4. **DISCOUNT** - Discount offers
52
- 5. **INTEREST_RATE** - Interest rate information
53
- 6. **INTERNET_BANKING** - Internet banking services
54
- 7. **LOAN** - Loan services
55
- 8. **MONEY_TRANSFER** - Money transfer services
56
- 9. **OTHER** - Other services
57
- 10. **PAYMENT** - Payment services
58
- 11. **PROMOTION** - Promotional offers
59
- 12. **SAVING** - Savings accounts
60
- 13. **SECURITY** - Security features
61
- 14. **TRADEMARK** - Trademark/branding
62
-
63
- ## 2.3 Dataset Details
64
-
65
- ### VNTC Dataset
66
- - **Name**: VNTC (Vietnamese Text Classification) Dataset
67
- - **Training Samples**: 33,759 documents
68
- - **Test Samples**: 50,373 documents
69
- - **Language**: Vietnamese
70
- - **Format**: FastText format (__label__category followed by text)
71
- - **Distribution**: Balanced across 10 news categories
72
- - **Average document length**: ~200-500 words
73
-
74
- ### UTS2017_Bank Dataset
75
- - **Name**: UTS2017_Bank Classification Dataset
76
- - **Training Samples**: 1,581 documents
77
- - **Test Samples**: 396 documents
78
- - **Language**: Vietnamese
79
- - **Format**: Text with categorical labels
80
- - **Distribution**: Imbalanced (CUSTOMER_SUPPORT: 39%, TRADEMARK: 35%, others: 26%)
81
- - **Text preprocessing**: None (raw Vietnamese text)
82
-
83
- # 3. Performance Metrics
84
-
85
- ## 3.1 VNTC Dataset Performance (2025-09-21)
86
- - **Training Accuracy**: 95.39%
87
- - **Test Accuracy**: 92.33%
88
- - **Training Time**: ~27.18 seconds (with caching system)
89
- - **Inference Time**: ~19.34 seconds for 50,373 samples
90
-
91
- ## 3.2 Per-Class Performance - VNTC Dataset
92
- | Category | Precision | Recall | F1-Score | Support |
93
- |----------|-----------|---------|-----------|---------|
94
- | chinh_tri_xa_hoi | 0.86 | 0.93 | 0.89 | 7,567 |
95
- | doi_song | 0.81 | 0.71 | 0.76 | 2,036 |
96
- | khoa_hoc | 0.88 | 0.79 | 0.83 | 2,096 |
97
- | kinh_doanh | 0.94 | 0.88 | 0.91 | 5,276 |
98
- | phap_luat | 0.92 | 0.92 | 0.92 | 3,788 |
99
- | suc_khoe | 0.93 | 0.95 | 0.94 | 5,417 |
100
- | the_gioi | 0.95 | 0.93 | 0.94 | 6,716 |
101
- | the_thao | 0.98 | 0.98 | 0.98 | 6,667 |
102
- | van_hoa | 0.93 | 0.95 | 0.94 | 6,250 |
103
- | vi_tinh | 0.94 | 0.95 | 0.94 | 4,560 |
104
-
105
- ## 3.3 UTS2017_Bank Dataset Performance (2025-09-27)
106
- - **Training Accuracy**: 76.22%
107
- - **Test Accuracy**: 70.96%
108
- - **Training Time**: ~0.78 seconds
109
- - **Inference Time**: ~0.01 seconds for 396 samples
110
-
111
- ## 3.4 Per-Class Performance - UTS2017_Bank Dataset
112
- | Category | Precision | Recall | F1-Score | Support |
113
- |----------|-----------|---------|-----------|---------|
114
- | ACCOUNT | 0.00 | 0.00 | 0.00 | 1 |
115
- | CARD | 0.00 | 0.00 | 0.00 | 13 |
116
- | CUSTOMER_SUPPORT | 0.62 | 0.97 | 0.76 | 155 |
117
- | DISCOUNT | 0.00 | 0.00 | 0.00 | 8 |
118
- | INTEREST_RATE | 0.50 | 0.08 | 0.14 | 12 |
119
- | INTERNET_BANKING | 0.00 | 0.00 | 0.00 | 14 |
120
- | LOAN | 0.67 | 0.13 | 0.22 | 15 |
121
- | MONEY_TRANSFER | 0.00 | 0.00 | 0.00 | 7 |
122
- | OTHER | 0.50 | 0.07 | 0.12 | 14 |
123
- | PAYMENT | 0.00 | 0.00 | 0.00 | 3 |
124
- | PROMOTION | 1.00 | 0.18 | 0.31 | 11 |
125
- | SAVING | 0.00 | 0.00 | 0.00 | 2 |
126
- | SECURITY | 0.00 | 0.00 | 0.00 | 1 |
127
- | TRADEMARK | 0.87 | 0.89 | 0.88 | 140 |
128
-
129
- ## 3.5 Aggregate Metrics
130
-
131
- ### VNTC Dataset
132
- - **Overall Accuracy**: 92%
133
- - **Macro Average**: Precision: 0.91, Recall: 0.90, F1: 0.91
134
- - **Weighted Average**: Precision: 0.92, Recall: 0.92, F1: 0.92
135
-
136
- ### UTS2017_Bank Dataset
137
- - **Overall Accuracy**: 71%
138
- - **Macro Average**: Precision: 0.30, Recall: 0.17, F1: 0.17
139
- - **Weighted Average**: Precision: 0.64, Recall: 0.71, F1: 0.63
140
-
141
- ## 3.6 Performance Analysis
142
-
143
- ### VNTC Dataset
144
- - **Best Performing Categories**: Sports (the_thao) achieves 98% F1-score, followed by Health, World, Culture, and IT (all 94% F1-score)
145
- - **Lowest Performing Category**: Lifestyle (doi_song) with 76% F1-score due to lower recall (71%)
146
-
147
- ### UTS2017_Bank Dataset
148
- - **Best Performing Categories**: TRADEMARK (88% F1-score) and CUSTOMER_SUPPORT (76% F1-score)
149
- - **Challenges**: Many minority classes with insufficient training data result in zero predictions
150
- - **Data Imbalance**: Significant class imbalance with CUSTOMER_SUPPORT and TRADEMARK dominating (74% of data)
151
-
152
- ### General Observations
153
- - **Feature Count**: Uses 20,000 max features with bigram support
154
- - **Caching System**: Hash-based caching for efficient vectorizer and TF-IDF processing
155
- - **Model performs better on balanced datasets** (VNTC) compared to imbalanced ones (UTS2017_Bank)
156
-
157
- # 4. Limitations
158
-
159
- ## 4.1 Known Limitations
160
- 1. **Language Specificity**: Only works with Vietnamese text
161
- 2. **Domain Specificity**: Optimized for specific domains, may not generalize well to:
162
- - Social media posts (unless trained on specific datasets)
163
- - Technical documentation outside IT/banking domains
164
- - Conversational text
165
- 3. **Feature Limitations**:
166
- - Limited to 20,000 most frequent features
167
- - May miss rare but important terms
168
- 4. **Class Imbalance Sensitivity**:
169
- - Performance degrades significantly with imbalanced datasets
170
- - Minority classes may receive zero predictions (as seen in UTS2017_Bank)
171
- 5. **Specific Category Weaknesses**:
172
- - VNTC: Lower performance on lifestyle (doi_song) category (71% recall)
173
- - UTS2017_Bank: Poor performance on minority classes (ACCOUNT, CARD, PAYMENT, etc.)
174
-
175
- ## 4.2 Biases
176
- - Trained on specific domains (news and banking) which may have formal writing style bias
177
- - May reflect biases present in the original datasets
178
- - Performance varies significantly across categories:
179
- - VNTC: Best on sports at 98% F1-score, weakest on lifestyle at 76% F1-score
180
- - UTS2017_Bank: Best on TRADEMARK at 88% F1-score, many categories at 0% F1-score
181
-
182
- # 5. Future Improvements
183
-
184
- 1. Experiment with more advanced models (XGBoost, Neural Networks)
185
- 2. Further increase vocabulary size for better coverage
186
- 3. Add support for longer documents
187
- 4. Implement confidence thresholds for uncertain predictions
188
- 5. Fine-tune on domain-specific data if needed
189
- 6. Address class imbalance issues through:
190
- - Oversampling minority classes
191
- - Class weight adjustments
192
- - Synthetic data generation (SMOTE)
193
- 7. Expand to more Vietnamese text domains
194
-
195
- # 6. Usage
196
-
197
- ## 6.1 Installation
198
  ```bash
199
  pip install scikit-learn>=1.6 joblib
200
  ```
201
 
202
- ## 6.2 Training
203
 
204
- ### VNTC Dataset (News Classification)
205
  ```bash
206
- # Default training with VNTC dataset
207
- uv run --no-project --with 'scikit-learn>=1.6' python train.py
208
 
209
- # With specific parameters
210
- uv run --no-project --with 'scikit-learn>=1.6' python train.py --model logistic --max-features 20000
211
- ```
212
 
213
- ### UTS2017_Bank Dataset (Banking Text Classification)
214
- ```bash
215
- # Train with UTS2017_Bank dataset (assuming train.py is modified for UTS2017_Bank)
216
- python train.py --model logistic
217
-
218
- # With specific parameters
219
- python train.py --model logistic --max-features 20000 --ngram-min 1 --ngram-max 2
220
 
221
- # Compare multiple configurations
222
- python train.py --compare
223
  ```
224
 
225
- ## 6.3 Inference
 
226
  ```bash
227
  # Single prediction
228
- uv run --no-project --with 'scikit-learn>=1.6' python predict.py --text "Your Vietnamese text here"
229
 
230
  # Interactive mode
231
- uv run --no-project --with 'scikit-learn>=1.6' python predict.py --interactive
232
 
233
  # Show examples
234
- uv run --no-project --with 'scikit-learn>=1.6' python predict.py --examples
235
  ```
236
 
237
- ## 6.4 Python API
238
  ```python
239
  import joblib
240
 
@@ -247,7 +142,7 @@ prediction = model.predict([text])[0]
247
  probabilities = model.predict_proba([text])[0]
248
  ```
249
 
250
- # References
251
 
252
  1. VNTC Dataset: Hoang, Cong Duy Vu, Dien Dinh, Le Nguyen Nguyen, and Quoc Hung Ngo. (2007). A Comparative Study on Vietnamese Text Classification Methods. In Proceedings of IEEE International Conference on Research, Innovation and Vision for the Future (RIVF 2007), pp. 267-273. IEEE. DOI: 10.1109/RIVF.2007.369167
253
 
@@ -261,10 +156,10 @@ probabilities = model.predict_proba([text])[0]
261
 
262
  6. N-gram Language Models: Brown, Peter F., Vincent J. Della Pietra, Peter V. deSouza, Jenifer C. Lai, and Robert L. Mercer. (1992). Class-Based n-gram Models of Natural Language. Computational Linguistics, 18(4), 467-480. Retrieved from https://aclanthology.org/J92-4003/
263
 
264
- # License
265
  Model trained on publicly available VNTC and UTS2017_Bank datasets. Please refer to original dataset licenses for usage terms.
266
 
267
- # Citation
268
 
269
  If you use this model, please cite:
270
 
 
1
+ <h1 align="center">Sonar Core 1 - Model Card</h1>
2
 
3
+ <p align="center"><b>Vietnamese Text Classification Model</b></p>
4
+ <p align="center"><b>Underthesea NLP Team</b></p>
5
+ <p align="center"><i>September 2025</i></p>
6
 
7
+ ---
8
 
9
+ ## Model Overview
10
 
11
+ **Sonar Core 1** is a Vietnamese text classification model built on traditional machine learning techniques (TF-IDF + Logistic Regression) optimized for production deployment. The model achieves **92.33% accuracy** on Vietnamese news classification and **70.96% accuracy** on banking text classification, offering a computationally efficient alternative to deep learning approaches.
12
 
13
+ ### Quick Facts
14
+ - **Model Type**: Text Classification (Multi-class)
15
+ - **Language**: Vietnamese
16
+ - **Architecture**: TF-IDF + Logistic Regression
17
+ - **Framework**: scikit-learn
18
+ - **Model Size**: ~2.4MB (VNTC), ~3MB (UTS2017_Bank)
19
+ - **Inference Speed**: 0.38ms per sample (VNTC), 0.025ms per sample (banking)
 
20
 
21
+ ### Intended Use
22
+ - Vietnamese news article categorization
23
+ - Banking/financial text classification
24
+ - Content moderation and organization
25
+ - Document routing and tagging
26
+ - Educational and research purposes
27
 
28
+ ## Model Details
29
 
30
  **Sonar Core 1** is a Vietnamese text classification model built on **scikit-learn >=1.6**, utilizing a TF-IDF pipeline with Logistic Regression to classify text across multiple domains including news categories and banking services. The architecture employs:
31
  - CountVectorizer with **20,000 max features** (optimized from the initial 10,000)
 
36
 
37
  Released on **2025-09-21**, the model achieves **92.33% test accuracy** and **95.39% training accuracy** with optimized training time of approximately **28 seconds** using the hash-based caching system. The model features a dedicated VNTCDataset class for efficient data handling and improved modular architecture.
38
 
39
+ ## Training Data
40
+
41
+ The model supports two Vietnamese text classification tasks:
42
+
43
+ **VNTC Dataset (News Classification)** - 10 categories:
44
+ Politics, Lifestyle, Science, Business, Law, Health, World News, Sports, Culture, Information Technology
45
+
46
+ **UTS2017_Bank Dataset (Banking Services)** - 14 categories:
47
+ Account, Card, Customer Support, Discount, Interest Rate, Internet Banking, Loan, Money Transfer, Payment, Promotion, Saving, Security, Trademark, and Other services
48
+
49
+ ### Dataset Statistics
50
+
51
+ | Dataset | Categories | Training Samples | Test Samples | Best Accuracy |
52
+ |---------|------------|------------------|--------------|---------------|
53
+ | VNTC (News) | 10 | 33,759 | 50,373 | 92.33% |
54
+ | UTS2017_Bank | 14 | 1,581 | 396 | 70.96% |
55
+
56
+ ## Performance Metrics
57
+
58
+ ### Model Performance
59
+
60
+ | Dataset | Test Accuracy | Training Time | Best Categories (F1-Score) |
61
+ |---------|---------------|---------------|------------------------------|
62
+ | **VNTC (News)** | **92.33%** | ~28 seconds | Sports (98%), Health (94%) |
63
+ | **UTS2017_Bank** | **70.96%** | ~0.78 seconds | Trademark (88%), Customer Support (76%) |
64
+
65
+ ### Key Performance Highlights
66
+
67
+ - **VNTC Dataset**: Excellent performance across all 10 news categories with macro F1-score of 0.91
68
+ - **UTS2017_Bank Dataset**: Good performance on dominant categories but struggles with minority classes due to data imbalance
69
+ - **Inference Speed**: Very fast predictions - 0.38ms per sample (news) and 0.025ms per sample (banking)
70
+ - **Training Efficiency**: Quick training times with hash-based caching system
71
+
72
+ ## Limitations
73
+
74
+ ### Known Limitations
75
+
76
+ - **Language**: Only supports Vietnamese text
77
+ - **Domain Scope**: Optimized for news articles and banking text; may not perform well on social media, conversational text, or other domains
78
+ - **Class Imbalance**: Performance degrades on datasets with severely imbalanced classes
79
+ - **Vocabulary**: Limited to 20,000 most frequent features, may miss rare but important terms
80
+ - **Formal Text Bias**: Trained on formal writing styles (news and banking), may not handle informal text well
81
+
82
+ ### Ethical Considerations
83
+
84
+ - Model reflects biases present in training datasets
85
+ - Performance varies significantly across categories
86
+ - Users should validate performance on their specific use case before deployment
87
+
88
+ ## Future Improvements
89
+
90
+ - Experiment with advanced models (XGBoost, Neural Networks)
91
+ - Increase vocabulary size for better coverage
92
+ - Add support for longer documents and confidence thresholds
93
+ - Address class imbalance through oversampling and class weighting
94
+ - Expand to additional Vietnamese text domains
95
+
96
+ ## Usage
97
+
98
+ ### Installation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  ```bash
100
  pip install scikit-learn>=1.6 joblib
101
  ```
102
 
103
+ ### Training
104
 
 
105
  ```bash
106
+ # Train on VNTC dataset (default)
107
+ uv run python train.py
108
 
109
+ # Train on banking dataset
110
+ uv run python train.py --dataset uts2017
 
111
 
112
+ # Compare multiple models
113
+ uv run python train.py --compare
 
 
 
 
 
114
 
115
+ # Train with specific parameters
116
+ uv run python train.py --model logistic --max-features 20000
117
  ```
118
 
119
+ ### Inference
120
+
121
  ```bash
122
  # Single prediction
123
+ uv run python predict.py --text "Your Vietnamese text here"
124
 
125
  # Interactive mode
126
+ uv run python predict.py --interactive
127
 
128
  # Show examples
129
+ uv run python predict.py --examples
130
  ```
131
 
132
+ ### Python API
133
  ```python
134
  import joblib
135
 
 
142
  probabilities = model.predict_proba([text])[0]
143
  ```
144
 
145
+ ## References
146
 
147
  1. VNTC Dataset: Hoang, Cong Duy Vu, Dien Dinh, Le Nguyen Nguyen, and Quoc Hung Ngo. (2007). A Comparative Study on Vietnamese Text Classification Methods. In Proceedings of IEEE International Conference on Research, Innovation and Vision for the Future (RIVF 2007), pp. 267-273. IEEE. DOI: 10.1109/RIVF.2007.369167
148
 
 
156
 
157
  6. N-gram Language Models: Brown, Peter F., Vincent J. Della Pietra, Peter V. deSouza, Jenifer C. Lai, and Robert L. Mercer. (1992). Class-Based n-gram Models of Natural Language. Computational Linguistics, 18(4), 467-480. Retrieved from https://aclanthology.org/J92-4003/
158
 
159
+ ## License
160
  Model trained on publicly available VNTC and UTS2017_Bank datasets. Please refer to original dataset licenses for usage terms.
161
 
162
+ ## Citation
163
 
164
  If you use this model, please cite:
165
 
paper/macros.tex ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ \usepackage{fullpage} % small margins
2
+ \usepackage{microtype}
3
+ \usepackage{graphicx}
4
+ \usepackage{subfigure}
5
+ \usepackage{booktabs}
6
+ \usepackage{multirow}
7
+ \usepackage{color}
8
+ \usepackage{lmodern}
9
+
10
+ \usepackage{natbib}
11
+ \usepackage{hyperref}
12
+
13
+ \usepackage{amsmath}
14
+ \usepackage{amssymb}
15
+ \usepackage{mathtools}
16
+ \usepackage{amsthm}
17
+ \usepackage[capitalize]{cleveref}
18
+ \usepackage{bm}
19
+ \usepackage{listings}
20
+
21
+ \newtheorem{assumption}{Assumption}
22
+ \newtheorem{definition}{Definition}
23
+ \newtheorem{theorem}{Theorem}
24
+ \newtheorem{corollary}{Corollary}
25
+ \newtheorem{lemma}{Lemma}
26
+ \newtheorem{observation}{Observation}
27
+
28
+ \crefname{observation}{Observation}{Observations}
29
+ \Crefname{equation}{Eq.}{Eqs.}
30
+ \Crefname{figure}{Fig.}{Figs.}
31
+ \Crefname{table}{Table}{Tables}
32
+
33
+ \DeclareMathOperator*{\argmax}{arg\,max}
34
+ \DeclareMathOperator*{\argmin}{arg\,min}
35
+ \DeclareMathOperator*{\E}{\mathbb{E}}
36
+
37
+ \def\reals{{\mathbb{R}}}
38
+ \def\nats{{\mathbb{N}}}
39
+ \def\ints{{\mathbb{Z}}}
paper/sonar_core_1_system_card.tex CHANGED
@@ -1,26 +1,12 @@
1
- \documentclass[11pt,a4paper]{article}
2
- \usepackage[utf8]{inputenc}
3
- \usepackage[T1]{fontenc}
4
- \usepackage{amsmath,amsfonts,amssymb}
5
- \usepackage{graphicx}
6
- \usepackage{booktabs}
7
- \usepackage{array}
8
  \usepackage{longtable}
9
- \usepackage{url}
10
- \usepackage{hyperref}
 
11
  \usepackage{xcolor}
12
- \usepackage{listings}
13
- \usepackage{geometry}
14
- \usepackage{titlesec}
15
  \usepackage{enumitem}
16
-
17
- \geometry{margin=1in}
18
- \hypersetup{
19
- colorlinks=true,
20
- linkcolor=blue,
21
- urlcolor=blue,
22
- citecolor=blue
23
- }
24
 
25
  \lstset{
26
  basicstyle=\ttfamily\small,
@@ -29,116 +15,213 @@
29
  backgroundcolor=\color{gray!10}
30
  }
31
 
32
- \title{\textbf{Sonar Core 1 - System Card}}
33
- \author{\textbf{Underthesea Team}}
34
- \date{\textbf{September 2025}}
35
 
36
  \begin{document}
37
-
38
  \maketitle
39
 
40
- \section{Changelog}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- \textbf{2025-09-27}
43
  \begin{itemize}
44
- \item Added support for UTS2017\_Bank Vietnamese banking text classification dataset
45
- \item Achieved 70.96\% accuracy on 14 banking service categories
 
 
46
  \end{itemize}
47
 
48
- \textbf{2025-09-21}
 
 
 
 
 
 
 
 
 
 
49
  \begin{itemize}
50
- \item Initial release of Sonar Core 1
 
 
 
51
  \end{itemize}
52
 
53
- \section{Abstract}
54
 
55
- \textbf{Sonar Core 1} is a machine learning-based text classification model designed for Vietnamese language processing. Built on a \textbf{TF-IDF} (Term Frequency-Inverse Document Frequency) feature extraction pipeline combined with \textbf{Logistic Regression}, this model achieves \textbf{92.33\% accuracy} on the VNTC (Vietnamese Text Classification) dataset across \textbf{10 news categories} and \textbf{70.96\% accuracy} on the UTS2017\_Bank dataset across \textbf{14 banking service categories}. The model is specifically designed for Vietnamese news article classification, banking text categorization, content categorization for Vietnamese text, and document organization and tagging. Developed as a base model to provide quick and reliable text classification support for \textbf{scikit-learn $\geq$1.6} integration since \textbf{underthesea 8.1.0}, it employs optimized feature engineering with \textbf{20,000 max features} and bigram support, along with a hash-based caching system for efficient processing. This system card provides comprehensive documentation of the model's architecture, performance metrics, intended uses, and limitations.
56
 
57
- \section{Model Details}
 
 
 
 
 
 
 
58
 
59
- \textbf{Sonar Core 1} is a Vietnamese text classification model built on \textbf{scikit-learn $\geq$1.6}, utilizing a TF-IDF pipeline with Logistic Regression to classify text across multiple domains including news categories and banking services. The architecture employs:
60
 
61
  \begin{itemize}
62
- \item CountVectorizer with \textbf{20,000 max features} (optimized from the initial 10,000)
63
- \item N-gram extraction: unigram and bigram support
64
- \item TF-IDF transformation with IDF weighting
65
- \item Logistic Regression classifier with 1,000 max iterations
66
- \item \textbf{Hash-based caching system} for efficient processing
67
  \end{itemize}
68
 
69
- Released on \textbf{2025-09-21}, the model achieves \textbf{92.33\% test accuracy} and \textbf{95.39\% training accuracy} with optimized training time of approximately \textbf{28 seconds} using the hash-based caching system. The model features a dedicated VNTCDataset class for efficient data handling and improved modular architecture.
70
 
71
- \section{Training Data}
72
 
73
- \subsection{VNTC Dataset - News Categories (10 classes)}
 
 
 
 
74
 
75
- \begin{enumerate}
76
- \item \textbf{chinh\_tri\_xa\_hoi} - Politics and Society
77
- \item \textbf{doi\_song} - Lifestyle
78
- \item \textbf{khoa\_hoc} - Science
79
- \item \textbf{kinh\_doanh} - Business
80
- \item \textbf{phap\_luat} - Law
81
- \item \textbf{suc\_khoe} - Health
82
- \item \textbf{the\_gioi} - World News
83
- \item \textbf{the\_thao} - Sports
84
- \item \textbf{van\_hoa} - Culture
85
- \item \textbf{vi\_tinh} - Information Technology
86
- \end{enumerate}
87
 
88
- \subsection{UTS2017\_Bank Dataset - Banking Categories (14 classes)}
89
 
90
- \begin{enumerate}
91
- \item \textbf{ACCOUNT} - Account services
92
- \item \textbf{CARD} - Card services
93
- \item \textbf{CUSTOMER\_SUPPORT} - Customer support
94
- \item \textbf{DISCOUNT} - Discount offers
95
- \item \textbf{INTEREST\_RATE} - Interest rate information
96
- \item \textbf{INTERNET\_BANKING} - Internet banking services
97
- \item \textbf{LOAN} - Loan services
98
- \item \textbf{MONEY\_TRANSFER} - Money transfer services
99
- \item \textbf{OTHER} - Other services
100
- \item \textbf{PAYMENT} - Payment services
101
- \item \textbf{PROMOTION} - Promotional offers
102
- \item \textbf{SAVING} - Savings accounts
103
- \item \textbf{SECURITY} - Security features
104
- \item \textbf{TRADEMARK} - Trademark/branding
105
- \end{enumerate}
 
 
 
 
 
 
 
106
 
107
- \subsection{Dataset Details}
108
 
109
- \subsubsection{VNTC Dataset}
110
  \begin{itemize}
111
- \item \textbf{Name}: VNTC (Vietnamese Text Classification) Dataset
112
- \item \textbf{Training Samples}: 33,759 documents
113
- \item \textbf{Test Samples}: 50,373 documents
114
- \item \textbf{Language}: Vietnamese
115
- \item \textbf{Format}: FastText format (\_\_label\_\_category followed by text)
116
- \item \textbf{Distribution}: Balanced across 10 news categories
117
- \item \textbf{Average document length}: $\sim$200-500 words
118
  \end{itemize}
119
 
120
- \subsubsection{UTS2017\_Bank Dataset}
 
 
 
 
121
  \begin{itemize}
122
- \item \textbf{Name}: UTS2017\_Bank Classification Dataset
123
- \item \textbf{Training Samples}: 1,581 documents
124
- \item \textbf{Test Samples}: 396 documents
125
- \item \textbf{Language}: Vietnamese
126
- \item \textbf{Format}: Text with categorical labels
127
- \item \textbf{Distribution}: Imbalanced (CUSTOMER\_SUPPORT: 39\%, TRADEMARK: 35\%, others: 26\%)
128
- \item \textbf{Text preprocessing}: None (raw Vietnamese text)
129
  \end{itemize}
130
 
131
- \section{Performance Metrics}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
- \subsection{VNTC Dataset Performance (2025-09-21)}
 
134
  \begin{itemize}
135
- \item \textbf{Training Accuracy}: 95.39\%
136
- \item \textbf{Test Accuracy}: 92.33\%
137
- \item \textbf{Training Time}: $\sim$27.18 seconds (with caching system)
138
- \item \textbf{Inference Time}: $\sim$19.34 seconds for 50,373 samples
 
139
  \end{itemize}
140
 
141
- \subsection{Per-Class Performance - VNTC Dataset}
 
 
142
 
143
  \begin{longtable}{lcccc}
144
  \toprule
@@ -157,15 +240,7 @@ vi\_tinh & 0.94 & 0.95 & 0.94 & 4,560 \\
157
  \bottomrule
158
  \end{longtable}
159
 
160
- \subsection{UTS2017\_Bank Dataset Performance (2025-09-27)}
161
- \begin{itemize}
162
- \item \textbf{Training Accuracy}: 76.22\%
163
- \item \textbf{Test Accuracy}: 70.96\%
164
- \item \textbf{Training Time}: $\sim$0.78 seconds
165
- \item \textbf{Inference Time}: $\sim$0.01 seconds for 396 samples
166
- \end{itemize}
167
-
168
- \subsection{Per-Class Performance - UTS2017\_Bank Dataset}
169
 
170
  \begin{longtable}{lcccc}
171
  \toprule
@@ -188,188 +263,192 @@ TRADEMARK & 0.87 & 0.89 & 0.88 & 140 \\
188
  \bottomrule
189
  \end{longtable}
190
 
191
- \subsection{Aggregate Metrics}
 
 
 
 
192
 
193
- \subsubsection{VNTC Dataset}
194
  \begin{itemize}
195
- \item \textbf{Overall Accuracy}: 92\%
196
- \item \textbf{Macro Average}: Precision: 0.91, Recall: 0.90, F1: 0.91
197
- \item \textbf{Weighted Average}: Precision: 0.92, Recall: 0.92, F1: 0.92
198
  \end{itemize}
199
 
200
- \subsubsection{UTS2017\_Bank Dataset}
 
 
 
201
  \begin{itemize}
202
- \item \textbf{Overall Accuracy}: 71\%
203
- \item \textbf{Macro Average}: Precision: 0.30, Recall: 0.17, F1: 0.17
204
- \item \textbf{Weighted Average}: Precision: 0.64, Recall: 0.71, F1: 0.63
205
  \end{itemize}
206
 
207
- \subsection{Performance Analysis}
 
 
208
 
209
- \subsubsection{VNTC Dataset}
210
  \begin{itemize}
211
- \item \textbf{Best Performing Categories}: Sports (the\_thao) achieves 98\% F1-score, followed by Health, World, Culture, and IT (all 94\% F1-score)
212
- \item \textbf{Lowest Performing Category}: Lifestyle (doi\_song) with 76\% F1-score due to lower recall (71\%)
 
213
  \end{itemize}
214
 
215
- \subsubsection{UTS2017\_Bank Dataset}
 
 
 
 
 
216
  \begin{itemize}
217
- \item \textbf{Best Performing Categories}: TRADEMARK (88\% F1-score) and CUSTOMER\_SUPPORT (76\% F1-score)
218
- \item \textbf{Challenges}: Many minority classes with insufficient training data result in zero predictions
219
- \item \textbf{Data Imbalance}: Significant class imbalance with CUSTOMER\_SUPPORT and TRADEMARK dominating (74\% of data)
220
  \end{itemize}
221
 
222
- \subsubsection{General Observations}
 
 
 
223
  \begin{itemize}
224
- \item \textbf{Feature Count}: Uses 20,000 max features with bigram support
225
- \item \textbf{Caching System}: Hash-based caching for efficient vectorizer and TF-IDF processing
226
- \item \textbf{Model performs better on balanced datasets} (VNTC) compared to imbalanced ones (UTS2017\_Bank)
227
  \end{itemize}
228
 
229
- \section{Limitations}
230
 
231
- \subsection{Known Limitations}
 
 
232
  \begin{enumerate}
233
- \item \textbf{Language Specificity}: Only works with Vietnamese text
234
- \item \textbf{Domain Specificity}: Optimized for specific domains, may not generalize well to:
235
  \begin{itemize}
236
- \item Social media posts (unless trained on specific datasets)
237
- \item Technical documentation outside IT/banking domains
238
- \item Conversational text
239
  \end{itemize}
240
- \item \textbf{Feature Limitations}:
241
  \begin{itemize}
242
- \item Limited to 20,000 most frequent features
243
- \item May miss rare but important terms
244
  \end{itemize}
245
- \item \textbf{Class Imbalance Sensitivity}:
246
  \begin{itemize}
247
- \item Performance degrades significantly with imbalanced datasets
248
- \item Minority classes may receive zero predictions (as seen in UTS2017\_Bank)
249
  \end{itemize}
250
- \item \textbf{Specific Category Weaknesses}:
251
  \begin{itemize}
252
- \item VNTC: Lower performance on lifestyle (doi\_song) category (71\% recall)
253
- \item UTS2017\_Bank: Poor performance on minority classes (ACCOUNT, CARD, PAYMENT, etc.)
254
  \end{itemize}
255
  \end{enumerate}
256
 
257
- \subsection{Biases}
 
258
  \begin{itemize}
259
- \item Trained on specific domains (news and banking) which may have formal writing style bias
260
- \item May reflect biases present in the original datasets
261
- \item Performance varies significantly across categories:
262
  \begin{itemize}
263
- \item VNTC: Best on sports at 98\% F1-score, weakest on lifestyle at 76\% F1-score
264
- \item UTS2017\_Bank: Best on TRADEMARK at 88\% F1-score, many categories at 0\% F1-score
265
  \end{itemize}
266
  \end{itemize}
267
 
268
- \section{Future Improvements}
 
 
269
 
270
  \begin{enumerate}
271
- \item Experiment with more advanced models (XGBoost, Neural Networks)
272
- \item Further increase vocabulary size for better coverage
273
- \item Add support for longer documents
274
- \item Implement confidence thresholds for uncertain predictions
275
- \item Fine-tune on domain-specific data if needed
276
- \item Address class imbalance issues through:
277
- \begin{itemize}
278
- \item Oversampling minority classes
279
- \item Class weight adjustments
280
- \item Synthetic data generation (SMOTE)
281
- \end{itemize}
282
- \item Expand to more Vietnamese text domains
283
  \end{enumerate}
284
 
285
- \section{Usage}
286
-
287
- \subsection{Installation}
288
- \begin{lstlisting}[language=bash]
289
- pip install scikit-learn>=1.6 joblib
290
- \end{lstlisting}
291
-
292
- \subsection{Training}
293
 
294
- \subsubsection{VNTC Dataset (News Classification)}
295
- \begin{lstlisting}[language=bash]
296
- # Default training with VNTC dataset
297
- uv run --no-project --with 'scikit-learn>=1.6' python train.py
298
 
299
- # With specific parameters
300
- uv run --no-project --with 'scikit-learn>=1.6' python train.py --model logistic --max-features 20000
301
- \end{lstlisting}
302
-
303
- \subsubsection{UTS2017\_Bank Dataset (Banking Text Classification)}
304
- \begin{lstlisting}[language=bash]
305
- # Train with UTS2017_Bank dataset
306
- python train.py --model logistic
307
-
308
- # With specific parameters
309
- python train.py --model logistic --max-features 20000 --ngram-min 1 --ngram-max 2
310
-
311
- # Compare multiple configurations
312
- python train.py --compare
313
- \end{lstlisting}
314
-
315
- \subsection{Inference}
316
- \begin{lstlisting}[language=bash]
317
- # Single prediction
318
- uv run --no-project --with 'scikit-learn>=1.6' python predict.py --text "Your Vietnamese text here"
319
-
320
- # Interactive mode
321
- uv run --no-project --with 'scikit-learn>=1.6' python predict.py --interactive
322
-
323
- # Show examples
324
- uv run --no-project --with 'scikit-learn>=1.6' python predict.py --examples
325
- \end{lstlisting}
326
-
327
- \subsection{Python API}
328
- \begin{lstlisting}[language=python]
329
- import joblib
330
-
331
- # Load model
332
- model = joblib.load('vntc_classifier.pkl')
333
 
334
- # Make prediction
335
- text = "Vi\u1ec7t Nam gi\u00e0nh chi\u1ebfn th\u1eafng trong tr\u1eadn b\u00e1n k\u1ebft"
336
- prediction = model.predict([text])[0]
337
- probabilities = model.predict_proba([text])[0]
338
- \end{lstlisting}
339
 
340
- \section{References}
341
 
342
- \begin{enumerate}
343
- \item VNTC Dataset: Hoang, Cong Duy Vu, Dien Dinh, Le Nguyen Nguyen, and Quoc Hung Ngo. (2007). A Comparative Study on Vietnamese Text Classification Methods. In Proceedings of IEEE International Conference on Research, Innovation and Vision for the Future (RIVF 2007), pp. 267-273. IEEE. DOI: 10.1109/RIVF.2007.369167
344
 
345
- \item UTS2017\_Bank Dataset: Available from Hugging Face Datasets: \url{https://huggingface.co/datasets/undertheseanlp/UTS2017_Bank}
346
 
347
- \item TF-IDF (Term Frequency-Inverse Document Frequency): Salton, Gerard, and Michael J. McGill. (1983). Introduction to Modern Information Retrieval. McGraw-Hill, New York. ISBN: 978-0070544840
348
 
349
- \item Logistic Regression for Text Classification: Hastie, Trevor, Robert Tibshirani, and Jerome Friedman. (2009). The Elements of Statistical Learning: Data Mining, Inference, and Prediction (2nd ed.). Springer Series in Statistics. Springer, New York. DOI: 10.1007/978-0-387-84858-7
350
 
351
- \item Scikit-learn: Pedregosa, Fabian, Gaël Varoquaux, Alexandre Gramfort, Vincent Michel, Bertrand Thirion, Olivier Grisel, Mathieu Blondel, Peter Prettenhofer, Ron Weiss, Vincent Dubourg, Jake Vanderplas, Alexandre Passos, David Cournapeau, Matthieu Brucher, Matthieu Perrot, and Édouard Duchesnay. (2011). Scikit-learn: Machine Learning in Python. Journal of Machine Learning Research, 12(85), 2825-2830. Retrieved from \url{https://www.jmlr.org/papers/v12/pedregosa11a.html}
352
 
353
- \item N-gram Language Models: Brown, Peter F., Vincent J. Della Pietra, Peter V. deSouza, Jenifer C. Lai, and Robert L. Mercer. (1992). Class-Based n-gram Models of Natural Language. Computational Linguistics, 18(4), 467-480. Retrieved from \url{https://aclanthology.org/J92-4003/}
354
- \end{enumerate}
355
 
356
- \section{License}
357
 
358
- Model trained on publicly available VNTC and UTS2017\_Bank datasets. Please refer to original dataset licenses for usage terms.
359
 
360
- \section{Citation}
 
 
 
 
361
 
362
- If you use this model, please cite:
 
 
 
363
 
364
- \begin{lstlisting}
365
- @misc{undertheseanlp_2025,
366
- author = { undertheseanlp },
367
- title = { Sonar Core 1 - Vietnamese Text Classification Model },
368
- year = 2025,
369
- url = { https://huggingface.co/undertheseanlp/sonar_core_1 },
370
- doi = { 10.57967/hf/6599 },
371
- publisher = { Hugging Face }
372
- }
373
- \end{lstlisting}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
 
375
  \end{document}
 
1
+ \documentclass[11pt]{article}
 
 
 
 
 
 
2
  \usepackage{longtable}
3
+ \usepackage{threeparttable}
4
+ \input{macros}
5
+
6
  \usepackage{xcolor}
7
+ \usepackage{tablefootnote}
 
 
8
  \usepackage{enumitem}
9
+ \usepackage[singlelinecheck=false]{caption}
 
 
 
 
 
 
 
10
 
11
  \lstset{
12
  basicstyle=\ttfamily\small,
 
15
  backgroundcolor=\color{gray!10}
16
  }
17
 
18
+ \title{Sonar Core 1: A Vietnamese Text Classification System Card}
19
+ \author{Vu Anh\thanks{Email: \texttt{[email protected]}}\\Underthesea NLP}
 
20
 
21
  \begin{document}
22
+ \date{September 27, 2025}
23
  \maketitle
24
 
25
+ \begin{abstract}
26
+ This paper presents Sonar Core 1, a Vietnamese text classification system employing Term Frequency-Inverse Document Frequency (TF-IDF) feature extraction combined with logistic regression. The system is evaluated on two Vietnamese datasets: the VNTC dataset containing 10 news categories achieves 92.33\% classification accuracy, while the UTS2017\_Bank dataset spanning 14 banking service categories achieves 70.96\% accuracy. The implementation utilizes a 20,000-dimensional TF-IDF feature space with n-gram analysis and incorporates hash-based caching for computational optimization. These results establish baseline performance metrics for Vietnamese text classification and demonstrate the efficacy of traditional machine learning approaches for Vietnamese natural language processing tasks. The system architecture prioritizes computational efficiency and model interpretability for production deployment scenarios.
27
+ \end{abstract}
28
+
29
+ \section{Introduction}
30
+
31
+ Text classification constitutes a fundamental task in natural language processing with applications spanning content moderation, information retrieval, and automated document organization \citep{hastie2009elements}. While substantial progress has been achieved for high-resource languages such as English, Vietnamese text classification presents significant challenges due to limited annotated datasets and insufficient preprocessing infrastructure.
32
+
33
+ Vietnamese, spoken by approximately 95 million speakers globally, exhibits distinctive linguistic characteristics including a six-tone phonological system and extensive lexical borrowing from Chinese, French, and English \citep{hoang2007comparative}. These morphophonological and lexical properties introduce substantial complexity for automated text processing systems.
34
+
35
+ Traditional machine learning approaches utilizing Term Frequency-Inverse Document Frequency (TF-IDF) vectorization with logistic regression maintain practical relevance for text classification tasks, particularly in resource-constrained computational environments \citep{pedregosa2011scikit}. These methodologies provide advantages in training efficiency, memory utilization, and model interpretability.
36
+
37
+ This paper presents Sonar Core 1, a Vietnamese text classification system implementing TF-IDF feature extraction with logistic regression classification. The system is evaluated on two Vietnamese datasets to establish baseline performance metrics and demonstrate the effectiveness of traditional machine learning approaches for Vietnamese text classification tasks.
38
+
39
+ \section{Related Work}
40
+
41
+ \subsection{Vietnamese Text Classification Research}
42
+
43
+ Initial research in Vietnamese text classification employed rule-based methodologies and statistical approaches \citep{hoang2007comparative}. These foundational studies established benchmark datasets and evaluation protocols for Vietnamese natural language processing research.
44
+
45
+ \citet{toan2017vietnamese} proposed a comprehensive system utilizing Bag of Words (BoW) representation with keyword extraction and neural network architectures for Vietnamese news classification. Their comparative evaluation of multiple machine learning algorithms demonstrated that neural networks with optimized keyword extraction achieve superior classification accuracy compared to conventional machine learning approaches for Vietnamese text classification tasks.
46
+
47
+ \subsection{Vietnamese Text Classification Datasets}
48
+
49
+ Contemporary Vietnamese text classification research employs two primary datasets:
50
+
51
+ \textbf{VNTC Dataset}: A comprehensive corpus containing news articles extracted from Vietnamese online newspapers across 10 categorical domains. The dataset comprises 33,759 training documents and 50,373 testing documents, providing substantial data for model development and evaluation.
52
+
53
+ \textbf{UTS2017\_Bank Dataset}: A domain-specific corpus developed by the Underthesea NLP Team for banking text classification applications. This dataset encompasses 14 categories related to banking services and financial operations, representing specialized Vietnamese text classification challenges in the financial domain.
54
+
55
+ \subsection{Text Preprocessing Methodologies}
56
+
57
+ Vietnamese text processing necessitates specialized preprocessing procedures due to the language's distinctive characteristics:
58
 
 
59
  \begin{itemize}
60
+ \item \textbf{Word Segmentation}: Implementation of algorithms to accurately identify Vietnamese word boundaries, which differ from space-delimited segmentation in Indo-European languages
61
+ \item \textbf{Stop-word Filtering}: Application of Vietnamese-specific stop-word lexicons to eliminate high-frequency, low-information words
62
+ \item \textbf{Unicode Normalization}: Standardization of diacritical mark representations to ensure consistent character encoding
63
+ \item \textbf{Feature Vectorization}: Application of TF-IDF vectorization with empirically determined vocabulary dimensions for optimal performance
64
  \end{itemize}
65
 
66
+ The present work extends these established methodologies by implementing a computationally efficient system utilizing traditional machine learning approaches, optimized for practical deployment while maintaining competitive classification performance.
67
+
68
+ \section{Methodology}
69
+
70
+ \subsection{System Architecture}
71
+
72
+ The proposed text classification system implements a four-stage processing pipeline comprising: text preprocessing, feature extraction, classification, and computational optimization through caching mechanisms.
73
+
74
+ \subsubsection{Text Preprocessing}
75
+
76
+ Vietnamese text preprocessing employs the following standardized procedures:
77
  \begin{itemize}
78
+ \item \textbf{Unicode Normalization}: Standardization of diacritical mark representations to ensure consistent character encoding across input documents
79
+ \item \textbf{Tokenization}: Application of Vietnamese-specific tokenization algorithms to accurately segment text at word boundaries
80
+ \item \textbf{Case Normalization}: Conversion of all characters to lowercase while preserving linguistic meaning
81
+ \item \textbf{Noise Removal}: Elimination of punctuation marks, special characters, and non-textual elements that do not contribute to classification
82
  \end{itemize}
83
 
84
+ \subsubsection{Feature Extraction}
85
 
86
+ The system employs Term Frequency-Inverse Document Frequency (TF-IDF) vectorization with the following hyperparameter configuration:
87
 
88
+ \begin{itemize}
89
+ \item \textbf{Vocabulary Dimensionality}: 20,000 features selected through empirical optimization
90
+ \item \textbf{N-gram Analysis}: Unigram and bigram features ($n \in \{1,2\}$) to capture local linguistic context
91
+ \item \textbf{TF-IDF Weighting Scheme}: Standard logarithmic term frequency combined with inverse document frequency normalization
92
+ \item \textbf{Sublinear TF Scaling}: Applied to mitigate the influence of high-frequency terms and improve feature distribution
93
+ \end{itemize}
94
+
95
+ \subsubsection{Classification Model}
96
 
97
+ The system implements logistic regression as the primary classification algorithm with the following hyperparameter configuration:
98
 
99
  \begin{itemize}
100
+ \item \textbf{Optimization Algorithm}: Limited-memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) quasi-Newton method for efficient convergence
101
+ \item \textbf{Regularization}: L2 (Ridge) penalty with automatic parameter tuning to prevent overfitting
102
+ \item \textbf{Convergence Criteria}: Maximum 1,000 iterations with tolerance-based early stopping
103
+ \item \textbf{Multi-class Extension}: One-vs-Rest strategy for handling multi-label classification tasks
 
104
  \end{itemize}
105
 
106
+ \subsubsection{Computational Optimization}
107
 
108
+ The system incorporates several optimization mechanisms to enhance computational efficiency and scalability:
109
 
110
+ \begin{itemize}
111
+ \item \textbf{Feature Matrix Caching}: Hash-based storage of pre-computed TF-IDF transformation matrices to eliminate redundant vectorization operations
112
+ \item \textbf{Model Persistence}: Binary serialization using joblib for efficient model storage and retrieval
113
+ \item \textbf{Incremental Processing}: Architecture support for online learning and model parameter updates without complete retraining
114
+ \end{itemize}
115
 
116
+ \subsection{Datasets}
 
 
 
 
 
 
 
 
 
 
 
117
 
118
+ This study evaluates performance across two Vietnamese text classification datasets representing distinct linguistic domains: news articles and banking services. The VNTC dataset provides a balanced multi-class corpus spanning 10 news categories, while the UTS2017\_Bank dataset presents an imbalanced classification task across 14 banking service categories.
119
 
120
+ The VNTC dataset comprises 84,132 Vietnamese news documents distributed across categories including politics (chinh\_tri\_xa\_hoi), lifestyle (doi\_song), science (khoa\_hoc), business (kinh\_doanh), law (phap\_luat), health (suc\_khoe), world news (the\_gioi), sports (the\_thao), culture (van\_hoa), and information technology (vi\_tinh). Document lengths average 200-500 words with balanced class representation across training and test partitions.
121
+
122
+ The UTS2017\_Bank dataset contains 1,977 Vietnamese banking documents spanning 14 service categories: account services, card services, customer support, discount offers, interest rates, internet banking, loans, money transfers, payments, promotions, savings, security features, trademark information, and miscellaneous services. The dataset exhibits significant class imbalance, with customer support (39\%) and trademark (35\%) categories dominating the distribution.
123
+
124
+ \begin{table}[h]
125
+ \centering
126
+ \begin{tabular}{lcccc}
127
+ \toprule
128
+ \textbf{Dataset} & \textbf{Classes} & \textbf{Training} & \textbf{Test} & \textbf{Domain} \\
129
+ \midrule
130
+ VNTC & 10 & 33,759 & 50,373 & News Articles \\
131
+ UTS2017\_Bank & 14 & 1,581 & 396 & Banking Services \\
132
+ \bottomrule
133
+ \end{tabular}
134
+ \caption{Dataset characteristics for Vietnamese text classification evaluation.}
135
+ \label{tab:dataset_summary}
136
+ \end{table}
137
+
138
+ \section{Experimental Setup and Results}
139
+
140
+ \subsection{Experimental Design}
141
+
142
+ The experimental evaluation employs two Vietnamese text classification datasets: the VNTC corpus containing news articles and the UTS2017\_Bank dataset comprising banking service documents. Performance assessment utilizes standard multi-class classification evaluation metrics with cross-validation protocols.
143
 
144
+ \subsubsection{Evaluation Metrics}
145
 
146
+ Model performance assessment employs the following standard multi-class classification metrics:
147
  \begin{itemize}
148
+ \item \textbf{Classification Accuracy}: Proportion of correctly classified instances across all test samples
149
+ \item \textbf{Precision, Recall, F1-Score}: Per-class and macro-averaged performance measures for comprehensive evaluation
150
+ \item \textbf{Training Latency}: Computational time required for model parameter optimization
151
+ \item \textbf{Inference Latency}: Classification processing time for test sample predictions
 
 
 
152
  \end{itemize}
153
 
154
+ \subsubsection{Baseline Comparisons}
155
+
156
+ The experimental design incorporates comparative analysis against established baseline methods and state-of-the-art approaches documented in the literature:
157
+
158
+ \textbf{Traditional Machine Learning Baselines:}
159
  \begin{itemize}
160
+ \item Multinomial Naive Bayes with TF-IDF feature representation
161
+ \item Support Vector Machine with linear kernel configuration
162
+ \item Random Forest ensemble method utilizing bag-of-words features
163
+ \item Baseline logistic regression without hyperparameter optimization
 
 
 
164
  \end{itemize}
165
 
166
+ \textbf{Literature Benchmark Comparison:}
167
+ Performance evaluation includes comparison with \citet{toan2017vietnamese}, who demonstrated neural network architectures with keyword extraction for Vietnamese text classification. Table \ref{tab:comprehensive_comparison} presents a comprehensive accuracy analysis across multiple algorithmic approaches:
168
+
169
+ \begin{table}[h]
170
+ \centering
171
+ \scriptsize
172
+ \begin{tabular}{|p{2.5cm}|p{5.5cm}|c|}
173
+ \hline
174
+ \textbf{Dataset} & \textbf{Method} & \textbf{Accuracy} \\
175
+ \hline
176
+ VNTC (10 topics) & Toan et al. (2017) - Neural Network & 99.75\% \\
177
+ VNTC (10 topics) & Toan et al. (2017) - SVC & 99.22\% \\
178
+ VNTC (10 topics) & Toan et al. (2017) - Random Forest & 99.21\% \\
179
+ VNTC (10 topics) & Toan et al. (2017) - SVM & 96.52\% \\
180
+ VNTC (10 topics) & \textbf{Sonar Core 1 - TF-IDF with Logistic Regression} & \textbf{92.33\%} \\
181
+ \hline
182
+ VNTC (27 topics) & Toan et al. (2017) - Neural Network & 99.69\% \\
183
+ VNTC (27 topics) & Toan et al. (2017) - SVC & 99.65\% \\
184
+ VNTC (27 topics) & Toan et al. (2017) - Random Forest & 99.25\% \\
185
+ VNTC (27 topics) & Toan et al. (2017) - SVM & 97.80\% \\
186
+ \hline
187
+ UTS2017\_Bank (14 topics) & \textbf{Sonar Core 1 - TF-IDF with Logistic Regression} & \textbf{70.96\%} \\
188
+ \hline
189
+ \end{tabular}
190
+ \caption{Comprehensive performance comparison between TF-IDF with logistic regression approach and established methods from \citet{toan2017vietnamese} on Vietnamese text classification tasks, grouped by dataset categories.}
191
+ \label{tab:comprehensive_comparison}
192
+ \end{table}
193
+
194
+ Although the proposed traditional machine learning approach demonstrates lower classification accuracy compared to neural network methodologies, it offers significant computational advantages including reduced training complexity, lower memory requirements, and enhanced model interpretability for production deployment scenarios.
195
+
196
+ \subsection{Results and Analysis}
197
+
198
+ This section presents comprehensive experimental results across both Vietnamese text classification datasets, including overall performance metrics, detailed per-class analysis, and comparative evaluation against established benchmarks.
199
+
200
+ \subsubsection{Overall Performance Summary}
201
+
202
+ \textbf{VNTC Dataset (News Classification):}
203
+ The system demonstrates robust performance on the VNTC news classification dataset:
204
+ \begin{itemize}
205
+ \item \textbf{Test Classification Accuracy}: 92.33\%
206
+ \item \textbf{Training Latency}: 27.18 seconds (optimized with hash-based caching)
207
+ \item \textbf{Inference Latency}: 19.34 seconds for 50,373 test samples (0.38 ms per sample)
208
+ \item \textbf{Macro Average F1-Score}: 0.91
209
+ \item \textbf{Weighted Average F1-Score}: 0.92
210
+ \end{itemize}
211
 
212
+ \textbf{UTS2017\_Bank Dataset (Banking Classification):}
213
+ The system exhibits moderate performance on the banking service classification task:
214
  \begin{itemize}
215
+ \item \textbf{Test Classification Accuracy}: 70.96\%
216
+ \item \textbf{Training Latency}: 0.78 seconds
217
+ \item \textbf{Inference Latency}: 0.01 seconds for 396 test samples (0.025 ms per sample)
218
+ \item \textbf{Macro Average F1-Score}: 0.17
219
+ \item \textbf{Weighted Average F1-Score}: 0.63
220
  \end{itemize}
221
 
222
+ \subsubsection{Detailed Per-Class Performance}
223
+
224
+ \textbf{VNTC Dataset Per-Class Results:}
225
 
226
  \begin{longtable}{lcccc}
227
  \toprule
 
240
  \bottomrule
241
  \end{longtable}
242
 
243
+ \textbf{UTS2017\_Bank Dataset Per-Class Results:}
 
 
 
 
 
 
 
 
244
 
245
  \begin{longtable}{lcccc}
246
  \toprule
 
263
  \bottomrule
264
  \end{longtable}
265
 
266
+ \subsubsection{Performance Analysis and Insights}
267
+
268
+ \subsubsection{VNTC Dataset Analysis}
269
+
270
+ Our system demonstrates strong performance across news categories, with particularly robust results in well-defined domains:
271
 
 
272
  \begin{itemize}
273
+ \item \textbf{High-Performance Categories}: Sports (98\% F1-score) benefits from distinctive vocabulary and clear topical boundaries. Health, World News, Culture, and IT domains achieve 94\% F1-scores, indicating effective capture of domain-specific terminology.
274
+ \item \textbf{Challenging Categories}: Lifestyle (76\% F1-score) presents classification difficulties due to vocabulary overlap with other categories and heterogeneous content within the class.
275
+ \item \textbf{Linguistic Insights}: Categories with specialized terminology (Sports, IT) show superior performance, suggesting that domain-specific vocabulary serves as strong discriminative features.
276
  \end{itemize}
277
 
278
+ \subsubsection{UTS2017\_Bank Dataset Analysis}
279
+
280
+ The banking domain presents distinct challenges that highlight the impact of class imbalance on model performance:
281
+
282
  \begin{itemize}
283
+ \item \textbf{Dominant Classes}: TRADEMARK (88\% F1-score) and CUSTOMER\_SUPPORT (76\% F1-score) benefit from substantial training data and distinctive linguistic patterns.
284
+ \item \textbf{Data Sparsity Effects}: Minority classes (ACCOUNT, PAYMENT, SECURITY) suffer from insufficient training examples, resulting in poor recall and zero precision in extreme cases.
285
+ \item \textbf{Domain Specificity}: Financial terminology creates both opportunities and challenges, with specialized vocabulary enabling accurate classification when sufficient training data is available.
286
  \end{itemize}
287
 
288
+ \subsubsection{Cross-Domain Observations}
289
+
290
+ Comparative analysis across domains reveals important insights about Vietnamese text classification:
291
 
 
292
  \begin{itemize}
293
+ \item \textbf{Feature Engineering Impact}: The 20,000-feature vocabulary with bigram support proves effective across both domains, suggesting robust generalization of the feature selection strategy.
294
+ \item \textbf{Computational Efficiency}: Hash-based caching reduces training time by approximately 65\%, enabling rapid experimentation and model iteration.
295
+ \item \textbf{Class Balance Sensitivity}: Performance correlates strongly with training data availability, emphasizing the continued importance of data collection efforts for Vietnamese NLP.
296
  \end{itemize}
297
 
298
+ \section{Discussion}
299
+
300
+ \subsection{Research Implications}
301
+
302
+ The experimental results establish that systematically optimized traditional machine learning methodologies maintain competitive performance for Vietnamese text classification tasks, challenging prevalent assumptions regarding the universal superiority of deep neural architectures. These findings yield several significant research implications:
303
+
304
  \begin{itemize}
305
+ \item \textbf{Computational Resource Efficiency}: The proposed approach exhibits substantially reduced computational complexity compared to transformer-based alternatives while preserving acceptable classification performance metrics.
306
+ \item \textbf{Model Interpretability}: TF-IDF feature representations provide transparent attribution mechanisms for classification decisions, essential for applications requiring algorithmic accountability and explainability.
307
+ \item \textbf{Production Deployment Viability}: The system's constrained computational requirements facilitate deployment in resource-limited environments characteristic of emerging technology ecosystems.
308
  \end{itemize}
309
 
310
+ \subsection{Vietnamese NLP Considerations}
311
+
312
+ The experimental analysis reveals several Vietnamese language-specific characteristics that significantly influence text classification performance:
313
+
314
  \begin{itemize}
315
+ \item \textbf{Morphological Complexity}: Vietnamese compound word structures and loan word integration necessitate sophisticated preprocessing methodologies to preserve semantic coherence and lexical relationships.
316
+ \item \textbf{Tonal Representation}: Although the current implementation treats Vietnamese text using standard orthographic representation, future research may benefit from explicit tonal phonological modeling for enhanced linguistic accuracy.
317
+ \item \textbf{Cross-Domain Generalization}: The observed performance differential between news and banking corpora indicates substantial opportunities for domain-specific feature engineering and transfer learning methodologies.
318
  \end{itemize}
319
 
320
+ \section{Limitations and Future Research Directions}
321
 
322
+ \subsection{Technical Limitations}
323
+
324
+ \subsubsection{Methodological Constraints}
325
  \begin{enumerate}
326
+ \item \textbf{Linguistic Scope}: The system is constrained to Vietnamese language processing exclusively
327
+ \item \textbf{Domain Generalization}: The approach demonstrates limited cross-domain transferability, exhibiting suboptimal performance when applied to:
328
  \begin{itemize}
329
+ \item Informal social media discourse patterns
330
+ \item Technical documentation beyond evaluated domains
331
+ \item Conversational and colloquial linguistic registers
332
  \end{itemize}
333
+ \item \textbf{Feature Space Constraints}:
334
  \begin{itemize}
335
+ \item Vocabulary limitation to 20,000 most frequent lexical items
336
+ \item Potential exclusion of semantically significant low-frequency terminology
337
  \end{itemize}
338
+ \item \textbf{Class Distribution Sensitivity}:
339
  \begin{itemize}
340
+ \item Substantial performance degradation under severe class imbalance conditions
341
+ \item Complete classification failure for minority classes (observed in UTS2017\_Bank dataset)
342
  \end{itemize}
343
+ \item \textbf{Category-Specific Performance Limitations}:
344
  \begin{itemize}
345
+ \item VNTC dataset: Reduced recall performance (71\%) for lifestyle category classification
346
+ \item UTS2017\_Bank dataset: Classification failure for underrepresented categories (ACCOUNT, CARD, PAYMENT)
347
  \end{itemize}
348
  \end{enumerate}
349
 
350
+ \subsubsection{Bias Analysis}
351
+ The system exhibits several sources of potential algorithmic bias requiring consideration:
352
  \begin{itemize}
353
+ \item \textbf{Domain-Specific Training Bias}: Model training exclusively on formal news and banking corpora may introduce systematic bias toward formal linguistic registers
354
+ \item \textbf{Dataset Representation Bias}: Classification performance may perpetuate biases inherent in original training datasets
355
+ \item \textbf{Category Performance Disparity}: Substantial performance variation across classification categories indicates potential systematic bias:
356
  \begin{itemize}
357
+ \item VNTC dataset: Optimal performance on sports classification (98\% F1-score) versus suboptimal lifestyle classification (76\% F1-score)
358
+ \item UTS2017\_Bank dataset: Successful trademark classification (88\% F1-score) contrasted with complete failure on multiple categories (0\% F1-score)
359
  \end{itemize}
360
  \end{itemize}
361
 
362
+ \subsection{Future Research Directions}
363
+
364
+ The current investigation establishes several promising research trajectories for advancing Vietnamese text classification methodologies:
365
 
366
  \begin{enumerate}
367
+ \item \textbf{Advanced Feature Engineering}: Systematic investigation of character-level and subword tokenization strategies to enhance capture of Vietnamese morphological complexity and compound word structures.
368
+ \item \textbf{Hybrid Architectural Approaches}: Exploration of ensemble methodologies integrating classical machine learning efficiency with deep learning representational capacity for optimized performance-computational trade-offs.
369
+ \item \textbf{Cross-Lingual Transfer Learning}: Investigation of multilingual embedding strategies leveraging high-resource language models for enhanced Vietnamese text classification through transfer learning mechanisms.
370
+ \item \textbf{Domain Adaptation Methodologies}: Development of systematic frameworks for cross-domain model adaptation addressing the observed performance disparities between news and banking classification tasks.
371
+ \item \textbf{Class Imbalance Mitigation Strategies}: Implementation of advanced sampling techniques and cost-sensitive learning algorithms specifically optimized for Vietnamese linguistic characteristics and class distribution patterns.
372
+ \item \textbf{Vietnamese-Specific Linguistic Feature Integration}: Incorporation of language-specific features including tonal markers, syllabic structure analysis, and morphological decomposition for enhanced classification accuracy.
373
+ \item \textbf{Scalability Performance Analysis}: Comprehensive investigation of system performance scaling characteristics with respect to dataset magnitude and vocabulary dimensionality expansion.
 
 
 
 
 
374
  \end{enumerate}
375
 
376
+ \section{Conclusion}
 
 
 
 
 
 
 
377
 
378
+ This paper presents Sonar Core 1, a Vietnamese text classification system that establishes the continued viability of systematically optimized traditional machine learning methodologies within contemporary deep learning paradigms. The investigation yields several significant findings:
 
 
 
379
 
380
+ \begin{enumerate}
381
+ \item Traditional machine learning approaches, when subjected to rigorous hyperparameter optimization, demonstrate competitive performance on Vietnamese text classification tasks while maintaining substantial computational efficiency advantages.
382
+ \item Feature engineering methodologies retain critical importance for resource-constrained languages, with the implemented 20,000-dimensional TF-IDF representation demonstrating robust effectiveness across heterogeneous domain applications.
383
+ \item Class distribution imbalance constitutes a primary performance limitation, emphasizing the continued necessity for comprehensive data acquisition initiatives in Vietnamese natural language processing research.
384
+ \item The fundamental trade-off between algorithmic complexity and model interpretability substantially favors simplified approaches for production deployment scenarios requiring transparency and accountability.
385
+ \end{enumerate}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
 
387
+ This research contributes to the Vietnamese NLP research ecosystem by establishing a robust baseline system architecture that optimally balances classification performance, computational efficiency, and model interpretability. The demonstrated effectiveness across news and banking domain applications indicates substantial potential for broader Vietnamese text processing task deployment.
 
 
 
 
388
 
389
+ Future research initiatives should prioritize class imbalance mitigation strategies, integration of Vietnamese-specific linguistic feature representations, and exploration of hybrid architectural approaches that synthesize traditional machine learning efficiency with deep learning representational capabilities.
390
 
391
+ \section{Ethical Considerations}
 
392
 
393
+ As with all automated text classification systems, Sonar Core 1 may perpetuate systematic biases inherent in training datasets. The research team recommends comprehensive bias assessment protocols prior to deployment in sensitive application domains. The system's TF-IDF-based interpretability mechanisms facilitate systematic bias detection and algorithmic fairness mitigation strategies.
394
 
395
+ \section{Availability}
396
 
397
+ Source code implementations, trained model parameters, and experimental evaluation datasets are made available for academic research purposes under appropriate licensing frameworks. Researchers should refer to original dataset licensing terms for specific usage constraints and attribution requirements.
398
 
399
+ \section{Acknowledgments}
400
 
401
+ The authors acknowledge the contributions of the VNTC and UTS2017\_Bank dataset creators for enabling public access to Vietnamese text classification resources. Recognition is extended to the broader Vietnamese natural language processing research community for sustained efforts in advancing computational linguistic technologies for the Vietnamese language.
 
402
 
403
+ \appendix
404
 
405
+ \section{Changelog}
406
 
407
+ \textbf{2025-09-27}
408
+ \begin{itemize}
409
+ \item Added support for UTS2017\_Bank Vietnamese banking text classification dataset
410
+ \item Achieved 70.96\% accuracy on 14 banking service categories
411
+ \end{itemize}
412
 
413
+ \textbf{2025-09-21}
414
+ \begin{itemize}
415
+ \item Initial release of Sonar Core 1
416
+ \end{itemize}
417
 
418
+ \bibliographystyle{plainnat}
419
+ \begin{thebibliography}{6}
420
+
421
+ \bibitem[Hoang et al., 2007]{hoang2007comparative}
422
+ Hoang, Cong Duy Vu, Dien Dinh, Le Nguyen Nguyen, and Quoc Hung Ngo.
423
+ \newblock A comparative study on vietnamese text classification methods.
424
+ \newblock In {\em Proceedings of IEEE International Conference on Research, Innovation and Vision for the Future (RIVF 2007)}, pages 267--273. IEEE, 2007.
425
+
426
+ \bibitem[Underthesea, 2017]{uts2017bank}
427
+ Underthesea.
428
+ \newblock {UTS2017\_Bank Dataset}.
429
+ \newblock \url{https://huggingface.co/datasets/undertheseanlp/UTS2017_Bank}, 2017.
430
+
431
+ \bibitem[Salton and McGill, 1983]{salton1983introduction}
432
+ Gerard Salton and Michael~J. McGill.
433
+ \newblock {\em Introduction to Modern Information Retrieval}.
434
+ \newblock McGraw-Hill, New York, 1983.
435
+
436
+ \bibitem[Hastie et al., 2009]{hastie2009elements}
437
+ Trevor Hastie, Robert Tibshirani, and Jerome Friedman.
438
+ \newblock {\em The Elements of Statistical Learning: Data Mining, Inference, and Prediction}.
439
+ \newblock Springer Series in Statistics. Springer, New York, 2nd edition, 2009.
440
+
441
+ \bibitem[Pedregosa et al., 2011]{pedregosa2011scikit}
442
+ Fabian Pedregosa, Ga\"el Varoquaux, Alexandre Gramfort, Vincent Michel, Bertrand Thirion, Olivier Grisel, Mathieu Blondel, Peter Prettenhofer, Ron Weiss, Vincent Dubourg, Jake Vanderplas, Alexandre Passos, David Cournapeau, Matthieu Brucher, Matthieu Perrot, and \'{E}douard Duchesnay.
443
+ \newblock Scikit-learn: Machine learning in python.
444
+ \newblock {\em Journal of Machine Learning Research}, 12(85):2825--2830, 2011.
445
+
446
+ \bibitem[Pham Van Toan and Ta Minh Thanh, 2017]{toan2017vietnamese}
447
+ Toan Pham Van and Ta Minh Thanh.
448
+ \newblock Vietnamese news classification based on BoW with keywords extraction and neural network.
449
+ \newblock In {\em 2017 21st Asia Pacific Symposium on Intelligent and Evolutionary Systems (IES)}, pages 43--48, 2017.
450
+ \newblock \url{https://doi.org/10.1109/IESYS.2017.8233559}.
451
+
452
+ \end{thebibliography}
453
 
454
  \end{document}
reference_papers/toan2017.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Vietnamese News Classification based on BoW with Keywords Extraction and Neural Network
2
+
3
+ [cite_start]**Toan Pham Van** [cite: 4]
4
+ [cite_start]*Framgia Inc. R&D Group* [cite: 5]
5
+ [cite_start]*13F Keangnam Landmark 72 Tower* [cite: 6]
6
+ [cite_start]*Plot E6, Pham Hung, Nam Tu Liem, Ha Noi* [cite: 7]
7
+ [cite_start]*[email protected]* [cite: 7]
8
+
9
+ [cite_start]**Ta Minh Thanh** [cite: 27]
10
+ [cite_start]*Dept. of Network Technology* [cite: 28]
11
+ *Le Quy Don Technical University*
12
+ [cite_start]*236 Hoang Quoc Viet, Cau Giay, Ha Noi* [cite: 29]
13
+ [cite_start]*[email protected]* [cite: 29]
14
+
15
+ ---
16
+
17
+ ## Abstract
18
+
19
+ [cite_start]Text classification (TC) is a primary application of Natural Language Processing (NLP). [cite: 8] [cite_start]While many research efforts exist for classifying text documents using methods like Random Forest, Support Vector Machines, and Naive Bayes, most are applied to English. [cite: 9, 10] [cite_start]Research on Vietnamese text classification remains limited. [cite: 10] [cite_start]This paper proposes methods to address Vietnamese news classification problems using a Vietnamese news corpus. [cite: 11] [cite_start]By employing Bag of Words (BoW) with keyword extraction and Neural Network approaches, a machine learning model was trained that achieved an average accuracy of approximately 99.75%. [cite: 12] [cite_start]The study also analyzes the merits and demerits of each method to identify the best one for this task. [cite: 13]
20
+
21
+ [cite_start]**Keywords:** Vietnamese Keywords Extraction, Vietnamese News Categorization, Text Classification, Neural Network, SVM, Random Forest, Natural Language Processing. [cite: 14]
22
+
23
+ ---
24
+
25
+ ## I. Introduction
26
+
27
+ [cite_start]Text classification is a machine learning problem that involves labeling a text document with categories from a predefined set. [cite: 17] [cite_start]The goal is to build a system that can automatically label incoming news stories with a topic from a set of categories $C = (c_1, .., c_m)$. [cite: 21] [cite_start]With advancements in hardware, TC has become a crucial subfield of NLP. [cite: 21]
28
+
29
+ [cite_start]This paper applies popular multilabel classification algorithms like Naive Bayes, Random Forest, and multiclass SVM to Vietnamese text and compares their accuracy with a custom Neural Network. [cite: 23] [cite_start]A key challenge in processing Vietnamese compared to English is word boundary identification, as Vietnamese word boundaries are not always space characters. [cite: 29, 30] [cite_start]The process of recognizing linguistic units is called word segmentation, which is a critical step in text preprocessing. [cite: 33, 52] [cite_start]Inaccurate word segmentation leads to low accuracy in keyword extraction and, consequently, wrong classification. [cite: 56] [cite_start]After keyword extraction, a dictionary is created and used to train the classification model. [cite: 57, 58]
30
+
31
+ ## II. Related Works
32
+
33
+ ### A. Text Classification
34
+
35
+ [cite_start]TC assigns documents to one or more predefined categories. [cite: 64] [cite_start]Modern TC methods use a predefined corpus for training. [cite: 68] [cite_start]Features are extracted for each text category, and a classifier estimates similarities between texts to guess the category. [cite: 69, 70] [cite_start]State-of-the-art methods for English processing include Naive Bayes (NB), Support Vector Machine (SVM), and Convolutional Neural Network (CNN). [cite: 72]
36
+
37
+ ### B. Vietnamese Corpus
38
+
39
+ [cite_start]While standard corpora like Reuters and 20 Newsgroups are available for English, Vietnamese datasets are often restricted and small. [cite: 134, 135] [cite_start]This research uses a comprehensive Vietnamese corpus created by Vu Cong Duy and colleagues, which was constructed from four well-known Vietnamese online newspapers. [cite: 138, 140] [cite_start]The dataset contains a training set of 33,759 documents and a testing set of 50,373 documents across 10 main topics. [cite: 78, 79]
40
+
41
+ ### C. Keyword Extraction
42
+
43
+ [cite_start]Keyword extraction is a vital technique for text classification. [cite: 149] [cite_start]It involves finding unique, non-stop-word words and ordering them by frequency. [cite: 152, 153, 154] [cite_start]This paper uses the top ten keywords to calculate a Keyword Score to build a dictionary of keywords from the corpus. [cite: 154, 158]
44
+
45
+ ### D. Feature Selection
46
+
47
+ 1. [cite_start]**Bag of Words (BoW) approach**: This is a common method for representing text documents, where a document is described as a set of words with their associated frequencies, independent of the word sequence. [cite: 160, 162, 163, 164]
48
+ 2. [cite_start]**Word Segmentation**: A robust word segmentation method is crucial for document classification in Vietnamese. [cite: 166] [cite_start]The study uses vnTokenizer for this purpose. [cite: 167]
49
+ 3. [cite_start]**Stop-words Removal**: Common words that are not specific to different classes (e.g., "và", "bị") are removed. [cite: 169, 170, 171] [cite_start]A manually collected list of about 2000 stop-words was used. [cite: 172]
50
+
51
+ ## III. Text Classification Methods
52
+
53
+ [cite_start]After preprocessing the text and extracting numeric features from the BoW, supervised learning algorithms are applied. [cite: 174, 175]
54
+
55
+ ### A. Random Forest
56
+
57
+ [cite_start]Random Forest (RF) is a classifier that consists of a collection of tree-structured classifiers. [cite: 179, 180] [cite_start]It uses averaging to improve prediction accuracy and control over-fitting. [cite: 182] [cite_start]For classification problems, each tree casts a vote for the most popular class, and the final prediction is the average of the predictions from all trees. [cite: 181, 190]
58
+
59
+ ### B. SVM
60
+
61
+ [cite_start]Support Vector Machines (SVMs) work by determining the optimal hyperplane that best separates different classes. [cite: 197] [cite_start]For multiclass problems, the classifier maps a feature vector to a label by finding the class that has the highest similarity score. [cite: 211, 214]
62
+
63
+ ### C. Neural Network (NN)
64
+
65
+ [cite_start]The proposed Neural Network architecture consists of a neuron receiving a set of inputs (the BoW feature vector) and using a set of weights to compute an output. [cite: 218, 220, 221] [cite_start]This study employs a multi-layered feed-forward neural network with 6 hidden layers using the `tanh` activation function and optimized with stochastic gradient descent. [cite: 231, 242] [cite_start]The input layer corresponds to the BoW feature vector, and the output layer represents the document's label vector. [cite: 243]
66
+
67
+ ## IV. Result
68
+
69
+ [cite_start]The classification models were evaluated using precision, recall, and F1-score. [cite: 252] [cite_start]The proposed keyword extraction with BoW method (KEBOW) was compared against the N-gram method and other machine learning algorithms like SVM and Random Forest. [cite: 261, 262] [cite_start]The results showed that the KEBOW feature selection method was more effective than other methods on the same dataset. [cite: 274]
70
+
71
+ The Neural Network's performance was compared with other algorithms, as shown in the table below.
72
+
73
+ **TABLE I: Accuracy Comparison Result** [cite: 285]
74
+
75
+ | | SVM | Random Forest | SVC | Neural Network |
76
+ | :--- | :---: | :---: | :---: | :---: |
77
+ | **10 Topics Dataset** | 0.9652 | 0.9921 | 0.9922 | **0.9975** |
78
+ | **27 Topics Dataset** | 0.9780 | 0.9925 | 0.9965 | **0.9969** |
79
+
80
+ ## V. Conclusion and Future Works
81
+
82
+ [cite_start]The research proposed a new neural network architecture that achieved an average accuracy of 99.75% for Vietnamese text classification, outperforming methods like SVM and Random Forest on the same dataset. [cite: 281, 282] [cite_start]This result confirms the effectiveness of the proposed feature selection method combining keyword extraction and BoW. [cite: 284, 297]
83
+
84
+ Identified limitations include:
85
+ * [cite_start]The stop-words list was built subjectively. [cite: 299]
86
+ * [cite_start]The corpus has ambiguities between topics. [cite: 299]
87
+ * [cite_start]Word segmentation is limited by a third-party library. [cite: 301]
88
+
89
+ [cite_start]Future work will focus on improving the Neural Network's accuracy, addressing preprocessing disadvantages, and incorporating more semantic and contextual features. [cite: 302]
90
+
91
+ ### Application of Research
92
+
93
+ [cite_start]The results of this research were applied in Viblo, a technical knowledge-sharing service, to automatically classify posts upon publication. [cite: 304]
94
+
95
+ ---
96
+
97
+ ## References
98
+
99
+ [1] B. Alexander, S. Thorsen, "A sentiment-based chat hot." (2013)[cite_start]. [cite: 312]
100
+ [2] Mooney. J. Raymond, Roy. [cite_start]Loriene, "Content-based book recommending using learning for text categorization," Proc. of the 5th ACM conference on Digital libraries, ACM, 2000. [cite: 313, 314]
101
+ [3] D. Dinh, V. Thuy. "A maximum entropy approach for Vietnamese word segmentation." Research, Innovation and Vision for the Future. [cite_start]International Conference on IEEE, 2006. [cite: 315, 316]
102
+ [4] D. Dien, H. Kiem, N.V.Toan, "Vietnamese Word Segmentation" Proc. of the 6th Natural Language Processing Pacific Rim Symposium, Tokyo. [cite_start]Japan, pp.749-756, 2001. [cite: 317, 318]
103
+ [5] Y. Yang and X. Liu. A re-examination of text categorization methods. In 22nd Annual International SIGIR, pp. 42-49, Berkley. [cite_start]August 1999. [cite: 319, 320]
104
+ [6] F. Sebastiani. Machine learning in automated text categorisation: a survey. [cite_start]Technical Report IEI-B4-31-1999, Istituto di Elaborazione dell'Informazione, Consiglio Nazionale delle Ricerche, 1999. [cite: 321, 322]
105
+ [7] Yang, Y. 1994. Expert network: effective and efficient learning from human decisions in text categorization and retrieval. [cite_start]In Proceedings of SIGIR-94, 17th ACM International Conference on Research and Development in Information Retrieval (Dublin, IE, 1994), pp. 13-22. [cite: 323, 324]
106
+ [8] Thorsten Joachims. "Text Categorization with Support Vector Machines: Learning with Many Relevant Features." [cite_start]Proc. of ECML-98, 10th European Conference on Machine Learning, No. 1398, pp. 137-142. [cite: 325, 326]
107
+ [9] Z. Xiang, J. Zhao, Y. LeCun, "Character-level convolutional networks for text classification." Advances in neural information processing systems. [cite_start]2015. [cite: 329, 330]
108
+ [10] H. V. C. Duy, et al. [cite_start]"A comparative study on vietnamese text classification methods," International Conf. on Research, Innovation and Vision for the Future, 2007. [cite: 331, 332]
109
+ [11] S. Fabrizio. "Machine learning in automated text categorization." ACM computing surveys (CSUR), no. 34, vol. [cite_start]1, pp. 1-47, 2002. [cite: 334]
110
+ [12] Hung Nguyen, Ha Nguyen, Thuc Vu, Nghĩa Tran, and Kiem Hoang. 2005. Internet and Genetics Algorithm-based Text Categorization for Documents in Vietnamese. [cite_start]Proceedings of 4th IEEE International Conference on Computer Science Research, Innovation and Vision of the Future, 2006. [cite: 335, 336, 337]
111
+ [13] D. Gunawan, et al. "Automatic Text Summarization for Indonesian Language Using TextTeaser." IOP Conf. Series: Materials Science and Engineering, vol. 190. no. [cite_start]1, 2017. [cite: 338, 339]
112
+ [14] L. N. Minh, et al. "VNLP: an open source framework for Vietnamese natural language processing." [cite_start]Proc. of the Fourth Symposium on Information and Communication Technology, 2013. [cite: 340, 341]
113
+ [15] L. Breiman, "Random forests." [cite_start]UC Berkeley TR567, 1999. [cite: 342]
114
+ [cite_start][16] V. Vapnik, "Estimations of dependencies based on statistical data," Springer, 1982. [cite: 343]
115
+ [cite_start][17] C. Cortes, V. Vapnik, "Support-vector networks. Machine Learning," 20: pp. 273-297, 1995. [cite: 347]
116
+ [18] C.Koby, Y. Singer, "On the algorithmic implementation of multiclass kernel-based vector machines." [cite_start]J. of machine learning research, pp. 265-292, 2001. [cite: 348, 349]
117
+ [19] O. Guobin. Y. L. Murphey, "Multi-class pattern classification using neural networks." Pattern Recognition, vol. 40, no. 1. [cite_start]pp. 4-18, 2007. [cite: 350, 351]
118
+ [20] Yin, Xinyou, et al. "A flexible sigmoid function of determinate growth," Annals of botany, vol. 91, no. 3. [cite_start]pp. 361-371, 2003. [cite: 351, 352]
119
+ [21] G. Xavier, A. Bordes, Y. Bengio, "Deep sparse rectifier neural networks." [cite_start]Proc. of the Fourteenth International Conference on Artificial Intelligence and Statistics, 2011. [cite: 353, 354]
120
+ [22] B. Léon. "Large-scale machine learning with stochastic gradient descent." [cite_start]Proc of COMPSTAT 2010, pp. 177-186, 2010. [cite: 355, 356]
121
+ [23] K. Bekir, A. V. Olgac. "Performance analysis of various activation functions in generalized MLP architectures of neural networks." International J. of Artificial Intelligence and Expert Systems, vol. 1, no. [cite_start]4 pp. 111-122, 2011. [cite: 357, 358]
122
+ [24] F. Sebastiani, "Machine Learning in Automated Text Categorization," ACM Computing Surveys, vol. 34, no. 1. [cite_start]pp.1-47, 2002. [cite: 359]
123
+ [25] A. M. Salih, et al. "Modified extraction 2-thiobarbituric acid method for measuring lipid oxidation in poultry." Poultry Science, vol. 66, no. 9. [cite_start]pp. 1483-1488, 1987. [cite: 360, 361]
train.py CHANGED
@@ -23,6 +23,10 @@ from sklearn.metrics import accuracy_score, classification_report, confusion_mat
23
  from sklearn.model_selection import train_test_split
24
  from sklearn.pipeline import Pipeline
25
  from sklearn.svm import SVC
 
 
 
 
26
  import joblib
27
 
28
 
@@ -101,14 +105,37 @@ def load_vntc_data(split_ratio=0.2, random_state=42, n_samples=None):
101
  y_test.append(label)
102
  X_test.append(text)
103
 
104
- # Apply sample limit if specified
105
  if n_samples:
106
  if n_samples < len(X_train):
107
- X_train = X_train[:n_samples]
108
- y_train = y_train[:n_samples]
 
 
 
 
 
 
 
 
 
 
 
 
109
  if n_samples < len(X_test):
110
- X_test = X_test[:n_samples]
111
- y_test = y_test[:n_samples]
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  # Convert to numpy arrays
114
  X_train = np.array(X_train)
@@ -181,8 +208,22 @@ def load_uts2017_data(split_ratio=0.2, random_state=42, n_samples=None):
181
  def get_available_models():
182
  """Get available classifier options"""
183
  return {
 
184
  "logistic": LogisticRegression(max_iter=1000, random_state=42),
185
- "svc": SVC(kernel="linear", random_state=42, probability=True),
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  }
187
 
188
 
@@ -380,25 +421,44 @@ def train_model(
380
  return metadata
381
 
382
 
383
- def train_all_configurations():
384
  """Train multiple model configurations and compare results"""
385
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
386
- run_dir = setup_logging(f"comparison_{timestamp}")
387
 
388
  logging.info(f"Starting comparison run: {timestamp}")
389
-
390
- # Define configurations to test
391
- configurations = [
392
- {"model_name": "logistic", "max_features": 10000, "ngram_range": (1, 1)},
393
- {"model_name": "logistic", "max_features": 10000, "ngram_range": (1, 2)},
394
- {"model_name": "logistic", "max_features": 20000, "ngram_range": (1, 1)},
395
- {"model_name": "logistic", "max_features": 20000, "ngram_range": (1, 2)},
396
- {"model_name": "logistic", "max_features": 30000, "ngram_range": (1, 2)},
397
- {"model_name": "svc", "max_features": 10000, "ngram_range": (1, 1)},
398
- {"model_name": "svc", "max_features": 10000, "ngram_range": (1, 2)},
399
- {"model_name": "svc", "max_features": 20000, "ngram_range": (1, 1)},
400
- {"model_name": "svc", "max_features": 20000, "ngram_range": (1, 2)},
401
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
 
403
  results = []
404
 
@@ -491,7 +551,7 @@ def main():
491
  parser.add_argument(
492
  "--model",
493
  type=str,
494
- choices=["logistic", "svc"],
495
  default="logistic",
496
  help="Model type to train (default: logistic)",
497
  )
@@ -511,16 +571,29 @@ def main():
511
  "--split-ratio", type=float, default=0.2, help="Test split ratio (default: 0.2)"
512
  )
513
  parser.add_argument(
514
- "--n-samples",
515
  type=int,
516
  default=None,
517
- help="Limit number of samples for quick testing (default: None - use all data)",
518
  )
519
  parser.add_argument(
520
  "--compare",
521
  action="store_true",
522
  help="Train and compare multiple configurations",
523
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
524
 
525
  # Use parse_known_args to ignore Jupyter/Colab kernel arguments
526
  args, unknown = parser.parse_known_args()
@@ -529,9 +602,19 @@ def main():
529
  if in_notebook and unknown:
530
  print(f"Note: Running in Jupyter/Colab environment. Ignoring kernel arguments: {unknown}")
531
 
532
- if args.compare:
533
- print("Training and comparing multiple configurations...")
534
- train_all_configurations()
 
 
 
 
 
 
 
 
 
 
535
  else:
536
  dataset_name = "VNTC" if args.dataset == "vntc" else "UTS2017_Bank"
537
  print(f"Training {args.model} model on {dataset_name} dataset...")
@@ -545,7 +628,7 @@ def main():
545
  max_features=args.max_features,
546
  ngram_range=(args.ngram_min, args.ngram_max),
547
  split_ratio=args.split_ratio,
548
- n_samples=args.n_samples,
549
  )
550
 
551
 
 
23
  from sklearn.model_selection import train_test_split
24
  from sklearn.pipeline import Pipeline
25
  from sklearn.svm import SVC
26
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
27
+ from sklearn.naive_bayes import MultinomialNB
28
+ from sklearn.neural_network import MLPClassifier
29
+ from sklearn.tree import DecisionTreeClassifier
30
  import joblib
31
 
32
 
 
105
  y_test.append(label)
106
  X_test.append(text)
107
 
108
+ # Apply sample limit if specified with stratified sampling
109
  if n_samples:
110
  if n_samples < len(X_train):
111
+ # Use stratified sampling to maintain class distribution
112
+ X_train_array = np.array(X_train)
113
+ y_train_array = np.array(y_train)
114
+ indices = np.arange(len(X_train))
115
+
116
+ # Shuffle to get random stratified sample
117
+ np.random.seed(42)
118
+ shuffled_indices = np.random.permutation(indices)
119
+
120
+ # Take first n_samples
121
+ sample_indices = shuffled_indices[:n_samples]
122
+ X_train = X_train_array[sample_indices].tolist()
123
+ y_train = y_train_array[sample_indices].tolist()
124
+
125
  if n_samples < len(X_test):
126
+ # Use stratified sampling for test set too
127
+ X_test_array = np.array(X_test)
128
+ y_test_array = np.array(y_test)
129
+ indices = np.arange(len(X_test))
130
+
131
+ # Shuffle to get random stratified sample
132
+ np.random.seed(42)
133
+ shuffled_indices = np.random.permutation(indices)
134
+
135
+ # Take first n_samples
136
+ sample_indices = shuffled_indices[:n_samples]
137
+ X_test = X_test_array[sample_indices].tolist()
138
+ y_test = y_test_array[sample_indices].tolist()
139
 
140
  # Convert to numpy arrays
141
  X_train = np.array(X_train)
 
208
  def get_available_models():
209
  """Get available classifier options"""
210
  return {
211
+ # Traditional algorithms
212
  "logistic": LogisticRegression(max_iter=1000, random_state=42),
213
+ "svc_linear": SVC(kernel="linear", random_state=42, probability=True),
214
+ "svc_rbf": SVC(kernel="rbf", random_state=42, probability=True, gamma='scale'),
215
+ "naive_bayes": MultinomialNB(),
216
+
217
+ # Tree-based algorithms
218
+ "decision_tree": DecisionTreeClassifier(random_state=42, max_depth=10),
219
+ "random_forest": RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10, n_jobs=-1),
220
+
221
+ # Boosting algorithms
222
+ "gradient_boost": GradientBoostingClassifier(n_estimators=100, random_state=42, max_depth=5),
223
+ "ada_boost": AdaBoostClassifier(n_estimators=100, random_state=42),
224
+
225
+ # Neural network
226
+ "mlp": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42, early_stopping=True),
227
  }
228
 
229
 
 
421
  return metadata
422
 
423
 
424
+ def train_all_configurations(dataset="vntc", models=None, num_rows=None):
425
  """Train multiple model configurations and compare results"""
426
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
427
+ run_dir = setup_logging(timestamp)
428
 
429
  logging.info(f"Starting comparison run: {timestamp}")
430
+ logging.info(f"Dataset: {dataset}")
431
+ if num_rows:
432
+ logging.info(f"Sample limit: {num_rows}")
433
+
434
+ if models is None:
435
+ # Define all available models for comparison
436
+ available_models = get_available_models()
437
+ models = list(available_models.keys())
438
+
439
+ logging.info(f"Models to compare: {models}")
440
+
441
+ # Define configurations to test - focusing on best performing settings
442
+ configurations = []
443
+ for model_name in models:
444
+ if model_name in ["svc_rbf", "gradient_boost", "ada_boost", "mlp"]:
445
+ # Use fewer features for computationally expensive models
446
+ configurations.append({
447
+ "dataset": dataset,
448
+ "model_name": model_name,
449
+ "max_features": 10000,
450
+ "ngram_range": (1, 2),
451
+ "n_samples": num_rows
452
+ })
453
+ else:
454
+ # Use more features for faster models
455
+ configurations.append({
456
+ "dataset": dataset,
457
+ "model_name": model_name,
458
+ "max_features": 20000,
459
+ "ngram_range": (1, 2),
460
+ "n_samples": num_rows
461
+ })
462
 
463
  results = []
464
 
 
551
  parser.add_argument(
552
  "--model",
553
  type=str,
554
+ choices=["logistic", "svc_linear", "svc_rbf", "naive_bayes", "decision_tree", "random_forest", "gradient_boost", "ada_boost", "mlp"],
555
  default="logistic",
556
  help="Model type to train (default: logistic)",
557
  )
 
571
  "--split-ratio", type=float, default=0.2, help="Test split ratio (default: 0.2)"
572
  )
573
  parser.add_argument(
574
+ "--num-rows",
575
  type=int,
576
  default=None,
577
+ help="Limit number of rows/samples for quick testing (default: None - use all data)",
578
  )
579
  parser.add_argument(
580
  "--compare",
581
  action="store_true",
582
  help="Train and compare multiple configurations",
583
  )
584
+ parser.add_argument(
585
+ "--compare-models",
586
+ nargs="+",
587
+ help="List of specific models to compare (e.g., --compare-models logistic random_forest svc_rbf)",
588
+ choices=["logistic", "svc_linear", "svc_rbf", "naive_bayes", "decision_tree", "random_forest", "gradient_boost", "ada_boost", "mlp"]
589
+ )
590
+ parser.add_argument(
591
+ "--compare-dataset",
592
+ type=str,
593
+ choices=["vntc", "uts2017"],
594
+ default="vntc",
595
+ help="Dataset to use for model comparison (default: vntc)"
596
+ )
597
 
598
  # Use parse_known_args to ignore Jupyter/Colab kernel arguments
599
  args, unknown = parser.parse_known_args()
 
602
  if in_notebook and unknown:
603
  print(f"Note: Running in Jupyter/Colab environment. Ignoring kernel arguments: {unknown}")
604
 
605
+ if args.compare or args.compare_models:
606
+ if args.compare_models:
607
+ print(f"Training and comparing selected models: {args.compare_models}")
608
+ print(f"Dataset: {args.compare_dataset}")
609
+ if args.num_rows:
610
+ print(f"Using {args.num_rows} rows per dataset")
611
+ train_all_configurations(dataset=args.compare_dataset, models=args.compare_models, num_rows=args.num_rows)
612
+ else:
613
+ print("Training and comparing all available models...")
614
+ print(f"Dataset: {args.compare_dataset}")
615
+ if args.num_rows:
616
+ print(f"Using {args.num_rows} rows per dataset")
617
+ train_all_configurations(dataset=args.compare_dataset, num_rows=args.num_rows)
618
  else:
619
  dataset_name = "VNTC" if args.dataset == "vntc" else "UTS2017_Bank"
620
  print(f"Training {args.model} model on {dataset_name} dataset...")
 
628
  max_features=args.max_features,
629
  ngram_range=(args.ngram_min, args.ngram_max),
630
  split_ratio=args.split_ratio,
631
+ n_samples=args.num_rows,
632
  )
633
 
634