Vu Anh
Claude
committed on
Commit
·
598035a
1
Parent(s):
16e985c
Update analysis results and system card with SVC model performance
Browse files
- Added SVC model results achieving 92.80% accuracy on VNTC dataset
- Updated paper with comparative analysis between SVC and Logistic Regression
- Included training time comparisons showing accuracy-efficiency trade-offs
- Enhanced results presentation in analyze_results.py
- Added changelog entry for September 28, 2025
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <[email protected]>
- analyze_results.py +2 -3
- paper/sonar_core_1_system_card.tex +17 -7
analyze_results.py
CHANGED
|
@@ -5,7 +5,6 @@ Script to analyze and compare training results from multiple model runs.
|
|
| 5 |
|
| 6 |
import json
|
| 7 |
import os
|
| 8 |
-
import glob
|
| 9 |
from pathlib import Path
|
| 10 |
|
| 11 |
def load_metadata(run_dir):
|
|
@@ -75,7 +74,7 @@ def print_comparison_table(results):
|
|
| 75 |
bank_results = [r for r in results if r['dataset'] == 'UTS2017_Bank']
|
| 76 |
|
| 77 |
if bank_results:
|
| 78 |
-
print(
|
| 79 |
print("-"*120)
|
| 80 |
print(f"{'Model':<20} {'Features':<10} {'N-gram':<10} {'Train Acc':<12} {'Test Acc':<12} {'Train Time':<12} {'Pred Time':<12}")
|
| 81 |
print("-"*120)
|
|
@@ -121,7 +120,7 @@ def main():
|
|
| 121 |
vntc_results = [r for r in results if r['dataset'] == 'VNTC']
|
| 122 |
bank_results = [r for r in results if r['dataset'] == 'UTS2017_Bank']
|
| 123 |
|
| 124 |
-
print(
|
| 125 |
print(f"- VNTC runs: {len(vntc_results)}")
|
| 126 |
print(f"- UTS2017_Bank runs: {len(bank_results)}")
|
| 127 |
|
|
|
|
| 5 |
|
| 6 |
import json
|
| 7 |
import os
|
|
|
|
| 8 |
from pathlib import Path
|
| 9 |
|
| 10 |
def load_metadata(run_dir):
|
|
|
|
| 74 |
bank_results = [r for r in results if r['dataset'] == 'UTS2017_Bank']
|
| 75 |
|
| 76 |
if bank_results:
|
| 77 |
+
print("\nUTS2017_Bank Dataset (Vietnamese Banking Text Classification):")
|
| 78 |
print("-"*120)
|
| 79 |
print(f"{'Model':<20} {'Features':<10} {'N-gram':<10} {'Train Acc':<12} {'Test Acc':<12} {'Train Time':<12} {'Pred Time':<12}")
|
| 80 |
print("-"*120)
|
|
|
|
| 120 |
vntc_results = [r for r in results if r['dataset'] == 'VNTC']
|
| 121 |
bank_results = [r for r in results if r['dataset'] == 'UTS2017_Bank']
|
| 122 |
|
| 123 |
+
print("\nSummary:")
|
| 124 |
print(f"- VNTC runs: {len(vntc_results)}")
|
| 125 |
print(f"- UTS2017_Bank runs: {len(bank_results)}")
|
| 126 |
|
paper/sonar_core_1_system_card.tex
CHANGED
|
@@ -23,7 +23,7 @@
|
|
| 23 |
\maketitle
|
| 24 |
|
| 25 |
\begin{abstract}
|
| 26 |
-
This paper presents Sonar Core 1, a Vietnamese text classification system employing Term Frequency-Inverse Document Frequency (TF-IDF) feature extraction combined with
|
| 27 |
\end{abstract}
|
| 28 |
|
| 29 |
\section{Introduction}
|
|
@@ -177,17 +177,18 @@ VNTC (10 topics) & Toan et al. (2017) - Neural Network & 99.75\% \\
|
|
| 177 |
VNTC (10 topics) & Toan et al. (2017) - SVC & 99.22\% \\
|
| 178 |
VNTC (10 topics) & Toan et al. (2017) - Random Forest & 99.21\% \\
|
| 179 |
VNTC (10 topics) & Toan et al. (2017) - SVM & 96.52\% \\
|
| 180 |
-
VNTC (10 topics) & \textbf{Sonar Core 1 - TF-IDF
|
|
|
|
| 181 |
\hline
|
| 182 |
VNTC (27 topics) & Toan et al. (2017) - Neural Network & 99.69\% \\
|
| 183 |
VNTC (27 topics) & Toan et al. (2017) - SVC & 99.65\% \\
|
| 184 |
VNTC (27 topics) & Toan et al. (2017) - Random Forest & 99.25\% \\
|
| 185 |
VNTC (27 topics) & Toan et al. (2017) - SVM & 97.80\% \\
|
| 186 |
\hline
|
| 187 |
-
UTS2017\_Bank (14 topics) & \textbf{Sonar Core 1 - TF-IDF
|
| 188 |
\hline
|
| 189 |
\end{tabular}
|
| 190 |
-
\caption{Comprehensive performance comparison between TF-IDF
|
| 191 |
\label{tab:comprehensive_comparison}
|
| 192 |
\end{table}
|
| 193 |
|
|
@@ -202,9 +203,11 @@ This section presents comprehensive experimental results across both Vietnamese
|
|
| 202 |
\textbf{VNTC Dataset (News Classification):}
|
| 203 |
The system demonstrates robust performance on the VNTC news classification dataset:
|
| 204 |
\begin{itemize}
|
| 205 |
-
\item \textbf{Test Classification Accuracy}: 92.
|
| 206 |
-
\item \textbf{
|
| 207 |
-
\item \textbf{
|
|
|
|
|
|
|
| 208 |
\item \textbf{Macro Average F1-Score}: 0.91
|
| 209 |
\item \textbf{Weighted Average F1-Score}: 0.92
|
| 210 |
\end{itemize}
|
|
@@ -404,6 +407,13 @@ The authors acknowledge the contributions of the VNTC and UTS2017\_Bank dataset
|
|
| 404 |
|
| 405 |
\section{Changelog}
|
| 406 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
\textbf{2025-09-27}
|
| 408 |
\begin{itemize}
|
| 409 |
\item Added support for UTS2017\_Bank Vietnamese banking text classification dataset
|
|
|
|
| 23 |
\maketitle
|
| 24 |
|
| 25 |
\begin{abstract}
|
| 26 |
+
This paper presents Sonar Core 1, a Vietnamese text classification system employing Term Frequency-Inverse Document Frequency (TF-IDF) feature extraction combined with multiple classification algorithms. The system is evaluated on two Vietnamese datasets: the VNTC dataset containing 10 news categories achieves 92.80\% accuracy with Support Vector Classification (SVC) and 92.33\% with logistic regression, while the UTS2017\_Bank dataset spanning 14 banking service categories achieves 70.96\% accuracy with logistic regression. The implementation utilizes a 20,000-dimensional TF-IDF feature space with n-gram analysis and incorporates hash-based caching for computational optimization. These results establish baseline performance metrics for Vietnamese text classification and demonstrate the efficacy of traditional machine learning approaches for Vietnamese natural language processing tasks. The system architecture prioritizes computational efficiency and model interpretability for production deployment scenarios.
|
| 27 |
\end{abstract}
|
| 28 |
|
| 29 |
\section{Introduction}
|
|
|
|
| 177 |
VNTC (10 topics) & Toan et al. (2017) - SVC & 99.22\% \\
|
| 178 |
VNTC (10 topics) & Toan et al. (2017) - Random Forest & 99.21\% \\
|
| 179 |
VNTC (10 topics) & Toan et al. (2017) - SVM & 96.52\% \\
|
| 180 |
+
VNTC (10 topics) & \textbf{Sonar Core 1 - SVC with TF-IDF} & \textbf{92.80\%} \\
|
| 181 |
+
VNTC (10 topics) & \textbf{Sonar Core 1 - Logistic Regression with TF-IDF} & \textbf{92.33\%} \\
|
| 182 |
\hline
|
| 183 |
VNTC (27 topics) & Toan et al. (2017) - Neural Network & 99.69\% \\
|
| 184 |
VNTC (27 topics) & Toan et al. (2017) - SVC & 99.65\% \\
|
| 185 |
VNTC (27 topics) & Toan et al. (2017) - Random Forest & 99.25\% \\
|
| 186 |
VNTC (27 topics) & Toan et al. (2017) - SVM & 97.80\% \\
|
| 187 |
\hline
|
| 188 |
+
UTS2017\_Bank (14 topics) & \textbf{Sonar Core 1 - Logistic Regression with TF-IDF} & \textbf{70.96\%} \\
|
| 189 |
\hline
|
| 190 |
\end{tabular}
|
| 191 |
+
\caption{Comprehensive performance comparison between TF-IDF-based approaches and established methods from \citet{toan2017vietnamese} on Vietnamese text classification tasks, grouped by dataset categories.}
|
| 192 |
\label{tab:comprehensive_comparison}
|
| 193 |
\end{table}
|
| 194 |
|
|
|
|
| 203 |
\textbf{VNTC Dataset (News Classification):}
|
| 204 |
The system demonstrates robust performance on the VNTC news classification dataset:
|
| 205 |
\begin{itemize}
|
| 206 |
+
\item \textbf{Best Test Classification Accuracy (SVC)}: 92.80\%
|
| 207 |
+
\item \textbf{Logistic Regression Test Accuracy}: 92.33\%
|
| 208 |
+
\item \textbf{Training Latency (Logistic Regression)}: 31.9 seconds (optimized with hash-based caching)
|
| 209 |
+
\item \textbf{Training Latency (SVC)}: 3,278.4 seconds
|
| 210 |
+
\item \textbf{Inference Latency (Logistic Regression)}: 24.5 seconds for 50,373 test samples (0.49 ms per sample)
|
| 211 |
\item \textbf{Macro Average F1-Score}: 0.91
|
| 212 |
\item \textbf{Weighted Average F1-Score}: 0.92
|
| 213 |
\end{itemize}
|
|
|
|
| 407 |
|
| 408 |
\section{Changelog}
|
| 409 |
|
| 410 |
+
\textbf{2025-09-28}
|
| 411 |
+
\begin{itemize}
|
| 412 |
+
\item Added SVC model evaluation achieving 92.80\% accuracy on VNTC dataset
|
| 413 |
+
\item Completed comparative analysis of multiple classification algorithms
|
| 414 |
+
\item Updated performance benchmarks with SVC outperforming Logistic Regression
|
| 415 |
+
\end{itemize}
|
| 416 |
+
|
| 417 |
\textbf{2025-09-27}
|
| 418 |
\begin{itemize}
|
| 419 |
\item Added support for UTS2017\_Bank Vietnamese banking text classification dataset
|