Spaces:
Building
Building
emuTAALES
Browse files- config/reference_lists.yaml +17 -2
- debug_bigram_trigram.py +78 -0
- debug_plot_columns.py +107 -0
- test/test_advanced_selection.py +174 -0
- test/test_column_naming_fix.py +95 -0
- test_file_upload_handler.py → test/test_file_upload_handler.py +0 -0
- test_fix_403.py → test/test_fix_403.py +0 -0
- test/test_log_transform_consistency.py +121 -0
- test_memory_upload.py → test/test_memory_upload.py +0 -0
- test_tmp_upload.py → test/test_tmp_upload.py +0 -0
- test_column_matching.py +50 -0
- test_csv_comma_handling.py +72 -0
- test_plot_fix.py +106 -0
- test_reference_loading_issue.py +116 -0
- text_analyzer/corpus_visualizer.py +3 -1
- text_analyzer/frequency_analyzer.py +3 -1
- text_analyzer/lexical_sophistication.py +238 -105
- web_app/app.py +1 -1
- web_app/components/ui_components.py +148 -22
- web_app/config_manager.py +8 -3
- web_app/handlers/analysis_handlers.py +221 -103
- web_app/handlers/frequency_handlers.py +3 -1
- web_app/utils/memory_file_handler.py +4 -2
config/reference_lists.yaml
CHANGED
|
@@ -88,7 +88,7 @@ english:
|
|
| 88 |
measure_classifications:
|
| 89 |
concreteness: psycholinguistic
|
| 90 |
header_prefix: '#'
|
| 91 |
-
|
| 92 |
concreteness_ratings_lemma:
|
| 93 |
display_name: Concreteness Ratings (Lemma)
|
| 94 |
description: Concreteness ratings for English words (1-5 scale) - lemma-based
|
|
@@ -241,6 +241,7 @@ english:
|
|
| 241 |
normalized_freq: frequency
|
| 242 |
documents: range
|
| 243 |
range: range
|
|
|
|
| 244 |
COCA_spoken_bigram_frequency_lemma:
|
| 245 |
display_name: COCA Spoken Bigram Frequency (Lemma)
|
| 246 |
description: Bigram frequencies and range data - lemma-based analysis
|
|
@@ -270,6 +271,7 @@ english:
|
|
| 270 |
normalized_freq: frequency
|
| 271 |
documents: range
|
| 272 |
range: range
|
|
|
|
| 273 |
COCA_spoken_bigram_association_token:
|
| 274 |
display_name: COCA Spoken Bigram Associations (Token)
|
| 275 |
description: Bigram association measures (MI, T-score, Delta P) - token-based
|
|
@@ -308,13 +310,21 @@ english:
|
|
| 308 |
t_score: association
|
| 309 |
delta_p: association
|
| 310 |
ap_collex: association
|
|
|
|
| 311 |
COCA_spoken_bigram_association_lemma:
|
| 312 |
display_name: COCA Spoken Bigram Associations (Lemma)
|
| 313 |
description: Bigram association measures (MI, T-score, Delta P) - lemma-based
|
| 314 |
analysis
|
| 315 |
file: resources/reference_lists/en/spoken_bigram_lemma_contingency.csv
|
| 316 |
format: csv
|
| 317 |
-
columns:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
has_header: true
|
| 319 |
enabled: true
|
| 320 |
analysis_type: lemma
|
|
@@ -339,6 +349,7 @@ english:
|
|
| 339 |
t_score: association
|
| 340 |
delta_p: association
|
| 341 |
ap_collex: association
|
|
|
|
| 342 |
COCA_magazine_bigram_frequency_token:
|
| 343 |
display_name: COCA Magazine Bigram Frequency (Token)
|
| 344 |
description: Bigram frequencies and range data in Magazine - token-based analysis
|
|
@@ -373,6 +384,7 @@ english:
|
|
| 373 |
normalized_freq: frequency
|
| 374 |
documents: range
|
| 375 |
range: range
|
|
|
|
| 376 |
COCA_magazine_bigram_frequency_lemma:
|
| 377 |
display_name: COCA Magazine Bigram Frequency (Lemma)
|
| 378 |
description: Bigram frequencies and range data in Magazine - lemma-based analysis
|
|
@@ -402,6 +414,7 @@ english:
|
|
| 402 |
normalized_freq: frequency
|
| 403 |
documents: range
|
| 404 |
range: range
|
|
|
|
| 405 |
COCA_magazine_bigram_association_token:
|
| 406 |
display_name: COCA Magazine Bigram Associations (Token)
|
| 407 |
description: Bigram association measures (MI, T-score, Delta P) - token-based
|
|
@@ -971,6 +984,7 @@ japanese:
|
|
| 971 |
pos: unknown
|
| 972 |
frequency: frequency
|
| 973 |
japanese_corpus: true
|
|
|
|
| 974 |
jp_frequency_token:
|
| 975 |
display_name: Japanese Frequency List (Token)
|
| 976 |
description: Frequency data for Japanese words - token-based analysis
|
|
@@ -992,6 +1006,7 @@ japanese:
|
|
| 992 |
- frequency
|
| 993 |
measure_classifications:
|
| 994 |
frequency: frequency
|
|
|
|
| 995 |
jp_frequency_lemma:
|
| 996 |
display_name: Japanese Frequency List (Lemma)
|
| 997 |
description: Frequency data for Japanese words - lemma-based analysis
|
|
|
|
| 88 |
measure_classifications:
|
| 89 |
concreteness: psycholinguistic
|
| 90 |
header_prefix: '#'
|
| 91 |
+
|
| 92 |
concreteness_ratings_lemma:
|
| 93 |
display_name: Concreteness Ratings (Lemma)
|
| 94 |
description: Concreteness ratings for English words (1-5 scale) - lemma-based
|
|
|
|
| 241 |
normalized_freq: frequency
|
| 242 |
documents: range
|
| 243 |
range: range
|
| 244 |
+
|
| 245 |
COCA_spoken_bigram_frequency_lemma:
|
| 246 |
display_name: COCA Spoken Bigram Frequency (Lemma)
|
| 247 |
description: Bigram frequencies and range data - lemma-based analysis
|
|
|
|
| 271 |
normalized_freq: frequency
|
| 272 |
documents: range
|
| 273 |
range: range
|
| 274 |
+
|
| 275 |
COCA_spoken_bigram_association_token:
|
| 276 |
display_name: COCA Spoken Bigram Associations (Token)
|
| 277 |
description: Bigram association measures (MI, T-score, Delta P) - token-based
|
|
|
|
| 310 |
t_score: association
|
| 311 |
delta_p: association
|
| 312 |
ap_collex: association
|
| 313 |
+
|
| 314 |
COCA_spoken_bigram_association_lemma:
|
| 315 |
display_name: COCA Spoken Bigram Associations (Lemma)
|
| 316 |
description: Bigram association measures (MI, T-score, Delta P) - lemma-based
|
| 317 |
analysis
|
| 318 |
file: resources/reference_lists/en/spoken_bigram_lemma_contingency.csv
|
| 319 |
format: csv
|
| 320 |
+
columns:
|
| 321 |
+
bigram: 0
|
| 322 |
+
frequency: 1
|
| 323 |
+
mi_score: 5
|
| 324 |
+
mi_2_score: 6
|
| 325 |
+
t_score: 7
|
| 326 |
+
delta_p: 8
|
| 327 |
+
ap_collex: 9
|
| 328 |
has_header: true
|
| 329 |
enabled: true
|
| 330 |
analysis_type: lemma
|
|
|
|
| 349 |
t_score: association
|
| 350 |
delta_p: association
|
| 351 |
ap_collex: association
|
| 352 |
+
|
| 353 |
COCA_magazine_bigram_frequency_token:
|
| 354 |
display_name: COCA Magazine Bigram Frequency (Token)
|
| 355 |
description: Bigram frequencies and range data in Magazine - token-based analysis
|
|
|
|
| 384 |
normalized_freq: frequency
|
| 385 |
documents: range
|
| 386 |
range: range
|
| 387 |
+
|
| 388 |
COCA_magazine_bigram_frequency_lemma:
|
| 389 |
display_name: COCA Magazine Bigram Frequency (Lemma)
|
| 390 |
description: Bigram frequencies and range data in Magazine - lemma-based analysis
|
|
|
|
| 414 |
normalized_freq: frequency
|
| 415 |
documents: range
|
| 416 |
range: range
|
| 417 |
+
|
| 418 |
COCA_magazine_bigram_association_token:
|
| 419 |
display_name: COCA Magazine Bigram Associations (Token)
|
| 420 |
description: Bigram association measures (MI, T-score, Delta P) - token-based
|
|
|
|
| 984 |
pos: unknown
|
| 985 |
frequency: frequency
|
| 986 |
japanese_corpus: true
|
| 987 |
+
|
| 988 |
jp_frequency_token:
|
| 989 |
display_name: Japanese Frequency List (Token)
|
| 990 |
description: Frequency data for Japanese words - token-based analysis
|
|
|
|
| 1006 |
- frequency
|
| 1007 |
measure_classifications:
|
| 1008 |
frequency: frequency
|
| 1009 |
+
|
| 1010 |
jp_frequency_lemma:
|
| 1011 |
display_name: Japanese Frequency List (Lemma)
|
| 1012 |
description: Frequency data for Japanese words - lemma-based analysis
|
debug_bigram_trigram.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Debug script to test bigram and trigram processing
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# Add the project root to the path
|
| 10 |
+
sys.path.insert(0, os.getcwd())
|
| 11 |
+
|
| 12 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 13 |
+
from web_app.config_manager import ConfigManager
|
| 14 |
+
|
| 15 |
+
# Test simple text
|
| 16 |
+
test_text = "The cat sat on the mat. The dog ran quickly."
|
| 17 |
+
|
| 18 |
+
# Create analyzer
|
| 19 |
+
analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
|
| 20 |
+
|
| 21 |
+
# Load config
|
| 22 |
+
config = ConfigManager.load_reference_config()
|
| 23 |
+
english_config = config.get('english', {})
|
| 24 |
+
|
| 25 |
+
print("=== Available Reference Lists ===")
|
| 26 |
+
for ngram_type, lists in english_config.items():
|
| 27 |
+
print(f"\n{ngram_type.upper()}:")
|
| 28 |
+
for list_name, list_config in lists.items():
|
| 29 |
+
if list_config.get('enabled', True):
|
| 30 |
+
print(f" - {list_name}")
|
| 31 |
+
|
| 32 |
+
# Test loading a bigram reference
|
| 33 |
+
print("\n=== Testing Bigram Reference Loading ===")
|
| 34 |
+
bigram_config = english_config.get('bigrams', {}).get('COCA_spoken_bigram_frequency_token', {})
|
| 35 |
+
if bigram_config:
|
| 36 |
+
print(f"Config: {bigram_config}")
|
| 37 |
+
|
| 38 |
+
# Load the data
|
| 39 |
+
data = ConfigManager.load_reference_list_data(bigram_config)
|
| 40 |
+
print(f"Loaded data keys: {data.keys()}")
|
| 41 |
+
|
| 42 |
+
if 'bigram' in data:
|
| 43 |
+
bigram_df = data['bigram']
|
| 44 |
+
print(f"Bigram DataFrame shape: {bigram_df.shape}")
|
| 45 |
+
print(f"Bigram DataFrame columns: {list(bigram_df.columns)}")
|
| 46 |
+
print("First 5 bigrams:")
|
| 47 |
+
print(bigram_df.head())
|
| 48 |
+
|
| 49 |
+
# Test with full reference list structure
|
| 50 |
+
print("\n=== Testing Analyzer with Bigram References ===")
|
| 51 |
+
reference_lists = {
|
| 52 |
+
'COCA_spoken_bigram_frequency_token': ConfigManager.load_reference_list_data(bigram_config)
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
print(f"Reference lists for analyzer: {list(reference_lists.keys())}")
|
| 56 |
+
for name, data in reference_lists.items():
|
| 57 |
+
print(f" {name}: {list(data.keys())}")
|
| 58 |
+
|
| 59 |
+
# Load into analyzer
|
| 60 |
+
analyzer.load_reference_lists(reference_lists)
|
| 61 |
+
|
| 62 |
+
# Analyze text
|
| 63 |
+
results = analyzer.analyze_text(
|
| 64 |
+
test_text,
|
| 65 |
+
list(reference_lists.keys()),
|
| 66 |
+
apply_log=False
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
print("\n=== Analysis Results ===")
|
| 70 |
+
print(f"Summary keys: {list(results['summary'].keys())}")
|
| 71 |
+
print(f"Raw scores keys: {list(results['raw_scores'].keys())}")
|
| 72 |
+
print(f"Bigram details count: {len(results.get('bigram_details', []))}")
|
| 73 |
+
print(f"Trigram details count: {len(results.get('trigram_details', []))}")
|
| 74 |
+
|
| 75 |
+
if results.get('bigram_details'):
|
| 76 |
+
print("\nFirst few bigram details:")
|
| 77 |
+
for detail in results['bigram_details'][:3]:
|
| 78 |
+
print(f" {detail}")
|
debug_plot_columns.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Debug script to examine column naming issues in bigram/trigram plots
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# Add the project root to the path
|
| 10 |
+
sys.path.insert(0, os.getcwd())
|
| 11 |
+
|
| 12 |
+
from web_app.config_manager import ConfigManager
|
| 13 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 14 |
+
|
| 15 |
+
def debug_plot_columns():
|
| 16 |
+
print("=== Debugging Plot Column Names ==")
|
| 17 |
+
|
| 18 |
+
# Load config and create reference lists
|
| 19 |
+
config = ConfigManager.load_reference_config()
|
| 20 |
+
english_config = config.get('english', {})
|
| 21 |
+
|
| 22 |
+
reference_lists = {}
|
| 23 |
+
|
| 24 |
+
# Load a unigram, bigram, and trigram reference
|
| 25 |
+
unigram_config = english_config['unigrams']['COCA_spoken_frequency_token']
|
| 26 |
+
bigram_config = english_config['bigrams']['COCA_spoken_bigram_frequency_token']
|
| 27 |
+
trigram_config = english_config['trigrams']['COCA_trigram_frequency_token']
|
| 28 |
+
|
| 29 |
+
reference_lists['COCA_spoken_frequency_token'] = ConfigManager.load_reference_list_data(unigram_config)
|
| 30 |
+
reference_lists['COCA_spoken_bigram_frequency_token'] = ConfigManager.load_reference_list_data(bigram_config)
|
| 31 |
+
reference_lists['COCA_trigram_frequency_token'] = ConfigManager.load_reference_list_data(trigram_config)
|
| 32 |
+
|
| 33 |
+
# Create analyzer and analyze text
|
| 34 |
+
analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
|
| 35 |
+
analyzer.load_reference_lists(reference_lists)
|
| 36 |
+
|
| 37 |
+
test_text = "The cat sat on the mat. The dog ran quickly."
|
| 38 |
+
results = analyzer.analyze_text(test_text, list(reference_lists.keys()), apply_log=False)
|
| 39 |
+
|
| 40 |
+
print("\n=== Raw Scores Keys ===")
|
| 41 |
+
for key in results['raw_scores'].keys():
|
| 42 |
+
print(f" {key}")
|
| 43 |
+
|
| 44 |
+
print("\n=== Token Details Columns ===")
|
| 45 |
+
if results['token_details']:
|
| 46 |
+
print(f" Sample token: {list(results['token_details'][0].keys())}")
|
| 47 |
+
|
| 48 |
+
print("\n=== Bigram Details Columns ===")
|
| 49 |
+
if results['bigram_details']:
|
| 50 |
+
print(f" Sample bigram: {list(results['bigram_details'][0].keys())}")
|
| 51 |
+
|
| 52 |
+
print("\n=== Trigram Details Columns ===")
|
| 53 |
+
if results['trigram_details']:
|
| 54 |
+
print(f" Sample trigram: {list(results['trigram_details'][0].keys())}")
|
| 55 |
+
|
| 56 |
+
print("\n=== Column Matching Analysis ===")
|
| 57 |
+
|
| 58 |
+
# Test the current algorithm for bigrams
|
| 59 |
+
for key in results['raw_scores'].keys():
|
| 60 |
+
if '_bigram_' in key:
|
| 61 |
+
print(f"\nAnalyzing bigram key: {key}")
|
| 62 |
+
key_parts = key.split('_')
|
| 63 |
+
if len(key_parts) >= 3 and 'bigram' in key_parts:
|
| 64 |
+
measure_name = '_'.join(key_parts[key_parts.index('bigram') + 1:])
|
| 65 |
+
index_measure_col = f"{key_parts[0]}_{measure_name}"
|
| 66 |
+
print(f" Algorithm expects column: '{index_measure_col}'")
|
| 67 |
+
|
| 68 |
+
# Check if this column exists in bigram_details
|
| 69 |
+
if results['bigram_details']:
|
| 70 |
+
sample_bigram = results['bigram_details'][0]
|
| 71 |
+
if index_measure_col in sample_bigram:
|
| 72 |
+
print(f" ✅ Column found in bigram_details")
|
| 73 |
+
else:
|
| 74 |
+
print(f" ❌ Column NOT found in bigram_details")
|
| 75 |
+
print(f" Available columns: {list(sample_bigram.keys())}")
|
| 76 |
+
|
| 77 |
+
# Try to find the correct column
|
| 78 |
+
for col in sample_bigram.keys():
|
| 79 |
+
if measure_name in col:
|
| 80 |
+
print(f" Possible match: '{col}'")
|
| 81 |
+
|
| 82 |
+
# Test the current algorithm for trigrams
|
| 83 |
+
for key in results['raw_scores'].keys():
|
| 84 |
+
if '_trigram_' in key:
|
| 85 |
+
print(f"\nAnalyzing trigram key: {key}")
|
| 86 |
+
key_parts = key.split('_')
|
| 87 |
+
if len(key_parts) >= 3 and 'trigram' in key_parts:
|
| 88 |
+
measure_name = '_'.join(key_parts[key_parts.index('trigram') + 1:])
|
| 89 |
+
index_measure_col = f"{key_parts[0]}_{measure_name}"
|
| 90 |
+
print(f" Algorithm expects column: '{index_measure_col}'")
|
| 91 |
+
|
| 92 |
+
# Check if this column exists in trigram_details
|
| 93 |
+
if results['trigram_details']:
|
| 94 |
+
sample_trigram = results['trigram_details'][0]
|
| 95 |
+
if index_measure_col in sample_trigram:
|
| 96 |
+
print(f" ✅ Column found in trigram_details")
|
| 97 |
+
else:
|
| 98 |
+
print(f" ❌ Column NOT found in trigram_details")
|
| 99 |
+
print(f" Available columns: {list(sample_trigram.keys())}")
|
| 100 |
+
|
| 101 |
+
# Try to find the correct column
|
| 102 |
+
for col in sample_trigram.keys():
|
| 103 |
+
if measure_name in col:
|
| 104 |
+
print(f" Possible match: '{col}'")
|
| 105 |
+
|
| 106 |
+
if __name__ == "__main__":
|
| 107 |
+
debug_plot_columns()
|
test/test_advanced_selection.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script to verify the advanced selection UI implementation.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Add project root to path
|
| 11 |
+
project_root = Path(__file__).parent
|
| 12 |
+
sys.path.insert(0, str(project_root))
|
| 13 |
+
|
| 14 |
+
def test_ui_components_grouping():
|
| 15 |
+
"""Test the grouping functionality of UI components."""
|
| 16 |
+
print("Testing UI Components Grouping...")
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
from web_app.components.ui_components import UIComponents
|
| 20 |
+
from web_app.config_manager import ConfigManager
|
| 21 |
+
|
| 22 |
+
# Load the configuration
|
| 23 |
+
config = ConfigManager.load_reference_config()
|
| 24 |
+
|
| 25 |
+
# Simulate reference lists
|
| 26 |
+
mock_reference_lists = {
|
| 27 |
+
'COCA_spoken_frequency_token': {},
|
| 28 |
+
'COCA_spoken_frequency_lemma': {},
|
| 29 |
+
'concreteness_ratings_token': {},
|
| 30 |
+
'concreteness_ratings_lemma': {}
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
# Test grouping function
|
| 34 |
+
groups = UIComponents._group_reference_lists(mock_reference_lists, config)
|
| 35 |
+
|
| 36 |
+
print(f"✅ Grouping successful! Found {len(groups)} groups:")
|
| 37 |
+
for base_name, group_data in groups.items():
|
| 38 |
+
token_count = len(group_data['token'])
|
| 39 |
+
lemma_count = len(group_data['lemma'])
|
| 40 |
+
print(f" - {base_name}: {token_count} token entries, {lemma_count} lemma entries")
|
| 41 |
+
|
| 42 |
+
return True
|
| 43 |
+
|
| 44 |
+
except Exception as e:
|
| 45 |
+
print(f"❌ Grouping test failed: {e}")
|
| 46 |
+
return False
|
| 47 |
+
|
| 48 |
+
def test_config_structure():
|
| 49 |
+
"""Test that the configuration has the expected structure."""
|
| 50 |
+
print("\nTesting Configuration Structure...")
|
| 51 |
+
|
| 52 |
+
try:
|
| 53 |
+
from web_app.config_manager import ConfigManager
|
| 54 |
+
|
| 55 |
+
config = ConfigManager.load_reference_config()
|
| 56 |
+
|
| 57 |
+
# Check for expected keys
|
| 58 |
+
expected_sections = ['english', 'japanese']
|
| 59 |
+
found_sections = []
|
| 60 |
+
|
| 61 |
+
for section in expected_sections:
|
| 62 |
+
if section in config:
|
| 63 |
+
found_sections.append(section)
|
| 64 |
+
print(f" ✅ Found {section} section")
|
| 65 |
+
|
| 66 |
+
# Check for subsections
|
| 67 |
+
for subsection in ['unigrams', 'bigrams', 'trigrams']:
|
| 68 |
+
if subsection in config[section]:
|
| 69 |
+
entries = len(config[section][subsection])
|
| 70 |
+
print(f" - {subsection}: {entries} entries")
|
| 71 |
+
|
| 72 |
+
if found_sections:
|
| 73 |
+
print(f"✅ Configuration structure valid!")
|
| 74 |
+
|
| 75 |
+
# Check for advanced selection fields
|
| 76 |
+
sample_entry = None
|
| 77 |
+
for lang in config.values():
|
| 78 |
+
if isinstance(lang, dict):
|
| 79 |
+
for ngram_type in lang.values():
|
| 80 |
+
if isinstance(ngram_type, dict):
|
| 81 |
+
for entry_name, entry_config in ngram_type.items():
|
| 82 |
+
sample_entry = entry_config
|
| 83 |
+
break
|
| 84 |
+
break
|
| 85 |
+
break
|
| 86 |
+
|
| 87 |
+
if sample_entry:
|
| 88 |
+
required_fields = ['selectable_measures', 'default_measures', 'default_log_transforms', 'log_transformable']
|
| 89 |
+
missing_fields = []
|
| 90 |
+
|
| 91 |
+
for field in required_fields:
|
| 92 |
+
if field not in sample_entry:
|
| 93 |
+
missing_fields.append(field)
|
| 94 |
+
else:
|
| 95 |
+
print(f" ✅ Found {field}: {sample_entry[field]}")
|
| 96 |
+
|
| 97 |
+
if missing_fields:
|
| 98 |
+
print(f" ⚠️ Missing fields: {missing_fields}")
|
| 99 |
+
else:
|
| 100 |
+
print(" ✅ All advanced selection fields present!")
|
| 101 |
+
|
| 102 |
+
return True
|
| 103 |
+
else:
|
| 104 |
+
print("❌ No valid configuration sections found")
|
| 105 |
+
return False
|
| 106 |
+
|
| 107 |
+
except Exception as e:
|
| 108 |
+
print(f"❌ Configuration test failed: {e}")
|
| 109 |
+
return False
|
| 110 |
+
|
| 111 |
+
def test_analyzer_parameters():
|
| 112 |
+
"""Test that the analyzer accepts the new parameters."""
|
| 113 |
+
print("\nTesting Analyzer Parameters...")
|
| 114 |
+
|
| 115 |
+
try:
|
| 116 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 117 |
+
|
| 118 |
+
# Create analyzer
|
| 119 |
+
analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
|
| 120 |
+
|
| 121 |
+
# Test parameter signature
|
| 122 |
+
import inspect
|
| 123 |
+
analyze_signature = inspect.signature(analyzer.analyze_text)
|
| 124 |
+
params = list(analyze_signature.parameters.keys())
|
| 125 |
+
|
| 126 |
+
required_params = ['log_transforms', 'selected_measures']
|
| 127 |
+
found_params = []
|
| 128 |
+
|
| 129 |
+
for param in required_params:
|
| 130 |
+
if param in params:
|
| 131 |
+
found_params.append(param)
|
| 132 |
+
print(f" ✅ Found parameter: {param}")
|
| 133 |
+
else:
|
| 134 |
+
print(f" ❌ Missing parameter: {param}")
|
| 135 |
+
|
| 136 |
+
if len(found_params) == len(required_params):
|
| 137 |
+
print("✅ Analyzer has all required parameters!")
|
| 138 |
+
return True
|
| 139 |
+
else:
|
| 140 |
+
print(f"❌ Analyzer missing {len(required_params) - len(found_params)} parameters")
|
| 141 |
+
return False
|
| 142 |
+
|
| 143 |
+
except Exception as e:
|
| 144 |
+
print(f"❌ Analyzer test failed: {e}")
|
| 145 |
+
return False
|
| 146 |
+
|
| 147 |
+
def main():
|
| 148 |
+
"""Run all tests."""
|
| 149 |
+
print("🧪 Testing Advanced Selection Implementation\n")
|
| 150 |
+
|
| 151 |
+
tests = [
|
| 152 |
+
test_config_structure,
|
| 153 |
+
test_ui_components_grouping,
|
| 154 |
+
test_analyzer_parameters
|
| 155 |
+
]
|
| 156 |
+
|
| 157 |
+
passed = 0
|
| 158 |
+
total = len(tests)
|
| 159 |
+
|
| 160 |
+
for test in tests:
|
| 161 |
+
if test():
|
| 162 |
+
passed += 1
|
| 163 |
+
|
| 164 |
+
print(f"\n📊 Test Results: {passed}/{total} tests passed")
|
| 165 |
+
|
| 166 |
+
if passed == total:
|
| 167 |
+
print("🎉 All tests passed! Advanced selection implementation is ready.")
|
| 168 |
+
return 0
|
| 169 |
+
else:
|
| 170 |
+
print("⚠️ Some tests failed. Please check the implementation.")
|
| 171 |
+
return 1
|
| 172 |
+
|
| 173 |
+
if __name__ == "__main__":
|
| 174 |
+
exit(main())
|
test/test_column_naming_fix.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script to verify that the column naming bug is fixed.
|
| 4 |
+
This script specifically tests that we don't get duplicate suffixes like:
|
| 5 |
+
- COCA_spoken_frequency_token_token
|
| 6 |
+
- COCA_spoken_frequency_lemma_lemma
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import sys
|
| 10 |
+
import os
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
# Add project root to path
|
| 14 |
+
project_root = Path(__file__).parent
|
| 15 |
+
sys.path.insert(0, str(project_root))
|
| 16 |
+
|
| 17 |
+
def test_column_naming():
|
| 18 |
+
"""Test that column names are clean without duplicate suffixes."""
|
| 19 |
+
print("Testing Column Naming Fix...")
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 23 |
+
|
| 24 |
+
# Create analyzer
|
| 25 |
+
analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
|
| 26 |
+
|
| 27 |
+
# Create mock reference lists
|
| 28 |
+
mock_reference_lists = {
|
| 29 |
+
'COCA_spoken_frequency_token': {
|
| 30 |
+
'token': {'hello': 100, 'world': 200}
|
| 31 |
+
},
|
| 32 |
+
'COCA_spoken_frequency_lemma': {
|
| 33 |
+
'lemma': {'hello': 150, 'world': 250}
|
| 34 |
+
}
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
# Load reference lists
|
| 38 |
+
analyzer.load_reference_lists(mock_reference_lists)
|
| 39 |
+
|
| 40 |
+
# Analyze a simple text
|
| 41 |
+
text = "Hello world, this is a test."
|
| 42 |
+
selected_indices = ['COCA_spoken_frequency_token', 'COCA_spoken_frequency_lemma']
|
| 43 |
+
|
| 44 |
+
results = analyzer.analyze_text(text, selected_indices)
|
| 45 |
+
|
| 46 |
+
# Check token details for clean column names
|
| 47 |
+
if results['token_details']:
|
| 48 |
+
first_token = results['token_details'][0]
|
| 49 |
+
column_names = list(first_token.keys())
|
| 50 |
+
|
| 51 |
+
print(f"Column names found: {column_names}")
|
| 52 |
+
|
| 53 |
+
# Check for problematic duplicate suffixes
|
| 54 |
+
problematic_columns = []
|
| 55 |
+
for col in column_names:
|
| 56 |
+
if '_token_token' in col or '_lemma_lemma' in col or '_token_lemma' in col or '_lemma_token' in col:
|
| 57 |
+
problematic_columns.append(col)
|
| 58 |
+
|
| 59 |
+
if problematic_columns:
|
| 60 |
+
print(f"❌ Found problematic column names: {problematic_columns}")
|
| 61 |
+
return False
|
| 62 |
+
else:
|
| 63 |
+
print("✅ No duplicate suffixes found in column names!")
|
| 64 |
+
|
| 65 |
+
# Check that we have the expected clean column names
|
| 66 |
+
expected_clean_columns = ['COCA_spoken_frequency_token', 'COCA_spoken_frequency_lemma']
|
| 67 |
+
found_clean_columns = [col for col in column_names if col in expected_clean_columns]
|
| 68 |
+
|
| 69 |
+
if found_clean_columns:
|
| 70 |
+
print(f"✅ Found expected clean columns: {found_clean_columns}")
|
| 71 |
+
return True
|
| 72 |
+
else:
|
| 73 |
+
print(f"⚠️ Expected clean columns not found. Available columns: {column_names}")
|
| 74 |
+
return False
|
| 75 |
+
else:
|
| 76 |
+
print("❌ No token details found in results")
|
| 77 |
+
return False
|
| 78 |
+
|
| 79 |
+
except Exception as e:
|
| 80 |
+
print(f"❌ Test failed with error: {e}")
|
| 81 |
+
return False
|
| 82 |
+
|
| 83 |
+
def main():
|
| 84 |
+
"""Run the column naming test."""
|
| 85 |
+
print("🧪 Testing Column Naming Fix\n")
|
| 86 |
+
|
| 87 |
+
if test_column_naming():
|
| 88 |
+
print("\n🎉 Column naming bug has been fixed!")
|
| 89 |
+
return 0
|
| 90 |
+
else:
|
| 91 |
+
print("\n⚠️ Column naming issue still exists.")
|
| 92 |
+
return 1
|
| 93 |
+
|
| 94 |
+
if __name__ == "__main__":
|
| 95 |
+
exit(main())
|
test_file_upload_handler.py → test/test_file_upload_handler.py
RENAMED
|
File without changes
|
test_fix_403.py → test/test_fix_403.py
RENAMED
|
File without changes
|
test/test_log_transform_consistency.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script to verify that log transformations are consistently applied
|
| 4 |
+
to both token details table and summary statistics.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
import os
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
# Add project root to path
|
| 13 |
+
project_root = Path(__file__).parent
|
| 14 |
+
sys.path.insert(0, str(project_root))
|
| 15 |
+
|
| 16 |
+
def test_log_transform_consistency():
|
| 17 |
+
"""Test that log transformations are applied consistently."""
|
| 18 |
+
print("Testing Log Transform Consistency...")
|
| 19 |
+
|
| 20 |
+
try:
|
| 21 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 22 |
+
|
| 23 |
+
# Create analyzer
|
| 24 |
+
analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
|
| 25 |
+
|
| 26 |
+
# Create mock reference lists with known values
|
| 27 |
+
mock_reference_lists = {
|
| 28 |
+
'COCA_spoken_frequency_token': {
|
| 29 |
+
'token': {'hello': 1000, 'world': 2000} # Values that will be log-transformed
|
| 30 |
+
}
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
# Load reference lists
|
| 34 |
+
analyzer.load_reference_lists(mock_reference_lists)
|
| 35 |
+
|
| 36 |
+
# Analyze text with log transformation enabled
|
| 37 |
+
text = "Hello world"
|
| 38 |
+
selected_indices = ['COCA_spoken_frequency_token']
|
| 39 |
+
|
| 40 |
+
# Enable log transformation for this index
|
| 41 |
+
log_transforms = {
|
| 42 |
+
'COCA_spoken_frequency_token': ['frequency']
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
results = analyzer.analyze_text(
|
| 46 |
+
text,
|
| 47 |
+
selected_indices,
|
| 48 |
+
log_transforms=log_transforms
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
# Get token details and summary statistics
|
| 52 |
+
token_details = results['token_details']
|
| 53 |
+
summary_stats = results['summary']
|
| 54 |
+
|
| 55 |
+
print(f"Token details: {len(token_details)} tokens")
|
| 56 |
+
print(f"Summary keys: {list(summary_stats.keys())}")
|
| 57 |
+
|
| 58 |
+
# Check consistency between token details and summary
|
| 59 |
+
if token_details:
|
| 60 |
+
# Get log-transformed values from token details
|
| 61 |
+
token_scores = []
|
| 62 |
+
for token_detail in token_details:
|
| 63 |
+
score = token_detail.get('COCA_spoken_frequency_token')
|
| 64 |
+
if score is not None:
|
| 65 |
+
token_scores.append(score)
|
| 66 |
+
print(f"Token '{token_detail['token']}': score = {score}")
|
| 67 |
+
|
| 68 |
+
if token_scores:
|
| 69 |
+
# Calculate mean from token details
|
| 70 |
+
token_mean = np.mean(token_scores)
|
| 71 |
+
|
| 72 |
+
# Get mean from summary statistics
|
| 73 |
+
summary_key = 'COCA_spoken_frequency_token_CW' # Content words
|
| 74 |
+
if summary_key in summary_stats:
|
| 75 |
+
summary_mean = summary_stats[summary_key]['mean']
|
| 76 |
+
|
| 77 |
+
print(f"Token details mean: {token_mean}")
|
| 78 |
+
print(f"Summary stats mean: {summary_mean}")
|
| 79 |
+
|
| 80 |
+
# Check if they're approximately equal (allowing for floating point precision)
|
| 81 |
+
if abs(token_mean - summary_mean) < 0.001:
|
| 82 |
+
print("✅ Token details and summary statistics are consistent!")
|
| 83 |
+
|
| 84 |
+
# Check that values are actually log-transformed
|
| 85 |
+
# Original values were 1000 and 2000, log10 would be ~3.0 and ~3.3
|
| 86 |
+
if all(2.5 < score < 3.5 for score in token_scores):
|
| 87 |
+
print("✅ Values appear to be properly log-transformed!")
|
| 88 |
+
return True
|
| 89 |
+
else:
|
| 90 |
+
print(f"⚠️ Values don't appear to be log-transformed: {token_scores}")
|
| 91 |
+
return False
|
| 92 |
+
else:
|
| 93 |
+
print(f"❌ Inconsistency found: token mean = {token_mean}, summary mean = {summary_mean}")
|
| 94 |
+
return False
|
| 95 |
+
else:
|
| 96 |
+
print(f"❌ Summary key '{summary_key}' not found in summary stats")
|
| 97 |
+
return False
|
| 98 |
+
else:
|
| 99 |
+
print("❌ No token scores found")
|
| 100 |
+
return False
|
| 101 |
+
else:
|
| 102 |
+
print("❌ No token details found")
|
| 103 |
+
return False
|
| 104 |
+
|
| 105 |
+
except Exception as e:
|
| 106 |
+
print(f"❌ Test failed with error: {e}")
|
| 107 |
+
return False
|
| 108 |
+
|
| 109 |
+
def main():
    """Run the log transform consistency test and report the outcome.

    Returns a process exit code: 0 on success, 1 on failure.
    """
    print("🧪 Testing Log Transform Consistency\n")

    passed = test_log_transform_consistency()
    if passed:
        print("\n🎉 Log transformation consistency has been fixed!")
        return 0
    print("\n⚠️ Log transformation consistency issue still exists.")
    return 1


if __name__ == "__main__":
    exit(main())
|
test_memory_upload.py → test/test_memory_upload.py
RENAMED
|
File without changes
|
test_tmp_upload.py → test/test_tmp_upload.py
RENAMED
|
File without changes
|
test_column_matching.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Test script to understand the exact column matching pattern
"""


def strip_last_ngram_marker(key, markers=('_bigram', '_trigram')):
    """Remove the last occurrence of a redundant n-gram marker from *key*.

    The raw-score keys embed the n-gram type twice (e.g.
    ``..._token_bigram_frequency``); the actual detail columns drop the
    second marker (``..._token_frequency``).  Markers are tried in order,
    and only a marker followed by an underscore counts as present —
    mirroring the original inline ``'_bigram_' in key`` check.  Keys
    without a marker are returned unchanged.
    """
    for marker in markers:
        if f"{marker}_" in key:
            idx = key.rfind(marker)
            if idx != -1:
                # Drop only the last marker occurrence, keep everything else.
                return key[:idx] + key[idx + len(marker):]
            return key
    return key


# Test the pattern manually
raw_scores_keys = [
    'COCA_spoken_bigram_frequency_token_bigram_frequency',
    'COCA_spoken_bigram_frequency_token_bigram_normalized_freq',
    'COCA_trigram_frequency_token_trigram_frequency'
]

actual_columns = [
    'COCA_spoken_bigram_frequency_token_frequency',
    'COCA_spoken_bigram_frequency_token_normalized_freq',
    'COCA_trigram_frequency_token_frequency'
]

print("=== Pattern Analysis ===")
for raw_key, expected_col in zip(raw_scores_keys, actual_columns):
    print(f"\nRaw key: {raw_key}")
    print(f"Expected: {expected_col}")

    # The correct pattern - remove only the redundant _bigram or _trigram
    # marker from the end measure (shared helper replaces the previously
    # duplicated bigram/trigram branches with hard-coded lengths 7 and 8).
    strategy = strip_last_ngram_marker(raw_key)

    print(f"Strategy (remove last _bigram/_trigram): {strategy}")

    if strategy == expected_col:
        print("✅ This strategy works!")
    else:
        print("❌ Still doesn't work")
        print(f"Difference: '{strategy}' vs '{expected_col}'")
|
test_csv_comma_handling.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Test script to demonstrate CSV comma handling with and without quoting parameters.
"""

import pandas as pd
import csv
from io import StringIO


def test_csv_comma_handling():
    """Test how different CSV reading approaches handle commas in data."""

    def report_frame(frame):
        # Sections 1 and 2 share this exact four-line report layout.
        print(f"  Columns detected: {len(frame.columns)}")
        print(f"  Column names: {list(frame.columns)}")
        print(f"  First row data: {frame.iloc[0].tolist()}")
        print(f"  Shape: {frame.shape}")

    # Sample problematic CSV data
    unquoted_data = """word,freq1,freq2,freq3,other1,other2,other3,other4,other5,other6
murder in,951.0,11.2359497461,693.0,0.0211467455982,1.17513238089,8.03264644343,21.3160999278,0.0364941871107,657.479274987
$ 100,000,950.0,11.2241348673,710.0,0.0216654969333,6.51710183621,13.3735638208,30.7765166526,0.0169430040291,949.18097172
normal_word,800.0,10.5,600.0,0.02,1.5,7.2,18.5,0.04,500.0"""

    # Properly quoted CSV data
    quoted_data = """word,freq1,freq2,freq3,other1,other2,other3,other4,other5,other6
"murder in",951.0,11.2359497461,693.0,0.0211467455982,1.17513238089,8.03264644343,21.3160999278,0.0364941871107,657.479274987
"$ 100,000",950.0,11.2241348673,710.0,0.0216654969333,6.51710183621,13.3735638208,30.7765166526,0.0169430040291,949.18097172
normal_word,800.0,10.5,600.0,0.02,1.5,7.2,18.5,0.04,500.0"""

    print("=== Testing CSV Comma Handling ===\n")

    # Test 1: Default pandas behavior (problematic)
    print("1. Default pandas behavior with problematic CSV:")
    try:
        report_frame(pd.read_csv(StringIO(unquoted_data)))
    except Exception as e:
        print(f"  Error: {e}")
    print()

    # Test 2: With quoting parameters (our solution)
    print("2. With quoting parameters (our solution):")
    try:
        report_frame(pd.read_csv(StringIO(unquoted_data),
                                 quoting=csv.QUOTE_MINIMAL, quotechar='"'))
    except Exception as e:
        print(f"  Error: {e}")
    print()

    # Test 3: With properly quoted CSV
    print("3. With properly quoted CSV data:")
    try:
        df_proper = pd.read_csv(StringIO(quoted_data))
        print(f"  Columns detected: {len(df_proper.columns)}")
        print(f"  Column names: {list(df_proper.columns)}")
        print(f"  First row word: '{df_proper.iloc[0]['word']}'")
        print(f"  Second row word: '{df_proper.iloc[1]['word']}'")
        print(f"  Shape: {df_proper.shape}")
    except Exception as e:
        print(f"  Error: {e}")
    print()

    # Test 4: Show the difference
    print("4. Comparison of approaches:")
    print("  Without quoting: Data with commas gets split incorrectly")
    print("  With quoting: pandas can handle quoted fields properly")
    print("  Best practice: Quote fields that contain commas in the source CSV")


if __name__ == "__main__":
    test_csv_comma_handling()
|
test_plot_fix.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Test script to verify the fix for bigram/trigram plot sample words
"""

import sys
import os

# Add the project root to the path
sys.path.insert(0, os.getcwd())

from web_app.config_manager import ConfigManager
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer


def _check_ngram_plot_columns(results, ngram_type):
    """Verify the fixed column-matching algorithm for one n-gram type.

    ``ngram_type`` is ``'bigram'`` or ``'trigram'``.  For each raw-score key
    of that type, strip the redundant ``'_bigram'``/``'_trigram'`` marker and
    check that the resulting column exists in the corresponding detail
    records and that a word->score map (what the plots need) can be built.
    Replaces two previously duplicated copy-pasted loops.
    """
    marker = f'_{ngram_type}_'
    details_key = f'{ngram_type}_details'

    for key in results['raw_scores'].keys():
        if marker not in key:
            continue
        print(f"\nTesting {ngram_type} key: {key}")
        # Use the new algorithm: remove the '_bigram'/'_trigram' marker from the key
        index_measure_col = key.replace(f'_{ngram_type}', '')
        print(f"  Fixed algorithm expects column: '{index_measure_col}'")

        # Check if this column exists in the detail records
        if results[details_key]:
            sample_detail = results[details_key][0]
            if index_measure_col in sample_detail:
                print(f"  ✅ Column found in {details_key}")

                # Test if we can build word_score_map successfully
                word_score_map = {}
                for detail in results[details_key]:
                    if index_measure_col in detail and detail[index_measure_col] is not None:
                        ngram_text = detail.get(ngram_type, '')
                        word_score_map[ngram_text] = detail[index_measure_col]

                print(f"  ✅ Successfully built word_score_map with {len(word_score_map)} entries")
                if word_score_map:
                    sample_entries = list(word_score_map.items())[:3]
                    print(f"  Sample entries: {sample_entries}")
            else:
                print(f"  ❌ Column still NOT found in {details_key}")


def test_plot_fix():
    """Load reference lists, analyze a sample text, and verify the plot-column fix."""
    print("=== Testing Plot Fix ===")

    # Load config and create reference lists
    config = ConfigManager.load_reference_config()
    english_config = config.get('english', {})

    reference_lists = {}

    # Load a unigram, bigram, and trigram reference
    unigram_config = english_config['unigrams']['COCA_spoken_frequency_token']
    bigram_config = english_config['bigrams']['COCA_spoken_bigram_frequency_token']
    trigram_config = english_config['trigrams']['COCA_trigram_frequency_token']

    reference_lists['COCA_spoken_frequency_token'] = ConfigManager.load_reference_list_data(unigram_config)
    reference_lists['COCA_spoken_bigram_frequency_token'] = ConfigManager.load_reference_list_data(bigram_config)
    reference_lists['COCA_trigram_frequency_token'] = ConfigManager.load_reference_list_data(trigram_config)

    # Create analyzer and analyze text
    analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
    analyzer.load_reference_lists(reference_lists)

    test_text = "The cat sat on the mat. The dog ran quickly."
    results = analyzer.analyze_text(test_text, list(reference_lists.keys()), apply_log=False)

    print("\n=== Testing Column Matching with Fixed Algorithm ===")

    # Same verification for bigrams first, then trigrams (order preserved
    # from the original duplicated loops).
    _check_ngram_plot_columns(results, 'bigram')
    _check_ngram_plot_columns(results, 'trigram')

    print("\n=== Fix Verification Complete ===")
    if any('_bigram_' in key for key in results['raw_scores'].keys()) and any('_trigram_' in key for key in results['raw_scores'].keys()):
        print("✅ Fix appears to be working correctly!")
        print("Sample words should now appear in bigram and trigram plots.")
    else:
        print("❌ No bigram/trigram results found to test")


if __name__ == "__main__":
    test_plot_fix()
|
test_reference_loading_issue.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Test script to diagnose the reference loading issue in the web app
"""

import sys
import os

# Add the project root to the path
sys.path.insert(0, os.getcwd())

from web_app.config_manager import ConfigManager
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer


def test_reference_loading():
    """Diagnose whether bigram/trigram results are lost during loading or analysis.

    Loads one unigram, one bigram, and one trigram reference list, runs an
    analysis on a short sample text, and reports whether bigram/trigram
    entries appear in the summary.
    """
    print("=== Testing Reference Loading Issue ===")

    # Load config
    config = ConfigManager.load_reference_config()
    english_config = config.get('english', {})

    print(f"\nAvailable sections in config: {list(english_config.keys())}")

    # Test what happens when we simulate loading different types of references
    print("\n=== Simulating Reference List Selection ===")

    # Simulate selecting some unigrams, bigrams, and trigrams.
    # (config section, list key, singular label for the log message) —
    # replaces three copy-pasted if-blocks.
    candidates = [
        ('unigrams', 'COCA_spoken_frequency_token', 'unigram'),
        ('bigrams', 'COCA_spoken_bigram_frequency_token', 'bigram'),
        ('trigrams', 'COCA_trigram_frequency_token', 'trigram'),
    ]
    selected_lists = []
    for section, list_key, label in candidates:
        if section in english_config and list_key in english_config[section]:
            selected_lists.append((section, list_key, english_config[section][list_key]))
            print(f"Added {label}: {list_key}")

    print(f"\nTotal selected lists: {len(selected_lists)}")

    # Load reference data directly
    reference_lists = {}
    for ngram_type, list_key, list_config in selected_lists:
        print(f"\nLoading {ngram_type}: {list_key}")
        data = ConfigManager.load_reference_list_data(list_config)

        if data:
            print(f"  Data keys: {list(data.keys())}")
            for key, value in data.items():
                if hasattr(value, '__len__'):
                    print(f"  {key}: {len(value)} entries")
                else:
                    print(f"  {key}: {type(value)}")

            reference_lists[list_key] = data
        else:
            print(f"  Failed to load data for {list_key}")

    # Check what was loaded
    print(f"\n=== Loaded Reference Lists ===")
    print(f"Keys loaded: {list(reference_lists.keys())}")

    for key, data in reference_lists.items():
        print(f"\n{key}:")
        for file_type, file_data in data.items():
            if hasattr(file_data, '__len__'):
                print(f"  {file_type}: {len(file_data)} entries")
            else:
                print(f"  {file_type}: {type(file_data)}")

    # Test analysis with these reference lists
    print(f"\n=== Testing Analysis ===")
    analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')

    # Load reference lists into analyzer
    analyzer.load_reference_lists(reference_lists)

    # Test text
    test_text = "The cat sat on the mat. The dog ran quickly."

    # Analyze
    results = analyzer.analyze_text(
        test_text,
        list(reference_lists.keys()),
        apply_log=False
    )

    print(f"\nAnalysis summary keys: {list(results['summary'].keys())}")
    print(f"Bigram details count: {len(results.get('bigram_details', []))}")
    print(f"Trigram details count: {len(results.get('trigram_details', []))}")

    # Check for bigram/trigram entries in summary
    bigram_summary_keys = [k for k in results['summary'].keys() if 'bigram' in k]
    trigram_summary_keys = [k for k in results['summary'].keys() if 'trigram' in k]

    print(f"\nBigram summary keys: {bigram_summary_keys}")
    print(f"Trigram summary keys: {trigram_summary_keys}")

    if not bigram_summary_keys and not trigram_summary_keys:
        print("\n⚠️ WARNING: No bigram or trigram results in summary!")
        print("This suggests the issue is in the analysis process, not the display function.")
    else:
        print("\n✓ Bigram and trigram results found in summary.")
        print("The issue might be in how the web app loads reference lists.")


if __name__ == "__main__":
    test_reference_loading()
|
text_analyzer/corpus_visualizer.py
CHANGED
|
@@ -15,6 +15,7 @@ import logging
|
|
| 15 |
import re
|
| 16 |
from io import StringIO
|
| 17 |
import natsort
|
|
|
|
| 18 |
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
|
@@ -102,7 +103,8 @@ class CorpusVisualizer:
|
|
| 102 |
separator = format_info['separator']
|
| 103 |
|
| 104 |
# Load into DataFrame
|
| 105 |
-
df = pd.read_csv(StringIO(content), sep=separator
|
|
|
|
| 106 |
|
| 107 |
# Store the dataframe
|
| 108 |
if file_type == 'metadata':
|
|
|
|
| 15 |
import re
|
| 16 |
from io import StringIO
|
| 17 |
import natsort
|
| 18 |
+
import csv
|
| 19 |
|
| 20 |
logger = logging.getLogger(__name__)
|
| 21 |
|
|
|
|
| 103 |
separator = format_info['separator']
|
| 104 |
|
| 105 |
# Load into DataFrame
|
| 106 |
+
df = pd.read_csv(StringIO(content), sep=separator,
|
| 107 |
+
quoting=csv.QUOTE_MINIMAL, quotechar='"')
|
| 108 |
|
| 109 |
# Store the dataframe
|
| 110 |
if file_type == 'metadata':
|
text_analyzer/frequency_analyzer.py
CHANGED
|
@@ -12,6 +12,7 @@ from typing import Dict, List, Tuple, Optional, Union
|
|
| 12 |
import logging
|
| 13 |
import random
|
| 14 |
from io import StringIO
|
|
|
|
| 15 |
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
|
@@ -208,7 +209,8 @@ class FrequencyAnalyzer:
|
|
| 208 |
has_header = column_config.get('has_header', True)
|
| 209 |
|
| 210 |
# Read data
|
| 211 |
-
df = pd.read_csv(StringIO(content), sep=separator, header=0 if has_header else None
|
|
|
|
| 212 |
|
| 213 |
# Store column configuration
|
| 214 |
self.column_config = column_config.copy()
|
|
|
|
| 12 |
import logging
|
| 13 |
import random
|
| 14 |
from io import StringIO
|
| 15 |
+
import csv
|
| 16 |
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
|
|
|
| 209 |
has_header = column_config.get('has_header', True)
|
| 210 |
|
| 211 |
# Read data
|
| 212 |
+
df = pd.read_csv(StringIO(content), sep=separator, header=0 if has_header else None,
|
| 213 |
+
quoting=csv.QUOTE_MINIMAL, quotechar='"')
|
| 214 |
|
| 215 |
# Store column configuration
|
| 216 |
self.column_config = column_config.copy()
|
text_analyzer/lexical_sophistication.py
CHANGED
|
@@ -82,7 +82,8 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
|
|
| 82 |
delimiter = ',' if sample.count(',') > sample.count('\t') else '\t'
|
| 83 |
|
| 84 |
# Load the file
|
| 85 |
-
df = pd.read_csv(file_path, delimiter=delimiter, header=0
|
|
|
|
| 86 |
|
| 87 |
if file_type in ['token', 'lemma']:
|
| 88 |
# Check if this is a custom frequency list format with specific columns
|
|
@@ -183,10 +184,12 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
|
|
| 183 |
if 'content' in config:
|
| 184 |
# Use content directly
|
| 185 |
content_io = StringIO(config['content'])
|
| 186 |
-
df = pd.read_csv(content_io, delimiter=delimiter, header=0
|
|
|
|
| 187 |
elif 'file_path' in config:
|
| 188 |
# Fallback to file path for backward compatibility
|
| 189 |
-
df = pd.read_csv(config['file_path'], delimiter=delimiter, header=0
|
|
|
|
| 190 |
else:
|
| 191 |
logger.error("No content or file_path found in config")
|
| 192 |
return {}
|
|
@@ -602,6 +605,18 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
|
|
| 602 |
|
| 603 |
# Look up scores for each selected index
|
| 604 |
for index_name in selected_indices:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 605 |
# Check if this is a Japanese corpus reference list
|
| 606 |
ref_data = self.reference_lists.get(index_name, {})
|
| 607 |
is_japanese_corpus = False
|
|
@@ -613,69 +628,87 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
|
|
| 613 |
|
| 614 |
if is_japanese_corpus and self.language == 'ja':
|
| 615 |
# Use enhanced UniDic lookup with 3-level fallback and diagnostics
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
}
|
| 643 |
|
| 644 |
elif is_japanese_corpus:
|
| 645 |
# Fallback to legacy Japanese lookup if UniDic not available
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 654 |
else:
|
| 655 |
# Standard lookup for non-Japanese data
|
| 656 |
-
|
| 657 |
-
|
|
|
|
|
|
|
| 658 |
|
| 659 |
-
#
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
all_scores[f"{index_name}_token_{word_type}"].append(score_val)
|
| 671 |
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
)
|
| 677 |
-
score_val = np.log10(lemma_score) if should_log_transform and lemma_score > 0 else lemma_score
|
| 678 |
-
all_scores[f"{index_name}_lemma_{word_type}"].append(score_val)
|
| 679 |
|
| 680 |
results['token_details'].append(token_detail)
|
| 681 |
|
|
@@ -722,25 +755,69 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
|
|
| 722 |
if ref_data is None or not isinstance(ref_data, pd.DataFrame):
|
| 723 |
continue
|
| 724 |
|
| 725 |
-
# Get
|
| 726 |
-
|
|
|
|
|
|
|
|
|
|
| 727 |
|
| 728 |
-
#
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
if
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 744 |
|
| 745 |
results[ngram_details_key].append(ngram_detail)
|
| 746 |
|
|
@@ -753,37 +830,93 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
|
|
| 753 |
if ref_data is None or not isinstance(ref_data, pd.DataFrame):
|
| 754 |
continue
|
| 755 |
|
| 756 |
-
# Get
|
| 757 |
-
|
|
|
|
|
|
|
|
|
|
| 758 |
|
| 759 |
-
#
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
if
|
| 763 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 764 |
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 775 |
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 787 |
|
| 788 |
return results
|
| 789 |
|
|
|
|
| 82 |
delimiter = ',' if sample.count(',') > sample.count('\t') else '\t'
|
| 83 |
|
| 84 |
# Load the file
|
| 85 |
+
df = pd.read_csv(file_path, delimiter=delimiter, header=0,
|
| 86 |
+
quoting=csv.QUOTE_MINIMAL, quotechar='"')
|
| 87 |
|
| 88 |
if file_type in ['token', 'lemma']:
|
| 89 |
# Check if this is a custom frequency list format with specific columns
|
|
|
|
| 184 |
if 'content' in config:
|
| 185 |
# Use content directly
|
| 186 |
content_io = StringIO(config['content'])
|
| 187 |
+
df = pd.read_csv(content_io, delimiter=delimiter, header=0,
|
| 188 |
+
quoting=csv.QUOTE_MINIMAL, quotechar='"')
|
| 189 |
elif 'file_path' in config:
|
| 190 |
# Fallback to file path for backward compatibility
|
| 191 |
+
df = pd.read_csv(config['file_path'], delimiter=delimiter, header=0,
|
| 192 |
+
quoting=csv.QUOTE_MINIMAL, quotechar='"')
|
| 193 |
else:
|
| 194 |
logger.error("No content or file_path found in config")
|
| 195 |
return {}
|
|
|
|
| 605 |
|
| 606 |
# Look up scores for each selected index
|
| 607 |
for index_name in selected_indices:
|
| 608 |
+
# Extract base name and determine analysis type to avoid duplicate suffixes
|
| 609 |
+
if index_name.endswith('_token'):
|
| 610 |
+
base_name = index_name[:-6] # Remove '_token'
|
| 611 |
+
analysis_type = 'token'
|
| 612 |
+
elif index_name.endswith('_lemma'):
|
| 613 |
+
base_name = index_name[:-6] # Remove '_lemma'
|
| 614 |
+
analysis_type = 'lemma'
|
| 615 |
+
else:
|
| 616 |
+
# Fallback for entries without clear suffix
|
| 617 |
+
base_name = index_name
|
| 618 |
+
analysis_type = 'token' # Default to token
|
| 619 |
+
|
| 620 |
# Check if this is a Japanese corpus reference list
|
| 621 |
ref_data = self.reference_lists.get(index_name, {})
|
| 622 |
is_japanese_corpus = False
|
|
|
|
| 628 |
|
| 629 |
if is_japanese_corpus and self.language == 'ja':
|
| 630 |
# Use enhanced UniDic lookup with 3-level fallback and diagnostics
|
| 631 |
+
if analysis_type == 'token':
|
| 632 |
+
result = self._lookup_with_unidic_fallback(token, index_name, 'token')
|
| 633 |
+
score = result['score']
|
| 634 |
+
|
| 635 |
+
# Store enhanced details with clean column name
|
| 636 |
+
token_detail[index_name] = score if score is not None else None
|
| 637 |
+
token_detail[f"{index_name}_match_method"] = result['match_method']
|
| 638 |
+
token_detail[f"{index_name}_match_key"] = result['match_key'] or None
|
| 639 |
+
|
| 640 |
+
# Store UniDic features for display (only once per token)
|
| 641 |
+
if hasattr(token, '_') and hasattr(token._, 'unidic_lemma') and 'unidic_features' not in token_detail:
|
| 642 |
+
token_detail['unidic_features'] = {
|
| 643 |
+
'lemma': getattr(token._, 'unidic_lemma', ''),
|
| 644 |
+
'lForm': getattr(token._, 'unidic_lform', ''),
|
| 645 |
+
'pos1': getattr(token._, 'unidic_pos1', ''),
|
| 646 |
+
'pos2': getattr(token._, 'unidic_pos2', ''),
|
| 647 |
+
'goshu': getattr(token._, 'unidic_goshu', ''),
|
| 648 |
+
'alignment_confidence': getattr(token._, 'alignment_confidence', 0.0)
|
| 649 |
+
}
|
| 650 |
+
else: # lemma analysis
|
| 651 |
+
result = self._lookup_with_unidic_fallback(token, index_name, 'lemma')
|
| 652 |
+
score = result['score']
|
| 653 |
+
|
| 654 |
+
# Store enhanced details with clean column name
|
| 655 |
+
token_detail[index_name] = score if score is not None else None
|
| 656 |
+
token_detail[f"{index_name}_match_method"] = result['match_method']
|
| 657 |
+
token_detail[f"{index_name}_match_key"] = result['match_key'] or None
|
| 658 |
|
| 659 |
elif is_japanese_corpus:
|
| 660 |
# Fallback to legacy Japanese lookup if UniDic not available
|
| 661 |
+
if analysis_type == 'token':
|
| 662 |
+
score = self._lookup_japanese_score(token, index_name, 'token', fallback=True)
|
| 663 |
+
|
| 664 |
+
# Apply log transformation if needed before storing
|
| 665 |
+
if score is not None:
|
| 666 |
+
should_log_transform = self._should_apply_log_transform(
|
| 667 |
+
index_name, analysis_type, 'frequency', log_transforms, apply_log
|
| 668 |
+
)
|
| 669 |
+
final_score = np.log10(score) if should_log_transform and score > 0 else score
|
| 670 |
+
else:
|
| 671 |
+
final_score = None
|
| 672 |
+
|
| 673 |
+
token_detail[index_name] = final_score
|
| 674 |
+
token_detail[f"{index_name}_match_method"] = "legacy_spacy"
|
| 675 |
+
else: # lemma analysis
|
| 676 |
+
score = self._lookup_japanese_score(token, index_name, 'lemma', fallback=True)
|
| 677 |
+
|
| 678 |
+
# Apply log transformation if needed before storing
|
| 679 |
+
if score is not None:
|
| 680 |
+
should_log_transform = self._should_apply_log_transform(
|
| 681 |
+
index_name, analysis_type, 'frequency', log_transforms, apply_log
|
| 682 |
+
)
|
| 683 |
+
final_score = np.log10(score) if should_log_transform and score > 0 else score
|
| 684 |
+
else:
|
| 685 |
+
final_score = None
|
| 686 |
+
|
| 687 |
+
token_detail[index_name] = final_score
|
| 688 |
+
token_detail[f"{index_name}_match_method"] = "legacy_spacy"
|
| 689 |
else:
|
| 690 |
# Standard lookup for non-Japanese data
|
| 691 |
+
if analysis_type == 'token':
|
| 692 |
+
score = self._lookup_score(token.text, index_name, 'token')
|
| 693 |
+
else: # lemma analysis
|
| 694 |
+
score = self._lookup_score(token.lemma_, index_name, 'lemma')
|
| 695 |
|
| 696 |
+
# Apply log transformation if needed before storing
|
| 697 |
+
if score is not None:
|
| 698 |
+
should_log_transform = self._should_apply_log_transform(
|
| 699 |
+
index_name, analysis_type, 'frequency', log_transforms, apply_log
|
| 700 |
+
)
|
| 701 |
+
final_score = np.log10(score) if should_log_transform and score > 0 else score
|
| 702 |
+
else:
|
| 703 |
+
final_score = None
|
| 704 |
+
|
| 705 |
+
# Store score with clean column name and transformed value
|
| 706 |
+
token_detail[index_name] = final_score
|
|
|
|
| 707 |
|
| 708 |
+
# Collect for summary statistics (score is already transformed if needed)
|
| 709 |
+
score = token_detail.get(index_name)
|
| 710 |
+
if score is not None:
|
| 711 |
+
all_scores[f"{index_name}_{word_type}"].append(score)
|
|
|
|
|
|
|
|
|
|
| 712 |
|
| 713 |
results['token_details'].append(token_detail)
|
| 714 |
|
|
|
|
| 755 |
if ref_data is None or not isinstance(ref_data, pd.DataFrame):
|
| 756 |
continue
|
| 757 |
|
| 758 |
+
# Get columns config for proper measure naming from YAML config
|
| 759 |
+
# We need to access the original YAML configuration to get proper measure names
|
| 760 |
+
from web_app.config_manager import ConfigManager
|
| 761 |
+
config = ConfigManager.load_reference_config()
|
| 762 |
+
language_key = "english" if self.language == 'en' else "japanese"
|
| 763 |
|
| 764 |
+
# Find the config entry for this index
|
| 765 |
+
config_entry = None
|
| 766 |
+
for config_section in [f"{ngram_type}s"]: # bigrams/trigrams sections
|
| 767 |
+
if config_section in config.get(language_key, {}):
|
| 768 |
+
if index_name in config[language_key][config_section]:
|
| 769 |
+
config_entry = config[language_key][config_section][index_name]
|
| 770 |
+
break
|
| 771 |
+
|
| 772 |
+
if config_entry and 'columns' in config_entry:
|
| 773 |
+
# Get columns config for this n-gram type
|
| 774 |
+
columns_config = config_entry.get('columns', {})
|
| 775 |
+
|
| 776 |
+
# Create mapping from column index to measure name
|
| 777 |
+
measure_mapping = {}
|
| 778 |
+
for measure_name, col_idx in columns_config.items():
|
| 779 |
+
if isinstance(col_idx, int) and col_idx < len(ref_data.columns):
|
| 780 |
+
measure_mapping[ref_data.columns[col_idx]] = measure_name
|
| 781 |
+
|
| 782 |
+
# Use the measure mapping to get proper names
|
| 783 |
+
for col_name, measure_name in measure_mapping.items():
|
| 784 |
+
if col_name == ref_data.columns[0]: # Skip the n-gram text column
|
| 785 |
+
continue
|
| 786 |
+
|
| 787 |
+
# Check if this measure should be computed
|
| 788 |
+
if not self._should_compute_measure(index_name, measure_name, selected_measures):
|
| 789 |
+
continue
|
| 790 |
+
|
| 791 |
+
score = self._lookup_score(ngram, index_name, ngram_type, col_name)
|
| 792 |
+
if score is not None:
|
| 793 |
+
# Check if this measure should be log-transformed
|
| 794 |
+
should_log_transform = self._should_apply_log_transform(
|
| 795 |
+
index_name, ngram_type, measure_name, log_transforms, apply_log
|
| 796 |
+
)
|
| 797 |
+
score_val = np.log10(score) if should_log_transform and score > 0 else score
|
| 798 |
+
ngram_detail[f"{index_name}_{measure_name}"] = score_val
|
| 799 |
+
else:
|
| 800 |
+
ngram_detail[f"{index_name}_{measure_name}"] = None
|
| 801 |
+
else:
|
| 802 |
+
# Fallback to old logic
|
| 803 |
+
available_measures = ref_data.columns[1:].tolist()
|
| 804 |
+
|
| 805 |
+
# Filter measures based on selection
|
| 806 |
+
for measure in available_measures:
|
| 807 |
+
# Check if this measure should be computed
|
| 808 |
+
if not self._should_compute_measure(index_name, measure, selected_measures):
|
| 809 |
+
continue
|
| 810 |
+
|
| 811 |
+
score = self._lookup_score(ngram, index_name, ngram_type, measure)
|
| 812 |
+
if score is not None:
|
| 813 |
+
# Check if this measure should be log-transformed
|
| 814 |
+
should_log_transform = self._should_apply_log_transform(
|
| 815 |
+
index_name, ngram_type, measure, log_transforms, apply_log
|
| 816 |
+
)
|
| 817 |
+
score_val = np.log10(score) if should_log_transform and score > 0 else score
|
| 818 |
+
ngram_detail[f"{index_name}_{measure}"] = score_val
|
| 819 |
+
else:
|
| 820 |
+
ngram_detail[f"{index_name}_{measure}"] = None
|
| 821 |
|
| 822 |
results[ngram_details_key].append(ngram_detail)
|
| 823 |
|
|
|
|
| 830 |
if ref_data is None or not isinstance(ref_data, pd.DataFrame):
|
| 831 |
continue
|
| 832 |
|
| 833 |
+
# Get columns config for proper measure naming from YAML config
|
| 834 |
+
# We need to access the original YAML configuration to get proper measure names
|
| 835 |
+
from web_app.config_manager import ConfigManager
|
| 836 |
+
config = ConfigManager.load_reference_config()
|
| 837 |
+
language_key = "english" if self.language == 'en' else "japanese"
|
| 838 |
|
| 839 |
+
# Find the config entry for this index
|
| 840 |
+
config_entry = None
|
| 841 |
+
for config_section in [f"{ngram_type}s"]: # bigrams/trigrams sections
|
| 842 |
+
if config_section in config.get(language_key, {}):
|
| 843 |
+
if index_name in config[language_key][config_section]:
|
| 844 |
+
config_entry = config[language_key][config_section][index_name]
|
| 845 |
+
break
|
| 846 |
+
|
| 847 |
+
if config_entry and 'columns' in config_entry:
|
| 848 |
+
# Get columns config for this n-gram type
|
| 849 |
+
columns_config = config_entry.get('columns', {})
|
| 850 |
+
|
| 851 |
+
# Create mapping from column index to measure name
|
| 852 |
+
measure_mapping = {}
|
| 853 |
+
for measure_name, col_idx in columns_config.items():
|
| 854 |
+
if isinstance(col_idx, int) and col_idx < len(ref_data.columns):
|
| 855 |
+
measure_mapping[ref_data.columns[col_idx]] = measure_name
|
| 856 |
+
|
| 857 |
+
# Use the measure mapping to get proper names
|
| 858 |
+
for col_name, measure_name in measure_mapping.items():
|
| 859 |
+
if col_name == ref_data.columns[0]: # Skip the n-gram text column
|
| 860 |
+
continue
|
| 861 |
+
|
| 862 |
+
# Check if this measure should be computed
|
| 863 |
+
if not self._should_compute_measure(index_name, measure_name, selected_measures):
|
| 864 |
+
continue
|
| 865 |
+
|
| 866 |
+
ngram_scores = []
|
| 867 |
+
for ngram in ngrams:
|
| 868 |
+
score = self._lookup_score(ngram, index_name, ngram_type, col_name)
|
| 869 |
+
if score is not None:
|
| 870 |
+
# Check if this measure should be log-transformed
|
| 871 |
+
should_log_transform = self._should_apply_log_transform(
|
| 872 |
+
index_name, ngram_type, measure_name, log_transforms, apply_log
|
| 873 |
+
)
|
| 874 |
+
score_val = np.log10(score) if should_log_transform and score > 0 else score
|
| 875 |
+
ngram_scores.append(score_val)
|
| 876 |
|
| 877 |
+
if ngram_scores:
|
| 878 |
+
key = f"{index_name}_{ngram_type}_{measure_name}"
|
| 879 |
+
results['summary'][key] = {
|
| 880 |
+
'mean': np.mean(ngram_scores),
|
| 881 |
+
'std': np.std(ngram_scores),
|
| 882 |
+
'count': len(ngram_scores),
|
| 883 |
+
'min': np.min(ngram_scores),
|
| 884 |
+
'max': np.max(ngram_scores)
|
| 885 |
+
}
|
| 886 |
+
# Store raw scores for plotting
|
| 887 |
+
results['raw_scores'][key] = ngram_scores
|
| 888 |
+
else:
|
| 889 |
+
# Fallback to old logic if config not properly structured
|
| 890 |
+
available_measures = ref_data.columns[1:].tolist()
|
| 891 |
|
| 892 |
+
# Filter measures based on selection and compute summary statistics
|
| 893 |
+
for measure in available_measures:
|
| 894 |
+
# Check if this measure should be computed
|
| 895 |
+
if not self._should_compute_measure(index_name, measure, selected_measures):
|
| 896 |
+
continue
|
| 897 |
+
|
| 898 |
+
ngram_scores = []
|
| 899 |
+
for ngram in ngrams:
|
| 900 |
+
score = self._lookup_score(ngram, index_name, ngram_type, measure)
|
| 901 |
+
if score is not None:
|
| 902 |
+
# Check if this measure should be log-transformed
|
| 903 |
+
should_log_transform = self._should_apply_log_transform(
|
| 904 |
+
index_name, ngram_type, measure, log_transforms, apply_log
|
| 905 |
+
)
|
| 906 |
+
score_val = np.log10(score) if should_log_transform and score > 0 else score
|
| 907 |
+
ngram_scores.append(score_val)
|
| 908 |
+
|
| 909 |
+
if ngram_scores:
|
| 910 |
+
key = f"{index_name}_{ngram_type}_{measure}"
|
| 911 |
+
results['summary'][key] = {
|
| 912 |
+
'mean': np.mean(ngram_scores),
|
| 913 |
+
'std': np.std(ngram_scores),
|
| 914 |
+
'count': len(ngram_scores),
|
| 915 |
+
'min': np.min(ngram_scores),
|
| 916 |
+
'max': np.max(ngram_scores)
|
| 917 |
+
}
|
| 918 |
+
# Store raw scores for plotting
|
| 919 |
+
results['raw_scores'][key] = ngram_scores
|
| 920 |
|
| 921 |
return results
|
| 922 |
|
web_app/app.py
CHANGED
|
@@ -91,7 +91,7 @@ def render_sidebar():
|
|
| 91 |
|
| 92 |
def render_lexical_sophistication_interface():
|
| 93 |
"""Render lexical sophistication analysis interface."""
|
| 94 |
-
st.header("🔍 Lexical Sophistication
|
| 95 |
|
| 96 |
# Get analyzer
|
| 97 |
analyzer = AnalysisHandlers.get_analyzer()
|
|
|
|
| 91 |
|
| 92 |
def render_lexical_sophistication_interface():
|
| 93 |
"""Render lexical sophistication analysis interface."""
|
| 94 |
+
st.header("🔍 Emulation of the Tool for Automatic Analysis of Lexical Sophistication (emuTAALES)")
|
| 95 |
|
| 96 |
# Get analyzer
|
| 97 |
analyzer = AnalysisHandlers.get_analyzer()
|
web_app/components/ui_components.py
CHANGED
|
@@ -298,7 +298,7 @@ class UIComponents:
|
|
| 298 |
|
| 299 |
@staticmethod
|
| 300 |
def render_enhanced_reference_selection(config: Dict[str, Any], reference_lists: Dict[str, Any]) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
|
| 301 |
-
"""Render the
|
| 302 |
from web_app.defaults_manager import DefaultsManager
|
| 303 |
|
| 304 |
# Initialize return values
|
|
@@ -309,34 +309,160 @@ class UIComponents:
|
|
| 309 |
st.info("No reference lists selected. Please configure reference lists first.")
|
| 310 |
return selected_measures, log_transforms
|
| 311 |
|
| 312 |
-
#
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
|
|
|
|
|
|
| 326 |
|
| 327 |
-
|
|
|
|
|
|
|
| 328 |
|
| 329 |
-
|
| 330 |
-
|
| 331 |
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
|
| 338 |
return selected_measures, log_transforms
|
| 339 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
@staticmethod
|
| 341 |
def group_has_smart_defaults(group_entries: List[str], config: Dict[str, Any]) -> bool:
|
| 342 |
"""Check if a group has smart defaults configured."""
|
|
|
|
| 298 |
|
| 299 |
@staticmethod
|
| 300 |
def render_enhanced_reference_selection(config: Dict[str, Any], reference_lists: Dict[str, Any]) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
|
| 301 |
+
"""Render the advanced reference list selection interface with hierarchical grouping and individual measure control."""
|
| 302 |
from web_app.defaults_manager import DefaultsManager
|
| 303 |
|
| 304 |
# Initialize return values
|
|
|
|
| 309 |
st.info("No reference lists selected. Please configure reference lists first.")
|
| 310 |
return selected_measures, log_transforms
|
| 311 |
|
| 312 |
+
# Group reference lists by base name for hierarchical display
|
| 313 |
+
groups = UIComponents._group_reference_lists(reference_lists, config)
|
| 314 |
+
|
| 315 |
+
st.write("**Reference Lists & Measures:**")
|
| 316 |
+
|
| 317 |
+
# Render each group with hierarchical interface
|
| 318 |
+
for base_name, group_data in groups.items():
|
| 319 |
+
# Group-level enable/disable checkbox
|
| 320 |
+
group_key = f"group_enabled_{base_name}"
|
| 321 |
+
group_enabled = st.checkbox(
|
| 322 |
+
f"☑️ **{base_name}**",
|
| 323 |
+
value=True, # Default enabled
|
| 324 |
+
key=group_key,
|
| 325 |
+
help=f"Enable/disable all {base_name} analyses"
|
| 326 |
+
)
|
| 327 |
+
|
| 328 |
+
if group_enabled:
|
| 329 |
+
# Analysis type badges display
|
| 330 |
+
badges = []
|
| 331 |
+
if group_data['token']:
|
| 332 |
+
badges.append("[Token ✓]")
|
| 333 |
+
if group_data['lemma']:
|
| 334 |
+
badges.append("[Lemma ✓]")
|
| 335 |
+
|
| 336 |
+
if badges:
|
| 337 |
+
st.write(f" {' '.join(badges)}")
|
| 338 |
+
|
| 339 |
+
# Expandable measure selection for each analysis type
|
| 340 |
+
if group_data['token']:
|
| 341 |
+
with st.expander("📊 Token Measures ⬇️ (click to customize)", expanded=False):
|
| 342 |
+
token_measures, token_logs = UIComponents._render_measure_selection(
|
| 343 |
+
group_data['token'][0], 'token', base_name
|
| 344 |
+
)
|
| 345 |
+
# Always store the results, even if empty (to maintain structure)
|
| 346 |
+
selected_measures[group_data['token'][0][0]] = token_measures
|
| 347 |
+
log_transforms[group_data['token'][0][0]] = token_logs
|
| 348 |
|
| 349 |
+
if group_data['lemma']:
|
| 350 |
+
with st.expander("📊 Lemma Measures ⬇️ (click to customize)", expanded=False):
|
| 351 |
+
lemma_measures, lemma_logs = UIComponents._render_measure_selection(
|
| 352 |
+
group_data['lemma'][0], 'lemma', base_name
|
| 353 |
+
)
|
| 354 |
+
# Always store the results, even if empty (to maintain structure)
|
| 355 |
+
selected_measures[group_data['lemma'][0][0]] = lemma_measures
|
| 356 |
+
log_transforms[group_data['lemma'][0][0]] = lemma_logs
|
| 357 |
|
| 358 |
+
# Show smart defaults summary
|
| 359 |
+
token_entry_name = group_data['token'][0][0] if group_data['token'] else None
|
| 360 |
+
lemma_entry_name = group_data['lemma'][0][0] if group_data['lemma'] else None
|
| 361 |
|
| 362 |
+
total_measures = 0
|
| 363 |
+
total_logs = 0
|
| 364 |
|
| 365 |
+
if token_entry_name:
|
| 366 |
+
total_measures += len(selected_measures.get(token_entry_name, []))
|
| 367 |
+
total_logs += len(log_transforms.get(token_entry_name, []))
|
| 368 |
+
|
| 369 |
+
if lemma_entry_name:
|
| 370 |
+
total_measures += len(selected_measures.get(lemma_entry_name, []))
|
| 371 |
+
total_logs += len(log_transforms.get(lemma_entry_name, []))
|
| 372 |
+
|
| 373 |
+
st.write(f" 📊 {total_measures} measures selected, 🔄 {total_logs} log-transformed")
|
| 374 |
+
st.write("") # Add spacing
|
| 375 |
|
| 376 |
return selected_measures, log_transforms
|
| 377 |
|
| 378 |
+
@staticmethod
|
| 379 |
+
def _group_reference_lists(reference_lists: Dict[str, Any], config: Dict[str, Any]) -> Dict[str, Dict[str, List]]:
|
| 380 |
+
"""Group related reference lists for hierarchical display."""
|
| 381 |
+
from collections import defaultdict
|
| 382 |
+
|
| 383 |
+
groups = defaultdict(lambda: {'token': [], 'lemma': []})
|
| 384 |
+
|
| 385 |
+
for entry_name in reference_lists.keys():
|
| 386 |
+
# Extract base name (remove _token/_lemma suffix)
|
| 387 |
+
base_name = entry_name.replace('_token', '').replace('_lemma', '')
|
| 388 |
+
|
| 389 |
+
# Get analysis type from config
|
| 390 |
+
entry_config = UIComponents._find_entry_config(entry_name, config)
|
| 391 |
+
if entry_config:
|
| 392 |
+
analysis_type = entry_config.get('analysis_type', 'token')
|
| 393 |
+
groups[base_name][analysis_type].append((entry_name, entry_config))
|
| 394 |
+
|
| 395 |
+
return groups
|
| 396 |
+
|
| 397 |
+
@staticmethod
|
| 398 |
+
def _render_measure_selection(entry_data: Tuple[str, Dict], analysis_type: str, base_name: str) -> Tuple[List[str], List[str]]:
|
| 399 |
+
"""Render individual measure checkboxes with log transform controls."""
|
| 400 |
+
entry_name, entry_config = entry_data
|
| 401 |
+
|
| 402 |
+
# Get measure information from config
|
| 403 |
+
selectable_measures = entry_config.get('selectable_measures', [])
|
| 404 |
+
log_transformable = entry_config.get('log_transformable', [])
|
| 405 |
+
default_measures = entry_config.get('default_measures', [])
|
| 406 |
+
default_log_transforms = entry_config.get('default_log_transforms', [])
|
| 407 |
+
|
| 408 |
+
# Initialize session state for this entry if not exists
|
| 409 |
+
if f'custom_measures_{entry_name}' not in st.session_state:
|
| 410 |
+
st.session_state[f'custom_measures_{entry_name}'] = default_measures.copy()
|
| 411 |
+
if f'custom_logs_{entry_name}' not in st.session_state:
|
| 412 |
+
st.session_state[f'custom_logs_{entry_name}'] = default_log_transforms.copy()
|
| 413 |
+
|
| 414 |
+
# Display measure selection interface
|
| 415 |
+
st.write(f"**Available Measures for {entry_config.get('display_name', entry_name)}:**")
|
| 416 |
+
|
| 417 |
+
selected_measures = []
|
| 418 |
+
selected_logs = []
|
| 419 |
+
|
| 420 |
+
for measure in selectable_measures:
|
| 421 |
+
col1, col2 = st.columns([3, 1])
|
| 422 |
+
|
| 423 |
+
with col1:
|
| 424 |
+
# Measure checkbox (pre-selected based on defaults)
|
| 425 |
+
measure_key = f"measure_{entry_name}_{measure}"
|
| 426 |
+
selected = st.checkbox(
|
| 427 |
+
f"☑️ {measure.replace('_', ' ').title()}",
|
| 428 |
+
value=measure in st.session_state[f'custom_measures_{entry_name}'],
|
| 429 |
+
key=measure_key,
|
| 430 |
+
help=f"Include {measure} in analysis"
|
| 431 |
+
)
|
| 432 |
+
|
| 433 |
+
if selected:
|
| 434 |
+
selected_measures.append(measure)
|
| 435 |
+
|
| 436 |
+
with col2:
|
| 437 |
+
# Log transform toggle (disabled if not transformable)
|
| 438 |
+
if measure in log_transformable and selected:
|
| 439 |
+
log_key = f"log_{entry_name}_{measure}"
|
| 440 |
+
log_enabled = st.checkbox(
|
| 441 |
+
"🔄 log₁₀",
|
| 442 |
+
value=measure in st.session_state[f'custom_logs_{entry_name}'],
|
| 443 |
+
key=log_key,
|
| 444 |
+
help=f"Apply log₁₀ transformation to {measure}"
|
| 445 |
+
)
|
| 446 |
+
|
| 447 |
+
if log_enabled:
|
| 448 |
+
selected_logs.append(measure)
|
| 449 |
+
elif measure in log_transformable:
|
| 450 |
+
st.write("🔄 (disabled)")
|
| 451 |
+
else:
|
| 452 |
+
st.write("❌ (not transformable)")
|
| 453 |
+
|
| 454 |
+
# Update session state
|
| 455 |
+
st.session_state[f'custom_measures_{entry_name}'] = selected_measures
|
| 456 |
+
st.session_state[f'custom_logs_{entry_name}'] = selected_logs
|
| 457 |
+
|
| 458 |
+
# Show selection summary
|
| 459 |
+
if selected_measures:
|
| 460 |
+
st.success(f"✅ {len(selected_measures)} measures selected, {len(selected_logs)} log-transformed")
|
| 461 |
+
else:
|
| 462 |
+
st.warning("⚠️ No measures selected for this analysis type")
|
| 463 |
+
|
| 464 |
+
return selected_measures, selected_logs
|
| 465 |
+
|
| 466 |
@staticmethod
|
| 467 |
def group_has_smart_defaults(group_entries: List[str], config: Dict[str, Any]) -> bool:
|
| 468 |
"""Check if a group has smart defaults configured."""
|
web_app/config_manager.py
CHANGED
|
@@ -8,6 +8,7 @@ import pandas as pd
|
|
| 8 |
from pathlib import Path
|
| 9 |
from typing import Dict, List, Any, Optional, Tuple
|
| 10 |
import yaml
|
|
|
|
| 11 |
|
| 12 |
from web_app.session_manager import SessionManager
|
| 13 |
from web_app.utils import MemoryFileHandler
|
|
@@ -70,7 +71,8 @@ class ConfigManager:
|
|
| 70 |
content_io = StringIO(text_content)
|
| 71 |
|
| 72 |
# Load preview
|
| 73 |
-
df_preview = pd.read_csv(content_io, delimiter=delimiter, header=0, nrows=5
|
|
|
|
| 74 |
|
| 75 |
# Store content in session state instead of file path
|
| 76 |
if 'uploaded_files_content' not in st.session_state:
|
|
@@ -209,9 +211,11 @@ class ConfigManager:
|
|
| 209 |
|
| 210 |
# Load file
|
| 211 |
if list_config.get('has_header', False):
|
| 212 |
-
df = pd.read_csv(file_path, delimiter=delimiter, header=0
|
|
|
|
| 213 |
else:
|
| 214 |
-
df = pd.read_csv(file_path, delimiter=delimiter, header=None
|
|
|
|
| 215 |
|
| 216 |
# Get column mapping
|
| 217 |
columns = list_config.get('columns', {})
|
|
@@ -247,6 +251,7 @@ class ConfigManager:
|
|
| 247 |
elif is_trigram:
|
| 248 |
data['trigram'] = df
|
| 249 |
else:
|
|
|
|
| 250 |
data[file_type] = df
|
| 251 |
|
| 252 |
except Exception as e:
|
|
|
|
| 8 |
from pathlib import Path
|
| 9 |
from typing import Dict, List, Any, Optional, Tuple
|
| 10 |
import yaml
|
| 11 |
+
import csv
|
| 12 |
|
| 13 |
from web_app.session_manager import SessionManager
|
| 14 |
from web_app.utils import MemoryFileHandler
|
|
|
|
| 71 |
content_io = StringIO(text_content)
|
| 72 |
|
| 73 |
# Load preview
|
| 74 |
+
df_preview = pd.read_csv(content_io, delimiter=delimiter, header=0, nrows=5,
|
| 75 |
+
quoting=csv.QUOTE_MINIMAL, quotechar='"')
|
| 76 |
|
| 77 |
# Store content in session state instead of file path
|
| 78 |
if 'uploaded_files_content' not in st.session_state:
|
|
|
|
| 211 |
|
| 212 |
# Load file
|
| 213 |
if list_config.get('has_header', False):
|
| 214 |
+
df = pd.read_csv(file_path, delimiter=delimiter, header=0,
|
| 215 |
+
quoting=csv.QUOTE_MINIMAL, quotechar='"')
|
| 216 |
else:
|
| 217 |
+
df = pd.read_csv(file_path, delimiter=delimiter, header=None,
|
| 218 |
+
quoting=csv.QUOTE_MINIMAL, quotechar='"')
|
| 219 |
|
| 220 |
# Get column mapping
|
| 221 |
columns = list_config.get('columns', {})
|
|
|
|
| 251 |
elif is_trigram:
|
| 252 |
data['trigram'] = df
|
| 253 |
else:
|
| 254 |
+
# For standard unigram files that aren't bigrams or trigrams
|
| 255 |
data[file_type] = df
|
| 256 |
|
| 257 |
except Exception as e:
|
web_app/handlers/analysis_handlers.py
CHANGED
|
@@ -96,28 +96,47 @@ class AnalysisHandlers:
|
|
| 96 |
analyzer.load_reference_lists(reference_lists)
|
| 97 |
|
| 98 |
# Get analysis configuration
|
| 99 |
-
if use_smart_defaults:
|
| 100 |
-
# Use
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
else:
|
| 123 |
# Legacy mode - use global log transformation
|
|
@@ -402,22 +421,183 @@ class AnalysisHandlers:
|
|
| 402 |
|
| 403 |
@staticmethod
|
| 404 |
def create_density_plots(results: Dict[str, Any]):
|
| 405 |
-
"""Create density plots for score distributions."""
|
| 406 |
if 'raw_scores' not in results:
|
| 407 |
return
|
| 408 |
|
| 409 |
for key, scores in results['raw_scores'].items():
|
| 410 |
if len(scores) > 1: # Need at least 2 points for density
|
| 411 |
-
# Create
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
fig = go.Figure()
|
| 413 |
|
| 414 |
-
# Add histogram
|
| 415 |
-
|
| 416 |
x=scores,
|
| 417 |
-
nbinsx=
|
| 418 |
name='Histogram',
|
| 419 |
opacity=0.7,
|
| 420 |
histnorm='probability density'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
))
|
| 422 |
|
| 423 |
# Calculate and add KDE curve
|
|
@@ -433,6 +613,17 @@ class AnalysisHandlers:
|
|
| 433 |
line=dict(color='red', width=2)
|
| 434 |
))
|
| 435 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
# Update layout
|
| 437 |
fig.update_layout(
|
| 438 |
title=f"Distribution of {key}",
|
|
@@ -447,79 +638,6 @@ class AnalysisHandlers:
|
|
| 447 |
|
| 448 |
@staticmethod
|
| 449 |
def render_enhanced_analysis_options():
|
| 450 |
-
"""Render the enhanced analysis interface with
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
from web_app.session_manager import SessionManager
|
| 454 |
-
|
| 455 |
-
st.subheader("🔧 Analysis Configuration")
|
| 456 |
-
|
| 457 |
-
# Get current configuration
|
| 458 |
-
config = ConfigManager.load_reference_config()
|
| 459 |
-
reference_lists = SessionManager.get_reference_lists()
|
| 460 |
-
|
| 461 |
-
# Enhanced Reference Lists & Measures Section
|
| 462 |
-
st.write("### 📋 Reference Lists & Measures")
|
| 463 |
-
|
| 464 |
-
# Simple hierarchical display for now (basic implementation)
|
| 465 |
-
if reference_lists:
|
| 466 |
-
st.write("**Selected Reference Lists:**")
|
| 467 |
-
for list_name in reference_lists.keys():
|
| 468 |
-
# Show smart defaults indicator
|
| 469 |
-
entry_config = UIComponents._find_entry_config(list_name, config)
|
| 470 |
-
if entry_config and entry_config.get('default_measures'):
|
| 471 |
-
defaults_info = f"📊 {len(entry_config['default_measures'])} measures selected"
|
| 472 |
-
log_info = f"🔄 {len(entry_config.get('default_log_transforms', []))} log-transformed"
|
| 473 |
-
st.write(f"├─ **{list_name}** [Token ✓] [Lemma ✓] [ℹ️ Smart defaults]")
|
| 474 |
-
st.write(f" {defaults_info}, {log_info}")
|
| 475 |
-
else:
|
| 476 |
-
st.write(f"├─ **{list_name}** [Legacy configuration]")
|
| 477 |
-
else:
|
| 478 |
-
st.info("No reference lists selected. Please configure reference lists first.")
|
| 479 |
-
|
| 480 |
-
# Global Analysis Options
|
| 481 |
-
st.write("### 🎯 Analysis Types")
|
| 482 |
-
col1, col2 = st.columns(2)
|
| 483 |
-
|
| 484 |
-
with col1:
|
| 485 |
-
token_analysis = st.checkbox("☑️ Token-based", value=True, key="token_analysis_enabled")
|
| 486 |
-
with col2:
|
| 487 |
-
lemma_analysis = st.checkbox("☑️ Lemma-based", value=True, key="lemma_analysis_enabled")
|
| 488 |
-
|
| 489 |
-
# Global Options
|
| 490 |
-
st.write("### ⚙️ Global Options")
|
| 491 |
-
word_type_filter = st.selectbox(
|
| 492 |
-
"Word Type Filter:",
|
| 493 |
-
options=[None, 'CW', 'FW'],
|
| 494 |
-
format_func=lambda x: 'All Words ▼' if x is None else ('Content Words' if x == 'CW' else 'Function Words'),
|
| 495 |
-
key="word_type_filter"
|
| 496 |
-
)
|
| 497 |
-
|
| 498 |
-
# Advanced Configuration Section
|
| 499 |
-
with st.expander("🎯 Advanced Configuration (Optional)", expanded=False):
|
| 500 |
-
st.info("ℹ️ **Smart Defaults Active**: The system automatically applies appropriate settings. "
|
| 501 |
-
"Expand this section only if you need custom control.")
|
| 502 |
-
|
| 503 |
-
# Legacy log transformation toggle
|
| 504 |
-
legacy_log_toggle = st.checkbox(
|
| 505 |
-
"Apply log₁₀ transformation to ALL measures (Legacy Mode)",
|
| 506 |
-
value=False,
|
| 507 |
-
help="⚠️ Not recommended: This applies log transformation to all measures, "
|
| 508 |
-
"including those where it's scientifically inappropriate (e.g., concreteness ratings).",
|
| 509 |
-
key="legacy_log_transform"
|
| 510 |
-
)
|
| 511 |
-
|
| 512 |
-
if legacy_log_toggle:
|
| 513 |
-
st.warning("⚠️ Legacy mode enabled: Log transformation will be applied to ALL numerical measures. "
|
| 514 |
-
"This may produce scientifically invalid results for psycholinguistic measures.")
|
| 515 |
-
|
| 516 |
-
# Return enhanced configuration
|
| 517 |
-
return {
|
| 518 |
-
'token_analysis': token_analysis,
|
| 519 |
-
'lemma_analysis': lemma_analysis,
|
| 520 |
-
'word_type_filter': word_type_filter,
|
| 521 |
-
'use_smart_defaults': not st.session_state.get('legacy_log_transform', False),
|
| 522 |
-
'legacy_log_transform': st.session_state.get('legacy_log_transform', False),
|
| 523 |
-
'selected_measures': {}, # Will be filled by smart defaults
|
| 524 |
-
'log_transforms': {} # Will be filled by smart defaults
|
| 525 |
-
}
|
|
|
|
| 96 |
analyzer.load_reference_lists(reference_lists)
|
| 97 |
|
| 98 |
# Get analysis configuration
|
| 99 |
+
if use_smart_defaults and not legacy_log_transform:
|
| 100 |
+
# Use custom selections from the enhanced UI
|
| 101 |
+
if selected_measures and any(selected_measures.values()):
|
| 102 |
+
# User has made custom selections
|
| 103 |
+
results = analyzer.analyze_text(
|
| 104 |
+
text_content,
|
| 105 |
+
list(reference_lists.keys()),
|
| 106 |
+
apply_log=False, # Superseded by log_transforms
|
| 107 |
+
word_type_filter=word_type_filter,
|
| 108 |
+
log_transforms=log_transforms,
|
| 109 |
+
selected_measures=selected_measures
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
# Calculate totals for user feedback
|
| 113 |
+
total_measures = sum(len(measures) for measures in selected_measures.values())
|
| 114 |
+
total_logs = sum(len(logs) for logs in log_transforms.values())
|
| 115 |
+
|
| 116 |
+
st.success("✨ Analysis completed using your custom selections!")
|
| 117 |
+
st.info(f"📊 Analyzed {total_measures} measures, {total_logs} log-transformed")
|
| 118 |
+
else:
|
| 119 |
+
# Fallback to smart defaults if no custom selections
|
| 120 |
+
from web_app.defaults_manager import DefaultsManager
|
| 121 |
+
from web_app.config_manager import ConfigManager
|
| 122 |
+
|
| 123 |
+
config = ConfigManager.load_reference_config()
|
| 124 |
+
default_measures, default_logs = DefaultsManager.get_default_analysis_config(
|
| 125 |
+
list(reference_lists.keys()), config
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
results = analyzer.analyze_text(
|
| 129 |
+
text_content,
|
| 130 |
+
list(reference_lists.keys()),
|
| 131 |
+
apply_log=False,
|
| 132 |
+
word_type_filter=word_type_filter,
|
| 133 |
+
log_transforms=default_logs,
|
| 134 |
+
selected_measures=default_measures
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
total_logs = sum(len(logs) for logs in default_logs.values())
|
| 138 |
+
st.success("✨ Analysis completed using Smart Defaults!")
|
| 139 |
+
st.info(f"📊 Applied selective log transforms to {total_logs} measures")
|
| 140 |
|
| 141 |
else:
|
| 142 |
# Legacy mode - use global log transformation
|
|
|
|
| 421 |
|
| 422 |
@staticmethod
|
| 423 |
def create_density_plots(results: Dict[str, Any]):
|
| 424 |
+
"""Create density plots for score distributions with mean line and example words."""
|
| 425 |
if 'raw_scores' not in results:
|
| 426 |
return
|
| 427 |
|
| 428 |
for key, scores in results['raw_scores'].items():
|
| 429 |
if len(scores) > 1: # Need at least 2 points for density
|
| 430 |
+
# Create word-to-score mapping for this measure
|
| 431 |
+
word_score_map = {}
|
| 432 |
+
|
| 433 |
+
# Determine if this is a bigram, trigram, or token-based measure
|
| 434 |
+
if '_bigram_' in key:
|
| 435 |
+
# Handle bigram measures
|
| 436 |
+
if 'bigram_details' in results and results['bigram_details']:
|
| 437 |
+
# Extract the correct column name from the key
|
| 438 |
+
# Raw scores key: 'COCA_spoken_bigram_frequency_token_bigram_frequency'
|
| 439 |
+
# Actual column: 'COCA_spoken_bigram_frequency_token_frequency'
|
| 440 |
+
# Remove the last occurrence of '_bigram' from the key
|
| 441 |
+
idx = key.rfind('_bigram')
|
| 442 |
+
if idx != -1:
|
| 443 |
+
index_measure_col = key[:idx] + key[idx+7:] # 7 = len('_bigram')
|
| 444 |
+
else:
|
| 445 |
+
index_measure_col = key
|
| 446 |
+
|
| 447 |
+
# Build mapping from bigram details
|
| 448 |
+
for bigram_detail in results['bigram_details']:
|
| 449 |
+
if index_measure_col in bigram_detail and bigram_detail[index_measure_col] is not None:
|
| 450 |
+
bigram_text = bigram_detail.get('bigram', '')
|
| 451 |
+
word_score_map[bigram_text] = bigram_detail[index_measure_col]
|
| 452 |
+
|
| 453 |
+
elif '_trigram_' in key:
|
| 454 |
+
# Handle trigram measures
|
| 455 |
+
if 'trigram_details' in results and results['trigram_details']:
|
| 456 |
+
# Extract the correct column name from the key
|
| 457 |
+
# Raw scores key: 'COCA_trigram_frequency_token_trigram_frequency'
|
| 458 |
+
# Actual column: 'COCA_trigram_frequency_token_frequency'
|
| 459 |
+
# Remove the last occurrence of '_trigram' from the key
|
| 460 |
+
idx = key.rfind('_trigram')
|
| 461 |
+
if idx != -1:
|
| 462 |
+
index_measure_col = key[:idx] + key[idx+8:] # 8 = len('_trigram')
|
| 463 |
+
else:
|
| 464 |
+
index_measure_col = key
|
| 465 |
+
|
| 466 |
+
# Build mapping from trigram details
|
| 467 |
+
for trigram_detail in results['trigram_details']:
|
| 468 |
+
if index_measure_col in trigram_detail and trigram_detail[index_measure_col] is not None:
|
| 469 |
+
trigram_text = trigram_detail.get('trigram', '')
|
| 470 |
+
word_score_map[trigram_text] = trigram_detail[index_measure_col]
|
| 471 |
+
|
| 472 |
+
else:
|
| 473 |
+
# Handle token-based measures (existing logic)
|
| 474 |
+
if 'token_details' in results:
|
| 475 |
+
# Handle key mismatch between raw_scores and token_details
|
| 476 |
+
# raw_scores keys may have suffixes like '_CW', '_FW', etc.
|
| 477 |
+
# while token_details uses the base column names
|
| 478 |
+
|
| 479 |
+
# Try to find matching column in token_details
|
| 480 |
+
matching_column = None
|
| 481 |
+
|
| 482 |
+
# First, try exact match
|
| 483 |
+
if any(key in token for token in results['token_details']):
|
| 484 |
+
matching_column = key
|
| 485 |
+
else:
|
| 486 |
+
# Try removing word type suffixes (_CW, _FW)
|
| 487 |
+
base_key = key
|
| 488 |
+
for suffix in ['_CW', '_FW']:
|
| 489 |
+
if key.endswith(suffix):
|
| 490 |
+
base_key = key[:-len(suffix)]
|
| 491 |
+
break
|
| 492 |
+
|
| 493 |
+
# Check if base key exists in token_details
|
| 494 |
+
if any(base_key in token for token in results['token_details']):
|
| 495 |
+
matching_column = base_key
|
| 496 |
+
else:
|
| 497 |
+
# Try finding partial matches for complex keys
|
| 498 |
+
for token in results['token_details']:
|
| 499 |
+
for col_name in token.keys():
|
| 500 |
+
if col_name != 'id' and col_name != 'token' and col_name != 'lemma' and col_name != 'pos' and col_name != 'tag' and col_name != 'word_type':
|
| 501 |
+
# Check if this column name is part of our key
|
| 502 |
+
if col_name in key or key.startswith(col_name):
|
| 503 |
+
matching_column = col_name
|
| 504 |
+
break
|
| 505 |
+
if matching_column:
|
| 506 |
+
break
|
| 507 |
+
|
| 508 |
+
# Build word-to-score mapping using the matching column
|
| 509 |
+
if matching_column:
|
| 510 |
+
for token in results['token_details']:
|
| 511 |
+
if matching_column in token and token[matching_column] is not None:
|
| 512 |
+
word_score_map[token['token']] = token[matching_column]
|
| 513 |
+
|
| 514 |
+
# Calculate number of bins
|
| 515 |
+
nbins = min(30, len(scores))
|
| 516 |
+
|
| 517 |
+
# Create figure and add histogram first to let Plotly calculate optimal bins
|
| 518 |
fig = go.Figure()
|
| 519 |
|
| 520 |
+
# Add histogram to get Plotly's binning
|
| 521 |
+
histogram_trace = go.Histogram(
|
| 522 |
x=scores,
|
| 523 |
+
nbinsx=nbins,
|
| 524 |
name='Histogram',
|
| 525 |
opacity=0.7,
|
| 526 |
histnorm='probability density'
|
| 527 |
+
)
|
| 528 |
+
fig.add_trace(histogram_trace)
|
| 529 |
+
|
| 530 |
+
# Extract Plotly's actual bin edges by creating a temporary figure to get the data
|
| 531 |
+
temp_fig = go.Figure()
|
| 532 |
+
temp_fig.add_trace(go.Histogram(x=scores, nbinsx=nbins))
|
| 533 |
+
|
| 534 |
+
# Calculate histogram using the same parameters as Plotly would use
|
| 535 |
+
# Plotly calculates bins similar to numpy's auto method
|
| 536 |
+
# We'll use numpy but ensure we get similar bin edges
|
| 537 |
+
hist_data, plotly_bin_edges = np.histogram(scores, bins=nbins)
|
| 538 |
+
|
| 539 |
+
# For better alignment with Plotly, we can also try to match Plotly's exact binning
|
| 540 |
+
# by using the range and number of bins
|
| 541 |
+
score_min, score_max = min(scores), max(scores)
|
| 542 |
+
# Add small padding as Plotly does
|
| 543 |
+
score_range = score_max - score_min
|
| 544 |
+
padding = score_range * 0.02 # Small padding like Plotly
|
| 545 |
+
adjusted_min = score_min - padding
|
| 546 |
+
adjusted_max = score_max + padding
|
| 547 |
+
|
| 548 |
+
# Create bins with the adjusted range
|
| 549 |
+
plotly_bin_edges = np.linspace(adjusted_min, adjusted_max, nbins + 1)
|
| 550 |
+
hist_data, _ = np.histogram(scores, bins=plotly_bin_edges)
|
| 551 |
+
|
| 552 |
+
# Assign words to bins using Plotly-aligned bin edges
|
| 553 |
+
bin_examples = {}
|
| 554 |
+
if word_score_map:
|
| 555 |
+
import random
|
| 556 |
+
for word, score in word_score_map.items():
|
| 557 |
+
bin_idx = np.digitize(score, plotly_bin_edges) - 1
|
| 558 |
+
bin_idx = max(0, min(bin_idx, len(plotly_bin_edges) - 2)) # Clamp to valid range
|
| 559 |
+
|
| 560 |
+
if bin_idx not in bin_examples:
|
| 561 |
+
bin_examples[bin_idx] = []
|
| 562 |
+
bin_examples[bin_idx].append(word)
|
| 563 |
+
|
| 564 |
+
# Randomly sample up to 3 words per bin
|
| 565 |
+
for bin_idx in bin_examples:
|
| 566 |
+
if len(bin_examples[bin_idx]) > 3:
|
| 567 |
+
bin_examples[bin_idx] = random.sample(bin_examples[bin_idx], 3)
|
| 568 |
+
|
| 569 |
+
# Create hover text for each bin using Plotly's bins
|
| 570 |
+
hover_texts = []
|
| 571 |
+
for i in range(len(hist_data)):
|
| 572 |
+
bin_start = plotly_bin_edges[i]
|
| 573 |
+
bin_end = plotly_bin_edges[i + 1]
|
| 574 |
+
examples = bin_examples.get(i, [])
|
| 575 |
+
|
| 576 |
+
hover_text = f"Range: {bin_start:.3f} - {bin_end:.3f}<br>"
|
| 577 |
+
hover_text += f"Count: {hist_data[i]}<br>"
|
| 578 |
+
if examples:
|
| 579 |
+
hover_text += f"Examples: {', '.join(examples)}"
|
| 580 |
+
else:
|
| 581 |
+
hover_text += "Examples: none"
|
| 582 |
+
|
| 583 |
+
hover_texts.append(hover_text)
|
| 584 |
+
|
| 585 |
+
# Clear the figure and rebuild with custom hover text
|
| 586 |
+
fig = go.Figure()
|
| 587 |
+
|
| 588 |
+
# Add histogram with custom hover text using the calculated bin edges
|
| 589 |
+
fig.add_trace(go.Histogram(
|
| 590 |
+
x=scores,
|
| 591 |
+
xbins=dict(
|
| 592 |
+
start=plotly_bin_edges[0],
|
| 593 |
+
end=plotly_bin_edges[-1],
|
| 594 |
+
size=(plotly_bin_edges[-1] - plotly_bin_edges[0]) / nbins
|
| 595 |
+
),
|
| 596 |
+
name='Histogram',
|
| 597 |
+
opacity=0.7,
|
| 598 |
+
histnorm='probability density',
|
| 599 |
+
hovertemplate='%{customdata}<extra></extra>',
|
| 600 |
+
customdata=hover_texts
|
| 601 |
))
|
| 602 |
|
| 603 |
# Calculate and add KDE curve
|
|
|
|
| 613 |
line=dict(color='red', width=2)
|
| 614 |
))
|
| 615 |
|
| 616 |
+
# Add mean line
|
| 617 |
+
mean_score = np.mean(scores)
|
| 618 |
+
fig.add_vline(
|
| 619 |
+
x=mean_score,
|
| 620 |
+
line_dash="dash",
|
| 621 |
+
line_color="green",
|
| 622 |
+
line_width=2,
|
| 623 |
+
annotation_text=f"Mean: {mean_score:.3f}",
|
| 624 |
+
annotation_position="top"
|
| 625 |
+
)
|
| 626 |
+
|
| 627 |
# Update layout
|
| 628 |
fig.update_layout(
|
| 629 |
title=f"Distribution of {key}",
|
|
|
|
| 638 |
|
| 639 |
@staticmethod
def render_enhanced_analysis_options():
    """Render the advanced analysis options panel.

    Thin delegating wrapper: all of the actual widget rendering lives in
    ``UIComponents.render_analysis_options()`` so the enhanced selection UI
    has a single implementation.

    Returns:
        The value produced by ``UIComponents.render_analysis_options()``
        (presumably the user's measure/log-transform selections — confirm
        against that method's contract).
    """
    selections = UIComponents.render_analysis_options()
    return selections
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
web_app/handlers/frequency_handlers.py
CHANGED
|
@@ -18,6 +18,7 @@ import sys
|
|
| 18 |
import os
|
| 19 |
from pathlib import Path
|
| 20 |
from io import StringIO, BytesIO
|
|
|
|
| 21 |
|
| 22 |
# Add parent directory to path for imports
|
| 23 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
|
|
@@ -111,7 +112,8 @@ class FrequencyHandlers:
|
|
| 111 |
df_preview = pd.read_csv(StringIO(content),
|
| 112 |
sep=st.session_state.format_info['separator'],
|
| 113 |
header=0 if st.session_state.format_info['has_header'] else None,
|
| 114 |
-
nrows=100
|
|
|
|
| 115 |
|
| 116 |
# Detect available columns
|
| 117 |
st.session_state.detected_cols = st.session_state.analyzer.detect_columns(df_preview)
|
|
|
|
| 18 |
import os
|
| 19 |
from pathlib import Path
|
| 20 |
from io import StringIO, BytesIO
|
| 21 |
+
import csv
|
| 22 |
|
| 23 |
# Add parent directory to path for imports
|
| 24 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
|
|
|
|
| 112 |
df_preview = pd.read_csv(StringIO(content),
|
| 113 |
sep=st.session_state.format_info['separator'],
|
| 114 |
header=0 if st.session_state.format_info['has_header'] else None,
|
| 115 |
+
nrows=100,
|
| 116 |
+
quoting=csv.QUOTE_MINIMAL, quotechar='"')
|
| 117 |
|
| 118 |
# Detect available columns
|
| 119 |
st.session_state.detected_cols = st.session_state.analyzer.detect_columns(df_preview)
|
web_app/utils/memory_file_handler.py
CHANGED
|
@@ -10,6 +10,7 @@ from io import BytesIO, StringIO
|
|
| 10 |
from typing import Optional, Union, Dict, Any
|
| 11 |
import pandas as pd
|
| 12 |
import zipfile
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
class MemoryFileHandler:
|
|
@@ -76,7 +77,8 @@ class MemoryFileHandler:
|
|
| 76 |
delimiter = ','
|
| 77 |
|
| 78 |
# Read directly into DataFrame
|
| 79 |
-
df = pd.read_csv(uploaded_file, delimiter=delimiter, encoding='utf-8'
|
|
|
|
| 80 |
return df
|
| 81 |
|
| 82 |
except Exception as e:
|
|
@@ -167,4 +169,4 @@ class MemoryFileHandler:
|
|
| 167 |
for key in keys_to_remove:
|
| 168 |
del st.session_state[key]
|
| 169 |
else:
|
| 170 |
-
st.session_state.clear()
|
|
|
|
| 10 |
from typing import Optional, Union, Dict, Any
|
| 11 |
import pandas as pd
|
| 12 |
import zipfile
|
| 13 |
+
import csv
|
| 14 |
|
| 15 |
|
| 16 |
class MemoryFileHandler:
|
|
|
|
| 77 |
delimiter = ','
|
| 78 |
|
| 79 |
# Read directly into DataFrame
|
| 80 |
+
df = pd.read_csv(uploaded_file, delimiter=delimiter, encoding='utf-8',
|
| 81 |
+
quoting=csv.QUOTE_MINIMAL, quotechar='"')
|
| 82 |
return df
|
| 83 |
|
| 84 |
except Exception as e:
|
|
|
|
| 169 |
for key in keys_to_remove:
|
| 170 |
del st.session_state[key]
|
| 171 |
else:
|
| 172 |
+
st.session_state.clear()
|