egumasa committed
Commit e7279e4 · 1 Parent(s): 42f8800
config/reference_lists.yaml CHANGED
@@ -88,7 +88,7 @@ english:
       measure_classifications:
         concreteness: psycholinguistic
       header_prefix: '#'
-
+
     concreteness_ratings_lemma:
       display_name: Concreteness Ratings (Lemma)
       description: Concreteness ratings for English words (1-5 scale) - lemma-based
@@ -241,6 +241,7 @@ english:
         normalized_freq: frequency
         documents: range
         range: range
+
     COCA_spoken_bigram_frequency_lemma:
       display_name: COCA Spoken Bigram Frequency (Lemma)
       description: Bigram frequencies and range data - lemma-based analysis
@@ -270,6 +271,7 @@ english:
         normalized_freq: frequency
         documents: range
         range: range
+
     COCA_spoken_bigram_association_token:
       display_name: COCA Spoken Bigram Associations (Token)
       description: Bigram association measures (MI, T-score, Delta P) - token-based
@@ -308,13 +310,21 @@ english:
         t_score: association
         delta_p: association
         ap_collex: association
+
     COCA_spoken_bigram_association_lemma:
       display_name: COCA Spoken Bigram Associations (Lemma)
       description: Bigram association measures (MI, T-score, Delta P) - lemma-based
         analysis
       file: resources/reference_lists/en/spoken_bigram_lemma_contingency.csv
       format: csv
-     columns: *id005
+     columns:
+       bigram: 0
+       frequency: 1
+       mi_score: 5
+       mi_2_score: 6
+       t_score: 7
+       delta_p: 8
+       ap_collex: 9
      has_header: true
      enabled: true
      analysis_type: lemma
@@ -339,6 +349,7 @@ english:
        t_score: association
        delta_p: association
        ap_collex: association
+
    COCA_magazine_bigram_frequency_token:
      display_name: COCA Magazine Bigram Frequency (Token)
      description: Bigram frequencies and range data in Magazine - token-based analysis
@@ -373,6 +384,7 @@ english:
        normalized_freq: frequency
        documents: range
        range: range
+
    COCA_magazine_bigram_frequency_lemma:
      display_name: COCA Magazine Bigram Frequency (Lemma)
      description: Bigram frequencies and range data in Magazine - lemma-based analysis
@@ -402,6 +414,7 @@ english:
        normalized_freq: frequency
        documents: range
        range: range
+
    COCA_magazine_bigram_association_token:
      display_name: COCA Magazine Bigram Associations (Token)
      description: Bigram association measures (MI, T-score, Delta P) - token-based
@@ -971,6 +984,7 @@ japanese:
        pos: unknown
        frequency: frequency
      japanese_corpus: true
+
    jp_frequency_token:
      display_name: Japanese Frequency List (Token)
      description: Frequency data for Japanese words - token-based analysis
@@ -992,6 +1006,7 @@ japanese:
      - frequency
      measure_classifications:
        frequency: frequency
+
    jp_frequency_lemma:
      display_name: Japanese Frequency List (Lemma)
      description: Frequency data for Japanese words - lemma-based analysis
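Note on the `columns` change above: `*id005` is a YAML alias, so before this commit the entry reused a column map anchored elsewhere in the file; the parsed config was identical, but inlining the mapping makes the entry self-contained and lets its layout diverge later. A minimal sketch of how such a name-to-index map resolves against a loaded CSV (the header names and row below are made up for illustration; only the positions mirror the YAML), in the spirit of the measure_mapping loop added to text_analyzer/lexical_sophistication.py later in this commit:

import io

import pandas as pd

# Illustrative stand-in for spoken_bigram_lemma_contingency.csv: ten columns,
# with the n-gram text in column 0 and measures at the positions given above.
csv_text = (
    "bigram,freq,c2,c3,c4,mi,mi2,t,dp,ap\n"
    "of the,951.0,693.0,0.02,1.17,8.03,7.5,21.3,0.036,657.4\n"
)
df = pd.read_csv(io.StringIO(csv_text))

# The inlined YAML mapping: measure name -> zero-based column index.
columns_config = {
    'bigram': 0, 'frequency': 1, 'mi_score': 5, 'mi_2_score': 6,
    't_score': 7, 'delta_p': 8, 'ap_collex': 9,
}

# Resolve indices to the DataFrame's actual column names.
measure_mapping = {
    df.columns[idx]: name
    for name, idx in columns_config.items()
    if isinstance(idx, int) and idx < len(df.columns)
}
print(measure_mapping)
# {'bigram': 'bigram', 'freq': 'frequency', 'mi': 'mi_score', 'mi2': 'mi_2_score',
#  't': 't_score', 'dp': 'delta_p', 'ap': 'ap_collex'}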
debug_bigram_trigram.py ADDED
@@ -0,0 +1,78 @@
+ #!/usr/bin/env python3
+ """
+ Debug script to test bigram and trigram processing
+ """
+ 
+ import sys
+ import os
+ 
+ # Add the project root to the path
+ sys.path.insert(0, os.getcwd())
+ 
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ from web_app.config_manager import ConfigManager
+ 
+ # Test simple text
+ test_text = "The cat sat on the mat. The dog ran quickly."
+ 
+ # Create analyzer
+ analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+ 
+ # Load config
+ config = ConfigManager.load_reference_config()
+ english_config = config.get('english', {})
+ 
+ print("=== Available Reference Lists ===")
+ for ngram_type, lists in english_config.items():
+     print(f"\n{ngram_type.upper()}:")
+     for list_name, list_config in lists.items():
+         if list_config.get('enabled', True):
+             print(f"  - {list_name}")
+ 
+ # Test loading a bigram reference
+ print("\n=== Testing Bigram Reference Loading ===")
+ bigram_config = english_config.get('bigrams', {}).get('COCA_spoken_bigram_frequency_token', {})
+ if bigram_config:
+     print(f"Config: {bigram_config}")
+ 
+     # Load the data
+     data = ConfigManager.load_reference_list_data(bigram_config)
+     print(f"Loaded data keys: {data.keys()}")
+ 
+     if 'bigram' in data:
+         bigram_df = data['bigram']
+         print(f"Bigram DataFrame shape: {bigram_df.shape}")
+         print(f"Bigram DataFrame columns: {list(bigram_df.columns)}")
+         print("First 5 bigrams:")
+         print(bigram_df.head())
+ 
+ # Test with full reference list structure
+ print("\n=== Testing Analyzer with Bigram References ===")
+ reference_lists = {
+     'COCA_spoken_bigram_frequency_token': ConfigManager.load_reference_list_data(bigram_config)
+ }
+ 
+ print(f"Reference lists for analyzer: {list(reference_lists.keys())}")
+ for name, data in reference_lists.items():
+     print(f"  {name}: {list(data.keys())}")
+ 
+ # Load into analyzer
+ analyzer.load_reference_lists(reference_lists)
+ 
+ # Analyze text
+ results = analyzer.analyze_text(
+     test_text,
+     list(reference_lists.keys()),
+     apply_log=False
+ )
+ 
+ print("\n=== Analysis Results ===")
+ print(f"Summary keys: {list(results['summary'].keys())}")
+ print(f"Raw scores keys: {list(results['raw_scores'].keys())}")
+ print(f"Bigram details count: {len(results.get('bigram_details', []))}")
+ print(f"Trigram details count: {len(results.get('trigram_details', []))}")
+ 
+ if results.get('bigram_details'):
+     print("\nFirst few bigram details:")
+     for detail in results['bigram_details'][:3]:
+         print(f"  {detail}")
debug_plot_columns.py ADDED
@@ -0,0 +1,107 @@
+ #!/usr/bin/env python3
+ """
+ Debug script to examine column naming issues in bigram/trigram plots
+ """
+ 
+ import sys
+ import os
+ 
+ # Add the project root to the path
+ sys.path.insert(0, os.getcwd())
+ 
+ from web_app.config_manager import ConfigManager
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ 
+ def debug_plot_columns():
+     print("=== Debugging Plot Column Names ===")
+ 
+     # Load config and create reference lists
+     config = ConfigManager.load_reference_config()
+     english_config = config.get('english', {})
+ 
+     reference_lists = {}
+ 
+     # Load a unigram, bigram, and trigram reference
+     unigram_config = english_config['unigrams']['COCA_spoken_frequency_token']
+     bigram_config = english_config['bigrams']['COCA_spoken_bigram_frequency_token']
+     trigram_config = english_config['trigrams']['COCA_trigram_frequency_token']
+ 
+     reference_lists['COCA_spoken_frequency_token'] = ConfigManager.load_reference_list_data(unigram_config)
+     reference_lists['COCA_spoken_bigram_frequency_token'] = ConfigManager.load_reference_list_data(bigram_config)
+     reference_lists['COCA_trigram_frequency_token'] = ConfigManager.load_reference_list_data(trigram_config)
+ 
+     # Create analyzer and analyze text
+     analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+     analyzer.load_reference_lists(reference_lists)
+ 
+     test_text = "The cat sat on the mat. The dog ran quickly."
+     results = analyzer.analyze_text(test_text, list(reference_lists.keys()), apply_log=False)
+ 
+     print("\n=== Raw Scores Keys ===")
+     for key in results['raw_scores'].keys():
+         print(f"  {key}")
+ 
+     print("\n=== Token Details Columns ===")
+     if results['token_details']:
+         print(f"  Sample token: {list(results['token_details'][0].keys())}")
+ 
+     print("\n=== Bigram Details Columns ===")
+     if results['bigram_details']:
+         print(f"  Sample bigram: {list(results['bigram_details'][0].keys())}")
+ 
+     print("\n=== Trigram Details Columns ===")
+     if results['trigram_details']:
+         print(f"  Sample trigram: {list(results['trigram_details'][0].keys())}")
+ 
+     print("\n=== Column Matching Analysis ===")
+ 
+     # Test the current algorithm for bigrams
+     for key in results['raw_scores'].keys():
+         if '_bigram_' in key:
+             print(f"\nAnalyzing bigram key: {key}")
+             key_parts = key.split('_')
+             if len(key_parts) >= 3 and 'bigram' in key_parts:
+                 measure_name = '_'.join(key_parts[key_parts.index('bigram') + 1:])
+                 index_measure_col = f"{key_parts[0]}_{measure_name}"
+                 print(f"  Algorithm expects column: '{index_measure_col}'")
+ 
+                 # Check if this column exists in bigram_details
+                 if results['bigram_details']:
+                     sample_bigram = results['bigram_details'][0]
+                     if index_measure_col in sample_bigram:
+                         print(f"  ✅ Column found in bigram_details")
+                     else:
+                         print(f"  ❌ Column NOT found in bigram_details")
+                         print(f"  Available columns: {list(sample_bigram.keys())}")
+ 
+                         # Try to find the correct column
+                         for col in sample_bigram.keys():
+                             if measure_name in col:
+                                 print(f"  Possible match: '{col}'")
+ 
+     # Test the current algorithm for trigrams
+     for key in results['raw_scores'].keys():
+         if '_trigram_' in key:
+             print(f"\nAnalyzing trigram key: {key}")
+             key_parts = key.split('_')
+             if len(key_parts) >= 3 and 'trigram' in key_parts:
+                 measure_name = '_'.join(key_parts[key_parts.index('trigram') + 1:])
+                 index_measure_col = f"{key_parts[0]}_{measure_name}"
+                 print(f"  Algorithm expects column: '{index_measure_col}'")
+ 
+                 # Check if this column exists in trigram_details
+                 if results['trigram_details']:
+                     sample_trigram = results['trigram_details'][0]
+                     if index_measure_col in sample_trigram:
+                         print(f"  ✅ Column found in trigram_details")
+                     else:
+                         print(f"  ❌ Column NOT found in trigram_details")
+                         print(f"  Available columns: {list(sample_trigram.keys())}")
+ 
+                         # Try to find the correct column
+                         for col in sample_trigram.keys():
+                             if measure_name in col:
+                                 print(f"  Possible match: '{col}'")
+ 
+ if __name__ == "__main__":
+     debug_plot_columns()
test/test_advanced_selection.py ADDED
@@ -0,0 +1,174 @@
+ #!/usr/bin/env python3
+ """
+ Test script to verify the advanced selection UI implementation.
+ """
+ 
+ import sys
+ import os
+ from pathlib import Path
+ 
+ # Add project root to path (this script lives in test/)
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))
+ 
+ def test_ui_components_grouping():
+     """Test the grouping functionality of UI components."""
+     print("Testing UI Components Grouping...")
+ 
+     try:
+         from web_app.components.ui_components import UIComponents
+         from web_app.config_manager import ConfigManager
+ 
+         # Load the configuration
+         config = ConfigManager.load_reference_config()
+ 
+         # Simulate reference lists
+         mock_reference_lists = {
+             'COCA_spoken_frequency_token': {},
+             'COCA_spoken_frequency_lemma': {},
+             'concreteness_ratings_token': {},
+             'concreteness_ratings_lemma': {}
+         }
+ 
+         # Test grouping function
+         groups = UIComponents._group_reference_lists(mock_reference_lists, config)
+ 
+         print(f"✅ Grouping successful! Found {len(groups)} groups:")
+         for base_name, group_data in groups.items():
+             token_count = len(group_data['token'])
+             lemma_count = len(group_data['lemma'])
+             print(f"  - {base_name}: {token_count} token entries, {lemma_count} lemma entries")
+ 
+         return True
+ 
+     except Exception as e:
+         print(f"❌ Grouping test failed: {e}")
+         return False
+ 
+ def test_config_structure():
+     """Test that the configuration has the expected structure."""
+     print("\nTesting Configuration Structure...")
+ 
+     try:
+         from web_app.config_manager import ConfigManager
+ 
+         config = ConfigManager.load_reference_config()
+ 
+         # Check for expected keys
+         expected_sections = ['english', 'japanese']
+         found_sections = []
+ 
+         for section in expected_sections:
+             if section in config:
+                 found_sections.append(section)
+                 print(f"  ✅ Found {section} section")
+ 
+                 # Check for subsections
+                 for subsection in ['unigrams', 'bigrams', 'trigrams']:
+                     if subsection in config[section]:
+                         entries = len(config[section][subsection])
+                         print(f"    - {subsection}: {entries} entries")
+ 
+         if found_sections:
+             print(f"✅ Configuration structure valid!")
+ 
+             # Check for advanced selection fields
+             sample_entry = None
+             for lang in config.values():
+                 if isinstance(lang, dict):
+                     for ngram_type in lang.values():
+                         if isinstance(ngram_type, dict):
+                             for entry_name, entry_config in ngram_type.items():
+                                 sample_entry = entry_config
+                                 break
+                             break
+                     break
+ 
+             if sample_entry:
+                 required_fields = ['selectable_measures', 'default_measures', 'default_log_transforms', 'log_transformable']
+                 missing_fields = []
+ 
+                 for field in required_fields:
+                     if field not in sample_entry:
+                         missing_fields.append(field)
+                     else:
+                         print(f"  ✅ Found {field}: {sample_entry[field]}")
+ 
+                 if missing_fields:
+                     print(f"  ⚠️ Missing fields: {missing_fields}")
+                 else:
+                     print("  ✅ All advanced selection fields present!")
+ 
+             return True
+         else:
+             print("❌ No valid configuration sections found")
+             return False
+ 
+     except Exception as e:
+         print(f"❌ Configuration test failed: {e}")
+         return False
+ 
+ def test_analyzer_parameters():
+     """Test that the analyzer accepts the new parameters."""
+     print("\nTesting Analyzer Parameters...")
+ 
+     try:
+         from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ 
+         # Create analyzer
+         analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+ 
+         # Test parameter signature
+         import inspect
+         analyze_signature = inspect.signature(analyzer.analyze_text)
+         params = list(analyze_signature.parameters.keys())
+ 
+         required_params = ['log_transforms', 'selected_measures']
+         found_params = []
+ 
+         for param in required_params:
+             if param in params:
+                 found_params.append(param)
+                 print(f"  ✅ Found parameter: {param}")
+             else:
+                 print(f"  ❌ Missing parameter: {param}")
+ 
+         if len(found_params) == len(required_params):
+             print("✅ Analyzer has all required parameters!")
+             return True
+         else:
+             print(f"❌ Analyzer missing {len(required_params) - len(found_params)} parameters")
+             return False
+ 
+     except Exception as e:
+         print(f"❌ Analyzer test failed: {e}")
+         return False
+ 
+ def main():
+     """Run all tests."""
+     print("🧪 Testing Advanced Selection Implementation\n")
+ 
+     tests = [
+         test_config_structure,
+         test_ui_components_grouping,
+         test_analyzer_parameters
+     ]
+ 
+     passed = 0
+     total = len(tests)
+ 
+     for test in tests:
+         if test():
+             passed += 1
+ 
+     print(f"\n📊 Test Results: {passed}/{total} tests passed")
+ 
+     if passed == total:
+         print("🎉 All tests passed! Advanced selection implementation is ready.")
+         return 0
+     else:
+         print("⚠️ Some tests failed. Please check the implementation.")
+         return 1
+ 
+ if __name__ == "__main__":
+     exit(main())
test/test_column_naming_fix.py ADDED
@@ -0,0 +1,95 @@
+ #!/usr/bin/env python3
+ """
+ Test script to verify that the column naming bug is fixed.
+ This script specifically tests that we don't get duplicate suffixes like:
+ - COCA_spoken_frequency_token_token
+ - COCA_spoken_frequency_lemma_lemma
+ """
+ 
+ import sys
+ import os
+ from pathlib import Path
+ 
+ # Add project root to path (this script lives in test/)
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))
+ 
+ def test_column_naming():
+     """Test that column names are clean without duplicate suffixes."""
+     print("Testing Column Naming Fix...")
+ 
+     try:
+         from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ 
+         # Create analyzer
+         analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+ 
+         # Create mock reference lists
+         mock_reference_lists = {
+             'COCA_spoken_frequency_token': {
+                 'token': {'hello': 100, 'world': 200}
+             },
+             'COCA_spoken_frequency_lemma': {
+                 'lemma': {'hello': 150, 'world': 250}
+             }
+         }
+ 
+         # Load reference lists
+         analyzer.load_reference_lists(mock_reference_lists)
+ 
+         # Analyze a simple text
+         text = "Hello world, this is a test."
+         selected_indices = ['COCA_spoken_frequency_token', 'COCA_spoken_frequency_lemma']
+ 
+         results = analyzer.analyze_text(text, selected_indices)
+ 
+         # Check token details for clean column names
+         if results['token_details']:
+             first_token = results['token_details'][0]
+             column_names = list(first_token.keys())
+ 
+             print(f"Column names found: {column_names}")
+ 
+             # Check for problematic duplicate suffixes
+             problematic_columns = []
+             for col in column_names:
+                 if '_token_token' in col or '_lemma_lemma' in col or '_token_lemma' in col or '_lemma_token' in col:
+                     problematic_columns.append(col)
+ 
+             if problematic_columns:
+                 print(f"❌ Found problematic column names: {problematic_columns}")
+                 return False
+             else:
+                 print("✅ No duplicate suffixes found in column names!")
+ 
+                 # Check that we have the expected clean column names
+                 expected_clean_columns = ['COCA_spoken_frequency_token', 'COCA_spoken_frequency_lemma']
+                 found_clean_columns = [col for col in column_names if col in expected_clean_columns]
+ 
+                 if found_clean_columns:
+                     print(f"✅ Found expected clean columns: {found_clean_columns}")
+                     return True
+                 else:
+                     print(f"⚠️ Expected clean columns not found. Available columns: {column_names}")
+                     return False
+         else:
+             print("❌ No token details found in results")
+             return False
+ 
+     except Exception as e:
+         print(f"❌ Test failed with error: {e}")
+         return False
+ 
+ def main():
+     """Run the column naming test."""
+     print("🧪 Testing Column Naming Fix\n")
+ 
+     if test_column_naming():
+         print("\n🎉 Column naming bug has been fixed!")
+         return 0
+     else:
+         print("\n⚠️ Column naming issue still exists.")
+         return 1
+ 
+ if __name__ == "__main__":
+     exit(main())
test_file_upload_handler.py → test/test_file_upload_handler.py RENAMED
File without changes
test_fix_403.py → test/test_fix_403.py RENAMED
File without changes
test/test_log_transform_consistency.py ADDED
@@ -0,0 +1,121 @@
+ #!/usr/bin/env python3
+ """
+ Test script to verify that log transformations are consistently applied
+ to both token details table and summary statistics.
+ """
+ 
+ import sys
+ import os
+ from pathlib import Path
+ import numpy as np
+ 
+ # Add project root to path (this script lives in test/)
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))
+ 
+ def test_log_transform_consistency():
+     """Test that log transformations are applied consistently."""
+     print("Testing Log Transform Consistency...")
+ 
+     try:
+         from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ 
+         # Create analyzer
+         analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+ 
+         # Create mock reference lists with known values
+         mock_reference_lists = {
+             'COCA_spoken_frequency_token': {
+                 'token': {'hello': 1000, 'world': 2000}  # Values that will be log-transformed
+             }
+         }
+ 
+         # Load reference lists
+         analyzer.load_reference_lists(mock_reference_lists)
+ 
+         # Analyze text with log transformation enabled
+         text = "Hello world"
+         selected_indices = ['COCA_spoken_frequency_token']
+ 
+         # Enable log transformation for this index
+         log_transforms = {
+             'COCA_spoken_frequency_token': ['frequency']
+         }
+ 
+         results = analyzer.analyze_text(
+             text,
+             selected_indices,
+             log_transforms=log_transforms
+         )
+ 
+         # Get token details and summary statistics
+         token_details = results['token_details']
+         summary_stats = results['summary']
+ 
+         print(f"Token details: {len(token_details)} tokens")
+         print(f"Summary keys: {list(summary_stats.keys())}")
+ 
+         # Check consistency between token details and summary
+         if token_details:
+             # Get log-transformed values from token details
+             token_scores = []
+             for token_detail in token_details:
+                 score = token_detail.get('COCA_spoken_frequency_token')
+                 if score is not None:
+                     token_scores.append(score)
+                     print(f"Token '{token_detail['token']}': score = {score}")
+ 
+             if token_scores:
+                 # Calculate mean from token details
+                 token_mean = np.mean(token_scores)
+ 
+                 # Get mean from summary statistics
+                 summary_key = 'COCA_spoken_frequency_token_CW'  # Content words
+                 if summary_key in summary_stats:
+                     summary_mean = summary_stats[summary_key]['mean']
+ 
+                     print(f"Token details mean: {token_mean}")
+                     print(f"Summary stats mean: {summary_mean}")
+ 
+                     # Check if they're approximately equal (allowing for floating point precision)
+                     if abs(token_mean - summary_mean) < 0.001:
+                         print("✅ Token details and summary statistics are consistent!")
+ 
+                         # Check that values are actually log-transformed
+                         # Original values were 1000 and 2000, log10 would be ~3.0 and ~3.3
+                         if all(2.5 < score < 3.5 for score in token_scores):
+                             print("✅ Values appear to be properly log-transformed!")
+                             return True
+                         else:
+                             print(f"⚠️ Values don't appear to be log-transformed: {token_scores}")
+                             return False
+                     else:
+                         print(f"❌ Inconsistency found: token mean = {token_mean}, summary mean = {summary_mean}")
+                         return False
+                 else:
+                     print(f"❌ Summary key '{summary_key}' not found in summary stats")
+                     return False
+             else:
+                 print("❌ No token scores found")
+                 return False
+         else:
+             print("❌ No token details found")
+             return False
+ 
+     except Exception as e:
+         print(f"❌ Test failed with error: {e}")
+         return False
+ 
+ def main():
+     """Run the log transform consistency test."""
+     print("🧪 Testing Log Transform Consistency\n")
+ 
+     if test_log_transform_consistency():
+         print("\n🎉 Log transformation consistency has been fixed!")
+         return 0
+     else:
+         print("\n⚠️ Log transformation consistency issue still exists.")
+         return 1
+ 
+ if __name__ == "__main__":
+     exit(main())
test_memory_upload.py → test/test_memory_upload.py RENAMED
File without changes
test_tmp_upload.py → test/test_tmp_upload.py RENAMED
File without changes
test_column_matching.py ADDED
@@ -0,0 +1,50 @@
+ #!/usr/bin/env python3
+ """
+ Test script to understand the exact column matching pattern
+ """
+ 
+ # Test the pattern manually
+ raw_scores_keys = [
+     'COCA_spoken_bigram_frequency_token_bigram_frequency',
+     'COCA_spoken_bigram_frequency_token_bigram_normalized_freq',
+     'COCA_trigram_frequency_token_trigram_frequency'
+ ]
+ 
+ actual_columns = [
+     'COCA_spoken_bigram_frequency_token_frequency',
+     'COCA_spoken_bigram_frequency_token_normalized_freq',
+     'COCA_trigram_frequency_token_frequency'
+ ]
+ 
+ print("=== Pattern Analysis ===")
+ for raw_key, expected_col in zip(raw_scores_keys, actual_columns):
+     print(f"\nRaw key: {raw_key}")
+     print(f"Expected: {expected_col}")
+ 
+     # The correct pattern - remove only the redundant _bigram or _trigram from the end measure
+     if '_bigram_' in raw_key:
+         # Find the last occurrence of '_bigram'
+         idx = raw_key.rfind('_bigram')
+         if idx != -1:
+             # Remove only the '_bigram' part, keep everything else
+             strategy = raw_key[:idx] + raw_key[idx+7:]  # 7 = len('_bigram')
+         else:
+             strategy = raw_key
+     elif '_trigram_' in raw_key:
+         # Find the last occurrence of '_trigram'
+         idx = raw_key.rfind('_trigram')
+         if idx != -1:
+             # Remove only the '_trigram' part, keep everything else
+             strategy = raw_key[:idx] + raw_key[idx+8:]  # 8 = len('_trigram')
+         else:
+             strategy = raw_key
+     else:
+         strategy = raw_key
+ 
+     print(f"Strategy (remove last _bigram/_trigram): {strategy}")
+ 
+     if strategy == expected_col:
+         print("✅ This strategy works!")
+     else:
+         print("❌ Still doesn't work")
+         print(f"Difference: '{strategy}' vs '{expected_col}'")
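The winning strategy above packages naturally into a small helper; this is only a restatement of the logic the loop verifies, with a hypothetical function name:

def strip_ngram_marker(raw_key: str) -> str:
    # Hypothetical helper: drop the last '_bigram'/'_trigram' marker from a
    # raw-scores key, mirroring the rfind strategy tested above.
    for marker in ('_bigram', '_trigram'):
        if f'{marker}_' in raw_key:
            idx = raw_key.rfind(marker)
            return raw_key[:idx] + raw_key[idx + len(marker):]
    return raw_key

assert strip_ngram_marker(
    'COCA_spoken_bigram_frequency_token_bigram_frequency'
) == 'COCA_spoken_bigram_frequency_token_frequency'
assert strip_ngram_marker(
    'COCA_trigram_frequency_token_trigram_frequency'
) == 'COCA_trigram_frequency_token_frequency'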
test_csv_comma_handling.py ADDED
@@ -0,0 +1,72 @@
+ #!/usr/bin/env python3
+ """
+ Test script to demonstrate CSV comma handling with and without quoting parameters.
+ """
+ 
+ import pandas as pd
+ import csv
+ from io import StringIO
+ 
+ def test_csv_comma_handling():
+     """Test how different CSV reading approaches handle commas in data."""
+ 
+     # Sample problematic CSV data
+     problematic_csv = """word,freq1,freq2,freq3,other1,other2,other3,other4,other5,other6
+ murder in,951.0,11.2359497461,693.0,0.0211467455982,1.17513238089,8.03264644343,21.3160999278,0.0364941871107,657.479274987
+ $ 100,000,950.0,11.2241348673,710.0,0.0216654969333,6.51710183621,13.3735638208,30.7765166526,0.0169430040291,949.18097172
+ normal_word,800.0,10.5,600.0,0.02,1.5,7.2,18.5,0.04,500.0"""
+ 
+     # Properly quoted CSV data
+     quoted_csv = """word,freq1,freq2,freq3,other1,other2,other3,other4,other5,other6
+ "murder in",951.0,11.2359497461,693.0,0.0211467455982,1.17513238089,8.03264644343,21.3160999278,0.0364941871107,657.479274987
+ "$ 100,000",950.0,11.2241348673,710.0,0.0216654969333,6.51710183621,13.3735638208,30.7765166526,0.0169430040291,949.18097172
+ normal_word,800.0,10.5,600.0,0.02,1.5,7.2,18.5,0.04,500.0"""
+ 
+     print("=== Testing CSV Comma Handling ===\n")
+ 
+     # Test 1: Default pandas behavior (problematic)
+     print("1. Default pandas behavior with problematic CSV:")
+     try:
+         df_default = pd.read_csv(StringIO(problematic_csv))
+         print(f"   Columns detected: {len(df_default.columns)}")
+         print(f"   Column names: {list(df_default.columns)}")
+         print(f"   First row data: {df_default.iloc[0].tolist()}")
+         print(f"   Shape: {df_default.shape}")
+     except Exception as e:
+         print(f"   Error: {e}")
+     print()
+ 
+     # Test 2: With quoting parameters (our solution)
+     print("2. With quoting parameters (our solution):")
+     try:
+         df_quoted = pd.read_csv(StringIO(problematic_csv),
+                                 quoting=csv.QUOTE_MINIMAL, quotechar='"')
+         print(f"   Columns detected: {len(df_quoted.columns)}")
+         print(f"   Column names: {list(df_quoted.columns)}")
+         print(f"   First row data: {df_quoted.iloc[0].tolist()}")
+         print(f"   Shape: {df_quoted.shape}")
+     except Exception as e:
+         print(f"   Error: {e}")
+     print()
+ 
+     # Test 3: With properly quoted CSV
+     print("3. With properly quoted CSV data:")
+     try:
+         df_proper = pd.read_csv(StringIO(quoted_csv))
+         print(f"   Columns detected: {len(df_proper.columns)}")
+         print(f"   Column names: {list(df_proper.columns)}")
+         print(f"   First row word: '{df_proper.iloc[0]['word']}'")
+         print(f"   Second row word: '{df_proper.iloc[1]['word']}'")
+         print(f"   Shape: {df_proper.shape}")
+     except Exception as e:
+         print(f"   Error: {e}")
+     print()
+ 
+     # Test 4: Show the difference
+     print("4. Comparison of approaches:")
+     print("   Without quoting: Data with commas gets split incorrectly")
+     print("   With quoting: pandas can handle quoted fields properly")
+     print("   Best practice: Quote fields that contain commas in the source CSV")
+ 
+ if __name__ == "__main__":
+     test_csv_comma_handling()
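On the writing side, the stdlib csv writer with the same parameters emits the "properly quoted" form the script recommends; a minimal sketch, not code from this repo:

import csv
import io

buf = io.StringIO()
writer = csv.writer(buf, quoting=csv.QUOTE_MINIMAL, quotechar='"', lineterminator='\n')
writer.writerow(['word', 'freq1'])
writer.writerow(['$ 100,000', 950.0])    # field contains a comma -> gets quoted
writer.writerow(['normal_word', 800.0])  # no special characters -> left bare
print(buf.getvalue())
# word,freq1
# "$ 100,000",950.0
# normal_word,800.0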
test_plot_fix.py ADDED
@@ -0,0 +1,106 @@
+ #!/usr/bin/env python3
+ """
+ Test script to verify the fix for bigram/trigram plot sample words
+ """
+ 
+ import sys
+ import os
+ 
+ # Add the project root to the path
+ sys.path.insert(0, os.getcwd())
+ 
+ from web_app.config_manager import ConfigManager
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ 
+ def test_plot_fix():
+     print("=== Testing Plot Fix ===")
+ 
+     # Load config and create reference lists
+     config = ConfigManager.load_reference_config()
+     english_config = config.get('english', {})
+ 
+     reference_lists = {}
+ 
+     # Load a unigram, bigram, and trigram reference
+     unigram_config = english_config['unigrams']['COCA_spoken_frequency_token']
+     bigram_config = english_config['bigrams']['COCA_spoken_bigram_frequency_token']
+     trigram_config = english_config['trigrams']['COCA_trigram_frequency_token']
+ 
+     reference_lists['COCA_spoken_frequency_token'] = ConfigManager.load_reference_list_data(unigram_config)
+     reference_lists['COCA_spoken_bigram_frequency_token'] = ConfigManager.load_reference_list_data(bigram_config)
+     reference_lists['COCA_trigram_frequency_token'] = ConfigManager.load_reference_list_data(trigram_config)
+ 
+     # Create analyzer and analyze text
+     analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+     analyzer.load_reference_lists(reference_lists)
+ 
+     test_text = "The cat sat on the mat. The dog ran quickly."
+     results = analyzer.analyze_text(test_text, list(reference_lists.keys()), apply_log=False)
+ 
+     print("\n=== Testing Column Matching with Fixed Algorithm ===")
+ 
+     # Test the fixed algorithm for bigrams
+     for key in results['raw_scores'].keys():
+         if '_bigram_' in key:
+             print(f"\nTesting bigram key: {key}")
+             # Use the new algorithm: remove '_bigram' from the key
+             index_measure_col = key.replace('_bigram', '')
+             print(f"  Fixed algorithm expects column: '{index_measure_col}'")
+ 
+             # Check if this column exists in bigram_details
+             if results['bigram_details']:
+                 sample_bigram = results['bigram_details'][0]
+                 if index_measure_col in sample_bigram:
+                     print(f"  ✅ Column found in bigram_details")
+ 
+                     # Test if we can build word_score_map successfully
+                     word_score_map = {}
+                     for bigram_detail in results['bigram_details']:
+                         if index_measure_col in bigram_detail and bigram_detail[index_measure_col] is not None:
+                             bigram_text = bigram_detail.get('bigram', '')
+                             word_score_map[bigram_text] = bigram_detail[index_measure_col]
+ 
+                     print(f"  ✅ Successfully built word_score_map with {len(word_score_map)} entries")
+                     if word_score_map:
+                         sample_entries = list(word_score_map.items())[:3]
+                         print(f"  Sample entries: {sample_entries}")
+                 else:
+                     print(f"  ❌ Column still NOT found in bigram_details")
+ 
+     # Test the fixed algorithm for trigrams
+     for key in results['raw_scores'].keys():
+         if '_trigram_' in key:
+             print(f"\nTesting trigram key: {key}")
+             # Use the new algorithm: remove '_trigram' from the key
+             index_measure_col = key.replace('_trigram', '')
+             print(f"  Fixed algorithm expects column: '{index_measure_col}'")
+ 
+             # Check if this column exists in trigram_details
+             if results['trigram_details']:
+                 sample_trigram = results['trigram_details'][0]
+                 if index_measure_col in sample_trigram:
+                     print(f"  ✅ Column found in trigram_details")
+ 
+                     # Test if we can build word_score_map successfully
+                     word_score_map = {}
+                     for trigram_detail in results['trigram_details']:
+                         if index_measure_col in trigram_detail and trigram_detail[index_measure_col] is not None:
+                             trigram_text = trigram_detail.get('trigram', '')
+                             word_score_map[trigram_text] = trigram_detail[index_measure_col]
+ 
+                     print(f"  ✅ Successfully built word_score_map with {len(word_score_map)} entries")
+                     if word_score_map:
+                         sample_entries = list(word_score_map.items())[:3]
+                         print(f"  Sample entries: {sample_entries}")
+                 else:
+                     print(f"  ❌ Column still NOT found in trigram_details")
+ 
+     print("\n=== Fix Verification Complete ===")
+     if any('_bigram_' in key for key in results['raw_scores'].keys()) and any('_trigram_' in key for key in results['raw_scores'].keys()):
+         print("✅ Fix appears to be working correctly!")
+         print("Sample words should now appear in bigram and trigram plots.")
+     else:
+         print("❌ No bigram/trigram results found to test")
+ 
+ if __name__ == "__main__":
+     test_plot_fix()
test_reference_loading_issue.py ADDED
@@ -0,0 +1,116 @@
+ #!/usr/bin/env python3
+ """
+ Test script to diagnose the reference loading issue in the web app
+ """
+ 
+ import sys
+ import os
+ 
+ # Add the project root to the path
+ sys.path.insert(0, os.getcwd())
+ 
+ from web_app.config_manager import ConfigManager
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ 
+ def test_reference_loading():
+     print("=== Testing Reference Loading Issue ===")
+ 
+     # Load config
+     config = ConfigManager.load_reference_config()
+     english_config = config.get('english', {})
+ 
+     print(f"\nAvailable sections in config: {list(english_config.keys())}")
+ 
+     # Test what happens when we simulate loading different types of references
+     print("\n=== Simulating Reference List Selection ===")
+ 
+     # Simulate selecting some unigrams, bigrams, and trigrams
+     selected_lists = []
+ 
+     # Add a unigram
+     if 'unigrams' in english_config and 'COCA_spoken_frequency_token' in english_config['unigrams']:
+         unigram_config = english_config['unigrams']['COCA_spoken_frequency_token']
+         selected_lists.append(('unigrams', 'COCA_spoken_frequency_token', unigram_config))
+         print("Added unigram: COCA_spoken_frequency_token")
+ 
+     # Add a bigram
+     if 'bigrams' in english_config and 'COCA_spoken_bigram_frequency_token' in english_config['bigrams']:
+         bigram_config = english_config['bigrams']['COCA_spoken_bigram_frequency_token']
+         selected_lists.append(('bigrams', 'COCA_spoken_bigram_frequency_token', bigram_config))
+         print("Added bigram: COCA_spoken_bigram_frequency_token")
+ 
+     # Add a trigram
+     if 'trigrams' in english_config and 'COCA_trigram_frequency_token' in english_config['trigrams']:
+         trigram_config = english_config['trigrams']['COCA_trigram_frequency_token']
+         selected_lists.append(('trigrams', 'COCA_trigram_frequency_token', trigram_config))
+         print("Added trigram: COCA_trigram_frequency_token")
+ 
+     print(f"\nTotal selected lists: {len(selected_lists)}")
+ 
+     # Load reference data directly
+     reference_lists = {}
+     for ngram_type, list_key, list_config in selected_lists:
+         print(f"\nLoading {ngram_type}: {list_key}")
+         data = ConfigManager.load_reference_list_data(list_config)
+ 
+         if data:
+             print(f"  Data keys: {list(data.keys())}")
+             for key, value in data.items():
+                 if hasattr(value, '__len__'):
+                     print(f"  {key}: {len(value)} entries")
+                 else:
+                     print(f"  {key}: {type(value)}")
+ 
+             reference_lists[list_key] = data
+         else:
+             print(f"  Failed to load data for {list_key}")
+ 
+     # Check what was loaded
+     print(f"\n=== Loaded Reference Lists ===")
+     print(f"Keys loaded: {list(reference_lists.keys())}")
+ 
+     for key, data in reference_lists.items():
+         print(f"\n{key}:")
+         for file_type, file_data in data.items():
+             if hasattr(file_data, '__len__'):
+                 print(f"  {file_type}: {len(file_data)} entries")
+             else:
+                 print(f"  {file_type}: {type(file_data)}")
+ 
+     # Test analysis with these reference lists
+     print(f"\n=== Testing Analysis ===")
+     analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+ 
+     # Load reference lists into analyzer
+     analyzer.load_reference_lists(reference_lists)
+ 
+     # Test text
+     test_text = "The cat sat on the mat. The dog ran quickly."
+ 
+     # Analyze
+     results = analyzer.analyze_text(
+         test_text,
+         list(reference_lists.keys()),
+         apply_log=False
+     )
+ 
+     print(f"\nAnalysis summary keys: {list(results['summary'].keys())}")
+     print(f"Bigram details count: {len(results.get('bigram_details', []))}")
+     print(f"Trigram details count: {len(results.get('trigram_details', []))}")
+ 
+     # Check for bigram/trigram entries in summary
+     bigram_summary_keys = [k for k in results['summary'].keys() if 'bigram' in k]
+     trigram_summary_keys = [k for k in results['summary'].keys() if 'trigram' in k]
+ 
+     print(f"\nBigram summary keys: {bigram_summary_keys}")
+     print(f"Trigram summary keys: {trigram_summary_keys}")
+ 
+     if not bigram_summary_keys and not trigram_summary_keys:
+         print("\n⚠️ WARNING: No bigram or trigram results in summary!")
+         print("This suggests the issue is in the analysis process, not the display function.")
+     else:
+         print("\n✓ Bigram and trigram results found in summary.")
+         print("The issue might be in how the web app loads reference lists.")
+ 
+ if __name__ == "__main__":
+     test_reference_loading()
text_analyzer/corpus_visualizer.py CHANGED
@@ -15,6 +15,7 @@ import logging
  import re
  from io import StringIO
  import natsort
+ import csv
 
  logger = logging.getLogger(__name__)
 
@@ -102,7 +103,8 @@ class CorpusVisualizer:
          separator = format_info['separator']
 
          # Load into DataFrame
-         df = pd.read_csv(StringIO(content), sep=separator)
+         df = pd.read_csv(StringIO(content), sep=separator,
+                          quoting=csv.QUOTE_MINIMAL, quotechar='"')
 
          # Store the dataframe
          if file_type == 'metadata':
text_analyzer/frequency_analyzer.py CHANGED
@@ -12,6 +12,7 @@ from typing import Dict, List, Tuple, Optional, Union
  import logging
  import random
  from io import StringIO
+ import csv
 
  logger = logging.getLogger(__name__)
 
@@ -208,7 +209,8 @@ class FrequencyAnalyzer:
          has_header = column_config.get('has_header', True)
 
          # Read data
-         df = pd.read_csv(StringIO(content), sep=separator, header=0 if has_header else None)
+         df = pd.read_csv(StringIO(content), sep=separator, header=0 if has_header else None,
+                          quoting=csv.QUOTE_MINIMAL, quotechar='"')
 
          # Store column configuration
          self.column_config = column_config.copy()
text_analyzer/lexical_sophistication.py CHANGED
@@ -82,7 +82,8 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
82
  delimiter = ',' if sample.count(',') > sample.count('\t') else '\t'
83
 
84
  # Load the file
85
- df = pd.read_csv(file_path, delimiter=delimiter, header=0)
 
86
 
87
  if file_type in ['token', 'lemma']:
88
  # Check if this is a custom frequency list format with specific columns
@@ -183,10 +184,12 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
183
  if 'content' in config:
184
  # Use content directly
185
  content_io = StringIO(config['content'])
186
- df = pd.read_csv(content_io, delimiter=delimiter, header=0)
 
187
  elif 'file_path' in config:
188
  # Fallback to file path for backward compatibility
189
- df = pd.read_csv(config['file_path'], delimiter=delimiter, header=0)
 
190
  else:
191
  logger.error("No content or file_path found in config")
192
  return {}
@@ -602,6 +605,18 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
602
 
603
  # Look up scores for each selected index
604
  for index_name in selected_indices:
 
 
 
 
 
 
 
 
 
 
 
 
605
  # Check if this is a Japanese corpus reference list
606
  ref_data = self.reference_lists.get(index_name, {})
607
  is_japanese_corpus = False
@@ -613,69 +628,87 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
613
 
614
  if is_japanese_corpus and self.language == 'ja':
615
  # Use enhanced UniDic lookup with 3-level fallback and diagnostics
616
- token_result = self._lookup_with_unidic_fallback(token, index_name, 'token')
617
- lemma_result = self._lookup_with_unidic_fallback(token, index_name, 'lemma')
618
-
619
- # Extract scores and diagnostic information
620
- token_score = token_result['score']
621
- lemma_score = lemma_result['score']
622
-
623
- # Store enhanced details with diagnostic information
624
- token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
625
- token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
626
-
627
- # Add diagnostic information for debugging
628
- token_detail[f"{index_name}_token_match_method"] = token_result['match_method']
629
- token_detail[f"{index_name}_lemma_match_method"] = lemma_result['match_method']
630
- token_detail[f"{index_name}_token_match_key"] = token_result['match_key'] or None
631
- token_detail[f"{index_name}_lemma_match_key"] = lemma_result['match_key'] or None
632
-
633
- # Store UniDic features for display
634
- if hasattr(token, '_') and hasattr(token._, 'unidic_lemma'):
635
- token_detail['unidic_features'] = {
636
- 'lemma': getattr(token._, 'unidic_lemma', ''),
637
- 'lForm': getattr(token._, 'unidic_lform', ''),
638
- 'pos1': getattr(token._, 'unidic_pos1', ''),
639
- 'pos2': getattr(token._, 'unidic_pos2', ''),
640
- 'goshu': getattr(token._, 'unidic_goshu', ''),
641
- 'alignment_confidence': getattr(token._, 'alignment_confidence', 0.0)
642
- }
643
 
644
  elif is_japanese_corpus:
645
  # Fallback to legacy Japanese lookup if UniDic not available
646
- token_score = self._lookup_japanese_score(token, index_name, 'token', fallback=True)
647
- lemma_score = self._lookup_japanese_score(token, index_name, 'lemma', fallback=True)
648
-
649
- # Store scores
650
- token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
651
- token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
652
- token_detail[f"{index_name}_token_match_method"] = "legacy_spacy"
653
- token_detail[f"{index_name}_lemma_match_method"] = "legacy_spacy"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654
  else:
655
  # Standard lookup for non-Japanese data
656
- token_score = self._lookup_score(token.text, index_name, 'token')
657
- lemma_score = self._lookup_score(token.lemma_, index_name, 'lemma')
 
 
658
 
659
- # Store scores
660
- token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
661
- token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
662
-
663
- # Collect for summary statistics with selective log transformation
664
- if token_score is not None:
665
- # Check if this specific measure should be log-transformed
666
- should_log_transform = self._should_apply_log_transform(
667
- index_name, 'token', 'frequency', log_transforms, apply_log
668
- )
669
- score_val = np.log10(token_score) if should_log_transform and token_score > 0 else token_score
670
- all_scores[f"{index_name}_token_{word_type}"].append(score_val)
671
 
672
- if lemma_score is not None:
673
- # Check if this specific measure should be log-transformed
674
- should_log_transform = self._should_apply_log_transform(
675
- index_name, 'lemma', 'frequency', log_transforms, apply_log
676
- )
677
- score_val = np.log10(lemma_score) if should_log_transform and lemma_score > 0 else lemma_score
678
- all_scores[f"{index_name}_lemma_{word_type}"].append(score_val)
679
 
680
  results['token_details'].append(token_detail)
681
 
@@ -722,25 +755,69 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
722
  if ref_data is None or not isinstance(ref_data, pd.DataFrame):
723
  continue
724
 
725
- # Get available measures
726
- available_measures = ref_data.columns[1:].tolist()
 
 
 
727
 
728
- # Filter measures based on selection
729
- for measure in available_measures:
730
- # Check if this measure should be computed
731
- if not self._should_compute_measure(index_name, measure, selected_measures):
732
- continue
733
-
734
- score = self._lookup_score(ngram, index_name, ngram_type, measure)
735
- if score is not None:
736
- # Check if this measure should be log-transformed
737
- should_log_transform = self._should_apply_log_transform(
738
- index_name, ngram_type, measure, log_transforms, apply_log
739
- )
740
- score_val = np.log10(score) if should_log_transform and score > 0 else score
741
- ngram_detail[f"{index_name}_{measure}"] = score_val
742
- else:
743
- ngram_detail[f"{index_name}_{measure}"] = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
744
 
745
  results[ngram_details_key].append(ngram_detail)
746
 
@@ -753,37 +830,93 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
753
  if ref_data is None or not isinstance(ref_data, pd.DataFrame):
754
  continue
755
 
756
- # Get available measures (all columns except the first one)
757
- available_measures = ref_data.columns[1:].tolist()
 
 
 
758
 
759
- # Filter measures based on selection and compute summary statistics
760
- for measure in available_measures:
761
- # Check if this measure should be computed
762
- if not self._should_compute_measure(index_name, measure, selected_measures):
763
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
764
 
765
- ngram_scores = []
766
- for ngram in ngrams:
767
- score = self._lookup_score(ngram, index_name, ngram_type, measure)
768
- if score is not None:
769
- # Check if this measure should be log-transformed
770
- should_log_transform = self._should_apply_log_transform(
771
- index_name, ngram_type, measure, log_transforms, apply_log
772
- )
773
- score_val = np.log10(score) if should_log_transform and score > 0 else score
774
- ngram_scores.append(score_val)
 
 
 
 
775
 
776
- if ngram_scores:
777
- key = f"{index_name}_{ngram_type}_{measure}"
778
- results['summary'][key] = {
779
- 'mean': np.mean(ngram_scores),
780
- 'std': np.std(ngram_scores),
781
- 'count': len(ngram_scores),
782
- 'min': np.min(ngram_scores),
783
- 'max': np.max(ngram_scores)
784
- }
785
- # Store raw scores for plotting
786
- results['raw_scores'][key] = ngram_scores
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
787
 
788
  return results
789
 
 
82
  delimiter = ',' if sample.count(',') > sample.count('\t') else '\t'
83
 
84
  # Load the file
85
+ df = pd.read_csv(file_path, delimiter=delimiter, header=0,
86
+ quoting=csv.QUOTE_MINIMAL, quotechar='"')
87
 
88
  if file_type in ['token', 'lemma']:
89
  # Check if this is a custom frequency list format with specific columns
 
184
  if 'content' in config:
185
  # Use content directly
186
  content_io = StringIO(config['content'])
187
+ df = pd.read_csv(content_io, delimiter=delimiter, header=0,
188
+ quoting=csv.QUOTE_MINIMAL, quotechar='"')
189
  elif 'file_path' in config:
190
  # Fallback to file path for backward compatibility
191
+ df = pd.read_csv(config['file_path'], delimiter=delimiter, header=0,
192
+ quoting=csv.QUOTE_MINIMAL, quotechar='"')
193
  else:
194
  logger.error("No content or file_path found in config")
195
  return {}
 
605
 
606
  # Look up scores for each selected index
607
  for index_name in selected_indices:
608
+ # Extract base name and determine analysis type to avoid duplicate suffixes
609
+ if index_name.endswith('_token'):
610
+ base_name = index_name[:-6] # Remove '_token'
611
+ analysis_type = 'token'
612
+ elif index_name.endswith('_lemma'):
613
+ base_name = index_name[:-6] # Remove '_lemma'
614
+ analysis_type = 'lemma'
615
+ else:
616
+ # Fallback for entries without clear suffix
617
+ base_name = index_name
618
+ analysis_type = 'token' # Default to token
619
+
620
  # Check if this is a Japanese corpus reference list
621
  ref_data = self.reference_lists.get(index_name, {})
622
  is_japanese_corpus = False
 
628
 
629
  if is_japanese_corpus and self.language == 'ja':
630
  # Use enhanced UniDic lookup with 3-level fallback and diagnostics
631
+ if analysis_type == 'token':
632
+ result = self._lookup_with_unidic_fallback(token, index_name, 'token')
633
+ score = result['score']
634
+
635
+ # Store enhanced details with clean column name
636
+ token_detail[index_name] = score if score is not None else None
637
+ token_detail[f"{index_name}_match_method"] = result['match_method']
638
+ token_detail[f"{index_name}_match_key"] = result['match_key'] or None
639
+
640
+ # Store UniDic features for display (only once per token)
641
+ if hasattr(token, '_') and hasattr(token._, 'unidic_lemma') and 'unidic_features' not in token_detail:
642
+ token_detail['unidic_features'] = {
643
+ 'lemma': getattr(token._, 'unidic_lemma', ''),
644
+ 'lForm': getattr(token._, 'unidic_lform', ''),
645
+ 'pos1': getattr(token._, 'unidic_pos1', ''),
646
+ 'pos2': getattr(token._, 'unidic_pos2', ''),
647
+ 'goshu': getattr(token._, 'unidic_goshu', ''),
648
+ 'alignment_confidence': getattr(token._, 'alignment_confidence', 0.0)
649
+ }
650
+ else: # lemma analysis
651
+ result = self._lookup_with_unidic_fallback(token, index_name, 'lemma')
652
+ score = result['score']
653
+
654
+ # Store enhanced details with clean column name
655
+ token_detail[index_name] = score if score is not None else None
656
+ token_detail[f"{index_name}_match_method"] = result['match_method']
657
+ token_detail[f"{index_name}_match_key"] = result['match_key'] or None
658
 
659
  elif is_japanese_corpus:
660
  # Fallback to legacy Japanese lookup if UniDic not available
661
+ if analysis_type == 'token':
662
+ score = self._lookup_japanese_score(token, index_name, 'token', fallback=True)
663
+
664
+ # Apply log transformation if needed before storing
665
+ if score is not None:
666
+ should_log_transform = self._should_apply_log_transform(
667
+ index_name, analysis_type, 'frequency', log_transforms, apply_log
668
+ )
669
+ final_score = np.log10(score) if should_log_transform and score > 0 else score
670
+ else:
671
+ final_score = None
672
+
673
+ token_detail[index_name] = final_score
674
+ token_detail[f"{index_name}_match_method"] = "legacy_spacy"
675
+ else: # lemma analysis
676
+ score = self._lookup_japanese_score(token, index_name, 'lemma', fallback=True)
677
+
678
+ # Apply log transformation if needed before storing
679
+ if score is not None:
680
+ should_log_transform = self._should_apply_log_transform(
681
+ index_name, analysis_type, 'frequency', log_transforms, apply_log
682
+ )
683
+ final_score = np.log10(score) if should_log_transform and score > 0 else score
684
+ else:
685
+ final_score = None
686
+
687
+ token_detail[index_name] = final_score
688
+ token_detail[f"{index_name}_match_method"] = "legacy_spacy"
689
  else:
690
  # Standard lookup for non-Japanese data
691
+ if analysis_type == 'token':
692
+ score = self._lookup_score(token.text, index_name, 'token')
693
+ else: # lemma analysis
694
+ score = self._lookup_score(token.lemma_, index_name, 'lemma')
695
 
696
+ # Apply log transformation if needed before storing
697
+ if score is not None:
698
+ should_log_transform = self._should_apply_log_transform(
699
+ index_name, analysis_type, 'frequency', log_transforms, apply_log
700
+ )
701
+ final_score = np.log10(score) if should_log_transform and score > 0 else score
702
+ else:
703
+ final_score = None
704
+
705
+ # Store score with clean column name and transformed value
706
+ token_detail[index_name] = final_score
 
707
 
708
+ # Collect for summary statistics (score is already transformed if needed)
709
+ score = token_detail.get(index_name)
710
+ if score is not None:
711
+ all_scores[f"{index_name}_{word_type}"].append(score)
 
 
 
712
 
713
  results['token_details'].append(token_detail)
714
 
 
     if ref_data is None or not isinstance(ref_data, pd.DataFrame):
         continue
 
+    # Get columns config for proper measure naming from YAML config
+    # We need to access the original YAML configuration to get proper measure names
+    from web_app.config_manager import ConfigManager
+    config = ConfigManager.load_reference_config()
+    language_key = "english" if self.language == 'en' else "japanese"
 
+    # Find the config entry for this index
+    config_entry = None
+    for config_section in [f"{ngram_type}s"]:  # bigrams/trigrams sections
+        if config_section in config.get(language_key, {}):
+            if index_name in config[language_key][config_section]:
+                config_entry = config[language_key][config_section][index_name]
+                break
+
+    if config_entry and 'columns' in config_entry:
+        # Get columns config for this n-gram type
+        columns_config = config_entry.get('columns', {})
+
+        # Create mapping from column index to measure name
+        measure_mapping = {}
+        for measure_name, col_idx in columns_config.items():
+            if isinstance(col_idx, int) and col_idx < len(ref_data.columns):
+                measure_mapping[ref_data.columns[col_idx]] = measure_name
+
+        # Use the measure mapping to get proper names
+        for col_name, measure_name in measure_mapping.items():
+            if col_name == ref_data.columns[0]:  # Skip the n-gram text column
+                continue
+
+            # Check if this measure should be computed
+            if not self._should_compute_measure(index_name, measure_name, selected_measures):
+                continue
+
+            score = self._lookup_score(ngram, index_name, ngram_type, col_name)
+            if score is not None:
+                # Check if this measure should be log-transformed
+                should_log_transform = self._should_apply_log_transform(
+                    index_name, ngram_type, measure_name, log_transforms, apply_log
+                )
+                score_val = np.log10(score) if should_log_transform and score > 0 else score
+                ngram_detail[f"{index_name}_{measure_name}"] = score_val
+            else:
+                ngram_detail[f"{index_name}_{measure_name}"] = None
+    else:
+        # Fallback to old logic
+        available_measures = ref_data.columns[1:].tolist()
+
+        # Filter measures based on selection
+        for measure in available_measures:
+            # Check if this measure should be computed
+            if not self._should_compute_measure(index_name, measure, selected_measures):
+                continue
+
+            score = self._lookup_score(ngram, index_name, ngram_type, measure)
+            if score is not None:
+                # Check if this measure should be log-transformed
+                should_log_transform = self._should_apply_log_transform(
+                    index_name, ngram_type, measure, log_transforms, apply_log
+                )
+                score_val = np.log10(score) if should_log_transform and score > 0 else score
+                ngram_detail[f"{index_name}_{measure}"] = score_val
+            else:
+                ngram_detail[f"{index_name}_{measure}"] = None
 
     results[ngram_details_key].append(ngram_detail)
 
     if ref_data is None or not isinstance(ref_data, pd.DataFrame):
         continue
 
+    # Get columns config for proper measure naming from YAML config
+    # We need to access the original YAML configuration to get proper measure names
+    from web_app.config_manager import ConfigManager
+    config = ConfigManager.load_reference_config()
+    language_key = "english" if self.language == 'en' else "japanese"
 
+    # Find the config entry for this index
+    config_entry = None
+    for config_section in [f"{ngram_type}s"]:  # bigrams/trigrams sections
+        if config_section in config.get(language_key, {}):
+            if index_name in config[language_key][config_section]:
+                config_entry = config[language_key][config_section][index_name]
+                break
+
+    if config_entry and 'columns' in config_entry:
+        # Get columns config for this n-gram type
+        columns_config = config_entry.get('columns', {})
+
+        # Create mapping from column index to measure name
+        measure_mapping = {}
+        for measure_name, col_idx in columns_config.items():
+            if isinstance(col_idx, int) and col_idx < len(ref_data.columns):
+                measure_mapping[ref_data.columns[col_idx]] = measure_name
+
+        # Use the measure mapping to get proper names
+        for col_name, measure_name in measure_mapping.items():
+            if col_name == ref_data.columns[0]:  # Skip the n-gram text column
+                continue
+
+            # Check if this measure should be computed
+            if not self._should_compute_measure(index_name, measure_name, selected_measures):
+                continue
+
+            ngram_scores = []
+            for ngram in ngrams:
+                score = self._lookup_score(ngram, index_name, ngram_type, col_name)
+                if score is not None:
+                    # Check if this measure should be log-transformed
+                    should_log_transform = self._should_apply_log_transform(
+                        index_name, ngram_type, measure_name, log_transforms, apply_log
+                    )
+                    score_val = np.log10(score) if should_log_transform and score > 0 else score
+                    ngram_scores.append(score_val)
 
+            if ngram_scores:
+                key = f"{index_name}_{ngram_type}_{measure_name}"
+                results['summary'][key] = {
+                    'mean': np.mean(ngram_scores),
+                    'std': np.std(ngram_scores),
+                    'count': len(ngram_scores),
+                    'min': np.min(ngram_scores),
+                    'max': np.max(ngram_scores)
+                }
+                # Store raw scores for plotting
+                results['raw_scores'][key] = ngram_scores
+    else:
+        # Fallback to old logic if config not properly structured
+        available_measures = ref_data.columns[1:].tolist()
 
+        # Filter measures based on selection and compute summary statistics
+        for measure in available_measures:
+            # Check if this measure should be computed
+            if not self._should_compute_measure(index_name, measure, selected_measures):
+                continue
+
+            ngram_scores = []
+            for ngram in ngrams:
+                score = self._lookup_score(ngram, index_name, ngram_type, measure)
+                if score is not None:
+                    # Check if this measure should be log-transformed
+                    should_log_transform = self._should_apply_log_transform(
+                        index_name, ngram_type, measure, log_transforms, apply_log
+                    )
+                    score_val = np.log10(score) if should_log_transform and score > 0 else score
+                    ngram_scores.append(score_val)
+
+            if ngram_scores:
+                key = f"{index_name}_{ngram_type}_{measure}"
+                results['summary'][key] = {
+                    'mean': np.mean(ngram_scores),
+                    'std': np.std(ngram_scores),
+                    'count': len(ngram_scores),
+                    'min': np.min(ngram_scores),
+                    'max': np.max(ngram_scores)
+                }
+                # Store raw scores for plotting
+                results['raw_scores'][key] = ngram_scores
 
     return results
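Note: the mapping step above inverts a YAML `columns` entry of the form {measure_name: column_index} into {dataframe_column_name: measure_name}, so reference columns can be reported under their configured measure names. A self-contained sketch with hypothetical column names:

    import pandas as pd

    # Hypothetical reference data and YAML-style columns config
    ref_data = pd.DataFrame(columns=['bigram', 'freq', 'norm_freq', 'docs'])
    columns_config = {'frequency': 1, 'normalized_freq': 2, 'documents': 3}

    measure_mapping = {
        ref_data.columns[col_idx]: measure_name
        for measure_name, col_idx in columns_config.items()
        if isinstance(col_idx, int) and col_idx < len(ref_data.columns)
    }
    # {'freq': 'frequency', 'norm_freq': 'normalized_freq', 'docs': 'documents'}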
web_app/app.py CHANGED
@@ -91,7 +91,7 @@ def render_sidebar():
 
 def render_lexical_sophistication_interface():
     """Render lexical sophistication analysis interface."""
-    st.header("🔍 Lexical Sophistication Analysis")
+    st.header("🔍 Emulation of the Tool for Automatic Analysis of Lexical Sophistication (emuTAALES)")
 
     # Get analyzer
     analyzer = AnalysisHandlers.get_analyzer()
web_app/components/ui_components.py CHANGED
@@ -298,7 +298,7 @@ class UIComponents:
 
     @staticmethod
     def render_enhanced_reference_selection(config: Dict[str, Any], reference_lists: Dict[str, Any]) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
-        """Render the enhanced reference list selection interface with hierarchical display."""
+        """Render the advanced reference list selection interface with hierarchical grouping and individual measure control."""
         from web_app.defaults_manager import DefaultsManager
 
         # Initialize return values
@@ -309,34 +309,160 @@
             st.info("No reference lists selected. Please configure reference lists first.")
             return selected_measures, log_transforms
 
-        # Simple hierarchical display showing selected lists with smart defaults info
-        for list_name in reference_lists.keys():
-            # Show smart defaults indicator
-            entry_config = UIComponents._find_entry_config(list_name, config)
-            if entry_config and entry_config.get('default_measures'):
-                defaults_info = f"📊 {len(entry_config['default_measures'])} measures selected"
-                log_info = f"🔄 {len(entry_config.get('default_log_transforms', []))} log-transformed"
-
-                # Determine analysis type badges
-                analysis_badges = []
-                if entry_config.get('analysis_type') == 'token' or not entry_config.get('analysis_type'):
-                    analysis_badges.append("[Token ✓]")
-                if entry_config.get('analysis_type') == 'lemma' or not entry_config.get('analysis_type'):
-                    analysis_badges.append("[Lemma ✓]")
-
-                analysis_info = " ".join(analysis_badges) if analysis_badges else ""
-
-                st.write(f"├─ **{list_name}** {analysis_info} [ℹ️ Smart defaults]")
-                st.write(f"   {defaults_info}, {log_info}")
-
-                # Apply smart defaults to return values
-                selected_measures[list_name] = entry_config.get('default_measures', [])
-                log_transforms[list_name] = entry_config.get('default_log_transforms', [])
-            else:
-                st.write(f"├─ **{list_name}** [Legacy configuration]")
+        # Group reference lists by base name for hierarchical display
+        groups = UIComponents._group_reference_lists(reference_lists, config)
+
+        st.write("**Reference Lists & Measures:**")
+
+        # Render each group with hierarchical interface
+        for base_name, group_data in groups.items():
+            # Group-level enable/disable checkbox
+            group_key = f"group_enabled_{base_name}"
+            group_enabled = st.checkbox(
+                f"☑️ **{base_name}**",
+                value=True,  # Default enabled
+                key=group_key,
+                help=f"Enable/disable all {base_name} analyses"
+            )
+
+            if group_enabled:
+                # Analysis type badges display
+                badges = []
+                if group_data['token']:
+                    badges.append("[Token ✓]")
+                if group_data['lemma']:
+                    badges.append("[Lemma ✓]")
+
+                if badges:
+                    st.write(f"   {' '.join(badges)}")
+
+                # Expandable measure selection for each analysis type
+                if group_data['token']:
+                    with st.expander("📊 Token Measures ⬇️ (click to customize)", expanded=False):
+                        token_measures, token_logs = UIComponents._render_measure_selection(
+                            group_data['token'][0], 'token', base_name
+                        )
+                        # Always store the results, even if empty (to maintain structure)
+                        selected_measures[group_data['token'][0][0]] = token_measures
+                        log_transforms[group_data['token'][0][0]] = token_logs
 
+                if group_data['lemma']:
+                    with st.expander("📊 Lemma Measures ⬇️ (click to customize)", expanded=False):
+                        lemma_measures, lemma_logs = UIComponents._render_measure_selection(
+                            group_data['lemma'][0], 'lemma', base_name
+                        )
+                        # Always store the results, even if empty (to maintain structure)
+                        selected_measures[group_data['lemma'][0][0]] = lemma_measures
+                        log_transforms[group_data['lemma'][0][0]] = lemma_logs
 
+                # Show smart defaults summary
+                token_entry_name = group_data['token'][0][0] if group_data['token'] else None
+                lemma_entry_name = group_data['lemma'][0][0] if group_data['lemma'] else None
 
+                total_measures = 0
+                total_logs = 0
 
+                if token_entry_name:
+                    total_measures += len(selected_measures.get(token_entry_name, []))
+                    total_logs += len(log_transforms.get(token_entry_name, []))
+
+                if lemma_entry_name:
+                    total_measures += len(selected_measures.get(lemma_entry_name, []))
+                    total_logs += len(log_transforms.get(lemma_entry_name, []))
+
+                st.write(f"   📊 {total_measures} measures selected, 🔄 {total_logs} log-transformed")
+                st.write("")  # Add spacing
 
         return selected_measures, log_transforms
 
+    @staticmethod
+    def _group_reference_lists(reference_lists: Dict[str, Any], config: Dict[str, Any]) -> Dict[str, Dict[str, List]]:
+        """Group related reference lists for hierarchical display."""
+        from collections import defaultdict
+
+        groups = defaultdict(lambda: {'token': [], 'lemma': []})
+
+        for entry_name in reference_lists.keys():
+            # Extract base name (remove _token/_lemma suffix)
+            base_name = entry_name.replace('_token', '').replace('_lemma', '')
+
+            # Get analysis type from config
+            entry_config = UIComponents._find_entry_config(entry_name, config)
+            if entry_config:
+                analysis_type = entry_config.get('analysis_type', 'token')
+                groups[base_name][analysis_type].append((entry_name, entry_config))
+
+        return groups
+
+    @staticmethod
+    def _render_measure_selection(entry_data: Tuple[str, Dict], analysis_type: str, base_name: str) -> Tuple[List[str], List[str]]:
+        """Render individual measure checkboxes with log transform controls."""
+        entry_name, entry_config = entry_data
+
+        # Get measure information from config
+        selectable_measures = entry_config.get('selectable_measures', [])
+        log_transformable = entry_config.get('log_transformable', [])
+        default_measures = entry_config.get('default_measures', [])
+        default_log_transforms = entry_config.get('default_log_transforms', [])
+
+        # Initialize session state for this entry if not exists
+        if f'custom_measures_{entry_name}' not in st.session_state:
+            st.session_state[f'custom_measures_{entry_name}'] = default_measures.copy()
+        if f'custom_logs_{entry_name}' not in st.session_state:
+            st.session_state[f'custom_logs_{entry_name}'] = default_log_transforms.copy()
+
+        # Display measure selection interface
+        st.write(f"**Available Measures for {entry_config.get('display_name', entry_name)}:**")
+
+        selected_measures = []
+        selected_logs = []
+
+        for measure in selectable_measures:
+            col1, col2 = st.columns([3, 1])
+
+            with col1:
+                # Measure checkbox (pre-selected based on defaults)
+                measure_key = f"measure_{entry_name}_{measure}"
+                selected = st.checkbox(
+                    f"☑️ {measure.replace('_', ' ').title()}",
+                    value=measure in st.session_state[f'custom_measures_{entry_name}'],
+                    key=measure_key,
+                    help=f"Include {measure} in analysis"
+                )
+
+                if selected:
+                    selected_measures.append(measure)
+
+            with col2:
+                # Log transform toggle (disabled if not transformable)
+                if measure in log_transformable and selected:
+                    log_key = f"log_{entry_name}_{measure}"
+                    log_enabled = st.checkbox(
+                        "🔄 log₁₀",
+                        value=measure in st.session_state[f'custom_logs_{entry_name}'],
+                        key=log_key,
+                        help=f"Apply log₁₀ transformation to {measure}"
+                    )
+
+                    if log_enabled:
+                        selected_logs.append(measure)
+                elif measure in log_transformable:
+                    st.write("🔄 (disabled)")
+                else:
+                    st.write("❌ (not transformable)")
+
+        # Update session state
+        st.session_state[f'custom_measures_{entry_name}'] = selected_measures
+        st.session_state[f'custom_logs_{entry_name}'] = selected_logs
+
+        # Show selection summary
+        if selected_measures:
+            st.success(f"✅ {len(selected_measures)} measures selected, {len(selected_logs)} log-transformed")
+        else:
+            st.warning("⚠️ No measures selected for this analysis type")
+
+        return selected_measures, selected_logs
+
     @staticmethod
     def group_has_smart_defaults(group_entries: List[str], config: Dict[str, Any]) -> bool:
         """Check if a group has smart defaults configured."""
web_app/config_manager.py CHANGED
@@ -8,6 +8,7 @@ import pandas as pd
 from pathlib import Path
 from typing import Dict, List, Any, Optional, Tuple
 import yaml
+import csv
 
 from web_app.session_manager import SessionManager
 from web_app.utils import MemoryFileHandler
@@ -70,7 +71,8 @@ class ConfigManager:
         content_io = StringIO(text_content)
 
         # Load preview
-        df_preview = pd.read_csv(content_io, delimiter=delimiter, header=0, nrows=5)
+        df_preview = pd.read_csv(content_io, delimiter=delimiter, header=0, nrows=5,
+                                 quoting=csv.QUOTE_MINIMAL, quotechar='"')
 
         # Store content in session state instead of file path
         if 'uploaded_files_content' not in st.session_state:
@@ -209,9 +211,11 @@ class ConfigManager:
 
             # Load file
             if list_config.get('has_header', False):
-                df = pd.read_csv(file_path, delimiter=delimiter, header=0)
+                df = pd.read_csv(file_path, delimiter=delimiter, header=0,
+                                 quoting=csv.QUOTE_MINIMAL, quotechar='"')
             else:
-                df = pd.read_csv(file_path, delimiter=delimiter, header=None)
+                df = pd.read_csv(file_path, delimiter=delimiter, header=None,
+                                 quoting=csv.QUOTE_MINIMAL, quotechar='"')
 
             # Get column mapping
             columns = list_config.get('columns', {})
@@ -247,6 +251,7 @@ class ConfigManager:
             elif is_trigram:
                 data['trigram'] = df
             else:
+                # For standard unigram files that aren't bigrams or trigrams
                 data[file_type] = df
 
         except Exception as e:
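Note: the recurring `quoting=csv.QUOTE_MINIMAL, quotechar='"'` addition makes the quoting behavior of `pd.read_csv` explicit, which matters for n-gram reference files whose entries can contain the delimiter itself. A self-contained illustration (the data is made up):

    import csv
    from io import StringIO
    import pandas as pd

    # A reference row whose n-gram field contains the delimiter
    content = 'ngram,frequency\n"well, you know",42\n'
    df = pd.read_csv(StringIO(content), delimiter=',', header=0,
                     quoting=csv.QUOTE_MINIMAL, quotechar='"')
    print(df.iloc[0]['ngram'])  # well, you know  (parsed as one field, not two)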
web_app/handlers/analysis_handlers.py CHANGED
@@ -96,28 +96,47 @@ class AnalysisHandlers:
         analyzer.load_reference_lists(reference_lists)
 
         # Get analysis configuration
-        if use_smart_defaults:
-            # Use smart defaults from configuration
-            from web_app.defaults_manager import DefaultsManager
-            from web_app.config_manager import ConfigManager
-
-            config = ConfigManager.load_reference_config()
-            selected_measures, log_transforms = DefaultsManager.get_default_analysis_config(
-                list(reference_lists.keys()), config
-            )
-
-            # Perform enhanced analysis with smart defaults
-            results = analyzer.analyze_text(
-                text_content,
-                list(reference_lists.keys()),
-                apply_log=False,  # Superseded by log_transforms
-                word_type_filter=word_type_filter,
-                log_transforms=log_transforms,
-                selected_measures=selected_measures
-            )
-
-            st.success("✨ Analysis completed using Smart Defaults!")
-            st.info(f"📊 Applied selective log transforms to {sum(len(measures) for measures in log_transforms.values())} measures")
+        if use_smart_defaults and not legacy_log_transform:
+            # Use custom selections from the enhanced UI
+            if selected_measures and any(selected_measures.values()):
+                # User has made custom selections
+                results = analyzer.analyze_text(
+                    text_content,
+                    list(reference_lists.keys()),
+                    apply_log=False,  # Superseded by log_transforms
+                    word_type_filter=word_type_filter,
+                    log_transforms=log_transforms,
+                    selected_measures=selected_measures
+                )
+
+                # Calculate totals for user feedback
+                total_measures = sum(len(measures) for measures in selected_measures.values())
+                total_logs = sum(len(logs) for logs in log_transforms.values())
+
+                st.success("✨ Analysis completed using your custom selections!")
+                st.info(f"📊 Analyzed {total_measures} measures, {total_logs} log-transformed")
+            else:
+                # Fallback to smart defaults if no custom selections
+                from web_app.defaults_manager import DefaultsManager
+                from web_app.config_manager import ConfigManager
+
+                config = ConfigManager.load_reference_config()
+                default_measures, default_logs = DefaultsManager.get_default_analysis_config(
+                    list(reference_lists.keys()), config
+                )
+
+                results = analyzer.analyze_text(
+                    text_content,
+                    list(reference_lists.keys()),
+                    apply_log=False,
+                    word_type_filter=word_type_filter,
+                    log_transforms=default_logs,
+                    selected_measures=default_measures
+                )
+
+                total_logs = sum(len(logs) for logs in default_logs.values())
+                st.success("✨ Analysis completed using Smart Defaults!")
+                st.info(f"📊 Applied selective log transforms to {total_logs} measures")
 
         else:
             # Legacy mode - use global log transformation
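Note: the branch above prefers explicit user selections and only falls back to configured defaults when every selection list is empty; the `any(selected_measures.values())` test is what distinguishes the two paths. A reduced sketch with illustrative values:

    def pick_mode(selected_measures):
        if selected_measures and any(selected_measures.values()):
            return 'custom selections'
        return 'smart defaults'

    pick_mode({'COCA_spoken_bigram_frequency_token': ['frequency']})  # 'custom selections'
    pick_mode({'COCA_spoken_bigram_frequency_token': []})             # 'smart defaults'
    pick_mode({})                                                     # 'smart defaults'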
@@ -402,22 +421,183 @@
 
     @staticmethod
     def create_density_plots(results: Dict[str, Any]):
-        """Create density plots for score distributions."""
+        """Create density plots for score distributions with mean line and example words."""
         if 'raw_scores' not in results:
             return
 
         for key, scores in results['raw_scores'].items():
             if len(scores) > 1:  # Need at least 2 points for density
-                # Create histogram with density curve
-                fig = go.Figure()
-
-                # Add histogram
-                fig.add_trace(go.Histogram(
-                    x=scores,
-                    nbinsx=min(30, len(scores)),
-                    name='Histogram',
-                    opacity=0.7,
-                    histnorm='probability density'
-                ))
+                # Create word-to-score mapping for this measure
+                word_score_map = {}
+
+                # Determine if this is a bigram, trigram, or token-based measure
+                if '_bigram_' in key:
+                    # Handle bigram measures
+                    if 'bigram_details' in results and results['bigram_details']:
+                        # Extract the correct column name from the key
+                        # Raw scores key: 'COCA_spoken_bigram_frequency_token_bigram_frequency'
+                        # Actual column:  'COCA_spoken_bigram_frequency_token_frequency'
+                        # Remove the last occurrence of '_bigram' from the key
+                        idx = key.rfind('_bigram')
+                        if idx != -1:
+                            index_measure_col = key[:idx] + key[idx+7:]  # 7 = len('_bigram')
+                        else:
+                            index_measure_col = key
+
+                        # Build mapping from bigram details
+                        for bigram_detail in results['bigram_details']:
+                            if index_measure_col in bigram_detail and bigram_detail[index_measure_col] is not None:
+                                bigram_text = bigram_detail.get('bigram', '')
+                                word_score_map[bigram_text] = bigram_detail[index_measure_col]
+
+                elif '_trigram_' in key:
+                    # Handle trigram measures
+                    if 'trigram_details' in results and results['trigram_details']:
+                        # Extract the correct column name from the key
+                        # Raw scores key: 'COCA_trigram_frequency_token_trigram_frequency'
+                        # Actual column:  'COCA_trigram_frequency_token_frequency'
+                        # Remove the last occurrence of '_trigram' from the key
+                        idx = key.rfind('_trigram')
+                        if idx != -1:
+                            index_measure_col = key[:idx] + key[idx+8:]  # 8 = len('_trigram')
+                        else:
+                            index_measure_col = key
+
+                        # Build mapping from trigram details
+                        for trigram_detail in results['trigram_details']:
+                            if index_measure_col in trigram_detail and trigram_detail[index_measure_col] is not None:
+                                trigram_text = trigram_detail.get('trigram', '')
+                                word_score_map[trigram_text] = trigram_detail[index_measure_col]
+
+                else:
+                    # Handle token-based measures (existing logic)
+                    if 'token_details' in results:
+                        # Handle key mismatch between raw_scores and token_details:
+                        # raw_scores keys may have suffixes like '_CW', '_FW', etc.
+                        # while token_details uses the base column names
+
+                        # Try to find matching column in token_details
+                        matching_column = None
+
+                        # First, try exact match
+                        if any(key in token for token in results['token_details']):
+                            matching_column = key
+                        else:
+                            # Try removing word type suffixes (_CW, _FW)
+                            base_key = key
+                            for suffix in ['_CW', '_FW']:
+                                if key.endswith(suffix):
+                                    base_key = key[:-len(suffix)]
+                                    break
+
+                            # Check if base key exists in token_details
+                            if any(base_key in token for token in results['token_details']):
+                                matching_column = base_key
+                            else:
+                                # Try finding partial matches for complex keys
+                                for token in results['token_details']:
+                                    for col_name in token.keys():
+                                        if col_name != 'id' and col_name != 'token' and col_name != 'lemma' and col_name != 'pos' and col_name != 'tag' and col_name != 'word_type':
+                                            # Check if this column name is part of our key
+                                            if col_name in key or key.startswith(col_name):
+                                                matching_column = col_name
+                                                break
+                                    if matching_column:
+                                        break
+
+                        # Build word-to-score mapping using the matching column
+                        if matching_column:
+                            for token in results['token_details']:
+                                if matching_column in token and token[matching_column] is not None:
+                                    word_score_map[token['token']] = token[matching_column]
+
+                # Calculate number of bins
+                nbins = min(30, len(scores))
+
+                # Create figure and add histogram first to let Plotly calculate optimal bins
+                fig = go.Figure()
+
+                # Add histogram to get Plotly's binning
+                histogram_trace = go.Histogram(
+                    x=scores,
+                    nbinsx=nbins,
+                    name='Histogram',
+                    opacity=0.7,
+                    histnorm='probability density'
+                )
+                fig.add_trace(histogram_trace)
+
+                # Extract Plotly's actual bin edges by creating a temporary figure to get the data
+                temp_fig = go.Figure()
+                temp_fig.add_trace(go.Histogram(x=scores, nbinsx=nbins))
+
+                # Calculate histogram using the same parameters as Plotly would use
+                # Plotly calculates bins similar to numpy's auto method
+                # We'll use numpy but ensure we get similar bin edges
+                hist_data, plotly_bin_edges = np.histogram(scores, bins=nbins)
+
+                # For better alignment with Plotly, we can also try to match Plotly's exact binning
+                # by using the range and number of bins
+                score_min, score_max = min(scores), max(scores)
+                # Add small padding as Plotly does
+                score_range = score_max - score_min
+                padding = score_range * 0.02  # Small padding like Plotly
+                adjusted_min = score_min - padding
+                adjusted_max = score_max + padding
+
+                # Create bins with the adjusted range
+                plotly_bin_edges = np.linspace(adjusted_min, adjusted_max, nbins + 1)
+                hist_data, _ = np.histogram(scores, bins=plotly_bin_edges)
+
+                # Assign words to bins using Plotly-aligned bin edges
+                bin_examples = {}
+                if word_score_map:
+                    import random
+                    for word, score in word_score_map.items():
+                        bin_idx = np.digitize(score, plotly_bin_edges) - 1
+                        bin_idx = max(0, min(bin_idx, len(plotly_bin_edges) - 2))  # Clamp to valid range
+
+                        if bin_idx not in bin_examples:
+                            bin_examples[bin_idx] = []
+                        bin_examples[bin_idx].append(word)
+
+                    # Randomly sample up to 3 words per bin
+                    for bin_idx in bin_examples:
+                        if len(bin_examples[bin_idx]) > 3:
+                            bin_examples[bin_idx] = random.sample(bin_examples[bin_idx], 3)
+
+                # Create hover text for each bin using Plotly's bins
+                hover_texts = []
+                for i in range(len(hist_data)):
+                    bin_start = plotly_bin_edges[i]
+                    bin_end = plotly_bin_edges[i + 1]
+                    examples = bin_examples.get(i, [])
+
+                    hover_text = f"Range: {bin_start:.3f} - {bin_end:.3f}<br>"
+                    hover_text += f"Count: {hist_data[i]}<br>"
+                    if examples:
+                        hover_text += f"Examples: {', '.join(examples)}"
+                    else:
+                        hover_text += "Examples: none"
+
+                    hover_texts.append(hover_text)
+
+                # Clear the figure and rebuild with custom hover text
+                fig = go.Figure()
+
+                # Add histogram with custom hover text using the calculated bin edges
+                fig.add_trace(go.Histogram(
+                    x=scores,
+                    xbins=dict(
+                        start=plotly_bin_edges[0],
+                        end=plotly_bin_edges[-1],
+                        size=(plotly_bin_edges[-1] - plotly_bin_edges[0]) / nbins
+                    ),
+                    name='Histogram',
+                    opacity=0.7,
+                    histnorm='probability density',
+                    hovertemplate='%{customdata}<extra></extra>',
+                    customdata=hover_texts
+                ))
 
                 # Calculate and add KDE curve
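Note: the padded-edge binning above tries to mirror Plotly's layout so that hover examples land in the right bars; the key steps are `np.linspace` for evenly spaced edges over a slightly padded range and `np.digitize` (with clamping) to assign each score to a bin. A runnable sketch with toy values:

    import numpy as np

    scores = [1.2, 2.8, 3.1, 4.9]
    nbins = 3

    score_min, score_max = min(scores), max(scores)
    padding = (score_max - score_min) * 0.02
    edges = np.linspace(score_min - padding, score_max + padding, nbins + 1)

    for s in scores:
        bin_idx = np.digitize(s, edges) - 1
        bin_idx = max(0, min(bin_idx, len(edges) - 2))  # clamp to a valid bin
        print(f"{s} -> bin {bin_idx}")
    # 1.2 -> bin 0, 2.8 -> bin 1, 3.1 -> bin 1, 4.9 -> bin 2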
 
@@ -433,6 +613,17 @@
                         line=dict(color='red', width=2)
                     ))
 
+                # Add mean line
+                mean_score = np.mean(scores)
+                fig.add_vline(
+                    x=mean_score,
+                    line_dash="dash",
+                    line_color="green",
+                    line_width=2,
+                    annotation_text=f"Mean: {mean_score:.3f}",
+                    annotation_position="top"
+                )
+
                 # Update layout
                 fig.update_layout(
                     title=f"Distribution of {key}",
@@ -447,79 +638,6 @@
 
     @staticmethod
     def render_enhanced_analysis_options():
-        """Render the enhanced analysis interface with smart defaults and hierarchical display."""
-        from web_app.defaults_manager import DefaultsManager
-        from web_app.config_manager import ConfigManager
-        from web_app.session_manager import SessionManager
-
-        st.subheader("🔧 Analysis Configuration")
-
-        # Get current configuration
-        config = ConfigManager.load_reference_config()
-        reference_lists = SessionManager.get_reference_lists()
-
-        # Enhanced Reference Lists & Measures Section
-        st.write("### 📋 Reference Lists & Measures")
-
-        # Simple hierarchical display for now (basic implementation)
-        if reference_lists:
-            st.write("**Selected Reference Lists:**")
-            for list_name in reference_lists.keys():
-                # Show smart defaults indicator
-                entry_config = UIComponents._find_entry_config(list_name, config)
-                if entry_config and entry_config.get('default_measures'):
-                    defaults_info = f"📊 {len(entry_config['default_measures'])} measures selected"
-                    log_info = f"🔄 {len(entry_config.get('default_log_transforms', []))} log-transformed"
-                    st.write(f"├─ **{list_name}** [Token ✓] [Lemma ✓] [ℹ️ Smart defaults]")
-                    st.write(f"   {defaults_info}, {log_info}")
-                else:
-                    st.write(f"├─ **{list_name}** [Legacy configuration]")
-        else:
-            st.info("No reference lists selected. Please configure reference lists first.")
-
-        # Global Analysis Options
-        st.write("### 🎯 Analysis Types")
-        col1, col2 = st.columns(2)
-
-        with col1:
-            token_analysis = st.checkbox("☑️ Token-based", value=True, key="token_analysis_enabled")
-        with col2:
-            lemma_analysis = st.checkbox("☑️ Lemma-based", value=True, key="lemma_analysis_enabled")
-
-        # Global Options
-        st.write("### ⚙️ Global Options")
-        word_type_filter = st.selectbox(
-            "Word Type Filter:",
-            options=[None, 'CW', 'FW'],
-            format_func=lambda x: 'All Words ▼' if x is None else ('Content Words' if x == 'CW' else 'Function Words'),
-            key="word_type_filter"
-        )
-
-        # Advanced Configuration Section
-        with st.expander("🎯 Advanced Configuration (Optional)", expanded=False):
-            st.info("ℹ️ **Smart Defaults Active**: The system automatically applies appropriate settings. "
-                    "Expand this section only if you need custom control.")
-
-            # Legacy log transformation toggle
-            legacy_log_toggle = st.checkbox(
-                "Apply log₁₀ transformation to ALL measures (Legacy Mode)",
-                value=False,
-                help="⚠️ Not recommended: This applies log transformation to all measures, "
-                     "including those where it's scientifically inappropriate (e.g., concreteness ratings).",
-                key="legacy_log_transform"
-            )
-
-            if legacy_log_toggle:
-                st.warning("⚠️ Legacy mode enabled: Log transformation will be applied to ALL numerical measures. "
-                           "This may produce scientifically invalid results for psycholinguistic measures.")
-
-        # Return enhanced configuration
-        return {
-            'token_analysis': token_analysis,
-            'lemma_analysis': lemma_analysis,
-            'word_type_filter': word_type_filter,
-            'use_smart_defaults': not st.session_state.get('legacy_log_transform', False),
-            'legacy_log_transform': st.session_state.get('legacy_log_transform', False),
-            'selected_measures': {},  # Will be filled by smart defaults
-            'log_transforms': {}  # Will be filled by smart defaults
-        }
+        """Render the enhanced analysis interface with advanced measure selection capabilities."""
+        # Use the new enhanced UI from UIComponents
+        return UIComponents.render_analysis_options()
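Note: the mean marker added in the density-plot hunk uses Plotly's `Figure.add_vline`, which draws a vertical reference line with an optional annotation in one call. A minimal usage sketch with toy data:

    import numpy as np
    import plotly.graph_objects as go

    scores = [1.0, 2.0, 4.0]
    fig = go.Figure(go.Histogram(x=scores, histnorm='probability density'))
    mean_score = float(np.mean(scores))
    fig.add_vline(x=mean_score, line_dash="dash", line_color="green", line_width=2,
                  annotation_text=f"Mean: {mean_score:.3f}", annotation_position="top")
    # fig.show()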
web_app/handlers/frequency_handlers.py CHANGED
@@ -18,6 +18,7 @@ import sys
 import os
 from pathlib import Path
 from io import StringIO, BytesIO
+import csv
 
 # Add parent directory to path for imports
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
@@ -111,7 +112,8 @@ class FrequencyHandlers:
         df_preview = pd.read_csv(StringIO(content),
                                  sep=st.session_state.format_info['separator'],
                                  header=0 if st.session_state.format_info['has_header'] else None,
-                                 nrows=100)
+                                 nrows=100,
+                                 quoting=csv.QUOTE_MINIMAL, quotechar='"')
 
         # Detect available columns
         st.session_state.detected_cols = st.session_state.analyzer.detect_columns(df_preview)
web_app/utils/memory_file_handler.py CHANGED
@@ -10,6 +10,7 @@ from io import BytesIO, StringIO
 from typing import Optional, Union, Dict, Any
 import pandas as pd
 import zipfile
+import csv
 
 
 class MemoryFileHandler:
@@ -76,7 +77,8 @@ class MemoryFileHandler:
             delimiter = ','
 
         # Read directly into DataFrame
-        df = pd.read_csv(uploaded_file, delimiter=delimiter, encoding='utf-8')
+        df = pd.read_csv(uploaded_file, delimiter=delimiter, encoding='utf-8',
+                         quoting=csv.QUOTE_MINIMAL, quotechar='"')
         return df
 
     except Exception as e:
@@ -167,4 +169,4 @@ class MemoryFileHandler:
         for key in keys_to_remove:
             del st.session_state[key]
     else:
-        st.session_state.clear()
+        st.session_state.clear()