egumasa committed
Commit e7279e4 · 1 Parent(s): 42f8800
config/reference_lists.yaml CHANGED
@@ -88,7 +88,7 @@ english:
       measure_classifications:
         concreteness: psycholinguistic
       header_prefix: '#'
-
+
     concreteness_ratings_lemma:
       display_name: Concreteness Ratings (Lemma)
       description: Concreteness ratings for English words (1-5 scale) - lemma-based
@@ -241,6 +241,7 @@ english:
         normalized_freq: frequency
         documents: range
         range: range
+
     COCA_spoken_bigram_frequency_lemma:
       display_name: COCA Spoken Bigram Frequency (Lemma)
       description: Bigram frequencies and range data - lemma-based analysis
@@ -270,6 +271,7 @@ english:
         normalized_freq: frequency
         documents: range
         range: range
+
     COCA_spoken_bigram_association_token:
       display_name: COCA Spoken Bigram Associations (Token)
       description: Bigram association measures (MI, T-score, Delta P) - token-based
@@ -308,13 +310,21 @@ english:
         t_score: association
         delta_p: association
         ap_collex: association
+
     COCA_spoken_bigram_association_lemma:
       display_name: COCA Spoken Bigram Associations (Lemma)
       description: Bigram association measures (MI, T-score, Delta P) - lemma-based
         analysis
       file: resources/reference_lists/en/spoken_bigram_lemma_contingency.csv
       format: csv
-     columns: *id005
+     columns:
+       bigram: 0
+       frequency: 1
+       mi_score: 5
+       mi_2_score: 6
+       t_score: 7
+       delta_p: 8
+       ap_collex: 9
      has_header: true
      enabled: true
      analysis_type: lemma
@@ -339,6 +349,7 @@ english:
        t_score: association
        delta_p: association
        ap_collex: association
+
    COCA_magazine_bigram_frequency_token:
      display_name: COCA Magazine Bigram Frequency (Token)
      description: Bigram frequencies and range data in Magazine - token-based analysis
@@ -373,6 +384,7 @@ english:
        normalized_freq: frequency
        documents: range
        range: range
+
    COCA_magazine_bigram_frequency_lemma:
      display_name: COCA Magazine Bigram Frequency (Lemma)
      description: Bigram frequencies and range data in Magazine - lemma-based analysis
@@ -402,6 +414,7 @@ english:
        normalized_freq: frequency
        documents: range
        range: range
+
    COCA_magazine_bigram_association_token:
      display_name: COCA Magazine Bigram Associations (Token)
      description: Bigram association measures (MI, T-score, Delta P) - token-based
@@ -971,6 +984,7 @@ japanese:
        pos: unknown
        frequency: frequency
      japanese_corpus: true
+
    jp_frequency_token:
      display_name: Japanese Frequency List (Token)
      description: Frequency data for Japanese words - token-based analysis
@@ -992,6 +1006,7 @@ japanese:
      - frequency
      measure_classifications:
        frequency: frequency
+
    jp_frequency_lemma:
      display_name: Japanese Frequency List (Lemma)
      description: Frequency data for Japanese words - lemma-based analysis
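Note on the `columns` change above: `*id005` is a YAML alias, so before this commit the entry reused a column map anchored elsewhere in the file; the parsed config was identical, but inlining the mapping makes the entry self-contained and lets its layout diverge later. A minimal sketch of how such a name-to-index map resolves against a loaded CSV (the header names and row below are made up for illustration; only the positions mirror the YAML), in the spirit of the measure_mapping loop added to text_analyzer/lexical_sophistication.py later in this commit:

import io

import pandas as pd

# Illustrative stand-in for spoken_bigram_lemma_contingency.csv: ten columns,
# with the n-gram text in column 0 and measures at the positions given above.
csv_text = (
    "bigram,freq,c2,c3,c4,mi,mi2,t,dp,ap\n"
    "of the,951.0,693.0,0.02,1.17,8.03,7.5,21.3,0.036,657.4\n"
)
df = pd.read_csv(io.StringIO(csv_text))

# The inlined YAML mapping: measure name -> zero-based column index.
columns_config = {
    'bigram': 0, 'frequency': 1, 'mi_score': 5, 'mi_2_score': 6,
    't_score': 7, 'delta_p': 8, 'ap_collex': 9,
}

# Resolve indices to the DataFrame's actual column names.
measure_mapping = {
    df.columns[idx]: name
    for name, idx in columns_config.items()
    if isinstance(idx, int) and idx < len(df.columns)
}
print(measure_mapping)
# {'bigram': 'bigram', 'freq': 'frequency', 'mi': 'mi_score', 'mi2': 'mi_2_score',
#  't': 't_score', 'dp': 'delta_p', 'ap': 'ap_collex'}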
debug_bigram_trigram.py ADDED
@@ -0,0 +1,78 @@
+ #!/usr/bin/env python3
+ """
+ Debug script to test bigram and trigram processing
+ """
+ 
+ import sys
+ import os
+ 
+ # Add the project root to the path
+ sys.path.insert(0, os.getcwd())
+ 
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ from web_app.config_manager import ConfigManager
+ 
+ # Test simple text
+ test_text = "The cat sat on the mat. The dog ran quickly."
+ 
+ # Create analyzer
+ analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+ 
+ # Load config
+ config = ConfigManager.load_reference_config()
+ english_config = config.get('english', {})
+ 
+ print("=== Available Reference Lists ===")
+ for ngram_type, lists in english_config.items():
+     print(f"\n{ngram_type.upper()}:")
+     for list_name, list_config in lists.items():
+         if list_config.get('enabled', True):
+             print(f"  - {list_name}")
+ 
+ # Test loading a bigram reference
+ print("\n=== Testing Bigram Reference Loading ===")
+ bigram_config = english_config.get('bigrams', {}).get('COCA_spoken_bigram_frequency_token', {})
+ if bigram_config:
+     print(f"Config: {bigram_config}")
+ 
+     # Load the data
+     data = ConfigManager.load_reference_list_data(bigram_config)
+     print(f"Loaded data keys: {data.keys()}")
+ 
+     if 'bigram' in data:
+         bigram_df = data['bigram']
+         print(f"Bigram DataFrame shape: {bigram_df.shape}")
+         print(f"Bigram DataFrame columns: {list(bigram_df.columns)}")
+         print("First 5 bigrams:")
+         print(bigram_df.head())
+ 
+ # Test with full reference list structure
+ print("\n=== Testing Analyzer with Bigram References ===")
+ reference_lists = {
+     'COCA_spoken_bigram_frequency_token': ConfigManager.load_reference_list_data(bigram_config)
+ }
+ 
+ print(f"Reference lists for analyzer: {list(reference_lists.keys())}")
+ for name, data in reference_lists.items():
+     print(f"  {name}: {list(data.keys())}")
+ 
+ # Load into analyzer
+ analyzer.load_reference_lists(reference_lists)
+ 
+ # Analyze text
+ results = analyzer.analyze_text(
+     test_text,
+     list(reference_lists.keys()),
+     apply_log=False
+ )
+ 
+ print("\n=== Analysis Results ===")
+ print(f"Summary keys: {list(results['summary'].keys())}")
+ print(f"Raw scores keys: {list(results['raw_scores'].keys())}")
+ print(f"Bigram details count: {len(results.get('bigram_details', []))}")
+ print(f"Trigram details count: {len(results.get('trigram_details', []))}")
+ 
+ if results.get('bigram_details'):
+     print("\nFirst few bigram details:")
+     for detail in results['bigram_details'][:3]:
+         print(f"  {detail}")
debug_plot_columns.py ADDED
@@ -0,0 +1,107 @@
+ #!/usr/bin/env python3
+ """
+ Debug script to examine column naming issues in bigram/trigram plots
+ """
+ 
+ import sys
+ import os
+ 
+ # Add the project root to the path
+ sys.path.insert(0, os.getcwd())
+ 
+ from web_app.config_manager import ConfigManager
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ 
+ def debug_plot_columns():
+     print("=== Debugging Plot Column Names ===")
+ 
+     # Load config and create reference lists
+     config = ConfigManager.load_reference_config()
+     english_config = config.get('english', {})
+ 
+     reference_lists = {}
+ 
+     # Load a unigram, bigram, and trigram reference
+     unigram_config = english_config['unigrams']['COCA_spoken_frequency_token']
+     bigram_config = english_config['bigrams']['COCA_spoken_bigram_frequency_token']
+     trigram_config = english_config['trigrams']['COCA_trigram_frequency_token']
+ 
+     reference_lists['COCA_spoken_frequency_token'] = ConfigManager.load_reference_list_data(unigram_config)
+     reference_lists['COCA_spoken_bigram_frequency_token'] = ConfigManager.load_reference_list_data(bigram_config)
+     reference_lists['COCA_trigram_frequency_token'] = ConfigManager.load_reference_list_data(trigram_config)
+ 
+     # Create analyzer and analyze text
+     analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+     analyzer.load_reference_lists(reference_lists)
+ 
+     test_text = "The cat sat on the mat. The dog ran quickly."
+     results = analyzer.analyze_text(test_text, list(reference_lists.keys()), apply_log=False)
+ 
+     print("\n=== Raw Scores Keys ===")
+     for key in results['raw_scores'].keys():
+         print(f"  {key}")
+ 
+     print("\n=== Token Details Columns ===")
+     if results['token_details']:
+         print(f"  Sample token: {list(results['token_details'][0].keys())}")
+ 
+     print("\n=== Bigram Details Columns ===")
+     if results['bigram_details']:
+         print(f"  Sample bigram: {list(results['bigram_details'][0].keys())}")
+ 
+     print("\n=== Trigram Details Columns ===")
+     if results['trigram_details']:
+         print(f"  Sample trigram: {list(results['trigram_details'][0].keys())}")
+ 
+     print("\n=== Column Matching Analysis ===")
+ 
+     # Test the current algorithm for bigrams
+     for key in results['raw_scores'].keys():
+         if '_bigram_' in key:
+             print(f"\nAnalyzing bigram key: {key}")
+             key_parts = key.split('_')
+             if len(key_parts) >= 3 and 'bigram' in key_parts:
+                 measure_name = '_'.join(key_parts[key_parts.index('bigram') + 1:])
+                 index_measure_col = f"{key_parts[0]}_{measure_name}"
+                 print(f"  Algorithm expects column: '{index_measure_col}'")
+ 
+                 # Check if this column exists in bigram_details
+                 if results['bigram_details']:
+                     sample_bigram = results['bigram_details'][0]
+                     if index_measure_col in sample_bigram:
+                         print(f"  ✅ Column found in bigram_details")
+                     else:
+                         print(f"  ❌ Column NOT found in bigram_details")
+                         print(f"  Available columns: {list(sample_bigram.keys())}")
+ 
+                         # Try to find the correct column
+                         for col in sample_bigram.keys():
+                             if measure_name in col:
+                                 print(f"  Possible match: '{col}'")
+ 
+     # Test the current algorithm for trigrams
+     for key in results['raw_scores'].keys():
+         if '_trigram_' in key:
+             print(f"\nAnalyzing trigram key: {key}")
+             key_parts = key.split('_')
+             if len(key_parts) >= 3 and 'trigram' in key_parts:
+                 measure_name = '_'.join(key_parts[key_parts.index('trigram') + 1:])
+                 index_measure_col = f"{key_parts[0]}_{measure_name}"
+                 print(f"  Algorithm expects column: '{index_measure_col}'")
+ 
+                 # Check if this column exists in trigram_details
+                 if results['trigram_details']:
+                     sample_trigram = results['trigram_details'][0]
+                     if index_measure_col in sample_trigram:
+                         print(f"  ✅ Column found in trigram_details")
+                     else:
+                         print(f"  ❌ Column NOT found in trigram_details")
+                         print(f"  Available columns: {list(sample_trigram.keys())}")
+ 
+                         # Try to find the correct column
+                         for col in sample_trigram.keys():
+                             if measure_name in col:
+                                 print(f"  Possible match: '{col}'")
+ 
+ if __name__ == "__main__":
+     debug_plot_columns()
test/test_advanced_selection.py ADDED
@@ -0,0 +1,174 @@
+ #!/usr/bin/env python3
+ """
+ Test script to verify the advanced selection UI implementation.
+ """
+ 
+ import sys
+ import os
+ from pathlib import Path
+ 
+ # Add project root to path (this script lives in test/)
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))
+ 
+ def test_ui_components_grouping():
+     """Test the grouping functionality of UI components."""
+     print("Testing UI Components Grouping...")
+ 
+     try:
+         from web_app.components.ui_components import UIComponents
+         from web_app.config_manager import ConfigManager
+ 
+         # Load the configuration
+         config = ConfigManager.load_reference_config()
+ 
+         # Simulate reference lists
+         mock_reference_lists = {
+             'COCA_spoken_frequency_token': {},
+             'COCA_spoken_frequency_lemma': {},
+             'concreteness_ratings_token': {},
+             'concreteness_ratings_lemma': {}
+         }
+ 
+         # Test grouping function
+         groups = UIComponents._group_reference_lists(mock_reference_lists, config)
+ 
+         print(f"✅ Grouping successful! Found {len(groups)} groups:")
+         for base_name, group_data in groups.items():
+             token_count = len(group_data['token'])
+             lemma_count = len(group_data['lemma'])
+             print(f"  - {base_name}: {token_count} token entries, {lemma_count} lemma entries")
+ 
+         return True
+ 
+     except Exception as e:
+         print(f"❌ Grouping test failed: {e}")
+         return False
+ 
+ def test_config_structure():
+     """Test that the configuration has the expected structure."""
+     print("\nTesting Configuration Structure...")
+ 
+     try:
+         from web_app.config_manager import ConfigManager
+ 
+         config = ConfigManager.load_reference_config()
+ 
+         # Check for expected keys
+         expected_sections = ['english', 'japanese']
+         found_sections = []
+ 
+         for section in expected_sections:
+             if section in config:
+                 found_sections.append(section)
+                 print(f"  ✅ Found {section} section")
+ 
+                 # Check for subsections
+                 for subsection in ['unigrams', 'bigrams', 'trigrams']:
+                     if subsection in config[section]:
+                         entries = len(config[section][subsection])
+                         print(f"    - {subsection}: {entries} entries")
+ 
+         if found_sections:
+             print(f"✅ Configuration structure valid!")
+ 
+             # Check for advanced selection fields
+             sample_entry = None
+             for lang in config.values():
+                 if isinstance(lang, dict):
+                     for ngram_type in lang.values():
+                         if isinstance(ngram_type, dict):
+                             for entry_name, entry_config in ngram_type.items():
+                                 sample_entry = entry_config
+                                 break
+                             break
+                     break
+ 
+             if sample_entry:
+                 required_fields = ['selectable_measures', 'default_measures', 'default_log_transforms', 'log_transformable']
+                 missing_fields = []
+ 
+                 for field in required_fields:
+                     if field not in sample_entry:
+                         missing_fields.append(field)
+                     else:
+                         print(f"  ✅ Found {field}: {sample_entry[field]}")
+ 
+                 if missing_fields:
+                     print(f"  ⚠️ Missing fields: {missing_fields}")
+                 else:
+                     print("  ✅ All advanced selection fields present!")
+ 
+             return True
+         else:
+             print("❌ No valid configuration sections found")
+             return False
+ 
+     except Exception as e:
+         print(f"❌ Configuration test failed: {e}")
+         return False
+ 
+ def test_analyzer_parameters():
+     """Test that the analyzer accepts the new parameters."""
+     print("\nTesting Analyzer Parameters...")
+ 
+     try:
+         from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ 
+         # Create analyzer
+         analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+ 
+         # Test parameter signature
+         import inspect
+         analyze_signature = inspect.signature(analyzer.analyze_text)
+         params = list(analyze_signature.parameters.keys())
+ 
+         required_params = ['log_transforms', 'selected_measures']
+         found_params = []
+ 
+         for param in required_params:
+             if param in params:
+                 found_params.append(param)
+                 print(f"  ✅ Found parameter: {param}")
+             else:
+                 print(f"  ❌ Missing parameter: {param}")
+ 
+         if len(found_params) == len(required_params):
+             print("✅ Analyzer has all required parameters!")
+             return True
+         else:
+             print(f"❌ Analyzer missing {len(required_params) - len(found_params)} parameters")
+             return False
+ 
+     except Exception as e:
+         print(f"❌ Analyzer test failed: {e}")
+         return False
+ 
+ def main():
+     """Run all tests."""
+     print("🧪 Testing Advanced Selection Implementation\n")
+ 
+     tests = [
+         test_config_structure,
+         test_ui_components_grouping,
+         test_analyzer_parameters
+     ]
+ 
+     passed = 0
+     total = len(tests)
+ 
+     for test in tests:
+         if test():
+             passed += 1
+ 
+     print(f"\n📊 Test Results: {passed}/{total} tests passed")
+ 
+     if passed == total:
+         print("🎉 All tests passed! Advanced selection implementation is ready.")
+         return 0
+     else:
+         print("⚠️ Some tests failed. Please check the implementation.")
+         return 1
+ 
+ if __name__ == "__main__":
+     exit(main())
test/test_column_naming_fix.py ADDED
@@ -0,0 +1,95 @@
+ #!/usr/bin/env python3
+ """
+ Test script to verify that the column naming bug is fixed.
+ This script specifically tests that we don't get duplicate suffixes like:
+ - COCA_spoken_frequency_token_token
+ - COCA_spoken_frequency_lemma_lemma
+ """
+ 
+ import sys
+ import os
+ from pathlib import Path
+ 
+ # Add project root to path (this script lives in test/)
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))
+ 
+ def test_column_naming():
+     """Test that column names are clean without duplicate suffixes."""
+     print("Testing Column Naming Fix...")
+ 
+     try:
+         from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ 
+         # Create analyzer
+         analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+ 
+         # Create mock reference lists
+         mock_reference_lists = {
+             'COCA_spoken_frequency_token': {
+                 'token': {'hello': 100, 'world': 200}
+             },
+             'COCA_spoken_frequency_lemma': {
+                 'lemma': {'hello': 150, 'world': 250}
+             }
+         }
+ 
+         # Load reference lists
+         analyzer.load_reference_lists(mock_reference_lists)
+ 
+         # Analyze a simple text
+         text = "Hello world, this is a test."
+         selected_indices = ['COCA_spoken_frequency_token', 'COCA_spoken_frequency_lemma']
+ 
+         results = analyzer.analyze_text(text, selected_indices)
+ 
+         # Check token details for clean column names
+         if results['token_details']:
+             first_token = results['token_details'][0]
+             column_names = list(first_token.keys())
+ 
+             print(f"Column names found: {column_names}")
+ 
+             # Check for problematic duplicate suffixes
+             problematic_columns = []
+             for col in column_names:
+                 if '_token_token' in col or '_lemma_lemma' in col or '_token_lemma' in col or '_lemma_token' in col:
+                     problematic_columns.append(col)
+ 
+             if problematic_columns:
+                 print(f"❌ Found problematic column names: {problematic_columns}")
+                 return False
+             else:
+                 print("✅ No duplicate suffixes found in column names!")
+ 
+                 # Check that we have the expected clean column names
+                 expected_clean_columns = ['COCA_spoken_frequency_token', 'COCA_spoken_frequency_lemma']
+                 found_clean_columns = [col for col in column_names if col in expected_clean_columns]
+ 
+                 if found_clean_columns:
+                     print(f"✅ Found expected clean columns: {found_clean_columns}")
+                     return True
+                 else:
+                     print(f"⚠️ Expected clean columns not found. Available columns: {column_names}")
+                     return False
+         else:
+             print("❌ No token details found in results")
+             return False
+ 
+     except Exception as e:
+         print(f"❌ Test failed with error: {e}")
+         return False
+ 
+ def main():
+     """Run the column naming test."""
+     print("🧪 Testing Column Naming Fix\n")
+ 
+     if test_column_naming():
+         print("\n🎉 Column naming bug has been fixed!")
+         return 0
+     else:
+         print("\n⚠️ Column naming issue still exists.")
+         return 1
+ 
+ if __name__ == "__main__":
+     exit(main())
test_file_upload_handler.py → test/test_file_upload_handler.py RENAMED
File without changes
test_fix_403.py → test/test_fix_403.py RENAMED
File without changes
test/test_log_transform_consistency.py ADDED
@@ -0,0 +1,121 @@
+ #!/usr/bin/env python3
+ """
+ Test script to verify that log transformations are consistently applied
+ to both token details table and summary statistics.
+ """
+ 
+ import sys
+ import os
+ from pathlib import Path
+ import numpy as np
+ 
+ # Add project root to path (this script lives in test/)
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))
+ 
+ def test_log_transform_consistency():
+     """Test that log transformations are applied consistently."""
+     print("Testing Log Transform Consistency...")
+ 
+     try:
+         from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ 
+         # Create analyzer
+         analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+ 
+         # Create mock reference lists with known values
+         mock_reference_lists = {
+             'COCA_spoken_frequency_token': {
+                 'token': {'hello': 1000, 'world': 2000}  # Values that will be log-transformed
+             }
+         }
+ 
+         # Load reference lists
+         analyzer.load_reference_lists(mock_reference_lists)
+ 
+         # Analyze text with log transformation enabled
+         text = "Hello world"
+         selected_indices = ['COCA_spoken_frequency_token']
+ 
+         # Enable log transformation for this index
+         log_transforms = {
+             'COCA_spoken_frequency_token': ['frequency']
+         }
+ 
+         results = analyzer.analyze_text(
+             text,
+             selected_indices,
+             log_transforms=log_transforms
+         )
+ 
+         # Get token details and summary statistics
+         token_details = results['token_details']
+         summary_stats = results['summary']
+ 
+         print(f"Token details: {len(token_details)} tokens")
+         print(f"Summary keys: {list(summary_stats.keys())}")
+ 
+         # Check consistency between token details and summary
+         if token_details:
+             # Get log-transformed values from token details
+             token_scores = []
+             for token_detail in token_details:
+                 score = token_detail.get('COCA_spoken_frequency_token')
+                 if score is not None:
+                     token_scores.append(score)
+                     print(f"Token '{token_detail['token']}': score = {score}")
+ 
+             if token_scores:
+                 # Calculate mean from token details
+                 token_mean = np.mean(token_scores)
+ 
+                 # Get mean from summary statistics
+                 summary_key = 'COCA_spoken_frequency_token_CW'  # Content words
+                 if summary_key in summary_stats:
+                     summary_mean = summary_stats[summary_key]['mean']
+ 
+                     print(f"Token details mean: {token_mean}")
+                     print(f"Summary stats mean: {summary_mean}")
+ 
+                     # Check if they're approximately equal (allowing for floating point precision)
+                     if abs(token_mean - summary_mean) < 0.001:
+                         print("✅ Token details and summary statistics are consistent!")
+ 
+                         # Check that values are actually log-transformed
+                         # Original values were 1000 and 2000, log10 would be ~3.0 and ~3.3
+                         if all(2.5 < score < 3.5 for score in token_scores):
+                             print("✅ Values appear to be properly log-transformed!")
+                             return True
+                         else:
+                             print(f"⚠️ Values don't appear to be log-transformed: {token_scores}")
+                             return False
+                     else:
+                         print(f"❌ Inconsistency found: token mean = {token_mean}, summary mean = {summary_mean}")
+                         return False
+                 else:
+                     print(f"❌ Summary key '{summary_key}' not found in summary stats")
+                     return False
+             else:
+                 print("❌ No token scores found")
+                 return False
+         else:
+             print("❌ No token details found")
+             return False
+ 
+     except Exception as e:
+         print(f"❌ Test failed with error: {e}")
+         return False
+ 
+ def main():
+     """Run the log transform consistency test."""
+     print("🧪 Testing Log Transform Consistency\n")
+ 
+     if test_log_transform_consistency():
+         print("\n🎉 Log transformation consistency has been fixed!")
+         return 0
+     else:
+         print("\n⚠️ Log transformation consistency issue still exists.")
+         return 1
+ 
+ if __name__ == "__main__":
+     exit(main())
test_memory_upload.py → test/test_memory_upload.py RENAMED
File without changes
test_tmp_upload.py → test/test_tmp_upload.py RENAMED
File without changes
test_column_matching.py ADDED
@@ -0,0 +1,50 @@
+ #!/usr/bin/env python3
+ """
+ Test script to understand the exact column matching pattern
+ """
+ 
+ # Test the pattern manually
+ raw_scores_keys = [
+     'COCA_spoken_bigram_frequency_token_bigram_frequency',
+     'COCA_spoken_bigram_frequency_token_bigram_normalized_freq',
+     'COCA_trigram_frequency_token_trigram_frequency'
+ ]
+ 
+ actual_columns = [
+     'COCA_spoken_bigram_frequency_token_frequency',
+     'COCA_spoken_bigram_frequency_token_normalized_freq',
+     'COCA_trigram_frequency_token_frequency'
+ ]
+ 
+ print("=== Pattern Analysis ===")
+ for raw_key, expected_col in zip(raw_scores_keys, actual_columns):
+     print(f"\nRaw key: {raw_key}")
+     print(f"Expected: {expected_col}")
+ 
+     # The correct pattern - remove only the redundant _bigram or _trigram from the end measure
+     if '_bigram_' in raw_key:
+         # Find the last occurrence of '_bigram'
+         idx = raw_key.rfind('_bigram')
+         if idx != -1:
+             # Remove only the '_bigram' part, keep everything else
+             strategy = raw_key[:idx] + raw_key[idx+7:]  # 7 = len('_bigram')
+         else:
+             strategy = raw_key
+     elif '_trigram_' in raw_key:
+         # Find the last occurrence of '_trigram'
+         idx = raw_key.rfind('_trigram')
+         if idx != -1:
+             # Remove only the '_trigram' part, keep everything else
+             strategy = raw_key[:idx] + raw_key[idx+8:]  # 8 = len('_trigram')
+         else:
+             strategy = raw_key
+     else:
+         strategy = raw_key
+ 
+     print(f"Strategy (remove last _bigram/_trigram): {strategy}")
+ 
+     if strategy == expected_col:
+         print("✅ This strategy works!")
+     else:
+         print("❌ Still doesn't work")
+         print(f"Difference: '{strategy}' vs '{expected_col}'")
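The winning strategy above packages naturally into a small helper; this is only a restatement of the logic the loop verifies, with a hypothetical function name:

def strip_ngram_marker(raw_key: str) -> str:
    # Hypothetical helper: drop the last '_bigram'/'_trigram' marker from a
    # raw-scores key, mirroring the rfind strategy tested above.
    for marker in ('_bigram', '_trigram'):
        if f'{marker}_' in raw_key:
            idx = raw_key.rfind(marker)
            return raw_key[:idx] + raw_key[idx + len(marker):]
    return raw_key

assert strip_ngram_marker(
    'COCA_spoken_bigram_frequency_token_bigram_frequency'
) == 'COCA_spoken_bigram_frequency_token_frequency'
assert strip_ngram_marker(
    'COCA_trigram_frequency_token_trigram_frequency'
) == 'COCA_trigram_frequency_token_frequency'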
test_csv_comma_handling.py ADDED
@@ -0,0 +1,72 @@
+ #!/usr/bin/env python3
+ """
+ Test script to demonstrate CSV comma handling with and without quoting parameters.
+ """
+ 
+ import pandas as pd
+ import csv
+ from io import StringIO
+ 
+ def test_csv_comma_handling():
+     """Test how different CSV reading approaches handle commas in data."""
+ 
+     # Sample problematic CSV data
+     problematic_csv = """word,freq1,freq2,freq3,other1,other2,other3,other4,other5,other6
+ murder in,951.0,11.2359497461,693.0,0.0211467455982,1.17513238089,8.03264644343,21.3160999278,0.0364941871107,657.479274987
+ $ 100,000,950.0,11.2241348673,710.0,0.0216654969333,6.51710183621,13.3735638208,30.7765166526,0.0169430040291,949.18097172
+ normal_word,800.0,10.5,600.0,0.02,1.5,7.2,18.5,0.04,500.0"""
+ 
+     # Properly quoted CSV data
+     quoted_csv = """word,freq1,freq2,freq3,other1,other2,other3,other4,other5,other6
+ "murder in",951.0,11.2359497461,693.0,0.0211467455982,1.17513238089,8.03264644343,21.3160999278,0.0364941871107,657.479274987
+ "$ 100,000",950.0,11.2241348673,710.0,0.0216654969333,6.51710183621,13.3735638208,30.7765166526,0.0169430040291,949.18097172
+ normal_word,800.0,10.5,600.0,0.02,1.5,7.2,18.5,0.04,500.0"""
+ 
+     print("=== Testing CSV Comma Handling ===\n")
+ 
+     # Test 1: Default pandas behavior (problematic)
+     print("1. Default pandas behavior with problematic CSV:")
+     try:
+         df_default = pd.read_csv(StringIO(problematic_csv))
+         print(f"   Columns detected: {len(df_default.columns)}")
+         print(f"   Column names: {list(df_default.columns)}")
+         print(f"   First row data: {df_default.iloc[0].tolist()}")
+         print(f"   Shape: {df_default.shape}")
+     except Exception as e:
+         print(f"   Error: {e}")
+     print()
+ 
+     # Test 2: With quoting parameters (our solution)
+     print("2. With quoting parameters (our solution):")
+     try:
+         df_quoted = pd.read_csv(StringIO(problematic_csv),
+                                 quoting=csv.QUOTE_MINIMAL, quotechar='"')
+         print(f"   Columns detected: {len(df_quoted.columns)}")
+         print(f"   Column names: {list(df_quoted.columns)}")
+         print(f"   First row data: {df_quoted.iloc[0].tolist()}")
+         print(f"   Shape: {df_quoted.shape}")
+     except Exception as e:
+         print(f"   Error: {e}")
+     print()
+ 
+     # Test 3: With properly quoted CSV
+     print("3. With properly quoted CSV data:")
+     try:
+         df_proper = pd.read_csv(StringIO(quoted_csv))
+         print(f"   Columns detected: {len(df_proper.columns)}")
+         print(f"   Column names: {list(df_proper.columns)}")
+         print(f"   First row word: '{df_proper.iloc[0]['word']}'")
+         print(f"   Second row word: '{df_proper.iloc[1]['word']}'")
+         print(f"   Shape: {df_proper.shape}")
+     except Exception as e:
+         print(f"   Error: {e}")
+     print()
+ 
+     # Test 4: Show the difference
+     print("4. Comparison of approaches:")
+     print("   Without quoting: Data with commas gets split incorrectly")
+     print("   With quoting: pandas can handle quoted fields properly")
+     print("   Best practice: Quote fields that contain commas in the source CSV")
+ 
+ if __name__ == "__main__":
+     test_csv_comma_handling()
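On the writing side, the stdlib csv writer with the same parameters emits the "properly quoted" form the script recommends; a minimal sketch, not code from this repo:

import csv
import io

buf = io.StringIO()
writer = csv.writer(buf, quoting=csv.QUOTE_MINIMAL, quotechar='"', lineterminator='\n')
writer.writerow(['word', 'freq1'])
writer.writerow(['$ 100,000', 950.0])    # field contains a comma -> gets quoted
writer.writerow(['normal_word', 800.0])  # no special characters -> left bare
print(buf.getvalue())
# word,freq1
# "$ 100,000",950.0
# normal_word,800.0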
test_plot_fix.py ADDED
@@ -0,0 +1,106 @@
+ #!/usr/bin/env python3
+ """
+ Test script to verify the fix for bigram/trigram plot sample words
+ """
+ 
+ import sys
+ import os
+ 
+ # Add the project root to the path
+ sys.path.insert(0, os.getcwd())
+ 
+ from web_app.config_manager import ConfigManager
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ 
+ def test_plot_fix():
+     print("=== Testing Plot Fix ===")
+ 
+     # Load config and create reference lists
+     config = ConfigManager.load_reference_config()
+     english_config = config.get('english', {})
+ 
+     reference_lists = {}
+ 
+     # Load a unigram, bigram, and trigram reference
+     unigram_config = english_config['unigrams']['COCA_spoken_frequency_token']
+     bigram_config = english_config['bigrams']['COCA_spoken_bigram_frequency_token']
+     trigram_config = english_config['trigrams']['COCA_trigram_frequency_token']
+ 
+     reference_lists['COCA_spoken_frequency_token'] = ConfigManager.load_reference_list_data(unigram_config)
+     reference_lists['COCA_spoken_bigram_frequency_token'] = ConfigManager.load_reference_list_data(bigram_config)
+     reference_lists['COCA_trigram_frequency_token'] = ConfigManager.load_reference_list_data(trigram_config)
+ 
+     # Create analyzer and analyze text
+     analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+     analyzer.load_reference_lists(reference_lists)
+ 
+     test_text = "The cat sat on the mat. The dog ran quickly."
+     results = analyzer.analyze_text(test_text, list(reference_lists.keys()), apply_log=False)
+ 
+     print("\n=== Testing Column Matching with Fixed Algorithm ===")
+ 
+     # Test the fixed algorithm for bigrams
+     for key in results['raw_scores'].keys():
+         if '_bigram_' in key:
+             print(f"\nTesting bigram key: {key}")
+             # Use the new algorithm: remove '_bigram' from the key
+             index_measure_col = key.replace('_bigram', '')
+             print(f"  Fixed algorithm expects column: '{index_measure_col}'")
+ 
+             # Check if this column exists in bigram_details
+             if results['bigram_details']:
+                 sample_bigram = results['bigram_details'][0]
+                 if index_measure_col in sample_bigram:
+                     print(f"  ✅ Column found in bigram_details")
+ 
+                     # Test if we can build word_score_map successfully
+                     word_score_map = {}
+                     for bigram_detail in results['bigram_details']:
+                         if index_measure_col in bigram_detail and bigram_detail[index_measure_col] is not None:
+                             bigram_text = bigram_detail.get('bigram', '')
+                             word_score_map[bigram_text] = bigram_detail[index_measure_col]
+ 
+                     print(f"  ✅ Successfully built word_score_map with {len(word_score_map)} entries")
+                     if word_score_map:
+                         sample_entries = list(word_score_map.items())[:3]
+                         print(f"  Sample entries: {sample_entries}")
+                 else:
+                     print(f"  ❌ Column still NOT found in bigram_details")
+ 
+     # Test the fixed algorithm for trigrams
+     for key in results['raw_scores'].keys():
+         if '_trigram_' in key:
+             print(f"\nTesting trigram key: {key}")
+             # Use the new algorithm: remove '_trigram' from the key
+             index_measure_col = key.replace('_trigram', '')
+             print(f"  Fixed algorithm expects column: '{index_measure_col}'")
+ 
+             # Check if this column exists in trigram_details
+             if results['trigram_details']:
+                 sample_trigram = results['trigram_details'][0]
+                 if index_measure_col in sample_trigram:
+                     print(f"  ✅ Column found in trigram_details")
+ 
+                     # Test if we can build word_score_map successfully
+                     word_score_map = {}
+                     for trigram_detail in results['trigram_details']:
+                         if index_measure_col in trigram_detail and trigram_detail[index_measure_col] is not None:
+                             trigram_text = trigram_detail.get('trigram', '')
+                             word_score_map[trigram_text] = trigram_detail[index_measure_col]
+ 
+                     print(f"  ✅ Successfully built word_score_map with {len(word_score_map)} entries")
+                     if word_score_map:
+                         sample_entries = list(word_score_map.items())[:3]
+                         print(f"  Sample entries: {sample_entries}")
+                 else:
+                     print(f"  ❌ Column still NOT found in trigram_details")
+ 
+     print("\n=== Fix Verification Complete ===")
+     if any('_bigram_' in key for key in results['raw_scores'].keys()) and any('_trigram_' in key for key in results['raw_scores'].keys()):
+         print("✅ Fix appears to be working correctly!")
+         print("Sample words should now appear in bigram and trigram plots.")
+     else:
+         print("❌ No bigram/trigram results found to test")
+ 
+ if __name__ == "__main__":
+     test_plot_fix()
test_reference_loading_issue.py ADDED
@@ -0,0 +1,116 @@
+ #!/usr/bin/env python3
+ """
+ Test script to diagnose the reference loading issue in the web app
+ """
+ 
+ import sys
+ import os
+ 
+ # Add the project root to the path
+ sys.path.insert(0, os.getcwd())
+ 
+ from web_app.config_manager import ConfigManager
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+ 
+ def test_reference_loading():
+     print("=== Testing Reference Loading Issue ===")
+ 
+     # Load config
+     config = ConfigManager.load_reference_config()
+     english_config = config.get('english', {})
+ 
+     print(f"\nAvailable sections in config: {list(english_config.keys())}")
+ 
+     # Test what happens when we simulate loading different types of references
+     print("\n=== Simulating Reference List Selection ===")
+ 
+     # Simulate selecting some unigrams, bigrams, and trigrams
+     selected_lists = []
+ 
+     # Add a unigram
+     if 'unigrams' in english_config and 'COCA_spoken_frequency_token' in english_config['unigrams']:
+         unigram_config = english_config['unigrams']['COCA_spoken_frequency_token']
+         selected_lists.append(('unigrams', 'COCA_spoken_frequency_token', unigram_config))
+         print("Added unigram: COCA_spoken_frequency_token")
+ 
+     # Add a bigram
+     if 'bigrams' in english_config and 'COCA_spoken_bigram_frequency_token' in english_config['bigrams']:
+         bigram_config = english_config['bigrams']['COCA_spoken_bigram_frequency_token']
+         selected_lists.append(('bigrams', 'COCA_spoken_bigram_frequency_token', bigram_config))
+         print("Added bigram: COCA_spoken_bigram_frequency_token")
+ 
+     # Add a trigram
+     if 'trigrams' in english_config and 'COCA_trigram_frequency_token' in english_config['trigrams']:
+         trigram_config = english_config['trigrams']['COCA_trigram_frequency_token']
+         selected_lists.append(('trigrams', 'COCA_trigram_frequency_token', trigram_config))
+         print("Added trigram: COCA_trigram_frequency_token")
+ 
+     print(f"\nTotal selected lists: {len(selected_lists)}")
+ 
+     # Load reference data directly
+     reference_lists = {}
+     for ngram_type, list_key, list_config in selected_lists:
+         print(f"\nLoading {ngram_type}: {list_key}")
+         data = ConfigManager.load_reference_list_data(list_config)
+ 
+         if data:
+             print(f"  Data keys: {list(data.keys())}")
+             for key, value in data.items():
+                 if hasattr(value, '__len__'):
+                     print(f"  {key}: {len(value)} entries")
+                 else:
+                     print(f"  {key}: {type(value)}")
+ 
+             reference_lists[list_key] = data
+         else:
+             print(f"  Failed to load data for {list_key}")
+ 
+     # Check what was loaded
+     print(f"\n=== Loaded Reference Lists ===")
+     print(f"Keys loaded: {list(reference_lists.keys())}")
+ 
+     for key, data in reference_lists.items():
+         print(f"\n{key}:")
+         for file_type, file_data in data.items():
+             if hasattr(file_data, '__len__'):
+                 print(f"  {file_type}: {len(file_data)} entries")
+             else:
+                 print(f"  {file_type}: {type(file_data)}")
+ 
+     # Test analysis with these reference lists
+     print(f"\n=== Testing Analysis ===")
+     analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')
+ 
+     # Load reference lists into analyzer
+     analyzer.load_reference_lists(reference_lists)
+ 
+     # Test text
+     test_text = "The cat sat on the mat. The dog ran quickly."
+ 
+     # Analyze
+     results = analyzer.analyze_text(
+         test_text,
+         list(reference_lists.keys()),
+         apply_log=False
+     )
+ 
+     print(f"\nAnalysis summary keys: {list(results['summary'].keys())}")
+     print(f"Bigram details count: {len(results.get('bigram_details', []))}")
+     print(f"Trigram details count: {len(results.get('trigram_details', []))}")
+ 
+     # Check for bigram/trigram entries in summary
+     bigram_summary_keys = [k for k in results['summary'].keys() if 'bigram' in k]
+     trigram_summary_keys = [k for k in results['summary'].keys() if 'trigram' in k]
+ 
+     print(f"\nBigram summary keys: {bigram_summary_keys}")
+     print(f"Trigram summary keys: {trigram_summary_keys}")
+ 
+     if not bigram_summary_keys and not trigram_summary_keys:
+         print("\n⚠️ WARNING: No bigram or trigram results in summary!")
+         print("This suggests the issue is in the analysis process, not the display function.")
+     else:
+         print("\n✓ Bigram and trigram results found in summary.")
+         print("The issue might be in how the web app loads reference lists.")
+ 
+ if __name__ == "__main__":
+     test_reference_loading()
text_analyzer/corpus_visualizer.py CHANGED
@@ -15,6 +15,7 @@ import logging
  import re
  from io import StringIO
  import natsort
+ import csv
 
  logger = logging.getLogger(__name__)
 
@@ -102,7 +103,8 @@ class CorpusVisualizer:
          separator = format_info['separator']
 
          # Load into DataFrame
-         df = pd.read_csv(StringIO(content), sep=separator)
+         df = pd.read_csv(StringIO(content), sep=separator,
+                          quoting=csv.QUOTE_MINIMAL, quotechar='"')
 
          # Store the dataframe
          if file_type == 'metadata':
text_analyzer/frequency_analyzer.py CHANGED
@@ -12,6 +12,7 @@ from typing import Dict, List, Tuple, Optional, Union
  import logging
  import random
  from io import StringIO
+ import csv
 
  logger = logging.getLogger(__name__)
 
@@ -208,7 +209,8 @@ class FrequencyAnalyzer:
          has_header = column_config.get('has_header', True)
 
          # Read data
-         df = pd.read_csv(StringIO(content), sep=separator, header=0 if has_header else None)
+         df = pd.read_csv(StringIO(content), sep=separator, header=0 if has_header else None,
+                          quoting=csv.QUOTE_MINIMAL, quotechar='"')
 
          # Store column configuration
          self.column_config = column_config.copy()
text_analyzer/lexical_sophistication.py CHANGED
@@ -82,7 +82,8 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
82
  delimiter = ',' if sample.count(',') > sample.count('\t') else '\t'
83
 
84
  # Load the file
85
- df = pd.read_csv(file_path, delimiter=delimiter, header=0)
 
86
 
87
  if file_type in ['token', 'lemma']:
88
  # Check if this is a custom frequency list format with specific columns
@@ -183,10 +184,12 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
183
  if 'content' in config:
184
  # Use content directly
185
  content_io = StringIO(config['content'])
186
- df = pd.read_csv(content_io, delimiter=delimiter, header=0)
 
187
  elif 'file_path' in config:
188
  # Fallback to file path for backward compatibility
189
- df = pd.read_csv(config['file_path'], delimiter=delimiter, header=0)
 
190
  else:
191
  logger.error("No content or file_path found in config")
192
  return {}
@@ -602,6 +605,18 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
602
 
603
  # Look up scores for each selected index
604
  for index_name in selected_indices:
 
 
 
 
 
 
 
 
 
 
 
 
605
  # Check if this is a Japanese corpus reference list
606
  ref_data = self.reference_lists.get(index_name, {})
607
  is_japanese_corpus = False
@@ -613,69 +628,87 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
613
 
614
  if is_japanese_corpus and self.language == 'ja':
615
  # Use enhanced UniDic lookup with 3-level fallback and diagnostics
616
- token_result = self._lookup_with_unidic_fallback(token, index_name, 'token')
617
- lemma_result = self._lookup_with_unidic_fallback(token, index_name, 'lemma')
618
-
619
- # Extract scores and diagnostic information
620
- token_score = token_result['score']
621
- lemma_score = lemma_result['score']
622
-
623
- # Store enhanced details with diagnostic information
624
- token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
625
- token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
626
-
627
- # Add diagnostic information for debugging
628
- token_detail[f"{index_name}_token_match_method"] = token_result['match_method']
629
- token_detail[f"{index_name}_lemma_match_method"] = lemma_result['match_method']
630
- token_detail[f"{index_name}_token_match_key"] = token_result['match_key'] or None
631
- token_detail[f"{index_name}_lemma_match_key"] = lemma_result['match_key'] or None
632
-
633
- # Store UniDic features for display
634
- if hasattr(token, '_') and hasattr(token._, 'unidic_lemma'):
635
- token_detail['unidic_features'] = {
636
- 'lemma': getattr(token._, 'unidic_lemma', ''),
637
- 'lForm': getattr(token._, 'unidic_lform', ''),
638
- 'pos1': getattr(token._, 'unidic_pos1', ''),
639
- 'pos2': getattr(token._, 'unidic_pos2', ''),
640
- 'goshu': getattr(token._, 'unidic_goshu', ''),
641
- 'alignment_confidence': getattr(token._, 'alignment_confidence', 0.0)
642
- }
643
 
644
  elif is_japanese_corpus:
645
  # Fallback to legacy Japanese lookup if UniDic not available
646
- token_score = self._lookup_japanese_score(token, index_name, 'token', fallback=True)
647
- lemma_score = self._lookup_japanese_score(token, index_name, 'lemma', fallback=True)
648
-
649
- # Store scores
650
- token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
651
- token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
652
- token_detail[f"{index_name}_token_match_method"] = "legacy_spacy"
653
- token_detail[f"{index_name}_lemma_match_method"] = "legacy_spacy"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654
  else:
655
  # Standard lookup for non-Japanese data
656
- token_score = self._lookup_score(token.text, index_name, 'token')
657
- lemma_score = self._lookup_score(token.lemma_, index_name, 'lemma')
 
 
658
 
659
- # Store scores
660
- token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
661
- token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
662
-
663
- # Collect for summary statistics with selective log transformation
664
- if token_score is not None:
665
- # Check if this specific measure should be log-transformed
666
- should_log_transform = self._should_apply_log_transform(
667
- index_name, 'token', 'frequency', log_transforms, apply_log
668
- )
669
- score_val = np.log10(token_score) if should_log_transform and token_score > 0 else token_score
670
- all_scores[f"{index_name}_token_{word_type}"].append(score_val)
671
 
672
- if lemma_score is not None:
673
- # Check if this specific measure should be log-transformed
674
- should_log_transform = self._should_apply_log_transform(
675
- index_name, 'lemma', 'frequency', log_transforms, apply_log
676
- )
677
- score_val = np.log10(lemma_score) if should_log_transform and lemma_score > 0 else lemma_score
678
- all_scores[f"{index_name}_lemma_{word_type}"].append(score_val)
679
 
680
  results['token_details'].append(token_detail)
681
 
@@ -722,25 +755,69 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
722
  if ref_data is None or not isinstance(ref_data, pd.DataFrame):
723
  continue
724
 
725
- # Get available measures
726
- available_measures = ref_data.columns[1:].tolist()
 
 
 
727
 
728
- # Filter measures based on selection
729
- for measure in available_measures:
730
- # Check if this measure should be computed
731
- if not self._should_compute_measure(index_name, measure, selected_measures):
732
- continue
733
-
734
- score = self._lookup_score(ngram, index_name, ngram_type, measure)
735
- if score is not None:
736
- # Check if this measure should be log-transformed
737
- should_log_transform = self._should_apply_log_transform(
738
- index_name, ngram_type, measure, log_transforms, apply_log
739
- )
740
- score_val = np.log10(score) if should_log_transform and score > 0 else score
741
- ngram_detail[f"{index_name}_{measure}"] = score_val
742
- else:
743
- ngram_detail[f"{index_name}_{measure}"] = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
744
 
745
  results[ngram_details_key].append(ngram_detail)
746
 
@@ -753,37 +830,93 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
753
  if ref_data is None or not isinstance(ref_data, pd.DataFrame):
754
  continue
755
 
756
- # Get available measures (all columns except the first one)
757
- available_measures = ref_data.columns[1:].tolist()
 
 
 
758
 
759
- # Filter measures based on selection and compute summary statistics
760
- for measure in available_measures:
761
- # Check if this measure should be computed
762
- if not self._should_compute_measure(index_name, measure, selected_measures):
763
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
764
 
765
- ngram_scores = []
766
- for ngram in ngrams:
767
- score = self._lookup_score(ngram, index_name, ngram_type, measure)
768
- if score is not None:
769
- # Check if this measure should be log-transformed
770
- should_log_transform = self._should_apply_log_transform(
771
- index_name, ngram_type, measure, log_transforms, apply_log
772
- )
773
- score_val = np.log10(score) if should_log_transform and score > 0 else score
774
- ngram_scores.append(score_val)
 
 
 
 
775
 
776
- if ngram_scores:
777
- key = f"{index_name}_{ngram_type}_{measure}"
778
- results['summary'][key] = {
779
- 'mean': np.mean(ngram_scores),
780
- 'std': np.std(ngram_scores),
781
- 'count': len(ngram_scores),
782
- 'min': np.min(ngram_scores),
783
- 'max': np.max(ngram_scores)
784
- }
785
- # Store raw scores for plotting
786
- results['raw_scores'][key] = ngram_scores
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
787
 
788
  return results
789
 
 
82
  delimiter = ',' if sample.count(',') > sample.count('\t') else '\t'
83
 
84
  # Load the file
85
+ df = pd.read_csv(file_path, delimiter=delimiter, header=0,
86
+ quoting=csv.QUOTE_MINIMAL, quotechar='"')
87
 
88
  if file_type in ['token', 'lemma']:
89
  # Check if this is a custom frequency list format with specific columns
 
184
  if 'content' in config:
185
  # Use content directly
186
  content_io = StringIO(config['content'])
187
+ df = pd.read_csv(content_io, delimiter=delimiter, header=0,
188
+ quoting=csv.QUOTE_MINIMAL, quotechar='"')
189
  elif 'file_path' in config:
190
  # Fallback to file path for backward compatibility
191
+ df = pd.read_csv(config['file_path'], delimiter=delimiter, header=0,
192
+ quoting=csv.QUOTE_MINIMAL, quotechar='"')
193
  else:
194
  logger.error("No content or file_path found in config")
195
  return {}
 
605
 
606
  # Look up scores for each selected index
607
  for index_name in selected_indices:
608
+ # Extract base name and determine analysis type to avoid duplicate suffixes
609
+ if index_name.endswith('_token'):
610
+ base_name = index_name[:-6] # Remove '_token'
611
+ analysis_type = 'token'
612
+ elif index_name.endswith('_lemma'):
613
+ base_name = index_name[:-6] # Remove '_lemma'
614
+ analysis_type = 'lemma'
615
+ else:
616
+ # Fallback for entries without clear suffix
617
+ base_name = index_name
618
+ analysis_type = 'token' # Default to token
619
+
620
  # Check if this is a Japanese corpus reference list
621
  ref_data = self.reference_lists.get(index_name, {})
622
  is_japanese_corpus = False
 
628
 
629
  if is_japanese_corpus and self.language == 'ja':
630
  # Use enhanced UniDic lookup with 3-level fallback and diagnostics
631
+ if analysis_type == 'token':
632
+ result = self._lookup_with_unidic_fallback(token, index_name, 'token')
633
+ score = result['score']
634
+
635
+ # Store enhanced details with clean column name
636
+ token_detail[index_name] = score if score is not None else None
637
+ token_detail[f"{index_name}_match_method"] = result['match_method']
638
+ token_detail[f"{index_name}_match_key"] = result['match_key'] or None
639
+
640
+ # Store UniDic features for display (only once per token)
641
+ if hasattr(token, '_') and hasattr(token._, 'unidic_lemma') and 'unidic_features' not in token_detail:
642
+ token_detail['unidic_features'] = {
643
+ 'lemma': getattr(token._, 'unidic_lemma', ''),
644
+ 'lForm': getattr(token._, 'unidic_lform', ''),
645
+ 'pos1': getattr(token._, 'unidic_pos1', ''),
646
+ 'pos2': getattr(token._, 'unidic_pos2', ''),
647
+ 'goshu': getattr(token._, 'unidic_goshu', ''),
648
+ 'alignment_confidence': getattr(token._, 'alignment_confidence', 0.0)
649
+ }
650
+ else: # lemma analysis
651
+ result = self._lookup_with_unidic_fallback(token, index_name, 'lemma')
652
+ score = result['score']
653
+
654
+ # Store enhanced details with clean column name
655
+ token_detail[index_name] = score if score is not None else None
656
+ token_detail[f"{index_name}_match_method"] = result['match_method']
657
+ token_detail[f"{index_name}_match_key"] = result['match_key'] or None
658
 
659
  elif is_japanese_corpus:
660
  # Fallback to legacy Japanese lookup if UniDic not available
661
+ if analysis_type == 'token':
662
+ score = self._lookup_japanese_score(token, index_name, 'token', fallback=True)
663
+
664
+ # Apply log transformation if needed before storing
665
+ if score is not None:
666
+ should_log_transform = self._should_apply_log_transform(
667
+ index_name, analysis_type, 'frequency', log_transforms, apply_log
668
+ )
669
+ final_score = np.log10(score) if should_log_transform and score > 0 else score
670
+ else:
671
+ final_score = None
672
+
673
+ token_detail[index_name] = final_score
674
+ token_detail[f"{index_name}_match_method"] = "legacy_spacy"
675
+ else: # lemma analysis
676
+ score = self._lookup_japanese_score(token, index_name, 'lemma', fallback=True)
677
+
678
+ # Apply log transformation if needed before storing
679
+ if score is not None:
680
+ should_log_transform = self._should_apply_log_transform(
681
+ index_name, analysis_type, 'frequency', log_transforms, apply_log
682
+ )
683
+ final_score = np.log10(score) if should_log_transform and score > 0 else score
684
+ else:
685
+ final_score = None
686
+
687
+ token_detail[index_name] = final_score
688
+ token_detail[f"{index_name}_match_method"] = "legacy_spacy"
689
  else:
690
  # Standard lookup for non-Japanese data
691
+ if analysis_type == 'token':
692
+ score = self._lookup_score(token.text, index_name, 'token')
693
+ else: # lemma analysis
694
+ score = self._lookup_score(token.lemma_, index_name, 'lemma')
695
 
696
+ # Apply log transformation if needed before storing
697
+ if score is not None:
698
+ should_log_transform = self._should_apply_log_transform(
699
+ index_name, analysis_type, 'frequency', log_transforms, apply_log
700
+ )
701
+ final_score = np.log10(score) if should_log_transform and score > 0 else score
702
+ else:
703
+ final_score = None
704
+
705
+ # Store score with clean column name and transformed value
706
+ token_detail[index_name] = final_score
 
707
 
708
+ # Collect for summary statistics (score is already transformed if needed)
709
+ score = token_detail.get(index_name)
710
+ if score is not None:
711
+ all_scores[f"{index_name}_{word_type}"].append(score)
 
 
 
712
 
713
  results['token_details'].append(token_detail)
714
 
 
     if ref_data is None or not isinstance(ref_data, pd.DataFrame):
         continue
 
+    # Get columns config for proper measure naming from YAML config
+    # We need to access the original YAML configuration to get proper measure names
+    from web_app.config_manager import ConfigManager
+    config = ConfigManager.load_reference_config()
+    language_key = "english" if self.language == 'en' else "japanese"
 
+    # Find the config entry for this index
+    config_entry = None
+    for config_section in [f"{ngram_type}s"]:  # bigrams/trigrams sections
+        if config_section in config.get(language_key, {}):
+            if index_name in config[language_key][config_section]:
+                config_entry = config[language_key][config_section][index_name]
+                break
+
+    if config_entry and 'columns' in config_entry:
+        # Get columns config for this n-gram type
+        columns_config = config_entry.get('columns', {})
+
+        # Create mapping from column index to measure name
+        measure_mapping = {}
+        for measure_name, col_idx in columns_config.items():
+            if isinstance(col_idx, int) and col_idx < len(ref_data.columns):
+                measure_mapping[ref_data.columns[col_idx]] = measure_name
+
+        # Use the measure mapping to get proper names
+        for col_name, measure_name in measure_mapping.items():
+            if col_name == ref_data.columns[0]:  # Skip the n-gram text column
+                continue
+
+            # Check if this measure should be computed
+            if not self._should_compute_measure(index_name, measure_name, selected_measures):
+                continue
+
+            score = self._lookup_score(ngram, index_name, ngram_type, col_name)
+            if score is not None:
+                # Check if this measure should be log-transformed
+                should_log_transform = self._should_apply_log_transform(
+                    index_name, ngram_type, measure_name, log_transforms, apply_log
+                )
+                score_val = np.log10(score) if should_log_transform and score > 0 else score
+                ngram_detail[f"{index_name}_{measure_name}"] = score_val
+            else:
+                ngram_detail[f"{index_name}_{measure_name}"] = None
+    else:
+        # Fallback to old logic
+        available_measures = ref_data.columns[1:].tolist()
+
+        # Filter measures based on selection
+        for measure in available_measures:
+            # Check if this measure should be computed
+            if not self._should_compute_measure(index_name, measure, selected_measures):
+                continue
+
+            score = self._lookup_score(ngram, index_name, ngram_type, measure)
+            if score is not None:
+                # Check if this measure should be log-transformed
+                should_log_transform = self._should_apply_log_transform(
+                    index_name, ngram_type, measure, log_transforms, apply_log
+                )
+                score_val = np.log10(score) if should_log_transform and score > 0 else score
+                ngram_detail[f"{index_name}_{measure}"] = score_val
+            else:
+                ngram_detail[f"{index_name}_{measure}"] = None
 
     results[ngram_details_key].append(ngram_detail)
 
     if ref_data is None or not isinstance(ref_data, pd.DataFrame):
         continue
 
+    # Get columns config for proper measure naming from YAML config
+    # We need to access the original YAML configuration to get proper measure names
+    from web_app.config_manager import ConfigManager
+    config = ConfigManager.load_reference_config()
+    language_key = "english" if self.language == 'en' else "japanese"
 
+    # Find the config entry for this index
+    config_entry = None
+    for config_section in [f"{ngram_type}s"]:  # bigrams/trigrams sections
+        if config_section in config.get(language_key, {}):
+            if index_name in config[language_key][config_section]:
+                config_entry = config[language_key][config_section][index_name]
+                break
+
+    if config_entry and 'columns' in config_entry:
+        # Get columns config for this n-gram type
+        columns_config = config_entry.get('columns', {})
+
+        # Create mapping from column index to measure name
+        measure_mapping = {}
+        for measure_name, col_idx in columns_config.items():
+            if isinstance(col_idx, int) and col_idx < len(ref_data.columns):
+                measure_mapping[ref_data.columns[col_idx]] = measure_name
+
+        # Use the measure mapping to get proper names
+        for col_name, measure_name in measure_mapping.items():
+            if col_name == ref_data.columns[0]:  # Skip the n-gram text column
+                continue
+
+            # Check if this measure should be computed
+            if not self._should_compute_measure(index_name, measure_name, selected_measures):
+                continue
+
+            ngram_scores = []
+            for ngram in ngrams:
+                score = self._lookup_score(ngram, index_name, ngram_type, col_name)
+                if score is not None:
+                    # Check if this measure should be log-transformed
+                    should_log_transform = self._should_apply_log_transform(
+                        index_name, ngram_type, measure_name, log_transforms, apply_log
+                    )
+                    score_val = np.log10(score) if should_log_transform and score > 0 else score
+                    ngram_scores.append(score_val)
 
+            if ngram_scores:
+                key = f"{index_name}_{ngram_type}_{measure_name}"
+                results['summary'][key] = {
+                    'mean': np.mean(ngram_scores),
+                    'std': np.std(ngram_scores),
+                    'count': len(ngram_scores),
+                    'min': np.min(ngram_scores),
+                    'max': np.max(ngram_scores)
+                }
+                # Store raw scores for plotting
+                results['raw_scores'][key] = ngram_scores
+    else:
+        # Fallback to old logic if config not properly structured
+        available_measures = ref_data.columns[1:].tolist()
 
+        # Filter measures based on selection and compute summary statistics
+        for measure in available_measures:
+            # Check if this measure should be computed
+            if not self._should_compute_measure(index_name, measure, selected_measures):
+                continue
+
+            ngram_scores = []
+            for ngram in ngrams:
+                score = self._lookup_score(ngram, index_name, ngram_type, measure)
+                if score is not None:
+                    # Check if this measure should be log-transformed
+                    should_log_transform = self._should_apply_log_transform(
+                        index_name, ngram_type, measure, log_transforms, apply_log
+                    )
+                    score_val = np.log10(score) if should_log_transform and score > 0 else score
+                    ngram_scores.append(score_val)
+
+            if ngram_scores:
+                key = f"{index_name}_{ngram_type}_{measure}"
+                results['summary'][key] = {
+                    'mean': np.mean(ngram_scores),
+                    'std': np.std(ngram_scores),
+                    'count': len(ngram_scores),
+                    'min': np.min(ngram_scores),
+                    'max': np.max(ngram_scores)
+                }
+                # Store raw scores for plotting
+                results['raw_scores'][key] = ngram_scores
 
     return results
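Note: the mapping step above inverts a YAML `columns` entry of the form {measure_name: column_index} into {dataframe_column_name: measure_name}, so reference columns can be reported under their configured measure names. A self-contained sketch with hypothetical column names:

    import pandas as pd

    # Hypothetical reference data and YAML-style columns config
    ref_data = pd.DataFrame(columns=['bigram', 'freq', 'norm_freq', 'docs'])
    columns_config = {'frequency': 1, 'normalized_freq': 2, 'documents': 3}

    measure_mapping = {
        ref_data.columns[col_idx]: measure_name
        for measure_name, col_idx in columns_config.items()
        if isinstance(col_idx, int) and col_idx < len(ref_data.columns)
    }
    # {'freq': 'frequency', 'norm_freq': 'normalized_freq', 'docs': 'documents'}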
web_app/app.py CHANGED
@@ -91,7 +91,7 @@ def render_sidebar():
 
 def render_lexical_sophistication_interface():
     """Render lexical sophistication analysis interface."""
-    st.header("🔍 Lexical Sophistication Analysis")
+    st.header("🔍 Emulation of the Tool for Automatic Analysis of Lexical Sophistication (emuTAALES)")
 
     # Get analyzer
     analyzer = AnalysisHandlers.get_analyzer()
web_app/components/ui_components.py CHANGED
@@ -298,7 +298,7 @@ class UIComponents:
 
     @staticmethod
     def render_enhanced_reference_selection(config: Dict[str, Any], reference_lists: Dict[str, Any]) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
-        """Render the enhanced reference list selection interface with hierarchical display."""
+        """Render the advanced reference list selection interface with hierarchical grouping and individual measure control."""
         from web_app.defaults_manager import DefaultsManager
 
         # Initialize return values
@@ -309,34 +309,160 @@
             st.info("No reference lists selected. Please configure reference lists first.")
             return selected_measures, log_transforms
 
-        # Simple hierarchical display showing selected lists with smart defaults info
-        for list_name in reference_lists.keys():
-            # Show smart defaults indicator
-            entry_config = UIComponents._find_entry_config(list_name, config)
-            if entry_config and entry_config.get('default_measures'):
-                defaults_info = f"📊 {len(entry_config['default_measures'])} measures selected"
-                log_info = f"🔄 {len(entry_config.get('default_log_transforms', []))} log-transformed"
-
-                # Determine analysis type badges
-                analysis_badges = []
-                if entry_config.get('analysis_type') == 'token' or not entry_config.get('analysis_type'):
-                    analysis_badges.append("[Token ✓]")
-                if entry_config.get('analysis_type') == 'lemma' or not entry_config.get('analysis_type'):
-                    analysis_badges.append("[Lemma ✓]")
-
-                analysis_info = " ".join(analysis_badges) if analysis_badges else ""
-
-                st.write(f"├─ **{list_name}** {analysis_info} [ℹ️ Smart defaults]")
-                st.write(f"   {defaults_info}, {log_info}")
-
-                # Apply smart defaults to return values
-                selected_measures[list_name] = entry_config.get('default_measures', [])
-                log_transforms[list_name] = entry_config.get('default_log_transforms', [])
-            else:
-                st.write(f"├─ **{list_name}** [Legacy configuration]")
+        # Group reference lists by base name for hierarchical display
+        groups = UIComponents._group_reference_lists(reference_lists, config)
+
+        st.write("**Reference Lists & Measures:**")
+
+        # Render each group with hierarchical interface
+        for base_name, group_data in groups.items():
+            # Group-level enable/disable checkbox
+            group_key = f"group_enabled_{base_name}"
+            group_enabled = st.checkbox(
+                f"☑️ **{base_name}**",
+                value=True,  # Default enabled
+                key=group_key,
+                help=f"Enable/disable all {base_name} analyses"
+            )
+
+            if group_enabled:
+                # Analysis type badges display
+                badges = []
+                if group_data['token']:
+                    badges.append("[Token ✓]")
+                if group_data['lemma']:
+                    badges.append("[Lemma ✓]")
+
+                if badges:
+                    st.write(f"   {' '.join(badges)}")
+
+                # Expandable measure selection for each analysis type
+                if group_data['token']:
+                    with st.expander("📊 Token Measures ⬇️ (click to customize)", expanded=False):
+                        token_measures, token_logs = UIComponents._render_measure_selection(
+                            group_data['token'][0], 'token', base_name
+                        )
+                        # Always store the results, even if empty (to maintain structure)
+                        selected_measures[group_data['token'][0][0]] = token_measures
+                        log_transforms[group_data['token'][0][0]] = token_logs
 
+                if group_data['lemma']:
+                    with st.expander("📊 Lemma Measures ⬇️ (click to customize)", expanded=False):
+                        lemma_measures, lemma_logs = UIComponents._render_measure_selection(
+                            group_data['lemma'][0], 'lemma', base_name
+                        )
+                        # Always store the results, even if empty (to maintain structure)
+                        selected_measures[group_data['lemma'][0][0]] = lemma_measures
+                        log_transforms[group_data['lemma'][0][0]] = lemma_logs
 
+                # Show smart defaults summary
+                token_entry_name = group_data['token'][0][0] if group_data['token'] else None
+                lemma_entry_name = group_data['lemma'][0][0] if group_data['lemma'] else None
 
+                total_measures = 0
+                total_logs = 0
 
+                if token_entry_name:
+                    total_measures += len(selected_measures.get(token_entry_name, []))
+                    total_logs += len(log_transforms.get(token_entry_name, []))
+
+                if lemma_entry_name:
+                    total_measures += len(selected_measures.get(lemma_entry_name, []))
+                    total_logs += len(log_transforms.get(lemma_entry_name, []))
+
+                st.write(f"   📊 {total_measures} measures selected, 🔄 {total_logs} log-transformed")
+                st.write("")  # Add spacing
 
         return selected_measures, log_transforms
 
+    @staticmethod
+    def _group_reference_lists(reference_lists: Dict[str, Any], config: Dict[str, Any]) -> Dict[str, Dict[str, List]]:
+        """Group related reference lists for hierarchical display."""
+        from collections import defaultdict
+
+        groups = defaultdict(lambda: {'token': [], 'lemma': []})
+
+        for entry_name in reference_lists.keys():
+            # Extract base name (remove _token/_lemma suffix)
+            base_name = entry_name.replace('_token', '').replace('_lemma', '')
+
+            # Get analysis type from config
+            entry_config = UIComponents._find_entry_config(entry_name, config)
+            if entry_config:
+                analysis_type = entry_config.get('analysis_type', 'token')
+                groups[base_name][analysis_type].append((entry_name, entry_config))
+
+        return groups
+
+    @staticmethod
+    def _render_measure_selection(entry_data: Tuple[str, Dict], analysis_type: str, base_name: str) -> Tuple[List[str], List[str]]:
+        """Render individual measure checkboxes with log transform controls."""
+        entry_name, entry_config = entry_data
+
+        # Get measure information from config
+        selectable_measures = entry_config.get('selectable_measures', [])
+        log_transformable = entry_config.get('log_transformable', [])
+        default_measures = entry_config.get('default_measures', [])
+        default_log_transforms = entry_config.get('default_log_transforms', [])
+
+        # Initialize session state for this entry if not exists
+        if f'custom_measures_{entry_name}' not in st.session_state:
+            st.session_state[f'custom_measures_{entry_name}'] = default_measures.copy()
+        if f'custom_logs_{entry_name}' not in st.session_state:
+            st.session_state[f'custom_logs_{entry_name}'] = default_log_transforms.copy()
+
+        # Display measure selection interface
+        st.write(f"**Available Measures for {entry_config.get('display_name', entry_name)}:**")
+
+        selected_measures = []
+        selected_logs = []
+
+        for measure in selectable_measures:
+            col1, col2 = st.columns([3, 1])
+
+            with col1:
+                # Measure checkbox (pre-selected based on defaults)
+                measure_key = f"measure_{entry_name}_{measure}"
+                selected = st.checkbox(
+                    f"☑️ {measure.replace('_', ' ').title()}",
+                    value=measure in st.session_state[f'custom_measures_{entry_name}'],
+                    key=measure_key,
+                    help=f"Include {measure} in analysis"
+                )
+
+                if selected:
+                    selected_measures.append(measure)
+
+            with col2:
+                # Log transform toggle (disabled if not transformable)
+                if measure in log_transformable and selected:
+                    log_key = f"log_{entry_name}_{measure}"
+                    log_enabled = st.checkbox(
+                        "🔄 log₁₀",
+                        value=measure in st.session_state[f'custom_logs_{entry_name}'],
+                        key=log_key,
+                        help=f"Apply log₁₀ transformation to {measure}"
+                    )
+
+                    if log_enabled:
+                        selected_logs.append(measure)
+                elif measure in log_transformable:
+                    st.write("🔄 (disabled)")
+                else:
+                    st.write("❌ (not transformable)")
+
+        # Update session state
+        st.session_state[f'custom_measures_{entry_name}'] = selected_measures
+        st.session_state[f'custom_logs_{entry_name}'] = selected_logs
+
+        # Show selection summary
+        if selected_measures:
+            st.success(f"✅ {len(selected_measures)} measures selected, {len(selected_logs)} log-transformed")
+        else:
+            st.warning("⚠️ No measures selected for this analysis type")
+
+        return selected_measures, selected_logs
+
     @staticmethod
     def group_has_smart_defaults(group_entries: List[str], config: Dict[str, Any]) -> bool:
         """Check if a group has smart defaults configured."""
web_app/config_manager.py CHANGED
@@ -8,6 +8,7 @@ import pandas as pd
 from pathlib import Path
 from typing import Dict, List, Any, Optional, Tuple
 import yaml
+import csv
 
 from web_app.session_manager import SessionManager
 from web_app.utils import MemoryFileHandler
@@ -70,7 +71,8 @@ class ConfigManager:
         content_io = StringIO(text_content)
 
         # Load preview
-        df_preview = pd.read_csv(content_io, delimiter=delimiter, header=0, nrows=5)
+        df_preview = pd.read_csv(content_io, delimiter=delimiter, header=0, nrows=5,
+                                 quoting=csv.QUOTE_MINIMAL, quotechar='"')
 
         # Store content in session state instead of file path
         if 'uploaded_files_content' not in st.session_state:
@@ -209,9 +211,11 @@ class ConfigManager:
 
             # Load file
             if list_config.get('has_header', False):
-                df = pd.read_csv(file_path, delimiter=delimiter, header=0)
+                df = pd.read_csv(file_path, delimiter=delimiter, header=0,
+                                 quoting=csv.QUOTE_MINIMAL, quotechar='"')
             else:
-                df = pd.read_csv(file_path, delimiter=delimiter, header=None)
+                df = pd.read_csv(file_path, delimiter=delimiter, header=None,
+                                 quoting=csv.QUOTE_MINIMAL, quotechar='"')
 
             # Get column mapping
             columns = list_config.get('columns', {})
@@ -247,6 +251,7 @@ class ConfigManager:
             elif is_trigram:
                 data['trigram'] = df
             else:
+                # For standard unigram files that aren't bigrams or trigrams
                 data[file_type] = df
 
         except Exception as e:
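Note: the recurring `quoting=csv.QUOTE_MINIMAL, quotechar='"'` addition makes the quoting behavior of `pd.read_csv` explicit, which matters for n-gram reference files whose entries can contain the delimiter itself. A self-contained illustration (the data is made up):

    import csv
    from io import StringIO
    import pandas as pd

    # A reference row whose n-gram field contains the delimiter
    content = 'ngram,frequency\n"well, you know",42\n'
    df = pd.read_csv(StringIO(content), delimiter=',', header=0,
                     quoting=csv.QUOTE_MINIMAL, quotechar='"')
    print(df.iloc[0]['ngram'])  # well, you know  (parsed as one field, not two)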
web_app/handlers/analysis_handlers.py CHANGED
@@ -96,28 +96,47 @@ class AnalysisHandlers:
         analyzer.load_reference_lists(reference_lists)
 
         # Get analysis configuration
-        if use_smart_defaults:
-            # Use smart defaults from configuration
-            from web_app.defaults_manager import DefaultsManager
-            from web_app.config_manager import ConfigManager
-
-            config = ConfigManager.load_reference_config()
-            selected_measures, log_transforms = DefaultsManager.get_default_analysis_config(
-                list(reference_lists.keys()), config
-            )
-
-            # Perform enhanced analysis with smart defaults
-            results = analyzer.analyze_text(
-                text_content,
-                list(reference_lists.keys()),
-                apply_log=False,  # Superseded by log_transforms
-                word_type_filter=word_type_filter,
-                log_transforms=log_transforms,
-                selected_measures=selected_measures
-            )
-
-            st.success("✨ Analysis completed using Smart Defaults!")
-            st.info(f"📊 Applied selective log transforms to {sum(len(measures) for measures in log_transforms.values())} measures")
+        if use_smart_defaults and not legacy_log_transform:
+            # Use custom selections from the enhanced UI
+            if selected_measures and any(selected_measures.values()):
+                # User has made custom selections
+                results = analyzer.analyze_text(
+                    text_content,
+                    list(reference_lists.keys()),
+                    apply_log=False,  # Superseded by log_transforms
+                    word_type_filter=word_type_filter,
+                    log_transforms=log_transforms,
+                    selected_measures=selected_measures
+                )
+
+                # Calculate totals for user feedback
+                total_measures = sum(len(measures) for measures in selected_measures.values())
+                total_logs = sum(len(logs) for logs in log_transforms.values())
+
+                st.success("✨ Analysis completed using your custom selections!")
+                st.info(f"📊 Analyzed {total_measures} measures, {total_logs} log-transformed")
+            else:
+                # Fallback to smart defaults if no custom selections
+                from web_app.defaults_manager import DefaultsManager
+                from web_app.config_manager import ConfigManager
+
+                config = ConfigManager.load_reference_config()
+                default_measures, default_logs = DefaultsManager.get_default_analysis_config(
+                    list(reference_lists.keys()), config
+                )
+
+                results = analyzer.analyze_text(
+                    text_content,
+                    list(reference_lists.keys()),
+                    apply_log=False,
+                    word_type_filter=word_type_filter,
+                    log_transforms=default_logs,
+                    selected_measures=default_measures
+                )
+
+                total_logs = sum(len(logs) for logs in default_logs.values())
+                st.success("✨ Analysis completed using Smart Defaults!")
+                st.info(f"📊 Applied selective log transforms to {total_logs} measures")
 
         else:
             # Legacy mode - use global log transformation
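Note: the branch above prefers explicit user selections and only falls back to configured defaults when every selection list is empty; the `any(selected_measures.values())` test is what distinguishes the two paths. A reduced sketch with illustrative values:

    def pick_mode(selected_measures):
        if selected_measures and any(selected_measures.values()):
            return 'custom selections'
        return 'smart defaults'

    pick_mode({'COCA_spoken_bigram_frequency_token': ['frequency']})  # 'custom selections'
    pick_mode({'COCA_spoken_bigram_frequency_token': []})             # 'smart defaults'
    pick_mode({})                                                     # 'smart defaults'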
@@ -402,22 +421,183 @@
 
     @staticmethod
     def create_density_plots(results: Dict[str, Any]):
-        """Create density plots for score distributions."""
+        """Create density plots for score distributions with mean line and example words."""
         if 'raw_scores' not in results:
             return
 
         for key, scores in results['raw_scores'].items():
             if len(scores) > 1:  # Need at least 2 points for density
-                # Create histogram with density curve
-                fig = go.Figure()
-
-                # Add histogram
-                fig.add_trace(go.Histogram(
-                    x=scores,
-                    nbinsx=min(30, len(scores)),
-                    name='Histogram',
-                    opacity=0.7,
-                    histnorm='probability density'
-                ))
+                # Create word-to-score mapping for this measure
+                word_score_map = {}
+
+                # Determine if this is a bigram, trigram, or token-based measure
+                if '_bigram_' in key:
+                    # Handle bigram measures
+                    if 'bigram_details' in results and results['bigram_details']:
+                        # Extract the correct column name from the key
+                        # Raw scores key: 'COCA_spoken_bigram_frequency_token_bigram_frequency'
+                        # Actual column:  'COCA_spoken_bigram_frequency_token_frequency'
+                        # Remove the last occurrence of '_bigram' from the key
+                        idx = key.rfind('_bigram')
+                        if idx != -1:
+                            index_measure_col = key[:idx] + key[idx+7:]  # 7 = len('_bigram')
+                        else:
+                            index_measure_col = key
+
+                        # Build mapping from bigram details
+                        for bigram_detail in results['bigram_details']:
+                            if index_measure_col in bigram_detail and bigram_detail[index_measure_col] is not None:
+                                bigram_text = bigram_detail.get('bigram', '')
+                                word_score_map[bigram_text] = bigram_detail[index_measure_col]
+
+                elif '_trigram_' in key:
+                    # Handle trigram measures
+                    if 'trigram_details' in results and results['trigram_details']:
+                        # Extract the correct column name from the key
+                        # Raw scores key: 'COCA_trigram_frequency_token_trigram_frequency'
+                        # Actual column:  'COCA_trigram_frequency_token_frequency'
+                        # Remove the last occurrence of '_trigram' from the key
+                        idx = key.rfind('_trigram')
+                        if idx != -1:
+                            index_measure_col = key[:idx] + key[idx+8:]  # 8 = len('_trigram')
+                        else:
+                            index_measure_col = key
+
+                        # Build mapping from trigram details
+                        for trigram_detail in results['trigram_details']:
+                            if index_measure_col in trigram_detail and trigram_detail[index_measure_col] is not None:
+                                trigram_text = trigram_detail.get('trigram', '')
+                                word_score_map[trigram_text] = trigram_detail[index_measure_col]
+
+                else:
+                    # Handle token-based measures (existing logic)
+                    if 'token_details' in results:
+                        # Handle key mismatch between raw_scores and token_details:
+                        # raw_scores keys may have suffixes like '_CW', '_FW', etc.
+                        # while token_details uses the base column names
+
+                        # Try to find matching column in token_details
+                        matching_column = None
+
+                        # First, try exact match
+                        if any(key in token for token in results['token_details']):
+                            matching_column = key
+                        else:
+                            # Try removing word type suffixes (_CW, _FW)
+                            base_key = key
+                            for suffix in ['_CW', '_FW']:
+                                if key.endswith(suffix):
+                                    base_key = key[:-len(suffix)]
+                                    break
+
+                            # Check if base key exists in token_details
+                            if any(base_key in token for token in results['token_details']):
+                                matching_column = base_key
+                            else:
+                                # Try finding partial matches for complex keys
+                                for token in results['token_details']:
+                                    for col_name in token.keys():
+                                        if col_name != 'id' and col_name != 'token' and col_name != 'lemma' and col_name != 'pos' and col_name != 'tag' and col_name != 'word_type':
+                                            # Check if this column name is part of our key
+                                            if col_name in key or key.startswith(col_name):
+                                                matching_column = col_name
+                                                break
+                                    if matching_column:
+                                        break
+
+                        # Build word-to-score mapping using the matching column
+                        if matching_column:
+                            for token in results['token_details']:
+                                if matching_column in token and token[matching_column] is not None:
+                                    word_score_map[token['token']] = token[matching_column]
+
+                # Calculate number of bins
+                nbins = min(30, len(scores))
+
+                # Create figure and add histogram first to let Plotly calculate optimal bins
+                fig = go.Figure()
+
+                # Add histogram to get Plotly's binning
+                histogram_trace = go.Histogram(
+                    x=scores,
+                    nbinsx=nbins,
+                    name='Histogram',
+                    opacity=0.7,
+                    histnorm='probability density'
+                )
+                fig.add_trace(histogram_trace)
+
+                # Extract Plotly's actual bin edges by creating a temporary figure to get the data
+                temp_fig = go.Figure()
+                temp_fig.add_trace(go.Histogram(x=scores, nbinsx=nbins))
+
+                # Calculate histogram using the same parameters as Plotly would use
+                # Plotly calculates bins similar to numpy's auto method
+                # We'll use numpy but ensure we get similar bin edges
+                hist_data, plotly_bin_edges = np.histogram(scores, bins=nbins)
+
+                # For better alignment with Plotly, we can also try to match Plotly's exact binning
+                # by using the range and number of bins
+                score_min, score_max = min(scores), max(scores)
+                # Add small padding as Plotly does
+                score_range = score_max - score_min
+                padding = score_range * 0.02  # Small padding like Plotly
+                adjusted_min = score_min - padding
+                adjusted_max = score_max + padding
+
+                # Create bins with the adjusted range
+                plotly_bin_edges = np.linspace(adjusted_min, adjusted_max, nbins + 1)
+                hist_data, _ = np.histogram(scores, bins=plotly_bin_edges)
+
+                # Assign words to bins using Plotly-aligned bin edges
+                bin_examples = {}
+                if word_score_map:
+                    import random
+                    for word, score in word_score_map.items():
+                        bin_idx = np.digitize(score, plotly_bin_edges) - 1
+                        bin_idx = max(0, min(bin_idx, len(plotly_bin_edges) - 2))  # Clamp to valid range
+
+                        if bin_idx not in bin_examples:
+                            bin_examples[bin_idx] = []
+                        bin_examples[bin_idx].append(word)
+
+                    # Randomly sample up to 3 words per bin
+                    for bin_idx in bin_examples:
+                        if len(bin_examples[bin_idx]) > 3:
+                            bin_examples[bin_idx] = random.sample(bin_examples[bin_idx], 3)
+
+                # Create hover text for each bin using Plotly's bins
+                hover_texts = []
+                for i in range(len(hist_data)):
+                    bin_start = plotly_bin_edges[i]
+                    bin_end = plotly_bin_edges[i + 1]
+                    examples = bin_examples.get(i, [])
+
+                    hover_text = f"Range: {bin_start:.3f} - {bin_end:.3f}<br>"
+                    hover_text += f"Count: {hist_data[i]}<br>"
+                    if examples:
+                        hover_text += f"Examples: {', '.join(examples)}"
+                    else:
+                        hover_text += "Examples: none"
+
+                    hover_texts.append(hover_text)
+
+                # Clear the figure and rebuild with custom hover text
+                fig = go.Figure()
+
+                # Add histogram with custom hover text using the calculated bin edges
+                fig.add_trace(go.Histogram(
+                    x=scores,
+                    xbins=dict(
+                        start=plotly_bin_edges[0],
+                        end=plotly_bin_edges[-1],
+                        size=(plotly_bin_edges[-1] - plotly_bin_edges[0]) / nbins
+                    ),
+                    name='Histogram',
+                    opacity=0.7,
+                    histnorm='probability density',
+                    hovertemplate='%{customdata}<extra></extra>',
+                    customdata=hover_texts
+                ))
 
                 # Calculate and add KDE curve
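Note: the padded-edge binning above tries to mirror Plotly's layout so that hover examples land in the right bars; the key steps are `np.linspace` for evenly spaced edges over a slightly padded range and `np.digitize` (with clamping) to assign each score to a bin. A runnable sketch with toy values:

    import numpy as np

    scores = [1.2, 2.8, 3.1, 4.9]
    nbins = 3

    score_min, score_max = min(scores), max(scores)
    padding = (score_max - score_min) * 0.02
    edges = np.linspace(score_min - padding, score_max + padding, nbins + 1)

    for s in scores:
        bin_idx = np.digitize(s, edges) - 1
        bin_idx = max(0, min(bin_idx, len(edges) - 2))  # clamp to a valid bin
        print(f"{s} -> bin {bin_idx}")
    # 1.2 -> bin 0, 2.8 -> bin 1, 3.1 -> bin 1, 4.9 -> bin 2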
 
@@ -433,6 +613,17 @@
                         line=dict(color='red', width=2)
                     ))
 
+                # Add mean line
+                mean_score = np.mean(scores)
+                fig.add_vline(
+                    x=mean_score,
+                    line_dash="dash",
+                    line_color="green",
+                    line_width=2,
+                    annotation_text=f"Mean: {mean_score:.3f}",
+                    annotation_position="top"
+                )
+
                 # Update layout
                 fig.update_layout(
                     title=f"Distribution of {key}",
@@ -447,79 +638,6 @@
 
     @staticmethod
     def render_enhanced_analysis_options():
-        """Render the enhanced analysis interface with smart defaults and hierarchical display."""
-        from web_app.defaults_manager import DefaultsManager
-        from web_app.config_manager import ConfigManager
-        from web_app.session_manager import SessionManager
-
-        st.subheader("🔧 Analysis Configuration")
-
-        # Get current configuration
-        config = ConfigManager.load_reference_config()
-        reference_lists = SessionManager.get_reference_lists()
-
-        # Enhanced Reference Lists & Measures Section
-        st.write("### 📋 Reference Lists & Measures")
-
-        # Simple hierarchical display for now (basic implementation)
-        if reference_lists:
-            st.write("**Selected Reference Lists:**")
-            for list_name in reference_lists.keys():
-                # Show smart defaults indicator
-                entry_config = UIComponents._find_entry_config(list_name, config)
-                if entry_config and entry_config.get('default_measures'):
-                    defaults_info = f"📊 {len(entry_config['default_measures'])} measures selected"
-                    log_info = f"🔄 {len(entry_config.get('default_log_transforms', []))} log-transformed"
-                    st.write(f"├─ **{list_name}** [Token ✓] [Lemma ✓] [ℹ️ Smart defaults]")
-                    st.write(f"   {defaults_info}, {log_info}")
-                else:
-                    st.write(f"├─ **{list_name}** [Legacy configuration]")
-        else:
-            st.info("No reference lists selected. Please configure reference lists first.")
-
-        # Global Analysis Options
-        st.write("### 🎯 Analysis Types")
-        col1, col2 = st.columns(2)
-
-        with col1:
-            token_analysis = st.checkbox("☑️ Token-based", value=True, key="token_analysis_enabled")
-        with col2:
-            lemma_analysis = st.checkbox("☑️ Lemma-based", value=True, key="lemma_analysis_enabled")
-
-        # Global Options
-        st.write("### ⚙️ Global Options")
-        word_type_filter = st.selectbox(
-            "Word Type Filter:",
-            options=[None, 'CW', 'FW'],
-            format_func=lambda x: 'All Words ▼' if x is None else ('Content Words' if x == 'CW' else 'Function Words'),
-            key="word_type_filter"
-        )
-
-        # Advanced Configuration Section
-        with st.expander("🎯 Advanced Configuration (Optional)", expanded=False):
-            st.info("ℹ️ **Smart Defaults Active**: The system automatically applies appropriate settings. "
-                    "Expand this section only if you need custom control.")
-
-            # Legacy log transformation toggle
-            legacy_log_toggle = st.checkbox(
-                "Apply log₁₀ transformation to ALL measures (Legacy Mode)",
-                value=False,
-                help="⚠️ Not recommended: This applies log transformation to all measures, "
-                     "including those where it's scientifically inappropriate (e.g., concreteness ratings).",
-                key="legacy_log_transform"
-            )
-
-            if legacy_log_toggle:
-                st.warning("⚠️ Legacy mode enabled: Log transformation will be applied to ALL numerical measures. "
-                           "This may produce scientifically invalid results for psycholinguistic measures.")
-
-        # Return enhanced configuration
-        return {
-            'token_analysis': token_analysis,
-            'lemma_analysis': lemma_analysis,
-            'word_type_filter': word_type_filter,
-            'use_smart_defaults': not st.session_state.get('legacy_log_transform', False),
-            'legacy_log_transform': st.session_state.get('legacy_log_transform', False),
-            'selected_measures': {},  # Will be filled by smart defaults
-            'log_transforms': {}  # Will be filled by smart defaults
-        }
+        """Render the enhanced analysis interface with advanced measure selection capabilities."""
+        # Use the new enhanced UI from UIComponents
+        return UIComponents.render_analysis_options()
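Note: the mean marker added in the density-plot hunk uses Plotly's `Figure.add_vline`, which draws a vertical reference line with an optional annotation in one call. A minimal usage sketch with toy data:

    import numpy as np
    import plotly.graph_objects as go

    scores = [1.0, 2.0, 4.0]
    fig = go.Figure(go.Histogram(x=scores, histnorm='probability density'))
    mean_score = float(np.mean(scores))
    fig.add_vline(x=mean_score, line_dash="dash", line_color="green", line_width=2,
                  annotation_text=f"Mean: {mean_score:.3f}", annotation_position="top")
    # fig.show()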
web_app/handlers/frequency_handlers.py CHANGED
@@ -18,6 +18,7 @@ import sys
 import os
 from pathlib import Path
 from io import StringIO, BytesIO
+import csv
 
 # Add parent directory to path for imports
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
@@ -111,7 +112,8 @@ class FrequencyHandlers:
         df_preview = pd.read_csv(StringIO(content),
                                  sep=st.session_state.format_info['separator'],
                                  header=0 if st.session_state.format_info['has_header'] else None,
-                                 nrows=100)
+                                 nrows=100,
+                                 quoting=csv.QUOTE_MINIMAL, quotechar='"')
 
         # Detect available columns
         st.session_state.detected_cols = st.session_state.analyzer.detect_columns(df_preview)
web_app/utils/memory_file_handler.py CHANGED
@@ -10,6 +10,7 @@ from io import BytesIO, StringIO
 from typing import Optional, Union, Dict, Any
 import pandas as pd
 import zipfile
+import csv
 
 
 class MemoryFileHandler:
@@ -76,7 +77,8 @@ class MemoryFileHandler:
             delimiter = ','
 
         # Read directly into DataFrame
-        df = pd.read_csv(uploaded_file, delimiter=delimiter, encoding='utf-8')
+        df = pd.read_csv(uploaded_file, delimiter=delimiter, encoding='utf-8',
+                         quoting=csv.QUOTE_MINIMAL, quotechar='"')
         return df
 
     except Exception as e:
@@ -167,4 +169,4 @@ class MemoryFileHandler:
         for key in keys_to_remove:
             del st.session_state[key]
     else:
-        st.session_state.clear()
+        st.session_state.clear()