#!/usr/bin/env python3
"""
Test script to verify that log transformations are consistently applied
to both the token details table and the summary statistics.
"""

import sys
from pathlib import Path

import numpy as np

# Add the project root to the import path so the package resolves when
# this script is run directly.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))


def test_log_transform_consistency():
    """Test that log transformations are applied consistently."""
    print("Testing Log Transform Consistency...")
    try:
        from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer

        analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')

        # Mock reference list with known raw frequencies; after a log10
        # transform these should become ~3.0 and ~3.3.
        mock_reference_lists = {
            'COCA_spoken_frequency_token': {
                'token': {'hello': 1000, 'world': 2000}
            }
        }
        analyzer.load_reference_lists(mock_reference_lists)

        # Analyze text with the log transformation enabled for this index.
        text = "Hello world"
        selected_indices = ['COCA_spoken_frequency_token']
        log_transforms = {'COCA_spoken_frequency_token': ['frequency']}
        results = analyzer.analyze_text(
            text, selected_indices, log_transforms=log_transforms
        )

        token_details = results['token_details']
        summary_stats = results['summary']
        print(f"Token details: {len(token_details)} tokens")
        print(f"Summary keys: {list(summary_stats.keys())}")

        if not token_details:
            print("❌ No token details found")
            return False

        # Collect the per-token scores from the token details table.
        token_scores = []
        for token_detail in token_details:
            score = token_detail.get('COCA_spoken_frequency_token')
            if score is not None:
                token_scores.append(score)
                print(f"Token '{token_detail['token']}': score = {score}")

        if not token_scores:
            print("❌ No token scores found")
            return False

        # The content-word summary statistic should equal the mean of the
        # per-token scores.
        token_mean = np.mean(token_scores)
        summary_key = 'COCA_spoken_frequency_token_CW'  # Content words
        if summary_key not in summary_stats:
            print(f"❌ Summary key '{summary_key}' not found in summary stats")
            return False

        summary_mean = summary_stats[summary_key]['mean']
        print(f"Token details mean: {token_mean}")
        print(f"Summary stats mean: {summary_mean}")

        # Compare approximately, allowing for floating-point precision.
        if abs(token_mean - summary_mean) >= 0.001:
            print(f"❌ Inconsistency found: token mean = {token_mean}, "
                  f"summary mean = {summary_mean}")
            return False
        print("✅ Token details and summary statistics are consistent!")

        # Check that the values were actually log-transformed: the raw
        # frequencies were 1000 and 2000, so log10 gives ~3.0 and ~3.3.
        if not all(2.5 < score < 3.5 for score in token_scores):
            print(f"⚠️ Values don't appear to be log-transformed: {token_scores}")
            return False
        print("✅ Values appear to be properly log-transformed!")
        return True

    except Exception as e:
        print(f"❌ Test failed with error: {e}")
        return False


def main():
    """Run the log transform consistency test."""
    print("🧪 Testing Log Transform Consistency\n")
    if test_log_transform_consistency():
        print("\n🎉 Log transformation consistency has been fixed!")
        return 0
    print("\n⚠️ Log transformation consistency issue still exists.")
    return 1


if __name__ == "__main__":
    sys.exit(main())