# NOTE: stray build-log lines ("Spaces:", "Building") removed from the file header.
#!/usr/bin/env python3
"""
Test script to verify that log transformations are consistently applied
to both token details table and summary statistics.
"""
import sys
import os
from pathlib import Path

import numpy as np

# Make the project root importable no matter which directory the script
# is launched from.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
def test_log_transform_consistency():
    """Test that log transformations are applied consistently.

    Runs the analyzer on a tiny mock reference list, then checks that
    (a) the per-token scores agree with the summary-statistics mean and
    (b) the scores fall in the range expected after a log10 transform.

    Returns:
        bool: True when all consistency checks pass, False otherwise
        (including any exception during analysis, which is reported
        rather than raised).
    """
    print("Testing Log Transform Consistency...")
    try:
        from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer

        # Create analyzer
        analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')

        # Mock reference list with known raw frequencies so the expected
        # log10 values (~3.0 and ~3.3) can be range-checked below.
        mock_reference_lists = {
            'COCA_spoken_frequency_token': {
                'token': {'hello': 1000, 'world': 2000}
            }
        }
        analyzer.load_reference_lists(mock_reference_lists)

        # Analyze text with log transformation enabled for this index.
        text = "Hello world"
        selected_indices = ['COCA_spoken_frequency_token']
        log_transforms = {
            'COCA_spoken_frequency_token': ['frequency']
        }
        results = analyzer.analyze_text(
            text,
            selected_indices,
            log_transforms=log_transforms
        )

        token_details = results['token_details']
        summary_stats = results['summary']
        print(f"Token details: {len(token_details)} tokens")
        print(f"Summary keys: {list(summary_stats.keys())}")

        # Guard clauses replace the original deeply nested if/else pyramid;
        # every message and return value is unchanged.
        if not token_details:
            print("❌ No token details found")
            return False

        # Collect the (possibly log-transformed) per-token scores.
        token_scores = []
        for token_detail in token_details:
            score = token_detail.get('COCA_spoken_frequency_token')
            if score is not None:
                token_scores.append(score)
                print(f"Token '{token_detail['token']}': score = {score}")

        if not token_scores:
            print("❌ No token scores found")
            return False

        # Mean computed directly from the token details table.
        token_mean = np.mean(token_scores)

        summary_key = 'COCA_spoken_frequency_token_CW'  # Content words
        if summary_key not in summary_stats:
            print(f"❌ Summary key '{summary_key}' not found in summary stats")
            return False

        summary_mean = summary_stats[summary_key]['mean']
        print(f"Token details mean: {token_mean}")
        print(f"Summary stats mean: {summary_mean}")

        # Approximately equal, allowing for floating point precision.
        if abs(token_mean - summary_mean) >= 0.001:
            print(f"❌ Inconsistency found: token mean = {token_mean}, summary mean = {summary_mean}")
            return False

        print("✅ Token details and summary statistics are consistent!")

        # Original values were 1000 and 2000; log10 would be ~3.0 and ~3.3.
        if not all(2.5 < score < 3.5 for score in token_scores):
            print(f"⚠️ Values don't appear to be log-transformed: {token_scores}")
            return False

        print("✅ Values appear to be log-transformed!")
        return True
    except Exception as e:
        # Broad catch is deliberate for a standalone diagnostic script:
        # any failure (including ImportError) should report, not crash.
        print(f"❌ Test failed with error: {e}")
        return False
def main():
    """Run the log transform consistency test.

    Returns:
        int: 0 on success, 1 on failure (suitable as a process exit code).
    """
    print("🧪 Testing Log Transform Consistency\n")
    passed = test_log_transform_consistency()
    if passed:
        print("\n🎉 Log transformation consistency has been fixed!")
        return 0
    print("\n⚠️ Log transformation consistency issue still exists.")
    return 1
if __name__ == "__main__":
    # Use sys.exit rather than the site-provided exit() builtin: exit() is
    # intended for interactive sessions and is not guaranteed to exist
    # (e.g. under `python -S`). `sys` is already imported at the top of
    # this file.
    sys.exit(main())