# simple-text-analyzer / test / test_log_transform_consistency.py
# (Hugging Face file-page header removed: uploader "egumasa" / "emuTAALES", commit e7279e4)
#!/usr/bin/env python3
"""
Test script to verify that log transformations are consistently applied
to both token details table and summary statistics.
"""
import math
import os
import sys
import traceback
from pathlib import Path

import numpy as np
# Add project root to path
# NOTE(review): Path(__file__).parent is the directory containing THIS file.
# If this file lives in test/ (as the header suggests), this inserts the test
# directory, not the repository root — confirm against the project layout.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
def test_log_transform_consistency():
    """Test that log transformations are applied consistently.

    Builds a tiny mock reference list (raw frequencies 1000 and 2000), runs
    the analyzer with log transforms enabled, and checks that:
      1. the mean of the per-token scores matches the summary-statistics mean
         for the content-word key, and
      2. the scores look log10-transformed (1000 -> ~3.0, 2000 -> ~3.3).

    Returns:
        bool: True if both checks pass; False on any failure, including when
        the analyzer package cannot be imported or raises.
    """
    print("Testing Log Transform Consistency...")
    try:
        from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer

        # Create analyzer
        analyzer = LexicalSophisticationAnalyzer(language='en', model_size='md')

        # Mock reference list with known values that will be log-transformed.
        mock_reference_lists = {
            'COCA_spoken_frequency_token': {
                'token': {'hello': 1000, 'world': 2000}
            }
        }
        analyzer.load_reference_lists(mock_reference_lists)

        # Analyze text with log transformation enabled for this index.
        text = "Hello world"
        selected_indices = ['COCA_spoken_frequency_token']
        log_transforms = {
            'COCA_spoken_frequency_token': ['frequency']
        }
        results = analyzer.analyze_text(
            text,
            selected_indices,
            log_transforms=log_transforms
        )

        token_details = results['token_details']
        summary_stats = results['summary']
        print(f"Token details: {len(token_details)} tokens")
        print(f"Summary keys: {list(summary_stats.keys())}")

        # Guard clauses replace the original deeply-nested if/else pyramid.
        if not token_details:
            print("❌ No token details found")
            return False

        # Collect log-transformed per-token scores from the details table.
        token_scores = []
        for token_detail in token_details:
            score = token_detail.get('COCA_spoken_frequency_token')
            if score is not None:
                token_scores.append(score)
                print(f"Token '{token_detail['token']}': score = {score}")

        if not token_scores:
            print("❌ No token scores found")
            return False

        token_mean = np.mean(token_scores)

        summary_key = 'COCA_spoken_frequency_token_CW'  # Content words
        if summary_key not in summary_stats:
            print(f"❌ Summary key '{summary_key}' not found in summary stats")
            return False

        summary_mean = summary_stats[summary_key]['mean']
        print(f"Token details mean: {token_mean}")
        print(f"Summary stats mean: {summary_mean}")

        # math.isclose with an absolute tolerance is clearer than a manual
        # abs() comparison (allows for floating point precision).
        if not math.isclose(token_mean, summary_mean, abs_tol=0.001):
            print(f"❌ Inconsistency found: token mean = {token_mean}, summary mean = {summary_mean}")
            return False
        print("βœ… Token details and summary statistics are consistent!")

        # Original values were 1000 and 2000; log10 gives ~3.0 and ~3.3.
        if not all(2.5 < score < 3.5 for score in token_scores):
            print(f"⚠️ Values don't appear to be log-transformed: {token_scores}")
            return False
        print("βœ… Values appear to be properly log-transformed!")
        return True
    except Exception as e:
        # Print the full traceback, not just the message, so real failures
        # (import errors, analyzer API changes) remain diagnosable.
        traceback.print_exc()
        print(f"❌ Test failed with error: {e}")
        return False
def main():
    """Run the log transform consistency test.

    Returns:
        int: 0 when the consistency test passes, 1 otherwise (suitable as a
        process exit code).
    """
    # Fixed mojibake in the banner emojis (were cp1252-decoded UTF-8).
    print("🧪 Testing Log Transform Consistency\n")
    if test_log_transform_consistency():
        print("\n🎉 Log transformation consistency has been fixed!")
        return 0
    print("\n⚠️ Log transformation consistency issue still exists.")
    return 1
if __name__ == "__main__":
    # sys.exit is the proper way to set a script's exit code; the builtin
    # exit() is injected by the site module and is absent under `python -S`.
    sys.exit(main())