"""
Simple test script to verify that the GLEN environment is ready for The Vault dataset.
"""

import importlib
import os
import sys
from pathlib import Path

import pandas as pd
import torch


def test_dependencies(required=("transformers", "torch", "pandas", "wandb")):
    """Check that every required package can be imported.

    Each package is imported by name and its version is printed. For
    ``torch`` the CUDA availability (and the name of GPU 0 when CUDA is
    available) is printed as well. Every package is checked even after one
    is found missing, so a single run lists everything that has to be
    installed.

    Args:
        required: Importable module names to check. Defaults to the four
            packages this script has always checked.

    Returns:
        bool: True if every module could be imported, False otherwise.
    """
    print("Testing dependencies...")

    all_found = True
    for name in required:
        try:
            module = importlib.import_module(name)
        except ImportError:
            print(f"❌ {name} not found")
            all_found = False
            continue

        # Not every module exposes __version__; fall back instead of raising.
        print(f"✅ {name}: {getattr(module, '__version__', 'unknown')}")

        if name == "torch":
            cuda_available = module.cuda.is_available()
            print(f"✅ CUDA available: {cuda_available}")
            if cuda_available:
                print(f"✅ GPU: {module.cuda.get_device_name(0)}")

    return all_found


# This is a script check that returns a status bool for main(), not a pytest
# test; opting out keeps pytest from collecting it because of its name.
test_dependencies.__test__ = False


def test_data_files(data_dir="data/the_vault"):
    """Check that every required dataset file exists.

    Prints one line per file: its size in KB when present, or a
    "not found" message otherwise. All files are checked so that every
    missing one is reported in a single run.

    Args:
        data_dir: Directory expected to contain the files. Defaults to
            ``data/the_vault`` relative to the current working directory.

    Returns:
        bool: True if all required files are present, False otherwise.
    """
    print("\nTesting data files...")

    root = Path(data_dir)
    required_files = (
        "DOC_VAULT_train.tsv",
        "GTQ_VAULT_train.tsv",
        "ID_VAULT_t5_bm25_truncate_3.tsv",
        "DOC_VAULT_validate.tsv",
        "GTQ_VAULT_dev.tsv",
    )

    all_found = True
    for file_name in required_files:
        file_path = root / file_name
        # is_file() rather than exists(): a directory with the expected name
        # is not a usable data file.
        if file_path.is_file():
            size_kb = file_path.stat().st_size / 1024
            print(f"✅ {file_name} ({size_kb:.1f} KB)")
        else:
            print(f"❌ {file_name} not found")
            all_found = False

    return all_found


# This is a script check that returns a status bool for main(), not a pytest
# test; opting out keeps pytest from collecting it because of its name.
test_data_files.__test__ = False


def test_tevatron_imports():
    """Check that the tevatron modules used for GLEN can be imported.

    Two imports are attempted: the three Phase 1 argument classes from
    ``tevatron.arguments`` and ``GPUMemoryMonitor`` from
    ``tevatron.utils.gpu_monitor``. Both are attempted even if the first
    fails, so every broken import is reported in one run.

    Returns:
        bool: True if both imports succeed, False otherwise.
    """
    print("\nTesting tevatron imports...")

    all_ok = True

    # The imported names are deliberately unused: the import is the check.
    try:
        from tevatron.arguments import (  # noqa: F401
            GLENP1ModelArguments,
            GLENP1DataArguments,
            GLENP1TrainingArguments,
        )
    except ImportError as e:
        print(f"❌ Phase 1 arguments import failed: {e}")
        all_ok = False
    else:
        print("✅ Phase 1 arguments imported")

    try:
        from tevatron.utils.gpu_monitor import GPUMemoryMonitor  # noqa: F401
    except ImportError as e:
        print(f"❌ GPU monitor import failed: {e}")
        all_ok = False
    else:
        print("✅ GPU monitor imported")

    return all_ok


# This is a script check that returns a status bool for main(), not a pytest
# test; opting out keeps pytest from collecting it because of its name.
test_tevatron_imports.__test__ = False


def test_gpu_monitor():
    """Exercise the tevatron GPU memory monitor.

    Builds ``GPUMemoryMonitor(memory_threshold=0.8, check_interval=10)`` and
    reads ``get_memory_stats()``. When the stats report ``"enabled"``, the
    ``total_gb`` and ``usage_ratio`` entries and the result of
    ``check_memory()`` are printed; otherwise a "disabled (no CUDA)" warning
    is printed.

    Returns:
        bool: True if the monitor could be built and queried (whether or not
        it is enabled), False if anything raised along the way.
    """
    print("\nTesting GPU monitor...")

    # Broad catch on purpose: this is a diagnostic boundary, and any failure
    # (missing package, CUDA error, unexpected stats layout) should be
    # reported as a failed check rather than abort the whole script.
    try:
        from tevatron.utils.gpu_monitor import GPUMemoryMonitor

        monitor = GPUMemoryMonitor(memory_threshold=0.8, check_interval=10)
        stats = monitor.get_memory_stats()

        if stats["enabled"]:
            print("✅ GPU monitor enabled")
            print(f" - Total GPU memory: {stats['total_gb']:.2f} GB")
            print(f" - Current usage: {stats['usage_ratio']:.1%}")

            can_continue = monitor.check_memory()
            print(f" - Memory check passed: {can_continue}")
        else:
            print("⚠️ GPU monitor disabled (no CUDA)")

        return True
    except Exception as e:
        print(f"❌ GPU monitor test failed: {e}")
        return False


# This is a script check that returns a status bool for main(), not a pytest
# test; opting out keeps pytest from collecting it because of its name.
test_gpu_monitor.__test__ = False


def test_data_loading(train_doc_path="data/the_vault/DOC_VAULT_train.tsv"):
    """Load a small sample of the training documents and sanity-check it.

    Reads at most the first five rows of the tab-separated file, prints the
    row count and column names, and then inspects the first ``doc_content``
    value: more than 50 characters prints a "looks valid" message, anything
    else (column absent, non-string value, short text) prints a warning
    without failing the check.

    Args:
        train_doc_path: Path to the training documents TSV. Defaults to the
            location used by the rest of this script.

    Returns:
        bool: True if the file exists and at least one row was loaded;
        False if the file is missing, holds no rows, or reading it raised.
    """
    print("\nTesting data loading...")

    # Report a missing file explicitly instead of passing/falling through
    # silently: without data nothing was actually verified.
    if not os.path.isfile(train_doc_path):
        print(f"❌ {train_doc_path} not found")
        return False

    # Broad catch on purpose: any parser or I/O error should be reported as
    # a failed check rather than abort the whole script.
    try:
        df = pd.read_csv(train_doc_path, sep="\t", nrows=5)
    except Exception as e:
        print(f"❌ Data loading test failed: {e}")
        return False

    if df.empty:
        print(f"❌ No documents found in {train_doc_path}")
        return False

    print(f"✅ Loaded {len(df)} sample documents")
    print(f" - Columns: {list(df.columns)}")

    # A blank cell is read back as NaN (a float), so confirm the value is a
    # string before measuring its length.
    first_content = df["doc_content"].iloc[0] if "doc_content" in df.columns else None
    if isinstance(first_content, str) and len(first_content) > 50:
        print("✅ Document content looks valid")
    else:
        print("⚠️ Document content might be too short")

    return True


# This is a script check that returns a status bool for main(), not a pytest
# test; opting out keeps pytest from collecting it because of its name.
test_data_loading.__test__ = False


def main():
    """Run every environment check in order and print a summary.

    Each check is called once; a truthy return counts as a pass. After the
    per-check PASSED/FAILED lines, the total is printed, followed either by
    the next-step instructions (all passed) or a request to fix the
    reported issues.

    Returns:
        bool: True if every check passed, False otherwise.
    """
    print("🧪 GLEN Environment Test for The Vault Dataset")
    print("=" * 50)

    tests = [
        ("Dependencies", test_dependencies),
        ("Data Files", test_data_files),
        ("Tevatron Imports", test_tevatron_imports),
        ("GPU Monitor", test_gpu_monitor),
        ("Data Loading", test_data_loading),
    ]

    passed = 0
    total = len(tests)

    for test_name, test_func in tests:
        print(f"\n📋 {test_name}")
        print("-" * 30)
        if test_func():
            passed += 1
            print(f"✅ {test_name} PASSED")
        else:
            print(f"❌ {test_name} FAILED")

    print("\n" + "=" * 50)
    print(f"🎯 Test Results: {passed}/{total} tests passed")

    if passed != total:
        print("⚠️ Some tests failed. Please fix the issues above.")
        return False

    print("🎉 Environment is ready for GLEN training!")
    print("\nNext steps:")
    print("1. Run full preprocessing if needed:")
    print(" python scripts/preprocess_vault_dataset.py --input_dir the_vault_dataset/ --output_dir data/the_vault/")
    print("2. Start training:")
    print(" bash scripts/train_glen_p1_vault.sh")
    return True


# Script entry point: the exit status is 0 when every check passed and 1
# otherwise, so shell scripts and CI can gate on the result.
if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)