#!/usr/bin/env python
# train_cuad_lora.py
"""
CUAD fine-tune with LoRA on an L4 / T4 GPU.
Expected wall-clock on an Nvidia L4: ~25-30 min.
"""

import os, json, random, gc
from collections import defaultdict

import torch, numpy as np
from datasets import load_dataset, Dataset, disable_caching
from transformers import (
    AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments,
    default_data_collator
)
# Note: the QuestionAnsweringTrainer used in the HF example scripts lives in
# examples/pytorch/question-answering/trainer_qa.py, not in the transformers
# package itself; the plain Trainer is enough here because compute_metrics
# reads the untokenised examples from module-level globals.
from transformers import Trainer, EvalPrediction
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
from huggingface_hub import login

disable_caching()                      # avoids giant disk cache on Colab

# ─────────────────────────────────────────────────────────────── helpers ──
MAX_LEN = 384        # max tokens per window
DOC_STRIDE = 128     # overlap between consecutive windows
SEED = 42


def set_seed(seed):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def balance_has_answer(dataset, ratio=2.0):
    """Keep all has-answer rows, down-sample no-answer rows to `ratio`× that."""
    has, no = [], []
    for ex in dataset:
        (has if ex["answers"]["text"] else no).append(ex)
    k = int(len(has) * ratio)
    no = random.sample(no, min(k, len(no)))
    return Dataset.from_list(has + no)


# ────────────────────────────────────────────────────────────── postproc ──
# CUAD keeps unanswerable questions, and the plain "squad" metric cannot score
# references whose gold-answer list is empty, so use "squad_v2".
metric = evaluate.load("squad_v2")


def postprocess_qa(examples, features, raw_predictions, tokenizer):
    """Greedy per-window span extraction; returns SQuAD-v2-style predictions."""
    all_start, all_end = raw_predictions
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = defaultdict(list)
    for i, feat_id in enumerate(features["example_id"]):
        features_per_example[example_id_to_index[feat_id]].append(i)

    predictions = []
    for example_idx, example in enumerate(examples):
        best_score = -1e9
        best_span = ""
        context = example["context"]
        for feat_idx in features_per_example[example_idx]:
            start_logit = all_start[feat_idx]
            end_logit = all_end[feat_idx]
            offset = features["offset_mapping"][feat_idx]
            start_idx = int(np.argmax(start_logit))
            end_idx = int(np.argmax(end_logit))
            if start_idx <= end_idx < len(offset):
                start_char, _ = offset[start_idx]
                _, end_char = offset[end_idx]
                span = context[start_char:end_char].strip()
                score = start_logit[start_idx] + end_logit[end_idx]
                if score > best_score and span:
                    best_score, best_span = score, span
        predictions.append({
            "id": example["id"],
            "prediction_text": best_span,
            # squad_v2 requires this field; the HF QA example also fixes it at 0.
            "no_answer_probability": 0.0,
        })
    return predictions


def compute_metrics(eval_pred: EvalPrediction):
    predictions = postprocess_qa(raw_val, val_feats, eval_pred.predictions, tok)
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in raw_val]
    return metric.compute(predictions=predictions, references=references)


# ───────────────────────────────────────────────────────────────── main ──
def main():
    # compute_metrics (module level) reads these three names at eval time.
    global tok, val_feats, raw_val

    set_seed(SEED)

    # model name to store on the Hub
    model_repo = os.getenv("MODEL_NAME", "AvocadoMuffin/roberta-cuad-qa-v2")

    if (tokn := os.getenv("roberta_token")):
        try:
            login(tokn)
            print("🔑 HuggingFace Hub login OK")
        except Exception as e:
            print("Hub login failed:", e)

    print("📚 Loading CUAD…")
    cuad = load_dataset("theatticusproject/cuad-qa", split="train",
                        trust_remote_code=True)
    cuad = cuad.shuffle(seed=SEED)
    cuad = balance_has_answer(cuad, ratio=2.0)   # ≈18 k rows

    # train / val 90-10
    ds = cuad.train_test_split(test_size=0.1, seed=SEED)
    train_raw, val_raw = ds["train"], ds["test"]

    # ── tokeniser & model (SQuAD-2 tuned) ───────────────────────────────
    base_ckpt = "deepset/roberta-base-squad2"
    tok = AutoTokenizer.from_pretrained(base_ckpt, use_fast=True)
    model = AutoModelForQuestionAnswering.from_pretrained(base_ckpt)
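    # Optional sanity check, a sketch only: RoBERTa's self-attention projections
    # are modules named "...attention.self.query" / "...attention.self.value",
    # which is what the LoRA target_modules below must match. The DEBUG_MODULES
    # environment variable is hypothetical (not part of the original script).
    if os.getenv("DEBUG_MODULES"):
        for name, _ in model.named_modules():
            if name.endswith(("attention.self.query", "attention.self.value")):
                print(name)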
    # LoRA — inject low-rank adapters into the attention projections only.
    lora = LoraConfig(
        task_type=TaskType.QUESTION_ANS,
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["query", "value"],
    )
    model = get_peft_model(model, lora)
    model.print_trainable_parameters()

    # ── preprocess ─────────────────────────────────────────────────────
    def preprocess_train(examples):
        """Tokenise into overlapping windows and map each gold answer to token
        start/end positions; windows that don't contain the answer point at CLS."""
        tokenized = tok(
            examples["question"],
            examples["context"],
            truncation="only_second",
            max_length=MAX_LEN,
            stride=DOC_STRIDE,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )
        sample_map = tokenized.pop("overflow_to_sample_mapping")
        offsets = tokenized.pop("offset_mapping")
        start_positions, end_positions = [], []
        for i, offset in enumerate(offsets):
            cls_idx = tokenized["input_ids"][i].index(tok.cls_token_id)
            seq_ids = tokenized.sequence_ids(i)
            answers = examples["answers"][sample_map[i]]
            if not answers["text"]:                        # unanswerable
                start_positions.append(cls_idx)
                end_positions.append(cls_idx)
                continue
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            ctx_start = seq_ids.index(1)                   # first context token
            ctx_end = len(seq_ids) - 1 - seq_ids[::-1].index(1)
            if offset[ctx_start][0] > start_char or offset[ctx_end][1] < end_char:
                # answer not inside this window
                start_positions.append(cls_idx)
                end_positions.append(cls_idx)
                continue
            s = ctx_start
            while s <= ctx_end and offset[s][0] <= start_char:
                s += 1
            start_positions.append(s - 1)
            e = ctx_end
            while e >= ctx_start and offset[e][1] >= end_char:
                e -= 1
            end_positions.append(e + 1)
        tokenized["start_positions"] = start_positions
        tokenized["end_positions"] = end_positions
        return tokenized

    def preprocess_val(examples):
        """Tokenise into windows, keeping offsets + example ids for postproc."""
        tokenized = tok(
            examples["question"],
            examples["context"],
            truncation="only_second",
            max_length=MAX_LEN,
            stride=DOC_STRIDE,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )
        # One long contract becomes several windows; map each window back to
        # the id of the example it came from.
        sample_map = tokenized.pop("overflow_to_sample_mapping")
        tokenized["example_id"] = [examples["id"][i] for i in sample_map]
        return tokenized

    train_feats = train_raw.map(
        preprocess_train, batched=True,
        remove_columns=train_raw.column_names,
        num_proc=4, desc="tokenise-train",
    )
    val_feats = val_raw.map(
        preprocess_val, batched=True,
        remove_columns=val_raw.column_names,
        num_proc=4, desc="tokenise-val",
    )

    raw_val = val_raw      # untokenised examples, read by compute_metrics

    # ── training args ──────────────────────────────────────────────────
    args = TrainingArguments(
        output_dir="./cuad_lora_out",
        learning_rate=3e-5,
        num_train_epochs=4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=4,   # effective batch size 32
        fp16=False, bf16=True,           # L4 supports bf16; on a T4 use fp16=True, bf16=False
        evaluation_strategy="steps",     # renamed to `eval_strategy` in newer transformers
        eval_steps=250,
        save_steps=500,
        save_total_limit=2,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        logging_steps=50,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_feats,
        eval_dataset=val_feats,
        tokenizer=tok,
        data_collator=default_data_collator,
        compute_metrics=compute_metrics,
    )

    print("🚀 Training…")
    trainer.train()
    print("✅ Done. Best F1:", trainer.state.best_metric)

    trainer.save_model("./cuad_lora_out")    # saves the LoRA adapter + config
    tok.save_pretrained("./cuad_lora_out")

    # optional: push the adapter + tokeniser to the Hub
    if tokn:
        trainer.model.push_to_hub(model_repo, private=False)
        tok.push_to_hub(model_repo, private=False)
        print("🚀 Pushed to:", f"https://huggingface.co/{model_repo}")


if __name__ == "__main__":
    main()
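
# ── usage sketch (not executed) ───────────────────────────────────────────
# A minimal way to reload the trained adapter for inference, assuming the run
# above saved it to ./cuad_lora_out; the Hub repo id only exists if you pushed.
# `contract_text` stands in for any contract string.
#
#   from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
#   from peft import PeftModel
#
#   base = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
#   model = PeftModel.from_pretrained(base, "./cuad_lora_out").merge_and_unload()
#   tok = AutoTokenizer.from_pretrained("./cuad_lora_out")
#   qa = pipeline("question-answering", model=model, tokenizer=tok)
#   print(qa(question="What is the governing law?", context=contract_text))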