"""
Text preprocessing for LexiMind.

Lightweight text cleaning and tokenization pipeline for model input preparation.

Author: Oliver Perrin
Date: December 2025
"""

from __future__ import annotations

from dataclasses import dataclass, replace
from typing import List, Sequence

import torch

from .tokenization import Tokenizer, TokenizerConfig

# --------------- Text Cleaning ---------------


class TextCleaner:
    """Minimal text normalizer.

    Trims surrounding whitespace, optionally lowercases, and collapses every
    run of whitespace into a single space.
    """

    def __init__(self, lowercase: bool = True) -> None:
        # When true, ``clean`` converts text to lowercase.
        self.lowercase = lowercase

    def clean(self, text: str) -> str:
        """Return a normalized copy of ``text``.

        Leading/trailing whitespace is removed, the text is lowercased when
        ``self.lowercase`` is set, and internal whitespace runs become one
        space each.
        """
        trimmed = text.strip()
        cased = trimmed.lower() if self.lowercase else trimmed
        # split() without arguments breaks on any whitespace run, so joining
        # the pieces with one space collapses tabs/newlines/multiple spaces.
        words = cased.split()
        return " ".join(words)

    def clean_batch(self, texts: Sequence[str]) -> List[str]:
        """Apply ``clean`` to each text, preserving order."""
        return list(map(self.clean, texts))

    # Backwards compatibility alias
    def transform(self, texts: Sequence[str]) -> List[str]:
        """Delegate to ``clean_batch`` (sklearn-style interface)."""
        cleaned = self.clean_batch(texts)
        return cleaned


# --------------- Batch Output ---------------


@dataclass
class Batch:
    """Tokenized batch ready for model consumption.

    Groups the tensors and per-example lengths that
    ``TextPreprocessor.batch_encode`` returns for a list of texts.
    """

    # Token ids as returned by the tokenizer under the "input_ids" key.
    input_ids: torch.Tensor
    # Attention mask; ``batch_encode`` converts it to ``torch.bool``.
    attention_mask: torch.Tensor
    # Per-row count of set mask entries (``attention_mask.sum(dim=1)``).
    lengths: List[int]


# --------------- Preprocessor ---------------


class TextPreprocessor:
    """Pipeline that normalizes raw text and tokenizes it into a ``Batch``."""

    def __init__(
        self,
        tokenizer: Tokenizer | None = None,
        *,
        tokenizer_config: TokenizerConfig | None = None,
        tokenizer_name: str = "google/flan-t5-base",
        max_length: int | None = None,
        lowercase: bool = True,
    ) -> None:
        """Set up the cleaner and obtain a tokenizer.

        Args:
            tokenizer: Ready-made tokenizer to use. When omitted, one is
                built from ``tokenizer_config`` / ``tokenizer_name``.
            tokenizer_config: Config for the tokenizer built here; not
                consulted when ``tokenizer`` is given.
            tokenizer_name: Pretrained model name used to create a config
                when no ``tokenizer_config`` is supplied.
            max_length: Optional sequence-length limit. Overrides the config
                value when the tokenizer is built here.
            lowercase: Forwarded to ``TextCleaner``.

        Raises:
            ValueError: If ``tokenizer`` is given and ``max_length`` differs
                from ``tokenizer.config.max_length``.
        """
        self.cleaner = TextCleaner(lowercase=lowercase)

        if tokenizer is not None:
            # Caller-supplied tokenizer: an explicit max_length may only
            # restate what the tokenizer is already configured with.
            self.tokenizer = tokenizer
            if max_length is not None and max_length != tokenizer.config.max_length:
                raise ValueError(
                    "max_length conflicts with tokenizer config - "
                    "initialize tokenizer with desired settings"
                )
        else:
            # Build our own tokenizer, preferring the supplied config and
            # falling back to one derived from the model name.
            config = (
                tokenizer_config
                if tokenizer_config
                else TokenizerConfig(pretrained_model_name=tokenizer_name)
            )
            if max_length is not None:
                # replace() returns a copy, leaving the caller's config untouched.
                config = replace(config, max_length=max_length)
            self.tokenizer = Tokenizer(config)

        self.max_length = max_length if max_length else self.tokenizer.config.max_length

    def clean_text(self, text: str) -> str:
        """Normalize one text with the configured cleaner."""
        cleaner = self.cleaner
        return cleaner.clean(text)

    def batch_encode(self, texts: Sequence[str]) -> Batch:
        """Normalize ``texts``, tokenize them, and package the result.

        Returns:
            A ``Batch`` holding the tokenizer's input ids, the attention mask
            cast to ``torch.bool``, and the number of set mask entries per row.
        """
        normalized = self.cleaner.clean_batch(texts)
        tokenized = self.tokenizer.batch_encode(normalized, max_length=self.max_length)

        token_ids = tokenized["input_ids"]
        mask = tokenized["attention_mask"].to(dtype=torch.bool)
        # Summing the boolean mask along dim 1 counts set positions per row.
        row_lengths = mask.sum(dim=1).tolist()

        return Batch(input_ids=token_ids, attention_mask=mask, lengths=row_lengths)

    def __call__(self, texts: Sequence[str]) -> Batch:
        """Make the preprocessor callable; same as ``batch_encode``."""
        batch = self.batch_encode(texts)
        return batch


# --------------- Backwards Compatibility ---------------

# Legacy alias: keeps the old name `BasicTextCleaner` importable; it is the
# same class object as `TextCleaner`.
BasicTextCleaner = TextCleaner