Spaces · Sleeping

Commit f9edbb4
Parent(s): 1bdd1c1

Updated Summarizer and Preprocessor to run on my custom transformer, and added a basic Streamlit frontend demo
Changed files:
- requirements-dev.txt                   +2   -1
- requirements.txt                       +5   -1
- src/api/inference/__init__.py          +7   -0
- src/api/inference/inference.py         +133 -0
- src/data/download.py                   +47  -41
- src/data/preprocessing.py              +251 -254
- src/inference/__init__.py              +7   -0
- src/inference/baseline_summarizer.py   +39  -220
- src/models/__init__.py                 +33  -0
- src/ui/streamlit_app.py                +108 -0
requirements-dev.txt
CHANGED
@@ -6,4 +6,5 @@ isort>=5.12.0
 flake8>=6.0.0
 mypy>=1.4.0
 jupyter>=1.0.0
-ipywidgets>=8.0.0
+ipywidgets>=8.0.0
+pre-commit>=3.4.0
requirements.txt
CHANGED
@@ -15,4 +15,8 @@ omegaconf>=2.3.0
 tensorboard>=2.13.0
 gradio>=3.35.0
 requests>=2.31.0
-
+kaggle>=1.5.12
+streamlit>=1.25.0
+plotly>=5.18.0
+faiss-cpu==1.9.0; platform_system != "Windows"
+faiss-cpu==1.9.0; platform_system == "Windows"
src/api/inference/__init__.py
ADDED
@@ -0,0 +1,7 @@
+"""
+API inference module for LexiMind.
+"""
+
+from .inference import load_models, summarize_text, classify_emotion, topic_for_text
+
+__all__ = ["load_models", "summarize_text", "classify_emotion", "topic_for_text"]
src/api/inference/inference.py
ADDED
@@ -0,0 +1,133 @@
+"""Minimal inference helpers that rely on the custom transformer stack."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+
+from ...data.preprocessing import TextPreprocessor, TransformerTokenizer
+from ...models.multitask import MultiTaskModel
+
+
+def _load_tokenizer(tokenizer_path: Path) -> TransformerTokenizer:
+    if not tokenizer_path.exists():
+        raise FileNotFoundError(f"tokenizer file '{tokenizer_path}' not found")
+    return TransformerTokenizer.load(tokenizer_path)
+
+
+def load_models(config: Dict[str, Any]) -> Dict[str, Any]:
+    """Load MultiTaskModel together with the tokenizer-driven preprocessor."""
+
+    device = torch.device(config.get("device", "cpu"))
+    tokenizer_path = config.get("tokenizer_path")
+    if tokenizer_path is None:
+        raise ValueError("'tokenizer_path' missing in config")
+
+    tokenizer = _load_tokenizer(Path(tokenizer_path))
+    preprocessor = TextPreprocessor(
+        max_length=int(config.get("max_length", 512)),
+        tokenizer=tokenizer,
+        min_freq=int(config.get("min_freq", 1)),
+        lowercase=bool(config.get("lowercase", True)),
+    )
+
+    encoder_kwargs = dict(config.get("encoder", {}))
+    decoder_kwargs = dict(config.get("decoder", {}))
+
+    encoder = preprocessor.build_encoder(**encoder_kwargs)
+    decoder = preprocessor.build_decoder(**decoder_kwargs)
+    model = MultiTaskModel(encoder=encoder, decoder=decoder)
+
+    checkpoint_path = config.get("checkpoint_path")
+    if checkpoint_path:
+        state = torch.load(checkpoint_path, map_location=device)
+        if isinstance(state, dict) and "state_dict" in state:
+            state = state["state_dict"]
+        model.load_state_dict(state, strict=False)
+
+    model.to(device)
+
+    return {
+        "loaded": True,
+        "device": device,
+        "mt": model,
+        "preprocessor": preprocessor,
+    }
+
+
+def summarize_text(
+    text: str,
+    compression: float = 0.25,
+    collect_attn: bool = False,
+    models: Optional[Dict[str, Any]] = None,
+) -> Tuple[str, Optional[Dict[str, torch.Tensor]]]:
+    if models is None or not models.get("loaded"):
+        raise RuntimeError("Models must be loaded via load_models before summarize_text is called")
+
+    model: MultiTaskModel = models["mt"]
+    preprocessor: TextPreprocessor = models["preprocessor"]
+    device: torch.device = models["device"]
+
+    batch = preprocessor.batch_encode([text])
+    tokenizer = preprocessor.tokenizer
+    encoder = model.encoder
+    decoder = model.decoder
+    if tokenizer is None or encoder is None or decoder is None:
+        raise RuntimeError("Encoder, decoder, and tokenizer must be configured before summarization")
+    input_ids = batch.input_ids.to(device)
+    memory = encoder(input_ids)
+    src_len = batch.lengths[0]
+    max_tgt = max(4, int(src_len * compression))
+    generated = decoder.greedy_decode(
+        memory,
+        max_len=min(preprocessor.max_length, max_tgt),
+        start_token_id=tokenizer.bos_id,
+        end_token_id=tokenizer.eos_id,
+    )
+    summary = tokenizer.decode(generated[0].tolist(), skip_special_tokens=True)
+    return summary.strip(), None if not collect_attn else {}
+
+
+def classify_emotion(text: str, models: Optional[Dict[str, Any]] = None) -> Tuple[List[float], List[str]]:
+    if models is None or not models.get("loaded"):
+        raise RuntimeError("Models must be loaded via load_models before classify_emotion is called")
+
+    model: MultiTaskModel = models["mt"]
+    preprocessor: TextPreprocessor = models["preprocessor"]
+    device: torch.device = models["device"]
+
+    batch = preprocessor.batch_encode([text])
+    input_ids = batch.input_ids.to(device)
+    result = model.forward("emotion", {"input_ids": input_ids})
+    logits = result[1] if isinstance(result, tuple) else result
+    scores = torch.sigmoid(logits).squeeze(0).detach().cpu().tolist()
+    labels = models.get("emotion_labels") or [
+        "joy",
+        "sadness",
+        "anger",
+        "fear",
+        "surprise",
+        "disgust",
+    ]
+    return scores, labels[: len(scores)]
+
+
+def topic_for_text(text: str, models: Optional[Dict[str, Any]] = None) -> Tuple[int, List[str]]:
+    if models is None or not models.get("loaded"):
+        raise RuntimeError("Models must be loaded via load_models before topic_for_text is called")
+
+    model: MultiTaskModel = models["mt"]
+    preprocessor: TextPreprocessor = models["preprocessor"]
+    device: torch.device = models["device"]
+
+    batch = preprocessor.batch_encode([text])
+    input_ids = batch.input_ids.to(device)
+    encoder = model.encoder
+    if encoder is None:
+        raise RuntimeError("Encoder must be configured before topic_for_text is called")
+    memory = encoder(input_ids)
+    embedding = memory.mean(dim=1).detach().cpu()
+    _ = embedding  # placeholder for downstream clustering hook
+    return 0, ["topic_stub"]
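For orientation, a minimal usage sketch of the new API (assumptions: the repo root is on PYTHONPATH, and both artifact paths below are placeholders that must point at a real tokenizer JSON and checkpoint):

    from src.api.inference import load_models, summarize_text, classify_emotion

    # Sketch only: config keys mirror the ones load_models reads above.
    models = load_models({
        "tokenizer_path": "artifacts/tokenizer.json",  # written by TextPreprocessor.save_tokenizer
        "checkpoint_path": "checkpoints/best.pt",      # optional; weights are skipped when omitted
        "device": "cpu",
        "max_length": 512,
    })

    summary, _ = summarize_text("Some long passage to compress...", compression=0.25, models=models)
    scores, labels = classify_emotion("I loved every minute of it!", models=models)
    print(summary)
    print(dict(zip(labels, scores)))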
src/data/download.py
CHANGED
@@ -1,57 +1,63 @@
+"""
+Download helpers for datasets.
+
+This version:
+- Adds robust error handling when the Kaggle API is not configured.
+- Stores files under data/raw/ subfolders.
+- Keeps the Gutenberg direct download example.
+
+Make sure you have Kaggle credentials configured if you call Kaggle downloads.
+"""
 import os
 import requests
- …
+
+def download_gutenberg(out_dir="data/raw/books", gutenberg_id: int = 1342, filename: str = "pride_and_prejudice.txt"):
+    """Download a Gutenberg text file by direct URL template (best-effort)."""
+    url = f"https://www.gutenberg.org/files/{gutenberg_id}/{gutenberg_id}-0.txt"
+    os.makedirs(out_dir, exist_ok=True)
+    out_path = os.path.join(out_dir, filename)
+    if os.path.exists(out_path):
+        print("Already downloaded:", out_path)
+        return out_path
+    try:
+        r = requests.get(url, timeout=30)
+        r.raise_for_status()
         with open(out_path, "wb") as f:
             f.write(r.content)
+        print("Downloaded:", out_path)
+        return out_path
+    except Exception as e:
+        print("Failed to download Gutenberg file:", e)
+        return None
-
-# Kaggle dataset download helpers
+
+# Kaggle helpers: optional, wrapped to avoid hard failure when Kaggle isn't configured.
+def _safe_kaggle_download(dataset: str, path: str):
+    try:
+        import kaggle
+    except Exception as e:
+        print("Kaggle package not available or not configured. Please install 'kaggle' and configure API token. Error:", e)
+        return False
+    try:
+        os.makedirs(path, exist_ok=True)
+        kaggle.api.authenticate()
+        kaggle.api.dataset_download_files(dataset, path=path, unzip=True)
+        print(f"Downloaded Kaggle dataset {dataset} to {path}")
+        return True
+    except Exception as e:
+        print("Failed to download Kaggle dataset:", e)
+        return False
 
 def download_emotion_dataset():
-    """Download the emotions dataset from Kaggle."""
     target_dir = "data/raw/emotion"
-
-    # Downloading using Kaggle Python API
-    kaggle.api.authenticate()
-    kaggle.api.dataset_download_files(
-        'praveengovi/emotions-dataset-for-nlp',
-        path=target_dir,
-        unzip=True
-    )
-    print("Downloaded Kaggle emotion dataset to", target_dir)
+    return _safe_kaggle_download('praveengovi/emotions-dataset-for-nlp', target_dir)
 
 def download_cnn_dailymail():
-    """Download the CNN/DailyMail summarization dataset from Kaggle."""
     target_dir = "data/raw/summarization"
-
-    # Downloading using Kaggle Python API
-    kaggle.api.authenticate()
-    kaggle.api.dataset_download_files(
-        'gowrishankarp/newspaper-text-summarization-cnn-dailymail',
-        path=target_dir,
-        unzip=True
-    )
-    print("Downloaded Kaggle CNN/DailyMail dataset to", target_dir)
+    return _safe_kaggle_download('gowrishankarp/newspaper-text-summarization-cnn-dailymail', target_dir)
 
 def download_ag_news():
-    """Download the AG News dataset from Kaggle."""
     target_dir = "data/raw/topic"
-
-    # Downloading using Kaggle Python API
-    kaggle.api.authenticate()
-    kaggle.api.dataset_download_files(
-        'amananandrai/ag-news-classification-dataset',
-        path=target_dir,
-        unzip=True
-    )
-    print("Downloaded Kaggle AG News dataset to", target_dir)
+    return _safe_kaggle_download('amananandrai/ag-news-classification-dataset', target_dir)
 
 if __name__ == "__main__":
    download_gutenberg()
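A usage sketch for the download helpers (assumption: the kaggle package reads an API token from ~/.kaggle/kaggle.json or the KAGGLE_USERNAME/KAGGLE_KEY environment variables):

    from src.data.download import (
        download_gutenberg,
        download_emotion_dataset,
        download_cnn_dailymail,
        download_ag_news,
    )

    download_gutenberg()             # -> data/raw/books/pride_and_prejudice.txt (or None on failure)
    ok = all([
        download_emotion_dataset(),  # -> data/raw/emotion/
        download_cnn_dailymail(),    # -> data/raw/summarization/
        download_ag_news(),          # -> data/raw/topic/
    ])
    print("all Kaggle downloads succeeded:", ok)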
src/data/preprocessing.py
CHANGED
@@ -1,263 +1,260 @@
- …
-import json
- …
-        self.max_length = max_length
- …
-    def clean_text(self, text: str) -> str:
- …
-            truncation=True,
-            padding=True,
-            max_length=self.max_length,
-            return_tensors='tf'
-        )
- …
-        start = 0
-        while start < len(words):
-            end = start + chunk_size
- …
-            chunks.append(chunk)
-            start += chunk_size - overlap
-        return chunks
-
- …
-        print(f"Processed and saved {filename} → {out_file}")
-
-    # ----- Dataset-specific processing methods ------
-
-    def process_summarization_dataset(self):
-        """Process summarization dataset: clean, split, and save."""
-        input_folder = "data/raw/summarization/cnn_dailymail"
-        output_folder = "data/processed/summarization"
-        os.makedirs(output_folder, exist_ok=True)
-
-        # Process each CSV file separately (train.csv, validation.csv, test.csv)
-        file_mapping = {
-            'train.csv': 'train',
-            'validation.csv': 'val',
-            'test.csv': 'test'
-        }
-
-        for csv_file, split_name in file_mapping.items():
-            file_path = os.path.join(input_folder, csv_file)
-            if not os.path.exists(file_path):
-                print(f"Missing file: {file_path}")
-                continue
-
-            print(f"Processing {csv_file}...")
-            df = pd.read_csv(file_path)
-
-            # Check for required columns (article and highlights)
-            if 'article' not in df.columns or 'highlights' not in df.columns:
-                print(f"CSV {csv_file} must have 'article' and 'highlights' columns.")
-                continue
-
-            # Clean the text data
-            df['article'] = df['article'].astype(str).apply(self.clean_text)
-            df['summary'] = df['highlights'].astype(str).apply(self.clean_text)  # rename highlights to summary
-
-            # Convert to records format
-            records = df[['article', 'summary']].to_dict(orient='records')
-
-            # Save as JSON
-            output_file = os.path.join(output_folder, f"{split_name}.json")
-            with open(output_file, "w", encoding="utf-8") as f:
-                json.dump(records, f, ensure_ascii=False, indent=2)
-            print(f"Processed {csv_file}: {len(records)} samples saved to {split_name}.json")
-
-        print("Summarization dataset processed and saved.")
-
-    def process_emotion_dataset(self):
-        """Process emotion dataset: clean, split, and save."""
-        input_folder = "data/raw/emotion"
-        output_folder = "data/processed/emotion"
-        os.makedirs(output_folder, exist_ok=True)
-
-        # Process each txt file (train.txt, val.txt, test.txt)
-        for split_file in ['train.txt', 'val.txt', 'test.txt']:
-            file_path = os.path.join(input_folder, split_file)
-            if not os.path.exists(file_path):
-                print(f"Missing file: {file_path}")
-                continue
-
-            records = []
-            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
-                for line in f:
-                    line = line.strip()
-                    if line and ';' in line:
-                        # Split on the last semicolon to handle semicolons in text
-                        text, label = line.rsplit(';', 1)
-                        records.append({
-                            'text': self.clean_text(text),
-                            'label': label.strip()
-                        })
-
-            # Save as JSON
-            split_name = split_file.replace('.txt', '')
-            output_file = os.path.join(output_folder, f"{split_name}.json")
-            with open(output_file, "w", encoding="utf-8") as f:
-                json.dump(records, f, ensure_ascii=False, indent=2)
-            print(f"Processed {split_file}: {len(records)} samples saved to {split_name}.json")
-
-        print("Emotion dataset processed and saved.")
-
-    def process_topic_dataset(self):
-        """Process topic dataset: clean, split, and save."""
-        input_folder = "data/raw/topic"
-        output_folder = "data/processed/topic"
-        os.makedirs(output_folder, exist_ok=True)
-
-        # Process each CSV file separately (train.csv, test.csv)
-        file_mapping = {
-            'train.csv': 'train',
-            'test.csv': 'test'
-        }
-
-        # Class index to topic name mapping for AG News dataset
-        class_map = {
-            1: 'World',
-            2: 'Sports',
-            3: 'Business',
-            4: 'Science/Technology'
-        }
-
-        for csv_file, split_name in file_mapping.items():
-            file_path = os.path.join(input_folder, csv_file)
-            if not os.path.exists(file_path):
-                print(f"Missing file: {file_path}")
-                continue
-
-            print(f"Processing {csv_file}...")
-            df = pd.read_csv(file_path)
-
-            # Check for required columns
-            if 'Class Index' not in df.columns:
-                print(f"CSV {csv_file} must have 'Class Index' column.")
-                continue
-
-            # Concatenate title and description
-            if 'Title' in df.columns and 'Description' in df.columns:
-                text = df['Title'].astype(str) + ". " + df['Description'].astype(str)
-            elif 'Title' in df.columns:
-                text = df['Title'].astype(str)
-            elif 'Description' in df.columns:
-                text = df['Description'].astype(str)
-            else:
-                print("CSV must have 'Title' or 'Description' columns.")
-                continue
-
-            df['text'] = text.apply(self.clean_text)
-
-            # Map numeric labels to category names
-            df['label'] = df['Class Index'].map(class_map)
-
-            # Convert to records format
-            records = df[['text', 'label']].to_dict(orient='records')
-
-            # Save as JSON
-            output_file = os.path.join(output_folder, f"{split_name}.json")
-            with open(output_file, "w", encoding="utf-8") as f:
-                json.dump(records, f, ensure_ascii=False, indent=2)
-            print(f"Processed {csv_file}: {len(records)} samples saved to {split_name}.json")
-
-        # Create validation split from training data
-        if os.path.exists(os.path.join(output_folder, "train.json")):
-            print("Creating validation split from training data...")
-            with open(os.path.join(output_folder, "train.json"), "r", encoding="utf-8") as f:
-                train_data = json.load(f)
-
-            # Split training data into train and validation
-            train_records, val_records = train_test_split(train_data, test_size=0.2, random_state=42)
-
-            # Save updated train and new validation files
-            with open(os.path.join(output_folder, "train.json"), "w", encoding="utf-8") as f:
-                json.dump(train_records, f, ensure_ascii=False, indent=2)
-
-            with open(os.path.join(output_folder, "val.json"), "w", encoding="utf-8") as f:
-                json.dump(val_records, f, ensure_ascii=False, indent=2)
-
-            print(f"Updated train.json: {len(train_records)} samples")
-            print(f"Created val.json: {len(val_records)} samples")
-
-        print("Topic dataset processed and saved.")
-
-
-# ----- Main function for quick testing ------
-
-if __name__ == "__main__":
-    preprocessor = textPreprocessor(max_length=128)
-
-    # Process and save all books
-    preprocessor.save_preprocessed_books(data=None)
-
-    # Load a processed book back
-    import json
-    with open("data/processed/books/pride_and_prejudice.json", "r") as f:
-        chunks = json.load(f)
-    print(f"Loaded {len(chunks)} chunks from Pride and Prejudice")
-    print(chunks[0][:200])  # printing first 200 chars of chunk
-
-    # Process new datasets
-    preprocessor.process_summarization_dataset()
-    preprocessor.process_emotion_dataset()
-    preprocessor.process_topic_dataset()
+"""Lightweight preprocessing utilities built around the in-repo transformer."""
+
+from __future__ import annotations
+
+from collections import Counter
+from dataclasses import dataclass
+import json
+from pathlib import Path
+import re
+from typing import Dict, Iterable, List, Optional, Sequence, Tuple
+
+import torch
+
+from ..models.decoder import TransformerDecoder
+from ..models.encoder import TransformerEncoder
+
+SPECIAL_TOKENS: Tuple[str, str, str, str] = ("<pad>", "<bos>", "<eos>", "<unk>")
+
+
+def _normalize(text: str, lowercase: bool) -> str:
+    text = text.strip()
+    text = re.sub(r"\s+", " ", text)
+    if lowercase:
+        text = text.lower()
+    return text
+
+
+def _basic_tokenize(text: str) -> List[str]:
+    return re.findall(r"\b\w+\b|[.,;:?!]", text)
+
+
+class TransformerTokenizer:
+    """Minimal tokenizer that keeps vocabulary aligned with the custom transformer."""
+
+    def __init__(
+        self,
+        stoi: Dict[str, int],
+        itos: List[str],
+        specials: Sequence[str] = SPECIAL_TOKENS,
+        lowercase: bool = True,
+    ) -> None:
+        self.stoi = stoi
+        self.itos = itos
+        self.specials = tuple(specials)
+        self.lowercase = lowercase
+        self.pad_id = self._lookup(self.specials[0])
+        self.bos_id = self._lookup(self.specials[1])
+        self.eos_id = self._lookup(self.specials[2])
+        self.unk_id = self._lookup(self.specials[3])
+
+    @classmethod
+    def build(
+        cls,
+        texts: Iterable[str],
+        min_freq: int = 1,
+        lowercase: bool = True,
+        specials: Sequence[str] = SPECIAL_TOKENS,
+    ) -> "TransformerTokenizer":
+        counter: Counter[str] = Counter()
+        for text in texts:
+            normalized = _normalize(text, lowercase)
+            counter.update(_basic_tokenize(normalized))
+
+        ordered_specials = list(dict.fromkeys(specials))
+        itos: List[str] = ordered_specials.copy()
+        for token, freq in counter.most_common():
+            if freq < min_freq:
+                continue
+            if token in itos:
+                continue
+            itos.append(token)
+
+        stoi = {token: idx for idx, token in enumerate(itos)}
+        return cls(stoi=stoi, itos=itos, specials=ordered_specials, lowercase=lowercase)
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.itos)
+
+    def tokenize(self, text: str) -> List[str]:
+        normalized = _normalize(text, self.lowercase)
+        return _basic_tokenize(normalized)
+
+    def encode(
+        self,
+        text: str,
+        add_special_tokens: bool = True,
+        max_length: Optional[int] = None,
+    ) -> List[int]:
+        tokens = self.tokenize(text)
+        pieces = [self.stoi.get(tok, self.unk_id) for tok in tokens]
+        if add_special_tokens:
+            pieces = [self.bos_id] + pieces + [self.eos_id]
+
+        if max_length is not None and len(pieces) > max_length:
+            if add_special_tokens and max_length >= 2:
+                inner_max = max_length - 2
+                trimmed = pieces[1:-1][:inner_max]
+                pieces = [self.bos_id] + trimmed + [self.eos_id]
+            else:
+                pieces = pieces[:max_length]
+        return pieces
+
+    def decode(self, ids: Sequence[int], skip_special_tokens: bool = True) -> str:
+        tokens: List[str] = []
+        for idx in ids:
+            if idx < 0 or idx >= len(self.itos):
+                continue
+            token = self.itos[idx]
+            if skip_special_tokens and token in self.specials:
+                continue
+            tokens.append(token)
+        return " ".join(tokens).strip()
+
+    def pad_batch(
+        self,
+        sequences: Sequence[Sequence[int]],
+        pad_to_length: Optional[int] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if not sequences:
+            raise ValueError("pad_batch requires at least one sequence")
+        if pad_to_length is None:
+            pad_to_length = max(len(seq) for seq in sequences)
+        padded: List[List[int]] = []
+        mask: List[List[int]] = []
+        for seq in sequences:
+            trimmed = list(seq[:pad_to_length])
+            pad_len = pad_to_length - len(trimmed)
+            padded.append(trimmed + [self.pad_id] * pad_len)
+            mask.append([1] * len(trimmed) + [0] * pad_len)
+        return torch.tensor(padded, dtype=torch.long), torch.tensor(mask, dtype=torch.bool)
+
+    def save(self, path: Path) -> None:
+        payload = {
+            "itos": self.itos,
+            "specials": list(self.specials),
+            "lowercase": self.lowercase,
+        }
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
+
+    @classmethod
+    def load(cls, path: Path) -> "TransformerTokenizer":
+        data = json.loads(path.read_text(encoding="utf-8"))
+        itos = list(data["itos"])
+        stoi = {token: idx for idx, token in enumerate(itos)}
+        specials = data.get("specials", list(SPECIAL_TOKENS))
+        lowercase = bool(data.get("lowercase", True))
+        return cls(stoi=stoi, itos=itos, specials=specials, lowercase=lowercase)
+
+    def _lookup(self, token: str) -> int:
+        if token not in self.stoi:
+            raise ValueError(f"token '{token}' missing from vocabulary")
+        return self.stoi[token]
+
+
+@dataclass
+class Batch:
+    input_ids: torch.Tensor
+    attention_mask: torch.Tensor
+    lengths: List[int]
+
+
+class TextPreprocessor:
+    """Prepares text so it can flow directly into the custom transformer stack."""
+
+    def __init__(
+        self,
+        max_length: int = 512,
+        tokenizer: Optional[TransformerTokenizer] = None,
+        *,
+        min_freq: int = 1,
+        lowercase: bool = True,
+    ) -> None:
+        self.max_length = max_length
+        self.min_freq = min_freq
+        self.lowercase = lowercase
+        self.tokenizer = tokenizer
+
+    def clean_text(self, text: str) -> str:
+        return _normalize(text, self.lowercase)
+
+    def fit_tokenizer(self, texts: Iterable[str]) -> TransformerTokenizer:
+        cleaned = [self.clean_text(text) for text in texts]
+        self.tokenizer = TransformerTokenizer.build(
+            cleaned,
+            min_freq=self.min_freq,
+            lowercase=False,
+        )
+        return self.tokenizer
+
+    def encode(self, text: str, *, add_special_tokens: bool = True) -> List[int]:
+        if self.tokenizer is None:
+            raise RuntimeError("Tokenizer not fitted")
+        cleaned = self.clean_text(text)
+        return self.tokenizer.encode(cleaned, add_special_tokens=add_special_tokens, max_length=self.max_length)
+
+    def batch_encode(self, texts: Sequence[str]) -> Batch:
+        if self.tokenizer is None:
+            raise RuntimeError("Tokenizer not fitted")
+        sequences = [self.encode(text) for text in texts]
+        lengths = [len(seq) for seq in sequences]
+        input_ids, attention_mask = self.tokenizer.pad_batch(sequences, pad_to_length=self.max_length)
+        return Batch(input_ids=input_ids, attention_mask=attention_mask, lengths=lengths)
+
+    def build_encoder(self, **encoder_kwargs) -> TransformerEncoder:
+        if self.tokenizer is None:
+            raise RuntimeError("Tokenizer not fitted")
+        return TransformerEncoder(
+            vocab_size=self.tokenizer.vocab_size,
+            max_len=self.max_length,
+            pad_token_id=self.tokenizer.pad_id,
+            **encoder_kwargs,
+        )
+
+    def build_decoder(self, **decoder_kwargs) -> TransformerDecoder:
+        if self.tokenizer is None:
+            raise RuntimeError("Tokenizer not fitted")
+        return TransformerDecoder(
+            vocab_size=self.tokenizer.vocab_size,
+            max_len=self.max_length,
+            pad_token_id=self.tokenizer.pad_id,
+            **decoder_kwargs,
+        )
+
+    def save_tokenizer(self, path: Path) -> None:
+        if self.tokenizer is None:
+            raise RuntimeError("Tokenizer not fitted")
+        self.tokenizer.save(path)
+
+    def load_tokenizer(self, path: Path) -> TransformerTokenizer:
+        self.tokenizer = TransformerTokenizer.load(path)
+        return self.tokenizer
+
+    def chunk_text(self, text: str, *, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
+        if chunk_size <= overlap:
+            raise ValueError("chunk_size must be larger than overlap")
+        words = self.clean_text(text).split()
+        chunks: List[str] = []
+        start = 0
+        while start < len(words):
+            end = min(start + chunk_size, len(words))
+            chunks.append(" ".join(words[start:end]))
+            start += chunk_size - overlap
+        return chunks
+
+    def save_book_chunks(
+        self,
+        input_path: Path,
+        out_dir: Path,
+        *,
+        chunk_size: int = 1000,
+        overlap: int = 100,
+    ) -> Path:
+        out_dir.mkdir(parents=True, exist_ok=True)
+        raw_text = input_path.read_text(encoding="utf-8", errors="ignore")
+        chunks = self.chunk_text(raw_text, chunk_size=chunk_size, overlap=overlap)
+        out_file = out_dir / f"{input_path.stem}.json"
+        out_file.write_text(json.dumps(chunks, ensure_ascii=False, indent=2), encoding="utf-8")
+        return out_file
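A round-trip sketch of the new tokenizer/preprocessor pair (the two-sentence corpus is made up; everything else uses only the API defined above):

    from pathlib import Path
    from src.data.preprocessing import TextPreprocessor

    pre = TextPreprocessor(max_length=32, min_freq=1, lowercase=True)
    pre.fit_tokenizer(["The quick brown fox jumps.", "A lazy dog sleeps, soundly."])

    ids = pre.encode("the quick dog")      # [bos, ..., eos], truncated to max_length
    batch = pre.batch_encode(["the quick dog", "a fox"])
    print(batch.input_ids.shape)           # torch.Size([2, 32]); padded to max_length
    print(batch.lengths)                   # unpadded lengths, special tokens included

    print(pre.tokenizer.decode(ids))       # "the quick dog" (specials skipped)

    pre.save_tokenizer(Path("artifacts/tokenizer.json"))
    pre.load_tokenizer(Path("artifacts/tokenizer.json"))  # same vocabulary restored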
src/inference/__init__.py
CHANGED
@@ -0,0 +1,7 @@
+"""
+Inference utilities for LexiMind.
+"""
+
+from .baseline_summarizer import Summarizer, TransformerSummarizer
+
+__all__ = ["Summarizer", "TransformerSummarizer"]
src/inference/baseline_summarizer.py
CHANGED
@@ -1,222 +1,41 @@
-
-import json
-from typing import Any, List, Dict, Optional
-import torch
-from torch.utils.data import Dataset, DataLoader
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-
-class Summarizer:
-    def __init__(self, model_name: str = "t5-small", max_input: int = 512, max_output: int = 128, device: Optional[str] = None):
-        self.model_name = model_name
-        self.max_input = max_input
-        self.max_output = max_output
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-        self.device = torch.device(device) if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model.to(self.device)
-
-    def load_data(self, split: str = "train", limit: Optional[int] = None) -> List[Dict[str, str]]:
-        """
-        Load processed summarization data from JSON files.
-
-        Args:
-            split (str): Data split to load ('train', 'val', 'test')
-            limit (int): Maximum number of samples to load (None for all)
-
-        Returns:
-            list: List of dictionaries with 'article' and 'summary' keys
-        """
-        # Resolve to project root regardless of current working directory
-        root = os.path.dirname(os.path.dirname(__file__))
-        file_path = os.path.join(root, "data", "processed", "summarization", f"{split}.json")
-
-        if not os.path.exists(file_path):
-            raise FileNotFoundError(f"Data file not found: {file_path}")
-
-        with open(file_path, "r", encoding="utf-8") as f:
-            data = json.load(f)
-
-        if limit:
-            data = data[:limit]
-        return data
-
-    def encode(self, articles: List[str] | str, summaries: Optional[List[str] | str] = None):
-        if isinstance(articles, str):
-            articles = [articles]
-        if summaries is not None and isinstance(summaries, str):
-            summaries = [summaries]
-
-        inputs = self.tokenizer(
-            [f"summarize: {a}" for a in articles],
-            max_length=self.max_input,
-            truncation=True,
-            padding="max_length",
-            return_tensors="pt"
-        )
-
-        result = {
-            "input_ids": inputs.input_ids.to(self.device),
-            "attention_mask": inputs.attention_mask.to(self.device)
-        }
-
-        if summaries is not None:
-            labels = self.tokenizer(
-                summaries,
-                max_length=self.max_output,
-                truncation=True,
-                padding="max_length",
-                return_tensors="pt"
-            ).input_ids
-            # Mask pad tokens in labels with -100 for loss
-            labels[labels == self.tokenizer.pad_token_id] = -100
-            result["labels"] = labels.to(self.device)
-        return result
-
-    def train(self, epochs: int = 3, batch_size: int = 4, train_limit: int = 2000, val_limit: int = 500, learning_rate: float = 5e-5):
-        train_data = self.load_data("train", limit=train_limit)
-        val_data = self.load_data("val", limit=val_limit)
-
-        train_ds = _SummarizationDataset(train_data, self.tokenizer, self.max_input, self.max_output)
-        val_ds = _SummarizationDataset(val_data, self.tokenizer, self.max_input, self.max_output) if val_data else None
-        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
-        val_loader = DataLoader(val_ds, batch_size=batch_size) if val_ds else None
-
-        optim = torch.optim.AdamW(self.model.parameters(), lr=learning_rate)
- …
-        if not text.strip():
-            return ""
-        inputs = self.tokenizer(
-            f"summarize: {text}",
-            return_tensors="pt",
-            max_length=self.max_input,
-            truncation=True,
-            padding=True
-        )
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-        with torch.no_grad():
-            summary_ids = self.model.generate(
-                inputs["input_ids"],
-                attention_mask=inputs.get("attention_mask"),
-                max_length=max_length or self.max_output,
-                num_beams=num_beams,
-                length_penalty=2.0,
-                early_stopping=True
-            )
-        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()
-
-    def save(self, path: str = "models/summarizer"):
-        """
-        Save the trained model and tokenizer.
-
-        Args:
-            path (str): Directory path to save the model
-        """
-        os.makedirs(path, exist_ok=True)
-        self.model.save_pretrained(path)
-        self.tokenizer.save_pretrained(path)
-
-    @classmethod
-    def load(cls, path: str = "models/summarizer"):
-        """
-        Load a pre-trained model from disk.
-
-        Args:
-            path (str): Directory path containing the saved model
-
-        Returns:
-            Summarizer: Loaded summarizer instance
-        """
-        obj = cls.__new__(cls)
-        obj.model_name = path
-        obj.max_input = 512
-        obj.max_output = 128
-        obj.tokenizer = AutoTokenizer.from_pretrained(path)
-        obj.model = AutoModelForSeq2SeqLM.from_pretrained(path)
-        obj.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        obj.model.to(obj.device)
-        return obj
-
-class _SummarizationDataset(Dataset):
-    def __init__(self, data: List[Dict[str, str]], tokenizer: Any, max_input: int, max_output: int):
-        self.data = data
-        self.tokenizer = tokenizer
-        self.max_input = max_input
-        self.max_output = max_output
-
-    def __len__(self):
-        return len(self.data)
-
-    def __getitem__(self, idx: int):
-        item = self.data[idx]
-        inputs = self.tokenizer(
-            f"summarize: {item['article']}",
-            max_length=self.max_input,
-            truncation=True,
-            padding="max_length",
-            return_tensors="pt"
-        )
-        labels = self.tokenizer(
-            item['summary'],
-            max_length=self.max_output,
-            truncation=True,
-            padding="max_length",
-            return_tensors="pt"
-        ).input_ids
-        labels[labels == self.tokenizer.pad_token_id] = -100
-        return {
-            "input_ids": inputs.input_ids.squeeze(0),
-            "attention_mask": inputs.attention_mask.squeeze(0),
-            "labels": labels.squeeze(0),
-        }
-
-if __name__ == "__main__":
-    print("Initializing summarizer...", flush=True)
-    summarizer = Summarizer(model_name="t5-small")
-    print("Starting a short training run...", flush=True)
-    summarizer.train(epochs=3, batch_size=2, train_limit=100, val_limit=50)
-    test_text = (
-        "The quick brown fox jumps over the lazy dog. This is a common "
-        "pangram used in typography and printing. It contains every letter of the "
-        "alphabet at least once, making it useful for testing fonts and keyboards."
-    )
-    print("Generating summary...", flush=True)
-    summary = summarizer.summarize(test_text)
-    print(f"\nOriginal text: {test_text}")
-    print(f"Summary: {summary}")
-    summarizer.save()
+"""Thin wrapper around the custom transformer summarizer."""
+
+from __future__ import annotations
+from typing import Any, Dict, Optional, Tuple
+import torch
+from ..api.inference import load_models
+
+
+class TransformerSummarizer:
+    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
+        models = load_models(config or {})
+        if not models.get("loaded"):
+            raise RuntimeError("load_models returned an unloaded model; check configuration")
+        self.model = models["mt"]
+        self.preprocessor = models["preprocessor"]
+        self.device = models["device"]
+
+    def summarize(
+        self,
+        text: str,
+        compression: float = 0.25,
+        collect_attn: bool = False,
+    ) -> Tuple[str, Optional[Dict[str, torch.Tensor]]]:
+        batch = self.preprocessor.batch_encode([text])
+        tokenizer = self.preprocessor.tokenizer
+        encoder = self.model.encoder
+        decoder = self.model.decoder
+        if tokenizer is None or encoder is None or decoder is None:
+            raise RuntimeError("Model components are missing; ensure encoder, decoder, and tokenizer are set")
+        input_ids = batch.input_ids.to(self.device)
+        memory = encoder(input_ids)
+        src_len = batch.lengths[0]
+        target_len = max(4, int(src_len * compression))
+        generated = decoder.greedy_decode(
+            memory,
+            max_len=min(self.preprocessor.max_length, target_len),
+            start_token_id=tokenizer.bos_id,
+            end_token_id=tokenizer.eos_id,
+        )
+        summary = tokenizer.decode(generated[0].tolist(), skip_special_tokens=True)
+        return summary.strip(), None if not collect_attn else {}
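The wrapper keeps a one-call interface; a sketch with the same placeholder artifact paths used elsewhere in this commit:

    from src.inference.baseline_summarizer import TransformerSummarizer

    summarizer = TransformerSummarizer({
        "tokenizer_path": "artifacts/tokenizer.json",
        "checkpoint_path": "checkpoints/best.pt",
    })
    summary, _ = summarizer.summarize("A long article to condense...", compression=0.2)
    print(summary)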
src/models/__init__.py
CHANGED
@@ -0,0 +1,33 @@
+"""
+LexiMind custom transformer models.
+
+This package provides a from-scratch transformer implementation with:
+- TransformerEncoder/TransformerDecoder
+- MultiHeadAttention, FeedForward, PositionalEncoding
+- Task heads: ClassificationHead, TokenClassificationHead, LMHead
+- MultiTaskModel: composable wrapper for encoder/decoder + task heads
+"""
+
+from .encoder import TransformerEncoder, TransformerEncoderLayer
+from .decoder import TransformerDecoder, TransformerDecoderLayer, create_causal_mask
+from .attention import MultiHeadAttention
+from .feedforward import FeedForward
+from .positional_encoding import PositionalEncoding
+from .heads import ClassificationHead, TokenClassificationHead, LMHead, ProjectionHead
+from .multitask import MultiTaskModel
+
+__all__ = [
+    "TransformerEncoder",
+    "TransformerEncoderLayer",
+    "TransformerDecoder",
+    "TransformerDecoderLayer",
+    "create_causal_mask",
+    "MultiHeadAttention",
+    "FeedForward",
+    "PositionalEncoding",
+    "ClassificationHead",
+    "TokenClassificationHead",
+    "LMHead",
+    "ProjectionHead",
+    "MultiTaskModel",
+]
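A composition sketch for the package (assumption: the encoder/decoder constructors default every hyperparameter other than the three that TextPreprocessor.build_encoder/build_decoder inject):

    from src.data.preprocessing import TextPreprocessor
    from src.models import MultiTaskModel

    pre = TextPreprocessor(max_length=128)
    pre.fit_tokenizer(["a tiny toy corpus", "just enough text to build a vocabulary"])

    # build_encoder/build_decoder size the models to the fitted vocabulary and
    # forward any extra keyword arguments straight to the constructors.
    model = MultiTaskModel(encoder=pre.build_encoder(), decoder=pre.build_decoder())

    batch = pre.batch_encode(["a toy input"])
    memory = model.encoder(batch.input_ids)  # contextual token representations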
src/ui/streamlit_app.py
ADDED
@@ -0,0 +1,108 @@
+"""
+Streamlit prototype for LexiMind (summarization, emotion, topic).
+Run from repo root: streamlit run src/ui/streamlit_app.py
+"""
+import streamlit as st
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.figure_factory as ff
+
+# Stable absolute import; ensure the repo root is on PYTHONPATH (running from the repo root is standard)
+try:
+    from src.api.inference import load_models, summarize_text, classify_emotion, topic_for_text
+except Exception as e:
+    st.error(f"Failed to import inference helpers: {e}")
+    raise
+
+st.set_page_config(page_title="LexiMind demo", layout="wide")
+
+MODEL_CONFIG = {
+    "checkpoint_path": "checkpoints/best.pt",      # change to your trained checkpoint
+    "tokenizer_path": "artifacts/tokenizer.json",  # JSON produced by TextPreprocessor.save_tokenizer
+    "device": "cpu",
+}
+try:
+    models = load_models(MODEL_CONFIG)
+except Exception as exc:
+    st.error(f"Failed to load models: {exc}")
+    st.stop()
+
+st.sidebar.title("LexiMind")
+task = st.sidebar.selectbox("Task", ["Summarize", "Emotion", "Topic", "Search demo"])
+compression = st.sidebar.slider("Compression (summary length)", 0.1, 1.0, 0.25)
+show_attn = st.sidebar.checkbox("Show attention heatmap (collect_attn)", value=False)
+
+st.sidebar.markdown("Demo controls")
+sample_choice = st.sidebar.selectbox("Use sample text", ["None", "Gutenberg sample", "News sample"])
+
+SAMPLES = {
+    "Gutenberg sample": (
+        "It was the best of times, it was the worst of times, it was the age of wisdom, "
+        "it was the age of foolishness..."
+    ),
+    "News sample": (
+        "Markets rallied today as tech stocks posted gains amid broad optimism over earnings..."
+    ),
+}
+
+st.title("LexiMind — Summarization, Emotion, Topic (Prototype)")
+
+if sample_choice != "None":
+    input_text = st.text_area("Input text", value=SAMPLES[sample_choice], height=280)
+else:
+    input_text = st.text_area("Input text", value="", height=280)
+
+col1, col2 = st.columns([2, 1])
+
+with col1:
+    st.subheader("Output")
+    if st.button("Run"):
+        if not input_text.strip():
+            st.warning("Enter some text or select a sample to run the model.")
+        else:
+            if task == "Summarize":
+                summary, attn_data = summarize_text(input_text, compression=compression, collect_attn=show_attn, models=models)
+                st.markdown("**Summary**")
+                st.write(summary)
+                if show_attn and attn_data is not None:
+                    st.markdown("**Attention heatmap (averaged heads)**")
+                    src_tokens = attn_data.get("src_tokens", None)
+                    tgt_tokens = attn_data.get("tgt_tokens", None)
+                    weights = attn_data.get("weights", None)
+                    if weights is not None:
+                        arr = np.array(weights)
+                        if arr.ndim == 4:
+                            arr = arr.mean(axis=(0, 1))
+                        elif arr.ndim == 3:
+                            arr = arr.mean(axis=0)
+                        fig = ff.create_annotated_heatmap(
+                            z=arr.tolist(),
+                            x=src_tokens if src_tokens else [f"tok{i}" for i in range(arr.shape[1])],
+                            y=tgt_tokens if tgt_tokens else [f"tok{i}" for i in range(arr.shape[0])],
+                            colorscale="Viridis",
+                        )
+                        st.plotly_chart(fig, use_container_width=True)
+                    else:
+                        st.info("Attention data not available from the model.")
+            elif task == "Emotion":
+                probs, labels = classify_emotion(input_text, models=models)
+                st.markdown("**Emotion predictions (multi-label probabilities)**")
+                df = pd.DataFrame({"emotion": labels, "prob": probs})
+                fig = px.bar(df, x="emotion", y="prob", color="prob", range_y=[0, 1])
+                st.plotly_chart(fig, use_container_width=True)
+            elif task == "Topic":
+                topic_id, topic_terms = topic_for_text(input_text, models=models)
+                st.markdown("**Topic cluster**")
+                st.write(f"Cluster ID: {topic_id}")
+                st.write("Top terms:", ", ".join(topic_terms))
+            elif task == "Search demo":
+                st.info("Search demo will be available when ingestion is run (see scripts).")
+
+with col2:
+    st.subheader("Model & Info")
+    st.markdown(f"*Model loaded:* {'yes' if models.get('loaded', False) else 'no'}")
+    st.markdown(f"*Device:* {models.get('device', MODEL_CONFIG['device'])}")
+    st.markdown("**Notes**")
+    st.markdown("- Attention visualization depends on model support to return attention.")
+    st.markdown("- For long inputs the UI truncates tokens for heatmap clarity.")