OliverPerrin committed
Commit: ea3248a
Parent: 7aaf14d

Summarization fix

debug_heads.py ADDED
@@ -0,0 +1,60 @@
+
+import sys
+from pathlib import Path
+import torch
+import logging
+
+# Add project root to path
+PROJECT_ROOT = Path(__file__).resolve().parent
+sys.path.insert(0, str(PROJECT_ROOT))
+
+from src.models.factory import ModelConfig
+from src.data.tokenization import Tokenizer, TokenizerConfig
+from src.models.factory import build_multitask_model
+from src.utils.io import load_state
+from src.utils.labels import load_label_metadata
+from src.inference.pipeline import InferencePipeline, InferenceConfig
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def debug_pipeline():
+    labels = load_label_metadata("artifacts/labels.json")
+    tokenizer = Tokenizer(TokenizerConfig(pretrained_model_name="artifacts/hf_tokenizer"))
+
+    for heads in [4, 8, 16]:
+        print(f"\n============================================")
+        print(f"Testing num_heads={heads}")
+        print(f"============================================")
+        try:
+            cfg = ModelConfig(num_attention_heads=heads)
+            model = build_multitask_model(
+                tokenizer,
+                num_emotions=labels.emotion_size,
+                num_topics=labels.topic_size,
+                config=cfg,
+            )
+            load_state(model, "checkpoints/best.pt")
+
+            # Tie weights (as per my previous fix)
+            if hasattr(model.decoder, "output_projection") and hasattr(model.decoder, "embedding"):
+                model.decoder.output_projection.weight = model.decoder.embedding.weight
+
+            pipeline = InferencePipeline(
+                model=model,
+                tokenizer=tokenizer,
+                config=InferenceConfig(device="cpu"),
+                emotion_labels=labels.emotion,
+                topic_labels=labels.topic,
+                device="cpu"
+            )
+
+            text = "Artificial intelligence is rapidly transforming the technology landscape."
+            summary = pipeline.summarize([text], max_length=20)
+            print(f"Summary: '{summary[0]}'")
+
+        except Exception as e:
+            print(f"Error: {e}")
+
+if __name__ == "__main__":
+    debug_pipeline()
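
The one non-obvious step in debug_heads.py is the weight-tying assignment. Below is a minimal sketch of what that assignment does, using a hypothetical ToyDecoder; the `embedding` / `output_projection` attribute names come from the project, everything else is illustrative.

# Minimal sketch of the weight tying done in debug_heads.py, on a toy
# decoder-like module (an assumption; the real decoder is the project's own).
import torch
import torch.nn as nn

class ToyDecoder(nn.Module):
    def __init__(self, vocab_size: int = 100, d_model: int = 32):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.output_projection = nn.Linear(d_model, vocab_size, bias=False)

decoder = ToyDecoder()
# Before tying, the two matrices are independent parameters.
print(decoder.output_projection.weight is decoder.embedding.weight)  # False

# Tie them: the output projection now reuses the embedding matrix, so a
# checkpoint trained with tied weights projects hidden states consistently.
decoder.output_projection.weight = decoder.embedding.weight
print(decoder.output_projection.weight is decoder.embedding.weight)  # True

This works because nn.Linear stores its weight as (out_features, in_features), so the (vocab_size, d_model) embedding matrix can be reused directly as the output projection.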
debug_summarization.py ADDED
@@ -0,0 +1,84 @@
+
+import sys
+from pathlib import Path
+import torch
+import logging
+
+# Add project root to path
+PROJECT_ROOT = Path(__file__).resolve().parent
+sys.path.insert(0, str(PROJECT_ROOT))
+
+from src.inference.factory import create_inference_pipeline
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def debug_pipeline():
+    print("Loading pipeline...")
+    pipeline, _ = create_inference_pipeline(
+        tokenizer_dir="artifacts/hf_tokenizer/",
+        checkpoint_path="checkpoints/best.pt",
+        labels_path="artifacts/labels.json",
+    )
+
+    tokenizer = pipeline.tokenizer
+    print(f"BOS ID: {tokenizer.bos_token_id}")
+    print(f"EOS ID: {tokenizer.eos_token_id}")
+    print(f"PAD ID: {tokenizer.pad_token_id}")
+
+    text = "Artificial intelligence is rapidly transforming the technology landscape."
+
+    print("\n--- Input Analysis ---")
+    encoded = tokenizer.encode(text)
+    print(f"Encoded input: {encoded}")
+    print(f"Decoded input: {tokenizer.decode(encoded)}")
+
+    print("\n--- Model Generation Debug ---")
+    # Manually run the summarization steps
+    batch = pipeline.preprocessor.batch_encode([text])
+    batch = pipeline._batch_to_device(batch)
+
+    src_ids = batch.input_ids
+    src_mask = batch.attention_mask
+
+    print(f"Source IDs shape: {src_ids.shape}")
+    print(f"Source IDs: {src_ids}")
+
+    with torch.inference_mode():
+        encoder_mask = src_mask.unsqueeze(1) & src_mask.unsqueeze(2) if src_mask is not None else None
+        memory = pipeline.model.encoder(src_ids, mask=encoder_mask)
+
+        # Try decoding with BOS as start
+        print("\n--- Decoding with BOS start ---")
+        generated_bos = pipeline.model.decoder.greedy_decode(
+            memory=memory,
+            max_len=20,
+            start_token_id=tokenizer.bos_token_id,
+            end_token_id=tokenizer.eos_token_id,
+            device=pipeline.device,
+            min_len=0
+        )
+        print(f"Generated IDs (BOS start): {generated_bos.tolist()}")
+        print(f"Decoded (BOS start): {tokenizer.decode_batch(generated_bos.tolist())}")
+
+        # Try decoding with [BOS, FirstContentToken] start
+        print("\n--- Decoding with [BOS, FirstContentToken] start ---")
+        bos_id = tokenizer.bos_token_id
+        first_content_id = src_ids[0, 1]  # Skip BOS in input
+        print(f"First content token ID: {first_content_id} ({tokenizer.decode([first_content_id])})")
+
+        generated = torch.tensor([[bos_id, first_content_id]], dtype=torch.long, device=pipeline.device)
+
+        for _ in range(20):
+            logits = pipeline.model.decoder.forward(generated, memory, collect_attn=False)
+            next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
+            generated = torch.cat([generated, next_token], dim=1)
+            if next_token.item() == tokenizer.eos_token_id:
+                break
+
+        print(f"Generated IDs ([BOS, Content] start): {generated.tolist()}")
+        print(f"Decoded ([BOS, Content] start): {tokenizer.decode_batch(generated.tolist())}")
+
+
+if __name__ == "__main__":
+    debug_pipeline()
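
The encoder_mask expression in this script is easy to misread. Here is a small self-contained sketch of how the (B, L) padding mask broadcasts into a (B, L, L) attention mask; the shapes are toy values, and the project's encoder is assumed to accept a boolean mask of this form.

# Sketch of the encoder_mask construction: position (i, j) is kept only when
# both query i and key j are real (non-pad) tokens.
import torch

src_mask = torch.tensor([[1, 1, 1, 0]], dtype=torch.bool)     # (B=1, L=4), last token is padding
encoder_mask = src_mask.unsqueeze(1) & src_mask.unsqueeze(2)  # (1, L, L)
print(encoder_mask.int())
# tensor([[[1, 1, 1, 0],
#          [1, 1, 1, 0],
#          [1, 1, 1, 0],
#          [0, 0, 0, 0]]], dtype=torch.int32)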
inspect_checkpoint.py ADDED
@@ -0,0 +1,33 @@
+
+import torch
+import sys
+from pathlib import Path
+
+def inspect_checkpoint():
+    path = "checkpoints/best.pt"
+    print(f"Loading {path}...")
+    try:
+        state_dict = torch.load(path, map_location="cpu", weights_only=True)
+        print(f"Keys found: {len(state_dict)}")
+
+        print("\n--- Head Keys ---")
+        head_keys = [k for k in state_dict.keys() if "head" in k]
+        for k in sorted(head_keys):
+            print(k)
+
+        print("\n--- Decoder Keys (Sample) ---")
+        decoder_keys = [k for k in state_dict.keys() if "decoder" in k][:10]
+        for k in sorted(decoder_keys):
+            print(k)
+
+        print("\n--- Checking for Cross Attention ---")
+        if "decoder.layers.0.cross_attn.W_Q.weight" in state_dict:
+            print("Found decoder.layers.0.cross_attn.W_Q.weight")
+        else:
+            print("MISSING decoder.layers.0.cross_attn.W_Q.weight")
+
+    except Exception as e:
+        print(f"Failed to load: {e}")

+if __name__ == "__main__":
+    inspect_checkpoint()
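
For reference, the same key-filtering pattern on a toy model, under the assumption that checkpoints/best.pt is a flat state_dict saved via torch.save(model.state_dict(), ...), which is what weights_only=True expects.

# Self-contained illustration of the inspection pattern on a toy ModuleDict.
import torch
import torch.nn as nn

model = nn.ModuleDict({
    "decoder": nn.Linear(8, 8),
    "summarization_head": nn.Linear(8, 4),
})
torch.save(model.state_dict(), "toy_checkpoint.pt")

state_dict = torch.load("toy_checkpoint.pt", map_location="cpu", weights_only=True)
print(sorted(k for k in state_dict if "head" in k))
# ['summarization_head.bias', 'summarization_head.weight']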
src/inference/pipeline.py CHANGED
@@ -77,6 +77,16 @@ class InferencePipeline:
         memory = self.model.encoder(src_ids, mask=encoder_mask)
         # Force a minimum length to prevent immediate EOS
         min_len = 10
+
+        # Ban BOS, PAD, UNK from being generated
+        ban_token_ids = [
+            self.tokenizer.bos_token_id,
+            self.tokenizer.pad_token_id,
+            self.tokenizer.tokenizer.unk_token_id
+        ]
+        # Filter out None values just in case
+        ban_token_ids = [tid for tid in ban_token_ids if tid is not None]
+
         generated = self.model.decoder.greedy_decode(
             memory=memory,
             max_len=max_len,
@@ -84,6 +94,8 @@
             end_token_id=self.tokenizer.eos_token_id,
             device=self.device,
             min_len=min_len,
+            ban_token_ids=ban_token_ids,
+            no_repeat_ngram_size=3,
         )

         # Post-process to remove repetition if detected
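
A sketch of what the new ban_token_ids argument does downstream in greedy decoding: the banned columns of the next-step logits are pushed to -inf before argmax, so those ids can never be emitted. Toy logits below; the real ids come from the pipeline's tokenizer.

import torch

next_step_logits = torch.tensor([[2.5, 0.1, 3.0, 1.2]])  # (B=1, V=4)
ban_token_ids = [0, 2]                                    # e.g. BOS and PAD ids

next_step_logits = next_step_logits.clone()
next_step_logits[:, ban_token_ids] = float("-inf")
print(next_step_logits.argmax(dim=-1))  # tensor([3]) -- the best *allowed* token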
src/models/decoder.py CHANGED
@@ -221,6 +221,8 @@ class TransformerDecoder(nn.Module):
         device: Optional[torch.device] = None,
         *,
         min_len: Optional[int] = None,
+        ban_token_ids: Optional[List[int]] = None,
+        no_repeat_ngram_size: int = 0,
     ) -> torch.Tensor:
         """
         Naive greedy decoding: repeatedly run the decoder on the growing prefix.
@@ -237,9 +239,52 @@ class TransformerDecoder(nn.Module):
             logits = self.forward(generated, memory, collect_attn=False)  # (B, L, V)
             assert isinstance(logits, torch.Tensor)  # type narrowing
             next_step_logits = logits[:, -1, :]
+
+            # Apply constraints (min_len or ban_token_ids)
+            should_clone = False
             if end_token_id is not None and generated.size(1) < max(1, min_len):
+                should_clone = True
+            if ban_token_ids:
+                should_clone = True
+
+            # Check for n-gram repetition
+            if no_repeat_ngram_size > 0:
+                # We might need to clone if we find something to ban
+                pass
+
+            if should_clone:
                 next_step_logits = next_step_logits.clone()
+
+            if end_token_id is not None and generated.size(1) < max(1, min_len):
                 next_step_logits[:, end_token_id] = float("-inf")
+
+            if ban_token_ids:
+                next_step_logits[:, ban_token_ids] = float("-inf")
+
+            if no_repeat_ngram_size > 0:
+                # Calculate banned tokens based on n-grams
+                for b in range(B):
+                    gen_seq = generated[b].tolist()
+                    if len(gen_seq) < no_repeat_ngram_size - 1:
+                        continue
+
+                    prefix = tuple(gen_seq[-(no_repeat_ngram_size - 1):])
+                    banned_for_this_batch = set()
+
+                    # Scan history for prefix
+                    for i in range(len(gen_seq) - no_repeat_ngram_size + 1):
+                        window = tuple(gen_seq[i : i + no_repeat_ngram_size - 1])
+                        if window == prefix:
+                            # The token that followed this instance of the prefix
+                            if i + no_repeat_ngram_size - 1 < len(gen_seq):
+                                banned_for_this_batch.add(gen_seq[i + no_repeat_ngram_size - 1])
+
+                    if banned_for_this_batch:
+                        if not should_clone:
+                            next_step_logits = next_step_logits.clone()
+                            should_clone = True
+                        next_step_logits[b, list(banned_for_this_batch)] = float("-inf")
+
             next_token = next_step_logits.argmax(dim=-1, keepdim=True)  # (B, 1)
             generated = torch.cat([generated, next_token], dim=1)
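
The no-repeat-ngram rule added to greedy_decode is the densest part of this commit. Below is a plain-Python restatement of the same rule, handy for sanity-checking it without a model; the function name and signature are illustrative, not part of the project.

# Given the tokens generated so far, return the ids that would complete an
# n-gram that has already appeared in the sequence.
def banned_next_tokens(gen_seq: list[int], no_repeat_ngram_size: int) -> set[int]:
    n = no_repeat_ngram_size
    if n <= 0 or len(gen_seq) < n - 1:
        return set()
    prefix = tuple(gen_seq[-(n - 1):])          # last (n-1) tokens
    banned: set[int] = set()
    for i in range(len(gen_seq) - n + 1):
        if tuple(gen_seq[i : i + n - 1]) == prefix:
            banned.add(gen_seq[i + n - 1])      # token that followed this prefix earlier
    return banned

# With no_repeat_ngram_size=3 and the history "7 8 9 ... 7 8", token 9 must be
# banned next, because the trigram (7, 8, 9) has already occurred.
print(banned_next_tokens([7, 8, 9, 4, 7, 8], 3))  # {9}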