diabolic6045 committed
Commit eb05668 · verified · 1 Parent(s): 4b0778b

Upload 9 files
src/convert_to_hf.py ADDED
@@ -0,0 +1,184 @@
+ #!/usr/bin/env python
+ import os
+ import argparse
+ import yaml
+ import json
+ import torch
+ import shutil
+ import tiktoken
+ from model import create_model  # your model creation function from model.py
+
+ def load_config(config_path):
+     """Load the training configuration from a YAML file."""
+     with open(config_path, "r") as f:
+         return yaml.safe_load(f)
+
+ def load_tokenizer_encoding(tokenizer_dir):
+     """Reads the encoding name from encoding_config.txt in your tokenizer directory."""
+     encoding_config_path = os.path.join(tokenizer_dir, "encoding_config.txt")
+     if not os.path.exists(encoding_config_path):
+         raise FileNotFoundError(f"Encoding config not found at {encoding_config_path}")
+
+     with open(encoding_config_path, "r") as f:
+         content = f.read().strip()
+     # Expect a line like: "encoding_name: cl100k_base"
+     if ":" not in content:
+         raise ValueError(f"Invalid encoding config format: {content}")
+
+     _, encoding_name = content.split(":", 1)
+     return encoding_name.strip()
+
+ def get_tokenizer(encoding_name):
+     """Initialize tiktoken encoding."""
+     tokenizer = tiktoken.get_encoding(encoding_name)
+     return tokenizer
+
+ def load_state_dict(checkpoint_dir):
+     """
+     Loads the model state dict from a DeepSpeed checkpoint.
+     First tries to load a consolidated checkpoint, then attempts to convert from ZeRO format.
+     """
+     # First try loading from converted_model directory
+     converted_path = os.path.join(checkpoint_dir, "converted_model", "pytorch_model.bin")
+     if os.path.exists(converted_path):
+         print(f"Loading converted checkpoint from {converted_path}")
+         state_dict = torch.load(converted_path, map_location="cpu")
+
+         # Remove "_orig_mod." prefix from keys if present
+         if all(k.startswith("_orig_mod.") for k in state_dict.keys()):
+             print("Removing '_orig_mod.' prefix from state dict keys")
+             state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
+
+         return state_dict
+
+     # Try loading consolidated checkpoint from main directory
+     consolidated_path = os.path.join(checkpoint_dir, "pytorch_model.bin")
+     if os.path.exists(consolidated_path):
+         print(f"Loading consolidated checkpoint from {consolidated_path}")
+         state_dict = torch.load(consolidated_path, map_location="cpu")
+
+         # Remove "_orig_mod." prefix from keys if present
+         if all(k.startswith("_orig_mod.") for k in state_dict.keys()):
+             print("Removing '_orig_mod.' prefix from state dict keys")
+             state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
+
+         return state_dict
+
+     # If no consolidated checkpoint exists, try converting from ZeRO format
+     print("No consolidated checkpoint found. Converting from ZeRO format...")
+
+     # Import the zero_to_fp32 module from the checkpoint directory
+     import sys
+     sys.path.append(checkpoint_dir)
+     from zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+
+     try:
+         # Convert ZeRO checkpoint to consolidated checkpoint
+         state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, exclude_frozen_parameters=False)
+
+         if state_dict is None:
+             raise ValueError("Failed to convert ZeRO checkpoint")
+
+         # Remove "_orig_mod." prefix from keys if present
+         if all(k.startswith("_orig_mod.") for k in state_dict.keys()):
+             print("Removing '_orig_mod.' prefix from state dict keys")
+             state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
+
+         print("Successfully converted ZeRO checkpoint to consolidated format")
+         return state_dict
+
+     except Exception as e:
+         print(f"Error converting ZeRO checkpoint: {str(e)}")
+         raise
+
+ def convert_to_hf(checkpoint_dir, tokenizer_dir, config_path, output_dir):
+     # Load configurations
+     config = load_config(config_path)
+
+     # Set up tokenizer
+     encoding_name = load_tokenizer_encoding(tokenizer_dir)
+     tokenizer = get_tokenizer(encoding_name)
+     vocab_size = tokenizer.n_vocab
+     print(f"Using tokenizer encoding: {encoding_name} (vocab size: {vocab_size})")
+
+     # Update config with correct vocab size
+     config["model"]["vocab_size"] = vocab_size
+
+     # Create model and load weights
+     model = create_model(config)
+     state_dict = load_state_dict(checkpoint_dir)
+     model.load_state_dict(state_dict)
+     model.eval()
+
+     # Create output directory
+     os.makedirs(output_dir, exist_ok=True)
+
+     # 1. Save model weights
+     model_path = os.path.join(output_dir, "pytorch_model.bin")
+     torch.save(model.state_dict(), model_path)
+     print(f"Saved model weights to {model_path}")
+
+     # 2. Save model config
+     model_config = {
+         "architectures": ["CustomLanguageModel"],
+         "model_type": "custom-gpt",
+         "vocab_size": vocab_size,
+         "n_positions": config["model"]["n_positions"],
+         "n_embd": config["model"]["n_embd"],
+         "n_layer": config["model"]["n_layer"],
+         "n_head": config["model"]["n_head"],
+         "bos_token_id": None,
+         "eos_token_id": tokenizer.eot_token,
+         "tie_word_embeddings": True,
+         "gradient_checkpointing": config["model"].get("gradient_checkpointing", False)
+     }
+
+     hf_config_path = os.path.join(output_dir, "config.json")  # separate name to avoid shadowing the input config_path
+     with open(hf_config_path, "w") as f:
+         json.dump(model_config, f, indent=2)
+     print(f"Saved model config to {hf_config_path}")
+
+     # 3. Save tokenizer config
+     tokenizer_config = {
+         "model_type": "tiktoken",
+         "encoding_name": encoding_name,
+         "vocab_size": vocab_size,
+         "max_length": config["dataset"]["max_length"],
+         "padding_side": "right",
+         "truncation_side": "right",
+         "bos_token": "<|endoftext|>",
+         "eos_token": "<|endoftext|>",
+         "unk_token": "<|endoftext|>",
+         "pad_token": "<|endoftext|>"
+     }
+
+     tokenizer_config_path = os.path.join(output_dir, "tokenizer_config.json")
+     with open(tokenizer_config_path, "w") as f:
+         json.dump(tokenizer_config, f, indent=2)
+     print(f"Saved tokenizer config to {tokenizer_config_path}")
+
+     # 4. Copy tokenizer files
+     src_encoding_config = os.path.join(tokenizer_dir, "encoding_config.txt")
+     if os.path.exists(src_encoding_config):
+         dst_encoding_config = os.path.join(output_dir, "encoding_config.txt")
+         shutil.copy2(src_encoding_config, dst_encoding_config)
+         print(f"Copied encoding config to {dst_encoding_config}")
+
+     print(f"\nConversion complete! HuggingFace model saved to: {output_dir}")
+
+ def main():
+     parser = argparse.ArgumentParser(description="Convert DeepSpeed checkpoint to HuggingFace format")
+     parser.add_argument("--checkpoint_dir", required=True,
+                         help="Path to the checkpoint directory")
+     parser.add_argument("--tokenizer_dir", required=True,
+                         help="Path to the tokenizer directory")
+     parser.add_argument("--config", default="config/config.yaml",
+                         help="Path to the training config.yaml file")
+     parser.add_argument("--output_dir", required=True,
+                         help="Output directory for HuggingFace model")
+
+     args = parser.parse_args()
+     convert_to_hf(args.checkpoint_dir, args.tokenizer_dir, args.config, args.output_dir)
+
+ if __name__ == "__main__":
+     main()
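A minimal usage sketch for the converter above (not part of this commit); all paths are placeholders pointing at your own checkpoint, tiktoken tokenizer directory, and output location:

from convert_to_hf import convert_to_hf

# Illustrative paths only; the tokenizer_dir must contain encoding_config.txt.
convert_to_hf(
    checkpoint_dir="outputs/checkpoint-epoch1-step5000",
    tokenizer_dir="tokenizer/",
    config_path="config/config.yaml",
    output_dir="outputs/hf_model",
)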
src/data_pre_to_raw.py ADDED
@@ -0,0 +1,37 @@
+ import os
+ from tqdm import tqdm
+ from pathlib import Path
+
+ def convert_to_raw_text():
+     # Setup paths
+     processed_dir = Path("data/processed")
+     raw_dir = Path("data/raw")
+
+     # Create raw directory if it doesn't exist
+     raw_dir.mkdir(parents=True, exist_ok=True)
+
+     # Output file for combined raw text
+     output_file = raw_dir / "combined_raw_text.txt"
+
+     # Process all txt files in processed directory
+     processed_files = list(processed_dir.glob("*.txt"))
+
+     print(f"Found {len(processed_files)} files to process")
+
+     with open(output_file, 'w', encoding='utf-8') as outfile:
+         for proc_file in tqdm(processed_files, desc="Converting files"):
+             with open(proc_file, 'r', encoding='utf-8') as infile:
+                 for line in infile:
+                     # Skip metadata lines (starting with #)
+                     if not line.startswith('#'):
+                         # Only write non-empty lines
+                         line = line.strip()
+                         if line:
+                             outfile.write(line + '\n')
+
+ if __name__ == "__main__":
+     try:
+         convert_to_raw_text()
+         print("Successfully converted processed data to raw text")
+     except Exception as e:
+         print(f"Error during conversion: {e}")
src/data_processing.py ADDED
@@ -0,0 +1,86 @@
+ from datasets import load_dataset
+ from tqdm import tqdm
+ import os
+ from utils import load_config, setup_logging
+ import psutil  # For monitoring memory usage
+
+
+ def download_and_process_data(config):
+     """Downloads, preprocesses, and saves the dataset."""
+     setup_logging()
+
+     dataset_name = config["dataset"]["name"]
+     streaming = config["dataset"]["streaming"]
+     text_column = config["dataset"]["text_column"]
+     target_size_gb = config["dataset"]["target_size_gb"]
+     max_length = config["dataset"]["max_length"]
+     subset = config["dataset"]["subset"]
+
+     # Download dataset (streaming is essential for large datasets)
+     try:
+         dataset = load_dataset(dataset_name, subset, streaming=streaming)
+         if not streaming:
+             raise ValueError("Streaming must be True for large datasets like fineweb")
+     except Exception as e:
+         raise RuntimeError(f"Failed to download dataset: {e}. Check the dataset name, your internet connection, and HF login.") from e
+
+     # Filter data - removing the subset filter since it's specific to CC-MAIN
+     dataset = dataset["train"]  # Taking only train split
+
+     # Add basic quality filters
+     def quality_filter(example):
+         return (
+             example['text'] is not None and
+             len(example['text'].strip()) > 0 and
+             example['language'] == 'en' and  # Filter for English content
+             example['language_score'] >= 0.8  # High confidence in language detection
+         )
+
+     dataset = dataset.filter(quality_filter)
+
+     # Create output directory if it doesn't exist
+     output_dir = os.path.join("data", "processed")
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Process and save in chunks, monitoring data size
+     def process_and_save_chunk(chunk, chunk_num, total_bytes):
+         output_file = os.path.join(output_dir, f"processed_data_{chunk_num}.txt")
+
+         with open(output_file, "w", encoding="utf-8") as f:
+             for example in tqdm(chunk, desc=f"Processing chunk {chunk_num}"):
+                 text = example[text_column].strip()
+                 if text:
+                     # Add metadata as a comment before each text
+                     metadata = f"# ID: {example['id']} | URL: {example['url']} | Date: {example['date']}\n"
+                     f.write(metadata)
+                     f.write(text + "\n\n")  # Add extra newline for separation
+                     total_bytes += len(text.encode("utf-8")) + len(metadata.encode("utf-8"))
+         return total_bytes
+
+     chunk_num = 0
+     chunk = []
+     total_bytes_processed = 0
+     target_bytes = target_size_gb * (1024**3)  # Convert GB to bytes
+
+     for example in tqdm(dataset, desc="Processing and saving data"):
+         chunk.append(example)
+         if len(chunk) >= 10000:  # Adjust chunk size as needed
+             total_bytes_processed = process_and_save_chunk(chunk, chunk_num, total_bytes_processed)
+             chunk = []
+             chunk_num += 1
+             print(f"Processed: {total_bytes_processed / (1024**3):.2f} GB")
+
+             if total_bytes_processed >= target_bytes:
+                 print("Target data size reached.")
+                 break  # Stop processing
+
+     if chunk:
+         total_bytes_processed = process_and_save_chunk(chunk, chunk_num, total_bytes_processed)  # for remaining data
+
+     print(f"Data download and processing complete. Total processed size: {total_bytes_processed / (1024**3):.2f} GB")
+
+ if __name__ == "__main__":
+     config = load_config()
+     download_and_process_data(config)
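For reference, download_and_process_data reads these keys from the `dataset` section of config.yaml. The actual config file is not part of this commit, so the sketch below shows the dict load_config would return, with illustrative values only (the dataset id and subset are assumptions based on the fineweb mention in the script):

from data_processing import download_and_process_data

# Placeholder values; adjust to your own run.
config = {
    "dataset": {
        "name": "HuggingFaceFW/fineweb",  # assumed dataset id
        "subset": "sample-10BT",          # illustrative subset name
        "streaming": True,                # must be True per the check in the script
        "text_column": "text",
        "target_size_gb": 2.5,
        "max_length": 512,
    }
}
download_and_process_data(config)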
src/hf_inference.py ADDED
@@ -0,0 +1,101 @@
+ import torch
+ from transformers import PreTrainedModel, PretrainedConfig
+ from utils import load_config
+ from tokenization import get_tokenizer
+
+ class CustomConfig(PretrainedConfig):
+     """Configuration class for the custom language model."""
+     model_type = "custom_llm"
+
+     def __init__(
+         self,
+         vocab_size: int = 50000,
+         n_embd: int = 640,
+         n_head: int = 10,
+         n_layer: int = 12,
+         n_positions: int = 512,
+         tie_word_embeddings: bool = True,
+         **kwargs
+     ):
+         self.vocab_size = vocab_size
+         self.n_embd = n_embd
+         self.n_head = n_head
+         self.n_layer = n_layer
+         self.n_positions = n_positions
+         self.tie_word_embeddings = tie_word_embeddings
+         super().__init__(**kwargs)
+
+ def generate_text(
+     prompt: str,
+     model_path: str = "outputs/hf_model",
+     max_length: int = 200,
+     temperature: float = 0.8,
+     top_k: int = 50,
+     top_p: float = 0.9,
+     repetition_penalty: float = 1.2,
+     no_repeat_ngram_size: int = 3
+ ):
+     """Generate text using the model."""
+     # Load config and tokenizer
+     config = load_config()
+     tokenizer = get_tokenizer(config)
+
+     # Load model
+     from inference import CustomModelForCausalLM  # Import here to avoid circular imports
+     model = CustomModelForCausalLM.from_pretrained(model_path)
+
+     # Move model to GPU if available
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model = model.to(device)
+     model.eval()
+
+     # Encode prompt
+     encoded = tokenizer.batch_encode(
+         [prompt],
+         return_tensors="pt"
+     )
+     input_ids = encoded["input_ids"].to(device)
+
+     # Generate
+     with torch.no_grad():
+         output_ids = model.generate(
+             input_ids=input_ids,
+             max_length=max_length,
+             temperature=temperature,
+             top_k=top_k,
+             top_p=top_p,
+             repetition_penalty=repetition_penalty,
+             no_repeat_ngram_size=no_repeat_ngram_size
+         )
+
+     # Decode and return
+     generated_text = tokenizer.decode(output_ids[0].tolist())
+     return generated_text
+
+ if __name__ == "__main__":
+     # Example prompts to test
+     prompts = [
+         "Once upon a time",
+         "The meaning of life is",
+         "In the distant future",
+         "The best way to learn programming is",
+         "Today I learned that"
+     ]
+
+     print("\nGenerating text from multiple prompts:")
+     print("=" * 50)
+
+     for prompt in prompts:
+         generated_text = generate_text(
+             prompt=prompt,
+             max_length=200,
+             temperature=0.8,  # Adjust for creativity (higher = more creative)
+             top_k=50,  # Limit to top 50 tokens
+             top_p=0.9,  # Nucleus sampling threshold
+             repetition_penalty=1.2,  # Penalize repetition
+             no_repeat_ngram_size=3  # Prevent 3-gram repetition
+         )
+
+         print(f"\nPrompt: {prompt}")
+         print(f"Generated: {generated_text}")
+         print("-" * 50)
src/inference.py ADDED
@@ -0,0 +1,285 @@
+ import torch
+ from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig
+ from typing import Optional, Tuple, Union, List
+ import os
+ import json
+ from model import CustomLanguageModel
+ from utils import load_config
+ from tokenization import get_tokenizer
+ import torch.nn as nn
+
+ class CustomConfig(PretrainedConfig):
+     """Configuration class for the custom language model."""
+     model_type = "custom_llm"
+
+     def __init__(
+         self,
+         vocab_size: int = 50000,
+         n_embd: int = 768,
+         n_head: int = 12,
+         n_layer: int = 12,
+         n_positions: int = 2048,
+         tie_word_embeddings: bool = False,
+         **kwargs
+     ):
+         self.vocab_size = vocab_size
+         self.n_embd = n_embd
+         self.n_head = n_head
+         self.n_layer = n_layer
+         self.n_positions = n_positions
+         self.tie_word_embeddings = tie_word_embeddings
+         super().__init__(**kwargs)
+
+ class CustomModelForCausalLM(PreTrainedModel):
+     """Wrapper class to make the model compatible with Hugging Face's interface."""
+     config_class = CustomConfig
+     supports_gradient_checkpointing = True
+
+     def __init__(self, config):
+         super().__init__(config)
+         # Convert config to dictionary format expected by CustomLanguageModel
+         model_config = {
+             "model": {
+                 "vocab_size": config.vocab_size,
+                 "n_embd": config.n_embd,
+                 "n_head": config.n_head,
+                 "n_layer": config.n_layer,
+                 "n_positions": config.n_positions,
+             }
+         }
+         self.transformer = CustomLanguageModel(model_config)
+
+         # Explicitly create separate weights for lm_head
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+         # Initialize weights
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         **kwargs
+     ):
+         outputs = self.transformer(input_ids=input_ids, labels=labels)
+         return outputs
+
+     def get_input_embeddings(self):
+         return self.transformer.token_embedding
+
+     def set_input_embeddings(self, value):
+         self.transformer.token_embedding = value
+
+     def generate(
+         self,
+         input_ids: torch.LongTensor,
+         max_length: int = 100,
+         temperature: float = 1.0,
+         top_k: int = 50,
+         top_p: float = 0.9,
+         repetition_penalty: float = 1.2,
+         no_repeat_ngram_size: int = 3,
+         **kwargs
+     ):
+         """Enhanced generation method with better controls for repetition."""
+         self.eval()
+         current_ids = input_ids.clone()
+         batch_size = current_ids.shape[0]
+
+         # Get EOS token ID from tokenizer
+         eos_token_id = self.transformer.eos_token_id if hasattr(self.transformer, 'eos_token_id') else None
+
+         # Track generated tokens for repetition penalty
+         generated_tokens = current_ids.clone()
+
+         with torch.no_grad():
+             for _ in range(max_length - input_ids.size(1)):
+                 # Forward pass
+                 outputs = self.transformer(current_ids)
+                 logits = outputs["logits"][:, -1, :] / temperature
+
+                 # Apply repetition penalty
+                 if repetition_penalty != 1.0:
+                     for i in range(batch_size):
+                         for token in set(generated_tokens[i].tolist()):
+                             logits[i, token] /= repetition_penalty
+
+                 # Apply n-gram blocking
+                 if no_repeat_ngram_size > 0:
+                     # Get the last n-gram from the input
+                     for i in range(batch_size):
+                         ngram_size = min(no_repeat_ngram_size, len(generated_tokens[i]))
+                         if ngram_size > 0:
+                             ngrams = [tuple(generated_tokens[i, -j:].tolist()) for j in range(1, ngram_size + 1)]
+                             for ngram in ngrams:
+                                 for token_idx in range(len(generated_tokens[i]) - len(ngram) + 1):
+                                     if tuple(generated_tokens[i, token_idx:token_idx + len(ngram)].tolist()) == ngram:
+                                         if token_idx + len(ngram) < len(generated_tokens[i]):
+                                             next_token = generated_tokens[i, token_idx + len(ngram)]
+                                             logits[i, next_token] = float('-inf')
+
+                 # Apply top-k filtering
+                 if top_k > 0:
+                     indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+                     logits[indices_to_remove] = float('-inf')
+
+                 # Apply top-p (nucleus) filtering
+                 if top_p < 1.0:
+                     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+                     cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+                     sorted_indices_to_remove = cumulative_probs > top_p
+                     sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                     sorted_indices_to_remove[..., 0] = 0
+                     indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+                     logits[indices_to_remove] = float('-inf')
+
+                 # Sample from the filtered distribution
+                 probs = torch.softmax(logits, dim=-1)
+                 next_token = torch.multinomial(probs, num_samples=1)
+
+                 # Early stopping if EOS token is generated
+                 if eos_token_id is not None and (next_token == eos_token_id).any():
+                     break
+
+                 # Update generated sequence
+                 current_ids = torch.cat([current_ids, next_token], dim=1)
+                 generated_tokens = torch.cat([generated_tokens, next_token], dim=1)
+
+         return current_ids
+
+ def convert_to_hf_model(checkpoint_path: str, output_dir: str):
+     """Convert the custom model checkpoint to Hugging Face format using safetensors."""
+     # Load the original config and checkpoint
+     config = load_config()
+
+     # Get tokenizer and its vocab size
+     tokenizer = get_tokenizer(config)
+     vocab_size = tokenizer.get_vocab_size()
+
+     # Create HF config with the correct vocab size
+     hf_config = CustomConfig(
+         vocab_size=vocab_size,
+         n_embd=config["model"]["n_embd"],
+         n_head=config["model"]["n_head"],
+         n_layer=config["model"]["n_layer"],
+         n_positions=config["model"]["n_positions"],
+         tie_word_embeddings=False  # Explicitly disable weight tying
+     )
+
+     # Create HF model
+     model = CustomModelForCausalLM(hf_config)
+
+     # Load checkpoint
+     checkpoint = torch.load(os.path.join(checkpoint_path, "pytorch_model.bin"), map_location="cpu")
+
+     # Process state dict
+     new_state_dict = {}
+     for key, value in checkpoint.items():
+         if key.startswith("_orig_mod."):
+             key = key[len("_orig_mod."):]
+
+         if "token_embedding.weight" in key:
+             new_state_dict[f"transformer.{key}"] = value
+             # Copy embedding weights to lm_head
+             new_state_dict["lm_head.weight"] = value.clone()
+         else:
+             new_state_dict[f"transformer.{key}"] = value
+
+     # Load the modified state dict
+     missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
+     print(f"Missing keys: {missing_keys}")
+     print(f"Unexpected keys: {unexpected_keys}")
+
+     # Save in Hugging Face format with safetensors
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Save the model in safetensors format
+     model.save_pretrained(
+         output_dir,
+         safe_serialization=True
+     )
+     print(f"Model successfully saved in safetensors format to {output_dir}")
+
+     # Save config
+     hf_config.save_pretrained(output_dir)
+
+     # Copy tokenizer files
+     tokenizer_files = ["vocab.json", "merges.txt", "tokenizer_config.json"]
+     for file in tokenizer_files:
+         src_path = os.path.join(config["tokenizer"]["model_path"], file)
+         dst_path = os.path.join(output_dir, file)
+         if os.path.exists(src_path):
+             import shutil
+             shutil.copy2(src_path, dst_path)
+
+     return model, tokenizer
+
+ def generate_text(
+     prompt: str,
+     model_path: str,
+     max_length: int = 100,
+     temperature: float = 2,
+     top_k: int = 50,
+     top_p: float = 0.9,
+     repetition_penalty: float = 1.2,
+     no_repeat_ngram_size: int = 3
+ ):
+     """Generate text using the converted model."""
+     # Load model and tokenizer
+     config = load_config()
+     model = CustomModelForCausalLM.from_pretrained(model_path)
+     tokenizer = get_tokenizer(config)
+
+     # Move model to GPU if available
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model = model.to(device)
+     model.eval()
+
+     # Encode prompt
+     encoded = tokenizer.batch_encode(
+         [prompt],
+         return_tensors="pt"
+     )
+     input_ids = encoded["input_ids"].to(device)
+
+     # Generate
+     with torch.no_grad():
+         output_ids = model.generate(
+             input_ids=input_ids,
+             max_length=max_length,
+             temperature=temperature,
+             top_k=top_k,
+             top_p=top_p,
+             repetition_penalty=repetition_penalty,
+             no_repeat_ngram_size=no_repeat_ngram_size
+         )
+
+     # Decode and return
+     generated_text = tokenizer.decode(output_ids[0].tolist())
+     return generated_text
+
+ if __name__ == "__main__":
+     # Example usage
+     checkpoint_path = r"my_model/"  # Path to your trained model
+     hf_output_dir = "outputs/hf_model"  # Where to save the converted model
+
+     # Convert model
+     model, tokenizer = convert_to_hf_model(checkpoint_path, hf_output_dir)
+
+     # Generate text with better parameters
+     prompt = "Hello I am Clera "
+     generated_text = generate_text(
+         prompt=prompt,
+         model_path=hf_output_dir,
+         max_length=20,
+         temperature=2.5,
+         top_k=50,
+         top_p=0.9,
+         repetition_penalty=1.2,
+         no_repeat_ngram_size=1
+     )
+
+     print(f"\nPrompt: {prompt}")
+     print(f"Generated text: {generated_text}")
src/model.py ADDED
@@ -0,0 +1,182 @@
+ import torch
+ import torch.nn as nn
+ import torch.utils.checkpoint  # explicit import so torch.utils.checkpoint.checkpoint is always available
+ from transformers import AutoTokenizer
+ from utils import load_config
+ from tokenizers import Tokenizer
+ import os
+ import json
+
+ class TransformerBlock(nn.Module):
+     """Single transformer block with self-attention and feed-forward layers"""
+     def __init__(self, n_embd, n_head, dropout=0.1):
+         super().__init__()
+         self.attention = nn.MultiheadAttention(n_embd, n_head, dropout=dropout, batch_first=True)
+         self.feed_forward = nn.Sequential(
+             nn.Linear(n_embd, 4 * n_embd),
+             nn.GELU(),
+             nn.Linear(4 * n_embd, n_embd)
+         )
+         self.ln1 = nn.LayerNorm(n_embd)
+         self.ln2 = nn.LayerNorm(n_embd)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x, mask=None):
+         # Ensure mask is same dtype as input
+         if mask is not None:
+             mask = mask.to(dtype=x.dtype)
+         # Self-attention with residual connection
+         attn_out, _ = self.attention(x, x, x, attn_mask=mask)
+         x = x + self.dropout(attn_out)
+         x = self.ln1(x)
+         # Feed-forward with residual connection
+         ff_out = self.feed_forward(x)
+         x = x + self.dropout(ff_out)
+         x = self.ln2(x)
+         return x
+
+ class CustomLanguageModel(nn.Module):
+     """Custom transformer-based language model"""
+     def __init__(self, config):
+         super().__init__()
+         self.vocab_size = config["model"]["vocab_size"]
+         self.n_embd = config["model"]["n_embd"]
+         self.n_head = config["model"]["n_head"]
+         self.n_layer = config["model"]["n_layer"]
+         self.n_positions = config["model"]["n_positions"]
+
+         # Token and position embeddings
+         self.token_embedding = nn.Embedding(self.vocab_size, self.n_embd)
+         self.position_embedding = nn.Embedding(self.n_positions, self.n_embd)
+
+         # Transformer blocks
+         self.transformer_blocks = nn.ModuleList([
+             TransformerBlock(self.n_embd, self.n_head)
+             for _ in range(self.n_layer)
+         ])
+
+         # Output layer
+         self.ln_f = nn.LayerNorm(self.n_embd)
+         self.lm_head = nn.Linear(self.n_embd, self.vocab_size, bias=False)
+
+         # Tie weights between token embedding and output layer
+         self.token_embedding.weight = self.lm_head.weight
+
+         # Initialize weights
+         self.apply(self._init_weights)
+
+         # Set gradient checkpointing flag based on config
+         self.gradient_checkpointing_enable = config["model"].get("gradient_checkpointing", False)
+
+     def _init_weights(self, module):
+         if isinstance(module, (nn.Linear, nn.Embedding)):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if isinstance(module, nn.Linear) and module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.LayerNorm):
+             torch.nn.init.zeros_(module.bias)
+             torch.nn.init.ones_(module.weight)
+
+     def forward(self, input_ids, labels=None):
+         batch_size, seq_length = input_ids.shape
+
+         # Create position indices
+         positions = torch.arange(0, seq_length, dtype=torch.long, device=input_ids.device)
+         positions = positions.unsqueeze(0).expand(batch_size, -1)
+
+         # Get embeddings and sum token & position embeddings
+         token_embeddings = self.token_embedding(input_ids)
+         position_embeddings = self.position_embedding(positions)
+         x = token_embeddings + position_embeddings
+
+         # Create causal mask and convert to same dtype as embeddings
+         mask = torch.triu(torch.ones((seq_length, seq_length), device=input_ids.device) * float('-inf'), diagonal=1)
+         mask = mask.to(dtype=x.dtype)
+
+         # Process through transformer blocks (use gradient checkpointing only if enabled)
+         if self.training and self.gradient_checkpointing_enable:
+             for block in self.transformer_blocks:
+                 x = torch.utils.checkpoint.checkpoint(block, x, mask, use_reentrant=False)
+         else:
+             for block in self.transformer_blocks:
+                 x = block(x, mask=mask)
+
+         x = self.ln_f(x)
+         logits = self.lm_head(x)
+
+         if labels is not None:
+             loss_fct = nn.CrossEntropyLoss()
+             loss = loss_fct(logits.view(-1, self.vocab_size), labels.view(-1))
+             return {"loss": loss, "logits": logits}
+
+         return {"logits": logits}
+
+     def num_parameters(self):
+         """Returns the number of trainable parameters in the model."""
+         return sum(p.numel() for p in self.parameters() if p.requires_grad)
+
+ def create_model(config):
+     """Creates a custom language model from scratch based on the configuration."""
+     model = CustomLanguageModel(config)
+     return model
+
+ def get_tokenizer(config):
+     """Loads a trained ByteLevelBPE tokenizer."""
+     from tokenizers import ByteLevelBPETokenizer
+
+     model_path = config["tokenizer"]["model_path"]
+     if not os.path.exists(os.path.join(model_path, "vocab.json")):
+         raise ValueError(f"No tokenizer found at {model_path}. Please train the tokenizer first.")
+
+     tokenizer = ByteLevelBPETokenizer(
+         os.path.join(model_path, "vocab.json"),
+         os.path.join(model_path, "merges.txt")
+     )
+
+     # Add special tokens if they don't exist
+     special_tokens = {
+         "eos_token": "<|endoftext|>",
+         "pad_token": "<|pad|>",
+         "unk_token": "<|unk|>",
+         "mask_token": "<|mask|>"
+     }
+     tokenizer.add_special_tokens(list(special_tokens.values()))
+
+     # Add methods to match expected interface
+     tokenizer.get_vocab_size = lambda: len(tokenizer.get_vocab())
+
+     def batch_encode(texts, padding=True, truncation=True, max_length=None, return_tensors=None):
+         encodings = tokenizer.encode_batch(texts)
+         # Extract token ids from encodings
+         token_ids = [enc.ids for enc in encodings]
+
+         if max_length and truncation:
+             token_ids = [ids[:max_length] for ids in token_ids]
+
+         if padding:
+             max_len = max(len(ids) for ids in token_ids)
+             pad_token_id = tokenizer.token_to_id("<|pad|>")
+             padded = []
+             for ids in token_ids:
+                 pad_length = max_len - len(ids)
+                 padded.append(ids + [pad_token_id] * pad_length)
+             token_ids = padded
+
+         if return_tensors == "pt":
+             return {
+                 "input_ids": torch.tensor(token_ids),
+                 "attention_mask": torch.ones_like(torch.tensor(token_ids))
+             }
+         return {"input_ids": token_ids}
+
+     tokenizer.batch_encode = batch_encode
+
+     print(f"ByteLevelBPE tokenizer loaded successfully. Vocab size: {tokenizer.get_vocab_size()}")
+     return tokenizer
+
+ if __name__ == "__main__":
+     config = load_config()
+     tokenizer = get_tokenizer(config)
+     config["model"]["vocab_size"] = tokenizer.get_vocab_size()
+     model = create_model(config)
+     print(f"Model created with {model.num_parameters():,} parameters.")
+     print(model)
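A quick shape check for CustomLanguageModel, as a sketch: the hyperparameters below are illustrative stand-ins for the real config.yaml values.

import torch
from model import create_model

# Illustrative hyperparameters; the real values come from config/config.yaml.
config = {"model": {"vocab_size": 50000, "n_embd": 640, "n_head": 10,
                    "n_layer": 12, "n_positions": 512}}
model = create_model(config)

input_ids = torch.randint(0, config["model"]["vocab_size"], (2, 16))  # (batch, seq_len)
out = model(input_ids, labels=input_ids)
print(out["logits"].shape)  # torch.Size([2, 16, 50000])
print(out["loss"].item())   # scalar cross-entropy loss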
src/tokenization.py ADDED
@@ -0,0 +1,138 @@
+ from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors
+ from tokenizers.implementations import ByteLevelBPETokenizer
+ import os
+ from utils import load_config, setup_logging
+ from glob import glob
+ from tqdm import tqdm
+ import json
+ import torch
+
+ class CustomTokenizer:
+     """Wrapper around ByteLevelBPETokenizer with additional functionality."""
+     def __init__(self, tokenizer):
+         self.tokenizer = tokenizer
+         self._vocab_size = len(tokenizer.get_vocab())
+         self.pad_token_id = tokenizer.token_to_id("<|pad|>")
+         self.eos_token_id = tokenizer.token_to_id("<|endoftext|>")
+
+     def get_vocab_size(self):
+         return self._vocab_size
+
+     def batch_encode(self, texts, padding=True, truncation=True, max_length=None, return_tensors=None):
+         # Work on plain id lists so truncation and padding compose correctly
+         token_ids = [enc.ids for enc in self.tokenizer.encode_batch(texts)]
+         if max_length and truncation:
+             token_ids = [ids[:max_length] for ids in token_ids]
+         if padding:
+             max_len = max(len(ids) for ids in token_ids)
+             padded = []
+             for ids in token_ids:
+                 pad_length = max_len - len(ids)
+                 padded.append(ids + [self.pad_token_id] * pad_length)
+             token_ids = padded
+         if return_tensors == "pt":
+             input_ids = torch.tensor(token_ids)
+             return {
+                 "input_ids": input_ids,
+                 "attention_mask": torch.ones_like(input_ids)
+             }
+         return {"input_ids": token_ids}
+
+     def decode(self, token_ids):
+         """Decode a list of token IDs back to a string."""
+         if isinstance(token_ids, torch.Tensor):
+             token_ids = token_ids.tolist()
+
+         # Filter out padding tokens
+         token_ids = [t for t in token_ids if t != self.pad_token_id]
+
+         # Use the underlying tokenizer's decode method
+         return self.tokenizer.decode(token_ids)
+
+ def train_tokenizer(config):
+     """Trains a custom BPE tokenizer using the tokenizers library."""
+     setup_logging()
+
+     model_path = config["tokenizer"]["model_path"]
+     vocab_size = config["tokenizer"].get("vocab_size", 50000)
+     min_frequency = config["tokenizer"].get("min_frequency", 2)
+
+     # Create output directory if it doesn't exist
+     os.makedirs(model_path, exist_ok=True)
+
+     # Initialize a new tokenizer
+     tokenizer = ByteLevelBPETokenizer()
+
+     # Get all text files from the data directory
+     data_files = glob(os.path.join("data/raw", "*.txt"))
+     if not data_files:
+         raise ValueError("No text files found in data/raw directory")
+
+     print(f"Training tokenizer on {len(data_files)} files...")
+     print(f"Target vocab size: {vocab_size}")
+     print(f"Min frequency: {min_frequency}")
+
+     # Train the tokenizer
+     tokenizer.train(
+         files=data_files,
+         vocab_size=vocab_size,
+         min_frequency=min_frequency,
+         special_tokens=[
+             "<|endoftext|>",  # End of text token
+             "<|pad|>",  # Padding token
+             "<|unk|>",  # Unknown token
+             "<|mask|>"  # Mask token
+         ]
+     )
+
+     # Save the tokenizer files
+     tokenizer.save_model(model_path)
+
+     # Save the tokenizer configuration
+     tokenizer_config = {
+         "vocab_size": vocab_size,
+         "min_frequency": min_frequency,
+         "model_type": "byte_level_bpe",
+         "special_tokens": {
+             "eos_token": "<|endoftext|>",
+             "pad_token": "<|pad|>",
+             "unk_token": "<|unk|>",
+             "mask_token": "<|mask|>"
+         }
+     }
+
+     with open(os.path.join(model_path, "tokenizer_config.json"), "w") as f:
+         json.dump(tokenizer_config, f, indent=2)
+
+     print(f"Tokenizer trained and saved to {model_path}")
+     return tokenizer
+
+ def get_tokenizer(config):
+     """Loads a trained tokenizer."""
+     model_path = config["tokenizer"]["model_path"]
+
+     if not os.path.exists(os.path.join(model_path, "vocab.json")):
+         raise ValueError(f"No tokenizer found at {model_path}. Please train the tokenizer first.")
+
+     base_tokenizer = ByteLevelBPETokenizer(
+         os.path.join(model_path, "vocab.json"),
+         os.path.join(model_path, "merges.txt")
+     )
+
+     # Add special tokens if they don't exist
+     special_tokens = {
+         "eos_token": "<|endoftext|>",
+         "pad_token": "<|pad|>",
+         "unk_token": "<|unk|>",
+         "mask_token": "<|mask|>"
+     }
+     base_tokenizer.add_special_tokens(list(special_tokens.values()))
+
+     # Create wrapped tokenizer
+     tokenizer = CustomTokenizer(base_tokenizer)
+
+     print(f"ByteLevelBPE tokenizer loaded successfully. Vocab size: {tokenizer.get_vocab_size()}")
+     return tokenizer
+
+ if __name__ == "__main__":
+     config = load_config()
+     train_tokenizer(config)
+     print("Tokenizer training complete.")
src/train.py ADDED
@@ -0,0 +1,404 @@
+ import os
+ import math
+ import time
+ import json
+ import torch
+ import torch.nn as nn
+ from torch.optim import AdamW
+ from torch.utils.data import DataLoader, Dataset, IterableDataset
+ from tqdm import tqdm
+ from accelerate import Accelerator, DeepSpeedPlugin
+ from accelerate.logging import get_logger
+ import deepspeed
+ import wandb
+ from datetime import datetime
+ from transformers import get_scheduler
+ from model import create_model, get_tokenizer
+ from utils import load_config, setup_logging
+ from torch.nn.utils.rnn import pad_sequence
+
+ logger = get_logger(__name__)
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ # Enable TF32 for faster matrix multiplications (if supported)
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.allow_tf32 = True
+
+ def load_text_files(data_dir, chunk_size=2000000):
+     """Load text files from directory in chunks."""
+     if not os.path.exists(data_dir):
+         raise ValueError(f"Data directory {data_dir} does not exist")
+
+     all_files = [f for f in os.listdir(data_dir) if f.endswith('.txt')]
+     print(f"Found {len(all_files)} text files in {data_dir}")
+
+     total_size = sum(os.path.getsize(os.path.join(data_dir, f)) for f in all_files)
+     estimated_chunks = math.ceil(total_size / chunk_size)
+     total_characters = 0
+     current_chunk_num = 0
+
+     for file_name in all_files:
+         file_path = os.path.join(data_dir, file_name)
+         try:
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 file_size = os.path.getsize(file_path)
+                 print(f"Processing file: {file_name} (Size: {file_size/1024/1024:.2f}MB)")
+                 print(f"Estimated total chunks: {estimated_chunks}")
+
+                 current_chunk = []
+                 current_size = 0
+                 chunk_start_char = total_characters
+
+                 for line in f:
+                     line = line.strip()
+                     if line:
+                         current_chunk.append(line)
+                         current_size += len(line)
+                         total_characters += len(line)
+
+                         if current_size >= chunk_size:
+                             current_chunk_num += 1
+                             print(f"Yielding chunk {current_chunk_num}/{estimated_chunks} "
+                                   f"({len(current_chunk)} texts, {current_size:,} characters, "
+                                   f"Range: {chunk_start_char:,} - {total_characters:,})")
+                             yield current_chunk
+                             current_chunk = []
+                             current_size = 0
+                             chunk_start_char = total_characters
+
+                 if current_chunk:
+                     current_chunk_num += 1
+                     print(f"Yielding final chunk {current_chunk_num}/{estimated_chunks} "
+                           f"({len(current_chunk)} texts, {current_size:,} characters, "
+                           f"Range: {chunk_start_char:,} - {total_characters:,})")
+                     yield current_chunk
+         except Exception as e:
+             print(f"Error reading file {file_path}: {e}")
+             continue
+
+ class TextDataset(Dataset):
+     def __init__(self, tokenized_texts):
+         self.input_ids = tokenized_texts["input_ids"]
+         self.labels = tokenized_texts["labels"]
+
+     def __len__(self):
+         return len(self.input_ids)
+
+     def __getitem__(self, idx):
+         return {"input_ids": self.input_ids[idx], "labels": self.labels[idx]}
+
+ class StreamingTextDataset(IterableDataset):
+     def __init__(self, data_dir, tokenizer, max_length):
+         super().__init__()
+         self.data_dir = data_dir
+         self.tokenizer = tokenizer
+         self.max_length = max_length
+         self.files = [f for f in os.listdir(data_dir) if f.endswith('.txt')]
+
+     def __iter__(self):
+         worker_info = torch.utils.data.get_worker_info()
+         files_per_worker = len(self.files)
+         if worker_info is not None:
+             files_per_worker = len(self.files) // worker_info.num_workers
+             start_idx = worker_info.id * files_per_worker
+             end_idx = start_idx + files_per_worker if worker_info.id < worker_info.num_workers - 1 else len(self.files)
+             files = self.files[start_idx:end_idx]
+         else:
+             files = self.files
+
+         for file_name in files:
+             file_path = os.path.join(self.data_dir, file_name)
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 text_buffer = []
+                 current_length = 0
+
+                 for line in f:
+                     line = line.strip()
+                     if not line:
+                         continue
+
+                     text_buffer.append(line)
+                     current_length += len(line)
+
+                     if current_length >= self.max_length:
+                         # Encode and yield the batch
+                         text = " ".join(text_buffer)
+                         encodings = self.tokenizer.batch_encode(
+                             [text],
+                             max_length=self.max_length,
+                             truncation=True,
+                             padding=False,  # Don't pad here, we'll pad in collate_fn
+                             return_tensors="pt"
+                         )
+
+                         # Return individual tensors
+                         yield {
+                             "input_ids": encodings["input_ids"][0],
+                             "labels": encodings["input_ids"][0].clone()
+                         }
+                         text_buffer = []
+                         current_length = 0
+
+ def collate_batch(batch):
+     """Custom collate function to handle variable length sequences."""
+     # Separate input_ids and labels
+     input_ids = [item["input_ids"] for item in batch]
+     labels = [item["labels"] for item in batch]
+
+     # Pad sequences
+     input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
+     labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # -100 is PyTorch's default ignore index
+
+     # Create attention masks
+     attention_mask = (input_ids != 0).long()
+
+     return {
+         "input_ids": input_ids,
+         "labels": labels,
+         "attention_mask": attention_mask
+     }
+
+ def train_model(config):
+     """Trains the model using DeepSpeed and Accelerate for memory efficiency."""
+     # Create output directory
+     output_dir = config["training"]["output_dir"]
+     os.makedirs(output_dir, exist_ok=True)
+     print(f"Model will be saved to: {output_dir}")
+
+     # Initialize DeepSpeed plugin and accelerator
+     deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=config["training"]["deepspeed"])
+     accelerator = Accelerator(
+         gradient_accumulation_steps=config["training"]["gradient_accumulation_steps"],
+         mixed_precision="fp16",
+         deepspeed_plugin=deepspeed_plugin,
+         log_with=config["training"]["report_to"]
+     )
+
+     # Initialize tracking
+     if accelerator.is_main_process:
+         accelerator.init_trackers(
+             project_name=config["training"]["wandb"]["project"],
+             config=config,
+             init_kwargs={
+                 "wandb": {
+                     "entity": config["training"]["wandb"]["entity"],
+                     "name": config["training"]["wandb"]["name"],
+                 }
+             }
+         )
+         print(f"Tracking initialized with {config['training']['report_to']}")
+
+     device = accelerator.device
+     print(f"Using device: {device}")
+
+     # Load tokenizer and model
+     tokenizer = get_tokenizer(config)
+     config["model"]["vocab_size"] = tokenizer.get_vocab_size()
+     model = create_model(config)
+
+     try:
+         model = torch.compile(model)
+         print("torch.compile enabled for faster training.")
+     except Exception as e:
+         print("torch.compile not available or failed, continuing without it.")
+
+     optimizer = AdamW(
+         model.parameters(),
+         lr=config["training"]["learning_rate"],
+         weight_decay=config["training"]["weight_decay"]
+     )
+
+     # Create streaming dataset with custom collate function
+     dataset = StreamingTextDataset(
+         data_dir="data/raw",
+         tokenizer=tokenizer,
+         max_length=config["dataset"]["max_length"]
+     )
+
+     train_loader = DataLoader(
+         dataset,
+         batch_size=config["training"]["per_device_train_batch_size"],
+         num_workers=config["training"]["dataloader_num_workers"],
+         pin_memory=True,
+         collate_fn=collate_batch  # Add custom collate function
+     )
+
+     # Prepare for distributed training
+     model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)
+
+     # Calculate approximate steps per epoch based on target dataset size
+     avg_seq_length = config["dataset"]["max_length"] // 2  # Average sequence length
+     batch_size = config["training"]["per_device_train_batch_size"]
+     target_size_gb = config["dataset"].get("target_size_gb", 2.5)
+     chars_per_token = 4
+     total_tokens = (target_size_gb * 1024 * 1024 * 1024) // chars_per_token
+     steps_per_epoch = int(total_tokens // (avg_seq_length * batch_size))  # Convert to int
+     total_epochs = config["training"]["num_train_epochs"]
+     total_steps = int(steps_per_epoch * total_epochs)  # Convert to int
+
+     print(f"\nTraining Statistics (Estimated):")
+     print(f"Total epochs: {total_epochs}")
+     print(f"Estimated steps per epoch: {steps_per_epoch:,}")
+     print(f"Estimated total steps: {total_steps:,}")
+
+     # Track gradients for logging
+     def grad_norm(model):
+         total_norm = 0.0
+         for p in model.parameters():
+             if p.grad is not None:
+                 param_norm = p.grad.detach().data.norm(2)
+                 total_norm += param_norm.item() ** 2
+         return total_norm ** 0.5
+
+     # Initialize GPU monitoring
+     if torch.cuda.is_available():
+         gpu_id = torch.cuda.current_device()
+
+     training_stats = {
+         'train/loss': 0.0,
+         'train/learning_rate': 0.0,
+         'train/epoch': 0.0,
+         'train/global_step': 0,
+         'train/samples_per_second': 0.0,
+         'train/grad_norm': 0.0,
+         'performance/gpu_memory': 0.0,
+         'performance/gpu_utilization': 0.0,
+         'performance/batch_time': 0.0,
+     }
+
+     for epoch in range(total_epochs):
+         epoch_start_time = time.time()
+         model.train()
+         running_loss = 0
+         num_batches = 0
+         samples_processed = 0
+
+         progress_bar = tqdm(
+             total=steps_per_epoch,
+             desc=f"Epoch {epoch+1}/{total_epochs}",
+             disable=not accelerator.is_local_main_process
+         )
+
+         for batch in train_loader:
+             batch_start_time = time.time()
+
+             with accelerator.accumulate(model):
+                 outputs = model(input_ids=batch["input_ids"], labels=batch["labels"])
+                 loss = outputs["loss"]
+                 accelerator.backward(loss)
+
+                 if accelerator.sync_gradients:
+                     training_stats['train/grad_norm'] = grad_norm(model)
+                     accelerator.clip_grad_norm_(model.parameters(), 1.0)
+
+                 optimizer.step()
+                 optimizer.zero_grad()
+
+             # Update statistics
+             loss_value = loss.item()
+             running_loss += loss_value
+             num_batches += 1
+             samples_processed += batch["input_ids"].size(0)
+             batch_time = time.time() - batch_start_time
+
+             # Update training stats
+             training_stats.update({
+                 'train/loss': loss_value,
+                 'train/learning_rate': optimizer.param_groups[0]['lr'],
+                 'train/epoch': epoch + 1,
+                 'train/global_step': num_batches + (epoch * steps_per_epoch),
+                 'train/samples_per_second': batch["input_ids"].size(0) / batch_time,
+                 'performance/batch_time': batch_time,
+             })
+
+             # GPU stats (if available)
+             if torch.cuda.is_available():
+                 training_stats.update({
+                     'performance/gpu_memory': torch.cuda.memory_allocated(gpu_id) / 1024**3,  # GB
+                     'performance/gpu_utilization': torch.cuda.utilization(gpu_id),
+                 })
+
+             # Update progress bar
+             avg_speed = num_batches / (time.time() - epoch_start_time)
+             eta_epoch = (steps_per_epoch - num_batches) / avg_speed / 60  # minutes
+             eta_total = (total_steps - (epoch * steps_per_epoch + num_batches)) / avg_speed / 60  # minutes
+
+             progress_bar.set_postfix({
+                 'loss': f'{loss_value:.4f}',
+                 'avg_loss': f'{running_loss/num_batches:.4f}',
+                 'lr': f'{optimizer.param_groups[0]["lr"]:.2e}',
+                 'samples/s': f'{training_stats["train/samples_per_second"]:.2f}',
+                 'epoch_eta': f'{eta_epoch:.1f}min',
+                 'total_eta': f'{eta_total:.1f}min'
+             })
+             progress_bar.update(1)
+
+             # Log metrics based on logging_steps
+             if num_batches % config["training"]["logging_steps"] == 0:
+                 if accelerator.is_main_process:
+                     current_step = int(num_batches + (epoch * steps_per_epoch))  # Convert to int
+                     accelerator.log(training_stats, step=current_step)
+
+             # Save checkpoint based on save_steps
+             if num_batches % config["training"]["save_steps"] == 0:
+                 if accelerator.is_local_main_process:
+                     checkpoint_dir = os.path.join(output_dir, f"checkpoint-epoch{epoch+1}-step{num_batches}")
+                     os.makedirs(checkpoint_dir, exist_ok=True)
+                     print(f"\nSaving checkpoint at step {num_batches} to {checkpoint_dir}")
+                     accelerator.save_state(checkpoint_dir)
+                     with open(os.path.join(checkpoint_dir, "config.json"), "w") as f:
+                         json.dump(config, f, indent=2)
+
+             # Break if we've reached the estimated steps for this epoch
+             if num_batches >= steps_per_epoch:
+                 break
+
+         progress_bar.close()
+
+         # End of epoch logging
+         epoch_time = time.time() - epoch_start_time
+         epoch_avg_loss = running_loss / num_batches
+         epoch_perplexity = torch.exp(torch.tensor(epoch_avg_loss))
+
+         if accelerator.is_main_process:
+             print(f"\nEpoch {epoch+1}/{total_epochs} Summary:")
+             print(f"Time: {epoch_time/60:.2f} minutes")
+             print(f"Average Loss: {epoch_avg_loss:.4f}")
+             print(f"Perplexity: {epoch_perplexity:.2f}")
+             print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.2e}")
+             print(f"Samples Processed: {samples_processed:,}")
+             print(f"Average Speed: {samples_processed/epoch_time:.1f} samples/s")
+
+             # Estimate remaining time
+             epochs_remaining = total_epochs - (epoch + 1)
+             estimated_remaining_time = epochs_remaining * epoch_time / 60
+             print(f"Estimated time for remaining {epochs_remaining} epochs: {estimated_remaining_time:.1f} minutes")
+
+             # Log epoch summary to wandb with correct step
+             current_step = int((epoch + 1) * steps_per_epoch)  # Convert to int
+             accelerator.log({
+                 'epoch/average_loss': epoch_avg_loss,
+                 'epoch/perplexity': epoch_perplexity.item(),
+                 'epoch/time': epoch_time,
+                 'epoch/samples_processed': samples_processed,
+             }, step=current_step)
+
+     # Save final model
+     if accelerator.is_local_main_process:
+         final_model_dir = os.path.join(output_dir, "final_model")
+         os.makedirs(final_model_dir, exist_ok=True)
+         print(f"\nSaving final model to {final_model_dir}")
+
+         # Save with DeepSpeed
+         accelerator.save_state(final_model_dir)
+
+         # Save configuration
+         with open(os.path.join(final_model_dir, "config.json"), "w") as f:
+             json.dump(config, f, indent=2)
+
+         print("Final model saved successfully")
+     accelerator.end_training()
+
+ if __name__ == "__main__":
+     config = load_config()
+     train_model(config)
+     print("Training complete.")
src/utils.py ADDED
@@ -0,0 +1,15 @@
+ import yaml
+ from pathlib import Path
+
+ def load_config(config_path="config/config.yaml"):
+     """Loads the configuration from a YAML file."""
+     with open(config_path, "r") as f:
+         return yaml.safe_load(f)
+
+ def setup_logging(log_level="INFO"):
+     """Sets up basic logging."""
+     import logging
+     logging.basicConfig(
+         level=getattr(logging, log_level.upper(), logging.INFO),
+         format="%(asctime)s - %(levelname)s - %(message)s"
+     )