#!/usr/bin/env python3
"""
Test script for pre-quantized model inference
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def test_pre_quantized_model():
"""Test the pre-quantized model loading and generation"""
model_id = "Tonic/petite-elle-L-aime-3-sft"
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Testing pre-quantized model on device: {device}")
try:
# Load tokenizer
logger.info("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="int4")
if tokenizer.pad_token_id is None:
tokenizer.pad_token_id = tokenizer.eos_token_id
# Load pre-quantized model
logger.info("Loading pre-quantized model...")
model_kwargs = {
"device_map": "auto" if device == "cuda" else "cpu",
"torch_dtype": torch.float32,
"trust_remote_code": True,
"low_cpu_mem_usage": True,
}
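        # device_map="auto" lets accelerate spread layers across the available devices,
        # and low_cpu_mem_usage=True keeps peak host RAM down while loading; both are
        # standard from_pretrained options rather than anything specific to this model.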
        model = AutoModelForCausalLM.from_pretrained(model_id, subfolder="int4", **model_kwargs)

        # Test generation
        test_prompt = "Bonjour, comment allez-vous?"
        inputs = tokenizer(test_prompt, return_tensors="pt")
        if device == "cuda":
            inputs = {k: v.cuda() for k, v in inputs.items()}

        logger.info("Generating response...")
        with torch.no_grad():
            output_ids = model.generate(
                inputs['input_ids'],
                max_new_tokens=50,
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
                attention_mask=inputs['attention_mask'],
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                cache_implementation="static"  # Important for quantized models
            )
        response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
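        # NOTE: the slice below assumes the decoded text begins with test_prompt verbatim;
        # if the tokenizer alters whitespace or adds special tokens, slicing the newly
        # generated tokens instead (output_ids[0][inputs['input_ids'].shape[1]:]) is more robust.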
        assistant_response = response[len(test_prompt):].strip()

        logger.info("✅ Pre-quantized model test successful!")
        logger.info(f"Input: {test_prompt}")
        logger.info(f"Output: {assistant_response}")

        # Check model quantization status
        logger.info("Checking model quantization status...")
        quantized_layers = 0
        total_layers = 0
        for name, module in model.named_modules():
            if hasattr(module, 'weight'):
                total_layers += 1
                if module.weight.dtype != torch.float32:
                    quantized_layers += 1
                    logger.info(f"Quantized layer: {name} - {module.weight.dtype}")
        logger.info(f"Quantized layers: {quantized_layers}/{total_layers}")

        # Clean up
        del model
        if device == "cuda":
            torch.cuda.empty_cache()
    except Exception as e:
        logger.error(f"❌ Pre-quantized model test failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    test_pre_quantized_model()
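
# A minimal way to run this check, assuming torch and transformers are installed and the
# Tonic/petite-elle-L-aime-3-sft repo exposes the "int4" subfolder referenced above:
#   python test_pre_quantized_model.py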