LLaMA: Open and Efficient Foundation Language Models (arXiv:2302.13971)
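
The snippet below loads the base LLaMA checkpoint, applies the medical Alpaca LoRA adapter with PEFT, and runs a short generation as a smoke test.
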
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
from peft import PeftModel
# Base LLaMA checkpoint and the LoRA adapter fine-tuned on medical Alpaca data.
BASE_MODEL = "llma-7b"
LORA_WEIGHTS = "llma-med-alpaca-7b"
LOAD_8BIT = False  # set True to load the base model in 8-bit and save GPU memory

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=LOAD_8BIT,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Apply the LoRA adapter weights on top of the base model.
model = PeftModel.from_pretrained(
    model,
    LORA_WEIGHTS,
    torch_dtype=torch.float16,
)
model.eval()  # inference mode
# The original config asked for temperature=0, i.e. greedy decoding, so sampling
# is disabled explicitly; top_p is kept but only takes effect if do_sample=True.
generation_config = GenerationConfig(
    do_sample=False,
    top_p=0.5,
    max_new_tokens=1024,
)
prompt = "Translate to English: Je t’aime."
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
outputs = model.generate(input_ids=input_ids, generation_config=generation_config)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
print(decoded[len(prompt):])  # drop the echoed prompt, print only the generation
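
Since the adapter is an Alpaca-style fine-tune, prompts at inference time usually need to follow the same instruction template the model was trained on. The helper below is a minimal sketch under that assumption; the template string and the ask() helper are illustrative, not part of the released weights.

ALPACA_TEMPLATE = (
    # Standard Alpaca prompt format; assumed (not confirmed) to match this adapter's training data.
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:\n"
)

def ask(instruction: str) -> str:
    """Wrap an instruction in the Alpaca template, generate, and return only the response."""
    prompt = ALPACA_TEMPLATE.format(instruction=instruction)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(input_ids=input_ids, generation_config=generation_config)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Everything after the "### Response:" marker is the model's answer.
    return decoded.split("### Response:")[-1].strip()

print(ask("What are common symptoms of iron deficiency?"))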