from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    pipeline,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import gradio as gr

# ================== MODEL TRAINING PART ==================

# Load the first 100 training examples of the audit dataset
dataset = load_dataset(
    "msc-smart-contract-auditing/audits-with-reasons", split="train[:100]"
)

# Load model and tokenizer
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_dir = "./huggingface/hub"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=model_dir)

# Llama-family tokenizers ship without a pad token; reuse EOS so that
# padding="max_length" below does not raise an error.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Format an example for the instruction-tuned model
def format_example(example):
    return f"""### Instruction:
Please audit the following smart contract and provide a recommendation.

### Code:
{example['code']}

### Description:
{example['description']}

### Recommendation:
{example['recommendation']}"""


# Tokenization function
def tokenize(example):
    text = format_example(example)
    tokenized = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    # Mask padding positions with -100 so the loss ignores them;
    # otherwise the model is also trained to predict pad tokens.
    labels = [
        tok if mask == 1 else -100
        for tok, mask in zip(input_ids, attention_mask)
    ]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


# Tokenize dataset
tokenized_dataset = dataset.map(
    tokenize, batched=False, remove_columns=dataset.column_names
)

# Load model
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=model_dir)

# Apply LoRA to the attention query/value projections
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training config
training_args = TrainingArguments(
    output_dir="./audit-model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    report_to="none",
    fp16=False,
    remove_unused_columns=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Train
trainer.train()

# Save the LoRA adapter and tokenizer
trainer.save_model("./audit-model")
tokenizer.save_pretrained("./audit-model")

# ================== INFERENCE + GRADIO UI ==================

# Load inference pipeline. ./audit-model holds a LoRA adapter; recent
# transformers versions (with peft installed) detect adapter_config.json
# and pull in the base model automatically.
pipe = pipeline("text-generation", model="./audit-model", tokenizer="./audit-model")


# Audit an uploaded contract
def audit_contract(file, description):
    if file is None:
        return "Please upload a Solidity contract file."

    # gr.File may hand back a filepath string or a tempfile wrapper,
    # depending on the Gradio version; handle both.
    path = file if isinstance(file, str) else file.name
    with open(path, "r") as f:
        contract_code = f.read()

    # Build the prompt in the same format used for training
    prompt = f"""### Instruction:
Please audit the following smart contract and provide a recommendation.

### Code:
{contract_code}

### Description:
{description}

### Recommendation:
"""

    # Generate only the continuation, not the echoed prompt
    output = pipe(prompt, max_new_tokens=100, return_full_text=False)
    return output[0]["generated_text"]


# Gradio interface
gr.Interface(
    fn=audit_contract,
    inputs=[
        gr.File(label="Upload Smart Contract (.sol)"),
        gr.Textbox(
            label="Contract Description",
            placeholder="E.g., This contract handles fund withdrawals...",
        ),
    ],
    outputs=gr.Textbox(label="Audit Recommendation"),
    title="Smart Contract Auditor",
    description="Upload a Solidity contract and get audit recommendations from the TinyLlama-powered model.",
).launch()
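

# ---------------------------------------------------------------------------
# Optional helper: a minimal sketch (an addition, not part of the pipeline
# above) of loading the saved adapter explicitly with peft, for transformers
# versions whose pipeline() cannot resolve adapter-only checkpoints on its
# own. merge_and_unload() folds the LoRA weights into the base model so the
# result behaves like a plain causal LM and can be passed to pipeline(...)
# via its `model=` argument.
def load_merged_model(adapter_dir="./audit-model"):
    from peft import PeftModel

    base = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=model_dir)
    return PeftModel.from_pretrained(base, adapter_dir).merge_and_unload()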