# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1XRKQ-ICJVg5oXXPNinjrj1VGGr8F3VYE
"""
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
# Step 1: Load the pre-trained model and tokenizer
model_name = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Falcon's tokenizer does not define a pad token; reuse EOS so that
# padding="max_length" in the preprocessing below works.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
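# Note: Falcon-7B is roughly 28 GB in full precision (and ~14 GB in fp16), which
# exceeds the memory of most single Colab GPUs. A common workaround (assuming the
# `accelerate` package is installed) is to load the weights in half precision and
# let device_map="auto" place them on the available devices, e.g.:
#
#   import torch
#   model = AutoModelForCausalLM.from_pretrained(
#       model_name, trust_remote_code=True,
#       torch_dtype=torch.float16, device_map="auto",
#   )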
# Step 2: Load the legal dataset
dataset = load_dataset("casehold/casehold", "all")
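# Optional sanity check: print the available splits and one raw record so the
# field names used in preprocess_data below can be verified against the
# dataset's actual schema before training.
print(dataset)
print(dataset["train"][0])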
# Step 3: Preprocess the dataset
def preprocess_data(example):
    # Combine context and question into a single input sequence.
    # NOTE: the field names ("context", "question", "answer") are assumed here;
    # adjust them to match the actual column names printed above.
    model_inputs = tokenizer(
        example["context"] + " " + example["question"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    labels = tokenizer(
        example["answer"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": labels["input_ids"],
    }

# Map one example at a time; preprocess_data is not written to handle batches,
# so batched=True would pass lists of strings instead of single strings.
tokenized_dataset = dataset.map(preprocess_data)
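# Optional sanity check: a processed example should contain fixed-length
# input_ids and labels (512 tokens each, per the preprocessing above).
sample = tokenized_dataset["train"][0]
print(len(sample["input_ids"]), len(sample["labels"]))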
# Step 4: Fine-tune the model
training_args = TrainingArguments(
    output_dir="./legal_gpt",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=1000,
    save_total_limit=2,
    fp16=True,  # Mixed precision for faster training
    logging_dir="./logs",
    logging_steps=500,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)
trainer.train()
# Save the fine-tuned model and tokenizer for reuse in the inference app below
model.save_pretrained("./legal_gpt")
tokenizer.save_pretrained("./legal_gpt")
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load the fine-tuned model
model_path = "./legal_gpt"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    # Use max_new_tokens (rather than max_length) so long prompts still get a completion
    outputs = model.generate(**inputs, max_new_tokens=200, num_return_sequences=1,
                             do_sample=True, top_k=10, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response
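# Quick smoke test before wiring up the UI; the prompt below is only an
# illustrative placeholder.
print(generate_response("What is the statute of limitations for breach of contract?"))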
# Gradio Interface
interface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=5, placeholder="Enter your legal query here..."),
    outputs="text",
    title="Legal Advice GPT",
    description="Ask your legal questions and receive advice based on fine-tuned GPT!",
)
interface.launch()
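# When running in Colab, interface.launch(share=True) additionally exposes a
# temporary public URL for the demo.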