HusnaManakkot committed on
Commit
120ccfd
·
verified ·
1 Parent(s): 5cacb61

Update app.py

Files changed (1)
  1. app.py +46 -45
app.py CHANGED
@@ -1,51 +1,52 @@
- import torch
- from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
- from torch.utils.data import DataLoader, Dataset
- from tqdm import tqdm
-
- # Define your dataset class
- class SpiderDataset(Dataset):
-     def __init__(self, encodings, labels):
-         self.encodings = encodings
-         self.labels = labels
-
-     def __getitem__(self, idx):
-         return {'input_ids': self.encodings[idx], 'labels': self.labels[idx]}
-
-     def __len__(self):
-         return len(self.encodings)
-
- # Load your preprocessed Spider dataset
- train_encodings = # Your preprocessed input encodings for training (e.g., a list of input IDs)
- train_labels = # Your preprocessed labels for training (e.g., a list of label IDs)
-
- # Create a PyTorch dataset and dataloader
- train_dataset = SpiderDataset(train_encodings, train_labels)
- train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
-
- # Load the pre-trained T5 model
- model = T5ForConditionalGeneration.from_pretrained('t5-base')
- tokenizer = T5Tokenizer.from_pretrained('t5-base')
-
- # Move the model to the GPU if available
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- model.to(device)
-
- # Set up the optimizer
- optimizer = AdamW(model.parameters(), lr=5e-5)
-
  # Fine-tune the model
- model.train()
- for epoch in range(3):  # Number of epochs
-     for batch in tqdm(train_loader):
-         optimizer.zero_grad()
-         input_ids = batch['input_ids'].to(device)
-         labels = batch['labels'].to(device)
-         outputs = model(input_ids=input_ids, labels=labels)
-         loss = outputs.loss
-         loss.backward()
-         optimizer.step()
-
- # Save the fine-tuned model
- model.save_pretrained('your_model_directory')
- tokenizer.save_pretrained('your_model_directory')
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
+ from datasets import load_dataset
+
+ # Load the model and tokenizer
+ model_name = "hrshtsharma2012/NL2SQL-Picard-final"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+ # Load the Spider dataset
+ dataset = load_dataset("spider")
+
+ # Preprocess the dataset
+ def tokenize_function(examples):
+     inputs = tokenizer(examples["question"], padding="max_length", truncation=True, max_length=512)
+     outputs = tokenizer(examples["query"], padding="max_length", truncation=True, max_length=512)
+     return {"input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask, "labels": outputs.input_ids}
+
+ tokenized_dataset = dataset.map(tokenize_function, batched=True)
+
  # Fine-tune the model
+ training_args = TrainingArguments(
+     output_dir="./results",
+     num_train_epochs=3,
+     per_device_train_batch_size=4,
+     per_device_eval_batch_size=4,
+     warmup_steps=500,
+     weight_decay=0.01,
+     logging_dir="./logs",
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_dataset["train"],
+     eval_dataset=tokenized_dataset["validation"],
+ )
+
+ trainer.train()
+
+ # Evaluate the model
+ results = trainer.evaluate()
+ print(results)
+
+ # Use the model for inference
+ def generate_sql(question):
+     inputs = tokenizer(question, return_tensors="pt", padding=True)
+     outputs = model.generate(**inputs, max_length=512)
+     sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return sql_query
+
+ sample_question = "What are the names of the students?"
+ print(generate_sql(sample_question))