HusnaManakkot committed on
Commit
120ccfd
·
verified ·
1 Parent(s): 5cacb61

Update app.py

Files changed (1)
  1. app.py +46 -45
app.py CHANGED
@@ -1,51 +1,52 @@
- import torch
- from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
- from torch.utils.data import DataLoader, Dataset
- from tqdm import tqdm
-
- # Define your dataset class
- class SpiderDataset(Dataset):
-     def __init__(self, encodings, labels):
-         self.encodings = encodings
-         self.labels = labels
-
-     def __getitem__(self, idx):
-         return {'input_ids': self.encodings[idx], 'labels': self.labels[idx]}
-
-     def __len__(self):
-         return len(self.encodings)
-
- # Load your preprocessed Spider dataset
- train_encodings = # Your preprocessed input encodings for training (e.g., a list of input IDs)
- train_labels = # Your preprocessed labels for training (e.g., a list of label IDs)
-
- # Create a PyTorch dataset and dataloader
- train_dataset = SpiderDataset(train_encodings, train_labels)
- train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
-
- # Load the pre-trained T5 model
- model = T5ForConditionalGeneration.from_pretrained('t5-base')
- tokenizer = T5Tokenizer.from_pretrained('t5-base')
-
- # Move the model to the GPU if available
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- model.to(device)
-
- # Set up the optimizer
- optimizer = AdamW(model.parameters(), lr=5e-5)
-
  # Fine-tune the model
- model.train()
- for epoch in range(3):  # Number of epochs
-     for batch in tqdm(train_loader):
-         optimizer.zero_grad()
-         input_ids = batch['input_ids'].to(device)
-         labels = batch['labels'].to(device)
-         outputs = model(input_ids=input_ids, labels=labels)
-         loss = outputs.loss
-         loss.backward()
-         optimizer.step()
-
- # Save the fine-tuned model
- model.save_pretrained('your_model_directory')
- tokenizer.save_pretrained('your_model_directory')
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
+ from datasets import load_dataset
+
+ # Load the model and tokenizer
+ model_name = "hrshtsharma2012/NL2SQL-Picard-final"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+ # Load the Spider dataset
+ dataset = load_dataset("spider")
+
+ # Preprocess the dataset
+ def tokenize_function(examples):
+     inputs = tokenizer(examples["question"], padding="max_length", truncation=True, max_length=512)
+     outputs = tokenizer(examples["query"], padding="max_length", truncation=True, max_length=512)
+     return {"input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask, "labels": outputs.input_ids}
+
+ tokenized_dataset = dataset.map(tokenize_function, batched=True)
+
  # Fine-tune the model
+ training_args = TrainingArguments(
+     output_dir="./results",
+     num_train_epochs=3,
+     per_device_train_batch_size=4,
+     per_device_eval_batch_size=4,
+     warmup_steps=500,
+     weight_decay=0.01,
+     logging_dir="./logs",
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_dataset["train"],
+     eval_dataset=tokenized_dataset["validation"],
+ )
+
+ trainer.train()
+
+ # Evaluate the model
+ results = trainer.evaluate()
+ print(results)
+
+ # Use the model for inference
+ def generate_sql(question):
+     inputs = tokenizer(question, return_tensors="pt", padding=True)
+     outputs = model.generate(**inputs, max_length=512)
+     sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return sql_query
+
+ sample_question = "What are the names of the students?"
+ print(generate_sql(sample_question))