from datasets import Dataset
from sql_metadata import Parser


def format_deepseek_chat(example, tokenizer, input_prompt):
    # Manually build the prompt as one flat string
    prompt = f"{input_prompt}{example['natural_query']}\n"
    completion = f"Tables:\n{example['tables']}"
    full_text = prompt + completion

    tokenized = tokenizer(
        full_text,
        truncation=True,
        padding="max_length",
        max_length=3156,  # or whatever your model can handle
    )

    # Mask out prompt tokens in the labels so the loss is computed
    # only on the completion (the table list)
    prompt_len = len(tokenizer(prompt, truncation=True)["input_ids"])
    labels = tokenized["input_ids"][:]
    labels[:prompt_len] = [-100] * prompt_len

    tokenized["labels"] = labels
    return tokenized


def get_tokenized_dataset(nba_df, tokenizer, input_prompt):
    natural_query_list = nba_df["natural_query"].tolist()
    sql_query_list = nba_df["sql_query"].tolist()

    # Extract the table names referenced by each SQL query
    tables = [Parser(sql_query).tables for sql_query in sql_query_list]

    dataset_dict = {
        "natural_query": natural_query_list,
        "tables": tables,
    }

    # Create HuggingFace Dataset and tokenize each example
    dataset = Dataset.from_dict(dataset_dict)
    tokenized_dataset = dataset.map(
        lambda x: format_deepseek_chat(x, tokenizer, input_prompt),
        remove_columns=["natural_query", "tables"],
    )

    split = int(0.9 * len(tokenized_dataset))  # 90% train, 10% validation
    train_dataset = tokenized_dataset.select(range(split))
    val_dataset = tokenized_dataset.select(range(split, len(tokenized_dataset)))

    return train_dataset, val_dataset
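

# Minimal usage sketch for context. The CSV path, model name, and prompt text
# below are assumptions for illustration, not part of the original file;
# swap in whatever data source and tokenizer you are actually using.
if __name__ == "__main__":
    import pandas as pd
    from transformers import AutoTokenizer

    # Hypothetical CSV with "natural_query" and "sql_query" columns
    nba_df = pd.read_csv("nba_queries.csv")

    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-instruct")
    if tokenizer.pad_token is None:
        # padding="max_length" above requires a pad token to be defined
        tokenizer.pad_token = tokenizer.eos_token

    input_prompt = "List the tables needed to answer the following question:\n"
    train_dataset, val_dataset = get_tokenized_dataset(nba_df, tokenizer, input_prompt)
    print(len(train_dataset), len(val_dataset))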