from fastapi import FastAPI
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

app = FastAPI()
@app.get("/")
def summarize():
    """Fine-tune t5-small on billsum and return the training output."""
    # Load the California test split of billsum and hold out 20% for evaluation.
    billsum = load_dataset("billsum", split="ca_test")
    billsum = billsum.train_test_split(test_size=0.2)
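    # A quick look at the data (commented out; billsum records carry "text",
    # "summary", and "title" fields):
    # print(billsum["train"][0]["title"])
    # print(len(billsum["train"]), len(billsum["test"]))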
    # import pandas as pd
    # df = pd.read_csv("squad_sample_train.tsv", sep="\t")
    # print(df.head())  # Debugging step
    # return {"Hello": "World!", "dataset_length": len(billsum)}
    # return df.head()
    checkpoint = "google-t5/t5-small"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # T5 is a text-to-text model, so a task prefix tells it which task to perform.
    prefix = "summarize: "
    def preprocess_function(examples):
        # Prefix each document, tokenize inputs and targets, and attach the
        # target token ids as labels.
        inputs = [prefix + doc for doc in examples["text"]]
        model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
        labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
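    # Sketch of what preprocessing yields for a small batch (commented out;
    # slicing a Dataset returns a dict of lists, which the function accepts):
    # sample = preprocess_function(billsum["train"][:2])
    # sample["input_ids"]: prefixed documents as token ids (up to 1024 each)
    # sample["labels"]:    reference summaries as token ids (up to 128 each)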
    tokenized_billsum = billsum.map(preprocess_function, batched=True)
    # The collator pads each batch dynamically and pads labels with -100 so the
    # loss function ignores padded positions.
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
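    # A minimal sketch of the collator in action (commented out; the dict
    # comprehension drops the raw text columns, which the collator cannot pad):
    # features = [tokenized_billsum["train"][i] for i in range(2)]
    # batch = data_collator(
    #     [{k: f[k] for k in ("input_ids", "attention_mask", "labels")} for f in features]
    # )
    # print(batch["labels"].shape)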
    rouge = evaluate.load("rouge")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        # Replace the -100 sentinel with the pad token id so the labels decode cleanly.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
        result["gen_len"] = np.mean(prediction_lens)
        return {k: round(v, 4) for k, v in result.items()}
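    # Standalone ROUGE sketch (commented out), independent of the trainer:
    # rouge.compute(predictions=["the cat sat"],
    #               references=["the cat sat on the mat"], use_stemmer=True)
    # returns rouge1/rouge2/rougeL/rougeLsum scores as floats.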
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
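    # Minimal inference sketch (commented out; the pretrained checkpoint can
    # already summarize, no fine-tuning needed):
    # input_ids = tokenizer(prefix + billsum["test"][0]["text"],
    #                       return_tensors="pt", truncation=True).input_ids
    # summary_ids = model.generate(input_ids, max_new_tokens=128)
    # print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))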
"""training_args = Seq2SeqTrainingArguments(
output_dir="./results",
logging_dir="./logs", # Save logs here
eval_strategy="steps",
learning_rate=2e-5,
per_device_train_batch_size=16, # Increase batch size
per_device_eval_batch_size=16,
weight_decay=0.01,
save_total_limit=3,
num_train_epochs=1, # Reduce epochs
predict_with_generate=True,
fp16=True, # Keep mixed precision
push_to_hub=False,
# optim="adamw_bnb_8bit", # Use 8-bit optimizer
logging_steps=10, # Log every 10 steps
logging_strategy="steps",
dataloader_num_workers=4, # Speed up data loading
save_strategy="epoch", # Reduce checkpointing overhead
save_steps=500,
gradient_accumulation_steps=4 # Effective larger batch size
)"""
    training_args = Seq2SeqTrainingArguments(
        output_dir="./tmp_test",        # Temporary output directory
        max_steps=2,                    # Run only 2 optimizer steps
        per_device_train_batch_size=1,  # Smallest batch size
        per_device_eval_batch_size=1,
        eval_strategy="no",             # Skip evaluation for speed
        save_strategy="no",             # Skip checkpoint saving
        logging_strategy="no",          # Skip logging
    )
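    # Note: max_steps takes precedence over num_train_epochs, so this run stops
    # after two optimizer steps; it is a smoke test, not a real fine-tune.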
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_billsum["train"],
        eval_dataset=tokenized_billsum["test"],
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # trainer.train() returns a TrainOutput named tuple (global_step,
    # training_loss, metrics), which FastAPI can serialize to JSON.
    return trainer.train()
    # return data_collator
    # return type(tokenized_billsum)
"""from fastapi import FastAPI
from datasets import load_dataset
from transformers import AutoTokenizer
app = FastAPI()
#@app.get("/")
# Load dataset and tokenizer
billsum = load_dataset("billsum", split="ca_test") # Load a small sample
tokenizer = AutoTokenizer.from_pretrained("t5-small")
prefix = "summarize: " # Example prefix for text generation
@app.get("/")
def preprocess_function(examples):
inputs = [prefix + doc for doc in examples["text"]]
model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
model_inputs["labels"] = labels["input_ids"]
return model_inputs
#@app.get("/")
def get_tokenized_data():
tokenized_billsum = billsum.map(preprocess_function, batched=True)
# Convert to list of dictionaries
json_serializable_output = tokenized_billsum.to_pandas().to_dict(orient="records")
return {"tokenized_data": json_serializable_output} # Ensure JSON format"""