from fastapi import FastAPI
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

app = FastAPI()

@app.get("/")
def summarize():
    """Fine-tune t5-small on billsum for a few steps and return the training metrics."""
    # Load the California bills split and carve out a 20% eval set
    billsum = load_dataset("billsum", split="ca_test")
    billsum = billsum.train_test_split(test_size=0.2)
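    # train_test_split returns a DatasetDict with "train" and "test" keys,
    # which the Trainer setup below consumes directly.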
    checkpoint = "google-t5/t5-small"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # T5 is trained with task prefixes; prepend the summarization prefix to each document
    prefix = "summarize: "


    def preprocess_function(examples):
        inputs = [prefix + doc for doc in examples["text"]]
        model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

        labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
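    # Illustrative shape of the output (hypothetical values, not executed here):
    # preprocess_function({"text": ["Some bill text"], "summary": ["Short summary"]})
    # -> {"input_ids": [[...]], "attention_mask": [[...]], "labels": [[...]]}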

    tokenized_billsum = billsum.map(preprocess_function, batched=True)

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
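    # The collator pads each batch dynamically and pads labels with -100,
    # which the loss function ignores.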
   
    rouge = evaluate.load("rouge")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
        result["gen_len"] = np.mean(prediction_lens)

        return {k: round(v, 4) for k, v in result.items()}
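    # rouge.compute returns a dict of scores such as
    # {"rouge1": ..., "rouge2": ..., "rougeL": ..., "rougeLsum": ...};
    # "gen_len" adds the mean generated length alongside them.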

    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)  # fine-tune from the pretrained t5-small weights

    # Fuller training configuration, kept for reference but disabled in favor of
    # the quick smoke-test arguments below:
    """training_args = Seq2SeqTrainingArguments(
        output_dir="./results",
        logging_dir="./logs",  # Save logs here
        eval_strategy="steps",
        learning_rate=2e-5,
        per_device_train_batch_size=16,  # Increase batch size
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=1,  # Reduce epochs
        predict_with_generate=True,
        fp16=True,  # Keep mixed precision
        push_to_hub=False,
    #    optim="adamw_bnb_8bit",  # Use 8-bit optimizer
        logging_steps=10,  # Log every 10 steps
        logging_strategy="steps",
        dataloader_num_workers=4,  # Speed up data loading
        save_strategy="epoch",  # Reduce checkpointing overhead
        save_steps=500,
        gradient_accumulation_steps=4  # Effective larger batch size
    )"""

    training_args = Seq2SeqTrainingArguments(
        output_dir="./tmp_test",  # Temporary output directory
        max_steps=2,  # Run only 2 optimizer steps as a smoke test
        per_device_train_batch_size=1,  # Smallest batch size
        per_device_eval_batch_size=1,  # Smallest batch size
        eval_strategy="no",  # No evaluation for speed ("evaluation_strategy" was renamed in recent transformers releases)
        save_strategy="no",  # No checkpoint saving
        logging_strategy="no",  # No logging
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_billsum["train"],
        eval_dataset=tokenized_billsum["test"],
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    train_output = trainer.train()
    # trainer.train() returns a TrainOutput NamedTuple; return its metrics dict so
    # the FastAPI response is plain JSON.
    return train_output.metrics
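
# A minimal sketch of how to exercise the endpoint locally, assuming this file is
# saved as main.py (the module name is an assumption):
#
#   uvicorn main:app --reload
#   curl http://127.0.0.1:8000/
#
# Note that training inside a GET handler blocks the request until the run
# finishes; convenient for experiments, not a production pattern.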
    
"""from fastapi import FastAPI
from datasets import load_dataset
from transformers import AutoTokenizer

app = FastAPI()

#@app.get("/")
# Load dataset and tokenizer
billsum = load_dataset("billsum", split="ca_test")  # Load a small sample
tokenizer = AutoTokenizer.from_pretrained("t5-small")
prefix = "summarize: "  # Example prefix for text generation

@app.get("/")
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

#@app.get("/")
def get_tokenized_data():
    tokenized_billsum = billsum.map(preprocess_function, batched=True)

    # Convert to list of dictionaries
    json_serializable_output = tokenized_billsum.to_pandas().to_dict(orient="records")

    return {"tokenized_data": json_serializable_output}  # Ensure JSON format"""