rcook committed · Commit f7c8bfc · verified · 1 Parent(s): cdba87b

Update app.py

Files changed (1): app.py (+31, -2)
app.py CHANGED
@@ -1,4 +1,4 @@
-from fastapi import FastAPI
+"""from fastapi import FastAPI
 from datasets import load_dataset
 from transformers import AutoTokenizer
 
@@ -32,4 +32,33 @@ def summarize():
 
     tokenized_billsum = billsum.map(preprocess_function, batched=True)
 
-    return tokenized_billsum
+    return tokenized_billsum """
+
+from fastapi import FastAPI
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+app = FastAPI()
+
+# Load dataset and tokenizer
+billsum = load_dataset("billsum", split="train[:1%]")  # Load a small sample
+tokenizer = AutoTokenizer.from_pretrained("t5-small")
+prefix = "summarize: "  # Example prefix for text generation
+
+def preprocess_function(examples):
+    inputs = [prefix + doc for doc in examples["text"]]
+    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
+
+    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
+    model_inputs["labels"] = labels["input_ids"]
+
+    return model_inputs
+
+@app.get("/tokenized")
+def get_tokenized_data():
+    tokenized_billsum = billsum.map(preprocess_function, batched=True)
+
+    # Convert to list of dictionaries
+    json_serializable_output = tokenized_billsum.to_pandas().to_dict(orient="records")
+
+    return {"tokenized_data": json_serializable_output}  # Ensure JSON format
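For reference, after this change the app no longer returns the raw Dataset object; it serves the tokenized split as JSON from the /tokenized route. A minimal sketch of exercising the endpoint locally, assuming the file is saved as app.py and that uvicorn and requests are available (neither is part of this commit):

# Start the server in one terminal (assumed invocation, not part of the commit):
#   uvicorn app:app --host 0.0.0.0 --port 8000
import requests  # assumed client dependency, not added by this commit

resp = requests.get("http://localhost:8000/tokenized")
resp.raise_for_status()
records = resp.json()["tokenized_data"]  # key returned by get_tokenized_data()

print(len(records), "tokenized examples")
print(sorted(records[0].keys()))  # dataset columns plus input_ids, attention_mask, labels

One design note: to_pandas().to_dict(orient="records") materializes the entire split in memory before serialization, which is acceptable for the 1% sample loaded here but would be slow for larger splits.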