Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
-
import
|
|
|
2 |
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
3 |
import fitz # PyMuPDF
|
4 |
import docx
|
@@ -6,34 +7,35 @@ import pptx
|
|
6 |
import openpyxl
|
7 |
import re
|
8 |
import nltk
|
9 |
-
from nltk.tokenize import sent_tokenize
|
10 |
import torch
|
11 |
-
from
|
12 |
-
from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
|
13 |
from gtts import gTTS
|
|
|
14 |
import tempfile
|
15 |
import os
|
16 |
import easyocr
|
17 |
-
from fpdf import FPDF
|
18 |
import datetime
|
19 |
-
from concurrent.futures import ThreadPoolExecutor
|
20 |
import hashlib
|
21 |
|
|
|
22 |
nltk.download('punkt', quiet=True)
|
23 |
-
|
24 |
app = FastAPI()
|
25 |
|
|
|
26 |
MODEL_NAME = "facebook/bart-large-cnn"
|
27 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
28 |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
|
29 |
model.eval()
|
30 |
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
|
31 |
|
|
|
32 |
reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
|
33 |
-
executor = ThreadPoolExecutor()
|
34 |
|
|
|
35 |
summary_cache = {}
|
36 |
|
|
|
|
|
37 |
def clean_text(text: str) -> str:
|
38 |
text = re.sub(r'\s+', ' ', text)
|
39 |
text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
|
@@ -110,20 +112,18 @@ def generate_summary(text: str, length: str = "medium") -> str:
|
|
110 |
"long": {"max_length": 300, "min_length": 210}
|
111 |
}
|
112 |
chunks = chunk_text(text)
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
except Exception as e:
|
126 |
-
summary_texts = [f"[Batch error: {str(e)}]"]
|
127 |
|
128 |
final_summary = " ".join(summary_texts)
|
129 |
final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
|
@@ -138,15 +138,13 @@ def text_to_speech(text: str):
|
|
138 |
temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
|
139 |
tts.save(temp_audio.name)
|
140 |
return temp_audio.name
|
141 |
-
except Exception
|
142 |
-
print(f"Error in text-to-speech: {e}")
|
143 |
return ""
|
144 |
|
145 |
def create_pdf(summary: str, original_filename: str):
|
146 |
try:
|
147 |
pdf = FPDF()
|
148 |
pdf.add_page()
|
149 |
-
pdf.set_font("Arial", size=12)
|
150 |
pdf.set_font("Arial", 'B', 16)
|
151 |
pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
|
152 |
pdf.set_font("Arial", size=12)
|
@@ -157,75 +155,46 @@ def create_pdf(summary: str, original_filename: str):
|
|
157 |
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
|
158 |
pdf.output(temp_pdf.name)
|
159 |
return temp_pdf.name
|
160 |
-
except Exception
|
161 |
-
print(f"Error creating PDF: {e}")
|
162 |
return ""
|
163 |
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
|
|
|
|
171 |
if error:
|
172 |
-
return
|
|
|
173 |
if not text or len(text.split()) < 30:
|
174 |
-
return "Document
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
with gr.Row():
|
188 |
-
with gr.Column():
|
189 |
-
file_input = gr.File(
|
190 |
-
label="Upload Document",
|
191 |
-
file_types=[".pdf", ".docx", ".pptx", ".xlsx", ".jpg", ".jpeg", ".png"],
|
192 |
-
type="filepath"
|
193 |
-
)
|
194 |
-
length_radio = gr.Radio(
|
195 |
-
["short", "medium", "long"],
|
196 |
-
value="medium",
|
197 |
-
label="Summary Length"
|
198 |
-
)
|
199 |
-
submit_btn = gr.Button("Generate Summary", variant="primary")
|
200 |
-
|
201 |
-
with gr.Column():
|
202 |
-
output = gr.Textbox(label="Summary", lines=10)
|
203 |
-
audio_output = gr.Audio(label="Audio Summary")
|
204 |
-
pdf_download = gr.File(label="Download Summary as PDF", visible=False)
|
205 |
-
|
206 |
-
def summarize_and_return_ui(file, summary_length):
|
207 |
-
summary, _, audio_path, pdf_path = summarize_document(file, summary_length)
|
208 |
-
return (
|
209 |
-
summary,
|
210 |
-
audio_path,
|
211 |
-
gr.File(visible=pdf_path is not None, value=pdf_path)
|
212 |
-
)
|
213 |
-
|
214 |
-
submit_btn.click(
|
215 |
-
fn=summarize_and_return_ui,
|
216 |
-
inputs=[file_input, length_radio],
|
217 |
-
outputs=[output, audio_output, pdf_download]
|
218 |
-
)
|
219 |
|
220 |
@app.get("/files/{file_name}")
|
221 |
-
async def
|
222 |
-
|
223 |
-
if os.path.exists(
|
224 |
-
return FileResponse(
|
225 |
return JSONResponse({"error": "File not found"}, status_code=404)
|
226 |
|
227 |
-
app = gr.mount_gradio_app(app, demo, path="/")
|
228 |
-
|
229 |
@app.get("/")
|
230 |
-
def
|
231 |
-
return RedirectResponse(url="/")
|
|
|
1 |
+
from fastapi import FastAPI, UploadFile, File, Form
|
2 |
+
from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
|
3 |
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
4 |
import fitz # PyMuPDF
|
5 |
import docx
|
|
|
7 |
import openpyxl
|
8 |
import re
|
9 |
import nltk
|
|
|
10 |
import torch
|
11 |
+
from nltk.tokenize import sent_tokenize
|
|
|
12 |
from gtts import gTTS
|
13 |
+
from fpdf import FPDF
|
14 |
import tempfile
|
15 |
import os
|
16 |
import easyocr
|
|
|
17 |
import datetime
|
|
|
18 |
import hashlib
|
19 |
|
20 |
+
# Initialize
|
21 |
nltk.download('punkt', quiet=True)
|
|
|
22 |
app = FastAPI()
|
23 |
|
24 |
+
# Load Summarizer Model
|
25 |
MODEL_NAME = "facebook/bart-large-cnn"
|
26 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
27 |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
|
28 |
model.eval()
|
29 |
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
|
30 |
|
31 |
+
# Load OCR Reader
|
32 |
reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
|
|
|
33 |
|
34 |
+
# Cache
|
35 |
summary_cache = {}
|
36 |
|
37 |
+
# --- Helper Functions ---
|
38 |
+
|
39 |
def clean_text(text: str) -> str:
|
40 |
text = re.sub(r'\s+', ' ', text)
|
41 |
text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
|
|
|
112 |
"long": {"max_length": 300, "min_length": 210}
|
113 |
}
|
114 |
chunks = chunk_text(text)
|
115 |
+
|
116 |
+
summaries = summarizer(
|
117 |
+
chunks,
|
118 |
+
max_length=length_params[length]["max_length"],
|
119 |
+
min_length=length_params[length]["min_length"],
|
120 |
+
do_sample=False,
|
121 |
+
truncation=True,
|
122 |
+
no_repeat_ngram_size=2,
|
123 |
+
num_beams=2,
|
124 |
+
early_stopping=True
|
125 |
+
)
|
126 |
+
summary_texts = [s['summary_text'] for s in summaries]
|
|
|
|
|
127 |
|
128 |
final_summary = " ".join(summary_texts)
|
129 |
final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
|
|
|
138 |
temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
|
139 |
tts.save(temp_audio.name)
|
140 |
return temp_audio.name
|
141 |
+
except Exception:
|
|
|
142 |
return ""
|
143 |
|
144 |
def create_pdf(summary: str, original_filename: str):
|
145 |
try:
|
146 |
pdf = FPDF()
|
147 |
pdf.add_page()
|
|
|
148 |
pdf.set_font("Arial", 'B', 16)
|
149 |
pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
|
150 |
pdf.set_font("Arial", size=12)
|
|
|
155 |
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
|
156 |
pdf.output(temp_pdf.name)
|
157 |
return temp_pdf.name
|
158 |
+
except Exception:
|
|
|
159 |
return ""
|
160 |
|
161 |
+
# --- API Endpoints ---
|
162 |
+
|
163 |
+
@app.post("/summarize/")
async def summarize_api(file: UploadFile = File(...), length: str = Form("medium")):
    """Summarize an uploaded document and return the summary with download links.

    Args:
        file: The uploaded document; its filename supplies the extension
            passed to ``extract_text``.
        length: Summary length key forwarded to ``generate_summary``.

    Returns:
        A ``JSONResponse`` containing ``"summary"`` and, when the respective
        file was produced, ``"audioUrl"`` and ``"pdfUrl"`` pointing at the
        ``/files/`` endpoint. Responds with status 400 and a ``"detail"``
        message when extraction reports an error or the extracted text has
        fewer than 30 words.
    """
    contents = await file.read()

    # Derive the extension from the uploaded filename. The temporary path is
    # created by tempfile and has no meaningful extension of its own, so
    # splitting it on "." would never yield the document type.
    file_ext = os.path.splitext(file.filename or "")[1].lstrip(".").lower()
    suffix = f".{file_ext}" if file_ext else ""

    # Keep the suffix on the temp copy as well.
    # NOTE(review): some parsers may inspect the file name - confirm per library.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(contents)
        tmp_path = tmp_file.name

    try:
        text, error = extract_text(tmp_path, file_ext)
    finally:
        # The upload copy is only needed for extraction; remove it so that
        # delete=False temp files do not accumulate. Cleanup is best-effort.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    if error:
        return JSONResponse({"detail": error}, status_code=400)

    if not text or len(text.split()) < 30:
        return JSONResponse({"detail": "Document too short to summarize"}, status_code=400)

    summary = generate_summary(text, length)
    audio_path = text_to_speech(summary)
    pdf_path = create_pdf(summary, file.filename)

    response = {"summary": summary}
    # Both helpers return "" on failure, so only truthy paths are advertised.
    if audio_path:
        response["audioUrl"] = f"/files/{os.path.basename(audio_path)}"
    if pdf_path:
        response["pdfUrl"] = f"/files/{os.path.basename(pdf_path)}"

    return JSONResponse(response)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
|
191 |
@app.get("/files/{file_name}")
async def serve_file(file_name: str):
    """Serve a generated file from the system temporary directory.

    Args:
        file_name: Base name of a file located directly inside
            ``tempfile.gettempdir()``.

    Returns:
        A ``FileResponse`` for the file, or a 404 ``JSONResponse`` with an
        ``"error"`` message when the name does not resolve to a regular file
        directly inside the temporary directory.
    """
    temp_dir = os.path.realpath(tempfile.gettempdir())
    path = os.path.realpath(os.path.join(temp_dir, file_name))

    # file_name comes from the URL: reject anything that resolves outside the
    # temp directory (e.g. "..") and anything that is not a regular file,
    # since a directory would pass an exists() check but cannot be served.
    # NOTE(review): this still exposes every regular file in the temp dir;
    # consider tracking generated file names explicitly.
    if os.path.dirname(path) != temp_dir or not os.path.isfile(path):
        return JSONResponse({"error": "File not found"}, status_code=404)

    return FileResponse(path)
|
197 |
|
|
|
|
|
198 |
@app.get("/")
def home():
    """Redirect the root URL to the interactive API documentation.

    The handler is itself mounted at "/", so redirecting to "/" would send the
    client into an endless redirect loop. "/docs" is the default docs URL of a
    FastAPI app created without arguments.
    """
    return RedirectResponse(url="/docs")
|