ikraamkb committed on
Commit
6f78a44
·
verified ·
1 Parent(s): 89a3550

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -88
app.py CHANGED
@@ -1,4 +1,5 @@
1
- import gradio as gr
 
2
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
3
  import fitz # PyMuPDF
4
  import docx
@@ -6,34 +7,35 @@ import pptx
6
  import openpyxl
7
  import re
8
  import nltk
9
- from nltk.tokenize import sent_tokenize
10
  import torch
11
- from fastapi import FastAPI
12
- from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
13
  from gtts import gTTS
 
14
  import tempfile
15
  import os
16
  import easyocr
17
- from fpdf import FPDF
18
  import datetime
19
- from concurrent.futures import ThreadPoolExecutor
20
  import hashlib
21
 
 
22
  nltk.download('punkt', quiet=True)
23
-
24
  app = FastAPI()
25
 
 
26
  MODEL_NAME = "facebook/bart-large-cnn"
27
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
28
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
29
  model.eval()
30
  summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
31
 
 
32
  reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
33
- executor = ThreadPoolExecutor()
34
 
 
35
  summary_cache = {}
36
 
 
 
37
  def clean_text(text: str) -> str:
38
  text = re.sub(r'\s+', ' ', text)
39
  text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
@@ -110,20 +112,18 @@ def generate_summary(text: str, length: str = "medium") -> str:
110
  "long": {"max_length": 300, "min_length": 210}
111
  }
112
  chunks = chunk_text(text)
113
- try:
114
- summaries = summarizer(
115
- chunks,
116
- max_length=length_params[length]["max_length"],
117
- min_length=length_params[length]["min_length"],
118
- do_sample=False,
119
- truncation=True,
120
- no_repeat_ngram_size=2,
121
- num_beams=2,
122
- early_stopping=True
123
- )
124
- summary_texts = [s['summary_text'] for s in summaries]
125
- except Exception as e:
126
- summary_texts = [f"[Batch error: {str(e)}]"]
127
 
128
  final_summary = " ".join(summary_texts)
129
  final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
@@ -138,15 +138,13 @@ def text_to_speech(text: str):
138
  temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
139
  tts.save(temp_audio.name)
140
  return temp_audio.name
141
- except Exception as e:
142
- print(f"Error in text-to-speech: {e}")
143
  return ""
144
 
145
  def create_pdf(summary: str, original_filename: str):
146
  try:
147
  pdf = FPDF()
148
  pdf.add_page()
149
- pdf.set_font("Arial", size=12)
150
  pdf.set_font("Arial", 'B', 16)
151
  pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
152
  pdf.set_font("Arial", size=12)
@@ -157,75 +155,46 @@ def create_pdf(summary: str, original_filename: str):
157
  temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
158
  pdf.output(temp_pdf.name)
159
  return temp_pdf.name
160
- except Exception as e:
161
- print(f"Error creating PDF: {e}")
162
  return ""
163
 
164
def summarize_document(file, summary_length: str, enable_tts: bool = True):
    """Summarize an uploaded document and optionally produce audio and a PDF.

    Args:
        file: The uploaded document, either a filesystem path (what
            ``gr.File(type="filepath")`` supplies) or an object exposing the
            path as ``.name``. ``None`` means nothing was uploaded.
        summary_length: Length preset forwarded to ``generate_summary``.
        enable_tts: When true, also synthesize the summary to speech.

    Returns:
        A 4-tuple ``(message_or_summary, "", audio_path, pdf_path)``. On any
        failure the first element carries the error message and both paths
        are ``None``. A path is also ``None`` when its helper reported
        failure by returning an empty string.
    """
    if file is None:
        return "Please upload a document first", "", None, None

    # A "filepath" upload arrives as a plain string, which has no ``.name``;
    # accept both shapes instead of raising AttributeError.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    # splitext yields "" for names without a dot, whereas split(".")[-1]
    # would hand back the entire path as the "extension".
    file_extension = os.path.splitext(file_path)[1].lstrip(".").lower()
    original_filename = os.path.basename(file_path)

    text, error = extract_text(file_path, file_extension)
    if error:
        return error, "", None, None
    if not text or len(text.split()) < 30:
        return "Document is too short or contains too little text to summarize", "", None, None

    try:
        summary = generate_summary(text, summary_length)
        audio_path = text_to_speech(summary) if enable_tts else None
        pdf_path = create_pdf(summary, original_filename) if summary else None
        # Both helpers return "" on failure; normalise that to None so callers
        # can rely on a single "no file" value.
        return summary, "", audio_path or None, pdf_path or None
    except Exception as e:
        return f"Summarization error: {str(e)}", "", None, None
182
-
183
# Gradio front end: upload + length selector on the left, results on the right.
with gr.Blocks(title="Document Summarizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 Advanced Document Summarizer")
    gr.Markdown("Upload a document to generate a summary with audio and optional PDF download")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".pptx", ".xlsx", ".jpg", ".jpeg", ".png"],
                type="filepath"
            )
            length_radio = gr.Radio(
                ["short", "medium", "long"],
                value="medium",
                label="Summary Length"
            )
            submit_btn = gr.Button("Generate Summary", variant="primary")

        with gr.Column():
            output = gr.Textbox(label="Summary", lines=10)
            audio_output = gr.Audio(label="Audio Summary")
            pdf_download = gr.File(label="Download Summary as PDF", visible=False)

    def summarize_and_return_ui(file, summary_length):
        """Run the summarizer and map its result onto the three output widgets."""
        summary, _, audio_path, pdf_path = summarize_document(file, summary_length)
        # The audio/PDF helpers signal failure with "" rather than None, so
        # test truthiness: an empty path must neither be played nor shown as
        # a download.
        return (
            summary,
            audio_path or None,
            gr.File(visible=bool(pdf_path), value=pdf_path or None)
        )

    submit_btn.click(
        fn=summarize_and_return_ui,
        inputs=[file_input, length_radio],
        outputs=[output, audio_output, pdf_download]
    )
219
 
220
@app.get("/files/{file_name}")
async def get_file(file_name: str):
    """Serve a file that lives directly inside the system temp directory.

    Args:
        file_name: Name of the file, taken from the request URL (untrusted).

    Returns:
        A ``FileResponse`` for the file, or a 404 JSON response when the name
        does not resolve to a regular file directly inside the temp directory.
    """
    temp_dir = os.path.realpath(tempfile.gettempdir())
    file_path = os.path.realpath(os.path.join(temp_dir, file_name))
    # The name comes from the URL: refuse anything that resolves outside the
    # temp directory (``..``, absolute paths, symlinks) or is not a plain file.
    if os.path.dirname(file_path) == temp_dir and os.path.isfile(file_path):
        return FileResponse(file_path)
    return JSONResponse({"error": "File not found"}, status_code=404)
226
 
227
- app = gr.mount_gradio_app(app, demo, path="/")
228
-
229
@app.get("/")
def redirect_to_interface():
    """Answer ``GET /`` with a redirect response."""
    # NOTE(review): the target is this route's own path — confirm that is intended.
    target = "/"
    return RedirectResponse(url=target)
 
1
+ from fastapi import FastAPI, UploadFile, File, Form
2
+ from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
3
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
4
  import fitz # PyMuPDF
5
  import docx
 
7
  import openpyxl
8
  import re
9
  import nltk
 
10
  import torch
11
+ from nltk.tokenize import sent_tokenize
 
12
  from gtts import gTTS
13
+ from fpdf import FPDF
14
  import tempfile
15
  import os
16
  import easyocr
 
17
  import datetime
 
18
  import hashlib
19
 
20
+ # Initialize
21
  nltk.download('punkt', quiet=True)
 
22
  app = FastAPI()
23
 
24
+ # Load Summarizer Model
25
  MODEL_NAME = "facebook/bart-large-cnn"
26
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
27
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
28
  model.eval()
29
  summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
30
 
31
+ # Load OCR Reader
32
  reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
 
33
 
34
+ # Cache
35
  summary_cache = {}
36
 
37
+ # --- Helper Functions ---
38
+
39
  def clean_text(text: str) -> str:
40
  text = re.sub(r'\s+', ' ', text)
41
  text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
 
112
  "long": {"max_length": 300, "min_length": 210}
113
  }
114
  chunks = chunk_text(text)
115
+
116
+ summaries = summarizer(
117
+ chunks,
118
+ max_length=length_params[length]["max_length"],
119
+ min_length=length_params[length]["min_length"],
120
+ do_sample=False,
121
+ truncation=True,
122
+ no_repeat_ngram_size=2,
123
+ num_beams=2,
124
+ early_stopping=True
125
+ )
126
+ summary_texts = [s['summary_text'] for s in summaries]
 
 
127
 
128
  final_summary = " ".join(summary_texts)
129
  final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
 
138
  temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
139
  tts.save(temp_audio.name)
140
  return temp_audio.name
141
+ except Exception:
 
142
  return ""
143
 
144
  def create_pdf(summary: str, original_filename: str):
145
  try:
146
  pdf = FPDF()
147
  pdf.add_page()
 
148
  pdf.set_font("Arial", 'B', 16)
149
  pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
150
  pdf.set_font("Arial", size=12)
 
155
  temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
156
  pdf.output(temp_pdf.name)
157
  return temp_pdf.name
158
+ except Exception:
 
159
  return ""
160
 
161
+ # --- API Endpoints ---
162
+
163
@app.post("/summarize/")
async def summarize_api(file: UploadFile = File(...), length: str = Form("medium")):
    """Summarize an uploaded document and return the summary with media links.

    The upload is written to a temporary file, its text is pulled out with
    ``extract_text`` and condensed with ``generate_summary``. A spoken version
    and a PDF copy are attempted as well.

    Args:
        file: The uploaded document (multipart form field).
        length: Summary length preset (form field, defaults to "medium").

    Returns:
        A JSON response with ``summary`` and, when the respective helper
        returned a path, ``audioUrl`` and/or ``pdfUrl`` pointing at
        ``/files/<name>``. An unknown ``length``, an extraction error or a
        document with fewer than 30 words yields a 400 response with a
        ``detail`` message.
    """
    # Must mirror the keys of ``length_params`` in generate_summary (only
    # partly visible from here — keep in sync). Rejecting early turns a
    # KeyError/500 into a client error.
    if length not in ("short", "medium", "long"):
        return JSONResponse({"detail": f"Unsupported summary length: {length}"}, status_code=400)

    # The document type has to come from the uploaded filename; the temp path
    # is randomly generated and carries no meaningful extension.
    upload_name = file.filename or ""
    suffix = os.path.splitext(upload_name)[1]
    file_ext = suffix.lstrip(".").lower()

    contents = await file.read()
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(contents)
        tmp_path = tmp_file.name

    try:
        text, error = extract_text(tmp_path, file_ext)
    finally:
        # The on-disk copy is only needed for extraction; remove it so that
        # uploads do not pile up in the temp directory.
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # best-effort cleanup; a leftover temp file is not fatal

    if error:
        return JSONResponse({"detail": error}, status_code=400)

    if not text or len(text.split()) < 30:
        return JSONResponse({"detail": "Document too short to summarize"}, status_code=400)

    summary = generate_summary(text, length)
    audio_path = text_to_speech(summary)
    pdf_path = create_pdf(summary, upload_name)

    response = {"summary": summary}
    # Both helpers return "" on failure, in which case the link is left out.
    if audio_path:
        response["audioUrl"] = f"/files/{os.path.basename(audio_path)}"
    if pdf_path:
        response["pdfUrl"] = f"/files/{os.path.basename(pdf_path)}"

    return JSONResponse(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
@app.get("/files/{file_name}")
async def serve_file(file_name: str):
    """Serve a generated summary artefact from the system temp directory.

    Args:
        file_name: Name of the file, taken from the request URL (untrusted).

    Returns:
        A ``FileResponse`` for the file, or a 404 JSON response when the name
        does not resolve to a regular ``.mp3``/``.pdf`` file directly inside
        the temp directory.
    """
    temp_dir = os.path.realpath(tempfile.gettempdir())
    path = os.path.realpath(os.path.join(temp_dir, file_name))
    # The name comes from the URL, so only hand out what this app generates
    # (files created with the ".mp3" / ".pdf" suffixes) and refuse anything
    # that resolves outside the temp directory or is not a plain file.
    is_generated_type = path.lower().endswith((".mp3", ".pdf"))
    if is_generated_type and os.path.dirname(path) == temp_dir and os.path.isfile(path):
        return FileResponse(path)
    return JSONResponse({"error": "File not found"}, status_code=404)
197
 
 
 
198
@app.get("/")
def home():
    """Send visitors of the root URL to the interactive API documentation.

    Returns:
        A redirect response to ``/docs``, the documentation page FastAPI
        serves by default.
    """
    # Redirecting "/" to "/" would bounce the client back to this very route
    # until it gives up with a too-many-redirects error.
    return RedirectResponse(url="/docs")