ikraamkb committed
Commit 08f3d12 · verified · 1 Parent(s): 85abd3d

Update app.py

Files changed (1)
  1. app.py +35 -133
app.py CHANGED
@@ -1,51 +1,31 @@
- from fastapi import FastAPI, UploadFile, File, Form
- from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
- import fitz  # PyMuPDF
- import docx
- import pptx
- import openpyxl
- import re
- import nltk
- import torch
  from nltk.tokenize import sent_tokenize
- from gtts import gTTS
  from fpdf import FPDF
- import tempfile
- import os
- import easyocr
- import datetime
- import hashlib

- # Initialize
  nltk.download('punkt', quiet=True)
- app = FastAPI()

- # Load Summarizer Model
  MODEL_NAME = "facebook/bart-large-cnn"
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
- model.eval()
  summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)

- # Load OCR Reader
- reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
-
- # Cache
  summary_cache = {}

- # --- Helper Functions ---
-
- def clean_text(text: str) -> str:
      text = re.sub(r'\s+', ' ', text)
      text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
      text = re.sub(r'\[.*?\]|\(.*?\)', '', text)
      text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)
      return text.strip()

- def extract_text(file_path: str, file_extension: str):
      try:
-         if file_extension == "pdf":
              with fitz.open(file_path) as doc:
                  text = "\n".join(page.get_text("text") for page in doc)
              if len(text.strip()) < 50:
@@ -55,146 +35,68 @@ def extract_text(file_path: str, file_extension: str):
                  ocr_result = reader.readtext(temp_img.name, detail=0)
                  os.unlink(temp_img.name)
                  text = "\n".join(ocr_result) if ocr_result else text
-             return clean_text(text), ""
-
-         elif file_extension == "docx":
              doc = docx.Document(file_path)
-             return clean_text("\n".join(p.text for p in doc.paragraphs)), ""
-
-         elif file_extension == "pptx":
              prs = pptx.Presentation(file_path)
-             text = [shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text")]
-             return clean_text("\n".join(text)), ""
-
-         elif file_extension == "xlsx":
              wb = openpyxl.load_workbook(file_path, read_only=True)
-             text = [" ".join(str(cell) for cell in row if cell) for sheet in wb.sheetnames for row in wb[sheet].iter_rows(values_only=True)]
-             return clean_text("\n".join(text)), ""
-
-         elif file_extension in ["jpg", "jpeg", "png"]:
-             ocr_result = reader.readtext(file_path, detail=0)
-             return clean_text("\n".join(ocr_result)), ""
-
-         return "", "Unsupported file format"
      except Exception as e:
-         return "", f"Error reading {file_extension.upper()} file: {str(e)}"
-
- def chunk_text(text: str, max_tokens: int = 950):
-     try:
-         sentences = sent_tokenize(text)
-     except:
-         words = text.split()
-         sentences = [' '.join(words[i:i+20]) for i in range(0, len(words), 20)]

-     chunks = []
-     current_chunk = ""
      for sentence in sentences:
-         token_length = len(tokenizer.encode(current_chunk + " " + sentence))
-         if token_length <= max_tokens:
              current_chunk += " " + sentence
          else:
              chunks.append(current_chunk.strip())
              current_chunk = sentence
-
      if current_chunk:
          chunks.append(current_chunk.strip())
-
      return chunks

- def generate_summary(text: str, length: str = "medium") -> str:
      cache_key = hashlib.md5((text + length).encode()).hexdigest()
      if cache_key in summary_cache:
          return summary_cache[cache_key]

-     length_params = {
-         "short": {"max_length": 80, "min_length": 30},
-         "medium": {"max_length": 200, "min_length": 80},
-         "long": {"max_length": 300, "min_length": 210}
-     }
-     chunks = chunk_text(text)
-
-     summaries = summarizer(
-         chunks,
-         max_length=length_params[length]["max_length"],
-         min_length=length_params[length]["min_length"],
-         do_sample=False,
-         truncation=True,
-         no_repeat_ngram_size=2,
-         num_beams=2,
-         early_stopping=True
-     )
-     summary_texts = [s['summary_text'] for s in summaries]
-
-     final_summary = " ".join(summary_texts)
-     final_summary = ". ".join(s.strip().capitalize() for s in final_summary.split(". ") if s.strip())
-     final_summary = final_summary if len(final_summary) > 25 else "Summary too short - document may be too brief"

      summary_cache[cache_key] = final_summary
      return final_summary

- def text_to_speech(text: str):
      try:
          tts = gTTS(text)
          temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
          tts.save(temp_audio.name)
          return temp_audio.name
-     except Exception:
          return ""

- def create_pdf(summary: str, original_filename: str):
      try:
          pdf = FPDF()
          pdf.add_page()
          pdf.set_font("Arial", 'B', 16)
-         pdf.cell(200, 10, txt="Document Summary", ln=1, align='C')
          pdf.set_font("Arial", size=12)
-         pdf.cell(200, 10, txt=f"Original file: {original_filename}", ln=1)
-         pdf.cell(200, 10, txt=f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1)
-         pdf.ln(10)
-         pdf.multi_cell(0, 10, txt=summary)
          temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
          pdf.output(temp_pdf.name)
          return temp_pdf.name
-     except Exception:
          return ""
-
- # --- API Endpoints ---
-
- @app.post("/summarize/")
- async def summarize_api(file: UploadFile = File(...), length: str = Form("medium")):
-     contents = await file.read()
-     with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
-         tmp_file.write(contents)
-         tmp_path = tmp_file.name
-
-     file_ext = tmp_path.split('.')[-1].lower()
-     text, error = extract_text(tmp_path, file_ext)
-
-     if error:
-         return JSONResponse({"detail": error}, status_code=400)
-
-     if not text or len(text.split()) < 30:
-         return JSONResponse({"detail": "Document too short to summarize"}, status_code=400)
-
-     summary = generate_summary(text, length)
-     audio_path = text_to_speech(summary)
-     pdf_path = create_pdf(summary, file.filename)
-
-     response = {"summary": summary}
-     if audio_path:
-         response["audioUrl"] = f"/files/{os.path.basename(audio_path)}"
-     if pdf_path:
-         response["pdfUrl"] = f"/files/{os.path.basename(pdf_path)}"
-
-     return JSONResponse(response)
-
- @app.get("/files/{file_name}")
- async def serve_file(file_name: str):
-     path = os.path.join(tempfile.gettempdir(), file_name)
-     if os.path.exists(path):
-         return FileResponse(path)
-     return JSONResponse({"error": "File not found"}, status_code=404)
-
- @app.get("/")
- def home():
-     return RedirectResponse(url="/")
 
+ # app_logic.py
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+ import fitz, docx, pptx, openpyxl, re, nltk, tempfile, os, easyocr, hashlib, datetime
  from nltk.tokenize import sent_tokenize
  from fpdf import FPDF
+ from gtts import gTTS

  nltk.download('punkt', quiet=True)

+ # Load once
  MODEL_NAME = "facebook/bart-large-cnn"
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
  summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=-1, batch_size=4)
+ reader = easyocr.Reader(['en'], gpu=False)

  summary_cache = {}

+ def clean_text(text):
      text = re.sub(r'\s+', ' ', text)
      text = re.sub(r'\u2022\s*|\d\.\s+', '', text)
      text = re.sub(r'\[.*?\]|\(.*?\)', '', text)
      text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)
      return text.strip()

+ def extract_text(file_path, file_extension):
      try:
+         if file_extension in ["pdf"]:
              with fitz.open(file_path) as doc:
                  text = "\n".join(page.get_text("text") for page in doc)
              if len(text.strip()) < 50:
                  ocr_result = reader.readtext(temp_img.name, detail=0)
                  os.unlink(temp_img.name)
                  text = "\n".join(ocr_result) if ocr_result else text
+         elif file_extension in ["docx"]:
              doc = docx.Document(file_path)
+             text = "\n".join(p.text for p in doc.paragraphs)
+         elif file_extension in ["pptx"]:
              prs = pptx.Presentation(file_path)
+             text = "\n".join(shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text"))
+         elif file_extension in ["xlsx"]:
              wb = openpyxl.load_workbook(file_path, read_only=True)
+             text = "\n".join([" ".join(str(cell) for cell in row if cell) for sheet in wb.sheetnames for row in wb[sheet].iter_rows(values_only=True)])
+         else:
+             return "", "Unsupported file type"
+
+         return clean_text(text), ""
      except Exception as e:
+         return "", f"Extraction error: {e}"

+ def chunk_text(text, max_tokens=950):
+     sentences = sent_tokenize(text)
+     chunks, current_chunk = [], ""
      for sentence in sentences:
+         if len(tokenizer.encode(current_chunk + " " + sentence)) <= max_tokens:
              current_chunk += " " + sentence
          else:
              chunks.append(current_chunk.strip())
              current_chunk = sentence
      if current_chunk:
          chunks.append(current_chunk.strip())
      return chunks

+ def generate_summary(text, length="medium"):
      cache_key = hashlib.md5((text + length).encode()).hexdigest()
      if cache_key in summary_cache:
          return summary_cache[cache_key]

+     params = {"short": (30, 80), "medium": (80, 200), "long": (210, 300)}[length]
+     min_len, max_len = params

+     chunks = chunk_text(text)
+     summaries = summarizer(chunks, max_length=max_len, min_length=min_len, do_sample=False)
+     final_summary = " ".join(s['summary_text'] for s in summaries)
      summary_cache[cache_key] = final_summary
      return final_summary

+ def text_to_speech(text):
      try:
          tts = gTTS(text)
          temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
          tts.save(temp_audio.name)
          return temp_audio.name
+     except:
          return ""

+ def create_pdf(summary, original_filename):
      try:
          pdf = FPDF()
          pdf.add_page()
          pdf.set_font("Arial", 'B', 16)
+         pdf.cell(200, 10, "Summary", ln=True, align='C')
          pdf.set_font("Arial", size=12)
+         pdf.multi_cell(0, 10, summary)
          temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
          pdf.output(temp_pdf.name)
          return temp_pdf.name
+     except:
          return ""