Praful Nayak committed on
Commit
b0bb1a1
·
1 Parent(s): 8cf911d

Deploy Flask Summarization App

Browse files
Files changed (2) hide show
  1. Dockerfile +2 -9
  2. app.py +20 -10
Dockerfile CHANGED
@@ -1,7 +1,5 @@
1
- # Use a lightweight Python image
2
  FROM python:3.9-slim
3
 
4
- # Install system dependencies as root
5
  RUN apt-get update && apt-get install -y \
6
  libpng-dev \
7
  libjpeg-dev \
@@ -10,24 +8,19 @@ RUN apt-get update && apt-get install -y \
10
  libtesseract-dev \
11
  && rm -rf /var/lib/apt/lists/*
12
 
13
- # Create a non-root user and set environment
14
  RUN useradd -m -u 1000 user
15
  USER user
16
  ENV PATH="/home/user/.local/bin:$PATH"
17
 
18
- # Set working directory
19
  WORKDIR /app
20
 
21
- # Copy requirements file and install Python dependencies as user
22
  COPY --chown=user:user requirements.txt requirements.txt
23
  RUN pip install --no-cache-dir --upgrade pip && \
24
  pip install --no-cache-dir -r requirements.txt
25
 
26
- # Copy application files
27
  COPY --chown=user:user . /app
28
 
29
- # Expose the necessary port
30
  EXPOSE 7860
31
 
32
- # Run the Flask app using Gunicorn with a higher timeout
33
- CMD ["gunicorn", "--workers", "2", "--timeout", "300", "--bind", "0.0.0.0:7860", "app:app"]
 
 
1
  FROM python:3.9-slim
2
 
 
3
  RUN apt-get update && apt-get install -y \
4
  libpng-dev \
5
  libjpeg-dev \
 
8
  libtesseract-dev \
9
  && rm -rf /var/lib/apt/lists/*
10
 
 
11
  RUN useradd -m -u 1000 user
12
  USER user
13
  ENV PATH="/home/user/.local/bin:$PATH"
14
 
 
15
  WORKDIR /app
16
 
 
17
  COPY --chown=user:user requirements.txt requirements.txt
18
  RUN pip install --no-cache-dir --upgrade pip && \
19
  pip install --no-cache-dir -r requirements.txt
20
 
 
21
  COPY --chown=user:user . /app
22
 
 
23
  EXPOSE 7860
24
 
25
+ # Increase timeout and reduce workers
26
+ CMD ["gunicorn", "--workers", "1", "--timeout", "600", "--bind", "0.0.0.0:7860", "app:app"]
app.py CHANGED
@@ -13,14 +13,14 @@ app = Flask(__name__)
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
- # Load Pegasus Model
17
  logger.info("Loading Pegasus model and tokenizer...")
18
  tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
19
- model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
20
  logger.info("Model loaded successfully.")
21
 
22
- # Extract text from PDF with page limit and timeout handling
23
- def extract_text_from_pdf(file_path, max_pages=10):
24
  text = ""
25
  try:
26
  with pdfplumber.open(file_path) as pdf:
@@ -32,8 +32,12 @@ def extract_text_from_pdf(file_path, max_pages=10):
32
  extracted = page.extract_text()
33
  if extracted:
34
  text += extracted + "\n"
 
 
 
 
35
  except Exception as e:
36
- logger.warning(f"Error extracting text from page {i+1}: {e}")
37
  continue
38
  except Exception as e:
39
  logger.error(f"Failed to process PDF {file_path}: {e}")
@@ -51,17 +55,23 @@ def extract_text_from_image(file_path):
51
  logger.error(f"Failed to process image {file_path}: {e}")
52
  return ""
53
 
54
- # Summarize text using Pegasus with truncation
55
  def summarize_text(text, max_input_length=512, max_output_length=150):
56
  try:
57
  logger.info("Summarizing text...")
58
- inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_input_length)
 
 
 
 
 
59
  summary_ids = model.generate(
60
  inputs["input_ids"],
61
  max_length=max_output_length,
62
  min_length=30,
63
- num_beams=4,
64
- early_stopping=True
 
65
  )
66
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
67
  logger.info("Summarization completed.")
@@ -88,7 +98,7 @@ def summarize_document():
88
  logger.info(f"File saved to {file_path}")
89
 
90
  if filename.lower().endswith('.pdf'):
91
- text = extract_text_from_pdf(file_path, max_pages=5)
92
  elif filename.lower().endswith(('.png', '.jpeg', '.jpg')):
93
  text = extract_text_from_image(file_path)
94
  else:
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
+ # Load Pegasus Model (load once globally)
17
  logger.info("Loading Pegasus model and tokenizer...")
18
  tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
19
+ model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum").to("cpu") # Force CPU to manage memory
20
  logger.info("Model loaded successfully.")
21
 
22
+ # Extract text from PDF with page limit
23
+ def extract_text_from_pdf(file_path, max_pages=5):
24
  text = ""
25
  try:
26
  with pdfplumber.open(file_path) as pdf:
 
32
  extracted = page.extract_text()
33
  if extracted:
34
  text += extracted + "\n"
35
+ else:
36
+ logger.info(f"No text on page {i+1}, attempting OCR...")
37
+ image = page.to_image().original
38
+ text += pytesseract.image_to_string(image) + "\n"
39
  except Exception as e:
40
+ logger.warning(f"Error processing page {i+1}: {e}")
41
  continue
42
  except Exception as e:
43
  logger.error(f"Failed to process PDF {file_path}: {e}")
 
55
  logger.error(f"Failed to process image {file_path}: {e}")
56
  return ""
57
 
58
+ # Summarize text with chunking for large inputs
59
  def summarize_text(text, max_input_length=512, max_output_length=150):
60
  try:
61
  logger.info("Summarizing text...")
62
+ # Tokenize and truncate to max_input_length
63
+ inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_input_length, padding=True)
64
+ input_length = inputs["input_ids"].shape[1]
65
+ logger.info(f"Input length: {input_length} tokens")
66
+
67
+ # Adjust generation params for efficiency
68
  summary_ids = model.generate(
69
  inputs["input_ids"],
70
  max_length=max_output_length,
71
  min_length=30,
72
+ num_beams=2, # Reduce beams for speedup
73
+ early_stopping=True,
74
+ length_penalty=1.0, # Encourage shorter outputs
75
  )
76
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
77
  logger.info("Summarization completed.")
 
98
  logger.info(f"File saved to {file_path}")
99
 
100
  if filename.lower().endswith('.pdf'):
101
+ text = extract_text_from_pdf(file_path, max_pages=2) # Reduce to 2 pages
102
  elif filename.lower().endswith(('.png', '.jpeg', '.jpg')):
103
  text = extract_text_from_image(file_path)
104
  else: