Praful Nayak committed on
Commit
b0bb1a1
·
1 Parent(s): 8cf911d

Deploy Flask Summarization App

Browse files
Files changed (2) hide show
  1. Dockerfile +2 -9
  2. app.py +20 -10
Dockerfile CHANGED
@@ -1,7 +1,5 @@
1
- # Use a lightweight Python image
2
  FROM python:3.9-slim
3
 
4
- # Install system dependencies as root
5
  RUN apt-get update && apt-get install -y \
6
  libpng-dev \
7
  libjpeg-dev \
@@ -10,24 +8,19 @@ RUN apt-get update && apt-get install -y \
10
  libtesseract-dev \
11
  && rm -rf /var/lib/apt/lists/*
12
 
13
- # Create a non-root user and set environment
14
  RUN useradd -m -u 1000 user
15
  USER user
16
  ENV PATH="/home/user/.local/bin:$PATH"
17
 
18
- # Set working directory
19
  WORKDIR /app
20
 
21
- # Copy requirements file and install Python dependencies as user
22
  COPY --chown=user:user requirements.txt requirements.txt
23
  RUN pip install --no-cache-dir --upgrade pip && \
24
  pip install --no-cache-dir -r requirements.txt
25
 
26
- # Copy application files
27
  COPY --chown=user:user . /app
28
 
29
- # Expose the necessary port
30
  EXPOSE 7860
31
 
32
- # Run the Flask app using Gunicorn with a higher timeout
33
- CMD ["gunicorn", "--workers", "2", "--timeout", "300", "--bind", "0.0.0.0:7860", "app:app"]
 
 
1
  FROM python:3.9-slim
2
 
 
3
  RUN apt-get update && apt-get install -y \
4
  libpng-dev \
5
  libjpeg-dev \
 
8
  libtesseract-dev \
9
  && rm -rf /var/lib/apt/lists/*
10
 
 
11
  RUN useradd -m -u 1000 user
12
  USER user
13
  ENV PATH="/home/user/.local/bin:$PATH"
14
 
 
15
  WORKDIR /app
16
 
 
17
  COPY --chown=user:user requirements.txt requirements.txt
18
  RUN pip install --no-cache-dir --upgrade pip && \
19
  pip install --no-cache-dir -r requirements.txt
20
 
 
21
  COPY --chown=user:user . /app
22
 
 
23
  EXPOSE 7860
24
 
25
+ # Increase timeout and reduce workers
26
+ CMD ["gunicorn", "--workers", "1", "--timeout", "600", "--bind", "0.0.0.0:7860", "app:app"]
app.py CHANGED
@@ -13,14 +13,14 @@ app = Flask(__name__)
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
- # Load Pegasus Model
17
  logger.info("Loading Pegasus model and tokenizer...")
18
  tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
19
- model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
20
  logger.info("Model loaded successfully.")
21
 
22
- # Extract text from PDF with page limit and timeout handling
23
- def extract_text_from_pdf(file_path, max_pages=10):
24
  text = ""
25
  try:
26
  with pdfplumber.open(file_path) as pdf:
@@ -32,8 +32,12 @@ def extract_text_from_pdf(file_path, max_pages=10):
32
  extracted = page.extract_text()
33
  if extracted:
34
  text += extracted + "\n"
 
 
 
 
35
  except Exception as e:
36
- logger.warning(f"Error extracting text from page {i+1}: {e}")
37
  continue
38
  except Exception as e:
39
  logger.error(f"Failed to process PDF {file_path}: {e}")
@@ -51,17 +55,23 @@ def extract_text_from_image(file_path):
51
  logger.error(f"Failed to process image {file_path}: {e}")
52
  return ""
53
 
54
- # Summarize text using Pegasus with truncation
55
  def summarize_text(text, max_input_length=512, max_output_length=150):
56
  try:
57
  logger.info("Summarizing text...")
58
- inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_input_length)
 
 
 
 
 
59
  summary_ids = model.generate(
60
  inputs["input_ids"],
61
  max_length=max_output_length,
62
  min_length=30,
63
- num_beams=4,
64
- early_stopping=True
 
65
  )
66
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
67
  logger.info("Summarization completed.")
@@ -88,7 +98,7 @@ def summarize_document():
88
  logger.info(f"File saved to {file_path}")
89
 
90
  if filename.lower().endswith('.pdf'):
91
- text = extract_text_from_pdf(file_path, max_pages=5)
92
  elif filename.lower().endswith(('.png', '.jpeg', '.jpg')):
93
  text = extract_text_from_image(file_path)
94
  else:
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
+ # Load Pegasus Model (load once globally)
17
  logger.info("Loading Pegasus model and tokenizer...")
18
  tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
19
+ model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum").to("cpu") # Force CPU to manage memory
20
  logger.info("Model loaded successfully.")
21
 
22
+ # Extract text from PDF with page limit
23
+ def extract_text_from_pdf(file_path, max_pages=5):
24
  text = ""
25
  try:
26
  with pdfplumber.open(file_path) as pdf:
 
32
  extracted = page.extract_text()
33
  if extracted:
34
  text += extracted + "\n"
35
+ else:
36
+ logger.info(f"No text on page {i+1}, attempting OCR...")
37
+ image = page.to_image().original
38
+ text += pytesseract.image_to_string(image) + "\n"
39
  except Exception as e:
40
+ logger.warning(f"Error processing page {i+1}: {e}")
41
  continue
42
  except Exception as e:
43
  logger.error(f"Failed to process PDF {file_path}: {e}")
 
55
  logger.error(f"Failed to process image {file_path}: {e}")
56
  return ""
57
 
58
+ # Summarize text with chunking for large inputs
59
  def summarize_text(text, max_input_length=512, max_output_length=150):
60
  try:
61
  logger.info("Summarizing text...")
62
+ # Tokenize and truncate to max_input_length
63
+ inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_input_length, padding=True)
64
+ input_length = inputs["input_ids"].shape[1]
65
+ logger.info(f"Input length: {input_length} tokens")
66
+
67
+ # Adjust generation params for efficiency
68
  summary_ids = model.generate(
69
  inputs["input_ids"],
70
  max_length=max_output_length,
71
  min_length=30,
72
+ num_beams=2, # Reduce beams for speedup
73
+ early_stopping=True,
74
+ length_penalty=1.0, # Encourage shorter outputs
75
  )
76
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
77
  logger.info("Summarization completed.")
 
98
  logger.info(f"File saved to {file_path}")
99
 
100
  if filename.lower().endswith('.pdf'):
101
+ text = extract_text_from_pdf(file_path, max_pages=2) # Reduce to 2 pages
102
  elif filename.lower().endswith(('.png', '.jpeg', '.jpg')):
103
  text = extract_text_from_image(file_path)
104
  else: