Spaces:

mike23415
/

playwebit-t5-api

Sleeping

App Files Files Community

mike23415 committed on Mar 31

Commit

53425a8

verified ·

1 Parent(s): b7db40a

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -14

app.py CHANGED Viewed

@@ -8,23 +8,32 @@ from PyPDF2 import PdfReader
 from docx import Document
 from pptx import Presentation
 import nltk
-from nltk.corpus import stopwords
-from nltk.tokenize import sent_tokenize, word_tokenize
-from nltk.probability import FreqDist
-from heapq import nlargest
-from collections import defaultdict
 import string
 app = Flask(__name__)
 CORS(app)  # Enable CORS for all routes
-# Download necessary NLTK data
 try:
-    nltk.data.find('tokenizers/punkt')
-    nltk.data.find('corpora/stopwords')
-except LookupError:
-    nltk.download('punkt')
-    nltk.download('stopwords')
 # Allowed file extensions
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
@@ -67,7 +76,17 @@ def summarize():
             text = extract_text_from_txt(file_content)
         # Generate a summary of the text
-        summary = generate_summary(text)
         # Include metadata
         word_count = len(text.split())
@@ -76,12 +95,12 @@ def summarize():
             "filename": filename,
             "summary": summary,
             "original_word_count": word_count,
-            "summary_word_count": len(summary.split())
         })
     except Exception as e:
         return jsonify({"error": f"Error processing file: {str(e)}"}), 500
-# Improved text extraction functions
 def extract_text_from_pdf(file_content):
     reader = PdfReader(io.BytesIO(file_content))
     text = ""
@@ -162,5 +181,33 @@ def generate_summary(text, sentence_count=5):
     return summary
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860, debug=True)

 from docx import Document
 from pptx import Presentation
 import nltk
 import string
 app = Flask(__name__)
 CORS(app)  # Enable CORS for all routes
+# Set NLTK data path to a directory where we have write access
+nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
+os.makedirs(nltk_data_dir, exist_ok=True)
+nltk.data.path.append(nltk_data_dir)
+# Download necessary NLTK data to our custom directory
 try:
+    nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
+    nltk.download('stopwords', download_dir=nltk_data_dir, quiet=True)
+    from nltk.corpus import stopwords
+    from nltk.tokenize import sent_tokenize, word_tokenize
+    from nltk.probability import FreqDist
+    from heapq import nlargest
+    from collections import defaultdict
+except Exception as e:
+    print(f"NLTK download error: {e}")
+    # Fallback function if NLTK data cannot be downloaded
+    def simple_summarize(text, max_chars=1000):
+        # Simple summary: first few paragraphs
+        paragraphs = text.split('\n\n')
+        return ' '.join(paragraphs[:3])[:max_chars]
 # Allowed file extensions
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
             text = extract_text_from_txt(file_content)
         # Generate a summary of the text
+        try:
+            # Try using the advanced summarizer
+            if 'stopwords' in globals() and 'sent_tokenize' in globals():
+                summary = generate_summary(text)
+            else:
+                # Fallback to simple summarizer if NLTK isn't available
+                summary = simple_summarize(text)
+        except Exception as e:
+            print(f"Summarization error: {e}")
+            # Another fallback if something goes wrong with summarization
+            summary = text[:1000] + "..." if len(text) > 1000 else text
         # Include metadata
         word_count = len(text.split())
             "filename": filename,
             "summary": summary,
             "original_word_count": word_count,
+            "summary_word_count": len(summary.split()) if summary else 0
         })
     except Exception as e:
         return jsonify({"error": f"Error processing file: {str(e)}"}), 500
+# Text extraction functions
 def extract_text_from_pdf(file_content):
     reader = PdfReader(io.BytesIO(file_content))
     text = ""
     return summary
+def simple_summarize(text, max_chars=1000):
+    # Simple summary: first few paragraphs plus additional key sentences
+    paragraphs = text.split('\n\n')
+    # Take first 2-3 paragraphs as a base summary
+    base_summary = ' '.join(paragraphs[:3])
+    # If we have a very short text, just return it
+    if len(text) <= max_chars:
+        return text
+    # If base summary is too short, add more content up to max_chars
+    if len(base_summary) < max_chars:
+        remaining_text = ' '.join(paragraphs[3:])
+        sentences = re.split(r'(?<=[.!?])\s+', remaining_text)
+        for sentence in sentences:
+            if len(base_summary) + len(sentence) + 1 <= max_chars:
+                base_summary += ' ' + sentence
+            else:
+                break
+    # Truncate if still too long
+    if len(base_summary) > max_chars:
+        base_summary = base_summary[:max_chars] + "..."
+    return base_summary
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860, debug=True)