Spaces:

mike23415
/

playwebit-t5-api

Sleeping

App Files Files Community

mike23415 committed on Mar 31

Commit

44d7238

verified ·

1 Parent(s): 53425a8

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -49

app.py CHANGED Viewed

@@ -9,31 +9,26 @@ from docx import Document
 from pptx import Presentation
 import nltk
 import string
 app = Flask(__name__)
 CORS(app)  # Enable CORS for all routes
-# Set NLTK data path to a directory where we have write access
 nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
 os.makedirs(nltk_data_dir, exist_ok=True)
 nltk.data.path.append(nltk_data_dir)
-# Download necessary NLTK data to our custom directory
 try:
-    nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
-    nltk.download('stopwords', download_dir=nltk_data_dir, quiet=True)
-    from nltk.corpus import stopwords
-    from nltk.tokenize import sent_tokenize, word_tokenize
-    from nltk.probability import FreqDist
-    from heapq import nlargest
-    from collections import defaultdict
-except Exception as e:
-    print(f"NLTK download error: {e}")
-    # Fallback function if NLTK data cannot be downloaded
-    def simple_summarize(text, max_chars=1000):
-        # Simple summary: first few paragraphs
-        paragraphs = text.split('\n\n')
-        return ' '.join(paragraphs[:3])[:max_chars]
 # Allowed file extensions
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
@@ -77,15 +72,12 @@ def summarize():
         # Generate a summary of the text
         try:
-            # Try using the advanced summarizer
-            if 'stopwords' in globals() and 'sent_tokenize' in globals():
-                summary = generate_summary(text)
-            else:
-                # Fallback to simple summarizer if NLTK isn't available
-                summary = simple_summarize(text)
         except Exception as e:
             print(f"Summarization error: {e}")
-            # Another fallback if something goes wrong with summarization
             summary = text[:1000] + "..." if len(text) > 1000 else text
         # Include metadata
@@ -129,85 +121,63 @@ def extract_text_from_txt(file_content):
     return clean_text(text)
 def clean_text(text):
-    # Remove excess whitespace
     text = re.sub(r'\s+', ' ', text)
-    # Remove special characters but keep sentence punctuation
     text = re.sub(r'[^\w\s\.\,\!\?\:\;]', '', text)
     return text.strip()
 def generate_summary(text, sentence_count=5):
-    # If text is very short, return it as is
     if len(text.split()) < 100:
         return text
-    # Tokenize the text into sentences
     sentences = sent_tokenize(text)
-    # If too few sentences, return all
     if len(sentences) <= sentence_count:
         return text
-    # Remove punctuation and convert to lowercase for processing
     clean_sentences = [s.translate(str.maketrans('', '', string.punctuation)).lower() for s in sentences]
-    # Get stop words
     stop_words = set(stopwords.words('english'))
-    # Calculate word frequencies excluding stop words
     word_frequencies = defaultdict(int)
     for sentence in clean_sentences:
         for word in word_tokenize(sentence):
             if word not in stop_words:
                 word_frequencies[word] += 1
-    # Normalize frequencies
     max_frequency = max(word_frequencies.values()) if word_frequencies else 1
     for word in word_frequencies:
         word_frequencies[word] = word_frequencies[word] / max_frequency
-    # Calculate sentence scores based on word frequencies
     sentence_scores = defaultdict(int)
     for i, sentence in enumerate(clean_sentences):
         for word in word_tokenize(sentence):
             if word in word_frequencies:
                 sentence_scores[i] += word_frequencies[word]
-    # Get top sentences
     top_indices = nlargest(sentence_count, sentence_scores, key=sentence_scores.get)
-    top_indices.sort()  # Sort to maintain original order
-    # Combine top sentences to form summary
-    summary = ' '.join([sentences[i] for i in top_indices])
-    return summary
 def simple_summarize(text, max_chars=1000):
-    # Simple summary: first few paragraphs plus additional key sentences
     paragraphs = text.split('\n\n')
-    # Take first 2-3 paragraphs as a base summary
     base_summary = ' '.join(paragraphs[:3])
-    # If we have a very short text, just return it
     if len(text) <= max_chars:
         return text
-    # If base summary is too short, add more content up to max_chars
     if len(base_summary) < max_chars:
         remaining_text = ' '.join(paragraphs[3:])
         sentences = re.split(r'(?<=[.!?])\s+', remaining_text)
         for sentence in sentences:
             if len(base_summary) + len(sentence) + 1 <= max_chars:
                 base_summary += ' ' + sentence
             else:
                 break
-    # Truncate if still too long
     if len(base_summary) > max_chars:
         base_summary = base_summary[:max_chars] + "..."
     return base_summary
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860, debug=True)

 from pptx import Presentation
 import nltk
 import string
+from nltk.corpus import stopwords
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.probability import FreqDist
+from heapq import nlargest
+from collections import defaultdict
 app = Flask(__name__)
 CORS(app)  # Enable CORS for all routes
+# Set NLTK data path to a directory included in the project
 nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
 os.makedirs(nltk_data_dir, exist_ok=True)
 nltk.data.path.append(nltk_data_dir)
+# Ensure NLTK data is available (pre-downloaded)
 try:
+    stopwords.words('english')  # Test if stopwords are accessible
+except LookupError:
+    print("NLTK data not found. Please ensure 'punkt' and 'stopwords' are pre-downloaded in 'nltk_data'.")
+    # Fallback will be used if this fails
 # Allowed file extensions
 ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "txt"}
         # Generate a summary of the text
         try:
+            summary = generate_summary(text)
+        except LookupError as e:
+            print(f"NLTK summarization failed: {e}. Using fallback.")
+            summary = simple_summarize(text)
         except Exception as e:
             print(f"Summarization error: {e}")
             summary = text[:1000] + "..." if len(text) > 1000 else text
         # Include metadata
     return clean_text(text)
 def clean_text(text):
     """Normalize text: drop special characters, then collapse whitespace.

     Word characters, whitespace and the sentence punctuation ``. , ! ? : ;``
     are kept; every other character is removed.

     Args:
         text: The raw string to normalize.

     Returns:
         The normalized string with single spaces between tokens and no
         leading or trailing whitespace.
     """
     # Strip disallowed characters first. Removing a symbol that sat between
     # two spaces would otherwise leave a double space behind after the
     # whitespace pass had already run.
     text = re.sub(r'[^\w\s\.\,\!\?\:\;]', '', text)
     text = re.sub(r'\s+', ' ', text)
     return text.strip()
 def generate_summary(text, sentence_count=5):
     """Build an extractive summary from the highest-scoring sentences.

     Each sentence is scored by summing the normalized frequencies of its
     non-stop-word tokens. The ``sentence_count`` best sentences are then
     joined with single spaces in their original order.

     Args:
         text: The text to summarize.
         sentence_count: Number of sentences to keep in the summary.

     Returns:
         ``text`` unchanged when it has fewer than 100 whitespace-separated
         words or no more than ``sentence_count`` sentences; otherwise the
         selected sentences joined by spaces.

     Raises:
         LookupError: Propagated from NLTK when its tokenizer or stop-word
             data cannot be found.
     """
     if len(text.split()) < 100:
         return text

     sentences = sent_tokenize(text)
     if len(sentences) <= sentence_count:
         return text

     # Build the punctuation-stripping table once rather than per sentence.
     strip_punctuation = str.maketrans('', '', string.punctuation)
     stop_words = set(stopwords.words('english'))

     # Tokenize every normalized sentence a single time; the same tokens
     # feed both the frequency count and the sentence scoring below.
     content_tokens = [
         [
             word
             for word in word_tokenize(sentence.translate(strip_punctuation).lower())
             if word not in stop_words
         ]
         for sentence in sentences
     ]

     word_frequencies = defaultdict(int)
     for tokens in content_tokens:
         for word in tokens:
             word_frequencies[word] += 1

     max_frequency = max(word_frequencies.values()) if word_frequencies else 1

     # Score every sentence, including those with no content words (score 0),
     # so the summary never comes back empty or short just because few
     # sentences contained a scorable word.
     sentence_scores = []
     for tokens in content_tokens:
         score = 0
         for word in tokens:
             score += word_frequencies[word] / max_frequency
         sentence_scores.append(score)

     # Ties are resolved in favour of earlier sentences.
     top_indices = nlargest(
         sentence_count,
         range(len(sentences)),
         key=sentence_scores.__getitem__,
     )
     top_indices.sort()  # restore document order
     return ' '.join(sentences[i] for i in top_indices)
 def simple_summarize(text, max_chars=1000):
     """Summarize by taking leading paragraphs, topped up with sentences.

     The first three blank-line-separated paragraphs form the base summary.
     While there is room, whole sentences from the remaining paragraphs are
     appended, stopping at the first sentence that does not fit.

     Args:
         text: The text to summarize.
         max_chars: Length budget for the summary body.

     Returns:
         ``text`` unchanged when it is at most ``max_chars`` long. Otherwise
         the assembled summary; if the base paragraphs alone exceed the
         budget they are cut to ``max_chars`` characters and ``"..."`` is
         appended (so the result may be three characters over budget).
     """
     # Short input needs no work at all.
     if len(text) <= max_chars:
         return text

     paragraphs = text.split('\n\n')
     base_summary = ' '.join(paragraphs[:3])

     if len(base_summary) < max_chars:
         remaining_text = ' '.join(paragraphs[3:])
         for sentence in re.split(r'(?<=[.!?])\s+', remaining_text):
             # re.split yields '' for an empty remainder or trailing
             # whitespace; appending it would only add a stray space.
             if not sentence:
                 continue
             if len(base_summary) + len(sentence) + 1 > max_chars:
                 break
             base_summary += ' ' + sentence

     if len(base_summary) > max_chars:
         base_summary = base_summary[:max_chars] + "..."
     return base_summary
 if __name__ == "__main__":
+    # For local testing only
+    app.run(host="0.0.0.0", port=7860)