broadfield-dev committed (verified)
Commit 1176acb · 1 Parent(s): 1e7d9be

Update app.py

Files changed (1): app.py (+66 -94)
app.py CHANGED
@@ -2,14 +2,14 @@ import os
 import threading
 from flask import Flask, render_template, request, jsonify
 from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db, download_from_hf_hub, upload_to_hf_hub, clean_text
-from langchain.vectorstores import Chroma
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.docstore.document import Document
 import logging
 import time
 from datetime import datetime
 import hashlib
 import glob
+from langchain.vectorstores import Chroma
+from langchain.embeddings import HuggingFaceEmbeddings
+
 app = Flask(__name__)
 
 # Setup logging
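
Note: the langchain.vectorstores and langchain.embeddings import paths kept by this hunk are deprecated aliases in recent LangChain releases. If the Space is later bumped to a newer LangChain, the equivalents live in langchain_community; a minimal sketch, assuming the langchain-community package is installed (the commit itself keeps the legacy paths):

    # Sketch only: newer-LangChain equivalents of the imports above.
    # Assumes langchain-community is installed; not part of this commit.
    from langchain_community.vectorstores import Chroma
    from langchain_community.embeddings import HuggingFaceEmbeddings
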
@@ -17,11 +17,9 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 # Global flag to track background loading
-loading_complete = False
+loading_complete = True  # Start as True to allow initial rendering
 last_update_time = time.time()
 
-
-
 def load_feeds_in_background():
     global loading_complete, last_update_time
     try:
@@ -32,43 +30,25 @@ def load_feeds_in_background():
         last_update_time = time.time()
         logger.info("Background feed processing complete")
         upload_to_hf_hub()
-        loading_complete = True
     except Exception as e:
         logger.error(f"Error in background feed loading: {e}")
+    finally:
         loading_complete = True
 
-@app.route('/')
-def index():
-    global loading_complete, last_update_time
-
-    # Check if any DB exists and initialize if none found
-    db_exists = any(os.path.exists(db_path) for db_path in glob.glob("chroma_db*"))
-    if not db_exists:
-        loading_complete = False
-        logger.info("Downloading Chroma DB from Hugging Face Hub...")
-        download_from_hf_hub()
-        threading.Thread(target=load_feeds_in_background, daemon=True).start()
-    elif not loading_complete:
-        pass
-    else:
-        loading_complete = True
+def get_all_docs_from_dbs():
+    """Aggregate documents and metadata from all Chroma DB folders."""
+    all_docs = {'documents': [], 'metadatas': []}
+    seen_ids = set()
 
-    try:
-        # Aggregate documents and metadata from all Chroma DBs
-        all_docs = {'documents': [], 'metadatas': []}
-        seen_ids = set()  # For deduplication across all databases
-
-        # Iterate over all folders matching "chroma_db*"
-        for db_path in glob.glob("chroma_db*"):
-            if not os.path.isdir(db_path):
-                continue
-            # Initialize a Chroma instance for each database folder
+    for db_path in glob.glob("chroma_db*"):
+        if not os.path.isdir(db_path):
+            continue
+        try:
             temp_vector_db = Chroma(
                 persist_directory=db_path,
                 embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
                 collection_name="news_articles"
             )
-            # Retrieve documents and metadata
             db_data = temp_vector_db.get(include=['documents', 'metadatas'])
             if db_data.get('documents') and db_data.get('metadatas'):
                 for doc, meta in zip(db_data['documents'], db_data['metadatas']):
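
Note: the first change in this hunk moves the loading_complete reset out of the try body and into a finally: block, so the flag is restored even when feed processing or the Hub upload raises. A standalone sketch of the pattern with the feed work stubbed out (the real function also updates last_update_time and calls upload_to_hf_hub):

    # Minimal illustration of the try/except/finally pattern adopted here.
    loading_complete = False

    def load_feeds_in_background():
        global loading_complete
        try:
            raise RuntimeError("simulated feed failure")  # stand-in for fetch/process/upload
        except Exception as e:
            print(f"Error in background feed loading: {e}")
        finally:
            loading_complete = True  # reset on success and on failure

    load_feeds_in_background()
    assert loading_complete is True
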
@@ -77,14 +57,35 @@ def index():
                     seen_ids.add(doc_id)
                     all_docs['documents'].append(doc)
                     all_docs['metadatas'].append(meta)
+        except Exception as e:
+            logger.error(f"Error loading DB {db_path}: {e}")
+
+    return all_docs
+
+@app.route('/')
+def index():
+    global loading_complete, last_update_time
+
+    # Check if any DB exists; if not, download from Hugging Face
+    db_exists = any(os.path.exists(db_path) for db_path in glob.glob("chroma_db*"))
+    if not db_exists:
+        logger.info("No Chroma DB found, downloading from Hugging Face Hub...")
+        download_from_hf_hub()
+
+    # Start background RSS feed update
+    loading_complete = False
+    threading.Thread(target=load_feeds_in_background, daemon=True).start()
 
+    # Load existing data immediately
+    try:
+        all_docs = get_all_docs_from_dbs()
         total_docs = len(all_docs['documents'])
-        logger.info(f"Total articles across all DBs: {total_docs}")
+        logger.info(f"Total articles across all DBs at startup: {total_docs}")
         if not all_docs.get('metadatas'):
             logger.info("No articles in any DB yet")
-            return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
+            return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
 
-        # Process and categorize articles with strict deduplication
+        # Process and categorize articles with deduplication
         enriched_articles = []
         seen_keys = set()
         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
@@ -95,12 +96,10 @@ def index():
             description = meta.get("original_description", "No Description")
             published = meta.get("published", "Unknown Date").strip()
 
-            # Clean and normalize all fields
             title = clean_text(title)
             link = clean_text(link)
             description = clean_text(description)
 
-            # Use a robust key with cleaned fields and description hash for deduplication
             description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
             key = f"{title}|{link}|{published}|{description_hash}"
             if key not in seen_keys:
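
Note: the deduplication key built here (and reused in /search and the update/category routes) combines the cleaned title, link, and date with a SHA-256 hash of the cleaned description, so articles that share a headline and link but differ in body text are still kept apart. A self-contained sketch of that key; the dedup_key helper name is illustrative, the field layout mirrors the diff:

    import hashlib

    def dedup_key(title: str, link: str, published: str, description: str) -> str:
        # Same layout as the diff: cleaned fields plus a hash of the description.
        description_hash = hashlib.sha256(description.encode("utf-8")).hexdigest()
        return f"{title}|{link}|{published}|{description_hash}"

    assert dedup_key("A", "http://x", "2024", "body one") != dedup_key("A", "http://x", "2024", "body two")
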
@@ -117,13 +116,9 @@ def index():
                    "published": published,
                    "image": meta.get("image", "svg"),
                })
-            else:
-                logger.debug(f"Duplicate found in retrieval: {key}")
 
-        # Sort by published date (stable sort)
         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
 
-        # Group by category and limit to 10 most recent per category
         categorized_articles = {}
         for article in enriched_articles:
             cat = article["category"]
@@ -131,25 +126,21 @@ def index():
                categorized_articles[cat] = []
            categorized_articles[cat].append(article)
 
-        # Sort categories alphabetically
         categorized_articles = dict(sorted(categorized_articles.items(), key=lambda x: x[0].lower()))
 
-        # Limit to 10 most recent per category and log top 2 for debugging
         for cat in categorized_articles:
             categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True)[:10]
             if len(categorized_articles[cat]) >= 2:
                 logger.debug(f"Category {cat} top 2: {categorized_articles[cat][0]['title']} | {categorized_articles[cat][1]['title']}")
 
-        logger.info(f"Displaying articles: {sum(len(articles) for articles in categorized_articles.values())} total")
+        logger.info(f"Displaying articles at startup: {sum(len(articles) for articles in categorized_articles.values())} total")
         return render_template("index.html",
                                categorized_articles=categorized_articles,
                                has_articles=True,
-                               loading=not loading_complete)
+                               loading=True)  # Show spinner while background task runs
     except Exception as e:
-        logger.error(f"Error retrieving articles: {e}")
-        return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
-
-
+        logger.error(f"Error retrieving articles at startup: {e}")
+        return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
 
 @app.route('/search', methods=['POST'])
 def search():
@@ -160,35 +151,39 @@ def search():
 
     try:
         logger.info(f"Searching for: {query}")
-        results = vector_db.similarity_search(query, k=10)
-        logger.info(f"Search returned {len(results)} results")
-
+        all_docs = get_all_docs_from_dbs()
+        if not all_docs.get('metadatas'):
+            return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})
+
+        # Simple keyword search for now (can be improved with similarity_search later)
         enriched_articles = []
         seen_keys = set()
-        for doc in results:
-            meta = doc.metadata
+        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
+            if not meta:
+                continue
             title = meta.get("title", "No Title")
             link = meta.get("link", "")
             description = meta.get("original_description", "No Description")
             published = meta.get("published", "Unknown Date").strip()
 
-            # Clean and normalize all fields
             title = clean_text(title)
             link = clean_text(link)
             description = clean_text(description)
 
-            description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
-            key = f"{title}|{link}|{published}|{description_hash}"
-            if key not in seen_keys:
-                seen_keys.add(key)
-                enriched_articles.append({
-                    "title": title,
-                    "link": link,
-                    "description": description,
-                    "category": meta.get("category", "Uncategorized"),
-                    "published": published,
-                    "image": meta.get("image", "svg"),
-                })
+            # Basic keyword match
+            if query.lower() in title or query.lower() in description:
+                description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
+                key = f"{title}|{link}|{published}|{description_hash}"
+                if key not in seen_keys:
+                    seen_keys.add(key)
+                    enriched_articles.append({
+                        "title": title,
+                        "link": link,
+                        "description": description,
+                        "category": meta.get("category", "Uncategorized"),
+                        "published": published,
+                        "image": meta.get("image", "svg"),
+                    })
 
         categorized_articles = {}
         for article in enriched_articles:
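
Note: in the new keyword filter the query is lower-cased but the cleaned title and description are not, so matching is effectively case-sensitive on the article side. A hedged sketch of a fully case-insensitive variant (not what this commit does); matches_query is a hypothetical helper name:

    def matches_query(query: str, title: str, description: str) -> bool:
        # Lower both sides; the committed code lowers only the query.
        q = query.lower()
        return q in title.lower() or q in description.lower()

    assert matches_query("Climate", "UN climate report", "...") is True
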
@@ -216,27 +211,7 @@ def check_loading():
 def get_updates():
     global last_update_time
     try:
-        # Aggregate documents and metadata from all Chroma DBs
-        all_docs = {'documents': [], 'metadatas': []}
-        seen_ids = set()
-
-        for db_path in glob.glob("chroma_db*"):
-            if not os.path.isdir(db_path):
-                continue
-            temp_vector_db = Chroma(
-                persist_directory=db_path,
-                embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
-                collection_name="news_articles"
-            )
-            db_data = temp_vector_db.get(include=['documents', 'metadatas'])
-            if db_data.get('documents') and db_data.get('metadatas'):
-                for doc, meta in zip(db_data['documents'], db_data['metadatas']):
-                    doc_id = f"{meta.get('title', 'No Title')}|{meta.get('link', '')}|{meta.get('published', 'Unknown Date')}"
-                    if doc_id not in seen_ids:
-                        seen_ids.add(doc_id)
-                        all_docs['documents'].append(doc)
-                        all_docs['metadatas'].append(meta)
-
+        all_docs = get_all_docs_from_dbs()
         if not all_docs.get('metadatas'):
             return jsonify({"articles": [], "last_update": last_update_time})
 
@@ -250,7 +225,6 @@ def get_updates():
             description = meta.get("original_description", "No Description")
             published = meta.get("published", "Unknown Date").strip()
 
-            # Clean and normalize all fields
             title = clean_text(title)
             link = clean_text(link)
             description = clean_text(description)
@@ -282,7 +256,6 @@ def get_updates():
             if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
                 categorized_articles[cat].append(article)
 
-        # Limit to 10 most recent per category with final deduplication
         for cat in categorized_articles:
             unique_articles = []
             seen_cat_keys = set()
@@ -301,7 +274,7 @@ def get_updates():
 @app.route('/get_all_articles/<category>')
 def get_all_articles(category):
     try:
-        all_docs = vector_db.get(include=['documents', 'metadatas'])
+        all_docs = get_all_docs_from_dbs()
         if not all_docs.get('metadatas'):
             return jsonify({"articles": [], "category": category})
 
@@ -315,7 +288,6 @@ def get_all_articles(category):
             description = meta.get("original_description", "No Description")
             published = meta.get("published", "Unknown Date").strip()
 
-            # Clean and normalize all fields
             title = clean_text(title)
             link = clean_text(link)
             description = clean_text(description)
@@ -342,10 +314,10 @@ def get_all_articles(category):
     except Exception as e:
         logger.error(f"Error fetching all articles for category {category}: {e}")
         return jsonify({"articles": [], "category": category}), 500
+
 @app.route('/card')
 def card_load():
     return render_template("card.html")
 
-
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860)
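
For a quick local check of this revision, the routes touched above can be exercised over HTTP once the app is running on port 7860. A sketch assuming the requests package and a locally reachable instance; the category name is illustrative, and since the /search form field name is not shown in this diff, only GET routes are hit here:

    import requests

    BASE = "http://localhost:7860"  # assumed local address; app.run binds 0.0.0.0:7860

    # Index renders whatever is already in the chroma_db* folders and starts the
    # background feed refresh in a daemon thread.
    print(requests.get(f"{BASE}/").status_code)

    # get_all_articles returns JSON; "Technology" is an illustrative category name.
    print(requests.get(f"{BASE}/get_all_articles/Technology").json().get("category"))

    # Card partial used by the front end.
    print(requests.get(f"{BASE}/card").status_code)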
 