Update app.py

app.py CHANGED
@@ -6,7 +6,7 @@ import logging
 import time
 from datetime import datetime
 import hashlib
-
+import glob
 app = Flask(__name__)
 
 # Setup logging
@@ -17,6 +17,8 @@ logger = logging.getLogger(__name__)
 loading_complete = False
 last_update_time = time.time()
 
+
+
 def load_feeds_in_background():
     global loading_complete, last_update_time
     try:
@@ -36,7 +38,8 @@ def load_feeds_in_background():
 def index():
     global loading_complete, last_update_time
 
-
+    # Check if any DB exists and initialize if none found
+    db_exists = any(os.path.exists(db_path) for db_path in glob.glob("chroma_db*"))
     if not db_exists:
         loading_complete = False
         logger.info("Downloading Chroma DB from Hugging Face Hub...")
@@ -48,11 +51,34 @@ def index():
         loading_complete = True
 
     try:
-
-
-
+        # Aggregate documents and metadata from all Chroma DBs
+        all_docs = {'documents': [], 'metadatas': []}
+        seen_ids = set()  # For deduplication across all databases
+
+        # Iterate over all folders matching "chroma_db*"
+        for db_path in glob.glob("chroma_db*"):
+            if not os.path.isdir(db_path):
+                continue
+            # Initialize a Chroma instance for each database folder
+            temp_vector_db = Chroma(
+                persist_directory=db_path,
+                embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
+                collection_name="news_articles"
+            )
+            # Retrieve documents and metadata
+            db_data = temp_vector_db.get(include=['documents', 'metadatas'])
+            if db_data.get('documents') and db_data.get('metadatas'):
+                for doc, meta in zip(db_data['documents'], db_data['metadatas']):
+                    doc_id = f"{meta.get('title', 'No Title')}|{meta.get('link', '')}|{meta.get('published', 'Unknown Date')}"
+                    if doc_id not in seen_ids:
+                        seen_ids.add(doc_id)
+                        all_docs['documents'].append(doc)
+                        all_docs['metadatas'].append(meta)
+
+        total_docs = len(all_docs['documents'])
+        logger.info(f"Total articles across all DBs: {total_docs}")
         if not all_docs.get('metadatas'):
-            logger.info("No articles in DB yet")
+            logger.info("No articles in any DB yet")
             return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
 
         # Process and categorize articles with strict deduplication
@@ -120,6 +146,8 @@ def index():
         logger.error(f"Error retrieving articles: {e}")
         return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
 
+
+
 @app.route('/search', methods=['POST'])
 def search():
     query = request.form.get('search')
@@ -185,7 +213,27 @@ def check_loading():
 def get_updates():
     global last_update_time
     try:
-
+        # Aggregate documents and metadata from all Chroma DBs
+        all_docs = {'documents': [], 'metadatas': []}
+        seen_ids = set()
+
+        for db_path in glob.glob("chroma_db*"):
+            if not os.path.isdir(db_path):
+                continue
+            temp_vector_db = Chroma(
+                persist_directory=db_path,
+                embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
+                collection_name="news_articles"
+            )
+            db_data = temp_vector_db.get(include=['documents', 'metadatas'])
+            if db_data.get('documents') and db_data.get('metadatas'):
+                for doc, meta in zip(db_data['documents'], db_data['metadatas']):
+                    doc_id = f"{meta.get('title', 'No Title')}|{meta.get('link', '')}|{meta.get('published', 'Unknown Date')}"
+                    if doc_id not in seen_ids:
+                        seen_ids.add(doc_id)
+                        all_docs['documents'].append(doc)
+                        all_docs['metadatas'].append(meta)
+
         if not all_docs.get('metadatas'):
             return jsonify({"articles": [], "last_update": last_update_time})
 
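The aggregation block introduced here is duplicated verbatim in index() and get_updates(). A follow-up commit could factor it into a shared helper; below is a minimal sketch of that refactor, assuming langchain_community-style imports (this diff does not show the app's actual import lines) and a hypothetical aggregate_chroma_docs name:

import glob
import os

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Build the embedding model once; the diff recreates it for every DB on every request.
EMBEDDINGS = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


def aggregate_chroma_docs(pattern="chroma_db*", collection_name="news_articles"):
    # Hypothetical helper mirroring the block duplicated in index() and
    # get_updates(): collect documents/metadata from every matching Chroma
    # folder, deduplicated on a title|link|published key.
    all_docs = {'documents': [], 'metadatas': []}
    seen_ids = set()
    for db_path in glob.glob(pattern):
        if not os.path.isdir(db_path):
            continue  # skip stray files that happen to match the pattern
        db = Chroma(
            persist_directory=db_path,
            embedding_function=EMBEDDINGS,
            collection_name=collection_name,
        )
        data = db.get(include=['documents', 'metadatas'])
        for doc, meta in zip(data.get('documents') or [], data.get('metadatas') or []):
            doc_id = f"{meta.get('title', 'No Title')}|{meta.get('link', '')}|{meta.get('published', 'Unknown Date')}"
            if doc_id not in seen_ids:
                seen_ids.add(doc_id)
                all_docs['documents'].append(doc)
                all_docs['metadatas'].append(meta)
    return all_docs

Both routes could then start with all_docs = aggregate_chroma_docs() and keep their empty-DB checks unchanged; constructing HuggingFaceEmbeddings once at module scope also avoids reloading the sentence-transformers weights for every database on every request.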
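One note on the new db_exists check: glob.glob only ever returns paths that currently exist, so the os.path.exists test inside any(...) is redundant, and an equivalent check is simply

db_exists = bool(glob.glob("chroma_db*"))  # glob only yields existing paths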