Update app.py

app.py CHANGED
@@ -6,7 +6,7 @@ import logging
 import time
 from datetime import datetime
 import hashlib
-
+import glob
 app = Flask(__name__)
 
 # Setup logging
@@ -17,6 +17,8 @@ logger = logging.getLogger(__name__)
 loading_complete = False
 last_update_time = time.time()
 
+
+
 def load_feeds_in_background():
     global loading_complete, last_update_time
     try:
@@ -36,7 +38,8 @@ def load_feeds_in_background():
 def index():
     global loading_complete, last_update_time
 
-
+    # Check if any DB exists and initialize if none found
+    db_exists = any(os.path.exists(db_path) for db_path in glob.glob("chroma_db*"))
     if not db_exists:
         loading_complete = False
         logger.info("Downloading Chroma DB from Hugging Face Hub...")
@@ -48,11 +51,34 @@ def index():
         loading_complete = True
 
     try:
-
-
-
+        # Aggregate documents and metadata from all Chroma DBs
+        all_docs = {'documents': [], 'metadatas': []}
+        seen_ids = set()  # For deduplication across all databases
+
+        # Iterate over all folders matching "chroma_db*"
+        for db_path in glob.glob("chroma_db*"):
+            if not os.path.isdir(db_path):
+                continue
+            # Initialize a Chroma instance for each database folder
+            temp_vector_db = Chroma(
+                persist_directory=db_path,
+                embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
+                collection_name="news_articles"
+            )
+            # Retrieve documents and metadata
+            db_data = temp_vector_db.get(include=['documents', 'metadatas'])
+            if db_data.get('documents') and db_data.get('metadatas'):
+                for doc, meta in zip(db_data['documents'], db_data['metadatas']):
+                    doc_id = f"{meta.get('title', 'No Title')}|{meta.get('link', '')}|{meta.get('published', 'Unknown Date')}"
+                    if doc_id not in seen_ids:
+                        seen_ids.add(doc_id)
+                        all_docs['documents'].append(doc)
+                        all_docs['metadatas'].append(meta)
+
+        total_docs = len(all_docs['documents'])
+        logger.info(f"Total articles across all DBs: {total_docs}")
         if not all_docs.get('metadatas'):
-            logger.info("No articles in DB yet")
+            logger.info("No articles in any DB yet")
             return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
 
         # Process and categorize articles with strict deduplication
@@ -120,6 +146,8 @@ def index():
         logger.error(f"Error retrieving articles: {e}")
         return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
 
+
+
 @app.route('/search', methods=['POST'])
 def search():
     query = request.form.get('search')
@@ -185,7 +213,27 @@ def check_loading():
 def get_updates():
     global last_update_time
     try:
-
+        # Aggregate documents and metadata from all Chroma DBs
+        all_docs = {'documents': [], 'metadatas': []}
+        seen_ids = set()
+
+        for db_path in glob.glob("chroma_db*"):
+            if not os.path.isdir(db_path):
+                continue
+            temp_vector_db = Chroma(
+                persist_directory=db_path,
+                embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
+                collection_name="news_articles"
+            )
+            db_data = temp_vector_db.get(include=['documents', 'metadatas'])
+            if db_data.get('documents') and db_data.get('metadatas'):
+                for doc, meta in zip(db_data['documents'], db_data['metadatas']):
+                    doc_id = f"{meta.get('title', 'No Title')}|{meta.get('link', '')}|{meta.get('published', 'Unknown Date')}"
+                    if doc_id not in seen_ids:
+                        seen_ids.add(doc_id)
+                        all_docs['documents'].append(doc)
+                        all_docs['metadatas'].append(meta)
+
         if not all_docs.get('metadatas'):
             return jsonify({"articles": [], "last_update": last_update_time})
 
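The aggregation block introduced here is duplicated verbatim in index() and get_updates(). A follow-up commit could factor it into a shared helper; below is a minimal sketch of that refactor, assuming langchain_community-style imports (this diff does not show the app's actual import lines) and a hypothetical aggregate_chroma_docs name:

import glob
import os

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Build the embedding model once; the diff recreates it for every DB on every request.
EMBEDDINGS = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


def aggregate_chroma_docs(pattern="chroma_db*", collection_name="news_articles"):
    # Hypothetical helper mirroring the block duplicated in index() and
    # get_updates(): collect documents/metadata from every matching Chroma
    # folder, deduplicated on a title|link|published key.
    all_docs = {'documents': [], 'metadatas': []}
    seen_ids = set()
    for db_path in glob.glob(pattern):
        if not os.path.isdir(db_path):
            continue  # skip stray files that happen to match the pattern
        db = Chroma(
            persist_directory=db_path,
            embedding_function=EMBEDDINGS,
            collection_name=collection_name,
        )
        data = db.get(include=['documents', 'metadatas'])
        for doc, meta in zip(data.get('documents') or [], data.get('metadatas') or []):
            doc_id = f"{meta.get('title', 'No Title')}|{meta.get('link', '')}|{meta.get('published', 'Unknown Date')}"
            if doc_id not in seen_ids:
                seen_ids.add(doc_id)
                all_docs['documents'].append(doc)
                all_docs['metadatas'].append(meta)
    return all_docs

Both routes could then start with all_docs = aggregate_chroma_docs() and keep their empty-DB checks unchanged; constructing HuggingFaceEmbeddings once at module scope also avoids reloading the sentence-transformers weights for every database on every request.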
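One note on the new db_exists check: glob.glob only ever returns paths that currently exist, so the os.path.exists test inside any(...) is redundant, and an equivalent check is simply

db_exists = bool(glob.glob("chroma_db*"))  # glob only yields existing paths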