broadfield-dev committed on
Commit
d434ac0
·
verified ·
1 Parent(s): 2fc1b66

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -7
app.py CHANGED
@@ -6,7 +6,7 @@ import logging
6
  import time
7
  from datetime import datetime
8
  import hashlib
9
-
10
  app = Flask(__name__)
11
 
12
  # Setup logging
@@ -17,6 +17,8 @@ logger = logging.getLogger(__name__)
17
  loading_complete = False
18
  last_update_time = time.time()
19
 
 
 
20
  def load_feeds_in_background():
21
  global loading_complete, last_update_time
22
  try:
@@ -36,7 +38,8 @@ def load_feeds_in_background():
36
  def index():
37
  global loading_complete, last_update_time
38
 
39
- db_exists = os.path.exists("chroma_db") and vector_db.get().get('documents')
 
40
  if not db_exists:
41
  loading_complete = False
42
  logger.info("Downloading Chroma DB from Hugging Face Hub...")
@@ -48,11 +51,34 @@ def index():
48
  loading_complete = True
49
 
50
  try:
51
- all_docs = vector_db.get(include=['documents', 'metadatas'])
52
- total_docs = len(all_docs['documents']) if all_docs.get('documents') else 0
53
- logger.info(f"Total articles in DB: {total_docs}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  if not all_docs.get('metadatas'):
55
- logger.info("No articles in DB yet")
56
  return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
57
 
58
  # Process and categorize articles with strict deduplication
@@ -120,6 +146,8 @@ def index():
120
  logger.error(f"Error retrieving articles: {e}")
121
  return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
122
 
 
 
123
  @app.route('/search', methods=['POST'])
124
  def search():
125
  query = request.form.get('search')
@@ -185,7 +213,27 @@ def check_loading():
185
  def get_updates():
186
  global last_update_time
187
  try:
188
- all_docs = vector_db.get(include=['documents', 'metadatas'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  if not all_docs.get('metadatas'):
190
  return jsonify({"articles": [], "last_update": last_update_time})
191
 
 
6
  import time
7
  from datetime import datetime
8
  import hashlib
9
+ import glob
10
  app = Flask(__name__)
11
 
12
  # Setup logging
 
17
  loading_complete = False
18
  last_update_time = time.time()
19
 
20
+
21
+
22
  def load_feeds_in_background():
23
  global loading_complete, last_update_time
24
  try:
 
38
  def index():
39
  global loading_complete, last_update_time
40
 
41
+ # Check if any DB exists and initialize if none found
42
+ db_exists = any(os.path.exists(db_path) for db_path in glob.glob("chroma_db*"))
43
  if not db_exists:
44
  loading_complete = False
45
  logger.info("Downloading Chroma DB from Hugging Face Hub...")
 
51
  loading_complete = True
52
 
53
  try:
54
+ # Aggregate documents and metadata from all Chroma DBs
55
+ all_docs = {'documents': [], 'metadatas': []}
56
+ seen_ids = set() # For deduplication across all databases
57
+
58
+ # Iterate over all folders matching "chroma_db*"
59
+ for db_path in glob.glob("chroma_db*"):
60
+ if not os.path.isdir(db_path):
61
+ continue
62
+ # Initialize a Chroma instance for each database folder
63
+ temp_vector_db = Chroma(
64
+ persist_directory=db_path,
65
+ embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
66
+ collection_name="news_articles"
67
+ )
68
+ # Retrieve documents and metadata
69
+ db_data = temp_vector_db.get(include=['documents', 'metadatas'])
70
+ if db_data.get('documents') and db_data.get('metadatas'):
71
+ for doc, meta in zip(db_data['documents'], db_data['metadatas']):
72
+ doc_id = f"{meta.get('title', 'No Title')}|{meta.get('link', '')}|{meta.get('published', 'Unknown Date')}"
73
+ if doc_id not in seen_ids:
74
+ seen_ids.add(doc_id)
75
+ all_docs['documents'].append(doc)
76
+ all_docs['metadatas'].append(meta)
77
+
78
+ total_docs = len(all_docs['documents'])
79
+ logger.info(f"Total articles across all DBs: {total_docs}")
80
  if not all_docs.get('metadatas'):
81
+ logger.info("No articles in any DB yet")
82
  return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
83
 
84
  # Process and categorize articles with strict deduplication
 
146
  logger.error(f"Error retrieving articles: {e}")
147
  return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
148
 
149
+
150
+
151
  @app.route('/search', methods=['POST'])
152
  def search():
153
  query = request.form.get('search')
 
213
  def get_updates():
214
  global last_update_time
215
  try:
216
+ # Aggregate documents and metadata from all Chroma DBs
217
+ all_docs = {'documents': [], 'metadatas': []}
218
+ seen_ids = set()
219
+
220
+ for db_path in glob.glob("chroma_db*"):
221
+ if not os.path.isdir(db_path):
222
+ continue
223
+ temp_vector_db = Chroma(
224
+ persist_directory=db_path,
225
+ embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
226
+ collection_name="news_articles"
227
+ )
228
+ db_data = temp_vector_db.get(include=['documents', 'metadatas'])
229
+ if db_data.get('documents') and db_data.get('metadatas'):
230
+ for doc, meta in zip(db_data['documents'], db_data['metadatas']):
231
+ doc_id = f"{meta.get('title', 'No Title')}|{meta.get('link', '')}|{meta.get('published', 'Unknown Date')}"
232
+ if doc_id not in seen_ids:
233
+ seen_ids.add(doc_id)
234
+ all_docs['documents'].append(doc)
235
+ all_docs['metadatas'].append(meta)
236
+
237
  if not all_docs.get('metadatas'):
238
  return jsonify({"articles": [], "last_update": last_update_time})
239