broadfield-dev committed (verified)
Commit 1176acb · 1 Parent(s): 1e7d9be

Update app.py

Files changed (1): app.py (+66 -94)
app.py CHANGED
@@ -2,14 +2,14 @@ import os
 import threading
 from flask import Flask, render_template, request, jsonify
 from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db, download_from_hf_hub, upload_to_hf_hub, clean_text
-from langchain.vectorstores import Chroma
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.docstore.document import Document
 import logging
 import time
 from datetime import datetime
 import hashlib
 import glob
+from langchain.vectorstores import Chroma
+from langchain.embeddings import HuggingFaceEmbeddings
+
 app = Flask(__name__)
 
 # Setup logging
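
Note: the langchain.vectorstores and langchain.embeddings import paths kept by this hunk are deprecated aliases in recent LangChain releases. If the Space is later bumped to a newer LangChain, the equivalents live in langchain_community; a minimal sketch, assuming the langchain-community package is installed (the commit itself keeps the legacy paths):

    # Sketch only: newer-LangChain equivalents of the imports above.
    # Assumes langchain-community is installed; not part of this commit.
    from langchain_community.vectorstores import Chroma
    from langchain_community.embeddings import HuggingFaceEmbeddings
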
@@ -17,11 +17,9 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 # Global flag to track background loading
-loading_complete = False
+loading_complete = True  # Start as True to allow initial rendering
 last_update_time = time.time()
 
-
-
 def load_feeds_in_background():
     global loading_complete, last_update_time
     try:
@@ -32,43 +30,25 @@ def load_feeds_in_background():
         last_update_time = time.time()
         logger.info("Background feed processing complete")
         upload_to_hf_hub()
-        loading_complete = True
     except Exception as e:
         logger.error(f"Error in background feed loading: {e}")
+    finally:
         loading_complete = True
 
-@app.route('/')
-def index():
-    global loading_complete, last_update_time
-
-    # Check if any DB exists and initialize if none found
-    db_exists = any(os.path.exists(db_path) for db_path in glob.glob("chroma_db*"))
-    if not db_exists:
-        loading_complete = False
-        logger.info("Downloading Chroma DB from Hugging Face Hub...")
-        download_from_hf_hub()
-        threading.Thread(target=load_feeds_in_background, daemon=True).start()
-    elif not loading_complete:
-        pass
-    else:
-        loading_complete = True
+def get_all_docs_from_dbs():
+    """Aggregate documents and metadata from all Chroma DB folders."""
+    all_docs = {'documents': [], 'metadatas': []}
+    seen_ids = set()
 
-    try:
-        # Aggregate documents and metadata from all Chroma DBs
-        all_docs = {'documents': [], 'metadatas': []}
-        seen_ids = set()  # For deduplication across all databases
-
-        # Iterate over all folders matching "chroma_db*"
-        for db_path in glob.glob("chroma_db*"):
-            if not os.path.isdir(db_path):
-                continue
-            # Initialize a Chroma instance for each database folder
+    for db_path in glob.glob("chroma_db*"):
+        if not os.path.isdir(db_path):
+            continue
+        try:
             temp_vector_db = Chroma(
                 persist_directory=db_path,
                 embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
                 collection_name="news_articles"
             )
-            # Retrieve documents and metadata
             db_data = temp_vector_db.get(include=['documents', 'metadatas'])
             if db_data.get('documents') and db_data.get('metadatas'):
                 for doc, meta in zip(db_data['documents'], db_data['metadatas']):
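
Note: the first change in this hunk moves the loading_complete reset out of the try body and into a finally: block, so the flag is restored even when feed processing or the Hub upload raises. A standalone sketch of the pattern with the feed work stubbed out (the real function also updates last_update_time and calls upload_to_hf_hub):

    # Minimal illustration of the try/except/finally pattern adopted here.
    loading_complete = False

    def load_feeds_in_background():
        global loading_complete
        try:
            raise RuntimeError("simulated feed failure")  # stand-in for fetch/process/upload
        except Exception as e:
            print(f"Error in background feed loading: {e}")
        finally:
            loading_complete = True  # reset on success and on failure

    load_feeds_in_background()
    assert loading_complete is True
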
@@ -77,14 +57,35 @@ def index():
                     seen_ids.add(doc_id)
                     all_docs['documents'].append(doc)
                     all_docs['metadatas'].append(meta)
+        except Exception as e:
+            logger.error(f"Error loading DB {db_path}: {e}")
+
+    return all_docs
+
+@app.route('/')
+def index():
+    global loading_complete, last_update_time
+
+    # Check if any DB exists; if not, download from Hugging Face
+    db_exists = any(os.path.exists(db_path) for db_path in glob.glob("chroma_db*"))
+    if not db_exists:
+        logger.info("No Chroma DB found, downloading from Hugging Face Hub...")
+        download_from_hf_hub()
+
+    # Start background RSS feed update
+    loading_complete = False
+    threading.Thread(target=load_feeds_in_background, daemon=True).start()
 
+    # Load existing data immediately
+    try:
+        all_docs = get_all_docs_from_dbs()
         total_docs = len(all_docs['documents'])
-        logger.info(f"Total articles across all DBs: {total_docs}")
+        logger.info(f"Total articles across all DBs at startup: {total_docs}")
         if not all_docs.get('metadatas'):
             logger.info("No articles in any DB yet")
-            return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
+            return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
 
-        # Process and categorize articles with strict deduplication
+        # Process and categorize articles with deduplication
         enriched_articles = []
         seen_keys = set()
         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
@@ -95,12 +96,10 @@ def index():
             description = meta.get("original_description", "No Description")
             published = meta.get("published", "Unknown Date").strip()
 
-            # Clean and normalize all fields
             title = clean_text(title)
             link = clean_text(link)
             description = clean_text(description)
 
-            # Use a robust key with cleaned fields and description hash for deduplication
             description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
             key = f"{title}|{link}|{published}|{description_hash}"
             if key not in seen_keys:
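
Note: the deduplication key built here (and reused in /search and the update/category routes) combines the cleaned title, link, and date with a SHA-256 hash of the cleaned description, so articles that share a headline and link but differ in body text are still kept apart. A self-contained sketch of that key; the dedup_key helper name is illustrative, the field layout mirrors the diff:

    import hashlib

    def dedup_key(title: str, link: str, published: str, description: str) -> str:
        # Same layout as the diff: cleaned fields plus a hash of the description.
        description_hash = hashlib.sha256(description.encode("utf-8")).hexdigest()
        return f"{title}|{link}|{published}|{description_hash}"

    assert dedup_key("A", "http://x", "2024", "body one") != dedup_key("A", "http://x", "2024", "body two")
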
@@ -117,13 +116,9 @@ def index():
                    "published": published,
                    "image": meta.get("image", "svg"),
                })
-            else:
-                logger.debug(f"Duplicate found in retrieval: {key}")
 
-        # Sort by published date (stable sort)
         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
 
-        # Group by category and limit to 10 most recent per category
         categorized_articles = {}
         for article in enriched_articles:
             cat = article["category"]
@@ -131,25 +126,21 @@ def index():
                categorized_articles[cat] = []
            categorized_articles[cat].append(article)
 
-        # Sort categories alphabetically
         categorized_articles = dict(sorted(categorized_articles.items(), key=lambda x: x[0].lower()))
 
-        # Limit to 10 most recent per category and log top 2 for debugging
         for cat in categorized_articles:
             categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True)[:10]
             if len(categorized_articles[cat]) >= 2:
                 logger.debug(f"Category {cat} top 2: {categorized_articles[cat][0]['title']} | {categorized_articles[cat][1]['title']}")
 
-        logger.info(f"Displaying articles: {sum(len(articles) for articles in categorized_articles.values())} total")
+        logger.info(f"Displaying articles at startup: {sum(len(articles) for articles in categorized_articles.values())} total")
         return render_template("index.html",
                                categorized_articles=categorized_articles,
                                has_articles=True,
-                               loading=not loading_complete)
+                               loading=True)  # Show spinner while background task runs
     except Exception as e:
-        logger.error(f"Error retrieving articles: {e}")
-        return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
-
-
+        logger.error(f"Error retrieving articles at startup: {e}")
+        return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
 
 @app.route('/search', methods=['POST'])
 def search():
@@ -160,35 +151,39 @@ def search():
 
     try:
         logger.info(f"Searching for: {query}")
-        results = vector_db.similarity_search(query, k=10)
-        logger.info(f"Search returned {len(results)} results")
-
+        all_docs = get_all_docs_from_dbs()
+        if not all_docs.get('metadatas'):
+            return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})
+
+        # Simple keyword search for now (can be improved with similarity_search later)
         enriched_articles = []
         seen_keys = set()
-        for doc in results:
-            meta = doc.metadata
+        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
+            if not meta:
+                continue
             title = meta.get("title", "No Title")
             link = meta.get("link", "")
             description = meta.get("original_description", "No Description")
             published = meta.get("published", "Unknown Date").strip()
 
-            # Clean and normalize all fields
             title = clean_text(title)
             link = clean_text(link)
             description = clean_text(description)
 
-            description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
-            key = f"{title}|{link}|{published}|{description_hash}"
-            if key not in seen_keys:
-                seen_keys.add(key)
-                enriched_articles.append({
-                    "title": title,
-                    "link": link,
-                    "description": description,
-                    "category": meta.get("category", "Uncategorized"),
-                    "published": published,
-                    "image": meta.get("image", "svg"),
-                })
+            # Basic keyword match
+            if query.lower() in title or query.lower() in description:
+                description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
+                key = f"{title}|{link}|{published}|{description_hash}"
+                if key not in seen_keys:
+                    seen_keys.add(key)
+                    enriched_articles.append({
+                        "title": title,
+                        "link": link,
+                        "description": description,
+                        "category": meta.get("category", "Uncategorized"),
+                        "published": published,
+                        "image": meta.get("image", "svg"),
+                    })
 
         categorized_articles = {}
         for article in enriched_articles:
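
Note: in the new keyword filter the query is lower-cased but the cleaned title and description are not, so matching is effectively case-sensitive on the article side. A hedged sketch of a fully case-insensitive variant (not what this commit does); matches_query is a hypothetical helper name:

    def matches_query(query: str, title: str, description: str) -> bool:
        # Lower both sides; the committed code lowers only the query.
        q = query.lower()
        return q in title.lower() or q in description.lower()

    assert matches_query("Climate", "UN climate report", "...") is True
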
@@ -216,27 +211,7 @@ def check_loading():
 def get_updates():
     global last_update_time
     try:
-        # Aggregate documents and metadata from all Chroma DBs
-        all_docs = {'documents': [], 'metadatas': []}
-        seen_ids = set()
-
-        for db_path in glob.glob("chroma_db*"):
-            if not os.path.isdir(db_path):
-                continue
-            temp_vector_db = Chroma(
-                persist_directory=db_path,
-                embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
-                collection_name="news_articles"
-            )
-            db_data = temp_vector_db.get(include=['documents', 'metadatas'])
-            if db_data.get('documents') and db_data.get('metadatas'):
-                for doc, meta in zip(db_data['documents'], db_data['metadatas']):
-                    doc_id = f"{meta.get('title', 'No Title')}|{meta.get('link', '')}|{meta.get('published', 'Unknown Date')}"
-                    if doc_id not in seen_ids:
-                        seen_ids.add(doc_id)
-                        all_docs['documents'].append(doc)
-                        all_docs['metadatas'].append(meta)
-
+        all_docs = get_all_docs_from_dbs()
         if not all_docs.get('metadatas'):
             return jsonify({"articles": [], "last_update": last_update_time})
 
@@ -250,7 +225,6 @@ def get_updates():
             description = meta.get("original_description", "No Description")
             published = meta.get("published", "Unknown Date").strip()
 
-            # Clean and normalize all fields
             title = clean_text(title)
             link = clean_text(link)
             description = clean_text(description)
@@ -282,7 +256,6 @@ def get_updates():
             if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
                 categorized_articles[cat].append(article)
 
-        # Limit to 10 most recent per category with final deduplication
         for cat in categorized_articles:
             unique_articles = []
             seen_cat_keys = set()
@@ -301,7 +274,7 @@ def get_updates():
 @app.route('/get_all_articles/<category>')
 def get_all_articles(category):
     try:
-        all_docs = vector_db.get(include=['documents', 'metadatas'])
+        all_docs = get_all_docs_from_dbs()
         if not all_docs.get('metadatas'):
             return jsonify({"articles": [], "category": category})
 
@@ -315,7 +288,6 @@ def get_all_articles(category):
             description = meta.get("original_description", "No Description")
             published = meta.get("published", "Unknown Date").strip()
 
-            # Clean and normalize all fields
             title = clean_text(title)
             link = clean_text(link)
             description = clean_text(description)
@@ -342,10 +314,10 @@ def get_all_articles(category):
     except Exception as e:
         logger.error(f"Error fetching all articles for category {category}: {e}")
         return jsonify({"articles": [], "category": category}), 500
+
 @app.route('/card')
 def card_load():
     return render_template("card.html")
 
-
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860)
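
For a quick local check of this revision, the routes touched above can be exercised over HTTP once the app is running on port 7860. A sketch assuming the requests package and a locally reachable instance; the category name is illustrative, and since the /search form field name is not shown in this diff, only GET routes are hit here:

    import requests

    BASE = "http://localhost:7860"  # assumed local address; app.run binds 0.0.0.0:7860

    # Index renders whatever is already in the chroma_db* folders and starts the
    # background feed refresh in a daemon thread.
    print(requests.get(f"{BASE}/").status_code)

    # get_all_articles returns JSON; "Technology" is an illustrative category name.
    print(requests.get(f"{BASE}/get_all_articles/Technology").json().get("category"))

    # Card partial used by the front end.
    print(requests.get(f"{BASE}/card").status_code)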
 