fikird committed
Commit ae8bccc · 1 Parent(s): 25a3d88

Improve content processing and result formatting

Files changed (2):
  1. app.py +46 -21
  2. search_engine.py +117 -22
app.py CHANGED
@@ -32,50 +32,72 @@ def format_results(results):
     if not results or not results.get('results'):
         return "# ⚠️ No Results\nNo search results were found. Please try a different query."
 
-    formatted = f"# 🔍 Search Results\n\n"
-
-    # Add insights section
-    if 'insights' in results:
-        formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"
-
-    # Add follow-up questions
-    if 'follow_up_questions' in results:
-        formatted += "## ❓ Follow-up Questions\n"
-        for q in results['follow_up_questions']:
-            if q and q.strip():
-                formatted += f"- {q.strip()}\n"
-        formatted += "\n"
+    insights = results.get('insights', {})
+    output = []
+
+    # Main Summary
+    output.append("📝 Executive Summary")
+    output.append("-" * 50)
+    output.append(insights.get('main_summary', ''))
+    output.append("\n")
+
+    # Key Findings
+    output.append("🔑 Key Findings")
+    output.append("-" * 50)
+    for i, point in enumerate(insights.get('key_findings', []), 1):
+        output.append(f"{i}. {point}")
+    output.append("\n")
+
+    # Sources
+    output.append("📚 Sources")
+    output.append("-" * 50)
+    for source in insights.get('sources', []):
+        output.append(f"• {source.get('title', '')}")
+        output.append(f" {source.get('url', '')}")
+    output.append("\n")
+
+    # Follow-up Questions
+    output.append("❓ Suggested Questions")
+    output.append("-" * 50)
+    for question in results.get('follow_up_questions', []):
+        output.append(f"• {question}")
 
     # Add main results
     if 'results' in results:
-        formatted += "## 📄 Detailed Results\n\n"
+        output.append("\n")
+        output.append("📄 Detailed Results")
+        output.append("-" * 50)
+        output.append("\n")
         for i, result in enumerate(results['results'], 1):
             if not isinstance(result, dict):
                 continue
 
-            formatted += f"### {i}. "
+            output.append(f"### {i}. ")
             if 'url' in result:
                 title = result.get('title', 'Untitled')
-                formatted += f"[{title}]({result['url']})\n"
+                output.append(f"[{title}]({result['url']})\n")
             if 'summary' in result:
-                formatted += f"\n{result['summary']}\n\n"
+                output.append(f"\n{result['summary']}\n\n")
 
     # Add similar chunks if available
     if 'similar_chunks' in results:
-        formatted += "## 🔍 Related Content\n\n"
+        output.append("\n")
+        output.append("🔍 Related Content")
+        output.append("-" * 50)
+        output.append("\n")
         for i, chunk in enumerate(results['similar_chunks'], 1):
             if not isinstance(chunk, dict):
                 continue
 
-            formatted += f"### Related {i}\n"
+            output.append(f"### Related {i}\n")
             if 'metadata' in chunk:
                 meta = chunk['metadata']
                 if 'title' in meta and 'url' in meta:
-                    formatted += f"From [{meta['title']}]({meta['url']})\n"
+                    output.append(f"From [{meta['title']}]({meta['url']})\n")
             if 'content' in chunk:
-                formatted += f"\n{chunk['content'][:200]}...\n\n"
+                output.append(f"\n{chunk['content'][:200]}...\n\n")
 
-    return formatted
+    return "\n".join(output)
 
 def create_demo():
     """Create the Gradio interface"""
@@ -100,7 +122,10 @@ def create_demo():
     )
     search_button = gr.Button("🔍 Search")
 
-    output = gr.Markdown()
+    output = gr.Textbox(
+        label="Search Results",
+        lines=20
+    )
 
     search_button.click(
         fn=safe_search,
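A quick way to sanity-check the reworked formatter is to call it on a hand-built results dict. The sketch below is illustrative only: the data is invented, and it assumes app.py (with its gradio dependency) is importable in the Space's environment.

from app import format_results  # assumes the app's dependencies (gradio, etc.) are installed

# Hypothetical sample mirroring the structure search() now returns
sample = {
    'insights': {
        'main_summary': 'Transformers replace recurrence with self-attention.',
        'key_findings': ['Self-attention compares every token with every other token'],
        'sources': [{'title': 'Intro to Transformers', 'url': 'https://example.com/transformers'}],
    },
    'results': [{
        'url': 'https://example.com/transformers',
        'title': 'Intro to Transformers',
        'summary': 'An overview of the transformer architecture.',
    }],
    'follow_up_questions': ['How do transformers handle long documents?'],
}

print(format_results(sample))
# Prints the plain-text report: "📝 Executive Summary" with a dashed rule and the summary,
# then "🔑 Key Findings", "📚 Sources", "❓ Suggested Questions", and "📄 Detailed Results",
# which now renders cleanly in the gr.Textbox output.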
search_engine.py CHANGED
@@ -44,25 +44,115 @@ class ContentProcessor:
 
     def __init__(self):
         self.model_manager = ModelManager()
 
-    def process_content(self, content: str) -> Dict:
+    def clean_text(self, text: str) -> str:
+        """Clean and normalize text content"""
+        # Remove extra whitespace
+        text = ' '.join(text.split())
+        # Remove common navigation elements
+        nav_elements = [
+            "skip to content",
+            "skip to navigation",
+            "search",
+            "menu",
+            "subscribe",
+            "sign in",
+            "log in",
+            "submit",
+            "browse",
+            "explore",
+        ]
+        for element in nav_elements:
+            text = text.replace(element.lower(), "")
+        return text.strip()
+
+    def extract_main_content(self, soup: BeautifulSoup) -> str:
+        """Extract main content from HTML soup"""
+        # Remove navigation, headers, footers, and sidebars
+        for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
+            elem.decompose()
+
+        # Remove script and style elements
+        for elem in soup.find_all(['script', 'style']):
+            elem.decompose()
+
+        # Try to find main content area
+        main_content = None
+        content_tags = ['article', 'main', '[role="main"]', '#content', '.content', '.post-content']
+
+        for tag in content_tags:
+            main_content = soup.select_one(tag)
+            if main_content:
+                break
+
+        # If no main content found, use body
+        if not main_content:
+            main_content = soup.find('body')
+
+        if main_content:
+            text = main_content.get_text(separator=' ', strip=True)
+        else:
+            text = soup.get_text(separator=' ', strip=True)
+
+        return self.clean_text(text)
+
+    def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
+        """Extract key points from text using AI"""
+        try:
+            # Split text into chunks for processing
+            chunks = [text[i:i + 1024] for i in range(0, len(text), 1024)]
+            all_points = []
+
+            for chunk in chunks[:3]:  # Process first 3 chunks to keep it manageable
+                summary = self.model_manager.models['summarizer'](
+                    chunk,
+                    max_length=100,
+                    min_length=30,
+                    do_sample=False
+                )[0]['summary_text']
+
+                # Split summary into sentences
+                points = [s.strip() for s in summary.split('.') if s.strip()]
+                all_points.extend(points)
+
+            # Return top points
+            return all_points[:max_points]
+
+        except Exception as e:
+            logger.error(f"Error extracting key points: {str(e)}")
+            return []
+
+    def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
         """Process content and generate insights"""
         try:
-            # Generate summary
+            # Extract main content if soup is provided
+            if soup:
+                content = self.extract_main_content(soup)
+            else:
+                content = self.clean_text(content)
+
+            # Extract key points
+            key_points = self.extract_key_points(content)
+
+            # Generate overall summary
             summary = self.model_manager.models['summarizer'](
                 content[:1024],
-                max_length=100,
-                min_length=30,
+                max_length=150,
+                min_length=50,
                 do_sample=False
             )[0]['summary_text']
 
             return {
                 'summary': summary,
+                'key_points': key_points,
                 'content': content
             }
+
         except Exception as e:
+            logger.error(f"Error processing content: {str(e)}")
             return {
                 'summary': f"Error processing content: {str(e)}",
+                'key_points': [],
                 'content': content
             }
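The new extraction path prefers a semantic main-content node over the raw page text. Below is a minimal standalone sketch of that selector-priority idea on an inline HTML snippet; it mirrors extract_main_content and clean_text with plain BeautifulSoup rather than importing ContentProcessor, whose __init__ loads the summarization model.

from bs4 import BeautifulSoup

html = """
<html><body>
  <nav>menu · subscribe · sign in</nav>
  <article><h1>Solar sails</h1><p>Solar sails use radiation pressure for propulsion.</p></article>
  <footer>skip to content</footer>
</body></html>
"""

soup = BeautifulSoup(html, 'html.parser')  # the app itself uses the 'lxml' parser

# Drop boilerplate containers, as extract_main_content does
for elem in soup.find_all(['nav', 'header', 'footer', 'aside', 'script', 'style']):
    elem.decompose()

# Same selector priority as content_tags in extract_main_content
for selector in ['article', 'main', '[role="main"]', '#content', '.content', '.post-content']:
    node = soup.select_one(selector)
    if node:
        break
else:
    node = soup.find('body')

# Collapse whitespace, as clean_text does
text = ' '.join(node.get_text(separator=' ', strip=True).split())
print(text)  # -> "Solar sails Solar sails use radiation pressure for propulsion."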
 
@@ -139,25 +229,18 @@ class WebSearchEngine:
             response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')
 
-            # Extract text content
-            for script in soup(["script", "style"]):
-                script.decompose()
-            text = soup.get_text()
-            lines = (line.strip() for line in text.splitlines())
-            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-            content = ' '.join(chunk for chunk in chunks if chunk)
+            # Process content with BeautifulSoup object
+            processed = self.processor.process_content("", soup)
 
             # Get metadata
             metadata = self.get_metadata(soup)
 
-            # Process content
-            processed = self.processor.process_content(content)
-
             return {
                 'url': url,
                 'title': metadata['title'],
                 'description': metadata['description'],
                 'summary': processed['summary'],
+                'key_points': processed['key_points'],
                 'content': processed['content']
             }
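With this change each processed page carries its key points alongside the summary. The literal below is a hypothetical example of the dict process_url now returns (all values invented); both search() and format_results consume this shape.

# Hypothetical example of a single processed result (values are illustrative)
processed_page = {
    'url': 'https://example.com/quantum-computing',
    'title': 'Quantum computing basics',
    'description': 'An introductory overview.',
    'summary': 'Qubits exploit superposition and entanglement to speed up certain algorithms.',
    'key_points': [
        'Qubits exploit superposition and entanglement',
        'Error correction remains the main engineering hurdle',
    ],
    'content': 'Full cleaned page text as returned by extract_main_content',
}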
 
@@ -229,26 +312,38 @@ class WebSearchEngine:
                 return {'error': 'No results found'}
 
             results = []
+            all_key_points = []
+
             for result in search_results:
                 if 'link' in result:
                     processed = self.process_url(result['link'])
                     if 'error' not in processed:
                         results.append(processed)
+                        if 'key_points' in processed:
+                            all_key_points.extend(processed['key_points'])
                     time.sleep(random.uniform(0.5, 1.0))
 
             if not results:
                 return {'error': 'Failed to process any search results'}
 
-            # Generate insights from results
-            all_content = " ".join([r['summary'] for r in results if 'summary' in r])
+            # Combine all summaries and key points
+            all_summaries = [r['summary'] for r in results if 'summary' in r]
+            combined_summary = " ".join(all_summaries)
+
+            # Format insights
+            insights = {
+                'main_summary': combined_summary[:500],
+                'key_findings': list(set(all_key_points))[:7],  # Remove duplicates and limit to top 7
+                'sources': [{'title': r['title'], 'url': r['url']} for r in results]
+            }
 
             return {
                 'results': results,
-                'insights': all_content[:1000] if all_content else "No insights available.",
+                'insights': insights,
                 'follow_up_questions': [
-                    f"What are the key differences between {query} and related topics?",
-                    f"Can you explain {query} in simple terms?",
-                    f"What are the latest developments in {query}?"
+                    f"What are the practical applications of {query}?",
+                    f"How has {query} evolved over the past year?",
+                    f"What challenges remain in {query}?"
                 ]
             }
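The aggregation step in search() is plain Python and can be checked in isolation. Here is a small sketch of the same dedupe-and-cap logic on invented data; note that set() does not preserve order, so which seven findings survive the cap is arbitrary when there are more.

# Invented per-page results standing in for the output of process_url
pages = [
    {'title': 'Page A', 'url': 'https://example.com/a', 'summary': 'CRISPR edits genes.',
     'key_points': ['CRISPR edits genes', 'Off-target effects are a concern']},
    {'title': 'Page B', 'url': 'https://example.com/b', 'summary': 'Gene editing is used in crops.',
     'key_points': ['CRISPR edits genes', 'Regulation differs by country']},
]

all_key_points = [point for page in pages for point in page['key_points']]
combined_summary = " ".join(page['summary'] for page in pages)

insights = {
    'main_summary': combined_summary[:500],
    'key_findings': list(set(all_key_points))[:7],  # dedupes, but ordering is not preserved
    'sources': [{'title': page['title'], 'url': page['url']} for page in pages],
}

print(insights['key_findings'])
# Three unique findings; 'CRISPR edits genes' appears once even though both pages reported it.

End-to-end, the dict that search() builds here (results, insights, follow_up_questions) is exactly what format_results in app.py renders into the new plain-text report.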