fikird committed
Commit 68c6844 · 1 Parent(s): 3f90511

Improve content processing and result formatting

Files changed (2):
  1. app.py +40 -40
  2. search_engine.py +91 -84
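At a high level, the commit renames app.py's `format_results` to `format_search_results` and rewrites search_engine.py's ContentProcessor to work on plain extracted text instead of a BeautifulSoup tree. Both files agree on one result shape; the sketch below shows it with invented sample values (only the key names come from the diff):

```python
# Shape of the dict returned by search_engine.search(query) and consumed by
# app.py's format_search_results(). Keys are taken from the diff below;
# the values are placeholders for illustration only.
results = {
    'results': [{                       # one entry per successfully processed URL
        'url': 'https://example.com/article',
        'title': 'Example Article',
        'snippet': 'Search-result snippet kept for context',
        'summary': 'Model-generated summary of the page',
        'key_points': ['First takeaway', 'Second takeaway'],
    }],
    'insights': 'Summary-of-summaries across all processed pages',
    'key_points': ['Deduplicated points pooled from every result'],
    'follow_up_questions': ['What are the recent breakthroughs in ...?'],
}
```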
app.py CHANGED
@@ -1,8 +1,8 @@
 import gradio as gr
 from search_engine import search
 
-def format_results(results):
-    """Format search results in a user-friendly way"""
+def format_search_results(results):
+    """Format search results into a clean markdown output"""
     if 'error' in results:
         return f"❌ Error: {results['error']}"
 
@@ -10,77 +10,77 @@ def format_results(results):
 
     # Add insights section
     if 'insights' in results and results['insights']:
-        output.append("# 💡 Key Insights\n")
+        output.append("# 🔍 Latest Developments Summary\n")
         output.append(results['insights'])
         output.append("\n")
 
     # Add key points section
     if 'key_points' in results and results['key_points']:
-        output.append("# 🎯 Key Points\n")
-        for i, point in enumerate(results['key_points'], 1):
-            output.append(f"{i}. {point}\n")
+        output.append("# 💡 Key Points\n")
+        for point in results['key_points'][:5]:  # Limit to top 5 points
+            output.append(f"• {point}\n")
         output.append("\n")
 
     # Add detailed results section
     if 'results' in results and results['results']:
-        output.append("# 📄 Detailed Results\n")
+        output.append("# 📄 Detailed Findings\n")
         for i, result in enumerate(results['results'], 1):
-            output.append(f"## {i}. [{result['title']}]({result['url']})\n")
-            if 'description' in result and result['description']:
-                output.append(f"*{result['description']}*\n")
-            if 'summary' in result and result['summary']:
-                output.append(f"{result['summary']}\n")
+            output.append(f"## {i}. {result.get('title', 'No Title')}\n")
+            if 'url' in result:
+                output.append(f"🔗 [Source]({result['url']})\n")
+            if 'summary' in result:
+                output.append(f"\n{result['summary']}\n")
             if 'key_points' in result and result['key_points']:
-                output.append("\nHighlights:\n")
-                for point in result['key_points']:
-                    output.append(f"- {point}\n")
+                output.append("\nKey Takeaways:")
+                for point in result['key_points'][:3]:  # Limit to top 3 points per result
+                    output.append(f"• {point}")
             output.append("\n")
 
     # Add follow-up questions section
     if 'follow_up_questions' in results and results['follow_up_questions']:
-        output.append("# ❓ Related Questions\n")
+        output.append("# ❓ Suggested Follow-up Questions\n")
         for question in results['follow_up_questions']:
-            output.append(f"- {question}\n")
+            output.append(f"• {question}\n")
 
     return "\n".join(output)
 
 def search_and_format(query):
     """Search and format results"""
+    if not query.strip():
+        return "Please enter a search query"
+
     try:
         results = search(query)
-        return format_results(results)
+        return format_search_results(results)
     except Exception as e:
-        return f"❌ Error: {str(e)}"
+        return f"❌ Error performing search: {str(e)}"
 
-# Create the Gradio interface
-interface = gr.Interface(
+# Create Gradio interface
+iface = gr.Interface(
     fn=search_and_format,
     inputs=gr.Textbox(
         label="Enter your search query",
-        placeholder="What would you like to learn about?",
-        lines=2
-    ),
-    outputs=gr.Markdown(
-        label="Search Results",
-        show_label=True
+        placeholder="Example: Latest developments in quantum computing"
     ),
-    title="🔍 AI-Powered Web Search",
+    outputs=gr.Markdown(label="Search Results"),
+    title="AI-Powered Research Assistant",
     description="""
-    This search engine uses AI to:
-    - Find relevant web pages
-    - Extract key information
-    - Generate insights and summaries
-    - Suggest follow-up questions
+    This tool helps you research topics by:
+    1. Finding relevant information from multiple sources
+    2. Summarizing key findings
+    3. Extracting important points
+    4. Suggesting follow-up questions
+
+    Try searching for topics in technology, science, or any other field!
     """,
     examples=[
-        ["What is quantum computing?"],
-        ["Latest developments in artificial intelligence"],
-        ["How does blockchain technology work?"],
-        ["Explain machine learning in simple terms"],
+        ["Latest developments in quantum computing"],
+        ["Artificial intelligence breakthroughs"],
+        ["Climate change solutions"],
+        ["Space exploration advancements"],
     ],
     theme=gr.themes.Soft()
 )
 
-# Launch the app
-if __name__ == "__main__":
-    interface.launch()
+# Launch for Spaces
+iface.launch()
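To sanity-check the new formatter by itself, a minimal sketch follows; the sample dict is invented, and the expected output mirrors the section headers built in the diff above. Note that with the `if __name__ == "__main__":` guard removed, importing app.py now launches the interface immediately, so this is best pasted into a REPL alongside the function rather than imported.

```python
# Hypothetical input; only the key names match what app.py expects.
sample = {
    'insights': 'Quantum hardware is scaling steadily.',
    'key_points': ['Error correction is improving'],
    'results': [{
        'title': 'Quantum Computing in 2024',
        'url': 'https://example.com/qc',
        'summary': 'An overview of recent progress.',
        'key_points': ['Logical qubits demonstrated'],
    }],
    'follow_up_questions': ['What changed since 2023?'],
}

print(format_search_results(sample))
# Prints, roughly:
#   # 🔍 Latest Developments Summary
#   Quantum hardware is scaling steadily.
#   # 💡 Key Points
#   • Error correction is improving
#   # 📄 Detailed Findings
#   ## 1. Quantum Computing in 2024
#   🔗 [Source](https://example.com/qc)
#   ...
```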
 
search_engine.py CHANGED
@@ -50,104 +50,95 @@ class ContentProcessor:
         # Remove extra whitespace
         text = ' '.join(text.split())
         # Remove common navigation elements
-        nav_patterns = [
+        nav_elements = [
             "skip to content",
-            "skip to navigation",
             "search",
             "menu",
+            "navigation",
             "subscribe",
             "sign in",
             "log in",
-            "browse",
             "submit",
+            "browse",
+            "explore",
         ]
-        for pattern in nav_patterns:
-            text = text.replace(pattern.lower(), "")
+        for element in nav_elements:
+            text = text.replace(element.lower(), "")
         return text.strip()
 
-    def extract_main_content(self, soup: BeautifulSoup) -> str:
-        """Extract main content from HTML"""
-        # Remove navigation, headers, footers
-        for elem in soup.find_all(['nav', 'header', 'footer', 'aside', 'script', 'style']):
-            elem.decompose()
-
-        # Try to find main content container
-        main_content = None
-        for tag in ['main', 'article', 'div[role="main"]', '.main-content', '#main-content']:
-            main_content = soup.select_one(tag)
-            if main_content:
-                break
-
-        if not main_content:
-            # Fallback to body content
-            main_content = soup.find('body')
-
-        if main_content:
-            text = main_content.get_text(separator=' ', strip=True)
-        else:
-            # Last resort: get all text
-            text = soup.get_text(separator=' ', strip=True)
-
-        return self.clean_text(text)
-
-    def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
-        """Extract key points from text using AI"""
-        try:
-            # Split text into smaller chunks
-            chunk_size = 1024
-            chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
-
-            all_points = []
-            for chunk in chunks[:3]:  # Process first 3 chunks to keep it manageable
-                summary = self.model_manager.models['summarizer'](
-                    chunk,
-                    max_length=100,
-                    min_length=30,
-                    do_sample=False
-                )[0]['summary_text']
-
-                # Split summary into sentences
-                points = [s.strip() for s in summary.split('.') if s.strip()]
-                all_points.extend(points)
-
-            # Return top points
-            return all_points[:max_points]
-
-        except Exception as e:
-            logger.error(f"Error extracting key points: {str(e)}")
-            return []
-
-    def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
-        """Process content and generate insights"""
+    def extract_main_content(self, content: str) -> str:
+        """Extract main content from webpage text"""
+        # Split into paragraphs
+        paragraphs = [p.strip() for p in content.split('\n') if p.strip()]
+
+        # Filter out short lines and navigation elements
+        meaningful_paragraphs = []
+        for p in paragraphs:
+            # Skip if too short
+            if len(p.split()) < 5:
+                continue
+            # Skip if looks like navigation
+            if any(nav in p.lower() for nav in ["→", "↓", "menu", "search", "click"]):
+                continue
+            meaningful_paragraphs.append(p)
+
+        # Join remaining paragraphs
+        return ' '.join(meaningful_paragraphs)
+
+    def generate_insights(self, content: str) -> Dict[str, str]:
+        """Generate insights from content using AI"""
         try:
-            # Extract main content if HTML is available
-            if soup:
-                content = self.extract_main_content(soup)
-            else:
-                content = self.clean_text(content)
-
-            # Extract key points
-            key_points = self.extract_key_points(content)
-
-            # Generate overall summary
+            # Clean the content first
+            cleaned_content = self.clean_text(content)
+            main_content = self.extract_main_content(cleaned_content)
+
+            if not main_content:
+                return {
+                    'summary': "Could not extract meaningful content",
+                    'key_points': [],
+                    'content': content
+                }
+
+            # Generate summary
             summary = self.model_manager.models['summarizer'](
-                content[:1024],
+                main_content[:1024],
                 max_length=150,
                 min_length=50,
                 do_sample=False
             )[0]['summary_text']
 
+            # Extract key points using the same model
+            key_points_text = self.model_manager.models['summarizer'](
+                main_content[:1024],
+                max_length=200,
+                min_length=100,
+                num_beams=4,
+                do_sample=True
+            )[0]['summary_text']
+
+            # Split into bullet points
+            key_points = [
+                point.strip()
+                for point in key_points_text.split('.')
+                if point.strip() and len(point.split()) > 3
+            ]
+
             return {
                 'summary': summary,
                 'key_points': key_points,
-                'content': content
+                'content': main_content
             }
+
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
                 'key_points': [],
                 'content': content
             }
+
+    def process_content(self, content: str) -> Dict:
+        """Process content and generate insights"""
+        return self.generate_insights(content)
 
 class WebSearchEngine:
     """Main search engine class"""
@@ -222,12 +213,20 @@ class WebSearchEngine:
         response = self.safe_get(url)
         soup = BeautifulSoup(response.text, 'lxml')
 
-        # Process content with HTML context
-        processed = self.processor.process_content("", soup)
+        # Extract text content
+        for script in soup(["script", "style"]):
+            script.decompose()
+        text = soup.get_text()
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        content = ' '.join(chunk for chunk in chunks if chunk)
 
         # Get metadata
         metadata = self.get_metadata(soup)
 
+        # Process content
+        processed = self.processor.process_content(content)
+
         return {
             'url': url,
             'title': metadata['title'],
@@ -305,41 +304,49 @@ class WebSearchEngine:
                 return {'error': 'No results found'}
 
             results = []
-            all_key_points = []
+            all_insights = []
 
             for result in search_results:
                 if 'link' in result:
                     processed = self.process_url(result['link'])
                     if 'error' not in processed:
+                        # Add the snippet to help with context
+                        processed['snippet'] = result.get('snippet', '')
                         results.append(processed)
+
+                        # Collect insights
+                        if 'summary' in processed:
+                            all_insights.append(processed['summary'])
                        if 'key_points' in processed:
-                            all_key_points.extend(processed['key_points'])
+                            all_insights.extend(processed.get('key_points', []))
+
                     time.sleep(random.uniform(0.5, 1.0))
 
             if not results:
                 return {'error': 'Failed to process any search results'}
 
-            # Combine all summaries and key points
-            all_summaries = [r['summary'] for r in results if 'summary' in r]
-            combined_summary = " ".join(all_summaries)
-
-            # Generate final insights
+            # Combine and summarize all insights
+            combined_insights = ' '.join(all_insights)
             final_summary = self.processor.model_manager.models['summarizer'](
-                combined_summary[:1024],
+                combined_insights[:1024],
                 max_length=200,
                 min_length=100,
                 do_sample=False
             )[0]['summary_text']
 
+            # Generate specific follow-up questions
+            follow_ups = [
+                f"What are the recent breakthroughs in {query}?",
+                f"How does {query} impact industry and research?",
+                f"What are the challenges and limitations in {query}?",
+                f"What are the future prospects for {query}?"
+            ]
+
             return {
                 'results': results,
                 'insights': final_summary,
-                'key_points': list(set(all_key_points)),  # Remove duplicates
-                'follow_up_questions': [
-                    f"What are the key differences between {query} and related topics?",
-                    f"Can you explain {query} in simple terms?",
-                    f"What are the latest developments in {query}?"
-                ]
+                'key_points': list(set(all_insights)),  # Remove duplicates
+                'follow_up_questions': follow_ups
             }
 
         except Exception as e:
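For reference, the two summarizer passes that `generate_insights` now makes can be tried in isolation. A minimal sketch, assuming `models['summarizer']` is a Hugging Face summarization pipeline; the checkpoint name and sample text here are assumptions, not part of the commit:

```python
from transformers import pipeline

# Stand-in for model_manager.models['summarizer']; the diff never names the
# underlying model, so this checkpoint is an assumption.
summarizer = pipeline('summarization', model='sshleifer/distilbart-cnn-12-6')

# Invented sample standing in for cleaned page content.
main_content = (
    "Quantum processors crossed the thousand-qubit mark this year, while "
    "error-correction experiments showed logical qubits outperforming "
    "physical ones for the first time. Hybrid quantum-classical workflows "
    "are expected to dominate near-term applications."
)

# Pass 1: deterministic summary (parameters match the diff).
summary = summarizer(main_content[:1024], max_length=150, min_length=50,
                     do_sample=False)[0]['summary_text']

# Pass 2: longer, sampled output that is then split into key points.
key_points_text = summarizer(main_content[:1024], max_length=200, min_length=100,
                             num_beams=4, do_sample=True)[0]['summary_text']
key_points = [p.strip() for p in key_points_text.split('.')
              if p.strip() and len(p.split()) > 3]
```

One caveat worth noting: both passes truncate their input to the first 1024 characters, so long pages contribute only their opening text to the summary and key points.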