fikird committed

Commit 3f90511 · Parent(s): 2f58cc7

Enhance content processing and improve result formatting

Files changed (2):
  1. app.py +74 -114
  2. search_engine.py +89 -106
app.py CHANGED
@@ -1,126 +1,86 @@
 import gradio as gr
-from rag_engine import RAGEngine
-import torch
-import os
-import logging
-import traceback
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-
-def safe_search(query, max_results):
-    """Wrapper function to handle errors gracefully"""
-    try:
-        rag = RAGEngine()
-        results = rag.search_and_process(query, max_results)
-
-        if 'error' in results:
-            return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{results['error']}\n```"
-
-        return format_results(results)
-    except Exception as e:
-        error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
-        logger.error(error_msg)
-        return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
+from search_engine import search
 
 def format_results(results):
-    """Format search results for display"""
-    if not results or not results.get('results'):
-        return "# ⚠️ No Results\nNo search results were found. Please try a different query."
-
-    formatted = f"# 🔍 Search Results\n\n"
+    """Format search results in a user-friendly way"""
+    if 'error' in results:
+        return f"❌ Error: {results['error']}"
+
+    output = []
 
     # Add insights section
-    if 'insights' in results:
-        formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"
+    if 'insights' in results and results['insights']:
+        output.append("# 💡 Key Insights\n")
+        output.append(results['insights'])
+        output.append("\n")
 
-    # Add follow-up questions
-    if 'follow_up_questions' in results:
-        formatted += "## ❓ Follow-up Questions\n"
-        for q in results['follow_up_questions']:
-            if q and q.strip():
-                formatted += f"- {q.strip()}\n"
-        formatted += "\n"
+    # Add key points section
+    if 'key_points' in results and results['key_points']:
+        output.append("# 🎯 Key Points\n")
+        for i, point in enumerate(results['key_points'], 1):
+            output.append(f"{i}. {point}\n")
+        output.append("\n")
 
-    # Add main results
-    if 'results' in results:
-        formatted += "## 📄 Detailed Results\n\n"
+    # Add detailed results section
+    if 'results' in results and results['results']:
+        output.append("# 📄 Detailed Results\n")
         for i, result in enumerate(results['results'], 1):
-            if not isinstance(result, dict):
-                continue
-
-            formatted += f"### {i}. "
-            if 'url' in result:
-                title = result.get('title', 'Untitled')
-                formatted += f"[{title}]({result['url']})\n"
-            if 'summary' in result:
-                formatted += f"\n{result['summary']}\n\n"
-
-    # Add similar chunks if available
-    if 'similar_chunks' in results:
-        formatted += "## 🔍 Related Content\n\n"
-        for i, chunk in enumerate(results['similar_chunks'], 1):
-            if not isinstance(chunk, dict):
-                continue
-
-            formatted += f"### Related {i}\n"
-            if 'metadata' in chunk:
-                meta = chunk['metadata']
-                if 'title' in meta and 'url' in meta:
-                    formatted += f"From [{meta['title']}]({meta['url']})\n"
-            if 'content' in chunk:
-                formatted += f"\n{chunk['content'][:200]}...\n\n"
+            output.append(f"## {i}. [{result['title']}]({result['url']})\n")
+            if 'description' in result and result['description']:
+                output.append(f"*{result['description']}*\n")
+            if 'summary' in result and result['summary']:
+                output.append(f"{result['summary']}\n")
+            if 'key_points' in result and result['key_points']:
+                output.append("\nHighlights:\n")
+                for point in result['key_points']:
+                    output.append(f"- {point}\n")
+            output.append("\n")
 
-    return formatted
-
-def create_demo():
-    """Create the Gradio interface"""
-
-    with gr.Blocks(title="Web Search + RAG") as demo:
-        gr.Markdown("# 🔍 Intelligent Web Search")
-        gr.Markdown("Search the web with AI-powered insights and analysis.")
-
-        with gr.Row():
-            with gr.Column():
-                query = gr.Textbox(
-                    label="Search Query",
-                    placeholder="Enter your search query...",
-                    lines=2
-                )
-                max_results = gr.Slider(
-                    minimum=1,
-                    maximum=10,
-                    value=5,
-                    step=1,
-                    label="Number of Results"
-                )
-                search_button = gr.Button("🔍 Search")
-
-        output = gr.Markdown()
-
-        search_button.click(
-            fn=safe_search,
-            inputs=[query, max_results],
-            outputs=output
-        )
-
-        gr.Examples(
-            examples=[
-                ["What is RAG in AI?", 5],
-                ["Latest developments in quantum computing", 3],
-                ["How does BERT work?", 5]
-            ],
-            inputs=[query, max_results]
-        )
-
-    return demo
-
-# Create the demo
-demo = create_demo()
-
-# Launch for Spaces
-demo.launch()
+    # Add follow-up questions section
+    if 'follow_up_questions' in results and results['follow_up_questions']:
+        output.append("# ❓ Related Questions\n")
+        for question in results['follow_up_questions']:
+            output.append(f"- {question}\n")
+
+    return "\n".join(output)
+
+def search_and_format(query):
+    """Search and format results"""
+    try:
+        results = search(query)
+        return format_results(results)
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
+
+# Create the Gradio interface
+interface = gr.Interface(
+    fn=search_and_format,
+    inputs=gr.Textbox(
+        label="Enter your search query",
+        placeholder="What would you like to learn about?",
+        lines=2
+    ),
+    outputs=gr.Markdown(
+        label="Search Results",
+        show_label=True
+    ),
+    title="🔍 AI-Powered Web Search",
+    description="""
+    This search engine uses AI to:
+    - Find relevant web pages
+    - Extract key information
+    - Generate insights and summaries
+    - Suggest follow-up questions
+    """,
+    examples=[
+        ["What is quantum computing?"],
+        ["Latest developments in artificial intelligence"],
+        ["How does blockchain technology work?"],
+        ["Explain machine learning in simple terms"],
+    ],
+    theme=gr.themes.Soft()
+)
+
+# Launch the app
+if __name__ == "__main__":
+    interface.launch()
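
A quick way to sanity-check the new renderer is to call format_results on a small hand-built dict. A minimal sketch with hypothetical data, using only the keys the function actually reads (importing from app should be safe here because interface.launch() sits behind the __main__ guard):

from app import format_results

# Hypothetical payload, shaped like the dict search_engine.search() returns
sample = {
    'insights': 'Qubits can represent 0 and 1 at the same time.',
    'key_points': ['Superposition enables parallel evaluation'],
    'results': [{
        'title': 'Quantum computing',
        'url': 'https://example.com/quantum',
        'description': 'A short introduction',
        'summary': 'An overview of qubits and gates.',
        'key_points': ['Qubits', 'Entanglement'],
    }],
    'follow_up_questions': ['What is a qubit?'],
}

print(format_results(sample))  # prints the Markdown the app shows in its output pane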
search_engine.py CHANGED
@@ -50,101 +50,102 @@ class ContentProcessor:
         # Remove extra whitespace
         text = ' '.join(text.split())
         # Remove common navigation elements
-        nav_elements = [
-            "Skip to content",
-            "Search",
-            "Menu",
-            "Navigation",
-            "Subscribe",
-            "Browse",
-            "Submit",
-            "More",
-            "About",
-            "Contact",
-            "Privacy Policy",
-            "Terms of Use"
+        nav_patterns = [
+            "skip to content",
+            "skip to navigation",
+            "search",
+            "menu",
+            "subscribe",
+            "sign in",
+            "log in",
+            "browse",
+            "submit",
         ]
-        for element in nav_elements:
-            text = text.replace(element, "")
+        for pattern in nav_patterns:
+            text = text.replace(pattern.lower(), "")
         return text.strip()
 
     def extract_main_content(self, soup: BeautifulSoup) -> str:
         """Extract main content from HTML"""
         # Remove navigation, headers, footers
-        for elem in soup.find_all(['nav', 'header', 'footer', 'script', 'style', 'meta', 'link']):
+        for elem in soup.find_all(['nav', 'header', 'footer', 'aside', 'script', 'style']):
             elem.decompose()
 
         # Try to find main content container
         main_content = None
-        content_tags = ['article', 'main', '[role="main"]', '.content', '#content', '.post', '.entry']
-
-        for tag in content_tags:
+        for tag in ['main', 'article', 'div[role="main"]', '.main-content', '#main-content']:
            main_content = soup.select_one(tag)
            if main_content:
                break
 
         if not main_content:
-            main_content = soup
-
-        # Extract text from paragraphs
-        paragraphs = main_content.find_all('p')
-        if paragraphs:
-            return ' '.join(p.get_text(strip=True) for p in paragraphs)
-
-        # Fallback to all text if no paragraphs found
-        return main_content.get_text(strip=True)
-
-    def process_content(self, content: str, html_content: str = None) -> Dict:
-        """Process content and generate insights"""
-        try:
-            # Clean content
-            cleaned_content = self.clean_text(content)
-
-            # If HTML content is provided, try to extract main content
-            if html_content:
-                soup = BeautifulSoup(html_content, 'lxml')
-                main_content = self.extract_main_content(soup)
-                if main_content:
-                    cleaned_content = self.clean_text(main_content)
-
-            # Generate summary in chunks if content is too long
-            chunks = [cleaned_content[i:i+1024] for i in range(0, len(cleaned_content), 1024)]
-            summaries = []
-
-            for chunk in chunks[:3]:  # Process up to 3 chunks to avoid too long processing
-                try:
-                    summary = self.model_manager.models['summarizer'](
-                        chunk,
-                        max_length=150,
-                        min_length=50,
-                        do_sample=False
-                    )[0]['summary_text']
-                    summaries.append(summary)
-                except Exception as e:
-                    logger.warning(f"Error summarizing chunk: {str(e)}")
-                    continue
-
-            # Combine summaries
-            final_summary = ' '.join(summaries)
-
-            # Extract key points using bullet points
-            key_points = self.model_manager.models['summarizer'](
-                cleaned_content[:1024],
-                max_length=100,
-                min_length=30,
-                num_beams=4,
-                do_sample=True
+            # Fallback to body content
+            main_content = soup.find('body')
+
+        if main_content:
+            text = main_content.get_text(separator=' ', strip=True)
+        else:
+            # Last resort: get all text
+            text = soup.get_text(separator=' ', strip=True)
+
+        return self.clean_text(text)
+
+    def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
+        """Extract key points from text using AI"""
+        try:
+            # Split text into smaller chunks
+            chunk_size = 1024
+            chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
+
+            all_points = []
+            for chunk in chunks[:3]:  # Process first 3 chunks to keep it manageable
+                summary = self.model_manager.models['summarizer'](
+                    chunk,
+                    max_length=100,
+                    min_length=30,
+                    do_sample=False
+                )[0]['summary_text']
+
+                # Split summary into sentences
+                points = [s.strip() for s in summary.split('.') if s.strip()]
+                all_points.extend(points)
+
+            # Return top points
+            return all_points[:max_points]
+
+        except Exception as e:
+            logger.error(f"Error extracting key points: {str(e)}")
+            return []
+
+    def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
+        """Process content and generate insights"""
+        try:
+            # Extract main content if HTML is available
+            if soup:
+                content = self.extract_main_content(soup)
+            else:
+                content = self.clean_text(content)
+
+            # Extract key points
+            key_points = self.extract_key_points(content)
+
+            # Generate overall summary
+            summary = self.model_manager.models['summarizer'](
+                content[:1024],
+                max_length=150,
+                min_length=50,
+                do_sample=False
             )[0]['summary_text']
 
             return {
-                'summary': final_summary,
+                'summary': summary,
                 'key_points': key_points,
-                'content': cleaned_content
+                'content': content
             }
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
-                'key_points': "",
+                'key_points': [],
                 'content': content
             }
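
The selector fallback in the new extract_main_content can be exercised in isolation. A minimal sketch with a toy page (hypothetical HTML; html.parser swapped in for lxml so it runs without the extra dependency):

from bs4 import BeautifulSoup

html = """
<html><body>
  <nav>Menu Search</nav>
  <article><p>Qubits can hold superpositions of 0 and 1.</p></article>
  <footer>Privacy Policy</footer>
</body></html>
"""

soup = BeautifulSoup(html, 'html.parser')

# Drop chrome elements first, as the committed code does
for elem in soup.find_all(['nav', 'header', 'footer', 'aside', 'script', 'style']):
    elem.decompose()

# Walk the selector list until one matches; <article> wins for this page
main_content = None
for tag in ['main', 'article', 'div[role="main"]', '.main-content', '#main-content']:
    main_content = soup.select_one(tag)
    if main_content:
        break

print(main_content.get_text(separator=' ', strip=True))
# Qubits can hold superpositions of 0 and 1.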
 
@@ -221,15 +222,12 @@ class WebSearchEngine:
             response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')
 
+            # Process content with HTML context
+            processed = self.processor.process_content("", soup)
+
             # Get metadata
             metadata = self.get_metadata(soup)
 
-            # Process content with both text and HTML
-            processed = self.processor.process_content(
-                soup.get_text(),
-                html_content=response.text
-            )
-
             return {
                 'url': url,
                 'title': metadata['title'],
@@ -242,35 +240,6 @@ class WebSearchEngine:
         except Exception as e:
             return {'error': f"Error processing {url}: {str(e)}"}
 
-    def format_results(self, results: List[Dict]) -> Dict:
-        """Format search results in a user-friendly way"""
-        formatted_insights = []
-        formatted_results = []
-
-        for result in results:
-            if 'error' not in result:
-                # Format key points
-                if result.get('key_points'):
-                    points = result['key_points'].split('. ')
-                    formatted_points = [f"• {point.strip()}" for point in points if point.strip()]
-                    formatted_insights.extend(formatted_points)
-
-                # Format detailed result
-                formatted_result = {
-                    'title': result['title'],
-                    'url': result['url'],
-                    'summary': result['summary'],
-                }
-                formatted_results.append(formatted_result)
-
-        # Remove duplicates while preserving order
-        formatted_insights = list(dict.fromkeys(formatted_insights))
-
-        return {
-            'insights': '\n'.join(formatted_insights[:10]),  # Top 10 insights
-            'results': formatted_results
-        }
-
     def search_duckduckgo(self, query: str, max_results: int = 5) -> List[Dict]:
         """Search DuckDuckGo and parse HTML results"""
         search_results = []
@@ -336,26 +305,40 @@ class WebSearchEngine:
             return {'error': 'No results found'}
 
         results = []
+        all_key_points = []
+
         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
                     results.append(processed)
+                    if 'key_points' in processed:
+                        all_key_points.extend(processed['key_points'])
                 time.sleep(random.uniform(0.5, 1.0))
 
         if not results:
             return {'error': 'Failed to process any search results'}
 
-        # Format results in a user-friendly way
-        formatted = self.format_results(results)
+        # Combine all summaries and key points
+        all_summaries = [r['summary'] for r in results if 'summary' in r]
+        combined_summary = " ".join(all_summaries)
+
+        # Generate final insights
+        final_summary = self.processor.model_manager.models['summarizer'](
+            combined_summary[:1024],
+            max_length=200,
+            min_length=100,
+            do_sample=False
+        )[0]['summary_text']
 
         return {
-            'results': formatted['results'],
-            'insights': formatted['insights'],
+            'results': results,
+            'insights': final_summary,
+            'key_points': list(set(all_key_points)),  # Remove duplicates
             'follow_up_questions': [
-                f"What are the recent breakthroughs in {query}?",
-                f"How does {query} impact various industries?",
-                f"What are the future prospects of {query}?"
+                f"What are the key differences between {query} and related topics?",
+                f"Can you explain {query} in simple terms?",
+                f"What are the latest developments in {query}?"
             ]
         }
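
One behavioral note on the aggregation above: the deleted format_results deduplicated insights with dict.fromkeys, which keeps first-seen order, while the new code deduplicates key points with list(set(...)), which does not guarantee any order. A quick illustration:

points = ['uses qubits', 'runs in superposition', 'uses qubits']

print(list(dict.fromkeys(points)))  # ['uses qubits', 'runs in superposition'], first-seen order kept
print(list(set(points)))            # same two items, in arbitrary order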
 
 