fikird committed on
Commit 636f8ae · 1 Parent(s): 68c6844

Enhance content processing with better extraction and summarization

Files changed (2):
  1. app.py +110 -70
  2. search_engine.py +68 -93
app.py CHANGED
@@ -1,86 +1,126 @@
 import gradio as gr
-from search_engine import search
-
-def format_search_results(results):
-    """Format search results into a clean markdown output"""
-    if 'error' in results:
-        return f"❌ Error: {results['error']}"
-
-    output = []
-
     # Add insights section
-    if 'insights' in results and results['insights']:
-        output.append("# 🔍 Latest Developments Summary\n")
-        output.append(results['insights'])
-        output.append("\n")
-
-    # Add key points section
-    if 'key_points' in results and results['key_points']:
-        output.append("# 💡 Key Points\n")
-        for point in results['key_points'][:5]:  # Limit to top 5 points
-            output.append(f"• {point}\n")
-        output.append("\n")
-
-    # Add detailed results section
-    if 'results' in results and results['results']:
-        output.append("# 📄 Detailed Findings\n")
         for i, result in enumerate(results['results'], 1):
-            output.append(f"## {i}. {result.get('title', 'No Title')}\n")
             if 'url' in result:
-                output.append(f"🔗 [Source]({result['url']})\n")
             if 'summary' in result:
-                output.append(f"\n{result['summary']}\n")
-            if 'key_points' in result and result['key_points']:
-                output.append("\nKey Takeaways:")
-                for point in result['key_points'][:3]:  # Limit to top 3 points per result
-                    output.append(f"• {point}")
-                output.append("\n")
-
-    # Add follow-up questions section
-    if 'follow_up_questions' in results and results['follow_up_questions']:
-        output.append("# ❓ Suggested Follow-up Questions\n")
-        for question in results['follow_up_questions']:
-            output.append(f"• {question}\n")
-
-    return "\n".join(output)
-
-def search_and_format(query):
-    """Search and format results"""
-    if not query.strip():
-        return "Please enter a search query"
-
-    try:
-        results = search(query)
-        return format_search_results(results)
-    except Exception as e:
-        return f"❌ Error performing search: {str(e)}"
-
-# Create Gradio interface
-iface = gr.Interface(
-    fn=search_and_format,
-    inputs=gr.Textbox(
-        label="Enter your search query",
-        placeholder="Example: Latest developments in quantum computing"
-    ),
-    outputs=gr.Markdown(label="Search Results"),
-    title="AI-Powered Research Assistant",
-    description="""
-    This tool helps you research topics by:
-    1. Finding relevant information from multiple sources
-    2. Summarizing key findings
-    3. Extracting important points
-    4. Suggesting follow-up questions
-
-    Try searching for topics in technology, science, or any other field!
-    """,
-    examples=[
-        ["Latest developments in quantum computing"],
-        ["Artificial intelligence breakthroughs"],
-        ["Climate change solutions"],
-        ["Space exploration advancements"],
-    ],
-    theme=gr.themes.Soft()
-)
 # Launch for Spaces
-iface.launch()
 import gradio as gr
+from rag_engine import RAGEngine
+import torch
+import os
+import logging
+import traceback
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+def safe_search(query, max_results):
+    """Wrapper function to handle errors gracefully"""
+    try:
+        rag = RAGEngine()
+        results = rag.search_and_process(query, max_results)
+
+        if 'error' in results:
+            return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{results['error']}\n```"
+
+        return format_results(results)
+    except Exception as e:
+        error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
+        logger.error(error_msg)
+        return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
+
+def format_results(results):
+    """Format search results for display"""
+    if not results or not results.get('results'):
+        return "# ⚠️ No Results\nNo search results were found. Please try a different query."
+
+    formatted = f"# 🔍 Search Results\n\n"
+
     # Add insights section
+    if 'insights' in results:
+        formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"
+
+    # Add follow-up questions
+    if 'follow_up_questions' in results:
+        formatted += "## ❓ Follow-up Questions\n"
+        for q in results['follow_up_questions']:
+            if q and q.strip():
+                formatted += f"- {q.strip()}\n"
+        formatted += "\n"
+
+    # Add main results
+    if 'results' in results:
+        formatted += "## 📄 Detailed Results\n\n"
     for i, result in enumerate(results['results'], 1):
+        if not isinstance(result, dict):
+            continue
+
+        formatted += f"### {i}. "
         if 'url' in result:
+            title = result.get('title', 'Untitled')
+            formatted += f"[{title}]({result['url']})\n"
         if 'summary' in result:
+            formatted += f"\n{result['summary']}\n\n"
+
+    # Add similar chunks if available
+    if 'similar_chunks' in results:
+        formatted += "## 🔍 Related Content\n\n"
+        for i, chunk in enumerate(results['similar_chunks'], 1):
+            if not isinstance(chunk, dict):
+                continue
+
+            formatted += f"### Related {i}\n"
+            if 'metadata' in chunk:
+                meta = chunk['metadata']
+                if 'title' in meta and 'url' in meta:
+                    formatted += f"From [{meta['title']}]({meta['url']})\n"
+            if 'content' in chunk:
+                formatted += f"\n{chunk['content'][:200]}...\n\n"
+
+    return formatted
+
+def create_demo():
+    """Create the Gradio interface"""
+
+    with gr.Blocks(title="Web Search + RAG") as demo:
+        gr.Markdown("# 🔍 Intelligent Web Search")
+        gr.Markdown("Search the web with AI-powered insights and analysis.")
+
+        with gr.Row():
+            with gr.Column():
+                query = gr.Textbox(
+                    label="Search Query",
+                    placeholder="Enter your search query...",
+                    lines=2
+                )
+                max_results = gr.Slider(
+                    minimum=1,
+                    maximum=10,
+                    value=5,
+                    step=1,
+                    label="Number of Results"
+                )
+                search_button = gr.Button("🔍 Search")
+
+        output = gr.Markdown()
+
+        search_button.click(
+            fn=safe_search,
+            inputs=[query, max_results],
+            outputs=output
+        )
+
+        gr.Examples(
+            examples=[
+                ["What is RAG in AI?", 5],
+                ["Latest developments in quantum computing", 3],
+                ["How does BERT work?", 5]
+            ],
+            inputs=[query, max_results]
+        )
+
+    return demo
+
+# Create the demo
+demo = create_demo()
+
 # Launch for Spaces
+demo.launch()
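
Note that rag_engine.py is not part of this commit, so the contract app.py now relies on has to be inferred from the diff: `safe_search` instantiates `RAGEngine` and calls `search_and_process(query, max_results)`, and `format_results` reads the keys `results`, `insights`, `follow_up_questions`, and (optionally) `similar_chunks` from the returned dict. A minimal sketch of a stub satisfying that inferred shape (the class name, URLs, and sample values here are hypothetical, not from the repo):

```python
# Hypothetical stand-in for the RAGEngine interface app.py now assumes.
# rag_engine.py is not in this commit, so the return shape is inferred
# from the keys format_results() reads; treat this as a test stub.

class StubRAGEngine:
    """Stands in for rag_engine.RAGEngine when testing the UI offline."""

    def search_and_process(self, query: str, max_results: int) -> dict:
        # format_results() reads 'results', 'insights',
        # 'follow_up_questions', and optionally 'similar_chunks'.
        return {
            'results': [{
                'url': 'https://example.com',
                'title': f'Result about {query}',
                'summary': f'A short summary of {query}.',
            }][:max_results],
            'insights': f'Aggregated insights about {query}.',
            'follow_up_questions': [f'How is {query} used in practice?'],
            'similar_chunks': [{
                'metadata': {'title': 'Source page', 'url': 'https://example.com'},
                'content': 'A related passage from the indexed pages. ' * 8,
            }],
        }

if __name__ == '__main__':
    stub = StubRAGEngine()
    print(sorted(stub.search_and_process('quantum computing', 5).keys()))
```

Swapping such a stub in for `RAGEngine` (for example, by monkeypatching it in a test) exercises `safe_search` and `format_results` without loading models or touching the network.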
search_engine.py CHANGED
@@ -49,96 +49,78 @@ class ContentProcessor:
         """Clean and normalize text content"""
         # Remove extra whitespace
         text = ' '.join(text.split())
-        # Remove common navigation elements
-        nav_elements = [
-            "skip to content",
-            "search",
-            "menu",
-            "navigation",
-            "subscribe",
-            "sign in",
-            "log in",
-            "submit",
-            "browse",
-            "explore",
-        ]
-        for element in nav_elements:
-            text = text.replace(element.lower(), "")
-        return text.strip()
-
-    def extract_main_content(self, content: str) -> str:
-        """Extract main content from webpage text"""
-        # Split into paragraphs
-        paragraphs = [p.strip() for p in content.split('\n') if p.strip()]
-
-        # Filter out short lines and navigation elements
-        meaningful_paragraphs = []
-        for p in paragraphs:
-            # Skip if too short
-            if len(p.split()) < 5:
-                continue
-            # Skip if looks like navigation
-            if any(nav in p.lower() for nav in ["→", "↓", "menu", "search", "click"]):
-                continue
-            meaningful_paragraphs.append(p)
-
-        # Join remaining paragraphs
-        return ' '.join(meaningful_paragraphs)
-
-    def generate_insights(self, content: str) -> Dict[str, str]:
-        """Generate insights from content using AI"""
         try:
-            # Clean the content first
             cleaned_content = self.clean_text(content)
-            main_content = self.extract_main_content(cleaned_content)
-
-            if not main_content:
-                return {
-                    'summary': "Could not extract meaningful content",
-                    'key_points': [],
-                    'content': content
-                }
-
-            # Generate summary
-            summary = self.model_manager.models['summarizer'](
-                main_content[:1024],
-                max_length=150,
-                min_length=50,
-                do_sample=False
-            )[0]['summary_text']
-
-            # Extract key points using the same model
-            key_points_text = self.model_manager.models['summarizer'](
-                main_content[:1024],
                 max_length=200,
                 min_length=100,
                 num_beams=4,
-                do_sample=True
             )[0]['summary_text']
-
-            # Split into bullet points
-            key_points = [
-                point.strip()
-                for point in key_points_text.split('.')
-                if point.strip() and len(point.split()) > 3
-            ]
-
             return {
                 'summary': summary,
                 'key_points': key_points,
-                'content': main_content
             }
-
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
                 'key_points': [],
                 'content': content
             }
-
-    def process_content(self, content: str) -> Dict:
-        """Process content and generate insights"""
-        return self.generate_insights(content)

 class WebSearchEngine:
     """Main search engine class"""
@@ -225,7 +207,7 @@ class WebSearchEngine:
         metadata = self.get_metadata(soup)

         # Process content
-        processed = self.processor.process_content(content)

         return {
             'url': url,
@@ -304,49 +286,42 @@ class WebSearchEngine:
             return {'error': 'No results found'}

         results = []
-        all_insights = []

         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
-                    # Add the snippet to help with context
                     processed['snippet'] = result.get('snippet', '')
                     results.append(processed)
-
-                    # Collect insights
-                    if 'summary' in processed:
-                        all_insights.append(processed['summary'])
                     if 'key_points' in processed:
-                        all_insights.extend(processed.get('key_points', []))
-
             time.sleep(random.uniform(0.5, 1.0))

         if not results:
             return {'error': 'Failed to process any search results'}

-        # Combine and summarize all insights
-        combined_insights = ' '.join(all_insights)
-        final_summary = self.processor.model_manager.models['summarizer'](
-            combined_insights[:1024],
-            max_length=200,
-            min_length=100,
-            do_sample=False
-        )[0]['summary_text']

-        # Generate specific follow-up questions
-        follow_ups = [
-            f"What are the recent breakthroughs in {query}?",
             f"How does {query} impact industry and research?",
-            f"What are the challenges and limitations in {query}?",
-            f"What are the future prospects for {query}?"
         ]

         return {
             'results': results,
-            'insights': final_summary,
-            'key_points': list(set(all_insights)),  # Remove duplicates
-            'follow_up_questions': follow_ups
         }

         except Exception as e:
         """Clean and normalize text content"""
         # Remove extra whitespace
         text = ' '.join(text.split())
+        # Remove redundant headers and navigation text
+        common_headers = ['skip to content', 'search', 'menu', 'navigation', 'subscribe']
+        lines = []
+        for line in text.split('\n'):
+            line = line.strip().lower()
+            if not any(header in line for header in common_headers) and len(line) > 20:
+                lines.append(line)
+        return ' '.join(lines)
+
+    def extract_key_points(self, content: str) -> List[str]:
+        """Extract key points from content using AI"""
+        try:
+            # Split content into chunks for processing
+            chunks = [content[i:i+1024] for i in range(0, len(content), 1024)]
+            key_points = []
+
+            for chunk in chunks:
+                # Generate focused summary for each chunk
+                summary = self.model_manager.models['summarizer'](
+                    chunk,
+                    max_length=150,
+                    min_length=50,
+                    do_sample=False,
+                    num_beams=4,
+                    length_penalty=2.0,
+                    early_stopping=True
+                )[0]['summary_text']
+
+                key_points.append(summary)
+
+            return key_points
+        except Exception as e:
+            logger.error(f"Error extracting key points: {str(e)}")
+            return []
+
+    def process_content(self, content: str, title: str = "", description: str = "") -> Dict:
+        """Process content and generate insights"""
         try:
+            # Clean the content
             cleaned_content = self.clean_text(content)
+
+            # Combine title and description with content for context
+            if title:
+                cleaned_content = f"{title}. {cleaned_content}"
+            if description:
+                cleaned_content = f"{description}. {cleaned_content}"
+
+            # Extract key points
+            key_points = self.extract_key_points(cleaned_content)
+
+            # Generate overall summary
+            summary = self.model_manager.models['summarizer'](
+                ' '.join(key_points)[:1024],
                 max_length=200,
                 min_length=100,
+                do_sample=False,
                 num_beams=4,
+                length_penalty=2.0,
+                early_stopping=True
             )[0]['summary_text']

             return {
                 'summary': summary,
                 'key_points': key_points,
+                'content': cleaned_content
             }
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
                 'key_points': [],
                 'content': content
             }

 class WebSearchEngine:
     """Main search engine class"""

         metadata = self.get_metadata(soup)

         # Process content
+        processed = self.processor.process_content(content, metadata['title'], metadata['description'])

         return {
             'url': url,

             return {'error': 'No results found'}

         results = []
+        all_key_points = []

         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
+                    # Add original search snippet
                     processed['snippet'] = result.get('snippet', '')
                     results.append(processed)
+                    # Collect key points
                     if 'key_points' in processed:
+                        all_key_points.extend(processed['key_points'])

             time.sleep(random.uniform(0.5, 1.0))

         if not results:
             return {'error': 'Failed to process any search results'}

+        # Generate comprehensive insights
+        insights = []
+        if all_key_points:
+            # Group similar points and remove duplicates
+            unique_points = list(set(all_key_points))
+            insights = self.processor.extract_key_points(' '.join(unique_points))

+        # Generate relevant follow-up questions
+        follow_up_questions = [
+            f"What are the practical applications of {query}?",
             f"How does {query} impact industry and research?",
+            f"What challenges and limitations exist in {query}?",
+            f"What future developments are expected in {query}?"
         ]

         return {
             'results': results,
+            'insights': insights if insights else ["No comprehensive insights available."],
+            'follow_up_questions': follow_up_questions
         }

         except Exception as e:
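
The core change in `ContentProcessor` is a map-reduce summarization pass: `extract_key_points` summarizes fixed 1024-character chunks one at a time, and `process_content` then summarizes the concatenated chunk summaries. A standalone sketch of that pattern, assuming a Hugging Face `transformers` summarization pipeline (the checkpoint name is an assumption; the repo's `ModelManager` may load a different summarizer):

```python
# Sketch of the chunk-then-reduce summarization this commit introduces.
# Assumes the transformers summarization pipeline; the model checkpoint
# below is illustrative, not taken from this repo's ModelManager.
from transformers import pipeline

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

def summarize_long(text: str, chunk_size: int = 1024) -> dict:
    if not text:
        return {"summary": "", "key_points": []}

    # Map step: one focused summary per chunk (mirrors extract_key_points).
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    key_points = [
        summarizer(
            chunk,
            max_length=150,
            min_length=50,
            do_sample=False,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True,
        )[0]["summary_text"]
        for chunk in chunks
    ]

    # Reduce step: summarize the joined chunk summaries
    # (mirrors process_content's overall summary).
    summary = summarizer(
        " ".join(key_points)[:chunk_size],
        max_length=200,
        min_length=100,
        do_sample=False,
        num_beams=4,
    )[0]["summary_text"]
    return {"summary": summary, "key_points": key_points}
```

Note that the reduce step keeps the same 1024-character truncation as the committed code, so for very long pages detail beyond the first few chunk summaries is still dropped; raising the cap or reducing hierarchically would be the natural next refinement.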