fikird committed on
Commit f2c01c1 · 1 Parent(s): ae8bccc

Enhance content processing with better extraction and summarization

Files changed (2)
  1. app.py +21 -46
  2. search_engine.py +75 -72
app.py CHANGED
@@ -32,72 +32,50 @@ def format_results(results):
     if not results or not results.get('results'):
         return "# ⚠️ No Results\nNo search results were found. Please try a different query."
 
-    insights = results.get('insights', {})
-    output = []
-
-    # Main Summary
-    output.append("📝 Executive Summary")
-    output.append("-" * 50)
-    output.append(insights.get('main_summary', ''))
-    output.append("\n")
-
-    # Key Findings
-    output.append("🔑 Key Findings")
-    output.append("-" * 50)
-    for i, point in enumerate(insights.get('key_findings', []), 1):
-        output.append(f"{i}. {point}")
-    output.append("\n")
-
-    # Sources
-    output.append("📚 Sources")
-    output.append("-" * 50)
-    for source in insights.get('sources', []):
-        output.append(f"• {source.get('title', '')}")
-        output.append(f" {source.get('url', '')}")
-    output.append("\n")
-
-    # Follow-up Questions
-    output.append("❓ Suggested Questions")
-    output.append("-" * 50)
-    for question in results.get('follow_up_questions', []):
-        output.append(f"• {question}")
+    formatted = f"# 🔍 Search Results\n\n"
+
+    # Add insights section
+    if 'insights' in results:
+        formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"
+
+    # Add follow-up questions
+    if 'follow_up_questions' in results:
+        formatted += "## ❓ Follow-up Questions\n"
+        for q in results['follow_up_questions']:
+            if q and q.strip():
+                formatted += f"- {q.strip()}\n"
+        formatted += "\n"
 
     # Add main results
     if 'results' in results:
-        output.append("\n")
-        output.append("📄 Detailed Results")
-        output.append("-" * 50)
-        output.append("\n")
+        formatted += "## 📄 Detailed Results\n\n"
         for i, result in enumerate(results['results'], 1):
             if not isinstance(result, dict):
                 continue
 
-            output.append(f"### {i}. ")
+            formatted += f"### {i}. "
             if 'url' in result:
                 title = result.get('title', 'Untitled')
-                output.append(f"[{title}]({result['url']})\n")
+                formatted += f"[{title}]({result['url']})\n"
             if 'summary' in result:
-                output.append(f"\n{result['summary']}\n\n")
+                formatted += f"\n{result['summary']}\n\n"
 
     # Add similar chunks if available
     if 'similar_chunks' in results:
-        output.append("\n")
-        output.append("🔍 Related Content")
-        output.append("-" * 50)
-        output.append("\n")
+        formatted += "## 🔍 Related Content\n\n"
         for i, chunk in enumerate(results['similar_chunks'], 1):
             if not isinstance(chunk, dict):
                 continue
 
-            output.append(f"### Related {i}\n")
+            formatted += f"### Related {i}\n"
            if 'metadata' in chunk:
                meta = chunk['metadata']
                if 'title' in meta and 'url' in meta:
-                    output.append(f"From [{meta['title']}]({meta['url']})\n")
            if 'content' in chunk:
-                output.append(f"\n{chunk['content'][:200]}...\n\n")
+                formatted += f"From [{meta['title']}]({meta['url']})\n"
+                formatted += f"\n{chunk['content'][:200]}...\n\n"
 
-    return "\n".join(output)
+    return formatted
 
 def create_demo():
     """Create the Gradio interface"""
@@ -122,10 +100,7 @@ def create_demo():
     )
     search_button = gr.Button("🔍 Search")
 
-    output = gr.Textbox(
-        label="Search Results",
-        lines=20
-    )
+    output = gr.Markdown()
 
     search_button.click(
         fn=safe_search,
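With `format_results` returning a single markdown string and the output component switched from `gr.Textbox` to `gr.Markdown`, headings and links now render in the UI instead of appearing as raw text. A minimal sketch of the input shape the new formatter expects (run inside app.py, or after importing `format_results` from it); the keys mirror the diff, while the sample values are invented:

```python
# Hypothetical results dict; keys match what format_results reads after this commit.
sample = {
    "insights": "A short synthesized overview of the topic.",
    "follow_up_questions": ["What are the recent breakthroughs in RAG?"],
    "results": [
        {
            "url": "https://example.com/article",
            "title": "Example Article",
            "summary": "One-paragraph summary of the page.",
        }
    ],
    "similar_chunks": [
        {
            "metadata": {"title": "Example Article", "url": "https://example.com/article"},
            "content": "A related passage retrieved from the index ...",
        }
    ],
}

print(format_results(sample))
# Produces "# 🔍 Search Results", then "## 💡 Key Insights",
# "## ❓ Follow-up Questions", "## 📄 Detailed Results" and "## 🔍 Related Content".
```

Note that `insights` is now expected to be a plain string rather than the old dict of `main_summary`/`key_findings`/`sources`, matching the change in `search_engine.py` below.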
 
search_engine.py CHANGED
@@ -52,72 +52,76 @@ class ContentProcessor:
         # Remove common navigation elements
         nav_elements = [
             "skip to content",
-            "skip to navigation",
             "search",
             "menu",
+            "navigation",
             "subscribe",
             "sign in",
             "log in",
             "submit",
             "browse",
-            "explore",
         ]
         for element in nav_elements:
             text = text.replace(element.lower(), "")
         return text.strip()
 
     def extract_main_content(self, soup: BeautifulSoup) -> str:
-        """Extract main content from HTML soup"""
-        # Remove navigation, headers, footers, and sidebars
-        for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
-            elem.decompose()
-
-        # Remove script and style elements
-        for elem in soup.find_all(['script', 'style']):
-            elem.decompose()
-
-        # Try to find main content area
-        main_content = None
-        content_tags = ['article', 'main', '[role="main"]', '#content', '.content', '.post-content']
-
-        for tag in content_tags:
-            main_content = soup.select_one(tag)
-            if main_content:
-                break
-
-        # If no main content found, use body
-        if not main_content:
-            main_content = soup.find('body')
-
-        if main_content:
-            text = main_content.get_text(separator=' ', strip=True)
-        else:
-            text = soup.get_text(separator=' ', strip=True)
-
-        return self.clean_text(text)
+        """Extract main content from HTML, prioritizing article content"""
+        content = ""
+
+        # Try to find main content containers
+        priority_tags = [
+            ('article', {}),
+            ('div', {'class': ['article', 'post', 'content', 'main']}),
+            ('div', {'id': ['article', 'post', 'content', 'main']}),
+            ('main', {}),
+        ]
+
+        for tag, attrs in priority_tags:
+            elements = soup.find_all(tag, attrs)
+            if elements:
+                content = " ".join(elem.get_text(strip=True) for elem in elements)
+                if content:
+                    break
+
+        # If no main content found, try extracting paragraphs
+        if not content:
+            paragraphs = soup.find_all('p')
+            content = " ".join(p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 100)
+
+        return self.clean_text(content)
 
     def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
-        """Extract key points from text using AI"""
+        """Extract key points from text using sentence transformers"""
         try:
-            # Split text into chunks for processing
-            chunks = [text[i:i + 1024] for i in range(0, len(text), 1024)]
-            all_points = []
-
-            for chunk in chunks[:3]:  # Process first 3 chunks to keep it manageable
-                summary = self.model_manager.models['summarizer'](
-                    chunk,
-                    max_length=100,
-                    min_length=30,
-                    do_sample=False
-                )[0]['summary_text']
-
-                # Split summary into sentences
-                points = [s.strip() for s in summary.split('.') if s.strip()]
-                all_points.extend(points)
-
-            # Return top points
-            return all_points[:max_points]
+            # Split into sentences
+            sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 20]
+            if not sentences:
+                return []
+
+            # Get embeddings for sentences
+            embeddings = self.model_manager.models['embeddings'].embed_documents(sentences)
+
+            # Use simple clustering to find diverse sentences
+            selected_indices = [0]  # Start with first sentence
+            for _ in range(min(max_points - 1, len(sentences) - 1)):
+                # Find sentence most different from selected ones
+                max_diff = -1
+                max_idx = -1
+                for i in range(len(sentences)):
+                    if i not in selected_indices:
+                        # Calculate average difference from selected sentences
+                        diffs = [sum((embeddings[i][j] - embeddings[k][j])**2
+                                     for j in range(len(embeddings[i])))
+                                 for k in selected_indices]
+                        avg_diff = sum(diffs) / len(diffs)
+                        if avg_diff > max_diff:
+                            max_diff = avg_diff
+                            max_idx = i
+                if max_idx != -1:
+                    selected_indices.append(max_idx)
+
+            return [sentences[i] for i in selected_indices]
         except Exception as e:
             logger.error(f"Error extracting key points: {str(e)}")
             return []
@@ -125,16 +129,13 @@ class ContentProcessor:
     def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
         """Process content and generate insights"""
         try:
-            # Extract main content if soup is provided
+            # Extract main content if HTML is available
             if soup:
                 content = self.extract_main_content(soup)
             else:
                 content = self.clean_text(content)
-
-            # Extract key points
-            key_points = self.extract_key_points(content)
 
-            # Generate overall summary
+            # Generate summary
             summary = self.model_manager.models['summarizer'](
                 content[:1024],
                 max_length=150,
@@ -142,18 +143,19 @@ class ContentProcessor:
                 do_sample=False
             )[0]['summary_text']
 
+            # Extract key points
+            key_points = self.extract_key_points(content)
+
             return {
                 'summary': summary,
-                'key_points': key_points,
-                'content': content
+                'content': content,
+                'key_points': key_points
             }
-
         except Exception as e:
-            logger.error(f"Error processing content: {str(e)}")
             return {
                 'summary': f"Error processing content: {str(e)}",
-                'key_points': [],
-                'content': content
+                'content': content,
+                'key_points': []
             }
 
 class WebSearchEngine:
@@ -229,12 +231,12 @@ class WebSearchEngine:
             response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')
 
-            # Process content with BeautifulSoup object
-            processed = self.processor.process_content("", soup)
-
             # Get metadata
             metadata = self.get_metadata(soup)
 
+            # Process content
+            processed = self.processor.process_content("", soup=soup)
+
             return {
                 'url': url,
                 'title': metadata['title'],
@@ -326,24 +328,25 @@ class WebSearchEngine:
             if not results:
                 return {'error': 'Failed to process any search results'}
 
-            # Combine all summaries and key points
-            all_summaries = [r['summary'] for r in results if 'summary' in r]
-            combined_summary = " ".join(all_summaries)
+            # Combine insights from all results
+            combined_summary = " ".join([r['summary'] for r in results if 'summary' in r])
 
-            # Format insights
-            insights = {
-                'main_summary': combined_summary[:500],
-                'key_findings': list(set(all_key_points))[:7],  # Remove duplicates and limit to top 7
-                'sources': [{'title': r['title'], 'url': r['url']} for r in results]
-            }
+            # Generate overall insights
+            insights = self.processor.model_manager.models['summarizer'](
+                combined_summary,
+                max_length=200,
+                min_length=100,
+                do_sample=False
+            )[0]['summary_text']
 
             return {
                 'results': results,
                 'insights': insights,
+                'key_points': all_key_points[:10],  # Top 10 key points
                 'follow_up_questions': [
-                    f"What are the practical applications of {query}?",
-                    f"How has {query} evolved over the past year?",
-                    f"What challenges remain in {query}?"
+                    f"What are the recent breakthroughs in {query}?",
+                    f"How does {query} impact various industries?",
+                    f"What are the future prospects of {query}?"
                 ]
             }
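A note on the rewritten `extract_main_content`: it drops the old decompose-and-select approach in favor of checking likely content containers in priority order, falling back to concatenating long `<p>` elements. The lookup can be exercised standalone; a sketch against an invented HTML snippet, assuming only `beautifulsoup4` and `lxml`, both already used by `search_engine.py`:

```python
from bs4 import BeautifulSoup

html = """
<html><body>
  <nav>menu</nav>
  <div class="content"><p>Main body text of the page.</p></div>
  <p>A stray paragraph outside any recognized container.</p>
</body></html>
"""
soup = BeautifulSoup(html, "lxml")

# Same priority order as the diff: <article>, then divs by class/id, then <main>.
priority_tags = [
    ("article", {}),
    ("div", {"class": ["article", "post", "content", "main"]}),
    ("div", {"id": ["article", "post", "content", "main"]}),
    ("main", {}),
]

content = ""
for tag, attrs in priority_tags:
    elements = soup.find_all(tag, attrs)
    if elements:
        content = " ".join(el.get_text(strip=True) for el in elements)
        if content:
            break

print(content)  # "Main body text of the page."
```

One trade-off worth noting: unlike the removed version, nothing decomposes `<script>` or `<style>` first, so a matched container that embeds scripts will leak that text into `content`.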
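In `extract_key_points`, the chunk-summarization loop gives way to embedding-based selection: start from the first sentence, then greedily add the sentence with the largest average squared distance to those already picked (a diversity heuristic rather than true clustering, despite the comment). The pure-Python inner loops are O(n²·d); an equivalent vectorized sketch with numpy, using invented 2-D embeddings in place of the real ones from `model_manager.models['embeddings']`:

```python
import numpy as np

def select_diverse(embeddings: np.ndarray, max_points: int = 5) -> list:
    """Greedy pick: start at index 0, then repeatedly take the sentence whose
    average squared distance to the already-selected set is largest."""
    selected = [0]
    for _ in range(min(max_points - 1, len(embeddings) - 1)):
        # Squared distances from every sentence to each selected one: shape (n, k)
        diffs = ((embeddings[:, None, :] - embeddings[selected][None, :, :]) ** 2).sum(-1)
        avg = diffs.mean(axis=1)
        avg[selected] = -np.inf  # never re-pick an already-selected sentence
        selected.append(int(avg.argmax()))
    return selected

# Toy check with invented 2-D "embeddings":
emb = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [10.0, 0.0]])
print(select_diverse(emb, max_points=3))  # [0, 3, 2]
```

Masking already-selected rows with `-inf` before `argmax` is equivalent to the diff's `if i not in selected_indices` guard.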
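Finally, `search` now builds `insights` by running the concatenated per-page summaries through the summarizer a second time, so it comes back as a plain string, which is exactly what the new `format_results` renders under "Key Insights". A sketch of that second pass, assuming a `transformers` summarization pipeline comparable to whatever `model_manager.models['summarizer']` holds (the checkpoint name is an assumption):

```python
from transformers import pipeline

# Stand-in for model_manager.models['summarizer']; the checkpoint is assumed.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

page_summaries = [
    "First page summary ...",   # invented; real entries are a paragraph each
    "Second page summary ...",
]
combined = " ".join(page_summaries)

# Same call shape as the diff. The diff passes combined_summary untruncated,
# which can exceed the model's input window; truncation=True guards against that.
insights = summarizer(
    combined,
    max_length=200,
    min_length=100,
    do_sample=False,
    truncation=True,
)[0]["summary_text"]
print(insights)
```

Note that `key_points` in the returned dict relies on `all_key_points` being accumulated in the unchanged lines above this hunk.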