fikird committed · commit f2c01c1 · 1 parent: ae8bccc

**Enhance content processing with better extraction and summarization**

Files changed:
- app.py (+21 −46)
- search_engine.py (+75 −72)
app.py CHANGED

```diff
@@ -32,72 +32,50 @@ def format_results(results):
     if not results or not results.get('results'):
         return "# ⚠️ No Results\nNo search results were found. Please try a different query."
 
-    output = []
-
-    output.append(insights.get('main_summary', ''))
-    output.append("\n")
-
-    # Sources
-    output.append("📚 Sources")
-    output.append("-" * 50)
-    for source in insights.get('sources', []):
-        output.append(f"• {source.get('title', '')}")
-        output.append(f"  {source.get('url', '')}")
-    output.append("\n")
-
-    # Follow-up Questions
-    output.append("❓ Suggested Questions")
-    output.append("-" * 50)
-    for question in results.get('follow_up_questions', []):
-        output.append(f"• {question}")
+    formatted = f"# 🔍 Search Results\n\n"
+
+    # Add insights section
+    if 'insights' in results:
+        formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"
+
+    # Add follow-up questions
+    if 'follow_up_questions' in results:
+        formatted += "## ❓ Follow-up Questions\n"
+        for q in results['follow_up_questions']:
+            if q and q.strip():
+                formatted += f"- {q.strip()}\n"
+        formatted += "\n"
 
     # Add main results
     if 'results' in results:
-        output.append("📑 Detailed Results")
-        output.append("-" * 50)
-        output.append("\n")
+        formatted += "## 📑 Detailed Results\n\n"
         for i, result in enumerate(results['results'], 1):
             if not isinstance(result, dict):
                 continue
 
+            formatted += f"### {i}. "
             if 'url' in result:
                 title = result.get('title', 'Untitled')
+                formatted += f"[{title}]({result['url']})\n"
             if 'summary' in result:
+                formatted += f"\n{result['summary']}\n\n"
 
     # Add similar chunks if available
     if 'similar_chunks' in results:
-        output.append("🔗 Related Content")
-        output.append("-" * 50)
-        output.append("\n")
+        formatted += "## 🔗 Related Content\n\n"
         for i, chunk in enumerate(results['similar_chunks'], 1):
             if not isinstance(chunk, dict):
                 continue
 
+            formatted += f"### Related {i}\n"
             if 'metadata' in chunk:
                 meta = chunk['metadata']
                 if 'title' in meta and 'url' in meta:
+                    formatted += f"From [{meta['title']}]({meta['url']})\n"
             if 'content' in chunk:
+                formatted += f"\n{chunk['content'][:200]}...\n\n"
 
-    return "\n".join(output)
+    return formatted
 
 def create_demo():
     """Create the Gradio interface"""
```
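For reference, here is how the rewritten formatter behaves on a typical payload. This is a sketch: the dict shape and values are hypothetical, inferred from the keys `format_results` reads above (`insights`, `follow_up_questions`, `results`, `similar_chunks`).

```python
# Hypothetical payload; keys mirror what format_results() reads above.
sample = {
    "insights": "Transformer models dominate recent NLP benchmarks.",
    "follow_up_questions": ["What is attention?", "  "],  # blank entries are skipped
    "results": [
        {"url": "https://example.com/intro", "title": "Intro to Transformers",
         "summary": "An overview of attention-based models."},
    ],
    "similar_chunks": [
        {"metadata": {"title": "Attention", "url": "https://example.com/attn"},
         "content": "Attention lets a model weigh input tokens by relevance."},
    ],
}

print(format_results(sample))
# Emits Markdown: "# 🔍 Search Results", a "## 💡 Key Insights" section,
# a linked "### 1. ..." entry, and "## 🔗 Related Content" — ready for the
# gr.Markdown component introduced in the next hunk.
```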
```diff
@@ -122,10 +100,7 @@ def create_demo():
         )
         search_button = gr.Button("🔍 Search")
 
-        output = gr.Textbox(
-            label="Search Results",
-            lines=20
-        )
+        output = gr.Markdown()
 
         search_button.click(
             fn=safe_search,
```
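Switching the output widget from a 20-line `Textbox` to `gr.Markdown` is what makes the headings and links emitted by `format_results` actually render. Below is a self-contained sketch of the same wiring; `safe_search` here is a stand-in for the app's real handler, whose body is not part of this diff.

```python
import gradio as gr

def safe_search(query: str) -> str:
    # Stand-in: the real handler runs the search engine and passes the
    # result dict through format_results().
    return f"# 🔍 Search Results\n\nYou searched for **{query}**."

with gr.Blocks() as demo:
    query = gr.Textbox(label="Search Query")
    search_button = gr.Button("🔍 Search")
    output = gr.Markdown()  # renders Markdown, unlike a plain Textbox

    search_button.click(fn=safe_search, inputs=query, outputs=output)

demo.launch()
```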
search_engine.py CHANGED

```diff
@@ -52,72 +52,76 @@ class ContentProcessor:
         # Remove common navigation elements
         nav_elements = [
             "skip to content",
-            "skip to navigation",
             "search",
             "menu",
+            "navigation",
             "subscribe",
             "sign in",
             "log in",
             "submit",
             "browse",
-            "explore",
         ]
         for element in nav_elements:
             text = text.replace(element.lower(), "")
         return text.strip()
 
     def extract_main_content(self, soup: BeautifulSoup) -> str:
-        """Extract main content from HTML"""
-        for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
-            elem.decompose()
-
-        # Remove script and style elements
-        for elem in soup.find_all(['script', 'style']):
-            elem.decompose()
-
-        # Try to find main content area
-        main_content = None
-        content_tags = ['article', 'main', '[role="main"]', '#content', '.content', '.post-content']
+        """Extract main content from HTML, prioritizing article content"""
+        content = ""
+
+        # Try to find main content containers
+        priority_tags = [
+            ('article', {}),
+            ('div', {'class': ['article', 'post', 'content', 'main']}),
+            ('div', {'id': ['article', 'post', 'content', 'main']}),
+            ('main', {}),
+        ]
+
+        for tag, attrs in priority_tags:
+            elements = soup.find_all(tag, attrs)
+            if elements:
+                content = " ".join(elem.get_text(strip=True) for elem in elements)
+                if content:
+                    break
+
+        # If no main content found, try extracting paragraphs
+        if not content:
+            paragraphs = soup.find_all('p')
+            content = " ".join(p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 100)
+
+        return self.clean_text(content)
 
     def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
-        """Extract key points from text using summarization"""
+        """Extract key points from text using sentence transformers"""
         try:
-            # Split text into chunks
-            for chunk in chunks[:3]:  # Process first 3 chunks to keep it manageable
-                summary = self.model_manager.models['summarizer'](
-                    chunk,
-                    max_length=100,
-                    min_length=30,
-                    do_sample=False
-                )[0]['summary_text']
-
-                all_points.extend(points)
+            # Split into sentences
+            sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 20]
+            if not sentences:
+                return []
+
+            # Get embeddings for sentences
+            embeddings = self.model_manager.models['embeddings'].embed_documents(sentences)
+
+            # Use simple clustering to find diverse sentences
+            selected_indices = [0]  # Start with first sentence
+            for _ in range(min(max_points - 1, len(sentences) - 1)):
+                # Find sentence most different from selected ones
+                max_diff = -1
+                max_idx = -1
+                for i in range(len(sentences)):
+                    if i not in selected_indices:
+                        # Calculate average difference from selected sentences
+                        diffs = [sum((embeddings[i][j] - embeddings[k][j])**2
+                                     for j in range(len(embeddings[i])))
+                                 for k in selected_indices]
+                        avg_diff = sum(diffs) / len(diffs)
+                        if avg_diff > max_diff:
+                            max_diff = avg_diff
+                            max_idx = i
+                if max_idx != -1:
+                    selected_indices.append(max_idx)
+
+            return [sentences[i] for i in selected_indices]
         except Exception as e:
             logger.error(f"Error extracting key points: {str(e)}")
             return []
```
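The new `extract_key_points` is a greedy farthest-point selection over sentence embeddings: start from the first sentence, then repeatedly add the sentence with the largest mean squared distance to everything already chosen. A standalone sketch of the same idea on toy 2-D vectors (numpy is used here only for brevity; the diff computes the distances in plain Python):

```python
import numpy as np

def pick_diverse(embeddings: np.ndarray, k: int) -> list:
    # Greedy farthest-point selection, as in extract_key_points above.
    selected = [0]  # start with the first vector
    for _ in range(min(k - 1, len(embeddings) - 1)):
        remaining = [i for i in range(len(embeddings)) if i not in selected]
        # Mean squared distance from each candidate to the selected set.
        dists = [np.mean([np.sum((embeddings[i] - embeddings[j]) ** 2)
                          for j in selected]) for i in remaining]
        selected.append(remaining[int(np.argmax(dists))])
    return selected

# Two tight clusters: the two picks should straddle them.
vecs = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
print(pick_diverse(vecs, 2))  # -> [0, 3]: one point from each cluster
```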
```diff
@@ -125,16 +129,13 @@ class ContentProcessor:
     def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
         """Process content and generate insights"""
         try:
-            # Extract main content if available
+            # Extract main content if HTML is available
             if soup:
                 content = self.extract_main_content(soup)
             else:
                 content = self.clean_text(content)
-
-            # Extract key points
-            key_points = self.extract_key_points(content)
 
+            # Generate summary
             summary = self.model_manager.models['summarizer'](
                 content[:1024],
                 max_length=150,
@@ -142,18 +143,19 @@ class ContentProcessor:
                 do_sample=False
             )[0]['summary_text']
 
+            # Extract key points
+            key_points = self.extract_key_points(content)
+
             return {
                 'summary': summary,
+                'content': content,
+                'key_points': key_points
             }
-
         except Exception as e:
-            logger.error(f"Error processing content: {str(e)}")
             return {
                 'summary': f"Error processing content: {str(e)}",
+                'content': content,
+                'key_points': []
             }
 
 class WebSearchEngine:
```
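`models['summarizer']` is called here with the Hugging Face summarization-pipeline signature. A minimal sketch of that call in isolation; the checkpoint name and the `min_length` value are assumptions for illustration, not taken from this repo:

```python
from transformers import pipeline

# Assumed checkpoint; the repo's ModelManager may load a different one.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

text = "Long article text about content extraction and summarization. " * 40
summary = summarizer(
    text[:1024],      # same truncation as process_content above
    max_length=150,
    min_length=30,    # assumed; the diff is cut off before this argument
    do_sample=False,
)[0]["summary_text"]
print(summary)
```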
```diff
@@ -229,12 +231,12 @@ class WebSearchEngine:
         response = self.safe_get(url)
         soup = BeautifulSoup(response.text, 'lxml')
 
-        # Process content with BeautifulSoup object
-        processed = self.processor.process_content("", soup)
-
         # Get metadata
         metadata = self.get_metadata(soup)
 
+        # Process content
+        processed = self.processor.process_content("", soup=soup)
+
         return {
             'url': url,
             'title': metadata['title'],
```
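The reorder above is behavior-preserving — metadata extraction and content processing both depend only on the parsed `soup` — but it reads more naturally with metadata first. A condensed sketch of the fetch-and-parse flow; a plain `requests.get` stands in for the engine's rate-limited `safe_get`:

```python
import requests
from bs4 import BeautifulSoup

def parse_page_sketch(url: str) -> dict:
    # Plain GET stands in for WebSearchEngine.safe_get (retries, rate limits).
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, "lxml")

    # Metadata first, then content, mirroring the new order above.
    title = (soup.title.string or "").strip() if soup.title else ""
    content = " ".join(p.get_text(strip=True) for p in soup.find_all("p"))

    return {"url": url, "title": title or "Untitled", "content": content[:500]}

print(parse_page_sketch("https://example.com")["title"])
```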
```diff
@@ -326,24 +328,25 @@ class WebSearchEngine:
         if not results:
             return {'error': 'Failed to process any search results'}
 
-        # Combine summaries
-        combined_summary = " ".join(all_summaries)
-
-        # Generate insights
-        insights = …
+        # Combine insights from all results
+        combined_summary = " ".join([r['summary'] for r in results if 'summary' in r])
+
+        # Generate overall insights
+        insights = self.processor.model_manager.models['summarizer'](
+            combined_summary,
+            max_length=200,
+            min_length=100,
+            do_sample=False
+        )[0]['summary_text']
 
         return {
             'results': results,
             'insights': insights,
+            'key_points': all_key_points[:10],  # Top 10 key points
             'follow_up_questions': [
-                f"What are the …",
-                f"How …",
-                f"What …",
+                f"What are the recent breakthroughs in {query}?",
+                f"How does {query} impact various industries?",
+                f"What are the future prospects of {query}?"
             ]
         }
```
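Taken together, the search method now returns a single dict that the Gradio layer can format directly. A hedged sketch of consuming it; the engine's constructor arguments and the exact method signature are outside this diff, so they are assumed here:

```python
# Hypothetical driver; construction details are not shown in this diff.
engine = WebSearchEngine()
response = engine.search("graph neural networks")

if "error" in response:
    print(response["error"])
else:
    print(response["insights"])            # summarizer output over all results
    for point in response["key_points"]:   # up to 10 diverse sentences
        print("-", point)
    for q in response["follow_up_questions"]:
        print("?", q)
```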