Spaces: Build error

fikird committed · ae8bccc
1 parent: 25a3d88

Improve content processing and result formatting

Browse files
- app.py +46 -21
- search_engine.py +117 -22
app.py
CHANGED
@@ -32,50 +32,72 @@ def format_results(results):
     if not results or not results.get('results'):
         return "# ⚠️ No Results\nNo search results were found. Please try a different query."
 
+    insights = results.get('insights', {})
+    output = []
+
+    # Main Summary
+    output.append("📝 Executive Summary")
+    output.append("-" * 50)
+    output.append(insights.get('main_summary', ''))
+    output.append("\n")
+
+    # Key Findings
+    output.append("🔑 Key Findings")
+    output.append("-" * 50)
+    for i, point in enumerate(insights.get('key_findings', []), 1):
+        output.append(f"{i}. {point}")
+    output.append("\n")
+
+    # Sources
+    output.append("📚 Sources")
+    output.append("-" * 50)
+    for source in insights.get('sources', []):
+        output.append(f"• {source.get('title', '')}")
+        output.append(f"  {source.get('url', '')}")
+    output.append("\n")
+
+    # Follow-up Questions
+    output.append("❓ Suggested Questions")
+    output.append("-" * 50)
+    for question in results.get('follow_up_questions', []):
+        output.append(f"• {question}")
 
     # Add main results
     if 'results' in results:
+        output.append("\n")
+        output.append("📄 Detailed Results")
+        output.append("-" * 50)
+        output.append("\n")
         for i, result in enumerate(results['results'], 1):
             if not isinstance(result, dict):
                 continue
 
+            output.append(f"### {i}. ")
             if 'url' in result:
                 title = result.get('title', 'Untitled')
+                output.append(f"[{title}]({result['url']})\n")
             if 'summary' in result:
+                output.append(f"\n{result['summary']}\n\n")
 
     # Add similar chunks if available
     if 'similar_chunks' in results:
+        output.append("\n")
+        output.append("🔗 Related Content")
+        output.append("-" * 50)
+        output.append("\n")
         for i, chunk in enumerate(results['similar_chunks'], 1):
             if not isinstance(chunk, dict):
                 continue
 
+            output.append(f"### Related {i}\n")
             if 'metadata' in chunk:
                 meta = chunk['metadata']
                 if 'title' in meta and 'url' in meta:
+                    output.append(f"From [{meta['title']}]({meta['url']})\n")
            if 'content' in chunk:
+                output.append(f"\n{chunk['content'][:200]}...\n\n")
 
-    return
+    return "\n".join(output)
 
 def create_demo():
     """Create the Gradio interface"""
@@ -100,7 +122,10 @@ def create_demo():
         )
         search_button = gr.Button("🔍 Search")
 
-        output = gr.
+        output = gr.Textbox(
+            label="Search Results",
+            lines=20
+        )
 
         search_button.click(
             fn=safe_search,
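For reference, a minimal sketch of how the new format_results renders a result set. It assumes app.py is importable as a module and uses invented sample data; the dict keys mirror the ones the function reads above (results, insights, follow_up_questions).

# Hypothetical usage sketch; the dict shape mirrors what format_results reads.
from app import format_results  # assumes app.py is on the import path

sample = {
    'results': [{
        'url': 'https://example.com/article',
        'title': 'Example Article',
        'summary': 'A short model-generated summary of the page.',
    }],
    'insights': {
        'main_summary': 'Combined summary across all processed sources.',
        'key_findings': ['First finding', 'Second finding'],
        'sources': [{'title': 'Example Article', 'url': 'https://example.com/article'}],
    },
    'follow_up_questions': ['What are the practical applications of X?'],
}

print(format_results(sample))  # sections joined by newlines, ready for the Textbox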
search_engine.py
CHANGED
@@ -44,25 +44,115 @@ class ContentProcessor:
 
     def __init__(self):
         self.model_manager = ModelManager()
+
+    def clean_text(self, text: str) -> str:
+        """Clean and normalize text content"""
+        # Remove extra whitespace
+        text = ' '.join(text.split())
+        # Remove common navigation elements
+        nav_elements = [
+            "skip to content",
+            "skip to navigation",
+            "search",
+            "menu",
+            "subscribe",
+            "sign in",
+            "log in",
+            "submit",
+            "browse",
+            "explore",
+        ]
+        for element in nav_elements:
+            text = text.replace(element.lower(), "")
+        return text.strip()
+
+    def extract_main_content(self, soup: BeautifulSoup) -> str:
+        """Extract main content from HTML soup"""
+        # Remove navigation, headers, footers, and sidebars
+        for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
+            elem.decompose()
+
+        # Remove script and style elements
+        for elem in soup.find_all(['script', 'style']):
+            elem.decompose()
+
+        # Try to find main content area
+        main_content = None
+        content_tags = ['article', 'main', '[role="main"]', '#content', '.content', '.post-content']
+
+        for tag in content_tags:
+            main_content = soup.select_one(tag)
+            if main_content:
+                break
+
+        # If no main content found, use body
+        if not main_content:
+            main_content = soup.find('body')
+
+        if main_content:
+            text = main_content.get_text(separator=' ', strip=True)
+        else:
+            text = soup.get_text(separator=' ', strip=True)
+
+        return self.clean_text(text)
 
+    def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
+        """Extract key points from text using AI"""
+        try:
+            # Split text into chunks for processing
+            chunks = [text[i:i + 1024] for i in range(0, len(text), 1024)]
+            all_points = []
+
+            for chunk in chunks[:3]:  # Process first 3 chunks to keep it manageable
+                summary = self.model_manager.models['summarizer'](
+                    chunk,
+                    max_length=100,
+                    min_length=30,
+                    do_sample=False
+                )[0]['summary_text']
+
+                # Split summary into sentences
+                points = [s.strip() for s in summary.split('.') if s.strip()]
+                all_points.extend(points)
+
+            # Return top points
+            return all_points[:max_points]
+
+        except Exception as e:
+            logger.error(f"Error extracting key points: {str(e)}")
+            return []
+
-    def
+    def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
         """Process content and generate insights"""
         try:
+            # Extract main content if soup is provided
+            if soup:
+                content = self.extract_main_content(soup)
+            else:
+                content = self.clean_text(content)
+
+            # Extract key points
+            key_points = self.extract_key_points(content)
+
+            # Generate overall summary
             summary = self.model_manager.models['summarizer'](
                 content[:1024],
-                max_length=
-                min_length=
+                max_length=150,
+                min_length=50,
                 do_sample=False
             )[0]['summary_text']
 
             return {
                 'summary': summary,
+                'key_points': key_points,
                 'content': content
             }
+
         except Exception as e:
+            logger.error(f"Error processing content: {str(e)}")
             return {
                 'summary': f"Error processing content: {str(e)}",
+                'key_points': [],
                 'content': content
             }
 
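To see the selector-fallback strategy from extract_main_content in isolation, here is a self-contained sketch; the HTML snippet is invented for illustration, and the selector list mirrors the one added above.

# Standalone sketch of the extraction strategy: strip chrome tags, then
# fall back through likely content selectors before defaulting to <body>.
from bs4 import BeautifulSoup

html = """
<html><body>
  <nav>Menu</nav>
  <article><h1>Title</h1><p>The actual content.</p></article>
  <footer>Footer</footer>
</body></html>
"""
soup = BeautifulSoup(html, 'lxml')

# Drop navigation/chrome and non-content elements
for elem in soup.find_all(['nav', 'header', 'footer', 'aside', 'script', 'style']):
    elem.decompose()

# First matching selector wins; <body> is the last resort
main = None
for selector in ['article', 'main', '[role="main"]', '#content', '.content', '.post-content']:
    main = soup.select_one(selector)
    if main:
        break

text = (main or soup.find('body') or soup).get_text(separator=' ', strip=True)
print(text)  # -> "Title The actual content."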
@@ -139,25 +229,18 @@ class WebSearchEngine:
         response = self.safe_get(url)
         soup = BeautifulSoup(response.text, 'lxml')
 
-        for script in soup(["script", "style"]):
-            script.decompose()
-        text = soup.get_text()
-        lines = (line.strip() for line in text.splitlines())
-        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-        content = ' '.join(chunk for chunk in chunks if chunk)
+        # Process content with BeautifulSoup object
+        processed = self.processor.process_content("", soup)
 
         # Get metadata
         metadata = self.get_metadata(soup)
 
-        # Process content
-        processed = self.processor.process_content(content)
-
         return {
             'url': url,
             'title': metadata['title'],
             'description': metadata['description'],
             'summary': processed['summary'],
+            'key_points': processed['key_points'],
             'content': processed['content']
         }
 
@@ -229,26 +312,38 @@ class WebSearchEngine:
             return {'error': 'No results found'}
 
         results = []
+        all_key_points = []
+
         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
                     results.append(processed)
+                    if 'key_points' in processed:
+                        all_key_points.extend(processed['key_points'])
                 time.sleep(random.uniform(0.5, 1.0))
 
         if not results:
            return {'error': 'Failed to process any search results'}
+
+        # Combine all summaries and key points
+        all_summaries = [r['summary'] for r in results if 'summary' in r]
+        combined_summary = " ".join(all_summaries)
+
+        # Format insights
+        insights = {
+            'main_summary': combined_summary[:500],
+            'key_findings': list(set(all_key_points))[:7],  # Remove duplicates and limit to top 7
+            'sources': [{'title': r['title'], 'url': r['url']} for r in results]
+        }
 
         return {
             'results': results,
-            'insights':
+            'insights': insights,
             'follow_up_questions': [
-                f"What are the
-                f"
-                f"What
+                f"What are the practical applications of {query}?",
+                f"How has {query} evolved over the past year?",
+                f"What challenges remain in {query}?"
            ]
         }
 
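One note on the aggregation step in search(): list(set(all_key_points)) deduplicates but discards the original ordering, so which seven findings survive the [:7] cap is arbitrary. Below is a minimal sketch with invented data, using dict.fromkeys as an order-stable alternative.

# Stand-in for processed pages; the shape matches process_url's return value.
results = [
    {'title': 'A', 'url': 'https://a.example', 'summary': 'Alpha summary.',
     'key_points': ['point one', 'point two']},
    {'title': 'B', 'url': 'https://b.example', 'summary': 'Beta summary.',
     'key_points': ['point two', 'point three']},
]

all_key_points = [p for r in results for p in r.get('key_points', [])]
combined_summary = " ".join(r['summary'] for r in results if 'summary' in r)

insights = {
    'main_summary': combined_summary[:500],
    'key_findings': list(dict.fromkeys(all_key_points))[:7],  # order-stable dedupe
    'sources': [{'title': r['title'], 'url': r['url']} for r in results],
}
print(insights['key_findings'])  # ['point one', 'point two', 'point three']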