Update app.py
app.py CHANGED
@@ -23,6 +23,8 @@ from datetime import datetime
 from dateutil import parser as date_parser
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+from trafilatura import fetch_url, extract
+import json
 
 
 # Set up basic configuration for logging
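The two new imports feed the crawler added in the next hunk: trafilatura's fetch_url downloads a page (returning None on failure) and extract(..., output_format='json') returns a JSON string, which is why the new code immediately calls json.loads on it. A minimal sketch of that round trip, assuming trafilatura is installed; the URL is illustrative, not from the commit:

    from trafilatura import fetch_url, extract
    import json

    downloaded = fetch_url("https://example.com")  # illustrative URL; returns raw HTML or None
    if downloaded is not None:
        meta = extract(downloaded, output_format='json', with_metadata=True)
        if meta:  # extract() yields None when nothing could be pulled out
            doc = json.loads(meta)  # the JSON string parses to a dict with 'title', 'text', ...
            print(doc.get('title'), len(doc.get('text', '')))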
@@ -276,15 +278,49 @@ def generate_chunked_response(prompt, model, max_tokens=10000, num_calls=3, temp
     print(f"Final clean response: {final_response[:100]}...")
     return final_response
 
-
-
-results =
-
-
-
-result['date'] = date_parser.parse(result.get('published', datetime.now().isoformat()))
-
-
+class SimpleDDGSearch:
+    def search(self, query: str, num_results: int = 5):
+        results = DDGS().text(query, region='wt-wt', safesearch='off', max_results=num_results)
+        return [res["href"] for res in results]
+
+class TrafilaturaWebCrawler:
+    def get_website_content_from_url(self, url: str) -> str:
+        try:
+            downloaded = fetch_url(url)
+            if downloaded is None:
+                return f"Failed to fetch content from URL: {url}"
+
+            result = extract(downloaded, output_format='json', include_comments=False, with_metadata=True, url=url)
+            if result:
+                result_dict = json.loads(result)
+                title = result_dict.get('title', 'No title found')
+                content = result_dict.get('text', 'No content extracted')
+
+                if content == 'No content extracted':
+                    content = extract(downloaded, include_comments=False)
+
+                return f'=========== Website Title: {title} ===========\n\n=========== Website URL: {url} ===========\n\n=========== Website Content ===========\n\n{content}\n\n=========== Website Content End ===========\n\n'
+            else:
+                return f"No content extracted from URL: {url}"
+        except Exception as e:
+            return f"An error occurred while processing {url}: {str(e)}"
+
+def search_and_crawl(query: str, num_results: int = 10):
+    searcher = SimpleDDGSearch()
+    search_results = searcher.search(query, num_results=num_results)
+
+    crawler = TrafilaturaWebCrawler()
+    output = ""
+
+    for i, url in enumerate(search_results):
+        output += f"Results for URL {i+1}: {url}\n\n"
+        output += crawler.get_website_content_from_url(url) + "\n"
+        output += "------------------------------------------------------------\n\n"
+
+    return output
+
+def duckduckgo_search(query):
+    return search_and_crawl(query, num_results=10)
 
 class CitingSources(BaseModel):
     sources: List[str] = Field(
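With the commit applied, the new helpers chain search -> crawl -> formatted text. A minimal usage sketch of the new entry point, assuming app.py already imports DDGS from the duckduckgo_search package (SimpleDDGSearch.search calls DDGS().text directly, so that import must exist elsewhere in the file); the query string is hypothetical:

    report = duckduckgo_search("trafilatura text extraction")  # hypothetical query
    print(report[:500])  # preview of the concatenated per-URL sections

Each of the ten crawled URLs contributes one banner-delimited section ('=========== Website Title ... ===========') from TrafilaturaWebCrawler.get_website_content_from_url, separated by the dashed rule that search_and_crawl appends.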