Shreyas094 committed on
Commit
a198e07
·
verified ·
1 Parent(s): 63bcdb6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -7
app.py CHANGED
@@ -23,6 +23,8 @@ from datetime import datetime
23
  from dateutil import parser as date_parser
24
  from sklearn.feature_extraction.text import TfidfVectorizer
25
  from sklearn.metrics.pairwise import cosine_similarity
 
 
26
 
27
 
28
  # Set up basic configuration for logging
@@ -276,15 +278,49 @@ def generate_chunked_response(prompt, model, max_tokens=10000, num_calls=3, temp
276
  print(f"Final clean response: {final_response[:100]}...")
277
  return final_response
278
 
279
- def duckduckgo_search(query):
280
- with DDGS() as ddgs:
281
- results = list(ddgs.text(query, max_results=10))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
 
283
- # Add date to results, defaulting to current date if not available
284
- for result in results:
285
- result['date'] = date_parser.parse(result.get('published', datetime.now().isoformat()))
286
 
287
- return results
 
 
 
 
 
 
 
 
288
 
289
  class CitingSources(BaseModel):
290
  sources: List[str] = Field(
 
23
  from dateutil import parser as date_parser
24
  from sklearn.feature_extraction.text import TfidfVectorizer
25
  from sklearn.metrics.pairwise import cosine_similarity
26
+ from trafilatura import fetch_url, extract
27
+ import json
28
 
29
 
30
  # Set up basic configuration for logging
 
278
  print(f"Final clean response: {final_response[:100]}...")
279
  return final_response
280
 
281
class SimpleDDGSearch:
    """Thin wrapper around DuckDuckGo text search that returns result URLs."""

    def search(self, query: str, num_results: int = 5):
        """Run a DuckDuckGo text search and collect the hit URLs.

        Args:
            query: Search terms to submit.
            num_results: Maximum number of hits to request (default 5).

        Returns:
            List of result URLs (the "href" field of each hit), in
            search-result order.
        """
        hits = DDGS().text(query, region='wt-wt', safesearch='off', max_results=num_results)
        urls = []
        for hit in hits:
            urls.append(hit["href"])
        return urls
+
286
class TrafilaturaWebCrawler:
    """Fetches a web page and extracts its readable text via trafilatura."""

    def get_website_content_from_url(self, url: str) -> str:
        """Download *url* and return a formatted title/URL/content report.

        Args:
            url: Address of the page to fetch and extract.

        Returns:
            A human-readable string with the page title and extracted text.
            On any failure (fetch error, nothing extracted, unexpected
            exception) a short error message string is returned instead of
            raising — deliberate best-effort behavior for the crawl loop.
        """
        try:
            downloaded = fetch_url(url)
            if downloaded is None:
                return f"Failed to fetch content from URL: {url}"

            result = extract(downloaded, output_format='json', include_comments=False, with_metadata=True, url=url)
            if not result:
                return f"No content extracted from URL: {url}"

            result_dict = json.loads(result)
            title = result_dict.get('title', 'No title found')
            content = result_dict.get('text', 'No content extracted')

            # Fall back to a plain-text extraction when the JSON payload had
            # no 'text' field. Guard against extract() returning None here so
            # the literal string "None" is never rendered into the report
            # (the original code could emit "None" in that case).
            if content == 'No content extracted':
                content = extract(downloaded, include_comments=False) or 'No content extracted'

            return f'=========== Website Title: {title} ===========\n\n=========== Website URL: {url} ===========\n\n=========== Website Content ===========\n\n{content}\n\n=========== Website Content End ===========\n\n'
        except Exception as e:
            # Broad catch is intentional: a single bad page must not abort
            # the whole search-and-crawl run.
            return f"An error occurred while processing {url}: {str(e)}"
+
308
def search_and_crawl(query: str, num_results: int = 10):
    """Search DuckDuckGo for *query* and concatenate crawled page contents.

    Args:
        query: Search terms.
        num_results: How many search hits to fetch and crawl (default 10).

    Returns:
        A single string containing, for each result URL in order: a
        numbered header line, the extracted website content, and a
        dashed separator.
    """
    urls = SimpleDDGSearch().search(query, num_results=num_results)
    crawler = TrafilaturaWebCrawler()

    # Accumulate pieces in a list and join once, rather than repeated
    # string concatenation; the rendered output is identical.
    sections = []
    for idx, link in enumerate(urls):
        sections.append(f"Results for URL {idx+1}: {link}\n\n")
        sections.append(crawler.get_website_content_from_url(link) + "\n")
        sections.append("------------------------------------------------------------\n\n")

    return "".join(sections)
321
+
322
def duckduckgo_search(query):
    """Convenience entry point: crawl the top 10 DuckDuckGo hits for *query*.

    Delegates to search_and_crawl with its default result count and
    returns the combined report string unchanged.
    """
    combined_report = search_and_crawl(query, num_results=10)
    return combined_report
324
 
325
  class CitingSources(BaseModel):
326
  sources: List[str] = Field(