Update app.py
app.py CHANGED
@@ -6,10 +6,11 @@ import requests
 from typing import List, Dict, Union
 import pandas as pd
 import wikipediaapi
+import requests
 from bs4 import BeautifulSoup
 import urllib.parse
+import random
 from typing import List, Dict
-import fake_useragent  # For realistic user-agent rotation
 
 load_dotenv()
 
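The commit swaps the hard fake_useragent dependency for the stdlib random module. If rotation via fake-useragent is still wanted whenever that package happens to be installed, an optional import with a graceful fallback would do it. A minimal sketch, not part of this commit (random_user_agent and _FALLBACK_AGENTS are illustrative names):

# Sketch: optional fake-useragent with a stdlib fallback (not in the commit).
import random

try:
    from fake_useragent import UserAgent   # pip install fake-useragent
    def random_user_agent() -> str:
        return UserAgent().random           # library-supplied realistic UA string
except ImportError:
    _FALLBACK_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    ]
    def random_user_agent() -> str:
        return random.choice(_FALLBACK_AGENTS)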
@@ -22,17 +23,27 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 class BasicAgent:
     def __init__(self):
-        self.user_agent = fake_useragent.UserAgent().random
         self.headers = {
-            'User-Agent': self.user_agent,
+            'User-Agent': self._get_random_user_agent(),
             'Accept-Language': 'en-US,en;q=0.5',
         }
-
+
+    def _get_random_user_agent(self) -> str:
+        """Fallback user-agent generator if fake-useragent isn't installed"""
+        browsers = [
+            # Chrome
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            # Firefox
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
+            # Safari
+            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
+        ]
+        return random.choice(browsers)
 
     def search(self, query: str, num_results: int = 3) -> List[Dict]:
         """Perform Google search and return structured results"""
         encoded_query = urllib.parse.quote_plus(query)
-        url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}"
+        url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}"
 
         try:
             response = requests.get(url, headers=self.headers, timeout=10)
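One thing to note: self.headers is built once in __init__, so each BasicAgent instance keeps whatever user-agent _get_random_user_agent() returned at construction time. If the intent is per-request rotation, the pick has to happen inside the request path. A standalone sketch under that assumption (fetch_serp and AGENTS are illustrative names, not from the commit):

# Sketch: user-agent picked fresh on every request rather than once per instance.
import random
import urllib.parse
import requests

AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
]

def fetch_serp(query: str, num_results: int = 3) -> str:
    """Fetch a Google results page with a per-call user-agent."""
    headers = {
        'User-Agent': random.choice(AGENTS),   # fresh pick on every call
        'Accept-Language': 'en-US,en;q=0.5',
    }
    url = f"https://www.google.com/search?q={urllib.parse.quote_plus(query)}&num={num_results + 2}"
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.text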
@@ -47,17 +58,17 @@
         soup = BeautifulSoup(html, 'html.parser')
         results = []
 
-        #
-        for i, result in enumerate(soup.select('.tF2Cxc')[:max_results]):
-            title = result.select_one('h3')
+        # Current Google result selectors (July 2024)
+        for i, result in enumerate(soup.select('.tF2Cxc, .g')[:max_results]):
+            title = result.select_one('h3, .LC20lb')
             link = result.find('a')['href']
-            snippet = result.select_one('.IsZvec')
+            snippet = result.select_one('.IsZvec, .VwiC3b')
 
             if title and link:
                 results.append({
                     'position': i + 1,
                     'title': title.get_text(),
-                    'link': link,
+                    'link': link if link.startswith('http') else f"https://www.google.com{link}",
                     'snippet': snippet.get_text() if snippet else None
                 })
 
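A caveat with this parsing loop: result.find('a')['href'] executes before the if title and link: guard, so a result block with no anchor raises TypeError ('NoneType' object is not subscriptable) instead of being skipped. A defensive variant as a sketch — parse_results is an illustrative name, and the selector set is the one from the diff:

# Sketch: skip malformed result blocks instead of crashing on a missing anchor.
from typing import List, Dict
from bs4 import BeautifulSoup

def parse_results(html: str, max_results: int = 3) -> List[Dict]:
    soup = BeautifulSoup(html, 'html.parser')
    results = []
    for i, result in enumerate(soup.select('.tF2Cxc, .g')[:max_results]):
        title = result.select_one('h3, .LC20lb')
        anchor = result.find('a')
        snippet = result.select_one('.IsZvec, .VwiC3b')
        if not (title and anchor and anchor.get('href')):
            continue  # malformed block: skip it rather than raise
        link = anchor['href']
        results.append({
            'position': i + 1,
            'title': title.get_text(),
            'link': link if link.startswith('http') else f"https://www.google.com{link}",
            'snippet': snippet.get_text() if snippet else None,
        })
    return results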
@@ -69,26 +80,25 @@ class BasicAgent:
         for res in results:
             output.append(
                 f"{res['position']}. {res['title']}\n"
-                f"   {res['link']}\n"
-                f"   {res['snippet'] or 'No description available'}\n"
+                f"   🔗 {res['link']}\n"
+                f"   📝 {res['snippet'] or 'No description available'}\n"
             )
         return "\n".join(output)
 
-# Usage Example
 if __name__ == "__main__":
     scraper = BasicAgent()
 
-    #
+    # Example search
     query = "Python programming language"
-    print(f"Searching Google for: '{query}'")
+    print(f"🔍 Searching Google for: '{query}'")
 
-    results = scraper.search(query)
+    results = scraper.search(query, num_results=3)
 
     if results:
         print("\nTop Results:")
         print(scraper.pretty_print(results))
     else:
-        print("No results found or search failed")
+        print("❌ No results found or search failed")
 
 
 def run_and_submit_all( profile: gr.OAuthProfile | None):
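To see the output format pretty_print produces without hitting Google, it can be fed a hand-built result list. A sketch, assuming the emoji prefixes from this commit and that pretty_print accepts the same list shape search() returns (the mock data is illustrative):

# Sketch: exercising pretty_print offline with a mock result.
scraper = BasicAgent()
mock_results = [{
    'position': 1,
    'title': 'Welcome to Python.org',
    'link': 'https://www.python.org/',
    'snippet': 'The official home of the Python programming language.',
}]
print(scraper.pretty_print(mock_results))
# Expected shape:
# 1. Welcome to Python.org
#    🔗 https://www.python.org/
#    📝 The official home of the Python programming language.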