wt002 committed
Commit 7bc778b · verified · 1 Parent(s): 25e901d

Update app.py

Files changed (1)
  1. app.py +27 -17
app.py CHANGED
@@ -6,10 +6,11 @@ import requests
 from typing import List, Dict, Union
 import pandas as pd
 import wikipediaapi
+import requests
 from bs4 import BeautifulSoup
 import urllib.parse
+import random
 from typing import List, Dict
-import fake_useragent  # For realistic user-agent rotation
 
 load_dotenv()
 
@@ -22,17 +23,27 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 class BasicAgent:
     def __init__(self):
-        self.user_agent = fake_useragent.UserAgent().random
         self.headers = {
-            'User-Agent': self.user_agent,
+            'User-Agent': self._get_random_user_agent(),
             'Accept-Language': 'en-US,en;q=0.5',
         }
-        print("GoogleScraper initialized with User-Agent:", self.user_agent[:50] + "...")
+
+    def _get_random_user_agent(self) -> str:
+        """Fallback user-agent generator if fake-useragent isn't installed"""
+        browsers = [
+            # Chrome
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            # Firefox
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
+            # Safari
+            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
+        ]
+        return random.choice(browsers)
 
     def search(self, query: str, num_results: int = 3) -> List[Dict]:
         """Perform Google search and return structured results"""
         encoded_query = urllib.parse.quote_plus(query)
-        url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}"  # +2 for buffer
+        url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}"
 
         try:
             response = requests.get(url, headers=self.headers, timeout=10)
@@ -47,17 +58,17 @@ class BasicAgent:
         soup = BeautifulSoup(html, 'html.parser')
         results = []
 
-        # Main result blocks (class names may change - this works as of July 2024)
-        for i, result in enumerate(soup.select('.tF2Cxc')[:max_results]):
-            title = result.select_one('h3')
+        # Current Google result selectors (July 2024)
+        for i, result in enumerate(soup.select('.tF2Cxc, .g')[:max_results]):
+            title = result.select_one('h3, .LC20lb')
             link = result.find('a')['href']
-            snippet = result.select_one('.IsZvec')
+            snippet = result.select_one('.IsZvec, .VwiC3b')
 
             if title and link:
                 results.append({
                     'position': i + 1,
                     'title': title.get_text(),
-                    'link': link,
+                    'link': link if link.startswith('http') else f"https://www.google.com{link}",
                     'snippet': snippet.get_text() if snippet else None
                 })
 
@@ -69,26 +80,25 @@ class BasicAgent:
         for res in results:
             output.append(
                 f"{res['position']}. {res['title']}\n"
-                f"   {res['link']}\n"
-                f"   {res['snippet'] or 'No description available'}\n"
+                f"   🔗 {res['link']}\n"
+                f"   📝 {res['snippet'] or 'No description available'}\n"
             )
         return "\n".join(output)
 
-# Usage Example
 if __name__ == "__main__":
     scraper = BasicAgent()
 
-    # Search for Python programming
+    # Example search
     query = "Python programming language"
-    print(f"Searching Google for: '{query}'")
+    print(f"🔍 Searching Google for: '{query}'")
 
-    results = scraper.search(query)
+    results = scraper.search(query, num_results=3)
 
     if results:
         print("\nTop Results:")
         print(scraper.pretty_print(results))
     else:
-        print("No results found or search failed")
+        print("❌ No results found or search failed")
 
 
 def run_and_submit_all( profile: gr.OAuthProfile | None):
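
Note on the user-agent change: the commit drops the hard dependency on fake-useragent in favor of a static pool, yet the new docstring still describes the method as a fallback "if fake-useragent isn't installed". A minimal sketch of that combined behavior, assuming the library's standard UserAgent().random API; the get_user_agent name and FALLBACK_AGENTS pool below are illustrative, not part of the commit:

import random

FALLBACK_AGENTS = [
    # Same three desktop browsers as the commit's static pool.
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
]

def get_user_agent() -> str:
    """Prefer fake-useragent when installed; otherwise use the static pool."""
    try:
        import fake_useragent  # optional: this commit removes it as a hard import
        return fake_useragent.UserAgent().random
    except ImportError:
        return random.choice(FALLBACK_AGENTS)

Falling back in the except branch means the scraper degrades gracefully instead of failing at import time, which is presumably why the top-level import was removed.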
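A separate fragility worth flagging: result.find('a')['href'] subscripts before the `if title and link` guard runs, so a result block without an <a> tag raises a TypeError. A defensive sketch of the same parsing loop, reusing the selectors added in this commit; parse_results is a hypothetical standalone helper, not a function in app.py:

from typing import List, Dict
from bs4 import BeautifulSoup

def parse_results(html: str, max_results: int = 3) -> List[Dict]:
    """Guarded version of the commit's result-parsing loop."""
    soup = BeautifulSoup(html, 'html.parser')
    results = []
    for i, result in enumerate(soup.select('.tF2Cxc, .g')[:max_results]):
        title = result.select_one('h3, .LC20lb')
        anchor = result.find('a')
        link = anchor.get('href') if anchor else None  # guard before subscripting
        snippet = result.select_one('.IsZvec, .VwiC3b')
        if title and link:
            results.append({
                'position': i + 1,
                'title': title.get_text(),
                'link': link if link.startswith('http') else f"https://www.google.com{link}",
                'snippet': snippet.get_text() if snippet else None,
            })
    return results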