"""googlesearch is a Python library for searching Google, easily.""" from time import sleep from bs4 import BeautifulSoup from requests import get from urllib.parse import unquote # to decode the url from chipsearch.useragentka import get_useragent from curl_cffi import requests as curlreq from chipsearch.gettyimages import get_images def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region): resp = get( url="https://www.google.com/search", headers={ "User-Agent": get_useragent(), "Accept": "*/*" }, params={ "q": term, "num": results + 2, # Prevents multiple requests "hl": lang, "start": start, "safe": safe, "gl": region, }, proxies=proxies, timeout=timeout, verify=ssl_verify, cookies = { 'CONSENT': 'PENDING+987', # Bypasses the consent page 'SOCS': 'CAESHAgBEhIaAB', } ) resp.raise_for_status() return resp class SearchResult: def __init__(self, url, title, description): self.url = url self.title = title self.description = description def __repr__(self): return f"SearchResult(url={self.url}, title={self.title}, description={self.description})" def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False): """Search the Google search engine""" # Proxy setup proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None start = start_num fetched_results = 0 fetched_links = set() results_list = [] image_results = [] # New list for image results while fetched_results < num_results: # Send request resp = _req(term, num_results - start, lang, start, proxies, timeout, safe, ssl_verify, region) # Parse soup = BeautifulSoup(resp.text, "html.parser") result_block = soup.find_all("div", class_="ezO2md") new_results = 0 # Find all images on the page try: all_images = soup.find_all("img") # Google's image class for img in all_images: img_src = img.get("src") or img.get("data-src") if img_src: # Handle base64 images if img_src.startswith("data:image"): image_results.append({ "src": img_src, # Already base64 encoded "alt": img.get("alt", ""), "class": img.get("class", []), }) # Handle regular image URLs elif img_src.startswith("http"): image_results.append({ "src": img_src, "alt": img.get("alt", ""), "class": img.get("class", []), }) except Exception as e: print(f"Error parsing images: {str(e)}") for result in result_block: link_tag = result.find("a", href=True) title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None description_tag = result.find("span", class_="FrIlee") if link_tag and title_tag and description_tag: link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link in fetched_links and unique: continue fetched_links.add(link) title = title_tag.text if title_tag else "" description = description_tag.text if description_tag else "" # Only get page_text if advanced mode and we haven't gotten any yet if advanced and not any('page_text' in result for result in results_list): try: page_scrape = curlreq.get(link, impersonate='chrome110') page_scrape.encoding = 'utf-8' page_soup = BeautifulSoup(page_scrape.text, "html.parser") # Try multiple strategies to find main content main_content = ( page_soup.find(['article', 'main']) or page_soup.find('div', {'id': ['content', 'main-content', 'body-content']}) or page_soup.find('div', {'class': ['content', 'main', 'article', 'post']}) or page_soup.find('div', {'role': 'main'}) or page_soup.body ) if main_content: # Remove unwanted 
elements for element in main_content(['script', 'style', 'noscript', 'svg', 'header', 'footer', 'nav']): element.decompose() # Extract text with better cleaning text = main_content.get_text(separator=' ', strip=True) text = ' '.join(line.strip() for line in text.splitlines() if line.strip()) page_text = ' '.join(word for word in text.split() if len(word) > 1)[:3000] else: page_text = "" except Exception as e: print(f"Error scraping {link}: {str(e)}") page_text = "" else: page_text = "" fetched_results += 1 new_results += 1 if advanced: results_list.append({ "link": link, "title": title, "description": description, "page_text": page_text, }) else: results_list.append(link) if fetched_results >= num_results: break if new_results == 0: break start += 10 sleep(sleep_interval) if image_results == [] : images = get_images(term) return {"results": results_list, "images": images} else: return {"results": results_list, "images": image_results}
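

# Illustrative usage sketch (not part of the library): running the module
# directly performs a small advanced search. The query string and parameter
# values below are arbitrary examples chosen for demonstration.
if __name__ == "__main__":
    response = search("python web scraping", num_results=3, advanced=True, sleep_interval=1)
    for item in response["results"]:
        # In advanced mode each result is a dict with link/title/description/page_text keys
        print(f"{item['title']}\n  {item['link']}\n  {item['description'][:80]}")
    print(f"Collected {len(response['images'])} image entries")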