Files changed:
- Search/__pycache__/gettyimages.cpython-312.pyc +0 -0
- Search/__pycache__/main.cpython-312.pyc +0 -0
- Search/__pycache__/useragentka.cpython-312.pyc +0 -0
- Search/gettyimages.py +0 -21
- Search/main.py +0 -163
- Search/useragentka.py +0 -20
- __pycache__/app.cpython-312.pyc +0 -0
- app.py +0 -48
- requirements.txt +1 -3
- test.py +0 -18
Search/__pycache__/gettyimages.cpython-312.pyc
DELETED
Binary file (1.03 kB)
Search/__pycache__/main.cpython-312.pyc
DELETED
Binary file (6.96 kB)
Search/__pycache__/useragentka.cpython-312.pyc
DELETED
Binary file (1.68 kB)
Search/gettyimages.py
DELETED
@@ -1,21 +0,0 @@
-from curl_cffi import requests
-from bs4 import BeautifulSoup
-
-def get_images(query):
-    res = requests.get(f'https://www.gettyimages.in/search/2/image?phrase={query}=editorial', impersonate='chrome110')
-
-    soup = BeautifulSoup(res.text, 'html.parser')
-
-    images = soup.find_all('img')
-
-    results = []
-
-    for image in images:
-        print(image['src'])
-        if image['src'].startswith('https://media.gettyimages.com'):
-            results.append({'src': image['src'], 'alt': image['alt'], 'class': ''})
-        else:
-            continue
-
-    return results
-
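Note: the deleted Search/gettyimages.py returned image dicts with the same keys ("src", "alt", "class") that Search/main.py builds from Google results, and was used there as a fallback image source. A minimal sketch of how the helper was consumed before this commit (the query string is illustrative, not from the repo):

    from Search.gettyimages import get_images

    # Scrapes gettyimages.in and keeps only thumbnails hosted on media.gettyimages.com.
    images = get_images("northern lights")  # hypothetical query
    for img in images:
        print(img["src"], img["alt"])       # each entry also carries an empty "class" field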
Search/main.py
DELETED
@@ -1,163 +0,0 @@
-"""googlesearch is a Python library for searching Google, easily."""
-from time import sleep
-from bs4 import BeautifulSoup
-from requests import get
-from urllib.parse import unquote  # to decode the url
-from Search.useragentka import get_useragent
-from curl_cffi import requests as curlreq
-from Search.gettyimages import get_images
-
-def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
-    resp = get(
-        url="https://www.google.com/search",
-        headers={
-            "User-Agent": get_useragent(),
-            "Accept": "*/*"
-        },
-        params={
-            "q": term,
-            "num": results + 2,  # Prevents multiple requests
-            "hl": lang,
-            "start": start,
-            "safe": safe,
-            "gl": region,
-        },
-        proxies=proxies,
-        timeout=timeout,
-        verify=ssl_verify,
-        cookies = {
-            'CONSENT': 'PENDING+987',  # Bypasses the consent page
-            'SOCS': 'CAESHAgBEhIaAB',
-        }
-    )
-    resp.raise_for_status()
-    return resp
-
-
-class SearchResult:
-    def __init__(self, url, title, description):
-        self.url = url
-        self.title = title
-        self.description = description
-
-    def __repr__(self):
-        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
-
-
-def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
-    """Search the Google search engine"""
-
-    # Proxy setup
-    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
-
-    start = start_num
-    fetched_results = 0
-    fetched_links = set()
-    results_list = []
-    image_results = []  # New list for image results
-
-    while fetched_results < num_results:
-        # Send request
-        resp = _req(term, num_results - start,
-                    lang, start, proxies, timeout, safe, ssl_verify, region)
-
-        # Parse
-        soup = BeautifulSoup(resp.text, "html.parser")
-        result_block = soup.find_all("div", class_="ezO2md")
-        new_results = 0
-
-        # Find all images on the page
-        try:
-            all_images = soup.find_all("img")  # Google's image class
-            for img in all_images:
-                img_src = img.get("src") or img.get("data-src")
-                if img_src:
-                    # Handle base64 images
-                    if img_src.startswith("data:image"):
-                        image_results.append({
-                            "src": img_src,  # Already base64 encoded
-                            "alt": img.get("alt", ""),
-                            "class": img.get("class", []),
-                        })
-                    # Handle regular image URLs
-                    elif img_src.startswith("http"):
-                        image_results.append({
-                            "src": img_src,
-                            "alt": img.get("alt", ""),
-                            "class": img.get("class", []),
-                        })
-        except Exception as e:
-            print(f"Error parsing images: {str(e)}")
-
-        for result in result_block:
-            link_tag = result.find("a", href=True)
-            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
-            description_tag = result.find("span", class_="FrIlee")
-
-            if link_tag and title_tag and description_tag:
-                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
-                if link in fetched_links and unique:
-                    continue
-                fetched_links.add(link)
-                title = title_tag.text if title_tag else ""
-                description = description_tag.text if description_tag else ""
-
-                # Only get page_text if advanced mode and we haven't gotten any yet
-                if advanced and not any('page_text' in result for result in results_list):
-                    try:
-                        page_scrape = curlreq.get(link, impersonate='chrome110')
-                        page_scrape.encoding = 'utf-8'
-                        page_soup = BeautifulSoup(page_scrape.text, "html.parser")
-
-                        # Try multiple strategies to find main content
-                        main_content = (
-                            page_soup.find(['article', 'main']) or
-                            page_soup.find('div', {'id': ['content', 'main-content', 'body-content']}) or
-                            page_soup.find('div', {'class': ['content', 'main', 'article', 'post']}) or
-                            page_soup.find('div', {'role': 'main'}) or
-                            page_soup.body
-                        )
-                        if main_content:
-                            # Remove unwanted elements
-                            for element in main_content(['script', 'style', 'noscript', 'svg', 'header', 'footer', 'nav']):
-                                element.decompose()
-                            # Extract text with better cleaning
-                            text = main_content.get_text(separator=' ', strip=True)
-                            text = ' '.join(line.strip() for line in text.splitlines() if line.strip())
-                            page_text = ' '.join(word for word in text.split() if len(word) > 1)[:3000]
-                        else:
-                            page_text = ""
-                    except Exception as e:
-                        print(f"Error scraping {link}: {str(e)}")
-                        page_text = ""
-                else:
-                    page_text = ""
-
-
-                fetched_results += 1
-                new_results += 1
-
-                if advanced:
-                    results_list.append({
-                        "link": link,
-                        "title": title,
-                        "description": description,
-                        "page_text": page_text,
-                    })
-                else:
-                    results_list.append(link)
-
-                if fetched_results >= num_results:
-                    break
-
-        if new_results == 0:
-            break
-
-        start += 10
-        sleep(sleep_interval)
-
-    if image_results == []:
-        images = get_images(term)
-        return {"results": results_list, "images": images}
-    else:
-        return {"results": results_list, "images": image_results}
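Note: the three endpoints removed from app.py below (/fetch-images, /fetch-links, /fetch-google) all wrapped this search() function. A minimal sketch of the call they made, with parameter values borrowed from the deleted test.py:

    from Search.main import search

    # advanced=False yields bare result URLs; advanced=True yields dicts with
    # link, title, description and page_text (page_text is scraped for the first result only).
    results = search("sunset beach", num_results=5, lang="en", advanced=False)
    print(results["results"])  # list of result links
    print(results["images"])   # list of {"src", "alt", "class"} dicts; falls back to Getty Images if Google returned none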
Search/useragentka.py
DELETED
@@ -1,20 +0,0 @@
-import random
-
-def get_useragent():
-    """
-    Generates a random user agent string mimicking the format of various software versions.
-
-    The user agent string is composed of:
-    - Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
-    - libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
-    - SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
-    - OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9
-
-    Returns:
-        str: A randomly generated user agent string.
-    """
-    lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
-    libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
-    ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
-    openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
-    return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"
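Note: the deleted get_useragent() produced Lynx-style user agent strings. A sample output (the numbers are random; this value is illustrative):

    from Search.useragentka import get_useragent

    print(get_useragent())
    # e.g. "Lynx/2.8.1 libwww-FM/2.14 SSL-MM/1.4 OpenSSL/1.2.5"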
__pycache__/app.cpython-312.pyc
CHANGED
Binary files a/__pycache__/app.cpython-312.pyc and b/__pycache__/app.cpython-312.pyc differ
app.py
CHANGED
@@ -15,7 +15,6 @@ from fastapi.templating import Jinja2Templates
 from pathlib import Path
 from collections import Counter, defaultdict
 from utils.logger import log_request
-from Search.main import search
 
 app = FastAPI()
 
@@ -223,50 +222,3 @@ async def chat(request: ChatRequest):
     log_request("/chat", selected_generator.__name__)
     return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')
 
-@app.post("/fetch-images")
-async def fetch_images(request: Request):
-    data = await request.json()
-    query = data.get("query", "")
-    num_results = data.get("num_results", 5)
-    lang = data.get("lang", "en")
-    advanced = data.get("advanced", False)
-
-    # Call the search function
-    results = search(query, num_results=num_results, lang=lang, advanced=advanced)
-
-    # Log the request
-    log_request("/fetch-images", query)
-
-    return results['images']
-
-@app.post("/fetch-links")
-async def fetch_links(request: Request):
-    data = await request.json()
-    query = data.get("query", "")
-    num_results = data.get("num_results", 5)
-    lang = data.get("lang", "en")
-    advanced = data.get("advanced", False)
-
-    # Call the search function
-    results = search(query, num_results=num_results, lang=lang, advanced=advanced)
-
-    # Log the request
-    log_request("/fetch-links", query)
-
-    return results['results']
-
-@app.post("/fetch-google")
-async def fetch_google(request: Request):
-    data = await request.json()
-    query = data.get("query", "")
-    num_results = data.get("num_results", 5)
-    lang = data.get("lang", "en")
-    advanced = data.get("advanced", True)
-
-    # Call the search function
-    results = search(query, num_results=num_results, lang=lang, advanced=advanced)
-
-    # Log the request
-    log_request("/fetch-google", query)
-
-    return results
requirements.txt
CHANGED
@@ -7,6 +7,4 @@ asyncio
 groq
 jinja2
 aiofiles
-matplotlib
-bs4
-curl_cffi
+matplotlib
test.py
CHANGED
@@ -1,18 +0,0 @@
-import requests
-
-url = "http://localhost:8000/fetch-images"  # or your deployed URL
-
-payload = {
-    "query": "sunset beach",
-    "num_results": 5,
-    "lang": "en",
-    "advanced": False
-}
-
-response = requests.post(url, json=payload)
-
-if response.ok:
-    results = response.json()
-    print("Fetched Images:", results)
-else:
-    print("Error:", response.status_code, response.text)