Maouu committed
Commit ef4e7d0 · 1 Parent(s): 050c6b4
Search/__pycache__/gettyimages.cpython-312.pyc DELETED
Binary file (1.03 kB)
 
Search/__pycache__/main.cpython-312.pyc DELETED
Binary file (6.96 kB)
 
Search/__pycache__/useragentka.cpython-312.pyc DELETED
Binary file (1.68 kB)
 
Search/gettyimages.py DELETED
@@ -1,21 +0,0 @@
-from curl_cffi import requests
-from bs4 import BeautifulSoup
-
-def get_images(query):
-    res = requests.get(f'https://www.gettyimages.in/search/2/image?phrase={query}=editorial', impersonate='chrome110')
-
-    soup = BeautifulSoup(res.text, 'html.parser')
-
-    images = soup.find_all('img')
-
-    results = []
-
-    for image in images:
-        print(image['src'])
-        if image['src'].startswith('https://media.gettyimages.com'):
-            results.append({'src': image['src'], 'alt': image['alt'], 'class':''})
-        else:
-            continue
-
-    return results
-
Search/main.py DELETED
@@ -1,163 +0,0 @@
-"""googlesearch is a Python library for searching Google, easily."""
-from time import sleep
-from bs4 import BeautifulSoup
-from requests import get
-from urllib.parse import unquote # to decode the url
-from Search.useragentka import get_useragent
-from curl_cffi import requests as curlreq
-from Search.gettyimages import get_images
-
-def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
-    resp = get(
-        url="https://www.google.com/search",
-        headers={
-            "User-Agent": get_useragent(),
-            "Accept": "*/*"
-        },
-        params={
-            "q": term,
-            "num": results + 2, # Prevents multiple requests
-            "hl": lang,
-            "start": start,
-            "safe": safe,
-            "gl": region,
-        },
-        proxies=proxies,
-        timeout=timeout,
-        verify=ssl_verify,
-        cookies = {
-            'CONSENT': 'PENDING+987', # Bypasses the consent page
-            'SOCS': 'CAESHAgBEhIaAB',
-        }
-    )
-    resp.raise_for_status()
-    return resp
-
-
-class SearchResult:
-    def __init__(self, url, title, description):
-        self.url = url
-        self.title = title
-        self.description = description
-
-    def __repr__(self):
-        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
-
-
-def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
-    """Search the Google search engine"""
-
-    # Proxy setup
-    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
-
-    start = start_num
-    fetched_results = 0
-    fetched_links = set()
-    results_list = []
-    image_results = [] # New list for image results
-
-    while fetched_results < num_results:
-        # Send request
-        resp = _req(term, num_results - start,
-                    lang, start, proxies, timeout, safe, ssl_verify, region)
-
-        # Parse
-        soup = BeautifulSoup(resp.text, "html.parser")
-        result_block = soup.find_all("div", class_="ezO2md")
-        new_results = 0
-
-        # Find all images on the page
-        try:
-            all_images = soup.find_all("img") # Google's image class
-            for img in all_images:
-                img_src = img.get("src") or img.get("data-src")
-                if img_src:
-                    # Handle base64 images
-                    if img_src.startswith("data:image"):
-                        image_results.append({
-                            "src": img_src, # Already base64 encoded
-                            "alt": img.get("alt", ""),
-                            "class": img.get("class", []),
-                        })
-                    # Handle regular image URLs
-                    elif img_src.startswith("http"):
-                        image_results.append({
-                            "src": img_src,
-                            "alt": img.get("alt", ""),
-                            "class": img.get("class", []),
-                        })
-        except Exception as e:
-            print(f"Error parsing images: {str(e)}")
-
-        for result in result_block:
-            link_tag = result.find("a", href=True)
-            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
-            description_tag = result.find("span", class_="FrIlee")
-
-            if link_tag and title_tag and description_tag:
-                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
-                if link in fetched_links and unique:
-                    continue
-                fetched_links.add(link)
-                title = title_tag.text if title_tag else ""
-                description = description_tag.text if description_tag else ""
-
-                # Only get page_text if advanced mode and we haven't gotten any yet
-                if advanced and not any('page_text' in result for result in results_list):
-                    try:
-                        page_scrape = curlreq.get(link, impersonate='chrome110')
-                        page_scrape.encoding = 'utf-8'
-                        page_soup = BeautifulSoup(page_scrape.text, "html.parser")
-
-                        # Try multiple strategies to find main content
-                        main_content = (
-                            page_soup.find(['article', 'main']) or
-                            page_soup.find('div', {'id': ['content', 'main-content', 'body-content']}) or
-                            page_soup.find('div', {'class': ['content', 'main', 'article', 'post']}) or
-                            page_soup.find('div', {'role': 'main'}) or
-                            page_soup.body
-                        )
-                        if main_content:
-                            # Remove unwanted elements
-                            for element in main_content(['script', 'style', 'noscript', 'svg', 'header', 'footer', 'nav']):
-                                element.decompose()
-                            # Extract text with better cleaning
-                            text = main_content.get_text(separator=' ', strip=True)
-                            text = ' '.join(line.strip() for line in text.splitlines() if line.strip())
-                            page_text = ' '.join(word for word in text.split() if len(word) > 1)[:3000]
-                        else:
-                            page_text = ""
-                    except Exception as e:
-                        print(f"Error scraping {link}: {str(e)}")
-                        page_text = ""
-                else:
-                    page_text = ""
-
-
-                fetched_results += 1
-                new_results += 1
-
-                if advanced:
-                    results_list.append({
-                        "link": link,
-                        "title": title,
-                        "description": description,
-                        "page_text": page_text,
-                    })
-                else:
-                    results_list.append(link)
-
-                if fetched_results >= num_results:
-                    break
-
-        if new_results == 0:
-            break
-
-        start += 10
-        sleep(sleep_interval)
-
-    if image_results == []:
-        images = get_images(term)
-        return {"results": results_list, "images": images}
-    else:
-        return {"results": results_list, "images": image_results}
Search/useragentka.py DELETED
@@ -1,20 +0,0 @@
-import random
-
-def get_useragent():
-    """
-    Generates a random user agent string mimicking the format of various software versions.
-
-    The user agent string is composed of:
-    - Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
-    - libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
-    - SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
-    - OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9
-
-    Returns:
-        str: A randomly generated user agent string.
-    """
-    lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
-    libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
-    ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
-    openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
-    return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"
__pycache__/app.cpython-312.pyc CHANGED
Binary files a/__pycache__/app.cpython-312.pyc and b/__pycache__/app.cpython-312.pyc differ
 
app.py CHANGED
@@ -15,7 +15,6 @@ from fastapi.templating import Jinja2Templates
 from pathlib import Path
 from collections import Counter, defaultdict
 from utils.logger import log_request
-from Search.main import search
 
 app = FastAPI()
 
@@ -223,50 +222,3 @@
     log_request("/chat", selected_generator.__name__)
     return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')
 
-@app.post("/fetch-images")
-async def fetch_images(request: Request):
-    data = await request.json()
-    query = data.get("query", "")
-    num_results = data.get("num_results", 5)
-    lang = data.get("lang", "en")
-    advanced = data.get("advanced", False)
-
-    # Call the search function
-    results = search(query, num_results=num_results, lang=lang, advanced=advanced)
-
-    # Log the request
-    log_request("/fetch-images", query)
-
-    return results['images']
-
-@app.post("/fetch-links")
-async def fetch_links(request: Request):
-    data = await request.json()
-    query = data.get("query", "")
-    num_results = data.get("num_results", 5)
-    lang = data.get("lang", "en")
-    advanced = data.get("advanced", False)
-
-    # Call the search function
-    results = search(query, num_results=num_results, lang=lang, advanced=advanced)
-
-    # Log the request
-    log_request("/fetch-links", query)
-
-    return results['results']
-
-@app.post("/fetch-google")
-async def fetch_google(request: Request):
-    data = await request.json()
-    query = data.get("query", "")
-    num_results = data.get("num_results", 5)
-    lang = data.get("lang", "en")
-    advanced = data.get("advanced", True)
-
-    # Call the search function
-    results = search(query, num_results=num_results, lang=lang, advanced=advanced)
-
-    # Log the request
-    log_request("/fetch-google", query)
-
-    return results
 
requirements.txt CHANGED
@@ -7,6 +7,4 @@ asyncio
 groq
 jinja2
 aiofiles
-matplotlib
-bs4
-curl_cffi
+matplotlib
 
test.py CHANGED
@@ -1,18 +0,0 @@
-import requests
-
-url = "http://localhost:8000/fetch-images" # or your deployed URL
-
-payload = {
-    "query": "sunset beach",
-    "num_results": 5,
-    "lang": "en",
-    "advanced": False
-}
-
-response = requests.post(url, json=payload)
-
-if response.ok:
-    results = response.json()
-    print("Fetched Images:", results)
-else:
-    print("Error:", response.status_code, response.text)