Frason commited on
Commit
d785968
·
verified ·
1 Parent(s): 3b5ed8a
Files changed (2) hide show
  1. app.py +1496 -0
  2. requirements.txt +14 -0
app.py ADDED
@@ -0,0 +1,1496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import time
3
+ import logging
4
+ import argparse
5
+ import os
6
+ import json
7
+ import random
8
+ import re
9
+ import uuid
10
+ from collections import defaultdict
11
+ from datetime import datetime
12
+ from typing import List, Dict, Any, Optional, Union, Tuple
13
+
14
+ from selenium import webdriver
15
+ from selenium.webdriver.chrome.options import Options
16
+ from selenium.webdriver.chrome.service import Service
17
+ from selenium.webdriver.common.by import By
18
+ from selenium.webdriver.common.keys import Keys
19
+ from selenium.webdriver.common.action_chains import ActionChains
20
+ from selenium.webdriver.support.ui import WebDriverWait
21
+ from selenium.webdriver.support import expected_conditions as EC
22
+ from selenium.common.exceptions import (
23
+ TimeoutException, NoSuchElementException, WebDriverException
24
+ )
25
+ import gradio as gr
26
+ import pandas as pd
27
+
28
+ # Setup logging
29
+ logging.basicConfig(
30
+ level=logging.INFO,
31
+ format='%(asctime)s - %(levelname)s - %(message)s',
32
+ datefmt='%Y-%m-%d %H:%M:%S'
33
+ )
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ # Predefined advertisers list
38
+ ADVERTISERS = [
39
+ {"id": "AR10051102910143528961", "name": "Theory Sabers"},
40
+ {"id": "AR12645693856247971841", "name": "Artsabers"},
41
+ {"id": "AR07257050693515608065", "name": "bmlightsabers"},
42
+ {"id": "AR01506694249926623233", "name": "Padawan Outpost Ltd"},
43
+ {"id": "AR10584025853845307393", "name": "GalaxySabers"},
44
+ {"id": "AR16067963414479110145", "name": "nsabers"},
45
+ {"id": "AR12875519274243850241", "name": "es-sabers"},
46
+ {"id": "AR05144647067079016449", "name": "Ultra Sabers"},
47
+ {"id": "AR15581800501283389441", "name": "SuperNeox"},
48
+ {"id": "AR06148907109187584001", "name": "Sabertrio"}
49
+ ]
50
+
51
+
52
+ #####################################
53
+ ### FACEBOOK SCRAPER SECTION #######
54
+ #####################################
55
+
56
+ # Constants for Facebook Scraper
57
+ FB_DEFAULT_TIMEOUT = 60 # seconds
58
+ FB_MIN_WAIT_TIME = 1 # minimum seconds for random waits
59
+ FB_MAX_WAIT_TIME = 3 # maximum seconds for random waits
60
+ FB_MAX_SCROLL_ATTEMPTS = 5 # maximum number of scroll attempts
61
+ FB_SELECTOR_HISTORY_FILE = "fb_selector_stats.json" # File to store selector success stats
62
+
63
+ # User agents for rotation
64
+ USER_AGENTS = [
65
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
66
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
67
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
68
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0"
69
+ ]
70
+
71
+ # Viewport sizes for randomization
72
+ VIEWPORT_SIZES = [
73
+ (1366, 768),
74
+ (1920, 1080),
75
+ (1536, 864),
76
+ (1440, 900)
77
+ ]
78
+
79
+
80
class SelectorStats:
    """Track and optimize selector performance across scraping sessions.

    Success/attempt counts per CSS selector are persisted to a JSON file so
    that the best-performing selectors can be tried first on later runs.
    Persistence failures are logged, never raised.
    """

    def __init__(self, file_path: str = FB_SELECTOR_HISTORY_FILE):
        """Load (or initialize) stats backed by *file_path*."""
        self.file_path = file_path
        self.stats = self._load_stats()

    def _load_stats(self) -> Dict:
        """Load stats from file, or initialize a fresh structure on any error."""
        if os.path.exists(self.file_path):
            try:
                with open(self.file_path, 'r') as f:
                    return json.load(f)
            except (json.JSONDecodeError, IOError) as e:
                logger.warning(f"Error loading selector stats: {e}, initializing new stats")

        # Initialize structure for platform stats
        return {
            "facebook": {"selectors": {}, "last_updated": datetime.now().isoformat()}
        }

    def _entry(self, selector: str) -> Dict:
        """Return the mutable counter record for *selector*.

        Creates the platform bucket and the selector record on first use and
        refreshes the platform's ``last_updated`` stamp.  Extracted because the
        original duplicated this initialization verbatim in both update methods.
        """
        platform = "facebook"  # Only using Facebook for this version
        platform_stats = self.stats.setdefault(
            platform, {"selectors": {}, "last_updated": datetime.now().isoformat()}
        )
        entry = platform_stats["selectors"].setdefault(
            selector, {"successes": 0, "attempts": 0}
        )
        platform_stats["last_updated"] = datetime.now().isoformat()
        return entry

    def update_selector_success(self, selector: str, count: int = 1) -> None:
        """Record *count* successes (and one attempt) for a selector; persists immediately."""
        entry = self._entry(selector)
        entry["successes"] += count
        entry["attempts"] += 1

        # Save after each update
        self._save_stats()

    def update_selector_attempt(self, selector: str) -> None:
        """Record an attempt to use a selector regardless of success.

        Deliberately does NOT persist to disk on every attempt, to reduce I/O;
        stats are flushed by the next success or by ``_save_stats``.
        """
        self._entry(selector)["attempts"] += 1

    def get_best_selectors(self, min_attempts: int = 3, max_count: int = 10) -> List[str]:
        """Return up to *max_count* selectors with at least *min_attempts*
        attempts, ordered by success rate (highest first)."""
        platform = "facebook"  # Only using Facebook for this version
        if platform not in self.stats:
            return []

        ranked: List[Tuple[str, float]] = []
        for selector, data in self.stats[platform]["selectors"].items():
            if data["attempts"] >= min_attempts:
                # Guard against division by zero when min_attempts is 0
                rate = data["successes"] / data["attempts"] if data["attempts"] > 0 else 0
                ranked.append((selector, rate))

        # Sort by success rate (descending) and keep the top N
        ranked.sort(key=lambda item: item[1], reverse=True)
        return [selector for selector, _ in ranked[:max_count]]

    def _save_stats(self) -> None:
        """Persist stats to the JSON file; I/O errors are logged, not raised."""
        try:
            with open(self.file_path, 'w') as f:
                json.dump(self.stats, f, indent=2)
        except IOError as e:
            logger.error(f"Error saving selector stats: {e}")
156
+
157
+
158
class FacebookAdsScraper:
    """Scrape the Facebook Ad Library with Selenium, using anti-detection
    measures (random user agent/viewport, CDP script injection, human-like
    scrolling) and self-healing selectors tracked via SelectorStats.

    Falls back to placeholder data whenever real ads cannot be extracted.
    """

    def __init__(self, headless=True, debug_mode=False):
        """Initialize the ads scraper with browser configuration"""
        self.debug_mode = debug_mode
        self.headless = headless
        self.driver = self._setup_driver(headless)
        # Initialize selector stats tracker
        self.selector_stats = SelectorStats()
        # Track navigation history for smart retry
        # NOTE(review): appears unused within this class — possibly used by
        # code outside this view; confirm before removing.
        self.navigation_history = []
        # Track success/failure for self-healing
        self.success_rate = defaultdict(lambda: {"success": 0, "failure": 0})
        # Generate a session ID for this scraping session
        self.session_id = str(uuid.uuid4())[:8]

    def _setup_driver(self, headless):
        """Set up and configure the Chrome WebDriver with anti-detection measures"""
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")

        # Select a random user agent
        user_agent = random.choice(USER_AGENTS)
        chrome_options.add_argument(f"--user-agent={user_agent}")
        logger.info(f"Using user agent: {user_agent}")

        # Select a random viewport size
        viewport_width, viewport_height = random.choice(VIEWPORT_SIZES)
        chrome_options.add_argument(f"--window-size={viewport_width},{viewport_height}")
        logger.info(f"Using viewport size: {viewport_width}x{viewport_height}")

        # Add common options to avoid detection
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--enable-unsafe-swiftshader")

        # Performance improvements
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-notifications")
        chrome_options.add_argument("--blink-settings=imagesEnabled=true")

        # Add experimental options to avoid detection
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option("useAutomationExtension", False)

        # Additional preferences to improve performance
        chrome_options.add_experimental_option("prefs", {
            "profile.default_content_setting_values.notifications": 2,
            "profile.managed_default_content_settings.images": 1,
            "profile.managed_default_content_settings.cookies": 1,
            # Add some randomness to the profile
            "profile.default_content_setting_values.plugins": random.randint(1, 3),
            "profile.default_content_setting_values.popups": random.randint(1, 2)
        })

        try:
            # Try to create driver with service for newer Selenium versions
            service = Service()
            driver = webdriver.Chrome(service=service, options=chrome_options)

            # Execute CDP commands to avoid detection (works in newer Chrome versions)
            # Masks navigator.webdriver and randomizes languages/plugins before
            # any page script runs.
            driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });

                // Overwrite the languages with random order
                Object.defineProperty(navigator, 'languages', {
                    get: () => ['en-US', 'en', 'de'].sort(() => 0.5 - Math.random())
                });

                // Modify plugins length
                Object.defineProperty(navigator, 'plugins', {
                    get: () => {
                        // Randomize plugins length between 3 and 7
                        const len = Math.floor(Math.random() * 5) + 3;
                        const plugins = { length: len };
                        for (let i = 0; i < len; i++) {
                            plugins[i] = {
                                name: ['Flash', 'Chrome PDF Plugin', 'Native Client', 'Chrome PDF Viewer'][Math.floor(Math.random() * 4)],
                                filename: ['internal-pdf-viewer', 'mhjfbmdgcfjbbpaeojofohoefgiehjai', 'internal-nacl-plugin'][Math.floor(Math.random() * 3)]
                            };
                        }
                        return plugins;
                    }
                });
                """
            })
        except TypeError:
            # Fallback for older Selenium versions
            driver = webdriver.Chrome(options=chrome_options)
        except Exception as e:
            # If there's an issue with CDP, continue anyway
            # NOTE(review): if the first Chrome instance was created before the
            # CDP call failed, it is replaced here without quit() — potential
            # leaked browser process; confirm and fix separately.
            logger.warning(f"CDP command failed, continuing: {e}")
            driver = webdriver.Chrome(options=chrome_options)

        # Set default timeout
        driver.set_page_load_timeout(FB_DEFAULT_TIMEOUT)
        return driver

    def random_wait(self, min_time=None, max_time=None):
        """Wait for a random amount of time to simulate human behavior.

        Returns the number of seconds actually slept.
        """
        min_time = min_time or FB_MIN_WAIT_TIME
        max_time = max_time or FB_MAX_WAIT_TIME
        wait_time = random.uniform(min_time, max_time)
        time.sleep(wait_time)
        return wait_time

    def human_like_scroll(self, scroll_attempts=None):
        """Scroll down the page in a human-like way.

        Scrolls a random fraction of the viewport per attempt with eased,
        stepped movement and occasional "reading" pauses, to trigger lazy
        loading without looking automated.
        """
        attempts = scroll_attempts or random.randint(3, FB_MAX_SCROLL_ATTEMPTS)

        # Get page height before scrolling
        initial_height = self.driver.execute_script("return document.body.scrollHeight")

        for i in range(attempts):
            # Calculate a random scroll amount (25-90% of viewport)
            scroll_percent = random.uniform(0.25, 0.9)
            viewport_height = self.driver.execute_script("return window.innerHeight")
            scroll_amount = int(viewport_height * scroll_percent)

            # Scroll with a random speed
            scroll_steps = random.randint(5, 15)
            current_position = self.driver.execute_script("return window.pageYOffset")
            target_position = current_position + scroll_amount

            for step in range(scroll_steps):
                # Calculate next position with easing
                t = (step + 1) / scroll_steps
                # Ease in-out function (smoothstep: 3t^2 - 2t^3)
                factor = t * t * (3.0 - 2.0 * t)
                next_position = current_position + (target_position - current_position) * factor
                self.driver.execute_script(f"window.scrollTo(0, {next_position})")
                time.sleep(random.uniform(0.01, 0.05))

            # Occasionally pause longer as if reading content
            if random.random() < 0.3:  # 30% chance to pause
                self.random_wait(1.5, 3.5)
            else:
                self.random_wait(0.5, 1.5)

            # Log progress
            logger.info(f"Human-like scroll {i + 1}/{attempts} completed")

            # Check if we've reached the bottom of the page
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == initial_height and i > 1:
                # We haven't loaded new content after a couple of scrolls
                # Do one big scroll to the bottom to trigger any lazy loading
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                self.random_wait()
            initial_height = new_height

    def simulate_human_behavior(self):
        """Simulate random human-like interactions with the page"""
        # Random chance to move the mouse around
        if random.random() < 0.7:  # 70% chance
            try:
                # Find a random element to hover over
                elements = self.driver.find_elements(By.CSS_SELECTOR, "a, button, input, div")
                if elements:
                    element = random.choice(elements)
                    ActionChains(self.driver).move_to_element(element).perform()
                    self.random_wait(0.2, 1.0)
            except:
                # Ignore any errors, this is just for randomness
                # (bare except is intentional: best-effort cosmetic behavior)
                pass

        # Random chance to click somewhere non-interactive
        if random.random() < 0.2:  # 20% chance
            try:
                # Find a safe area to click (like a paragraph or heading)
                safe_elements = self.driver.find_elements(By.CSS_SELECTOR, "p, h1, h2, h3, h4, span")
                if safe_elements:
                    safe_element = random.choice(safe_elements)
                    ActionChains(self.driver).move_to_element(safe_element).click().perform()
                    self.random_wait(0.2, 1.0)
            except:
                # Ignore any errors, this is just for randomness
                pass

    def check_headless_visibility(self):
        """
        Check if elements are visible in headless mode
        Returns True if everything is working properly

        On failure the headless driver is replaced with a non-headless one
        and True is returned without rechecking.
        """
        if not self.headless:
            # If not in headless mode, no need to check
            return True

        logger.info("Performing headless visibility check...")

        # Use a simpler page for testing interactivity
        test_url = "https://www.example.com"
        try:
            self.driver.get(test_url)

            # Just check if the page loads at all - don't try to interact with elements
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            logger.info("Headless check passed: Page loaded successfully")
            return True

        except Exception as e:
            logger.error(f"Headless check failed: {e}")

            # Try switching to non-headless mode
            logger.info("Switching to non-headless mode...")
            self.driver.quit()
            self.headless = False
            self.driver = self._setup_driver(headless=False)

            return True  # Continue without rechecking

    def fetch_facebook_ads(self, query):
        """Fetch ads from Facebook's Ad Library with anti-detection measures.

        Returns a list of ad dicts (at most 10); falls back to placeholder
        data when no real ads can be scraped or an error occurs.
        """
        ads_data = []
        base_url = "https://www.facebook.com/ads/library/"

        logger.info(f"Fetching Facebook ads for {query}")

        try:
            # Add some randomness to URL parameters
            params = {
                "active_status": "all",
                "ad_type": "all",
                "country": "ALL",
                "q": query,
                # Random parameters to avoid fingerprinting
                "_": int(time.time() * 1000),
                "session_id": self.session_id
            }

            # Construct URL with parameters
            # NOTE(review): values are not URL-encoded; queries containing
            # '&', '=', spaces etc. may break the URL — confirm.
            url = base_url + "?" + "&".join(f"{k}={v}" for k, v in params.items())
            logger.info(f"Navigating to Facebook URL: {url}")

            # Navigate to the URL
            self.driver.get(url)

            # Wait for page to initially load
            try:
                WebDriverWait(self.driver, FB_DEFAULT_TIMEOUT).until(
                    EC.any_of(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "div[role='main']")),
                        EC.presence_of_element_located((By.TAG_NAME, "body"))
                    )
                )
            except TimeoutException:
                logger.warning("Timeout waiting for Facebook page to load initially, continuing anyway")

            # Human-like scrolling to trigger lazy loading
            self.human_like_scroll()

            # Simulate human behavior
            self.simulate_human_behavior()

            # Save debug data at this point
            if self.debug_mode:
                self._save_debug_data("facebook_after_scroll", query)

            # Find ad elements using self-healing selectors
            ad_elements = self._find_facebook_ad_elements()

            if not ad_elements:
                logger.info("No Facebook ads found")

                if self.debug_mode:
                    self._save_debug_data("facebook_no_ads", query)

                # Return placeholder data as fallback
                return self._generate_placeholder_facebook_data(query)

            # Process the found ad elements
            for i, ad in enumerate(ad_elements[:10]):  # Limit to 10 ads for performance
                try:
                    ad_data = {
                        "platform": "Facebook",
                        "query": query,
                        "timestamp": datetime.now().isoformat(),
                        "index": i + 1,
                        "session_id": self.session_id
                    }

                    # Extract data using smarter methods
                    full_text = ad.text.strip()

                    # Log first ad text for debugging
                    if i == 0:
                        logger.info(f"First Facebook ad full text (first 150 chars): {full_text[:150]}...")

                    # Smart data extraction
                    extracted_data = self._extract_facebook_ad_data(ad, full_text)

                    # Merge extracted data
                    ad_data.update(extracted_data)

                    # Add fallback values if needed
                    if "advertiser" not in ad_data or not ad_data["advertiser"]:
                        ad_data["advertiser"] = "Unknown Advertiser"
                    if "text" not in ad_data or not ad_data["text"]:
                        ad_data["text"] = "Ad content not available"

                    ads_data.append(ad_data)

                except Exception as e:
                    logger.warning(f"Error processing Facebook ad {i + 1}: {e}")

            return ads_data if ads_data else self._generate_placeholder_facebook_data(query)

        except Exception as e:
            logger.error(f"Error fetching Facebook ads: {e}")

            if self.debug_mode:
                self._save_debug_data("facebook_error", query)

            return self._generate_placeholder_facebook_data(query)

    def _find_facebook_ad_elements(self):
        """Find Facebook ad elements using a self-healing selector strategy.

        Tries historically best-performing selectors first, then the built-in
        base selectors, then an XPath text-pattern fallback. Returns a list of
        WebElements (possibly empty).
        """
        # Historical best performers
        historical_best = self.selector_stats.get_best_selectors()

        # Base selectors
        base_selectors = [
            "div[class*='_7jvw']",
            "div[data-testid='ad_library_card']",
            "div[class*='AdLibraryCard']",
            "div.AdLibraryCard",
            "div[class*='adCard']",
            "div[class*='ad_card']"
        ]

        # Combine selectors, prioritizing historical best
        combined_selectors = historical_best + [s for s in base_selectors if s not in historical_best]

        # Try each selector
        for selector in combined_selectors:
            try:
                # Record attempt
                self.selector_stats.update_selector_attempt(selector)

                elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                if elements:
                    logger.info(f"Found {len(elements)} Facebook ads using selector: {selector}")

                    # Record success
                    self.selector_stats.update_selector_success(selector, len(elements))

                    return elements
            except Exception as e:
                logger.debug(f"Facebook selector {selector} failed: {e}")

        # No elements found with standard selectors, try a more aggressive approach
        try:
            # Look for text patterns that typically appear in ads
            patterns = [
                "//div[contains(., 'Library ID:')]",
                "//div[contains(., 'Sponsored')]",
                "//div[contains(., 'Active')][contains(., 'Library ID')]",
                "//div[contains(., 'Inactive')][contains(., 'Library ID')]"
            ]

            for pattern in patterns:
                elements = self.driver.find_elements(By.XPATH, pattern)
                if elements:
                    ad_containers = []
                    for element in elements:
                        try:
                            # Try to find containing card by navigating up
                            container = element
                            for _ in range(5):  # Try up to 5 levels up
                                if container.get_attribute("class") and "card" in container.get_attribute(
                                        "class").lower():
                                    ad_containers.append(container)
                                    break
                                container = container.find_element(By.XPATH, "..")
                        except:
                            # Walking past the document root raises; skip this element
                            continue

                    if ad_containers:
                        logger.info(f"Found {len(ad_containers)} Facebook ads using text pattern approach")

                        # Record this special method
                        self.selector_stats.update_selector_success("text_pattern_method", len(ad_containers))

                        return ad_containers
        except Exception as e:
            logger.debug(f"Facebook text pattern approach failed: {e}")

        return []

    def _extract_facebook_ad_data(self, ad_element, full_text):
        """Extract data from Facebook ad using multiple intelligent methods.

        First parses the card's visible text line-by-line (status, advertiser,
        ad body between "Sponsored" and UI controls); falls back to known CSS
        selectors when text parsing yields nothing. Returns a dict that may
        contain "status", "advertiser" and "text" keys.
        """
        extracted_data = {}

        # Process text content if available
        if full_text:
            # Split into lines
            lines = full_text.split('\n')

            # Check for status (Active/Inactive)
            if lines and lines[0] in ["Active", "Inactive"]:
                extracted_data["status"] = lines[0]

            # Look for advertiser - typically after "See ad details"
            for i, line in enumerate(lines):
                if "See ad details" in line or "See summary details" in line:
                    if i + 1 < len(lines):
                        extracted_data["advertiser"] = lines[i + 1].strip()
                    break
            else:
                # First line is likely the advertiser
                if lines:
                    extracted_data["advertiser"] = lines[0].strip()

            # Extract ad content
            # Look for patterns to determine content boundaries
            content_start_idx = -1
            content_end_idx = len(lines)

            # Find where "Sponsored" appears
            for i, line in enumerate(lines):
                if "Sponsored" in line:
                    content_start_idx = i + 1
                    break

            # If no "Sponsored" found, look for advertiser + status
            if content_start_idx == -1:
                # Skip metadata lines
                metadata_patterns = [
                    "Library ID:",
                    "Started running on",
                    "Platforms",
                    "Open Drop-down",
                    "See ad details",
                    "See summary details",
                    "This ad has multiple versions"
                ]

                for i, line in enumerate(lines):
                    if any(pattern in line for pattern in metadata_patterns):
                        continue

                    if i > 0:  # Skip first line (advertiser)
                        content_start_idx = i
                        break

            # Find where UI elements start
            ui_elements = [
                "Like", "Comment", "Share", "Learn More", "Shop Now",
                "Sign Up", "Visit Instagram profile", "See More"
            ]

            for i, line in enumerate(lines):
                # Skip lines before content start
                if i <= content_start_idx:
                    continue

                if any(ui in line for ui in ui_elements):
                    content_end_idx = i
                    break

            # Extract content between boundaries
            if content_start_idx != -1 and content_start_idx < content_end_idx:
                content_lines = lines[content_start_idx:content_end_idx]
                extracted_data["text"] = "\n".join(content_lines).strip()

        # If text extraction failed, try element-based approaches
        if "text" not in extracted_data or not extracted_data["text"]:
            facebook_text_selectors = [
                "div[data-ad-preview='message']",  # Direct message container
                "div[class*='_7jy6']",  # Known ad text container
                "div[data-testid='ad-creative-text']",  # Test ID for ad text
                "div[class*='_38ki']",  # Another text container
                "span[class*='_7oe']",  # Text span
                "div.text_exposed_root"  # Exposed text root
            ]

            for selector in facebook_text_selectors:
                try:
                    elements = ad_element.find_elements(By.CSS_SELECTOR, selector)
                    text_content = " ".join([e.text.strip() for e in elements if e.text.strip()])
                    if text_content:
                        extracted_data["text"] = text_content
                        break
                except:
                    # Best-effort: stale/obsolete selectors are expected to fail
                    pass

        # If advertiser extraction failed, try element-based approaches
        if "advertiser" not in extracted_data or not extracted_data["advertiser"]:
            facebook_advertiser_selectors = [
                "span[class*='fsl']",  # Facebook specific large text class
                "a[aria-label*='profile']",  # Profile links often contain advertiser name
                "h4",  # Often contains advertiser name
                "div[class*='_8jh5']",  # Known advertiser class
                "a[role='link']",  # Links are often advertiser names
                "div[class*='_3qn7']",  # Another known advertiser container
                "div[class*='_7jvw'] a",  # Links within the ad card
            ]

            for selector in facebook_advertiser_selectors:
                try:
                    elements = ad_element.find_elements(By.CSS_SELECTOR, selector)
                    for element in elements:
                        text = element.text.strip()
                        if text and len(text) < 50:  # Advertiser names are usually short
                            extracted_data["advertiser"] = text
                            break
                    if "advertiser" in extracted_data and extracted_data["advertiser"]:
                        break
                except:
                    pass

        return extracted_data

    def _generate_placeholder_facebook_data(self, query):
        """Generate placeholder Facebook ad data when real ads cannot be scraped.

        Entries are flagged with "is_placeholder": True so downstream code can
        tell them apart from scraped ads.
        """
        logger.info(f"Returning placeholder Facebook ad data for query: {query}")
        return [
            {
                "platform": "Facebook",
                "query": query,
                "advertiser": "Placeholder Advertiser 1",
                "text": f"This is a placeholder ad for {query} since no actual ads could be scraped.",
                "timestamp": datetime.now().isoformat(),
                "index": 1,
                "is_placeholder": True,
                "session_id": self.session_id
            },
            {
                "platform": "Facebook",
                "query": query,
                "advertiser": "Placeholder Advertiser 2",
                "text": f"Another placeholder ad for {query}. Please check your scraping settings.",
                "timestamp": datetime.now().isoformat(),
                "index": 2,
                "is_placeholder": True,
                "session_id": self.session_id
            }
        ]

    def _save_debug_data(self, prefix, query):
        """Save debugging data for investigation: a screenshot, the full page
        HTML, and (if found) the outerHTML of the first ad card.

        NOTE(review): *query* is embedded in file names unescaped — queries
        with path separators or other unsafe characters will break writes.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        debug_dir = "debug_data"

        if not os.path.exists(debug_dir):
            os.makedirs(debug_dir)

        # Save screenshot
        screenshot_path = f"{debug_dir}/{prefix}_{query}_{timestamp}.png"
        self.driver.save_screenshot(screenshot_path)
        logger.info(f"Saved debug screenshot to {screenshot_path}")

        # Save HTML
        html_path = f"{debug_dir}/{prefix}_{query}_{timestamp}.html"
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(self.driver.page_source)
        logger.info(f"Saved debug HTML to {html_path}")

        # Save sample of first ad structure if available
        try:
            ad_elements = self.driver.find_elements(By.CSS_SELECTOR, "div[class*='_7jvw']")
            if ad_elements:
                first_ad = ad_elements[0]
                # Get sample HTML structure
                first_ad_html = first_ad.get_attribute('outerHTML')
                # Save first ad HTML
                sample_path = f"{debug_dir}/{prefix}_sample_ad_{timestamp}.html"
                with open(sample_path, "w", encoding="utf-8") as f:
                    f.write(first_ad_html)
                logger.info(f"Saved sample ad HTML to {sample_path}")

                # Log the text structure
                logger.info(f"Sample ad text structure: {first_ad.text[:300]}...")
        except Exception as e:
            logger.error(f"Error saving ad sample: {e}")

    def close(self):
        """Close the WebDriver and save stats"""
        if self.driver:
            self.driver.quit()

        # Save selector stats one last time
        self.selector_stats._save_stats()
750
+
751
+
752
+ # Facebook Gradio Interface Function
753
# Facebook Gradio Interface Function
def fetch_facebook_ads(query):
    """Fetch Facebook ads only for Gradio interface.

    Args:
        query: Search term to look up in the Facebook Ad Library.

    Returns:
        A display-ready string: one formatted entry per ad separated by blank
        lines, or a "no ads found" message.
    """
    logger.info(f"Processing Facebook ad search for: {query}")

    scraper = FacebookAdsScraper(headless=True, debug_mode=True)
    try:
        # Perform headless check first
        visibility_ok = scraper.check_headless_visibility()
        if not visibility_ok:
            logger.warning("Headless visibility check failed, results may be affected")

        # Fetch ads from Facebook
        facebook_ads = scraper.fetch_facebook_ads(query)

        # Format for display
        formatted_results = [_format_facebook_ad(ad) for ad in facebook_ads]
    finally:
        # BUG FIX: the original called scraper.close() only on the success
        # path, leaking the Chrome WebDriver whenever scraping raised.
        scraper.close()

    return "\n\n".join(formatted_results) if formatted_results else "No Facebook ads found for your query."


def _format_facebook_ad(ad):
    """Format a single ad dict into a readable multi-line string block."""
    formatted_ad = f"Platform: {ad['platform']}\n"

    # Include status if available
    if 'status' in ad:
        formatted_ad += f"Status: {ad['status']}\n"

    formatted_ad += f"Advertiser: {ad['advertiser']}\n"
    formatted_ad += f"Ad Text: {_wrap_ad_text(ad['text'])}\n"
    formatted_ad += f"Timestamp: {ad['timestamp']}\n"
    if ad.get('is_placeholder', False):
        formatted_ad += "[THIS IS PLACEHOLDER DATA]\n"
    formatted_ad += "-" * 50
    return formatted_ad


def _wrap_ad_text(text):
    """Word-wrap ad text at 80 characters per line; pass placeholders through."""
    if not text or text == "Ad content not available":
        return text

    text_lines = []
    current_line = ""
    for word in text.split():
        if len(current_line) + len(word) + 1 <= 80:  # 80 chars per line
            current_line += (" " + word if current_line else word)
        else:
            # BUG FIX: guard against appending an empty first line when the
            # very first word is longer than 80 characters.
            if current_line:
                text_lines.append(current_line)
            current_line = word
    if current_line:
        text_lines.append(current_line)

    return "\n".join(text_lines)
807
+
808
+
809
# Create a function to save ads to JSON
def save_ads_to_json(ads, query):
    """Save ads to a timestamped JSON file.

    Args:
        ads: List of ad dicts to serialize.
        query: Search query; embedded (spaces replaced) in the filename.

    Returns:
        The filename written on success, or None on failure.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"facebook_ads_{query.replace(' ', '_')}_{timestamp}.json"

    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(ads, f, indent=2, ensure_ascii=False)
        # Fixed: log message previously read "Saved ads to (unknown)" —
        # the {filename} placeholder had been lost from the f-string.
        logger.info(f"Saved ads to {filename}")
        return filename
    except Exception as e:
        logger.error(f"Error saving ads to JSON: {e}")
        return None
823
+
824
+
825
#####################################
### GOOGLE ADS SCRAPER SECTION #####
#####################################

# Constants for Google Ads Scraper
MAX_ADS_DEFAULT = 5  # default number of creatives fetched per advertiser

# Import the actual GoogleAds class and regions.
# NOTE(review): the mock fallbacks below must stay inside the except branch;
# defining them unconditionally would shadow the real imports.
try:
    from GoogleAds.main import GoogleAds, show_regions_list
    from GoogleAds.regions import Regions

    USING_ACTUAL_GOOGLE_ADS = True
    logger.info("Successfully imported GoogleAds module")
except ImportError as e:
    # Fallback to mock implementation if module is missing
    logger.warning(f"GoogleAds module not found: {e}. Using mock implementation.")
    USING_ACTUAL_GOOGLE_ADS = False

    # Mock Regions dictionary - only used if real module fails to import
    Regions = {
        "GB": {"Region": "United Kingdom"}
    }

    def show_regions_list():
        """Mock function - only used if real module fails to import"""
        return [("GB", "United Kingdom"), ("US", "United States")]

    # Mock GoogleAds class - only used if real module fails to import.
    # Mirrors the real module's public surface so callers work unchanged.
    class GoogleAds:
        def __init__(self, region="GB"):
            # Region code, e.g. "GB" or "anywhere".
            self.region = region
            logger.warning(f"Using MOCK GoogleAds implementation with region: {region}")
            logger.warning("Please install the GoogleAds module for actual data")

        def creative_search_by_advertiser_id(self, advertiser_id, count=5):
            # Mock implementation - only used if real module fails to import.
            # Returns at most 3 synthetic creative IDs regardless of `count`.
            logger.warning(f"MOCK: Searching for creatives from advertiser {advertiser_id}")
            return [f"creative_{i}_{advertiser_id}" for i in range(min(count, 3))]

        def get_detailed_ad(self, advertiser_id, creative_id):
            # Mock implementation - only used if real module fails to import
            logger.warning(f"MOCK: Getting details for creative {creative_id}")

            # Find advertiser name in the module-level ADVERTISERS list.
            advertiser_name = "Unknown"
            for adv in ADVERTISERS:
                if adv["id"] == advertiser_id:
                    advertiser_name = adv["name"]
                    break

            # Return mock ad details (keys match the real module's schema —
            # both "Advertiser" and "Advertiser Name" variants are provided).
            return {
                "Ad Format": "Text",
                "Advertiser": advertiser_name,
                "Advertiser Name": advertiser_name,
                "Ad Title": f"MOCK DATA - INSTALL GOOGLE ADS MODULE",
                "Ad Body": f"This is MOCK data because the GoogleAds module is not installed. Please install the proper module.",
                "Last Shown": datetime.now().strftime("%Y-%m-%d"),
                "Creative Id": creative_id,
                "Ad Link": "#"
            }
889
+
890
+
891
def clean_ad_text(text):
    """Normalize raw ad copy: drop mojibake markers, dynamic-content tags,
    and any remaining non-ASCII characters; return the stripped result."""
    if not isinstance(text, str):
        # None and non-string values normalize to an empty string.
        return ""

    # Literal substitutions applied before the catch-all ASCII filter.
    substitutions = (
        ('â¦', ''),   # opening symbol artifact
        ('â©', ''),   # closing symbol artifact
        ('<dynamically generated based on landing page content>', '[Dynamic Content]'),
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    # Strip any remaining non-ASCII characters.
    cleaned = re.sub(r'[^\x00-\x7F]+', '', cleaned)
    return cleaned.strip()
905
+
906
+
907
def get_regions_list():
    """Return the supported (code, label) region choices: global and GB only."""
    gb_label = f"{Regions['GB']['Region']} (GB)"
    return [
        ("anywhere", "Global (anywhere)"),
        ("GB", gb_label),
    ]
914
+
915
+
916
def search_by_advertiser_id(advertiser_id: str, max_ads=MAX_ADS_DEFAULT, region="GB", progress=gr.Progress(),
                            provided_name=None) -> Tuple[
    str, Optional[pd.DataFrame], Optional[Dict]]:
    """Fetch up to `max_ads` ads for a Google advertiser ID and summarize them.

    Args:
        advertiser_id: Google Ads transparency advertiser identifier.
        max_ads: Maximum number of creatives to retrieve.
        region: Region code, a ("code", label) tuple from the UI dropdown,
            or any "anywhere"/global spelling for a worldwide search.
        progress: Gradio progress callback (or any compatible callable).
        provided_name: Known advertiser name; skips the lookup when given.

    Returns:
        Tuple of (status message, DataFrame of ad details or None,
        summary dict or None).
    """
    try:
        progress(0, desc="Initializing scraper...")

        # Gradio dropdowns may hand us a ("code", label) tuple - keep the code.
        region_val = region
        if isinstance(region, tuple) and len(region) > 0:
            region_val = region[0]

        # Normalize any "global" spelling to the API's expected 'anywhere'.
        if region_val == "Global (anywhere)" or "anywhere" in str(region_val).lower():
            region_val = "anywhere"

        # Initialize the Google Ads scraper
        scraper = GoogleAds(region=region_val)

        progress(0.2, desc=f"Fetching ads for advertiser ID: {advertiser_id}")

        # Get creative IDs for this advertiser
        creative_ids = scraper.creative_search_by_advertiser_id(advertiser_id, count=max_ads)
        if not creative_ids:
            return f"No ads found for advertiser ID: {advertiser_id}", None, None

        progress(0.3, desc=f"Found {len(creative_ids)} ads. Fetching details...")

        # Fetch detailed information for each ad.
        ads_data = []
        ad_formats = {}
        for i, creative_id in enumerate(creative_ids):
            progress_val = 0.3 + (0.7 * (i / len(creative_ids)))
            progress(progress_val, desc=f"Processing ad {i + 1}/{len(creative_ids)}")

            try:
                ad_details = scraper.get_detailed_ad(advertiser_id, creative_id)

                # Fix encoding issues for Ad Title and Ad Body fields.
                if 'Ad Title' in ad_details:
                    ad_details['Ad Title'] = clean_ad_text(ad_details['Ad Title'])
                if 'Ad Body' in ad_details:
                    ad_details['Ad Body'] = clean_ad_text(ad_details['Ad Body'])

                ads_data.append(ad_details)

                # Count ad formats for the summary.
                ad_format = ad_details.get("Ad Format", "Unknown")
                ad_formats[ad_format] = ad_formats.get(ad_format, 0) + 1

                # Brief pause to avoid overwhelming the server.
                time.sleep(0.2)
            except Exception as e:
                # Consistent with the rest of the file: log, don't print.
                logger.error(f"Error fetching details for ad {creative_id}: {e}")

        if not ads_data:
            return f"Retrieved creative IDs but couldn't fetch ad details for advertiser ID: {advertiser_id}", None, None

        # Create a DataFrame for display
        df = pd.DataFrame(ads_data)

        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

        # Resolve the advertiser name: explicit argument > predefined list
        # > fields present in the ad data itself.
        advertiser_name = "Unknown"
        if provided_name:
            advertiser_name = provided_name
        else:
            for adv in ADVERTISERS:
                if adv["id"] == advertiser_id:
                    advertiser_name = adv["name"]
                    break

        # If still unknown, try to get it from the ad data.
        if advertiser_name == "Unknown" and ads_data:
            # The field might be "Advertiser" or "Advertiser Name" depending on the version.
            for field in ["Advertiser", "Advertiser Name", "advertiser_name"]:
                if field in ads_data[0]:
                    advertiser_name = ads_data[0][field]
                    break

        # Build the summary once (this block was previously duplicated verbatim).
        summary = {
            'advertiser_id': advertiser_id,
            'advertiser_name': advertiser_name,
            'ads_count': len(ads_data),
            'timestamp': timestamp,
            'region': region_val,
            'ad_formats': ad_formats
        }

        # Find the earliest and latest ad dates.
        dates = []
        for ad in ads_data:
            # The field might be "Last Shown" or "last_shown_date" depending on the version.
            for field in ["Last Shown", "last_shown_date"]:
                if field in ad and ad[field]:
                    dates.append(ad[field])
                    break
        if dates:
            summary['earliest_ad'] = min(dates)
            summary['latest_ad'] = max(dates)

        success_message = (
            f"Found {len(ads_data)} ads for advertiser '{advertiser_name}' (ID: {advertiser_id})."
        )

        progress(1.0, desc="Complete!")
        return success_message, df, summary

    except Exception as e:
        error_message = f"Error searching for advertiser ID: {str(e)}"
        return error_message, None, None
1059
+
1060
+
1061
def process_advertiser_search(advertiser_selection, region, max_ads, progress=gr.Progress()):
    """Handle the advertiser selection form submission and update the UI."""

    # Guard: nothing selected in the dropdown.
    if not advertiser_selection:
        return "Please select an advertiser to search", None, None, None

    # Dropdown entries are formatted as "ID: Name" — split on the first colon.
    id_part, separator, name_part = advertiser_selection.partition(":")
    advertiser_id = id_part.strip()
    advertiser_name = name_part.strip() if separator else "Unknown"

    # Perform the search.
    result_message, ads_df, summary_info = search_by_advertiser_id(
        advertiser_id, max_ads, region, progress, advertiser_name
    )

    # Only build the analysis report when rows actually came back.
    have_rows = ads_df is not None and not ads_df.empty
    analysis_html = analyze_ads(ads_df, summary_info) if have_rows else None

    return result_message, ads_df, analysis_html, summary_info
1082
+
1083
+
1084
def analyze_ads(df: pd.DataFrame, summary: Dict) -> str:
    """
    Analyze ads data and generate insights.

    Builds a self-contained HTML report: overview panel, ad-format table,
    most-common title words, static marketing notes, and one card per ad.

    Args:
        df: DataFrame containing ad data
        summary: Dictionary with summary information

    Returns:
        HTML string with analysis results (or an <h3> error message)
    """
    if df is None or df.empty or summary is None:
        return "<h3>No data available for analysis</h3>"

    try:
        # Create a simple HTML report with the analysis.
        # 'timestamp' is formatted "YYYY-MM-DD_HH-MM-SS"; the replaces below
        # render it as "YYYY/MM/DD HH/MM/SS" (note: also slashes the time).
        html = f"""
        <div style="font-family: Arial, sans-serif;">
            <h2>{summary.get('advertiser_name', 'Unknown Advertiser')} - Ad Analysis</h2>

            <div style="background-color: #f5f5f5; padding: 15px; border-radius: 5px; margin-bottom: 20px;">
                <h3>Overview</h3>
                <p><b>Advertiser ID:</b> {summary.get('advertiser_id', 'Unknown')}</p>
                <p><b>Total Ads Found:</b> {summary['ads_count']}</p>
                <p><b>Region:</b> {summary['region']}</p>
                <p><b>Data Collected:</b> {summary['timestamp'].replace('_', ' ').replace('-', '/')}</p>

                {f"<p><b>Ad Date Range:</b> {summary.get('earliest_ad')} to {summary.get('latest_ad')}</p>" if 'earliest_ad' in summary else ""}
            </div>

            <div style="display: flex; margin-bottom: 20px;">
                <div style="flex: 1; background-color: #f5f5f5; padding: 15px; border-radius: 5px; margin-right: 10px;">
                    <h3>Ad Format Distribution</h3>
                    <table style="width: 100%; border-collapse: collapse;">
                        <tr style="background-color: #eaeaea;">
                            <th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Format</th>
                            <th style="text-align: center; padding: 8px; border-bottom: 1px solid #ddd;">Count</th>
                            <th style="text-align: center; padding: 8px; border-bottom: 1px solid #ddd;">Percentage</th>
                        </tr>
        """

        # One table row per ad format with its share of the total.
        total = sum(summary['ad_formats'].values())
        for format_name, count in summary['ad_formats'].items():
            percentage = (count / total) * 100
            html += f"""
                        <tr>
                            <td style="padding: 8px; border-bottom: 1px solid #ddd;">{format_name}</td>
                            <td style="text-align: center; padding: 8px; border-bottom: 1px solid #ddd;">{count}</td>
                            <td style="text-align: center; padding: 8px; border-bottom: 1px solid #ddd;">{percentage:.1f}%</td>
                        </tr>
            """

        html += """
                    </table>
                </div>
        """

        # Common words in ad titles
        if 'Ad Title' in df.columns and not df['Ad Title'].isna().all():
            from collections import Counter
            import re

            # Extract words from titles
            all_titles = ' '.join(df['Ad Title'].dropna().astype(str).tolist())
            words = re.findall(r'\b\w+\b', all_titles.lower())

            # Remove common stop words
            stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'of', 'is',
                          'are'}
            filtered_words = [word for word in words if word not in stop_words and len(word) > 2]

            # Count word frequencies (top 10)
            word_counts = Counter(filtered_words).most_common(10)

            if word_counts:
                html += """
                <div style="flex: 1; background-color: #f5f5f5; padding: 15px; border-radius: 5px;">
                    <h3>Most Common Words in Ad Titles</h3>
                    <table style="width: 100%; border-collapse: collapse;">
                        <tr style="background-color: #eaeaea;">
                            <th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Word</th>
                            <th style="text-align: center; padding: 8px; border-bottom: 1px solid #ddd;">Frequency</th>
                        </tr>
                """

                for word, count in word_counts:
                    html += f"""
                        <tr>
                            <td style="padding: 8px; border-bottom: 1px solid #ddd;">{word}</td>
                            <td style="text-align: center; padding: 8px; border-bottom: 1px solid #ddd;">{count}</td>
                        </tr>
                    """

                html += """
                    </table>
                </div>
                """

        # Close the flex container opened above.
        html += """
            </div>

            <h3>SEO & Marketing Insights</h3>
            <div style="background-color: #f5f5f5; padding: 15px; border-radius: 5px; margin-bottom: 20px;">
        """

        # Add general insights (static text plus a few summary-derived values).
        html += f"""
            <h4>Competitive Intelligence</h4>
            <ul>
                <li>The advertiser has been active in advertising until {summary.get('latest_ad', 'recently')}</li>
                <li>Their ad strategy focuses primarily on {max(summary['ad_formats'].items(), key=lambda x: x[1])[0]} ads</li>
                <li>Consider monitoring changes in their ad frequency and creative strategy over time</li>
            </ul>

            <h4>UK Market Insights</h4>
            <ul>
                <li>The ads were collected for the {summary['region']} market</li>
                <li>Regular monitoring can reveal seasonal UK advertising patterns</li>
                <li>Compare with other regions to identify UK-specific marketing approaches</li>
            </ul>
        """

        html += """
            </div>

            <h3>All Ad Examples</h3>
        """

        # Add example ads (all of them, not just the most recent)
        if not df.empty:
            # Sort by Last Shown date if available (lexicographic; assumes
            # ISO-style date strings — TODO confirm against scraper output).
            if 'Last Shown' in df.columns:
                df = df.sort_values(by='Last Shown', ascending=False)

            # Get all ads, not just the top 3
            for i, (_, ad) in enumerate(df.iterrows()):
                html += f"""
                <div style="background-color: #f5f5f5; padding: 15px; border-radius: 5px; margin-bottom: 15px;">
                    <h4>Ad {i + 1}: {ad.get('Creative Id', '')}</h4>
                    <p><b>Format:</b> {ad.get('Ad Format', 'Unknown')}</p>
                    <p><b>Last Shown:</b> {ad.get('Last Shown', 'Unknown')}</p>
                """

                # Display title and body if available
                if 'Ad Title' in ad and pd.notna(ad['Ad Title']) and ad['Ad Title']:
                    html += f"<p><b>Title:</b> {ad['Ad Title']}</p>"

                if 'Ad Body' in ad and pd.notna(ad['Ad Body']) and ad['Ad Body']:
                    # Truncate long bodies to keep the cards compact.
                    body = ad['Ad Body']
                    if len(body) > 150:
                        body = body[:150] + "..."
                    html += f"<p><b>Body:</b> {body}</p>"

                # Display image or video links if available
                if 'Image URL' in ad and pd.notna(ad['Image URL']) and ad['Image URL']:
                    html += f"""<p><img src="{ad['Image URL']}" style="max-width: 300px; max-height: 200px;" /></p>"""

                if 'Ad Link' in ad and pd.notna(ad['Ad Link']) and ad['Ad Link'] and ad.get('Ad Format') != 'Text':
                    html += f"""<p><b>Ad Link:</b> <a href="{ad['Ad Link']}" target="_blank">View Ad</a></p>"""

                html += "</div>"

        # Close the outermost wrapper div.
        html += """
        </div>
        """

        return html

    except Exception as e:
        return f"<h3>Error analyzing data: {str(e)}</h3>"
1254
+
1255
+
1256
#####################################
### COMBINED INTERFACE SECTION #####
#####################################

def create_combined_app():
    """Create the combined Gradio interface with Facebook and Google Ads scrapers.

    Returns:
        A `gr.Blocks` application with one tab per scraper and an About section.
    """

    # Create dropdown choices for advertiser selection ("ID: Name" format,
    # parsed back apart by process_advertiser_search).
    advertiser_choices = [f"{adv['id']}: {adv['name']}" for adv in ADVERTISERS]

    with gr.Blocks(title="Combined Ads Transparency Scraper") as app:
        gr.Markdown("# Combined Ads Transparency Scraper")
        gr.Markdown("## Search for ads from Facebook and Google Ads transparency tools")

        # Create tabs for the two different scrapers
        with gr.Tabs() as tabs:
            # Tab 1: Facebook Ad Library Scraper
            with gr.TabItem("Facebook Ad Library"):
                gr.Markdown("### Facebook Ad Library Search")
                gr.Markdown("Search for ads by brand, domain, or keyword")

                with gr.Row():
                    fb_query_input = gr.Textbox(
                        label="Search Query",
                        placeholder="Enter brand, domain or product name",
                        value=""
                    )
                    fb_search_button = gr.Button("Find Facebook Ads", variant="primary")

                fb_results_output = gr.Textbox(label="Search Results", lines=20)
                fb_save_button = gr.Button("Save Results to JSON")
                fb_save_status = gr.Textbox(label="Save Status", lines=1)

                # Define the save function for Facebook
                def save_fb_results(query, results_text):
                    """Re-run the search and persist the raw ads as JSON."""
                    if not results_text or "No Facebook ads found" in results_text:
                        return "No ads to save"

                    # Get the scraper to fetch fresh ads for JSON format
                    scraper = FacebookAdsScraper(headless=True, debug_mode=False)
                    ads = scraper.fetch_facebook_ads(query)
                    scraper.close()

                    # Save to JSON
                    filename = save_ads_to_json(ads, query)
                    if filename:
                        # Fixed: status previously read "... ads to (unknown)" —
                        # the {filename} placeholder had been lost.
                        return f"Saved {len(ads)} ads to {filename}"
                    else:
                        return "Error saving ads to JSON"

                # Connect Facebook interface components
                fb_search_button.click(
                    fn=fetch_facebook_ads,
                    inputs=[fb_query_input],
                    outputs=[fb_results_output]
                )

                fb_save_button.click(
                    fn=save_fb_results,
                    inputs=[fb_query_input, fb_results_output],
                    outputs=[fb_save_status]
                )

            # Tab 2: Lightsaber Companies Google Ads Scraper
            with gr.TabItem("Google Ads (Lightsaber Companies)"):
                gr.Markdown("### Lightsaber Companies Ads Transparency Scraper")
                gr.Markdown("View Google Ads data for popular lightsaber companies")

                with gr.Row():
                    with gr.Column(scale=3):
                        advertiser_dropdown = gr.Dropdown(
                            choices=advertiser_choices,
                            label="Select Lightsaber Company",
                            info="Choose a company to view their Google Ads data"
                        )

                        with gr.Row():
                            region_dropdown = gr.Dropdown(
                                choices=get_regions_list(),
                                value="GB",  # UK is the default
                                label="Region",
                                info="Choose between Global or UK"
                            )

                            max_ads_slider = gr.Slider(
                                minimum=1,
                                maximum=10,
                                value=5,
                                step=1,
                                label="Max Ads to Retrieve"
                            )

                        search_button = gr.Button("Search Ads", variant="primary")

                    with gr.Column(scale=2):
                        result_message = gr.Markdown(label="Search Result")

                # Tabs for displaying Google Ads search results
                with gr.Tabs() as google_result_tabs:
                    with gr.Tab("Analysis"):
                        analysis_html = gr.HTML()

                    with gr.Tab("Raw Data"):
                        ads_table = gr.DataFrame()

                # State for storing summary info
                summary_info = gr.State()

                # Connect the Google Ads inputs to the output function
                search_button.click(
                    fn=process_advertiser_search,
                    inputs=[advertiser_dropdown, region_dropdown, max_ads_slider],
                    outputs=[result_message, ads_table, analysis_html, summary_info]
                )

        # About section for the combined app
        with gr.Accordion("About This Tool", open=False):
            gr.Markdown("""
            ## About Combined Ads Transparency Scraper

            This tool combines two different ad transparency scrapers:

            1. **Facebook Ad Library Scraper**: Search for any advertiser's ads on Facebook.
            2. **Google Ads Transparency Scraper**: View ads for popular lightsaber companies.

            ### Technical Details

            - The Facebook scraper uses Selenium WebDriver with anti-detection techniques.
            - The Google Ads scraper leverages the Google Ad Transparency API.
            - Both scrapers include adaptive error handling and fallback mechanisms.

            ### Usage Notes

            - Facebook scraping may take 30-60 seconds to complete
            - Search results are not stored permanently
            - Use the "Save Results" button to save data for later analysis

            **Note**: This tool is intended for research and educational purposes only.
            """)

    return app
1397
+
1398
# Main execution: CLI entry point. Supports direct Facebook / Google queries
# for scripted use, or launches the combined Gradio UI when no query is given.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Combined Ads Transparency Scraper")
    parser.add_argument("--headless", action="store_true", default=True, help="Run in headless mode")
    parser.add_argument("--debug", action="store_true", help="Enable debug mode with extra logging")
    parser.add_argument("--fb-query", type=str, help="Facebook search query to run directly without Gradio")
    parser.add_argument("--google-advertiser", type=str, help="Google Ads advertiser ID to run directly without Gradio")
    parser.add_argument("--save", action="store_true", help="Save results to JSON file when using direct query")

    args = parser.parse_args()

    if args.fb_query:
        # Run direct query mode for Facebook
        scraper = FacebookAdsScraper(headless=args.headless, debug_mode=args.debug)
        scraper.check_headless_visibility()

        facebook_ads = scraper.fetch_facebook_ads(args.fb_query)

        # Display results
        print(f"\nFound {len(facebook_ads)} Facebook ads for '{args.fb_query}'")

        if facebook_ads:
            for i, ad in enumerate(facebook_ads):
                print(f"\n--- Ad {i + 1} ---")
                print(f"Platform: {ad['platform']}")
                if 'status' in ad:
                    print(f"Status: {ad['status']}")
                print(f"Advertiser: {ad['advertiser']}")
                print(f"Text: {ad['text']}")
                if ad.get('is_placeholder', False):
                    print("[THIS IS PLACEHOLDER DATA]")

            # Save to JSON if requested
            if args.save:
                filename = save_ads_to_json(facebook_ads, args.fb_query)
                if filename:
                    # Fixed: message previously read "... ads to (unknown)" —
                    # the {filename} placeholder had been lost.
                    print(f"\nSaved {len(facebook_ads)} ads to {filename}")
        else:
            print("No Facebook ads found.")

        scraper.close()

    elif args.google_advertiser:
        # Run direct query mode for Google Ads
        advertiser_id = args.google_advertiser

        # Find advertiser name if it's in our list
        advertiser_name = "Unknown"
        for adv in ADVERTISERS:
            if adv["id"] == advertiser_id:
                advertiser_name = adv["name"]
                break

        print(f"\nSearching for Google Ads from advertiser '{advertiser_name}' (ID: {advertiser_id})")


        # Use a dummy progress object for CLI (mirrors gr.Progress's call shape)
        class DummyProgress:
            def __call__(self, value, desc=None):
                if desc:
                    print(f"{desc} ({value * 100:.0f}%)")


        result_message, ads_df, summary_info = search_by_advertiser_id(
            advertiser_id,
            max_ads=5,
            region="GB",
            progress=DummyProgress(),
            provided_name=advertiser_name
        )

        print(f"\n{result_message}")

        if ads_df is not None and not ads_df.empty:
            print("\nFound ads:")
            for i, (_, ad) in enumerate(ads_df.iterrows()):
                print(f"\n--- Ad {i + 1} ---")
                print(f"Format: {ad.get('Ad Format', 'Unknown')}")
                print(f"Title: {ad.get('Ad Title', 'Unknown')}")
                # str() guards against non-string cells (e.g. NaN floats from
                # pandas), which previously made len() raise TypeError.
                body_text = str(ad.get('Ad Body', 'Unknown'))
                if len(body_text) > 100:
                    body_text = body_text[:100] + "..."
                print(f"Body: {body_text}")
                print(f"Last Shown: {ad.get('Last Shown', 'Unknown')}")
                print(f"Creative ID: {ad.get('Creative Id', 'Unknown')}")
        else:
            print("No Google ads found or error occurred.")

    else:
        # Run Gradio interface
        app = create_combined_app()
        print("Starting Combined Ads Transparency Scraper")
        print("Facebook: Search for any brand or company")
        print("Google Ads: Available lightsaber companies:")
        for adv in ADVERTISERS:
            print(f" - {adv['name']}")
        app.launch()
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ beautifulsoup4>=4.12.3
2
+ langchain>=0.1.16
3
+ langchain_community>=0.0.34
4
+ langchain_core>=0.1.45
5
+ langchain_openai>=0.1.3
6
+ sentence-transformers>=2.7.0
7
+ python-dotenv>=1.0.1
8
+ requests>=2.31.0
9
+ chromadb>=0.4.24
10
+ gradio>=4.27.0
11
+ selenium
12
+ pandas
13
+ python-dateutil
14
+ webdriver-manager