# openai_scraper_playwright.py
import asyncio
import logging
import os
import random

from playwright.async_api import async_playwright
from openai import OpenAI
from prometheus_client import start_http_server, Counter, Histogram
from diskcache import Cache
from dotenv import load_dotenv

# Load configuration from a local .env file, if present.
load_dotenv()
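
# Configuration is read entirely from the environment. An illustrative .env
# sketch (the names below are the variables this file actually reads; the
# values are assumptions):
#   OPENAI_API_KEY=sk-...
#   BROWSER_PATH=/usr/bin/chromium-browser
#   RETRY_COUNT=2
#   PROXY_SERVERS=http://proxy1:8080,http://proxy2:8080
#   CACHE_EXPIRY=3600
#   OPENAI_MODEL=gpt-4-turbo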
# Prometheus metrics for scrape volume and latency.
SCRAPE_ATTEMPTS = Counter('scrape_attempts', 'Total scraping attempts')
SCRAPE_DURATION = Histogram('scrape_duration', 'Scraping duration distribution')

# On-disk cache so repeated URLs skip the browser entirely.
cache = Cache('./scraper_cache')

class ScrapingError(Exception): pass
class ContentAnalysisError(Exception): pass

class EnhancedOpenAIScraper:
    API_KEY = os.getenv("OPENAI_API_KEY")
    BROWSER_EXECUTABLE = os.getenv("BROWSER_PATH", "/usr/bin/chromium-browser")
    MAX_CONTENT_LENGTH = int(os.getenv("MAX_CONTENT_LENGTH", 30000))

    def __init__(self, headless=True):
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)...",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)..."
        ]
        self.timeout = 45000  # milliseconds, as Playwright expects
        self.retry_count = int(os.getenv("RETRY_COUNT", 2))
        self.headless = headless
        self.proxy_servers = [x.strip() for x in os.getenv("PROXY_SERVERS", "").split(',') if x.strip()]
    async def human_interaction(self, page):
        """Emit random mouse, keyboard, and scroll events to look human."""
        for _ in range(random.randint(2, 5)):
            x, y = random.randint(0, 1366), random.randint(0, 768)
            await page.mouse.move(x, y, steps=random.randint(5, 20))
            await page.wait_for_timeout(random.randint(50, 200))
        if random.random() < 0.3:
            await page.keyboard.press('Tab')
            await page.keyboard.type(' ', delay=random.randint(50, 200))
        await page.mouse.wheel(0, random.choice([300, 600, 900]))
        await page.wait_for_timeout(random.randint(500, 2000))
    async def load_page(self, page, url):
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=self.timeout)
            # Probe for a likely main-content selector; if none matches,
            # fall back to a fixed wait and proceed anyway.
            selectors = ['main article', '#main-content', 'section:first-of-type', 'div[class*="content"]', 'body']
            for selector in selectors:
                if await page.query_selector(selector):
                    return True
            await page.wait_for_timeout(5000)
            return True
        except Exception as e:
            logging.error(f"Error loading page {url}: {e}")
            return False
    @SCRAPE_DURATION.time()
    async def scrape_with_retry(self, url):
        SCRAPE_ATTEMPTS.inc()
        last_error = None
        try:
            async with async_playwright() as p:
                args = {
                    "headless": self.headless,
                    "args": ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
                    "executable_path": self.BROWSER_EXECUTABLE
                }
                # Route through a random proxy when PROXY_SERVERS is set
                # (the list was collected in __init__ but never applied).
                if self.proxy_servers:
                    args["proxy"] = {"server": random.choice(self.proxy_servers)}
                browser = await p.chromium.launch(**args)
                context = await browser.new_context(user_agent=random.choice(self.user_agents))
                page = await context.new_page()
                # Mask the navigator.webdriver flag that automation exposes.
                await page.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', { get: () => false });
                """)
                for attempt in range(self.retry_count):
                    try:
                        if not await self.load_page(page, url):
                            raise ScrapingError("Failed to load page")
                        await self.human_interaction(page)
                        content = await page.evaluate("""() => document.body.innerText""")
                        if not content.strip():
                            raise ContentAnalysisError("No content extracted")
                        await browser.close()
                        return content[:self.MAX_CONTENT_LENGTH]
                    except Exception as e:
                        last_error = e
                        if attempt < self.retry_count - 1:
                            await asyncio.sleep(5)
                        else:
                            await browser.close()
                            raise
        except Exception as e:
            raise last_error or e
    async def get_cached_content(self, url):
        # Derive a filesystem-safe cache key from the URL.
        key = 'cache_' + url.replace('https://', '').replace('/', '_')
        content = cache.get(key)
        if content is None:
            content = await self.scrape_with_retry(url)
            cache.set(key, content, expire=int(os.getenv("CACHE_EXPIRY", 3600)))
        return content
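
    # Illustrative example (assumed usage): get_cached_content(
    # "https://openai.com/research") stores the scraped text under the
    # key "cache_openai.com_research" until CACHE_EXPIRY seconds elapse.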

async def analyze_content(url="https://openai.com", headless=True):
    # Fail fast if the API key is missing, before launching a browser;
    # OpenAI() itself raises when constructed without a key.
    if not EnhancedOpenAIScraper.API_KEY:
        raise ContentAnalysisError("OpenAI API key not configured")
    scraper = EnhancedOpenAIScraper(headless=headless)
    content = await scraper.get_cached_content(url)
    client = OpenAI(api_key=EnhancedOpenAIScraper.API_KEY)
    prompt = f"""
Analyze this page:

{content}
"""
    model = os.getenv("OPENAI_MODEL", "gpt-4-turbo")
    temperature = float(os.getenv("MODEL_TEMPERATURE", 0.3))
    max_tokens = int(os.getenv("MAX_TOKENS", 1500))
    top_p = float(os.getenv("MODEL_TOP_P", 0.9))
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a content analyst."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p
    )
    if not response.choices:
        raise ContentAnalysisError("Empty response from GPT")
    return response.choices[0].message.content
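

# Minimal entry point: a usage sketch, not part of the original file.
# METRICS_PORT and TARGET_URL are hypothetical variable names added for
# illustration; start_http_server exposes the Prometheus counters declared
# at the top of this module on the assumed port.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    start_http_server(int(os.getenv("METRICS_PORT", 8000)))
    target = os.getenv("TARGET_URL", "https://openai.com")
    print(asyncio.run(analyze_content(url=target)))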