# openai_scraper_playwright.py

import asyncio
import logging
import os
import random

from diskcache import Cache
from dotenv import load_dotenv
from openai import OpenAI
from playwright.async_api import async_playwright
from prometheus_client import start_http_server, Counter, Histogram

load_dotenv()
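
# Configuration read from the environment (typically via the .env file):
#   OPENAI_API_KEY     - required for the analysis step
#   OPENAI_MODEL       - chat model name (default: gpt-4-turbo)
#   MODEL_TEMPERATURE / MODEL_TOP_P / MAX_TOKENS - sampling parameters
#   BROWSER_PATH       - Chromium executable (default: /usr/bin/chromium-browser)
#   MAX_CONTENT_LENGTH - characters of page text kept (default: 30000)
#   RETRY_COUNT        - scraping attempts per URL (default: 2)
#   PROXY_SERVERS      - optional comma-separated proxy list
#   CACHE_EXPIRY       - scrape cache TTL in seconds (default: 3600)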

SCRAPE_ATTEMPTS = Counter('scrape_attempts', 'Total scraping attempts')
SCRAPE_DURATION = Histogram('scrape_duration_seconds', 'Scraping duration in seconds')
cache = Cache('./scraper_cache')

class ScrapingError(Exception): pass
class ContentAnalysisError(Exception): pass

class EnhancedOpenAIScraper:
    API_KEY = os.getenv("OPENAI_API_KEY")
    BROWSER_EXECUTABLE = os.getenv("BROWSER_PATH", "/usr/bin/chromium-browser")
    MAX_CONTENT_LENGTH = int(os.getenv("MAX_CONTENT_LENGTH", 30000))

    def __init__(self, headless=True):
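        # Note: these user-agent strings are truncated in the source; supply
        # full UA values in production.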
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)...",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)..."
        ]
        self.timeout = 45000
        self.retry_count = int(os.getenv("RETRY_COUNT", 2))
        self.headless = headless
        self.proxy_servers = [x.strip() for x in os.getenv("PROXY_SERVERS", "").split(',') if x.strip()]

    async def human_interaction(self, page):
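        # Emulate a human visitor: jittered mouse movement, occasional
        # keyboard input, then a scroll followed by a randomized pause.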
        for _ in range(random.randint(2, 5)):
            x, y = random.randint(0, 1366), random.randint(0, 768)
            await page.mouse.move(x, y, steps=random.randint(5, 20))
            await page.wait_for_timeout(random.randint(50, 200))

        if random.random() < 0.3:
            await page.keyboard.press('Tab')
            await page.keyboard.type(' ', delay=random.randint(50, 200))

        await page.mouse.wheel(0, random.choice([300, 600, 900]))
        await page.wait_for_timeout(random.randint(500, 2000))

    async def load_page(self, page, url):
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=self.timeout)
            # Wait briefly for progressively more generic content containers;
            # a bare query_selector would miss elements rendered after load.
            selectors = ['main article', '#main-content', 'section:first-of-type', 'div[class*="content"]', 'body']
            for selector in selectors:
                try:
                    await page.wait_for_selector(selector, timeout=3000)
                    return True
                except Exception:
                    continue
            # No known container appeared; give dynamic pages one last grace period.
            await page.wait_for_timeout(5000)
            return True
        except Exception as e:
            logging.error(f"Error loading page {url}: {e}")
            return False

    @SCRAPE_DURATION.time()
    async def scrape_with_retry(self, url):
        SCRAPE_ATTEMPTS.inc()
        last_error = None
        async with async_playwright() as p:
            launch_options = {
                "headless": self.headless,
                "args": ["--disable-blink-features=AutomationControlled", "--no-sandbox"],
                "executable_path": self.BROWSER_EXECUTABLE
            }
            # Route traffic through a random proxy when PROXY_SERVERS is configured.
            if self.proxy_servers:
                launch_options["proxy"] = {"server": random.choice(self.proxy_servers)}
            browser = await p.chromium.launch(**launch_options)
            try:
                context = await browser.new_context(user_agent=random.choice(self.user_agents))
                page = await context.new_page()
                # Hide the most common headless-automation fingerprint.
                await page.add_init_script(
                    "Object.defineProperty(navigator, 'webdriver', { get: () => false });"
                )

                for attempt in range(self.retry_count):
                    try:
                        if not await self.load_page(page, url):
                            raise ScrapingError("Failed to load page")
                        await self.human_interaction(page)
                        content = await page.evaluate("() => document.body.innerText")
                        if not content.strip():
                            raise ContentAnalysisError("No content extracted")
                        return content[:self.MAX_CONTENT_LENGTH]
                    except Exception as e:
                        last_error = e
                        if attempt < self.retry_count - 1:
                            await asyncio.sleep(5)
                raise last_error or ScrapingError(f"All attempts failed for {url}")
            finally:
                # Close the browser on success and on every failure path.
                await browser.close()

    async def get_cached_content(self, url):
        # Serve scraped text from the on-disk cache when available so repeated
        # runs against the same URL skip the browser entirely.
        key = 'cache_' + url.replace('https://', '').replace('/', '_')
        content = cache.get(key)
        if content is None:
            content = await self.scrape_with_retry(url)
            cache.set(key, content, expire=int(os.getenv("CACHE_EXPIRY", 3600)))
        return content

async def analyze_content(url="https://openai.com", headless=True):
    scraper = EnhancedOpenAIScraper(headless=headless)
    content = await scraper.get_cached_content(url)
    # Check the key before constructing the client: OpenAI() itself raises
    # when the key is missing, so a check afterwards would never run.
    if not EnhancedOpenAIScraper.API_KEY:
        raise ContentAnalysisError("OpenAI API key not configured")
    client = OpenAI(api_key=EnhancedOpenAIScraper.API_KEY)

    prompt = f"""
Analyze this page:

{content}
    """
    model = os.getenv("OPENAI_MODEL", "gpt-4-turbo")
    temperature = float(os.getenv("MODEL_TEMPERATURE", 0.3))
    max_tokens = int(os.getenv("MAX_TOKENS", 1500))
    top_p = float(os.getenv("MODEL_TOP_P", 0.9))

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a content analyst."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p
    )

    if not response.choices:
        raise ContentAnalysisError("Empty response from GPT")

    return response.choices[0].message.content
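

# A minimal entry point sketching how the pieces fit together: expose the
# Prometheus metrics (otherwise the start_http_server import goes unused)
# and run a single analysis. Port 8000 and the target URL are illustrative
# assumptions, not values fixed by this module.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    start_http_server(8000)  # assumed metrics port; any free port works
    print(asyncio.run(analyze_content("https://openai.com")))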