"""Module to crawl the website 'https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20' to fetch and process articles."""
import copy
import json
import urllib.error
import urllib.request
from datetime import datetime, timedelta
from prefect import task, get_run_logger
from controllers.utils import crawl_by_url
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Content-Type": "application/json;charset=UTF-8",
"Origin": "https://www.sohu.com",
"Referer": "https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
"Cookie": "SUV=1741844737337odinqwkn; _ga=GA1.1.827128807.1741845527; IPLOC=CN; cityIpLocation=219.79.103.209;",
}
json_data_template = {
"pvId": "1742131372391_ChMyB2V",
"pageId": "1742131372927_1741844737337odi_MYH",
"mainContent": {
"productType": "13",
"productId": "324",
"secureScore": "5",
"categoryId": "47",
"adTags": "11111111",
"authorId": 121135924,
},
"resourceList": [
{
"tplCompKey": "FeedSlideloadAuthor_2_0_pc_1655965929143_data2",
"isServerRender": False,
"configSource": "mp",
"content": {
"productId": "325",
"productType": "13",
"size": 20,
"page": 1, # The number of pages will be updated dynamically in the loop
"requestId": "1742131372524LxjXrUY_324",
},
"adInfo": {},
"context": {"mkey": "465450"},
}
],
}
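# Note: pvId, pageId, requestId and the Cookie above appear to be values captured
# from a browser session against the Sohu "odin" block-data endpoint; only
# resourceList[0]["content"]["page"] is varied per request in crawl() below,
# while the rest of the payload is sent verbatim.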
def parse_time_string(time_str):
"""
Parses the time string into a datetime object.
Supports formats like "昨天18:41", "2天前", "2025.03.10", etc.
"""
now = datetime.now()
if "分钟前" in time_str:
minutes = int(time_str.replace("分钟前", "").strip())
return now - timedelta(minutes=minutes)
if "小时前" in time_str:
hours = int(time_str.replace("小时前", "").strip())
return now - timedelta(hours=hours)
elif "天前" in time_str:
days = int(time_str.replace("天前", "").strip())
return now - timedelta(days=days)
elif "昨天" in time_str:
time_part = time_str.replace("昨天", "").strip()
return datetime.strptime(f"{(now - timedelta(days=1)).date()} {time_part}", "%Y-%m-%d %H:%M")
elif "前天" in time_str:
time_part = time_str.replace("前天", "").strip()
return datetime.strptime(f"{(datetime.now() - timedelta(days=2)).date()} {time_part}", "%Y-%m-%d %H:%M")
else:
return datetime.strptime(time_str, "%Y.%m.%d")
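# Illustrative examples (assuming parse_time_string is called on 2025-03-17 10:00):
#   parse_time_string("5分钟前")     -> 2025-03-17 09:55
#   parse_time_string("3小时前")     -> 2025-03-17 07:00
#   parse_time_string("昨天18:41")   -> 2025-03-16 18:41
#   parse_time_string("2天前")       -> 2025-03-15 10:00
#   parse_time_string("2025.03.10")  -> 2025-03-10 00:00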
def extract_recent_urls(response_json, delta):
"""
Extracts URLs from the response JSON and filters them based on the given delta (in days).
Args:
response_json (dict): The JSON response from the API.
delta (int): The number of days to look back.
Returns:
list: A list of filtered URLs.
"""
base_url = "http://www.sohu.com"
cutoff_time = datetime.now() - timedelta(days=delta)
recent_urls = []
try:
articles = response_json["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"].get("list", [])
for article in articles:
extra_info = article.get("extraInfoList", [])
time_info = next((info["text"] for info in extra_info if info["image"] == "time"), None)
if time_info:
article_time = parse_time_string(time_info)
if article_time >= cutoff_time:
url = article.get("url", "")
if url:
if not url.startswith("http"):
url = base_url + url
recent_urls.append(url)
    except KeyError as e:
        print(f"Unexpected response structure, missing key: {e}")
return recent_urls
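# Expected response shape, inferred from the parsing above (field values are
# placeholders; the live API payload may contain additional fields):
# {
#   "data": {
#     "FeedSlideloadAuthor_2_0_pc_1655965929143_data2": {
#       "list": [
#         {
#           "url": "/a/123456_121135924",
#           "extraInfoList": [{"image": "time", "text": "昨天18:41"}]
#         }
#       ]
#     }
#   }
# }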
@task(name = "Data Collection - sohu_ccef", log_prints = True)
def crawl(delta):
"""
Crawls the website and collects URLs from the last `delta` days.
Args:
delta (int): The number of days to look back.
Returns:
list: A list of URLs published in the last `delta` days.
"""
logger = get_run_logger()
logger.info("sohu.com")
all_recent_urls = []
url = "https://odin.sohu.com/odin/api/blockdata"
page = 1
while True:
        json_data = copy.deepcopy(json_data_template)  # deep copy: .copy() would share and mutate the nested template dicts
json_data["resourceList"][0]["content"]["page"] = page
data = json.dumps(json_data).encode("utf-8")
req = urllib.request.Request(url, data=data, headers=headers, method="POST")
try:
with urllib.request.urlopen(req) as response:
response_data = response.read().decode("utf-8")
response_json = json.loads(response_data)
recent_urls = extract_recent_urls(response_json, delta)
if not recent_urls: # Stop if no recent URLs are found
logger.info(f"No recent data found on page {page}. Stopping.")
break
all_recent_urls.extend(recent_urls)
logger.info(f"Page {page}: {len(recent_urls)} recent URLs extracted.")
except urllib.error.HTTPError as e:
logger.info(f"HTTP error: {e.code} on page {page}")
logger.info(e.read().decode("utf-8"))
break
except urllib.error.URLError as e:
logger.info(f"URL error: {e.reason} on page {page}")
break
page += 1
for url in all_recent_urls:
print(url)
article = {}
if "http://www.sohu.com" in url:
article['category'] = "Policy Interpretation"
        crawl_by_url(url, article)
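if __name__ == "__main__":
    # Local-run sketch (assumption: Prefect tasks must execute inside a flow or
    # task-run context, so the task is wrapped in a throwaway flow here). The
    # 3-day window is an arbitrary example value; crawl_by_url will fetch and
    # process every matched article, so this performs real network requests.
    from prefect import flow

    @flow
    def _sohu_ccef_test_flow(delta: int = 3):
        crawl(delta)

    _sohu_ccef_test_flow()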