"""Module to crawl the website 'https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20' to fetch and process articles."""

import copy
import json
import urllib.error
import urllib.request
from datetime import datetime, timedelta

from prefect import task, get_run_logger

from controllers.utils import crawl_by_url

headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Content-Type": "application/json;charset=UTF-8",
    "Origin": "https://www.sohu.com",
    "Referer": "https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
    "Cookie": "SUV=1741844737337odinqwkn; _ga=GA1.1.827128807.1741845527; IPLOC=CN; cityIpLocation=219.79.103.209;",
}
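
# Note: the Cookie and User-Agent above appear to come from one captured
# browser session (an assumption based on the hard-coded values); if requests
# start being rejected they may need to be refreshed.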

json_data_template = {
    "pvId": "1742131372391_ChMyB2V",
    "pageId": "1742131372927_1741844737337odi_MYH",
    "mainContent": {
        "productType": "13",
        "productId": "324",
        "secureScore": "5",
        "categoryId": "47",
        "adTags": "11111111",
        "authorId": 121135924,
    },
    "resourceList": [
        {
            "tplCompKey": "FeedSlideloadAuthor_2_0_pc_1655965929143_data2",
            "isServerRender": False,
            "configSource": "mp",
            "content": {
                "productId": "325",
                "productType": "13",
                "size": 20,
                "page": 1,  # The number of pages will be updated dynamically in the loop
                "requestId": "1742131372524LxjXrUY_324",
            },
            "adInfo": {},
            "context": {"mkey": "465450"},
        }
    ],
}
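
# Only resourceList[0]["content"]["page"] changes between requests; crawl()
# below copies this template per request and leaves the module-level dict
# untouched. The remaining IDs (pvId, pageId, requestId, mkey, ...) appear to
# come from a captured session and are sent verbatim.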

def parse_time_string(time_str):
    """
    Parses a time string into a datetime object.
    Supports formats like "5分钟前", "2小时前", "2天前", "昨天18:41", "前天09:30" and "2025.03.10".
    """
    now = datetime.now()
    if "分钟前" in time_str:
        minutes = int(time_str.replace("分钟前", "").strip())
        return now - timedelta(minutes=minutes)
    if "小时前" in time_str:
        hours = int(time_str.replace("小时前", "").strip())
        return now - timedelta(hours=hours)
    if "天前" in time_str:
        days = int(time_str.replace("天前", "").strip())
        return now - timedelta(days=days)
    if "昨天" in time_str:
        time_part = time_str.replace("昨天", "").strip()
        return datetime.strptime(f"{(now - timedelta(days=1)).date()} {time_part}", "%Y-%m-%d %H:%M")
    if "前天" in time_str:
        time_part = time_str.replace("前天", "").strip()
        return datetime.strptime(f"{(now - timedelta(days=2)).date()} {time_part}", "%Y-%m-%d %H:%M")
    # Absolute dates such as "2025.03.10"
    return datetime.strptime(time_str, "%Y.%m.%d")
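
# Illustrative results (relative formats are resolved against the clock at
# call time; the dates shown are examples only):
#   parse_time_string("2025.03.10")  -> datetime(2025, 3, 10, 0, 0)
#   parse_time_string("2天前")        -> roughly datetime.now() - timedelta(days=2)
#   parse_time_string("昨天18:41")    -> yesterday's date at 18:41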

def extract_recent_urls(response_json, delta):
    """
    Extracts URLs from the response JSON and filters them based on the given delta (in days).

    Args:
        response_json (dict): The JSON response from the API.
        delta (int): The number of days to look back.

    Returns:
        list: A list of filtered URLs.
    """
    base_url = "http://www.sohu.com"
    cutoff_time = datetime.now() - timedelta(days=delta)
    recent_urls = []

    try:
        articles = response_json["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"].get("list", [])
        for article in articles:
            extra_info = article.get("extraInfoList", [])
            time_info = next((info.get("text") for info in extra_info if info.get("image") == "time"), None)
            if time_info:
                article_time = parse_time_string(time_info)
                if article_time >= cutoff_time:
                    url = article.get("url", "")
                    if url:
                        if not url.startswith("http"):
                            url = base_url + url
                        recent_urls.append(url)
    except KeyError as e:
        print(f"KeyError: {e}")

    return recent_urls
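
# Abridged shape of the JSON consumed above, reconstructed from the access
# pattern in this function (the URL value is a hypothetical example; keys not
# read here are omitted and may differ):
# {
#   "data": {
#     "FeedSlideloadAuthor_2_0_pc_1655965929143_data2": {
#       "list": [
#         {
#           "url": "/a/xxxxxxxxx_121135924",
#           "extraInfoList": [{"image": "time", "text": "昨天18:41"}]
#         }
#       ]
#     }
#   }
# }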

@task(name="Data Collection - sohu_ccef", log_prints=True)
def crawl(delta):
    """
    Crawls the website and collects URLs from the last `delta` days.

    Args:
        delta (int): The number of days to look back.

    Returns:
        list: A list of URLs published in the last `delta` days.
    """
    logger = get_run_logger()
    logger.info("Start collecting article URLs from sohu.com")
    all_recent_urls = []
    url = "https://odin.sohu.com/odin/api/blockdata"

    page = 1
    while True:
        json_data = copy.deepcopy(json_data_template)  # deep copy so the shared template is never mutated
        json_data["resourceList"][0]["content"]["page"] = page

        data = json.dumps(json_data).encode("utf-8")
        req = urllib.request.Request(url, data=data, headers=headers, method="POST")

        try:
            with urllib.request.urlopen(req) as response:
                response_data = response.read().decode("utf-8")
                response_json = json.loads(response_data)

                recent_urls = extract_recent_urls(response_json, delta)
                if not recent_urls:  # Stop if no recent URLs are found
                    logger.info(f"No recent data found on page {page}. Stopping.")
                    break

                all_recent_urls.extend(recent_urls)
                logger.info(f"Page {page}: {len(recent_urls)} recent URLs extracted.")

        except urllib.error.HTTPError as e:
            logger.info(f"HTTP error: {e.code} on page {page}")
            logger.info(e.read().decode("utf-8"))
            break
        except urllib.error.URLError as e:
            logger.info(f"URL error: {e.reason} on page {page}")
            break

        page += 1

    for article_url in all_recent_urls:
        logger.info(article_url)
        article = {}
        if "http://www.sohu.com" in article_url:
            article["category"] = "Policy Interpretation"
            crawl_by_url(article_url, article)
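
# Minimal local-run sketch (assumption: in production this task is invoked
# from a Prefect flow defined elsewhere). get_run_logger() needs a run
# context, so the task is wrapped in a throwaway flow rather than called
# directly.
if __name__ == "__main__":
    from prefect import flow

    @flow
    def _debug_sohu_ccef(delta: int = 3):
        """Run the collection task once with a short look-back window."""
        crawl(delta)

    _debug_sohu_ccef()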