"""Module to crawl the website 'https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20' to fetch and process articles."""
import copy
import json
import time
import urllib.error
import urllib.request
from datetime import datetime, timedelta

from prefect import task, get_run_logger
from lxml import etree

from controllers.utils import crawl_by_url

# HTTP headers sent with every POST to the feed API.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Content-Type": "application/json;charset=UTF-8",
    "Origin": "https://www.sohu.com",
    "Referer": "https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
    "Cookie": "SUV=1741844737337odinqwkn; _ga=GA1.1.827128807.1741845527; IPLOC=CN; cityIpLocation=219.79.103.209;",
}

# Request payload template. It is never sent directly: `crawl` deep-copies it
# for every page so the nested "page" field can be changed without touching
# this shared module-level object.
json_data_template = {
    "pvId": "1742131372391_ChMyB2V",
    "pageId": "1742131372927_1741844737337odi_MYH",
    "mainContent": {
        "productType": "13",
        "productId": "324",
        "secureScore": "5",
        "categoryId": "47",
        "adTags": "11111111",
        "authorId": 121135924,
    },
    "resourceList": [
        {
            "tplCompKey": "FeedSlideloadAuthor_2_0_pc_1655965929143_data2",
            "isServerRender": False,
            "configSource": "mp",
            "content": {
                "productId": "325",
                "productType": "13",
                "size": 20,
                "page": 1,  # The number of pages will be updated dynamically in the loop
                "requestId": "1742131372524LxjXrUY_324",
            },
            "adInfo": {},
            "context": {"mkey": "465450"},
        }
    ],
}


def _days_ago_at(now, days, clock):
    """Return the datetime `days` days before `now` at the "HH:MM" time `clock`."""
    day = (now - timedelta(days=days)).date()
    return datetime.strptime(f"{day} {clock.strip()}", "%Y-%m-%d %H:%M")


def parse_time_string(time_str):
    """
    Parse a publish-time label into a naive local datetime.

    Supported forms are "N分钟前" (N minutes ago), "N小时前" (N hours ago),
    "N天前" (N days ago), "昨天HH:MM" (yesterday), "前天HH:MM" (the day before
    yesterday) and the absolute date "YYYY.MM.DD".

    Args:
        time_str (str): The label to parse.

    Returns:
        datetime: The parsed time, relative to `datetime.now()` for the
        relative forms.

    Raises:
        ValueError: If the label matches none of the supported forms.
    """
    # A single reference instant keeps every branch consistent.
    now = datetime.now()
    if "分钟前" in time_str:
        return now - timedelta(minutes=int(time_str.replace("分钟前", "").strip()))
    if "小时前" in time_str:
        return now - timedelta(hours=int(time_str.replace("小时前", "").strip()))
    if "天前" in time_str:
        return now - timedelta(days=int(time_str.replace("天前", "").strip()))
    if "昨天" in time_str:
        return _days_ago_at(now, 1, time_str.replace("昨天", ""))
    if "前天" in time_str:
        return _days_ago_at(now, 2, time_str.replace("前天", ""))
    return datetime.strptime(time_str.strip(), "%Y.%m.%d")


def extract_recent_urls(response_json, delta):
    """
    Extract article URLs from the response JSON, keeping those published
    within the last `delta` days.

    Articles without a time label, with a time label that cannot be parsed,
    or without a URL are skipped. Relative URLs are prefixed with
    "http://www.sohu.com".

    Args:
        response_json (dict): The JSON response from the API.
        delta (int): The number of days to look back.

    Returns:
        list: A list of filtered URLs (empty if the expected keys are missing).
    """
    base_url = "http://www.sohu.com"
    cutoff_time = datetime.now() - timedelta(days=delta)
    recent_urls = []

    # Only the envelope lookup is guarded: a malformed envelope means there is
    # nothing to extract, whereas a malformed article should not discard the
    # rest of the page.
    try:
        feed = response_json["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"]
    except KeyError as e:
        print(f"KeyError: {e}")
        return recent_urls

    for article in feed.get("list") or []:
        extra_info = article.get("extraInfoList") or []
        # The publish time is the entry tagged with image == "time".
        time_info = next(
            (info.get("text") for info in extra_info if info.get("image") == "time"),
            None,
        )
        if not time_info:
            continue
        try:
            article_time = parse_time_string(time_info)
        except ValueError as e:
            # Unknown label format: skip this article rather than abort the crawl.
            print(f"Unparseable publish time {time_info!r}: {e}")
            continue
        if article_time < cutoff_time:
            continue
        url = article.get("url", "")
        if not url:
            continue
        if not url.startswith("http"):
            url = base_url + url
        recent_urls.append(url)
    return recent_urls


@task(name="Data Collection - sohu_ccef", log_prints=True)
def crawl(delta):
    """
    Crawl the feed API page by page and process articles from the last
    `delta` days.

    Pagination stops at the first page that yields no recent URLs, or on the
    first HTTP/URL/JSON error. Every collected URL is then printed and passed
    to `crawl_by_url`.

    Args:
        delta (int): The number of days to look back.

    Returns:
        list: A list of URLs published in the last `delta` days.
    """
    logger = get_run_logger()
    logger.info("sohu.com")
    all_recent_urls = []
    api_url = "https://odin.sohu.com/odin/api/blockdata"
    page = 1
    while True:
        # Deep copy: "page" lives in a nested dict, so a shallow copy would
        # write through to the shared module-level template.
        json_data = copy.deepcopy(json_data_template)
        json_data["resourceList"][0]["content"]["page"] = page
        data = json.dumps(json_data).encode("utf-8")
        req = urllib.request.Request(api_url, data=data, headers=headers, method="POST")
        try:
            with urllib.request.urlopen(req) as response:
                response_data = response.read().decode("utf-8")
            response_json = json.loads(response_data)
        except urllib.error.HTTPError as e:
            logger.info(f"HTTP error: {e.code} on page {page}")
            logger.info(e.read().decode("utf-8"))
            break
        except urllib.error.URLError as e:
            logger.info(f"URL error: {e.reason} on page {page}")
            break
        except json.JSONDecodeError as e:
            logger.info(f"Invalid JSON on page {page}: {e}")
            break

        recent_urls = extract_recent_urls(response_json, delta)
        if not recent_urls:  # Stop if no recent URLs are found
            logger.info(f"No recent data found on page {page}. Stopping.")
            break
        all_recent_urls.extend(recent_urls)
        logger.info(f"Page {page}: {len(recent_urls)} recent URLs extracted.")
        page += 1

    for article_url in all_recent_urls:
        print(article_url)
        article = {}
        # NOTE(review): the original layout was lost; crawl_by_url is assumed
        # to run only for URLs that match this prefix - confirm.
        if "http://www.sohu.com" in article_url:
            article['category'] = "Policy Interpretation"
            crawl_by_url(article_url, article)
    return all_recent_urls