"""Module to crawl the website 'https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20' to fetch and process articles."""
import copy
import json
import urllib.error
import urllib.request
from datetime import datetime, timedelta

from prefect import task, get_run_logger

from controllers.utils import crawl_by_url
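# Request headers captured from a browser session on the CCEF profile page; the
# Referer/Origin/Cookie values are presumably needed so the odin API serves the
# feed as it would for a normal page visit.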
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Content-Type": "application/json;charset=UTF-8",
"Origin": "https://www.sohu.com",
"Referer": "https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
"Cookie": "SUV=1741844737337odinqwkn; _ga=GA1.1.827128807.1741845527; IPLOC=CN; cityIpLocation=219.79.103.209;",
}
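# POST body template for Sohu's odin blockdata endpoint. crawl() deep-copies it
# and overwrites resourceList[0]["content"]["page"] before each request.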
json_data_template = {
"pvId": "1742131372391_ChMyB2V",
"pageId": "1742131372927_1741844737337odi_MYH",
"mainContent": {
"productType": "13",
"productId": "324",
"secureScore": "5",
"categoryId": "47",
"adTags": "11111111",
"authorId": 121135924,
},
"resourceList": [
{
"tplCompKey": "FeedSlideloadAuthor_2_0_pc_1655965929143_data2",
"isServerRender": False,
"configSource": "mp",
"content": {
"productId": "325",
"productType": "13",
"size": 20,
"page": 1, # The number of pages will be updated dynamically in the loop
"requestId": "1742131372524LxjXrUY_324",
},
"adInfo": {},
"context": {"mkey": "465450"},
}
],
}
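# Expected response shape, inferred from extract_recent_urls() below (other
# fields omitted):
# {
#     "data": {
#         "FeedSlideloadAuthor_2_0_pc_1655965929143_data2": {
#             "list": [
#                 {
#                     "url": "/a/...",
#                     "extraInfoList": [{"image": "time", "text": "昨倩18:41"}, ...],
#                 },
#             ]
#         }
#     }
# }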
def parse_time_string(time_str):
"""
Parses the time string into a datetime object.
Supports formats like "昨倩18:41", "2倩前", "2025.03.10", etc.
"""
    now = datetime.now()
    if "εˆ†ι’Ÿε‰" in time_str:
        minutes = int(time_str.replace("εˆ†ι’Ÿε‰", "").strip())
        return now - timedelta(minutes=minutes)
    if "小既前" in time_str:
        hours = int(time_str.replace("小既前", "").strip())
        return now - timedelta(hours=hours)
    if "倩前" in time_str:
        days = int(time_str.replace("倩前", "").strip())
        return now - timedelta(days=days)
    if "昨倩" in time_str:
        time_part = time_str.replace("昨倩", "").strip()
        return datetime.strptime(f"{(now - timedelta(days=1)).date()} {time_part}", "%Y-%m-%d %H:%M")
    if "前倩" in time_str:
        time_part = time_str.replace("前倩", "").strip()
        return datetime.strptime(f"{(now - timedelta(days=2)).date()} {time_part}", "%Y-%m-%d %H:%M")
    return datetime.strptime(time_str, "%Y.%m.%d")
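# Illustrative examples (relative results depend on the current time):
#   parse_time_string("5εˆ†ι’Ÿε‰")     -> now minus 5 minutes
#   parse_time_string("3小既前")     -> now minus 3 hours
#   parse_time_string("昨倩18:41")   -> yesterday at 18:41
#   parse_time_string("2025.03.10")  -> datetime(2025, 3, 10, 0, 0)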
def extract_recent_urls(response_json, delta):
"""
Extracts URLs from the response JSON and filters them based on the given delta (in days).
Args:
response_json (dict): The JSON response from the API.
delta (int): The number of days to look back.
Returns:
list: A list of filtered URLs.
"""
base_url = "http://www.sohu.com"
cutoff_time = datetime.now() - timedelta(days=delta)
recent_urls = []
try:
articles = response_json["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"].get("list", [])
for article in articles:
extra_info = article.get("extraInfoList", [])
            time_info = next((info["text"] for info in extra_info if info.get("image") == "time"), None)
if time_info:
article_time = parse_time_string(time_info)
if article_time >= cutoff_time:
url = article.get("url", "")
if url:
if not url.startswith("http"):
url = base_url + url
recent_urls.append(url)
    except KeyError as e:
        print(f"Unexpected response structure, missing key: {e}")
return recent_urls
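# For example, extract_recent_urls(response_json, delta=3) keeps only the URLs
# of articles published within the last 3 days, prefixing relative paths with
# http://www.sohu.com.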
@task(name="Data Collection - sohu_ccef", log_prints=True)
def crawl(delta):
"""
Crawls the website and collects URLs from the last `delta` days.
Args:
delta (int): The number of days to look back.
Returns:
list: A list of URLs published in the last `delta` days.
"""
logger = get_run_logger()
    logger.info("Crawling CCEF articles from sohu.com")
all_recent_urls = []
url = "https://odin.sohu.com/odin/api/blockdata"
page = 1
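    # Walk the paginated feed until a page contributes no URLs inside the
    # look-back window, or a request fails.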
    while True:
        # Deep-copy the template so the shared module-level dict is not mutated across requests
        json_data = copy.deepcopy(json_data_template)
        json_data["resourceList"][0]["content"]["page"] = page
data = json.dumps(json_data).encode("utf-8")
req = urllib.request.Request(url, data=data, headers=headers, method="POST")
try:
with urllib.request.urlopen(req) as response:
response_data = response.read().decode("utf-8")
response_json = json.loads(response_data)
recent_urls = extract_recent_urls(response_json, delta)
if not recent_urls: # Stop if no recent URLs are found
logger.info(f"No recent data found on page {page}. Stopping.")
break
all_recent_urls.extend(recent_urls)
logger.info(f"Page {page}: {len(recent_urls)} recent URLs extracted.")
        except urllib.error.HTTPError as e:
            logger.error(f"HTTP error {e.code} on page {page}")
            logger.error(e.read().decode("utf-8"))
            break
        except urllib.error.URLError as e:
            logger.error(f"URL error {e.reason} on page {page}")
            break
page += 1
    for url in all_recent_urls:
        logger.info(url)
        article = {}
        if "http://www.sohu.com" in url:
            article["category"] = "Policy Interpretation"
        crawl_by_url(url, article)
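

# Illustrative local entry point (an assumption, not part of the original flow):
# wrapping the task in a throwaway Prefect flow gives get_run_logger() a run
# context, so the module can be smoke-tested by hand.
if __name__ == "__main__":
    from prefect import flow

    @flow
    def _sohu_ccef_smoke_test():
        """Crawl the last day of CCEF articles for local debugging."""
        crawl(1)

    _sohu_ccef_smoke_test()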