"""Module to crawl the website 'https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20' to fetch and process articles."""
import copy
import json
import urllib.error
import urllib.request
from datetime import datetime, timedelta

from prefect import task, get_run_logger

from controllers.utils import crawl_by_url
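# Request headers captured from a browser session on the CCEF profile page; the
# Referer/Origin/Cookie values are presumably needed so the odin API serves the
# feed as it would for a normal page visit.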
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Content-Type": "application/json;charset=UTF-8",
"Origin": "https://www.sohu.com",
"Referer": "https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
"Cookie": "SUV=1741844737337odinqwkn; _ga=GA1.1.827128807.1741845527; IPLOC=CN; cityIpLocation=219.79.103.209;",
}
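# POST body template for Sohu's odin blockdata endpoint. crawl() deep-copies it
# and overwrites resourceList[0]["content"]["page"] before each request.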
json_data_template = {
"pvId": "1742131372391_ChMyB2V",
"pageId": "1742131372927_1741844737337odi_MYH",
"mainContent": {
"productType": "13",
"productId": "324",
"secureScore": "5",
"categoryId": "47",
"adTags": "11111111",
"authorId": 121135924,
},
"resourceList": [
{
"tplCompKey": "FeedSlideloadAuthor_2_0_pc_1655965929143_data2",
"isServerRender": False,
"configSource": "mp",
"content": {
"productId": "325",
"productType": "13",
"size": 20,
"page": 1, # The number of pages will be updated dynamically in the loop
"requestId": "1742131372524LxjXrUY_324",
},
"adInfo": {},
"context": {"mkey": "465450"},
}
],
}
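# Expected response shape, inferred from extract_recent_urls() below (other
# fields omitted):
# {
#     "data": {
#         "FeedSlideloadAuthor_2_0_pc_1655965929143_data2": {
#             "list": [
#                 {
#                     "url": "/a/...",
#                     "extraInfoList": [{"image": "time", "text": "昨倩18:41"}, ...],
#                 },
#             ]
#         }
#     }
# }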
def parse_time_string(time_str):
"""
Parses the time string into a datetime object.
Supports formats like "昨倩18:41", "2倩前", "2025.03.10", etc.
"""
    now = datetime.now()
    if "εˆ†ι’Ÿε‰" in time_str:
        minutes = int(time_str.replace("εˆ†ι’Ÿε‰", "").strip())
        return now - timedelta(minutes=minutes)
    if "小既前" in time_str:
        hours = int(time_str.replace("小既前", "").strip())
        return now - timedelta(hours=hours)
    if "倩前" in time_str:
        days = int(time_str.replace("倩前", "").strip())
        return now - timedelta(days=days)
    if "昨倩" in time_str:
        time_part = time_str.replace("昨倩", "").strip()
        return datetime.strptime(f"{(now - timedelta(days=1)).date()} {time_part}", "%Y-%m-%d %H:%M")
    if "前倩" in time_str:
        time_part = time_str.replace("前倩", "").strip()
        return datetime.strptime(f"{(now - timedelta(days=2)).date()} {time_part}", "%Y-%m-%d %H:%M")
    return datetime.strptime(time_str, "%Y.%m.%d")
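# Illustrative examples (relative results depend on the current time):
#   parse_time_string("5εˆ†ι’Ÿε‰")     -> now minus 5 minutes
#   parse_time_string("3小既前")     -> now minus 3 hours
#   parse_time_string("昨倩18:41")   -> yesterday at 18:41
#   parse_time_string("2025.03.10")  -> datetime(2025, 3, 10, 0, 0)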
def extract_recent_urls(response_json, delta):
"""
Extracts URLs from the response JSON and filters them based on the given delta (in days).
Args:
response_json (dict): The JSON response from the API.
delta (int): The number of days to look back.
Returns:
list: A list of filtered URLs.
"""
base_url = "http://www.sohu.com"
cutoff_time = datetime.now() - timedelta(days=delta)
recent_urls = []
try:
articles = response_json["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"].get("list", [])
for article in articles:
extra_info = article.get("extraInfoList", [])
            time_info = next((info["text"] for info in extra_info if info.get("image") == "time"), None)
if time_info:
article_time = parse_time_string(time_info)
if article_time >= cutoff_time:
url = article.get("url", "")
if url:
if not url.startswith("http"):
url = base_url + url
recent_urls.append(url)
    except KeyError as e:
        print(f"Unexpected response structure, missing key: {e}")
return recent_urls
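# For example, extract_recent_urls(response_json, delta=3) keeps only the URLs
# of articles published within the last 3 days, prefixing relative paths with
# http://www.sohu.com.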
@task(name="Data Collection - sohu_ccef", log_prints=True)
def crawl(delta):
"""
Crawls the website and collects URLs from the last `delta` days.
Args:
delta (int): The number of days to look back.
Returns:
list: A list of URLs published in the last `delta` days.
"""
logger = get_run_logger()
    logger.info("Crawling CCEF articles from sohu.com")
all_recent_urls = []
url = "https://odin.sohu.com/odin/api/blockdata"
page = 1
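    # Walk the paginated feed until a page contributes no URLs inside the
    # look-back window, or a request fails.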
    while True:
        # Deep-copy the template so the shared module-level dict is not mutated across requests
        json_data = copy.deepcopy(json_data_template)
        json_data["resourceList"][0]["content"]["page"] = page
data = json.dumps(json_data).encode("utf-8")
req = urllib.request.Request(url, data=data, headers=headers, method="POST")
try:
with urllib.request.urlopen(req) as response:
response_data = response.read().decode("utf-8")
response_json = json.loads(response_data)
recent_urls = extract_recent_urls(response_json, delta)
if not recent_urls: # Stop if no recent URLs are found
logger.info(f"No recent data found on page {page}. Stopping.")
break
all_recent_urls.extend(recent_urls)
logger.info(f"Page {page}: {len(recent_urls)} recent URLs extracted.")
        except urllib.error.HTTPError as e:
            logger.error(f"HTTP error {e.code} on page {page}")
            logger.error(e.read().decode("utf-8"))
            break
        except urllib.error.URLError as e:
            logger.error(f"URL error {e.reason} on page {page}")
            break
page += 1
    for url in all_recent_urls:
        logger.info(url)
        article = {}
        if "http://www.sohu.com" in url:
            article["category"] = "Policy Interpretation"
        crawl_by_url(url, article)
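

# Illustrative local entry point (an assumption, not part of the original flow):
# wrapping the task in a throwaway Prefect flow gives get_run_logger() a run
# context, so the module can be smoke-tested by hand.
if __name__ == "__main__":
    from prefect import flow

    @flow
    def _sohu_ccef_smoke_test():
        """Crawl the last day of CCEF articles for local debugging."""
        crawl(1)

    _sohu_ccef_smoke_test()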