"""Module to crawl the website 'https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20' to fetch and process articles.""" |
|
|
|
import json |
|
import time |
|
import urllib.request |
|
from datetime import datetime, timedelta |
|
|
|
from prefect import task, get_run_logger |
|
from lxml import etree |
|
|
|
from controllers.utils import crawl_by_url |
|
|
|
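# Request headers for the Sohu "odin" block-data API, mimicking a browser
# session. The Cookie and Referer values appear to come from a captured
# session and may need refreshing if the API starts rejecting requests.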
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Content-Type": "application/json;charset=UTF-8",
    "Origin": "https://www.sohu.com",
    "Referer": "https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
    "Cookie": "SUV=1741844737337odinqwkn; _ga=GA1.1.827128807.1741845527; IPLOC=CN; cityIpLocation=219.79.103.209;",
}

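# POST payload template for the block-data endpoint. Only the nested "page"
# value is changed between requests; the remaining fields (pvId, pageId,
# authorId, tplCompKey, ...) appear to come from a captured browser request
# and identify the author feed to query.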
json_data_template = {
    "pvId": "1742131372391_ChMyB2V",
    "pageId": "1742131372927_1741844737337odi_MYH",
    "mainContent": {
        "productType": "13",
        "productId": "324",
        "secureScore": "5",
        "categoryId": "47",
        "adTags": "11111111",
        "authorId": 121135924,
    },
    "resourceList": [
        {
            "tplCompKey": "FeedSlideloadAuthor_2_0_pc_1655965929143_data2",
            "isServerRender": False,
            "configSource": "mp",
            "content": {
                "productId": "325",
                "productType": "13",
                "size": 20,
                "page": 1,
                "requestId": "1742131372524LxjXrUY_324",
            },
            "adInfo": {},
            "context": {"mkey": "465450"},
        }
    ],
}


def parse_time_string(time_str):
    """
    Parses a Sohu relative-time string into a datetime object.

    Supports formats like "昨天18:41" (yesterday 18:41), "2天前" (2 days ago),
    "2025.03.10", etc.
    """
    now = datetime.now()
    if "分钟前" in time_str:  # "N minutes ago"
        minutes = int(time_str.replace("分钟前", "").strip())
        return now - timedelta(minutes=minutes)
    if "小时前" in time_str:  # "N hours ago"
        hours = int(time_str.replace("小时前", "").strip())
        return now - timedelta(hours=hours)
    if "天前" in time_str:  # "N days ago"
        days = int(time_str.replace("天前", "").strip())
        return now - timedelta(days=days)
    if "昨天" in time_str:  # "yesterday HH:MM"
        time_part = time_str.replace("昨天", "").strip()
        return datetime.strptime(f"{(now - timedelta(days=1)).date()} {time_part}", "%Y-%m-%d %H:%M")
    if "前天" in time_str:  # "the day before yesterday HH:MM"
        time_part = time_str.replace("前天", "").strip()
        return datetime.strptime(f"{(now - timedelta(days=2)).date()} {time_part}", "%Y-%m-%d %H:%M")
    # Absolute dates, e.g. "2025.03.10"
    return datetime.strptime(time_str, "%Y.%m.%d")


def extract_recent_urls(response_json, delta):
    """
    Extracts URLs from the response JSON and filters them based on the given delta (in days).

    Args:
        response_json (dict): The JSON response from the API.
        delta (int): The number of days to look back.

    Returns:
        list: A list of filtered URLs.
    """
    base_url = "http://www.sohu.com"
    cutoff_time = datetime.now() - timedelta(days=delta)
    recent_urls = []

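    # The article list lives under data -> <tplCompKey> -> "list"; each entry's
    # extraInfoList carries a {"image": "time", "text": <relative time>} item.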
    try:
        articles = response_json["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"].get("list", [])
        for article in articles:
            extra_info = article.get("extraInfoList", [])
            time_info = next((info["text"] for info in extra_info if info["image"] == "time"), None)
            if time_info:
                article_time = parse_time_string(time_info)
                if article_time >= cutoff_time:
                    url = article.get("url", "")
                    if url:
                        if not url.startswith("http"):
                            url = base_url + url
                        recent_urls.append(url)
    except KeyError as e:
        print(f"KeyError: {e}")

    return recent_urls


@task(name = "Data Collection - sohu_ccef", log_prints = True) |
|
def crawl(delta): |
|
""" |
|
Crawls the website and collects URLs from the last `delta` days. |
|
|
|
Args: |
|
delta (int): The number of days to look back. |
|
|
|
Returns: |
|
list: A list of URLs published in the last `delta` days. |
|
""" |
|
logger = get_run_logger() |
|
logger.info("sohu.com") |
|
all_recent_urls = [] |
|
url = "https://odin.sohu.com/odin/api/blockdata" |
|
|
|
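    # Page through the author feed until a page yields no articles inside the
    # cutoff window, or a request fails.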
    page = 1
    while True:
        # Deep-copy the template so each request builds an independent payload
        # (a shallow copy would share and mutate the nested "content" dict).
        json_data = copy.deepcopy(json_data_template)
        json_data["resourceList"][0]["content"]["page"] = page

        data = json.dumps(json_data).encode("utf-8")
        req = urllib.request.Request(url, data=data, headers=headers, method="POST")

        try:
            with urllib.request.urlopen(req) as response:
                response_data = response.read().decode("utf-8")
                response_json = json.loads(response_data)

            recent_urls = extract_recent_urls(response_json, delta)
            if not recent_urls:
                logger.info(f"No recent data found on page {page}. Stopping.")
                break

            all_recent_urls.extend(recent_urls)
            logger.info(f"Page {page}: {len(recent_urls)} recent URLs extracted.")

        except urllib.error.HTTPError as e:
            logger.info(f"HTTP error: {e.code} on page {page}")
            logger.info(e.read().decode("utf-8"))
            break
        except urllib.error.URLError as e:
            logger.info(f"URL error: {e.reason} on page {page}")
            break

        page += 1

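    # Hand every recent URL to the shared crawl_by_url helper; articles hosted
    # on the main sohu.com site are tagged as "Policy Interpretation".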
    for url in all_recent_urls:
        print(url)
        article = {}
        if "http://www.sohu.com" in url:
            article["category"] = "Policy Interpretation"
        crawl_by_url(url, article)