"""Module to crawl the website 'https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20' to fetch and process articles."""
import copy
import json
import urllib.error
import urllib.request
from datetime import datetime, timedelta
from prefect import task, get_run_logger
from controllers.utils import crawl_by_url
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Content-Type": "application/json;charset=UTF-8",
"Origin": "https://www.sohu.com",
"Referer": "https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
"Cookie": "SUV=1741844737337odinqwkn; _ga=GA1.1.827128807.1741845527; IPLOC=CN; cityIpLocation=219.79.103.209;",
}
json_data_template = {
"pvId": "1742131372391_ChMyB2V",
"pageId": "1742131372927_1741844737337odi_MYH",
"mainContent": {
"productType": "13",
"productId": "324",
"secureScore": "5",
"categoryId": "47",
"adTags": "11111111",
"authorId": 121135924,
},
"resourceList": [
{
"tplCompKey": "FeedSlideloadAuthor_2_0_pc_1655965929143_data2",
"isServerRender": False,
"configSource": "mp",
"content": {
"productId": "325",
"productType": "13",
"size": 20,
"page": 1, # The number of pages will be updated dynamically in the loop
"requestId": "1742131372524LxjXrUY_324",
},
"adInfo": {},
"context": {"mkey": "465450"},
}
],
}
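# Note: pvId, pageId, requestId and the Cookie above appear to be values captured
# from a browser session against the Sohu "odin" block-data endpoint; only
# resourceList[0]["content"]["page"] is varied per request in crawl() below,
# while the rest of the payload is sent verbatim.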
def parse_time_string(time_str):
"""
Parses the time string into a datetime object.
Supports formats like "昨天18:41", "2天前", "2025.03.10", etc.
"""
now = datetime.now()
if "分钟前" in time_str:
minutes = int(time_str.replace("分钟前", "").strip())
return now - timedelta(minutes=minutes)
if "小时前" in time_str:
hours = int(time_str.replace("小时前", "").strip())
return now - timedelta(hours=hours)
elif "天前" in time_str:
days = int(time_str.replace("天前", "").strip())
return now - timedelta(days=days)
elif "昨天" in time_str:
time_part = time_str.replace("昨天", "").strip()
return datetime.strptime(f"{(now - timedelta(days=1)).date()} {time_part}", "%Y-%m-%d %H:%M")
elif "前天" in time_str:
time_part = time_str.replace("前天", "").strip()
return datetime.strptime(f"{(datetime.now() - timedelta(days=2)).date()} {time_part}", "%Y-%m-%d %H:%M")
else:
return datetime.strptime(time_str, "%Y.%m.%d")
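# Illustrative examples (assuming parse_time_string is called on 2025-03-17 10:00):
#   parse_time_string("5分钟前")     -> 2025-03-17 09:55
#   parse_time_string("3小时前")     -> 2025-03-17 07:00
#   parse_time_string("昨天18:41")   -> 2025-03-16 18:41
#   parse_time_string("2天前")       -> 2025-03-15 10:00
#   parse_time_string("2025.03.10")  -> 2025-03-10 00:00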
def extract_recent_urls(response_json, delta):
"""
Extracts URLs from the response JSON and filters them based on the given delta (in days).
Args:
response_json (dict): The JSON response from the API.
delta (int): The number of days to look back.
Returns:
list: A list of filtered URLs.
"""
base_url = "http://www.sohu.com"
cutoff_time = datetime.now() - timedelta(days=delta)
recent_urls = []
try:
articles = response_json["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"].get("list", [])
for article in articles:
extra_info = article.get("extraInfoList", [])
time_info = next((info["text"] for info in extra_info if info["image"] == "time"), None)
if time_info:
article_time = parse_time_string(time_info)
if article_time >= cutoff_time:
url = article.get("url", "")
if url:
if not url.startswith("http"):
url = base_url + url
recent_urls.append(url)
    except KeyError as e:
        print(f"Unexpected response structure, missing key: {e}")
return recent_urls
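# Expected response shape, inferred from the parsing above (field values are
# placeholders; the live API payload may contain additional fields):
# {
#   "data": {
#     "FeedSlideloadAuthor_2_0_pc_1655965929143_data2": {
#       "list": [
#         {
#           "url": "/a/123456_121135924",
#           "extraInfoList": [{"image": "time", "text": "昨天18:41"}]
#         }
#       ]
#     }
#   }
# }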
@task(name = "Data Collection - sohu_ccef", log_prints = True)
def crawl(delta):
"""
Crawls the website and collects URLs from the last `delta` days.
Args:
delta (int): The number of days to look back.
Returns:
list: A list of URLs published in the last `delta` days.
"""
logger = get_run_logger()
logger.info("sohu.com")
all_recent_urls = []
url = "https://odin.sohu.com/odin/api/blockdata"
page = 1
while True:
        json_data = copy.deepcopy(json_data_template)  # deep copy: .copy() would share and mutate the nested template dicts
json_data["resourceList"][0]["content"]["page"] = page
data = json.dumps(json_data).encode("utf-8")
req = urllib.request.Request(url, data=data, headers=headers, method="POST")
try:
with urllib.request.urlopen(req) as response:
response_data = response.read().decode("utf-8")
response_json = json.loads(response_data)
recent_urls = extract_recent_urls(response_json, delta)
if not recent_urls: # Stop if no recent URLs are found
logger.info(f"No recent data found on page {page}. Stopping.")
break
all_recent_urls.extend(recent_urls)
logger.info(f"Page {page}: {len(recent_urls)} recent URLs extracted.")
except urllib.error.HTTPError as e:
logger.info(f"HTTP error: {e.code} on page {page}")
logger.info(e.read().decode("utf-8"))
break
except urllib.error.URLError as e:
logger.info(f"URL error: {e.reason} on page {page}")
break
page += 1
for url in all_recent_urls:
print(url)
article = {}
if "http://www.sohu.com" in url:
article['category'] = "Policy Interpretation"
        crawl_by_url(url, article)
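if __name__ == "__main__":
    # Local-run sketch (assumption: Prefect tasks must execute inside a flow or
    # task-run context, so the task is wrapped in a throwaway flow here). The
    # 3-day window is an arbitrary example value; crawl_by_url will fetch and
    # process every matched article, so this performs real network requests.
    from prefect import flow

    @flow
    def _sohu_ccef_test_flow(delta: int = 3):
        crawl(delta)

    _sohu_ccef_test_flow()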