Commit a13f43a
Parent: 236ef33

Add Data Source [CCEF]

Files changed:
- controllers/utils.py  +13 -8
- main.py                +2 -2
- source/sohu_ccef.py  +160 -0
- xpath.json             +9 -0
controllers/utils.py CHANGED

@@ -22,12 +22,12 @@ import PyPDF2
 from transformers import pipeline
 
 from controllers.summarizer import summarize
-from controllers.vectorizer import vectorize
+# from controllers.vectorizer import vectorize
 
 load_dotenv()
 
-AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
-AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
+# AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
+# AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
 
 analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
 

@@ -671,7 +671,11 @@ def crawl_by_url(url, article):
 
     """
     domain = '.'.join(urlparse(url).netloc.split('.')[1:])
-    req = urllib.request.urlopen(url, timeout=60)
+    headers = {'User-Agent':
+               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
+    # req = urllib.request.urlopen(url, timeout=60)
+    req = urllib.request.Request(url, headers=headers)
+    req = urllib.request.urlopen(req, timeout=60)
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)

@@ -704,7 +708,8 @@ def crawl_by_url(url, article):
         print(f"An unexpected error occurred: {e}")
     article['content'] = repr(contenteng)[1:-1].strip()
     try:
-        article['subtitle'] = summarize(article['content'])
+        # article['subtitle'] = summarize(article['content'])
+        pass
     except (ValueError, KeyError, TypeError):
         article['subtitle'] = translate(summary)
     article['publishDate'] = datemodifier(

@@ -719,8 +724,8 @@ def crawl_by_url(url, article):
         article['titleCN'] + article['publishDate'])
     logging.info("%s - %s", article['id'], article['site'])
     article['referenceid'] = None
-    update_content(article)
-    vectorize(article)
+    # update_content(article)
+    # vectorize(article)
     # openai_vectorize(article)
 
-data = download_files_from_s3('data')
+# data = download_files_from_s3('data')
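Note on the second hunk above: the bare urllib.request.urlopen(url, timeout=60) call is replaced by a Request object that carries a browser User-Agent, since some of the crawled sites (sohu.com among them) reject the default urllib client. A minimal sketch of that fetch pattern, using a hypothetical fetch_html helper that is not part of the repository:

    import urllib.request

    # Hypothetical helper illustrating the Request-with-headers pattern used in crawl_by_url.
    def fetch_html(url, timeout=60):
        headers = {
            # Sites that block default urllib clients usually accept a browser-like User-Agent.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8")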
main.py CHANGED

@@ -10,10 +10,9 @@ import os
 from dotenv import load_dotenv
 from prefect import flow
 
-from source import cbirc, csrc, eastmoney, gov, mofcom, ndrc, safe, stats, mof
+from source import cbirc, csrc, eastmoney, gov, mofcom, ndrc, safe, stats, mof, sohu_ccef
 # from glue import glue_job_run
 
-
 load_dotenv()
 
 logging.basicConfig(

@@ -40,6 +39,7 @@ def main():
     ndrc.crawl(delta)
     mof.crawl(delta)
     # glue_job_run()
+    sohu_ccef.crawl(delta)
 
 if __name__ == '__main__':
     main()
source/sohu_ccef.py ADDED

@@ -0,0 +1,160 @@
+"""Module to crawl the website 'https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20' to fetch and process articles."""
+
+import json
+import time
+import urllib.request
+from datetime import datetime, timedelta
+
+from prefect import task, get_run_logger
+from lxml import etree
+
+from controllers.utils import crawl_by_url
+
+headers = {
+    "Accept": "application/json, text/javascript, */*; q=0.01",
+    "Content-Type": "application/json;charset=UTF-8",
+    "Origin": "https://www.sohu.com",
+    "Referer": "https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
+    "Cookie": "SUV=1741844737337odinqwkn; _ga=GA1.1.827128807.1741845527; IPLOC=CN; cityIpLocation=219.79.103.209;",
+}
+
+json_data_template = {
+    "pvId": "1742131372391_ChMyB2V",
+    "pageId": "1742131372927_1741844737337odi_MYH",
+    "mainContent": {
+        "productType": "13",
+        "productId": "324",
+        "secureScore": "5",
+        "categoryId": "47",
+        "adTags": "11111111",
+        "authorId": 121135924,
+    },
+    "resourceList": [
+        {
+            "tplCompKey": "FeedSlideloadAuthor_2_0_pc_1655965929143_data2",
+            "isServerRender": False,
+            "configSource": "mp",
+            "content": {
+                "productId": "325",
+                "productType": "13",
+                "size": 20,
+                "page": 1,  # The number of pages will be updated dynamically in the loop
+                "requestId": "1742131372524LxjXrUY_324",
+            },
+            "adInfo": {},
+            "context": {"mkey": "465450"},
+        }
+    ],
+}
+
+def parse_time_string(time_str):
+    """
+    Parses the time string into a datetime object.
+    Supports formats like "昨天18:41", "2天前", "2025.03.10", etc.
+    """
+    now = datetime.now()
+    if "分钟前" in time_str:
+        minutes = int(time_str.replace("分钟前", "").strip())
+        return now - timedelta(minutes=minutes)
+    if "小时前" in time_str:
+        hours = int(time_str.replace("小时前", "").strip())
+        return now - timedelta(hours=hours)
+    elif "天前" in time_str:
+        days = int(time_str.replace("天前", "").strip())
+        return now - timedelta(days=days)
+    elif "昨天" in time_str:
+        time_part = time_str.replace("昨天", "").strip()
+        return datetime.strptime(f"{(now - timedelta(days=1)).date()} {time_part}", "%Y-%m-%d %H:%M")
+    elif "前天" in time_str:
+        time_part = time_str.replace("前天", "").strip()
+        return datetime.strptime(f"{(datetime.now() - timedelta(days=2)).date()} {time_part}", "%Y-%m-%d %H:%M")
+    else:
+        return datetime.strptime(time_str, "%Y.%m.%d")
+
+def extract_recent_urls(response_json, delta):
+    """
+    Extracts URLs from the response JSON and filters them based on the given delta (in days).
+
+    Args:
+        response_json (dict): The JSON response from the API.
+        delta (int): The number of days to look back.
+
+    Returns:
+        list: A list of filtered URLs.
+    """
+    base_url = "http://www.sohu.com"
+    cutoff_time = datetime.now() - timedelta(days=delta)
+    recent_urls = []
+
+    try:
+        articles = response_json["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"].get("list", [])
+        for article in articles:
+            extra_info = article.get("extraInfoList", [])
+            time_info = next((info["text"] for info in extra_info if info["image"] == "time"), None)
+            if time_info:
+                article_time = parse_time_string(time_info)
+                if article_time >= cutoff_time:
+                    url = article.get("url", "")
+                    if url:
+                        if not url.startswith("http"):
+                            url = base_url + url
+                        recent_urls.append(url)
+    except KeyError as e:
+        print(f"KeyError: {e}")
+
+    return recent_urls
+
+@task(name = "Data Collection - sohu_ccef", log_prints = True)
+def crawl(delta):
+    """
+    Crawls the website and collects URLs from the last `delta` days.
+
+    Args:
+        delta (int): The number of days to look back.
+
+    Returns:
+        list: A list of URLs published in the last `delta` days.
+    """
+    logger = get_run_logger()
+    logger.info("sohu.com")
+    all_recent_urls = []
+    url = "https://odin.sohu.com/odin/api/blockdata"
+
+    page = 1
+    while True:
+        json_data = json_data_template.copy()
+        json_data["resourceList"][0]["content"]["page"] = page
+
+        data = json.dumps(json_data).encode("utf-8")
+        req = urllib.request.Request(url, data=data, headers=headers, method="POST")
+
+        try:
+            with urllib.request.urlopen(req) as response:
+                response_data = response.read().decode("utf-8")
+                response_json = json.loads(response_data)
+
+                recent_urls = extract_recent_urls(response_json, delta)
+                if not recent_urls:  # Stop if no recent URLs are found
+                    print(f"No recent data found on page {page}. Stopping.")
+                    break
+
+                all_recent_urls.extend(recent_urls)
+                print(f"Page {page}: {len(recent_urls)} recent URLs extracted.")
+
+        except urllib.error.HTTPError as e:
+            print(f"HTTP error: {e.code} on page {page}")
+            print(e.read().decode("utf-8"))
+            break
+        except urllib.error.URLError as e:
+            print(f"URL error: {e.reason} on page {page}")
+            break
+
+        page += 1
+
+    for url in all_recent_urls:
+        print(url)
+        article = {}
+        if "http://www.sohu.com" in url:
+            article['category'] = "Policy Interpretation"
+        crawl_by_url(url, article)
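For reference, a short sketch of the relative Chinese time formats parse_time_string is written to accept. The calls below are illustrative only and not part of the commit; relative strings are resolved against datetime.now(), so the results depend on the current clock.

    from source.sohu_ccef import parse_time_string  # module path as added in this commit

    print(parse_time_string("5分钟前"))     # roughly 5 minutes before now
    print(parse_time_string("2小时前"))     # roughly 2 hours before now
    print(parse_time_string("3天前"))       # roughly 3 days before now
    print(parse_time_string("昨天18:41"))   # yesterday at 18:41
    print(parse_time_string("2025.03.10"))  # absolute date, parsed with "%Y.%m.%d"

Within the flow in main.py, the module is exercised end to end via sohu_ccef.crawl(delta).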
xpath.json CHANGED

@@ -82,5 +82,14 @@
         "datetime_format": "%Y-%m-%d %H:%M:%S",
         "siteCN": "中国国家税务总局",
         "site": "State Taxation Administration of China"
+    },
+    "sohu.com": {
+        "title": "//title/text()",
+        "subtitle": "//meta[@name = 'description']/@content",
+        "publishdate": "//meta[@itemprop = 'datePublished']/@content",
+        "content": "//article[contains(@id, 'mp-editor')]//p",
+        "datetime_format": "%Y-%m-%d %H:%M",
+        "siteCN": "首席经济学家论坛",
+        "site": "China Chief Economist Forum"
     }
 }
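The new sohu.com entry follows the same per-domain layout as the existing blocks, so crawl_by_url can presumably resolve it by the domain computed from the article URL and apply each expression to the fetched page. A rough sketch of that lookup under those assumptions (the consuming code in controllers/utils.py is not shown in this commit, and the article URL below is a placeholder):

    import json
    import urllib.request
    from lxml import etree

    with open("xpath.json", encoding="utf-8") as f:
        xpath_config = json.load(f)

    site = xpath_config["sohu.com"]  # entry added in this commit

    # Fetch an article page with a browser User-Agent, then apply the configured XPaths.
    req = urllib.request.Request(
        "http://www.sohu.com/a/example",  # placeholder URL for illustration
        headers={"User-Agent": "Mozilla/5.0"})
    page = etree.HTML(urllib.request.urlopen(req, timeout=60).read().decode("utf-8"))

    title = page.xpath(site["title"])         # list of text nodes from //title/text()
    paragraphs = page.xpath(site["content"])  # <p> elements under the mp-editor article
    content = "\n".join(p.xpath("string()") for p in paragraphs)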