OxbridgeEconomics committed
Commit 456ce19 · unverified · 2 parents: d8e4f29 f255707

Merge pull request #12 from oxbridge-econ/feature/ccef-data-source

Files changed (4)
  1. controllers/utils.py +4 -1
  2. main.py +2 -2
  3. source/sohu_ccef.py +160 -0
  4. xpath.json +9 -0
controllers/utils.py CHANGED
@@ -668,7 +668,10 @@ def crawl_by_url(url, article):
 
     """
     domain = '.'.join(urlparse(url).netloc.split('.')[1:])
-    req = urllib.request.urlopen(url, timeout=60)
+    headers = {'User-Agent':
+               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
+    req = urllib.request.Request(url, headers=headers)
+    req = urllib.request.urlopen(req, timeout=60)
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
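
The change above swaps the bare urlopen(url) call for a Request object carrying a browser-style User-Agent, presumably so that sites which reject the default Python-urllib agent (sohu.com pages among them) still return the article HTML. A minimal standalone sketch of the same pattern; fetch_html is an illustrative name, not a function in the repository:

import urllib.request
from lxml import etree

BROWSER_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

def fetch_html(url, timeout=60):
    """Illustrative helper mirroring the header-setting logic added above."""
    req = urllib.request.Request(url, headers={"User-Agent": BROWSER_UA})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        html_text = resp.read().decode("utf-8")
    return etree.HTML(html_text)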
main.py CHANGED
@@ -10,10 +10,9 @@ import os
 from dotenv import load_dotenv
 from prefect import flow
 
-from source import cbirc, csrc, eastmoney, gov, mofcom, ndrc, safe, stats, mof
+from source import cbirc, csrc, eastmoney, gov, mofcom, ndrc, safe, stats, mof, sohu_ccef
 # from glue import glue_job_run
 
-
 load_dotenv()
 
 logging.basicConfig(
@@ -40,6 +39,7 @@ def main():
     ndrc.crawl(delta)
     mof.crawl(delta)
     # glue_job_run()
+    sohu_ccef.crawl(delta)
 
 if __name__ == '__main__':
     main()
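
Functionally, the new source just slots into the existing fan-out in main(). A rough sketch of the surrounding Prefect flow, under the assumption that main() is decorated with @flow and that delta (the look-back window in days) is set inside it; neither detail is shown in this diff:

from prefect import flow
from source import mof, ndrc, sohu_ccef  # subset of the real import list

@flow(log_prints=True)  # decorator arguments are an assumption
def main():
    delta = 7  # hypothetical look-back window in days
    ndrc.crawl(delta)
    mof.crawl(delta)
    sohu_ccef.crawl(delta)  # new CCEF/sohu.com source added by this PR

if __name__ == '__main__':
    main()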
source/sohu_ccef.py ADDED
@@ -0,0 +1,160 @@
+"""Module to crawl the website 'https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20' to fetch and process articles."""
+
+import json
+import time
+import urllib.request
+from datetime import datetime, timedelta
+
+from prefect import task, get_run_logger
+from lxml import etree
+
+from controllers.utils import crawl_by_url
+
+headers = {
+    "Accept": "application/json, text/javascript, */*; q=0.01",
+    "Content-Type": "application/json;charset=UTF-8",
+    "Origin": "https://www.sohu.com",
+    "Referer": "https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
+    "Cookie": "SUV=1741844737337odinqwkn; _ga=GA1.1.827128807.1741845527; IPLOC=CN; cityIpLocation=219.79.103.209;",
+}
+
+json_data_template = {
+    "pvId": "1742131372391_ChMyB2V",
+    "pageId": "1742131372927_1741844737337odi_MYH",
+    "mainContent": {
+        "productType": "13",
+        "productId": "324",
+        "secureScore": "5",
+        "categoryId": "47",
+        "adTags": "11111111",
+        "authorId": 121135924,
+    },
+    "resourceList": [
+        {
+            "tplCompKey": "FeedSlideloadAuthor_2_0_pc_1655965929143_data2",
+            "isServerRender": False,
+            "configSource": "mp",
+            "content": {
+                "productId": "325",
+                "productType": "13",
+                "size": 20,
+                "page": 1,  # The page number is updated dynamically in the loop
+                "requestId": "1742131372524LxjXrUY_324",
+            },
+            "adInfo": {},
+            "context": {"mkey": "465450"},
+        }
+    ],
+}
+
+def parse_time_string(time_str):
+    """
+    Parses the time string into a datetime object.
+    Supports formats like "昨天18:41", "2天前", "2025.03.10", etc.
+    """
+    now = datetime.now()
+    if "分钟前" in time_str:
+        minutes = int(time_str.replace("分钟前", "").strip())
+        return now - timedelta(minutes=minutes)
+    if "小时前" in time_str:
+        hours = int(time_str.replace("小时前", "").strip())
+        return now - timedelta(hours=hours)
+    elif "天前" in time_str:
+        days = int(time_str.replace("天前", "").strip())
+        return now - timedelta(days=days)
+    elif "昨天" in time_str:
+        time_part = time_str.replace("昨天", "").strip()
+        return datetime.strptime(f"{(now - timedelta(days=1)).date()} {time_part}", "%Y-%m-%d %H:%M")
+    elif "前天" in time_str:
+        time_part = time_str.replace("前天", "").strip()
+        return datetime.strptime(f"{(datetime.now() - timedelta(days=2)).date()} {time_part}", "%Y-%m-%d %H:%M")
+    else:
+        return datetime.strptime(time_str, "%Y.%m.%d")
+
+def extract_recent_urls(response_json, delta):
+    """
+    Extracts URLs from the response JSON and filters them based on the given delta (in days).
+
+    Args:
+        response_json (dict): The JSON response from the API.
+        delta (int): The number of days to look back.
+
+    Returns:
+        list: A list of filtered URLs.
+    """
+    base_url = "http://www.sohu.com"
+    cutoff_time = datetime.now() - timedelta(days=delta)
+    recent_urls = []
+
+    try:
+        articles = response_json["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"].get("list", [])
+        for article in articles:
+            extra_info = article.get("extraInfoList", [])
+            time_info = next((info["text"] for info in extra_info if info["image"] == "time"), None)
+            if time_info:
+                article_time = parse_time_string(time_info)
+                if article_time >= cutoff_time:
+                    url = article.get("url", "")
+                    if url:
+                        if not url.startswith("http"):
+                            url = base_url + url
+                        recent_urls.append(url)
+    except KeyError as e:
+        print(f"KeyError: {e}")
+
+    return recent_urls
+
+@task(name = "Data Collection - sohu_ccef", log_prints = True)
+def crawl(delta):
+    """
+    Crawls the website and collects URLs from the last `delta` days.
+
+    Args:
+        delta (int): The number of days to look back.
+
+    Returns:
+        list: A list of URLs published in the last `delta` days.
+    """
+    logger = get_run_logger()
+    logger.info("sohu.com")
+    all_recent_urls = []
+    url = "https://odin.sohu.com/odin/api/blockdata"
+
+    page = 1
+    while True:
+        json_data = json_data_template.copy()
+        json_data["resourceList"][0]["content"]["page"] = page
+
+        data = json.dumps(json_data).encode("utf-8")
+        req = urllib.request.Request(url, data=data, headers=headers, method="POST")
+
+        try:
+            with urllib.request.urlopen(req) as response:
+                response_data = response.read().decode("utf-8")
+                response_json = json.loads(response_data)
+
+            recent_urls = extract_recent_urls(response_json, delta)
+            if not recent_urls:  # Stop if no recent URLs are found
+                logger.info(f"No recent data found on page {page}. Stopping.")
+                break
+
+            all_recent_urls.extend(recent_urls)
+            logger.info(f"Page {page}: {len(recent_urls)} recent URLs extracted.")
+
+        except urllib.error.HTTPError as e:
+            logger.info(f"HTTP error: {e.code} on page {page}")
+            logger.info(e.read().decode("utf-8"))
+            break
+        except urllib.error.URLError as e:
+            logger.info(f"URL error: {e.reason} on page {page}")
+            break
+
+        page += 1
+
+    for url in all_recent_urls:
+        print(url)
+        article = {}
+        if "http://www.sohu.com" in url:
+            article['category'] = "Policy Interpretation"
+        crawl_by_url(url, article)
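
As a quick offline sanity check of the new module, the two parsing helpers can be exercised without touching the odin API or a Prefect run context. The payload below is synthetic and only mimics the fields extract_recent_urls reads:

from source.sohu_ccef import parse_time_string, extract_recent_urls

print(parse_time_string("30分钟前"))    # roughly 30 minutes ago
print(parse_time_string("昨天18:41"))   # yesterday at 18:41
print(parse_time_string("2025.03.10"))  # absolute date

fake_response = {
    "data": {
        "FeedSlideloadAuthor_2_0_pc_1655965929143_data2": {
            "list": [
                {
                    "url": "/a/123456789_121135924",  # hypothetical article path
                    "extraInfoList": [{"image": "time", "text": "3小时前"}],
                }
            ]
        }
    }
}
# With a 7-day window the synthetic article above is kept and prefixed with the base URL.
print(extract_recent_urls(fake_response, delta=7))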
xpath.json CHANGED
@@ -82,5 +82,14 @@
     "datetime_format": "%Y-%m-%d %H:%M:%S",
     "siteCN": "中国国家税务总局",
     "site": "State Taxation Administration of China"
+  },
+  "sohu.com":{
+    "title": "//title/text()",
+    "subtitle": "//meta[@name = 'description']/@content",
+    "publishdate": "//meta[@itemprop = 'datePublished']/@content",
+    "content": "//article[contains(@id, 'mp-editor')]//p",
+    "datetime_format": "%Y-%m-%d %H:%M",
+    "siteCN": "首席经济学家论坛",
+    "site": "China Chief Economist Forum"
   }
 }
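
The new sohu.com entry is presumably consumed like the other domains: crawl_by_url looks up the xpath.json block for the URL's domain and applies each XPath with lxml. A rough sketch of that lookup, with extract_fields as an illustrative name (the exact field handling in controllers/utils.py is not shown in this diff):

import json
from urllib.parse import urlparse

from lxml import etree

# Sketch only: shows how a domain-keyed xpath.json entry could be applied to a
# fetched page; the real extraction lives in controllers/utils.crawl_by_url.
with open("xpath.json", encoding="utf-8") as f:
    xpath_config = json.load(f)

def extract_fields(url, html_text):
    domain = '.'.join(urlparse(url).netloc.split('.')[1:])  # e.g. "sohu.com"
    rules = xpath_config[domain]
    page = etree.HTML(html_text)
    title = page.xpath(rules["title"])
    publishdate = page.xpath(rules["publishdate"])
    # "content" selects <p> elements, so join their text content
    paragraphs = page.xpath(rules["content"])
    content = "\n".join(p.xpath("string()") for p in paragraphs)
    return {
        "title": title[0] if title else None,
        "publishdate": publishdate[0] if publishdate else None,
        "content": content,
    }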