RaymondWongWL committed
Commit a13f43a · Parent: 236ef33

Add Data Source [CCEF]

Files changed (4):
  1. controllers/utils.py +13 -8
  2. main.py +2 -2
  3. source/sohu_ccef.py +160 -0
  4. xpath.json +9 -0
controllers/utils.py CHANGED
@@ -22,12 +22,12 @@ import PyPDF2
 from transformers import pipeline
 
 from controllers.summarizer import summarize
-from controllers.vectorizer import vectorize
+# from controllers.vectorizer import vectorize
 
 load_dotenv()
 
-AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
-AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
+# AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
+# AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
 
 analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
 
@@ -671,7 +671,11 @@ def crawl_by_url(url, article):
 
     """
     domain = '.'.join(urlparse(url).netloc.split('.')[1:])
-    req = urllib.request.urlopen(url, timeout=60)
+    headers = {'User-Agent':
+               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
+    # req = urllib.request.urlopen(url, timeout=60)
+    req = urllib.request.Request(url, headers=headers)
+    req = urllib.request.urlopen(req, timeout=60)
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
@@ -704,7 +708,8 @@ def crawl_by_url(url, article):
         print(f"An unexpected error occurred: {e}")
     article['content'] = repr(contenteng)[1:-1].strip()
     try:
-        article['subtitle'] = summarize(article['content'])
+        # article['subtitle'] = summarize(article['content'])
+        pass
     except (ValueError, KeyError, TypeError):
         article['subtitle'] = translate(summary)
     article['publishDate'] = datemodifier(
@@ -719,8 +724,8 @@ def crawl_by_url(url, article):
         article['titleCN'] + article['publishDate'])
     logging.info("%s - %s", article['id'], article['site'])
     article['referenceid'] = None
-    update_content(article)
-    vectorize(article)
+    # update_content(article)
+    # vectorize(article)
     # openai_vectorize(article)
 
-data = download_files_from_s3('data')
+# data = download_files_from_s3('data')
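The crawl_by_url change above swaps the bare urlopen call for a Request that carries a browser User-Agent, since sohu.com, like many news portals, tends to reject requests without one. A minimal standalone sketch of the same fetch pattern, using a hypothetical helper name (fetch_html) that is not part of the commit:

    import urllib.request

    def fetch_html(url, timeout=60):
        """Fetch a page as UTF-8 text, presenting a desktop-browser User-Agent."""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/91.0.4472.124 Safari/537.36'}
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8")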
main.py CHANGED
@@ -10,10 +10,9 @@ import os
 from dotenv import load_dotenv
 from prefect import flow
 
-from source import cbirc, csrc, eastmoney, gov, mofcom, ndrc, safe, stats, mof
+from source import cbirc, csrc, eastmoney, gov, mofcom, ndrc, safe, stats, mof, sohu_ccef
 # from glue import glue_job_run
 
-
 load_dotenv()
 
 logging.basicConfig(
@@ -40,6 +39,7 @@ def main():
     ndrc.crawl(delta)
     mof.crawl(delta)
     # glue_job_run()
+    sohu_ccef.crawl(delta)
 
 if __name__ == '__main__':
     main()
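On the orchestration side, the new source is wired in as one more task call inside the existing Prefect flow. A minimal sketch of that wiring, assuming main() is the @flow-decorated entry point and that delta (the look-back window in days) is supplied the same way as for the other sources; the flow decorator usage and the default value are illustrative only:

    from prefect import flow
    from source import sohu_ccef

    @flow  # main() is assumed to be the @flow-decorated entry point
    def main(delta: int = 1):  # delta: look-back window in days (illustrative default)
        # ... the other sources (cbirc, csrc, eastmoney, ...) crawl here ...
        sohu_ccef.crawl(delta)  # new CCEF source, runs as a Prefect task

    if __name__ == '__main__':
        main()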
source/sohu_ccef.py ADDED
"""Module to crawl the website 'https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20' to fetch and process articles."""

import json
import time
import urllib.error
import urllib.request
from datetime import datetime, timedelta

from prefect import task, get_run_logger
from lxml import etree

from controllers.utils import crawl_by_url

headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Content-Type": "application/json;charset=UTF-8",
    "Origin": "https://www.sohu.com",
    "Referer": "https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
    "Cookie": "SUV=1741844737337odinqwkn; _ga=GA1.1.827128807.1741845527; IPLOC=CN; cityIpLocation=219.79.103.209;",
}

json_data_template = {
    "pvId": "1742131372391_ChMyB2V",
    "pageId": "1742131372927_1741844737337odi_MYH",
    "mainContent": {
        "productType": "13",
        "productId": "324",
        "secureScore": "5",
        "categoryId": "47",
        "adTags": "11111111",
        "authorId": 121135924,
    },
    "resourceList": [
        {
            "tplCompKey": "FeedSlideloadAuthor_2_0_pc_1655965929143_data2",
            "isServerRender": False,
            "configSource": "mp",
            "content": {
                "productId": "325",
                "productType": "13",
                "size": 20,
                "page": 1,  # updated dynamically in the crawl loop
                "requestId": "1742131372524LxjXrUY_324",
            },
            "adInfo": {},
            "context": {"mkey": "465450"},
        }
    ],
}


def parse_time_string(time_str):
    """
    Parse a time string into a datetime object.
    Supports formats such as "昨天18:41", "2天前" and "2025.03.10".
    """
    now = datetime.now()
    if "分钟前" in time_str:  # "N minutes ago"
        minutes = int(time_str.replace("分钟前", "").strip())
        return now - timedelta(minutes=minutes)
    if "小时前" in time_str:  # "N hours ago"
        hours = int(time_str.replace("小时前", "").strip())
        return now - timedelta(hours=hours)
    if "天前" in time_str:  # "N days ago"
        days = int(time_str.replace("天前", "").strip())
        return now - timedelta(days=days)
    if "昨天" in time_str:  # "yesterday HH:MM"
        time_part = time_str.replace("昨天", "").strip()
        return datetime.strptime(f"{(now - timedelta(days=1)).date()} {time_part}", "%Y-%m-%d %H:%M")
    if "前天" in time_str:  # "the day before yesterday HH:MM"
        time_part = time_str.replace("前天", "").strip()
        return datetime.strptime(f"{(now - timedelta(days=2)).date()} {time_part}", "%Y-%m-%d %H:%M")
    return datetime.strptime(time_str, "%Y.%m.%d")  # absolute date, e.g. "2025.03.10"


def extract_recent_urls(response_json, delta):
    """
    Extract article URLs from the response JSON, keeping only those published within the last `delta` days.

    Args:
        response_json (dict): The JSON response from the API.
        delta (int): The number of days to look back.

    Returns:
        list: A list of filtered URLs.
    """
    base_url = "http://www.sohu.com"
    cutoff_time = datetime.now() - timedelta(days=delta)
    recent_urls = []

    try:
        articles = response_json["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"].get("list", [])
        for article in articles:
            extra_info = article.get("extraInfoList", [])
            time_info = next((info["text"] for info in extra_info if info["image"] == "time"), None)
            if time_info:
                article_time = parse_time_string(time_info)
                if article_time >= cutoff_time:
                    url = article.get("url", "")
                    if url:
                        if not url.startswith("http"):
                            url = base_url + url
                        recent_urls.append(url)
    except KeyError as e:
        print(f"KeyError: {e}")

    return recent_urls


@task(name="Data Collection - sohu_ccef", log_prints=True)
def crawl(delta):
    """
    Crawl the paginated profile feed and process articles published in the last `delta` days.

    Args:
        delta (int): The number of days to look back.
    """
    logger = get_run_logger()
    logger.info("sohu.com")
    all_recent_urls = []
    url = "https://odin.sohu.com/odin/api/blockdata"

    page = 1
    while True:
        # Note: .copy() is shallow, so the page number is written into the shared
        # template's nested dict; harmless here because it is overwritten each iteration.
        json_data = json_data_template.copy()
        json_data["resourceList"][0]["content"]["page"] = page

        data = json.dumps(json_data).encode("utf-8")
        req = urllib.request.Request(url, data=data, headers=headers, method="POST")

        try:
            with urllib.request.urlopen(req) as response:
                response_data = response.read().decode("utf-8")
                response_json = json.loads(response_data)

                recent_urls = extract_recent_urls(response_json, delta)
                if not recent_urls:  # stop once a page yields no recent URLs
                    print(f"No recent data found on page {page}. Stopping.")
                    break

                all_recent_urls.extend(recent_urls)
                print(f"Page {page}: {len(recent_urls)} recent URLs extracted.")

        except urllib.error.HTTPError as e:
            print(f"HTTP error: {e.code} on page {page}")
            print(e.read().decode("utf-8"))
            break
        except urllib.error.URLError as e:
            print(f"URL error: {e.reason} on page {page}")
            break

        page += 1

    for url in all_recent_urls:
        print(url)
        article = {}
        if "http://www.sohu.com" in url:
            article['category'] = "Policy Interpretation"
            crawl_by_url(url, article)
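Since parse_time_string accepts both relative and absolute Chinese time strings, a short illustrative check of the mapping may help (hypothetical inputs, assuming the module and its dependencies are importable):

    from source.sohu_ccef import parse_time_string

    # Expected behaviour for typical feed values (relative values are measured from "now"):
    for sample in ["5分钟前", "3小时前", "2天前", "昨天18:41", "前天09:15", "2025.03.10"]:
        print(sample, "->", parse_time_string(sample))
    # "5分钟前"    -> now minus 5 minutes
    # "昨天18:41"  -> yesterday at 18:41
    # "2025.03.10" -> 2025-03-10 00:00:00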
xpath.json CHANGED
@@ -82,5 +82,14 @@
         "datetime_format": "%Y-%m-%d %H:%M:%S",
         "siteCN": "中国国家税务总局",
         "site": "State Taxation Administration of China"
+    },
+    "sohu.com":{
+        "title": "//title/text()",
+        "subtitle": "//meta[@name = 'description']/@content",
+        "publishdate": "//meta[@itemprop = 'datePublished']/@content",
+        "content": "//article[contains(@id, 'mp-editor')]//p",
+        "datetime_format": "%Y-%m-%d %H:%M",
+        "siteCN": "首席经济学家论坛",
+        "site": "China Chief Economist Forum"
     }
 }
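For context, this entry follows the same per-domain layout as the other xpath.json entries and is presumably consumed by crawl_by_url after it parses the page with etree.HTML (see controllers/utils.py). A simplified standalone sketch of how the "sohu.com" selectors would be evaluated; html_text is assumed to be the already-fetched page source, and the real field assembly in crawl_by_url may differ:

    import json
    from lxml import etree

    with open("xpath.json", encoding="utf-8") as f:
        xp = json.load(f)["sohu.com"]

    page = etree.HTML(html_text)  # html_text: page source fetched beforehand (assumed)
    title = page.xpath(xp["title"])               # e.g. ['<article title>']
    publish_date = page.xpath(xp["publishdate"])  # e.g. ['2025-03-14 10:30'], parsed with datetime_format
    paragraphs = page.xpath(xp["content"])        # <p> elements in the mp-editor article body
    content_cn = "\n".join(p.xpath("string(.)") for p in paragraphs)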