OxbridgeEconomics committed
Commit 8925fd4 · Parent(s): efcd6b8
Commit message: commit

Files changed:
- daily.py (+3, -1)
- patterns.json (+380, -0)
- utils.py (+158, -24)
daily.py
CHANGED

@@ -14,7 +14,8 @@ from utils import (encode,
                    crawl,
                    datemodifier,
                    encode_content,
-                   update_content)
+                   update_content,
+                   extract_reference)
 
 with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)

@@ -161,6 +162,7 @@ def crawl_eastmoney(url, article):
     article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
     article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n",""))
+    extract_reference(article)
     update_content(article)
 
 today = datetime.today().strftime('%Y-%m-%d')
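The change wires reference extraction into the per-article flow of crawl_eastmoney: after sentiment scoring, the article dict is handed to extract_reference before update_content persists it. A minimal sketch of that ordering, with the crawler-specific parts stubbed out (the wrapper function and the import locations are assumptions, not part of the commit):

    import uuid
    # Assumed imports; the diff only shows that these names are used in daily.py,
    # and sentiment_computation's actual module is not visible in this commit.
    from utils import (datemodifier, sentiment_computation,
                       update_content, extract_reference)

    def finalize_article(article, content_cn, datetime_format):
        # Same ordering as crawl_eastmoney after this commit.
        article['publishDate'] = datemodifier(article['publishDate'], datetime_format)
        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID,
                                   article['titleCN'] + article['publishDate'])
        article['sentimentScore'], article['sentimentLabel'] = \
            sentiment_computation(content_cn.replace("\n", ""))
        extract_reference(article)   # new: look up reports cited in the attached PDF
        update_content(article)      # then upsert the article itself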
patterns.json
ADDED
@@ -0,0 +1,380 @@
[
    {
        "site": "Guosen Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关研究报告",
        "article_regex": "《(.*?)》",
        "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
        "date_format": "%Y-%m-%d",
        "split": [{"string": "-", "index": -1}]
    },
    {
        "site": "Soochow Securities Co., Ltd.",
        "pages": [0],
        "date_range": 2,
        "keyword": "相关研究",
        "article_regex": "《(.*?)》",
        "date_regex": "\\b\\d{4}-\\d{2}-\\d{2}|\\d{4} -\\d{2}-\\d{2}\\b",
        "date_format": "%Y-%m-%d",
        "split": [
            {"string": "-", "index": 0},
            {"string": "—", "index": 0}
        ]
    },
    {
        "site": "BOCI Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关研究报告",
        "article_regex": "《(.*?)》",
        "date_regex": "20\\d{6}|20\\d{5}\\s{1}\\d{1}",
        "date_format": "%Y%m%d"
    },
    {
        "site": "Tianfeng Securities Co., Ltd.",
        "pages": [0],
        "date_range": 3,
        "keyword": "相关报告",
        "article_regex": " 《(.*?)》",
        "date_regex": "\\b\\d{4}-\\d{2}-\\d{2}|\\d{4} -\\d{2}-\\d{2}\\b",
        "date_format": "%Y-%m-%d",
        "remove": ["宏观报告:", "宏观-", "宏观报告-", "——"],
        "split": [
            {"string": ":", "index": 1},
            {"string": "-", "index": 0},
            {"string": "(", "index": 1}
        ]
    },
    {
        "site": "Kaiyuan Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": " ",
        "article_regex": " 《(.*?)》",
        "date_regex": "\\b\\d{4}\\.\\d{1,2}\\.\\d{1,2}\\b",
        "date_format": "%Y.%m.%d",
        "split": [{"string": "—", "index": 1}]
    },
    {
        "site": "Huafu Securities Co., Ltd.",
        "pages": [0],
        "date_range": 4,
        "keyword": "相关报告",
        "article_regex": "《(.*?)》",
        "date_regex": "20\\d{2}\\s?\\.\\s?\\d{1}\\s?\\d{1}\\s?\\.\\s?\\d{1,2}",
        "date_format": "%Y.%m.%d",
        "split": [
            {"string": ":", "index": 1},
            {"string": "——", "index": 0}
        ]
    },
    {
        "site": "Minsheng Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关研究",
        "article_regex": "\\.(.*?)\\-",
        "date_regex": "20\\d{2}\\/\\d{2}\\/\\d{2}",
        "date_format": "%Y/%m/%d",
        "split": [{"string": ":", "index": 1}]
    },
    {
        "site": "Guolian Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关报告 ",
        "article_regex": "《(.*?)》",
        "date_regex": "[》 ]20\\d{2}\\.\\d{2}\\.\\d{2}",
        "date_format": "%Y.%m.%d",
        "split": [{"string": ":", "index": 0}]
    },
    {
        "site": "Southwest Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关研究",
        "article_regex": "\\.(.*?)\\(",
        "date_regex": "(20\\d{2}\\s?-\\d{2}\\-\\d{2})",
        "date_format": "%Y-%m-%d"
    },
    {
        "site": "Guangdong Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "近期报告",
        "article_regex": "《(.*?)》",
        "date_regex": "20\\d{2}\\s?-\\d{2}\\-\\d{2}",
        "date_format": "%Y-%m-%d"
    },
    {
        "site": "China Post Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "近期研究报告",
        "article_regex": "《(.*?)》",
        "date_regex": "20\\d{2}\\s?.\\d{2}\\.\\d{2}",
        "date_format": "%Y.%m.%d",
        "split": [
            {"string": "-", "index": 1},
            {"string": "——", "index": 0}
        ]
    },
    {
        "site": "Shanxi Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": " ",
        "article_regex": "】(.*?)\\(",
        "date_regex": "20\\d{2}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2} ",
        "date_format": "%Y.%m.%d"
    },
    {
        "site": "Shanghai Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "Table_Rep",
        "article_regex": "《(.*?)》",
        "date_regex": "20\\d{2}年\\d{2}月\\d{2}",
        "date_format": "%Y年%m月%d"
    },
    {
        "site": "Guoyuan Securities Co., Ltd.",
        "pages": [0],
        "date_range": 2,
        "keyword": "[Table_Report]",
        "article_regex": "《(.*?)》 ",
        "date_regex": " 20\\d{2}.\\d{2}.\\d{2} ",
        "date_format": "%Y.%m.%d",
        "split": [{"string": ":", "index": 0}]
    },
    {
        "site": "Mago Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关研究",
        "article_regex": "《(.*?)》",
        "date_regex": "20\\d{2}\\s?.\\s?\\d{2}\\s?.\\s?\\d{2} ",
        "date_format": "%Y.%m.%d",
        "split": [{"string": "(", "index": 0}]
    },
    {
        "site": "Fed Securities, Inc.",
        "pages": [0],
        "date_range": 3,
        "keyword": "相关报告",
        "article_regex": ":(.*?)20",
        "date_regex": "20\\d{2}\\s?.\\s?\\d{2}\\s?.\\s?\\d{2}",
        "date_format": "%Y.%m.%d"
    },
    {
        "site": "Huabao Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关研究报告",
        "article_regex": "《(.*?)》",
        "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
        "date_format": "%Y-%m-%d"
    },
    {
        "site": "Ruitingdog (Shenzhen) Information Technology Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "近期研究",
        "article_regex": ":(.*?)-",
        "date_regex": "\\d{4}\\s?/\\s?\\d{1,2}\\s?/\\s?\\d{1,2}",
        "date_format": "%Y/%m/%d"
    },
    {
        "site": "Oriental Fortune Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关研究",
        "article_regex": "《(.*?)》",
        "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
        "date_format": "%Y.%m.%d"
    },
    {
        "site": "Yongxing Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关报告:",
        "article_regex": "《(.*?)》",
        "date_regex": "—— \\d{4}\\s?年\\s?\\d{1,2}\\s?月\\s?\\d{1,2}",
        "date_format": "——%Y年%m月%d"
    },
    {
        "site": "Minmetals Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关研究",
        "article_regex": "《(.*?)》",
        "date_regex": "(d{4}\\s/\\d{2}/\\d{2}) ",
        "date_format": "(%Y/%m/%d) "
    },
    {
        "site": "Hualong Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关阅读",
        "article_regex": "《(.*?)》",
        "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
        "date_format": "%Y.%m.%d"
    },
    {
        "site": "Hebei Yuanda Information Technology Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关报告:",
        "article_regex": "《(.*?)》",
        "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
        "date_format": "%Y.%m.%d"
    },
    {
        "site": "Huaxin Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关研究",
        "article_regex": "《(.*?)》",
        "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
        "date_format": "%Y-%m-%d"
    },
    {
        "site": "Far East Credit Rating Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "1.",
        "article_regex": "《(.*?)》",
        "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
        "date_format": "%Y.%m.%d"
    },
    {
        "site": "Beijing Tengjing Big Data Application Technology Research Institute",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关报告",
        "article_regex": "《(.*?)》",
        "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
        "date_format": "%Y-%m-%d"
    },
    {
        "site": "Wanhe Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关报告",
        "article_regex": "《(.*?)》",
        "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
        "date_format": "%Y-%m-%d"
    },
    {
        "site": "Centaline Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关报告",
        "article_regex": "《(.*?)》",
        "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
        "date_format": "%Y-%m-%d"
    },
    {
        "site": "Tengjing Digital Research",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关报告",
        "article_regex": "《(.*?)》",
        "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
        "date_format": "%Y-%m-%d"
    },
    {
        "site": "Guoyuan Securities",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关研究报告",
        "article_regex": "《(.*?)》",
        "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
        "date_format": "%Y.%m.%d"
    },
    {
        "site": "China Galaxy Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关报告",
        "article_regex": "《(.*?)》",
        "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
        "date_format": "%Y-%m-%d"
    },
    {
        "site": "Shengang Securities Co., Ltd.",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关报告",
        "article_regex": "《(.*?)》",
        "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
        "date_format": "%Y-%m-%d"
    },
    {
        "site": "SDIC Anxin Futures",
        "pages": [0],
        "date_range": 1,
        "keyword": "相关报告",
        "article_regex": "《(.*?)》",
        "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
        "date_format": "%Y-%m-%d"
    }
]
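Judging by how extract_reference in utils.py (below) consumes these entries, each one describes how to pull "related report" references out of one publisher's PDF layout: keyword marks where the reference list starts, article_regex captures the cited titles (usually between 《 and 》), date_regex and date_format recover the cited report's publication date, date_range widens the date window used when matching against already-crawled articles, and remove/split trim boilerplate from the captured titles. A minimal sketch of applying one entry to already-extracted PDF text (the sample text and the standalone snippet are illustrative, not part of the commit):

    import re
    from datetime import datetime

    pattern = {                      # the Guosen entry from patterns.json
        "keyword": "相关研究报告",
        "article_regex": "《(.*?)》",
        "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
        "date_format": "%Y-%m-%d",
        "split": [{"string": "-", "index": -1}],
    }

    text = "本报告分析了CPI走势。相关研究报告 《宏观经济专题-通胀展望》 2024-05-10"  # made-up sample

    section = text.split(pattern["keyword"], 1)[1]          # text after the keyword anchor
    titles = re.findall(pattern["article_regex"], section)  # ['宏观经济专题-通胀展望']
    dates = [datetime.strptime(d, pattern["date_format"])
             for d in re.findall(pattern["date_regex"], section)]
    for rule in pattern.get("split", []):                   # keep only one side of '-'
        titles = [t.split(rule["string"])[rule["index"]] if rule["string"] in t else t
                  for t in titles]
    print(titles, dates)   # ['通胀展望'] and the parsed 2024-05-10 date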
utils.py
CHANGED
@@ -1,12 +1,15 @@
 """Utilis Functions"""
 import os
+import re
 import json
 import uuid
 import time
+import glob
 import urllib.request
 from urllib.parse import urlparse
-from datetime import datetime
+from datetime import datetime, timedelta
 from decimal import Decimal
+import pandas as pd
 import requests
 import boto3
 from lxml import etree

@@ -26,6 +29,136 @@ translator = Translator()
 with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
 
+with open('xpath.json', 'r', encoding='UTF-8') as f:
+    patterns = json.load(f)
+
+def get_client_connection():
+    """Get dynamoDB connection"""
+    dynamodb = boto3.client(
+        service_name='dynamodb',
+        region_name='us-east-1',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+    )
+    return dynamodb
+
+def update_reference(report):
+    dynamodb = get_client_connection()
+    response = dynamodb.update_item(
+        TableName="reference_china",
+        Key={
+            'id': {'S': str(report['refID'])},
+            'sourceID': {'S': report['sourceID']}
+        },
+        UpdateExpression='SET link = :link, referenceID = :referenceID, LastModifiedDate = :LastModifiedDate',
+        ExpressionAttributeValues={
+            ':link': {'S': report['link']},
+            ':referenceID': {'S': report['referenceID']},
+            ':LastModifiedDate': {'S': datetime.now().strftime("%Y-%m-%dT%H:%M:%S")},
+        }
+    )
+    print(response)
+
+def download_files_from_s3(folder):
+    """Download Data Files"""
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+    client = boto3.client(
+        's3',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+    )
+    response = client.list_objects_v2(Bucket='china-securities-report', Prefix=f"{folder}/")
+    for obj in response['Contents']:
+        key = obj['Key']
+        if key.endswith('.parquet'):
+            client.download_file('china-securities-report', key, key)
+    file_paths = glob.glob(os.path.join(folder, '*.parquet'))
+    return pd.concat([pd.read_parquet(file_path) for file_path in file_paths], ignore_index=True)
+
+def extract_from_pdf_by_pattern(url, pattern):
+    # Send a GET request to the URL and retrieve the PDF content
+    try:
+        response = requests.get(url, timeout=60)
+        pdf_content = response.content
+        # Save the PDF content to a local file
+        with open("downloaded_file.pdf", "wb") as file:
+            file.write(pdf_content)
+
+        # Open the downloaded PDF file and extract the text
+        with open("downloaded_file.pdf", "rb") as file:
+            pdf_reader = PdfReader(file)
+            extracted_text = ""
+            if 'pages' in pattern:
+                pages = pattern['pages']
+            else:
+                pages = len(pdf_reader.pages)
+            for page in pages:
+                text = pdf_reader.pages[page].extract_text()
+                if 'keyword' in pattern and pattern['keyword'] in text:
+                    text = text.split(pattern['keyword'], 1)[1].strip()
+                else:
+                    text = text.strip()
+                extracted_text += text
+    except:
+        extracted_text = ''
+    return extracted_text.replace('?\n', '?-\n').replace('!\n', '!-\n').replace('。\n', '。-\n').replace('\n',' ').replace('?-','?\n').replace('!-','!\n').replace('。-','。\n')
+
+def get_reference_by_regex(pattern, text):
+    return re.findall(pattern, text)
+
+def isnot_substring(list_a, string_to_check):
+    for s in list_a:
+        if s in string_to_check:
+            return False
+    return True
+
+def extract_reference(row):
+    pattern = next((elem for elem in patterns if elem['site'] == row['site']), None)
+    extracted_text = extract_from_pdf_by_pattern(row['attachment'],pattern)
+    reference_titles = re.findall(pattern['article_regex'], extracted_text)
+    reference_dates = re.findall(pattern['date_regex'], extracted_text)
+    reference_titles = [s.replace(' ', '') for s in reference_titles]
+    reference_dates = [s.replace(' ', '') for s in reference_dates]
+    if 'remove' in pattern:
+        for remove_string in pattern['remove']:
+            reference_titles = [s.replace(remove_string, '') for s in reference_titles]
+    for title, date in zip(reference_titles, reference_dates):
+        try:
+            date = datetime.strptime(date, pattern['date_format'])
+        except:
+            date = datetime(2006, 1, 1)
+        dates = []
+        if 'date_range' in pattern:
+            for i in range(pattern['date_range'] + 1):
+                dates.append((date + timedelta(days=i)).strftime('%Y-%m-%d'))
+                dates.append((date - timedelta(days=i)).strftime('%Y-%m-%d'))
+        dates.append(date.strftime('%Y-%m-%d'))
+        date = date.strftime('%Y-%m-%d')
+        if 'split' in pattern:
+            for split_item in pattern['split']:
+                if 'exceptional_string' in split_item:
+                    if split_item['string'] in title and isnot_substring(split_item['exceptional_string'], title):
+                        title = re.split(split_item['string'], title)[split_item['index']]
+                else:
+                    if split_item['string'] in title:
+                        title = title.split(split_item['string'])[split_item['index']]
+        if len(data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]) == 0:
+            print("------------ = 0 ------------")
+            print(date, repr(title))
+        elif len(data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]) > 1:
+            print("------------ > 1 ------------")
+            print(date, repr(title))
+        else:
+            print("------------ = 1 ------------")
+            reference_df = data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]
+            row['referenceID'] = reference_df.iloc[0]['id']
+            row['link'] = reference_df.iloc[0]['link']
+            row['sourceID'] = row['id_x']
+            row['refID'] = uuid.uuid5(uuid.NAMESPACE_OID, str(row['sourceID'])+str(row['referenceID']))
+            print(date, repr(title), row['sourceID'],row['referenceID'])
+            # update_reference(row)
+
 def translate(text):
     return translator.translate(text, dest='en').text
 

@@ -38,7 +171,7 @@ def datemodifier(date_string, date_format):
         return False
 
 def fetch_url(url):
-    response = requests.get(url)
+    response = requests.get(url, timeout = 60)
    if response.status_code == 200:
        return response.text
    else:

@@ -78,29 +211,29 @@ def encode_content(content):
         else:
             line = element
         if line != '':
-
-
+            line = line + '\n'
+            text += line
     index = text.find('打印本页')
     if index != -1:
-
+        text = text[:index]
     try:
-
+        summary = '\n'.join(text.split('\n')[:2])
     except:
-
+        summary = text
     return text, summary
 
 def extract_from_pdf(url):
     # Send a GET request to the URL and retrieve the PDF content
-    response = requests.get(url)
+    response = requests.get(url, timeout=60)
     pdf_content = response.content
 
     # Save the PDF content to a local file
-    with open("downloaded_file.pdf", "wb") as
-
+    with open("downloaded_file.pdf", "wb") as file:
+        file.write(pdf_content)
 
     # Open the downloaded PDF file and extract the text
-    with open("downloaded_file.pdf", "rb") as
-    pdf_reader = PdfReader(
+    with open("downloaded_file.pdf", "rb") as file:
+        pdf_reader = PdfReader(file)
     num_pages = len(pdf_reader.pages)
     extracted_text = ""
     for page in range(num_pages):

@@ -213,19 +346,19 @@ def upsert_content(report):
     response = table.put_item(Item=item)
     print(response)
 
-def get_client_connection():
-    """Get dynamoDB connection"""
-    dynamodb = boto3.client(
-        service_name='dynamodb',
-        region_name='us-east-1',
-        aws_access_key_id=AWS_ACCESS_KEY_ID,
-        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
-    )
-    return dynamodb
+# def get_client_connection():
+#     """Get dynamoDB connection"""
+#     dynamodb = boto3.client(
+#         service_name='dynamodb',
+#         region_name='us-east-1',
+#         aws_access_key_id=AWS_ACCESS_KEY_ID,
+#         aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+#     )
+#     return dynamodb
 
 def delete_records(item):
-
-
+    dynamodb_client = get_client_connection()
+    dynamodb_client.delete_item(
         TableName="article_test",
         Key={
             'id': {'S': item['id']},

@@ -275,4 +408,5 @@ def update_content_sentiment(report):
         }
     )
     print(response)
-
+
+data = download_files_from_s3('data')