OxbridgeEconomics committed on
Commit
d860eae
·
1 Parent(s): 36b7b27
Files changed (2) hide show
  1. patterns.json +404 -378
  2. utils.py +61 -35
patterns.json CHANGED
@@ -1,380 +1,406 @@
1
  [
2
- {
3
- "site": "Guosen Securities Co., Ltd.",
4
- "pages": [0],
5
- "date_range": 1,
6
- "keyword": "相关研究报告",
7
- "article_regex": "《(.*?)》",
8
- "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
9
- "date_format": "%Y-%m-%d",
10
- "split":[
11
- {
12
- "string": "-",
13
- "index": -1
14
- }
15
- ]
16
- },
17
- {
18
- "site": "Soochow Securities Co., Ltd.",
19
- "pages": [0],
20
- "date_range": 2,
21
- "keyword": "相关研究",
22
- "article_regex": "《(.*?)》",
23
- "date_regex": "\\b\\d{4}-\\d{2}-\\d{2}|\\d{4} -\\d{2}-\\d{2}\\b",
24
- "date_format": "%Y-%m-%d",
25
- "split":[
26
- {
27
- "string": "-",
28
- "index": 0
29
- },
30
- {
31
- "string": "—",
32
- "index": 0
33
- }
34
- ]
35
- },
36
- {
37
- "site": "BOCI Securities Co., Ltd.",
38
- "pages": [0],
39
- "date_range": 1,
40
- "keyword": "相关研究报告",
41
- "article_regex": "《(.*?)》",
42
- "date_regex": "20\\d{6}|20\\d{5}\\s{1}\\d{1}",
43
- "date_format": "%Y%m%d"
44
- },
45
- {
46
- "site": "Tianfeng Securities Co., Ltd.",
47
- "pages": [0],
48
- "date_range": 3,
49
- "keyword": "相关报告",
50
- "article_regex": " 《(.*?)》",
51
- "date_regex": "\\b\\d{4}-\\d{2}-\\d{2}|\\d{4} -\\d{2}-\\d{2}\\b",
52
- "date_format": "%Y-%m-%d",
53
- "remove": ["宏观报告:", "宏观-", "宏观报告-", "——"],
54
- "split":[
55
- {
56
- "string": ":",
57
- "index": 1
58
- },
59
- {
60
- "string": "-",
61
- "index": 0
62
- },
63
- {
64
- "string": "(",
65
- "index": 1
66
- }
67
- ]
68
- },
69
- {
70
- "site": "Kaiyuan Securities Co., Ltd.",
71
- "pages": [0],
72
- "date_range": 1,
73
- "keyword": " ",
74
- "article_regex": " 《(.*?)》",
75
- "date_regex": "\\b\\d{4}\\.\\d{1,2}\\.\\d{1,2}\\b",
76
- "date_format": "%Y.%m.%d",
77
- "split":[
78
- {
79
- "string": "",
80
- "index": 1
81
- }
82
- ]
83
- },
84
- {
85
- "site": "Huafu Securities Co., Ltd.",
86
- "pages": [0],
87
- "date_range": 4,
88
- "keyword": "相关报告",
89
- "article_regex": "《(.*?)》",
90
- "date_regex": "20\\d{2}\\s?\\.\\s?\\d{1}\\s?\\d{1}\\s?\\.\\s?\\d{1,2}",
91
- "date_format": "%Y.%m.%d",
92
- "split":[
93
- {
94
- "string": ":",
95
- "index": 1
96
- },
97
- {
98
- "string": "——",
99
- "index": 0
100
- }
101
- ]
102
- },
103
- {
104
- "site": "Minsheng Securities Co., Ltd.",
105
- "pages": [0],
106
- "date_range": 1,
107
- "keyword": "相关研究",
108
- "article_regex": "\\.(.*?)\\-",
109
- "date_regex": "20\\d{2}\\/\\d{2}\\/\\d{2}",
110
- "date_format": "%Y/%m/%d",
111
- "split":[
112
- {
113
- "string": ":",
114
- "index": 1
115
- }
116
- ]
117
- },
118
- {
119
- "site": "Guolian Securities Co., Ltd.",
120
- "pages": [0],
121
- "date_range": 1,
122
- "keyword": "相关报告 ",
123
- "article_regex": "《(.*?)》",
124
- "date_regex": "[》 ]20\\d{2}\\.\\d{2}\\.\\d{2}",
125
- "date_format": "%Y.%m.%d",
126
- "split":[
127
- {
128
- "string": ":",
129
- "index": 0
130
- }
131
- ]
132
- },
133
- {
134
- "site": "Southwest Securities Co., Ltd.",
135
- "pages": [0],
136
- "date_range": 1,
137
- "keyword": "相关研究",
138
- "article_regex": "\\.(.*?)\\(",
139
- "date_regex": "(20\\d{2}\\s?-\\d{2}\\-\\d{2})",
140
- "date_format": "%Y-%m-%d"
141
- },
142
- {
143
- "site": "Guangdong Securities Co., Ltd.",
144
- "pages": [0],
145
- "date_range": 1,
146
- "keyword": "近期报告",
147
- "article_regex": "《(.*?)》",
148
- "date_regex": "20\\d{2}\\s?-\\d{2}\\-\\d{2}",
149
- "date_format": "%Y-%m-%d"
150
- },
151
- {
152
- "site": "China Post Securities Co., Ltd.",
153
- "pages": [0],
154
- "date_range": 1,
155
- "keyword": "近期研究报告",
156
- "article_regex": "《(.*?)》",
157
- "date_regex": "20\\d{2}\\s?.\\d{2}\\.\\d{2}",
158
- "date_format": "%Y.%m.%d",
159
- "split":[
160
- {
161
- "string": "-",
162
- "index": 1
163
- },
164
- {
165
- "string": "——",
166
- "index": 0
167
- }
168
- ]
169
- },
170
- {
171
- "site": "Shanxi Securities Co., Ltd.",
172
- "pages": [0],
173
- "date_range": 1,
174
- "keyword": " ",
175
- "article_regex": "】(.*?)\\(",
176
- "date_regex": "20\\d{2}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2} ",
177
- "date_format": "%Y.%m.%d"
178
- },
179
- {
180
- "site": "Shanghai Securities Co., Ltd.",
181
- "pages": [0],
182
- "date_range": 1,
183
- "keyword": "Table_Rep",
184
- "article_regex": "《(.*?)》",
185
- "date_regex": "20\\d{2}年\\d{2}月\\d{2}",
186
- "date_format": "%Y年%m月%d"
187
- },
188
- {
189
- "site": "Guoyuan Securities Co., Ltd.",
190
- "pages": [0],
191
- "date_range": 2,
192
- "keyword": "[Table_Report]",
193
- "article_regex": "《(.*?)》 ",
194
- "date_regex": " 20\\d{2}.\\d{2}.\\d{2} ",
195
- "date_format": "%Y.%m.%d",
196
- "split":[
197
- {
198
- "string": ":",
199
- "index": 0
200
- }
201
- ]
202
- },
203
- {
204
- "site": "Mago Securities Co., Ltd.",
205
- "pages": [0],
206
- "date_range": 1,
207
- "keyword": "相关研究",
208
- "article_regex": "《(.*?)》",
209
- "date_regex": "20\\d{2}\\s?.\\s?\\d{2}\\s?.\\s?\\d{2} ",
210
- "date_format": "%Y.%m.%d",
211
- "split":[
212
- {
213
- "string": "(",
214
- "index": 0
215
- }
216
- ]
217
- },
218
- {
219
- "site": "Fed Securities, Inc.",
220
- "pages": [0],
221
- "date_range": 3,
222
- "keyword": "相关报告",
223
- "article_regex": ":(.*?)20",
224
- "date_regex": "20\\d{2}\\s?.\\s?\\d{2}\\s?.\\s?\\d{2}",
225
- "date_format": "%Y.%m.%d"
226
- },
227
- {
228
- "site": "Huabao Securities Co., Ltd.",
229
- "pages": [0],
230
- "date_range": 1,
231
- "keyword": "相关研究报告",
232
- "article_regex": "《(.*?)》",
233
- "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
234
- "date_format": "%Y-%m-%d"
235
- },
236
- {
237
- "site": "Ruitingdog (Shenzhen) Information Technology Co., Ltd.",
238
- "pages": [0],
239
- "date_range": 1,
240
- "keyword": "近期研究",
241
- "article_regex": ":(.*?)-",
242
- "date_regex": "\\d{4}\\s?/\\s?\\d{1,2}\\s?/\\s?\\d{1,2}",
243
- "date_format": "%Y/%m/%d"
244
- },
245
- {
246
- "site": "Oriental Fortune Securities Co., Ltd.",
247
- "pages": [0],
248
- "date_range": 1,
249
- "keyword": "相关研究",
250
- "article_regex": "《(.*?)》",
251
- "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
252
- "date_format": "%Y.%m.%d"
253
- },
254
- {
255
- "site": "Yongxing Securities Co., Ltd.",
256
- "pages": [0],
257
- "date_range": 1,
258
- "keyword": "相关报告:",
259
- "article_regex": "(.*?)",
260
- "date_regex": "—— \\d{4}\\s?年\\s?\\d{1,2}\\s?月\\s?\\d{1,2}",
261
- "date_format": "——%Y年%m月%d"
262
- },
263
- {
264
- "site": "Minmetals Securities Co., Ltd.",
265
- "pages": [0],
266
- "date_range": 1,
267
- "keyword": "相关研究",
268
- "article_regex": "《(.*?)》",
269
- "date_regex": "(d{4}\\s/\\d{2}/\\d{2}) ",
270
- "date_format": "(%Y/%m/%d) "
271
- },
272
- {
273
- "site": "Hualong Securities Co., Ltd.",
274
- "pages": [0],
275
- "date_range": 1,
276
- "keyword": "相关阅读",
277
- "article_regex": "《(.*?)》",
278
- "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
279
- "date_format": "%Y.%m.%d"
280
- },
281
- {
282
- "site": "Hebei Yuanda Information Technology Co., Ltd.",
283
- "pages": [0],
284
- "date_range": 1,
285
- "keyword": "相关报告:",
286
- "article_regex": "《(.*?)》",
287
- "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
288
- "date_format": "%Y.%m.%d"
289
- },
290
- {
291
- "site": "Huaxin Securities Co., Ltd.",
292
- "pages": [0],
293
- "date_range": 1,
294
- "keyword": "相关研究",
295
- "article_regex": "(.*?)",
296
- "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
297
- "date_format": "%Y-%m-%d"
298
- },
299
- {
300
- "site": "Far East Credit Rating Co., Ltd.",
301
- "pages": [0],
302
- "date_range": 1,
303
- "keyword": "1.",
304
- "article_regex": "《(.*?)》",
305
- "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
306
- "date_format": "%Y.%m.%d"
307
- },
308
- {
309
- "site": "Beijing Tengjing Big Data Application Technology Research Institute",
310
- "pages": [0],
311
- "date_range": 1,
312
- "keyword": "相关报告",
313
- "article_regex": "《(.*?)》",
314
- "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
315
- "date_format": "%Y-%m-%d"
316
- },
317
- {
318
- "site": "Wanhe Securities Co., Ltd.",
319
- "pages": [0],
320
- "date_range": 1,
321
- "keyword": "相关报告",
322
- "article_regex": "(.*?)",
323
- "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
324
- "date_format": "%Y-%m-%d"
325
- },
326
- {
327
- "site": "Centaline Securities Co., Ltd.",
328
- "pages": [0],
329
- "date_range": 1,
330
- "keyword": "相关报告",
331
- "article_regex": "《(.*?)》",
332
- "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
333
- "date_format": "%Y-%m-%d"
334
- },
335
- {
336
- "site": "Tengjing Digital Research",
337
- "pages": [0],
338
- "date_range": 1,
339
- "keyword": "相关报告",
340
- "article_regex": "(.*?)",
341
- "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
342
- "date_format": "%Y-%m-%d"
343
- },
344
- {
345
- "site": "Guoyuan Securities",
346
- "pages": [0],
347
- "date_range": 1,
348
- "keyword": "相关研究报告",
349
- "article_regex": "(.*?)",
350
- "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
351
- "date_format": "%Y.%m.%d"
352
- },
353
- {
354
- "site": "China Galaxy Co., Ltd.",
355
- "pages": [0],
356
- "date_range": 1,
357
- "keyword": "相关报告",
358
- "article_regex": "(.*?)",
359
- "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
360
- "date_format": "%Y-%m-%d"
361
- },
362
- {
363
- "site": "Shengang Securities Co., Ltd.",
364
- "pages": [0],
365
- "date_range": 1,
366
- "keyword": "相关报告",
367
- "article_regex": "(.*?)",
368
- "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
369
- "date_format": "%Y-%m-%d"
370
- },
371
- {
372
- "site": "SDIC Anxin Futures",
373
- "pages": [0],
374
- "date_range": 1,
375
- "keyword": "相关报告",
376
- "article_regex": "《(.*?)》",
377
- "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
378
- "date_format": "%Y-%m-%d"
379
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  ]
 
1
  [
2
+ {
3
+ "site": "China Development Bank Securities Co., Ltd.",
4
+ "pages": [0],
5
+ "date_range": 1,
6
+ "keyword": "相关报告",
7
+ "article_regex": "《(.*?)》",
8
+ "no_date": 1,
9
+ "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
10
+ "date_format": "%Y-%m-%d",
11
+ "split":[
12
+ {
13
+ "string": "—",
14
+ "index": -1
15
+ }
16
+ ]
17
+ },
18
+ {
19
+ "site": "Donghai Securities Co., Ltd.",
20
+ "pages": [0],
21
+ "date_range": 1,
22
+ "keyword": "相关研究",
23
+ "article_regex": "《(.*?)》",
24
+ "no_date": 1,
25
+ "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
26
+ "date_format": "%Y-%m-%d"
27
+ },
28
+ {
29
+ "site": "Guosen Securities Co., Ltd.",
30
+ "pages": [0],
31
+ "date_range": 1,
32
+ "keyword": "相关研究报告",
33
+ "article_regex": "《(.*?)》",
34
+ "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
35
+ "date_format": "%Y-%m-%d",
36
+ "split":[
37
+ {
38
+ "string": "-",
39
+ "index": -1
40
+ }
41
+ ]
42
+ },
43
+ {
44
+ "site": "Soochow Securities Co., Ltd.",
45
+ "pages": [0],
46
+ "date_range": 2,
47
+ "keyword": "相关研究",
48
+ "article_regex": "《(.*?)》",
49
+ "date_regex": "\\b\\d{4}-\\d{2}-\\d{2}|\\d{4} -\\d{2}-\\d{2}\\b",
50
+ "date_format": "%Y-%m-%d",
51
+ "split":[
52
+ {
53
+ "string": "-",
54
+ "index": 0
55
+ },
56
+ {
57
+ "string": "—",
58
+ "index": 0
59
+ }
60
+ ]
61
+ },
62
+ {
63
+ "site": "BOCI Securities Co., Ltd.",
64
+ "pages": [0],
65
+ "date_range": 1,
66
+ "keyword": "相关研究报告",
67
+ "article_regex": "《(.*?)》",
68
+ "date_regex": "20\\d{6}|20\\d{5}\\s{1}\\d{1}",
69
+ "date_format": "%Y%m%d"
70
+ },
71
+ {
72
+ "site": "Tianfeng Securities Co., Ltd.",
73
+ "pages": [0],
74
+ "date_range": 3,
75
+ "keyword": "相关报告",
76
+ "article_regex": " 《(.*?)》",
77
+ "date_regex": "\\b\\d{4}-\\d{2}-\\d{2}|\\d{4} -\\d{2}-\\d{2}\\b",
78
+ "date_format": "%Y-%m-%d",
79
+ "remove": ["宏观报告:", "宏观-", "宏观报告-", "——"],
80
+ "split":[
81
+ {
82
+ "string": ":",
83
+ "index": 1
84
+ },
85
+ {
86
+ "string": "-",
87
+ "index": 0
88
+ },
89
+ {
90
+ "string": "(",
91
+ "index": 1
92
+ }
93
+ ]
94
+ },
95
+ {
96
+ "site": "Kaiyuan Securities Co., Ltd.",
97
+ "pages": [0],
98
+ "date_range": 1,
99
+ "keyword": " ",
100
+ "article_regex": " 《(.*?)》",
101
+ "date_regex": "\\b\\d{4}\\.\\d{1,2}\\.\\d{1,2}\\b",
102
+ "date_format": "%Y.%m.%d",
103
+ "split":[
104
+ {
105
+ "string": "—",
106
+ "index": 1
107
+ }
108
+ ]
109
+ },
110
+ {
111
+ "site": "Huafu Securities Co., Ltd.",
112
+ "pages": [0],
113
+ "date_range": 4,
114
+ "keyword": "相关报告",
115
+ "article_regex": "《(.*?)》",
116
+ "date_regex": "20\\d{2}\\s?\\.\\s?\\d{1}\\s?\\d{1}\\s?\\.\\s?\\d{1,2}",
117
+ "date_format": "%Y.%m.%d",
118
+ "split":[
119
+ {
120
+ "string": ":",
121
+ "index": 1
122
+ },
123
+ {
124
+ "string": "——",
125
+ "index": 0
126
+ }
127
+ ]
128
+ },
129
+ {
130
+ "site": "Minsheng Securities Co., Ltd.",
131
+ "pages": [0],
132
+ "date_range": 1,
133
+ "keyword": "相关研究",
134
+ "article_regex": "\\.(.*?)\\-",
135
+ "date_regex": "20\\d{2}\\/\\d{2}\\/\\d{2}",
136
+ "date_format": "%Y/%m/%d",
137
+ "split":[
138
+ {
139
+ "string": "",
140
+ "index": 1
141
+ }
142
+ ]
143
+ },
144
+ {
145
+ "site": "Guolian Securities Co., Ltd.",
146
+ "pages": [0],
147
+ "date_range": 1,
148
+ "keyword": "相关报告 ",
149
+ "article_regex": "《(.*?)》",
150
+ "date_regex": "[》 ]20\\d{2}\\.\\d{2}\\.\\d{2}",
151
+ "date_format": "%Y.%m.%d",
152
+ "split":[
153
+ {
154
+ "string": ":",
155
+ "index": 0
156
+ }
157
+ ]
158
+ },
159
+ {
160
+ "site": "Southwest Securities Co., Ltd.",
161
+ "pages": [0],
162
+ "date_range": 1,
163
+ "keyword": "相关研究",
164
+ "article_regex": "\\.(.*?)\\(",
165
+ "date_regex": "(20\\d{2}\\s?-\\d{2}\\-\\d{2})",
166
+ "date_format": "%Y-%m-%d"
167
+ },
168
+ {
169
+ "site": "Guangdong Securities Co., Ltd.",
170
+ "pages": [0],
171
+ "date_range": 1,
172
+ "keyword": "近期报告",
173
+ "article_regex": "《(.*?)》",
174
+ "date_regex": "20\\d{2}\\s?-\\d{2}\\-\\d{2}",
175
+ "date_format": "%Y-%m-%d"
176
+ },
177
+ {
178
+ "site": "China Post Securities Co., Ltd.",
179
+ "pages": [0],
180
+ "date_range": 1,
181
+ "keyword": "近期研究报告",
182
+ "article_regex": "《(.*?)》",
183
+ "date_regex": "20\\d{2}\\s?.\\d{2}\\.\\d{2}",
184
+ "date_format": "%Y.%m.%d",
185
+ "split":[
186
+ {
187
+ "string": "-",
188
+ "index": 1
189
+ },
190
+ {
191
+ "string": "——",
192
+ "index": 0
193
+ }
194
+ ]
195
+ },
196
+ {
197
+ "site": "Shanxi Securities Co., Ltd.",
198
+ "pages": [0],
199
+ "date_range": 1,
200
+ "keyword": " ",
201
+ "article_regex": "】(.*?)\\(",
202
+ "date_regex": "20\\d{2}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2} ",
203
+ "date_format": "%Y.%m.%d"
204
+ },
205
+ {
206
+ "site": "Shanghai Securities Co., Ltd.",
207
+ "pages": [0],
208
+ "date_range": 1,
209
+ "keyword": "Table_Rep",
210
+ "article_regex": "《(.*?)》",
211
+ "date_regex": "20\\d{2}年\\d{2}月\\d{2}",
212
+ "date_format": "%Y年%m月%d"
213
+ },
214
+ {
215
+ "site": "Guoyuan Securities Co., Ltd.",
216
+ "pages": [0],
217
+ "date_range": 2,
218
+ "keyword": "[Table_Report]",
219
+ "article_regex": "《(.*?)》 ",
220
+ "date_regex": " 20\\d{2}.\\d{2}.\\d{2} ",
221
+ "date_format": "%Y.%m.%d",
222
+ "split":[
223
+ {
224
+ "string": "",
225
+ "index": 0
226
+ }
227
+ ]
228
+ },
229
+ {
230
+ "site": "Mago Securities Co., Ltd.",
231
+ "pages": [0],
232
+ "date_range": 1,
233
+ "keyword": "相关研究",
234
+ "article_regex": "《(.*?)》",
235
+ "date_regex": "20\\d{2}\\s?.\\s?\\d{2}\\s?.\\s?\\d{2} ",
236
+ "date_format": "%Y.%m.%d",
237
+ "split":[
238
+ {
239
+ "string": "(",
240
+ "index": 0
241
+ }
242
+ ]
243
+ },
244
+ {
245
+ "site": "Fed Securities, Inc.",
246
+ "pages": [0],
247
+ "date_range": 3,
248
+ "keyword": "相关报告",
249
+ "article_regex": ":(.*?)20",
250
+ "date_regex": "20\\d{2}\\s?.\\s?\\d{2}\\s?.\\s?\\d{2}",
251
+ "date_format": "%Y.%m.%d"
252
+ },
253
+ {
254
+ "site": "Huabao Securities Co., Ltd.",
255
+ "pages": [0],
256
+ "date_range": 1,
257
+ "keyword": "相关研究报告",
258
+ "article_regex": "《(.*?)》",
259
+ "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
260
+ "date_format": "%Y-%m-%d"
261
+ },
262
+ {
263
+ "site": "Ruitingdog (Shenzhen) Information Technology Co., Ltd.",
264
+ "pages": [0],
265
+ "date_range": 1,
266
+ "keyword": "近期研究",
267
+ "article_regex": ":(.*?)-",
268
+ "date_regex": "\\d{4}\\s?/\\s?\\d{1,2}\\s?/\\s?\\d{1,2}",
269
+ "date_format": "%Y/%m/%d"
270
+ },
271
+ {
272
+ "site": "Oriental Fortune Securities Co., Ltd.",
273
+ "pages": [0],
274
+ "date_range": 1,
275
+ "keyword": "相关研究",
276
+ "article_regex": "《(.*?)》",
277
+ "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
278
+ "date_format": "%Y.%m.%d"
279
+ },
280
+ {
281
+ "site": "Yongxing Securities Co., Ltd.",
282
+ "pages": [0],
283
+ "date_range": 1,
284
+ "keyword": "相关报告:",
285
+ "article_regex": "《(.*?)》",
286
+ "date_regex": "—— \\d{4}\\s?年\\s?\\d{1,2}\\s?月\\s?\\d{1,2}",
287
+ "date_format": "——%Y年%m月%d"
288
+ },
289
+ {
290
+ "site": "Minmetals Securities Co., Ltd.",
291
+ "pages": [0],
292
+ "date_range": 1,
293
+ "keyword": "相关研究",
294
+ "article_regex": "《(.*?)》",
295
+ "date_regex": "(d{4}\\s/\\d{2}/\\d{2}) ",
296
+ "date_format": "(%Y/%m/%d) "
297
+ },
298
+ {
299
+ "site": "Hualong Securities Co., Ltd.",
300
+ "pages": [0],
301
+ "date_range": 1,
302
+ "keyword": "相关阅读",
303
+ "article_regex": "《(.*?)》",
304
+ "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
305
+ "date_format": "%Y.%m.%d"
306
+ },
307
+ {
308
+ "site": "Hebei Yuanda Information Technology Co., Ltd.",
309
+ "pages": [0],
310
+ "date_range": 1,
311
+ "keyword": "相关报告:",
312
+ "article_regex": "《(.*?)》",
313
+ "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
314
+ "date_format": "%Y.%m.%d"
315
+ },
316
+ {
317
+ "site": "Huaxin Securities Co., Ltd.",
318
+ "pages": [0],
319
+ "date_range": 1,
320
+ "keyword": "相关研究",
321
+ "article_regex": "《(.*?)》",
322
+ "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
323
+ "date_format": "%Y-%m-%d"
324
+ },
325
+ {
326
+ "site": "Far East Credit Rating Co., Ltd.",
327
+ "pages": [0],
328
+ "date_range": 1,
329
+ "keyword": "1.",
330
+ "article_regex": "《(.*?)》",
331
+ "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
332
+ "date_format": "%Y.%m.%d"
333
+ },
334
+ {
335
+ "site": "Beijing Tengjing Big Data Application Technology Research Institute",
336
+ "pages": [0],
337
+ "date_range": 1,
338
+ "keyword": "相关报告",
339
+ "article_regex": "《(.*?)》",
340
+ "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
341
+ "date_format": "%Y-%m-%d"
342
+ },
343
+ {
344
+ "site": "Wanhe Securities Co., Ltd.",
345
+ "pages": [0],
346
+ "date_range": 1,
347
+ "keyword": "相关报告",
348
+ "article_regex": "《(.*?)》",
349
+ "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
350
+ "date_format": "%Y-%m-%d"
351
+ },
352
+ {
353
+ "site": "Centaline Securities Co., Ltd.",
354
+ "pages": [0],
355
+ "date_range": 1,
356
+ "keyword": "相关报告",
357
+ "article_regex": "《(.*?)》",
358
+ "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
359
+ "date_format": "%Y-%m-%d"
360
+ },
361
+ {
362
+ "site": "Tengjing Digital Research",
363
+ "pages": [0],
364
+ "date_range": 1,
365
+ "keyword": "相关报告",
366
+ "article_regex": "《(.*?)》",
367
+ "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
368
+ "date_format": "%Y-%m-%d"
369
+ },
370
+ {
371
+ "site": "Guoyuan Securities",
372
+ "pages": [0],
373
+ "date_range": 1,
374
+ "keyword": "相关研究报告",
375
+ "article_regex": "《(.*?)》",
376
+ "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
377
+ "date_format": "%Y.%m.%d"
378
+ },
379
+ {
380
+ "site": "China Galaxy Co., Ltd.",
381
+ "pages": [0],
382
+ "date_range": 1,
383
+ "keyword": "相关报告",
384
+ "article_regex": "《(.*?)》",
385
+ "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
386
+ "date_format": "%Y-%m-%d"
387
+ },
388
+ {
389
+ "site": "Shengang Securities Co., Ltd.",
390
+ "pages": [0],
391
+ "date_range": 1,
392
+ "keyword": "相关报告",
393
+ "article_regex": "《(.*?)》",
394
+ "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
395
+ "date_format": "%Y-%m-%d"
396
+ },
397
+ {
398
+ "site": "SDIC Anxin Futures",
399
+ "pages": [0],
400
+ "date_range": 1,
401
+ "keyword": "相关报告",
402
+ "article_regex": "《(.*?)》",
403
+ "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
404
+ "date_format": "%Y-%m-%d"
405
+ }
406
  ]
utils.py CHANGED
@@ -123,41 +123,67 @@ def extract_reference(row):
123
  if 'remove' in pattern:
124
  for remove_string in pattern['remove']:
125
  reference_titles = [s.replace(remove_string, '') for s in reference_titles]
126
- for title, date in zip(reference_titles, reference_dates):
127
- try:
128
- date = datetime.strptime(date, pattern['date_format'])
129
- except:
130
- date = datetime(2006, 1, 1)
131
- dates = []
132
- if 'date_range' in pattern:
133
- for i in range(pattern['date_range'] + 1):
134
- dates.append((date + timedelta(days=i)).strftime('%Y-%m-%d'))
135
- dates.append((date - timedelta(days=i)).strftime('%Y-%m-%d'))
136
- dates.append(date.strftime('%Y-%m-%d'))
137
- date = date.strftime('%Y-%m-%d')
138
- if 'split' in pattern:
139
- for split_item in pattern['split']:
140
- if 'exceptional_string' in split_item:
141
- if split_item['string'] in title and isnot_substring(split_item['exceptional_string'], title):
142
- title = re.split(split_item['string'], title)[split_item['index']]
143
- else:
144
- if split_item['string'] in title:
145
- title = title.split(split_item['string'])[split_item['index']]
146
- if len(data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]) == 0:
147
- print("------------ = 0 ------------")
148
- print(date, repr(title))
149
- elif len(data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]) > 1:
150
- print("------------ > 1 ------------")
151
- print(date, repr(title))
152
- else:
153
- print("------------ = 1 ------------")
154
- reference_df = data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]
155
- row['referenceID'] = reference_df.iloc[0]['id']
156
- row['link'] = reference_df.iloc[0]['link']
157
- row['sourceID'] = row['id']
158
- row['refID'] = uuid.uuid5(uuid.NAMESPACE_OID, str(row['sourceID'])+str(row['referenceID']))
159
- print(date, repr(title), row['sourceID'],row['referenceID'])
160
- update_reference(row)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  except Exception as error:
162
  print(error)
163
 
 
123
  if 'remove' in pattern:
124
  for remove_string in pattern['remove']:
125
  reference_titles = [s.replace(remove_string, '') for s in reference_titles]
126
+ if len(reference_dates) > 0:
127
+ for title, date in zip(reference_titles, reference_dates):
128
+ try:
129
+ date = datetime.strptime(date, pattern['date_format'])
130
+ except:
131
+ date = datetime(2006, 1, 1)
132
+ dates = []
133
+ if 'date_range' in pattern:
134
+ for i in range(pattern['date_range'] + 1):
135
+ dates.append((date + timedelta(days=i)).strftime('%Y-%m-%d'))
136
+ dates.append((date - timedelta(days=i)).strftime('%Y-%m-%d'))
137
+ dates.append(date.strftime('%Y-%m-%d'))
138
+ date = date.strftime('%Y-%m-%d')
139
+ if 'split' in pattern:
140
+ for split_item in pattern['split']:
141
+ if 'exceptional_string' in split_item:
142
+ if split_item['string'] in title and isnot_substring(split_item['exceptional_string'], title):
143
+ title = re.split(split_item['string'], title)[split_item['index']]
144
+ else:
145
+ if split_item['string'] in title:
146
+ title = title.split(split_item['string'])[split_item['index']]
147
+ if len(data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]) == 0:
148
+ print("------------ = 0 ------------")
149
+ print(date, repr(title))
150
+ elif len(data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]) > 1:
151
+ print("------------ > 1 ------------")
152
+ print(date, repr(title))
153
+ else:
154
+ print("------------ = 1 ------------")
155
+ reference_df = data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]
156
+ row['referenceID'] = reference_df.iloc[0]['id']
157
+ row['link'] = reference_df.iloc[0]['link']
158
+ row['sourceID'] = row['id']
159
+ row['refID'] = uuid.uuid5(uuid.NAMESPACE_OID, str(row['sourceID'])+str(row['referenceID']))
160
+ print(date, repr(title), row['sourceID'],row['referenceID'])
161
+ update_reference(row)
162
+ else:
163
+ for title in reference_titles:
164
+ if 'split' in pattern:
165
+ for split_item in pattern['split']:
166
+ if 'exceptional_string' in split_item:
167
+ if split_item['string'] in title and isnot_substring(split_item['exceptional_string'], title):
168
+ title = re.split(split_item['string'], title)[split_item['index']]
169
+ else:
170
+ if split_item['string'] in title:
171
+ title = title.split(split_item['string'])[split_item['index']]
172
+ if len(data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site'])]) == 0:
173
+ print("------------ = 0 ------------")
174
+ print(repr(title))
175
+ elif len(data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site'])]) > 1:
176
+ print("------------ > 1 ------------")
177
+ print(repr(title))
178
+ else:
179
+ print("------------ = 1 ------------")
180
+ reference_df = data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site'])]
181
+ row['referenceID'] = reference_df.iloc[0]['id']
182
+ row['link'] = reference_df.iloc[0]['link']
183
+ row['sourceID'] = row['id']
184
+ row['refID'] = uuid.uuid5(uuid.NAMESPACE_OID, str(row['sourceID'])+str(row['referenceID']))
185
+ print(repr(title), row['sourceID'],row['referenceID'])
186
+ update_reference(row)
187
  except Exception as error:
188
  print(error)
189