OxbridgeEconomics committed
Commit 8925fd4 · 1 Parent(s): efcd6b8
Files changed (3):
  1. daily.py +3 -1
  2. patterns.json +380 -0
  3. utils.py +158 -24
daily.py CHANGED
@@ -14,7 +14,8 @@ from utils import (encode,
                     crawl,
                     datemodifier,
                     encode_content,
-                    update_content)
+                    update_content,
+                    extract_reference)
 
 with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
@@ -161,6 +162,7 @@ def crawl_eastmoney(url, article):
     article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
     article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n",""))
+    extract_reference(article)
     update_content(article)
 
 today = datetime.today().strftime('%Y-%m-%d')
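The extract_reference call is deliberately placed after the article's id and sentiment fields are filled in and before update_content(article) persists the record. A hedged sketch of what the article dict needs to carry at that point, inferred from the row[...] accesses in utils.py below rather than documented anywhere in this commit (note that extract_reference also reads row['id_x'], a pandas merge-suffixed column, so the dict path may depend on fields not shown here):

    # Hypothetical article dict at the point of the new call; the field names
    # are taken from this commit's diffs, the values are invented.
    article = {
        'titleCN': '宏观点评',
        'publishDate': '2024-01-01',
        'site': 'Guosen Securities Co., Ltd.',      # must match a patterns.json "site"
        'attachment': 'https://example.com/r.pdf',  # PDF that gets scanned for citations
    }
    extract_reference(article)   # resolve reports cited in the attached PDF
    update_content(article)      # then upsert the article itself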
patterns.json ADDED
@@ -0,0 +1,380 @@
+[
+  {
+    "site": "Guosen Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
+    "date_format": "%Y-%m-%d",
+    "split": [
+      {
+        "string": "-",
+        "index": -1
+      }
+    ]
+  },
+  {
+    "site": "Soochow Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 2,
+    "keyword": "相关研究",
+    "article_regex": "《(.*?)》",
+    "date_regex": "\\b\\d{4}-\\d{2}-\\d{2}|\\d{4} -\\d{2}-\\d{2}\\b",
+    "date_format": "%Y-%m-%d",
+    "split": [
+      {
+        "string": "-",
+        "index": 0
+      },
+      {
+        "string": "—",
+        "index": 0
+      }
+    ]
+  },
+  {
+    "site": "BOCI Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "20\\d{6}|20\\d{5}\\s{1}\\d{1}",
+    "date_format": "%Y%m%d"
+  },
+  {
+    "site": "Tianfeng Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 3,
+    "keyword": "相关报告",
+    "article_regex": " 《(.*?)》",
+    "date_regex": "\\b\\d{4}-\\d{2}-\\d{2}|\\d{4} -\\d{2}-\\d{2}\\b",
+    "date_format": "%Y-%m-%d",
+    "remove": ["宏观报告:", "宏观-", "宏观报告-", "——"],
+    "split": [
+      {
+        "string": ":",
+        "index": 1
+      },
+      {
+        "string": "-",
+        "index": 0
+      },
+      {
+        "string": "(",
+        "index": 1
+      }
+    ]
+  },
+  {
+    "site": "Kaiyuan Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": " ",
+    "article_regex": " 《(.*?)》",
+    "date_regex": "\\b\\d{4}\\.\\d{1,2}\\.\\d{1,2}\\b",
+    "date_format": "%Y.%m.%d",
+    "split": [
+      {
+        "string": "—",
+        "index": 1
+      }
+    ]
+  },
+  {
+    "site": "Huafu Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 4,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "20\\d{2}\\s?\\.\\s?\\d{1}\\s?\\d{1}\\s?\\.\\s?\\d{1,2}",
+    "date_format": "%Y.%m.%d",
+    "split": [
+      {
+        "string": ":",
+        "index": 1
+      },
+      {
+        "string": "——",
+        "index": 0
+      }
+    ]
+  },
+  {
+    "site": "Minsheng Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究",
+    "article_regex": "\\.(.*?)\\-",
+    "date_regex": "20\\d{2}\\/\\d{2}\\/\\d{2}",
+    "date_format": "%Y/%m/%d",
+    "split": [
+      {
+        "string": ":",
+        "index": 1
+      }
+    ]
+  },
+  {
+    "site": "Guolian Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告 ",
+    "article_regex": "《(.*?)》",
+    "date_regex": "[》 ]20\\d{2}\\.\\d{2}\\.\\d{2}",
+    "date_format": "%Y.%m.%d",
+    "split": [
+      {
+        "string": ":",
+        "index": 0
+      }
+    ]
+  },
+  {
+    "site": "Southwest Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究",
+    "article_regex": "\\.(.*?)\\(",
+    "date_regex": "(20\\d{2}\\s?-\\d{2}\\-\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Guangdong Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "近期报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "20\\d{2}\\s?-\\d{2}\\-\\d{2}",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "China Post Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "近期研究报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "20\\d{2}\\s?.\\d{2}\\.\\d{2}",
+    "date_format": "%Y.%m.%d",
+    "split": [
+      {
+        "string": "-",
+        "index": 1
+      },
+      {
+        "string": "——",
+        "index": 0
+      }
+    ]
+  },
+  {
+    "site": "Shanxi Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": " ",
+    "article_regex": "】(.*?)\\(",
+    "date_regex": "20\\d{2}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2} ",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "Shanghai Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "Table_Rep",
+    "article_regex": "《(.*?)》",
+    "date_regex": "20\\d{2}年\\d{2}月\\d{2}",
+    "date_format": "%Y年%m月%d"
+  },
+  {
+    "site": "Guoyuan Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 2,
+    "keyword": "[Table_Report]",
+    "article_regex": "《(.*?)》 ",
+    "date_regex": " 20\\d{2}.\\d{2}.\\d{2} ",
+    "date_format": "%Y.%m.%d",
+    "split": [
+      {
+        "string": ":",
+        "index": 0
+      }
+    ]
+  },
+  {
+    "site": "Mago Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究",
+    "article_regex": "《(.*?)》",
+    "date_regex": "20\\d{2}\\s?.\\s?\\d{2}\\s?.\\s?\\d{2} ",
+    "date_format": "%Y.%m.%d",
+    "split": [
+      {
+        "string": "(",
+        "index": 0
+      }
+    ]
+  },
+  {
+    "site": "Fed Securities, Inc.",
+    "pages": [0],
+    "date_range": 3,
+    "keyword": "相关报告",
+    "article_regex": ":(.*?)20",
+    "date_regex": "20\\d{2}\\s?.\\s?\\d{2}\\s?.\\s?\\d{2}",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "Huabao Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Ruitingdog (Shenzhen) Information Technology Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "近期研究",
+    "article_regex": ":(.*?)-",
+    "date_regex": "\\d{4}\\s?/\\s?\\d{1,2}\\s?/\\s?\\d{1,2}",
+    "date_format": "%Y/%m/%d"
+  },
+  {
+    "site": "Oriental Fortune Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究",
+    "article_regex": "《(.*?)》",
+    "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "Yongxing Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告:",
+    "article_regex": "《(.*?)》",
+    "date_regex": "—— \\d{4}\\s?年\\s?\\d{1,2}\\s?月\\s?\\d{1,2}",
+    "date_format": "——%Y年%m月%d"
+  },
+  {
+    "site": "Minmetals Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}\\s?/\\d{2}/\\d{2})",
+    "date_format": "%Y/%m/%d"
+  },
+  {
+    "site": "Hualong Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关阅读",
+    "article_regex": "《(.*?)》",
+    "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "Hebei Yuanda Information Technology Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告:",
+    "article_regex": "《(.*?)》",
+    "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "Huaxin Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Far East Credit Rating Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "1.",
+    "article_regex": "《(.*?)》",
+    "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "Beijing Tengjing Big Data Application Technology Research Institute",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Wanhe Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Centaline Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Tengjing Digital Research",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Guoyuan Securities",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "China Galaxy Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Shengang Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "SDIC Anxin Futures",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
+    "date_format": "%Y-%m-%d"
+  }
+]
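Each entry describes how one publisher formats the "related reports" block of its PDFs: "pages" lists the zero-based pages to scan, "keyword" is the heading that marks the start of the reference list, "article_regex" and "date_regex" capture cited titles and dates, "date_format" is the strptime format for those dates, "date_range" widens date matching by that many days in both directions, and the optional "remove" and "split" rules strip boilerplate from captured titles. A minimal standalone sketch of how one entry is applied, with an invented sample text (the real driver is extract_reference in utils.py below):

    import re
    from datetime import datetime

    pattern = {                                   # abridged Guosen-style entry
        "keyword": "相关研究报告",
        "article_regex": "《(.*?)》",
        "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
        "date_format": "%Y-%m-%d"
    }
    page_text = "……相关研究报告《宏观周报》2023-05-12《通胀观察》2023-05-10"
    block = page_text.split(pattern["keyword"], 1)[1]      # text after the heading
    titles = re.findall(pattern["article_regex"], block)   # ['宏观周报', '通胀观察']
    dates = [datetime.strptime(d, pattern["date_format"])
             for d in re.findall(pattern["date_regex"], block)]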
utils.py CHANGED
@@ -1,12 +1,15 @@
 """Utility Functions"""
 import os
+import re
 import json
 import uuid
 import time
+import glob
 import urllib.request
 from urllib.parse import urlparse
-from datetime import datetime
+from datetime import datetime, timedelta
 from decimal import Decimal
+import pandas as pd
 import requests
 import boto3
 from lxml import etree
 
@@ -26,6 +29,136 @@ translator = Translator()
 with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
 
+with open('patterns.json', 'r', encoding='UTF-8') as f:
+    patterns = json.load(f)
+
+def get_client_connection():
+    """Get dynamoDB connection"""
+    dynamodb = boto3.client(
+        service_name='dynamodb',
+        region_name='us-east-1',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+    )
+    return dynamodb
+
+def update_reference(report):
+    """Upsert one source-to-reference link into the reference_china table."""
+    dynamodb = get_client_connection()
+    response = dynamodb.update_item(
+        TableName="reference_china",
+        Key={
+            'id': {'S': str(report['refID'])},
+            'sourceID': {'S': report['sourceID']}
+        },
+        UpdateExpression='SET link = :link, referenceID = :referenceID, LastModifiedDate = :LastModifiedDate',
+        ExpressionAttributeValues={
+            ':link': {'S': report['link']},
+            ':referenceID': {'S': report['referenceID']},
+            ':LastModifiedDate': {'S': datetime.now().strftime("%Y-%m-%dT%H:%M:%S")},
+        }
+    )
+    print(response)
+
+def download_files_from_s3(folder):
+    """Download Data Files"""
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+    client = boto3.client(
+        's3',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+    )
+    response = client.list_objects_v2(Bucket='china-securities-report', Prefix=f"{folder}/")
+    for obj in response['Contents']:
+        key = obj['Key']
+        if key.endswith('.parquet'):
+            client.download_file('china-securities-report', key, key)
+    file_paths = glob.glob(os.path.join(folder, '*.parquet'))
+    return pd.concat([pd.read_parquet(file_path) for file_path in file_paths], ignore_index=True)
+
+def extract_from_pdf_by_pattern(url, pattern):
+    """Download a PDF and extract text from the pages the pattern names."""
+    # Send a GET request to the URL and retrieve the PDF content
+    try:
+        response = requests.get(url, timeout=60)
+        pdf_content = response.content
+        # Save the PDF content to a local file
+        with open("downloaded_file.pdf", "wb") as file:
+            file.write(pdf_content)
+
+        # Open the downloaded PDF file and extract the text
+        with open("downloaded_file.pdf", "rb") as file:
+            pdf_reader = PdfReader(file)
+            extracted_text = ""
+            if 'pages' in pattern:
+                pages = pattern['pages']
+            else:
+                pages = range(len(pdf_reader.pages))  # default: scan every page
+            for page in pages:
+                text = pdf_reader.pages[page].extract_text()
+                if 'keyword' in pattern and pattern['keyword'] in text:
+                    text = text.split(pattern['keyword'], 1)[1].strip()
+                else:
+                    text = text.strip()
+                extracted_text += text
+    except:
+        extracted_text = ''
+    # Shield newlines after sentence-ending punctuation, flatten the rest, then restore them
+    return extracted_text.replace('?\n', '?-\n').replace('!\n', '!-\n').replace('。\n', '。-\n').replace('\n', ' ').replace('?-', '?\n').replace('!-', '!\n').replace('。-', '。\n')
+
+def get_reference_by_regex(pattern, text):
+    return re.findall(pattern, text)
+
+def isnot_substring(list_a, string_to_check):
+    """Return True when none of the strings in list_a occur in string_to_check."""
+    for s in list_a:
+        if s in string_to_check:
+            return False
+    return True
+
+def extract_reference(row):
+    """Match reports cited in row's attachment PDF against the crawled data."""
+    pattern = next((elem for elem in patterns if elem['site'] == row['site']), None)
+    if pattern is None:  # no extraction rules for this site
+        return
+    extracted_text = extract_from_pdf_by_pattern(row['attachment'], pattern)
+    reference_titles = re.findall(pattern['article_regex'], extracted_text)
+    reference_dates = re.findall(pattern['date_regex'], extracted_text)
+    reference_titles = [s.replace(' ', '') for s in reference_titles]
+    reference_dates = [s.replace(' ', '') for s in reference_dates]
+    if 'remove' in pattern:
+        for remove_string in pattern['remove']:
+            reference_titles = [s.replace(remove_string, '') for s in reference_titles]
+    for title, date in zip(reference_titles, reference_dates):
+        try:
+            date = datetime.strptime(date, pattern['date_format'])
+        except:
+            date = datetime(2006, 1, 1)  # sentinel for unparseable dates
+        dates = []
+        if 'date_range' in pattern:
+            for i in range(pattern['date_range'] + 1):
+                dates.append((date + timedelta(days=i)).strftime('%Y-%m-%d'))
+                dates.append((date - timedelta(days=i)).strftime('%Y-%m-%d'))
+        dates.append(date.strftime('%Y-%m-%d'))
+        date = date.strftime('%Y-%m-%d')
+        if 'split' in pattern:
+            for split_item in pattern['split']:
+                if 'exceptional_string' in split_item:
+                    if split_item['string'] in title and isnot_substring(split_item['exceptional_string'], title):
+                        title = re.split(split_item['string'], title)[split_item['index']]
+                else:
+                    if split_item['string'] in title:
+                        title = title.split(split_item['string'])[split_item['index']]
+        reference_df = data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]
+        if len(reference_df) == 0:
+            print("------------ = 0 ------------")
+            print(date, repr(title))
+        elif len(reference_df) > 1:
+            print("------------ > 1 ------------")
+            print(date, repr(title))
+        else:
+            print("------------ = 1 ------------")
+            row['referenceID'] = reference_df.iloc[0]['id']
+            row['link'] = reference_df.iloc[0]['link']
+            row['sourceID'] = row['id_x']
+            row['refID'] = uuid.uuid5(uuid.NAMESPACE_OID, str(row['sourceID'])+str(row['referenceID']))
+            print(date, repr(title), row['sourceID'], row['referenceID'])
+            # update_reference(row)
+
 def translate(text):
     return translator.translate(text, dest='en').text
 
 
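One detail of extract_reference worth making concrete is the date tolerance: a date parsed out of a PDF sidebar is expanded into a window of plus or minus date_range calendar days before being compared with the crawled articles' publishdate column, so a sidebar date that is off by a day or two still matches. A standalone trace of that loop with invented values:

    from datetime import datetime, timedelta

    date = datetime(2023, 5, 12)
    date_range = 2                      # e.g. the Soochow entry in patterns.json
    dates = []
    for i in range(date_range + 1):
        dates.append((date + timedelta(days=i)).strftime('%Y-%m-%d'))
        dates.append((date - timedelta(days=i)).strftime('%Y-%m-%d'))
    dates.append(date.strftime('%Y-%m-%d'))
    # dates now covers 2023-05-10 through 2023-05-14 (with harmless duplicates),
    # the window that publishdate is tested against via .isin(dates)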
@@ -38,7 +171,7 @@ def datemodifier(date_string, date_format):
     return False
 
 def fetch_url(url):
-    response = requests.get(url)
+    response = requests.get(url, timeout=60)
     if response.status_code == 200:
         return response.text
     else:
 
@@ -78,29 +211,29 @@ def encode_content(content):
         else:
             line = element
         if line != '':
-            line = line + '\n'
-            text += line
+            line = line + '\n'
+            text += line
     index = text.find('打印本页')
     if index != -1:
-        text = text[:index]
+        text = text[:index]
     try:
-        summary = '\n'.join(text.split('\n')[:2])
+        summary = '\n'.join(text.split('\n')[:2])
     except:
-        summary = text
+        summary = text
     return text, summary
 
 def extract_from_pdf(url):
     # Send a GET request to the URL and retrieve the PDF content
-    response = requests.get(url)
+    response = requests.get(url, timeout=60)
     pdf_content = response.content
 
     # Save the PDF content to a local file
-    with open("downloaded_file.pdf", "wb") as f:
-        f.write(pdf_content)
+    with open("downloaded_file.pdf", "wb") as file:
+        file.write(pdf_content)
 
     # Open the downloaded PDF file and extract the text
-    with open("downloaded_file.pdf", "rb") as f:
-        pdf_reader = PdfReader(f)
+    with open("downloaded_file.pdf", "rb") as file:
+        pdf_reader = PdfReader(file)
     num_pages = len(pdf_reader.pages)
     extracted_text = ""
     for page in range(num_pages):
 
@@ -213,19 +346,19 @@ def upsert_content(report):
     response = table.put_item(Item=item)
     print(response)
 
-def get_client_connection():
-    """Get dynamoDB connection"""
-    dynamodb = boto3.client(
-        service_name='dynamodb',
-        region_name='us-east-1',
-        aws_access_key_id=AWS_ACCESS_KEY_ID,
-        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
-    )
-    return dynamodb
+# def get_client_connection():
+#     """Get dynamoDB connection"""
+#     dynamodb = boto3.client(
+#         service_name='dynamodb',
+#         region_name='us-east-1',
+#         aws_access_key_id=AWS_ACCESS_KEY_ID,
+#         aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+#     )
+#     return dynamodb
 
 def delete_records(item):
-    dynamodb_client = get_client_connection()
-    dynamodb_client.delete_item(
+    dynamodb_client = get_client_connection()
+    dynamodb_client.delete_item(
         TableName="article_test",
         Key={
             'id': {'S': item['id']},
 
@@ -275,4 +408,5 @@ def update_content_sentiment(report):
         }
     )
     print(response)
-
+
+data = download_files_from_s3('data')
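Note the new module-level side effect: importing utils now downloads every .parquet object under the data/ prefix of the china-securities-report bucket and concatenates them into the module-level DataFrame named data that extract_reference queries. A hedged sketch of the intended batch flow, assuming the caller iterates a DataFrame of source reports that was merged with another table (the 'id_x' column read by extract_reference looks like a pandas merge suffix, which is an inference, not something this commit documents):

    import pandas as pd
    from utils import extract_reference   # triggers the S3 download at import time

    reports = pd.read_parquet('reports.parquet')   # hypothetical merged input
    for _, row in reports.iterrows():
        extract_reference(row)   # prints =0 / =1 / >1 match diagnostics;
                                 # update_reference(row) is still commented out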