OxbridgeEconomics committed on
Commit
93e74f7
·
1 Parent(s): 19f7db5
Files changed (2) hide show
  1. glue.py +4 -0
  2. utils.py +5 -11
glue.py CHANGED
@@ -18,4 +18,8 @@ glue = get_client_connection()
18
  response = glue.start_job_run(
19
  JobName='Article Snapshot China'
20
  )
 
 
 
 
21
  print(response)
 
18
  response = glue.start_job_run(
19
  JobName='Article Snapshot China'
20
  )
21
+ print(response)
22
+ response = glue.start_job_run(
23
+ JobName='Reference China'
24
+ )
25
  print(response)
utils.py CHANGED
@@ -19,8 +19,6 @@ from PyPDF2 import PdfReader
19
 
20
  AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
21
  AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
22
- # AWS_ACCESS_KEY_ID="AKIAQFXZMGHQYXKWUDWR"
23
- # AWS_SECRET_ACCESS_KEY="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"
24
 
25
  analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
26
 
@@ -29,7 +27,7 @@ translator = Translator()
29
  with open('xpath.json', 'r', encoding='UTF-8') as f:
30
  xpath_dict = json.load(f)
31
 
32
- with open('xpath.json', 'r', encoding='UTF-8') as f:
33
  patterns = json.load(f)
34
 
35
  def get_client_connection():
@@ -48,7 +46,7 @@ def update_reference(report):
48
  TableName="reference_china",
49
  Key={
50
  'id': {'S': str(report['refID'])},
51
- 'sourceID': {'S': report['sourceID']}
52
  },
53
  UpdateExpression='SET link = :link, referenceID = :referenceID, LastModifiedDate = :LastModifiedDate',
54
  ExpressionAttributeValues={
@@ -114,11 +112,8 @@ def isnot_substring(list_a, string_to_check):
114
  return True
115
 
116
  def extract_reference(row):
117
- print(row['site'])
118
- print(patterns)
119
  try:
120
  pattern = next((elem for elem in patterns if elem['site'] == row['site']), None)
121
- print(pattern)
122
  extracted_text = extract_from_pdf_by_pattern(row['attachment'],pattern)
123
  reference_titles = re.findall(pattern['article_regex'], extracted_text)
124
  reference_dates = re.findall(pattern['date_regex'], extracted_text)
@@ -129,7 +124,6 @@ def extract_reference(row):
129
  for remove_string in pattern['remove']:
130
  reference_titles = [s.replace(remove_string, '') for s in reference_titles]
131
  for title, date in zip(reference_titles, reference_dates):
132
- print(title, date)
133
  try:
134
  date = datetime.strptime(date, pattern['date_format'])
135
  except:
@@ -160,12 +154,13 @@ def extract_reference(row):
160
  reference_df = data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]
161
  row['referenceID'] = reference_df.iloc[0]['id']
162
  row['link'] = reference_df.iloc[0]['link']
163
- row['sourceID'] = row['id_x']
164
  row['refID'] = uuid.uuid5(uuid.NAMESPACE_OID, str(row['sourceID'])+str(row['referenceID']))
165
  print(date, repr(title), row['sourceID'],row['referenceID'])
 
166
  except Exception as error:
167
  print(error)
168
- # update_reference(row)
169
 
170
  def translate(text):
171
  return translator.translate(text, dest='en').text
@@ -251,7 +246,6 @@ def extract_from_pdf(url):
251
  # first_newline_index = text.find('。\n')
252
  # text = text[:first_newline_index+1].replace('\n', '') + text[first_newline_index+1:]
253
  text = text.replace('?\n', '?-\n').replace('!\n', '!-\n').replace('。\n', '。-\n').replace('\n','').replace('?-','?\n').replace('!-','!\n').replace('。-','。\n')
254
- print(text)
255
  if text != '':
256
  extracted_text += text
257
  try:
 
19
 
20
  AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
21
  AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
 
 
22
 
23
  analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
24
 
 
27
  with open('xpath.json', 'r', encoding='UTF-8') as f:
28
  xpath_dict = json.load(f)
29
 
30
+ with open('patterns.json', 'r', encoding='UTF-8') as f:
31
  patterns = json.load(f)
32
 
33
  def get_client_connection():
 
46
  TableName="reference_china",
47
  Key={
48
  'id': {'S': str(report['refID'])},
49
+ 'sourceID': {'S': str(report['sourceID'])}
50
  },
51
  UpdateExpression='SET link = :link, referenceID = :referenceID, LastModifiedDate = :LastModifiedDate',
52
  ExpressionAttributeValues={
 
112
  return True
113
 
114
  def extract_reference(row):
 
 
115
  try:
116
  pattern = next((elem for elem in patterns if elem['site'] == row['site']), None)
 
117
  extracted_text = extract_from_pdf_by_pattern(row['attachment'],pattern)
118
  reference_titles = re.findall(pattern['article_regex'], extracted_text)
119
  reference_dates = re.findall(pattern['date_regex'], extracted_text)
 
124
  for remove_string in pattern['remove']:
125
  reference_titles = [s.replace(remove_string, '') for s in reference_titles]
126
  for title, date in zip(reference_titles, reference_dates):
 
127
  try:
128
  date = datetime.strptime(date, pattern['date_format'])
129
  except:
 
154
  reference_df = data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]
155
  row['referenceID'] = reference_df.iloc[0]['id']
156
  row['link'] = reference_df.iloc[0]['link']
157
+ row['sourceID'] = row['id']
158
  row['refID'] = uuid.uuid5(uuid.NAMESPACE_OID, str(row['sourceID'])+str(row['referenceID']))
159
  print(date, repr(title), row['sourceID'],row['referenceID'])
160
+ update_reference(row)
161
  except Exception as error:
162
  print(error)
163
+
164
 
165
  def translate(text):
166
  return translator.translate(text, dest='en').text
 
246
  # first_newline_index = text.find('。\n')
247
  # text = text[:first_newline_index+1].replace('\n', '') + text[first_newline_index+1:]
248
  text = text.replace('?\n', '?-\n').replace('!\n', '!-\n').replace('。\n', '。-\n').replace('\n','').replace('?-','?\n').replace('!-','!\n').replace('。-','。\n')
 
249
  if text != '':
250
  extracted_text += text
251
  try: