OxbridgeEconomics committed
Commit 93e74f7 · 1 Parent(s): 19f7db5
commit
glue.py CHANGED
@@ -18,4 +18,8 @@ glue = get_client_connection()
 response = glue.start_job_run(
     JobName='Article Snapshot China'
 )
+print(response)
+response = glue.start_job_run(
+    JobName='Reference China'
+)
 print(response)
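The updated glue.py now fires two Glue jobs back to back but only prints the raw start_job_run responses. A minimal sketch of how the returned JobRunId values could be polled to completion, assuming get_client_connection() wraps a standard boto3 Glue client (the job names come from the diff; the polling loop and 15-second interval are not part of the commit):

import time

import boto3

# Assumption: get_client_connection() in glue.py returns boto3.client('glue');
# the client is created directly here so the sketch is self-contained.
glue = boto3.client('glue')

# Start the two jobs invoked by the updated glue.py and record their run IDs.
run_ids = {}
for job_name in ('Article Snapshot China', 'Reference China'):
    response = glue.start_job_run(JobName=job_name)
    run_ids[job_name] = response['JobRunId']

# Poll each run until Glue reports a terminal state.
TERMINAL = {'SUCCEEDED', 'FAILED', 'STOPPED', 'TIMEOUT', 'ERROR'}
for job_name, run_id in run_ids.items():
    while True:
        state = glue.get_job_run(JobName=job_name, RunId=run_id)['JobRun']['JobRunState']
        print(job_name, state)
        if state in TERMINAL:
            break
        time.sleep(15)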
utils.py CHANGED
@@ -19,8 +19,6 @@ from PyPDF2 import PdfReader
 
 AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
 AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
-# AWS_ACCESS_KEY_ID="AKIAQFXZMGHQYXKWUDWR"
-# AWS_SECRET_ACCESS_KEY="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"
 
 analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
 
@@ -29,7 +27,7 @@ translator = Translator()
 with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
 
-with open('
+with open('patterns.json', 'r', encoding='UTF-8') as f:
     patterns = json.load(f)
 
 def get_client_connection():
@@ -48,7 +46,7 @@ def update_reference(report):
         TableName="reference_china",
         Key={
             'id': {'S': str(report['refID'])},
-            'sourceID': {'S': report['sourceID']}
+            'sourceID': {'S': str(report['sourceID'])}
         },
         UpdateExpression='SET link = :link, referenceID = :referenceID, LastModifiedDate = :LastModifiedDate',
         ExpressionAttributeValues={
@@ -114,11 +112,8 @@ def isnot_substring(list_a, string_to_check):
     return True
 
 def extract_reference(row):
-    print(row['site'])
-    print(patterns)
     try:
         pattern = next((elem for elem in patterns if elem['site'] == row['site']), None)
-        print(pattern)
         extracted_text = extract_from_pdf_by_pattern(row['attachment'],pattern)
         reference_titles = re.findall(pattern['article_regex'], extracted_text)
         reference_dates = re.findall(pattern['date_regex'], extracted_text)
@@ -129,7 +124,6 @@ def extract_reference(row):
         for remove_string in pattern['remove']:
             reference_titles = [s.replace(remove_string, '') for s in reference_titles]
         for title, date in zip(reference_titles, reference_dates):
-            print(title, date)
             try:
                 date = datetime.strptime(date, pattern['date_format'])
             except:
@@ -160,12 +154,13 @@ def extract_reference(row):
             reference_df = data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]
             row['referenceID'] = reference_df.iloc[0]['id']
             row['link'] = reference_df.iloc[0]['link']
-            row['sourceID'] = row['
+            row['sourceID'] = row['id']
             row['refID'] = uuid.uuid5(uuid.NAMESPACE_OID, str(row['sourceID'])+str(row['referenceID']))
             print(date, repr(title), row['sourceID'],row['referenceID'])
+            update_reference(row)
     except Exception as error:
         print(error)
-
+
 
 def translate(text):
     return translator.translate(text, dest='en').text
@@ -251,7 +246,6 @@ def extract_from_pdf(url):
             # first_newline_index = text.find('。\n')
             # text = text[:first_newline_index+1].replace('\n', '') + text[first_newline_index+1:]
             text = text.replace('?\n', '?-\n').replace('!\n', '!-\n').replace('。\n', '。-\n').replace('\n','').replace('?-','?\n').replace('!-','!\n').replace('。-','。\n')
-            print(text)
            if text != '':
                extracted_text += text
            try:
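For context on the new patterns.json load: extract_reference selects the entry whose 'site' matches the row, then reads its 'article_regex', 'date_regex', 'date_format', and 'remove' fields. A hypothetical entry showing that expected shape; the concrete site name, regexes, and format string below are invented for illustration:

[
  {
    "site": "example-gov-site",
    "article_regex": "《(.+?)》",
    "date_regex": "\\d{4}年\\d{1,2}月\\d{1,2}日",
    "date_format": "%Y年%m月%d日",
    "remove": ["《", "》"]
  }
]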
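The str() added around report['sourceID'] matters because the low-level DynamoDB client validates attribute values: an 'S' value must be a Python str, so a numeric or UUID sourceID would raise a ParamValidationError. A sketch of the corrected call; the table name, key shape, and update expression come from the diff, while the ExpressionAttributeValues bodies (not shown in this hunk) are assumptions:

import boto3
from datetime import datetime

dynamodb = boto3.client('dynamodb')

def update_reference(report):
    dynamodb.update_item(
        TableName="reference_china",
        Key={
            # The low-level client requires 'S' values to be str, hence the
            # str(...) wrappers added in this commit; refID is a UUID and
            # sourceID may arrive as a non-string upstream.
            'id': {'S': str(report['refID'])},
            'sourceID': {'S': str(report['sourceID'])}
        },
        UpdateExpression='SET link = :link, referenceID = :referenceID, LastModifiedDate = :LastModifiedDate',
        ExpressionAttributeValues={
            # Assumed value shapes; only the placeholder names appear in the diff.
            ':link': {'S': str(report['link'])},
            ':referenceID': {'S': str(report['referenceID'])},
            ':LastModifiedDate': {'S': datetime.now().isoformat()}
        }
    )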
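One property worth noting in the new update flow: uuid.uuid5 is deterministic, so the refID derived from sourceID plus referenceID is stable across runs, and the update_reference(row) call added in this commit overwrites the existing reference_china item instead of creating a duplicate. A quick self-contained check (the IDs below are made up):

import uuid

# Same (sourceID, referenceID) pair always maps to the same refID.
source_id = 'abc-123'
reference_id = 'def-456'

a = uuid.uuid5(uuid.NAMESPACE_OID, str(source_id) + str(reference_id))
b = uuid.uuid5(uuid.NAMESPACE_OID, str(source_id) + str(reference_id))
assert a == b  # deterministic: identical inputs yield identical UUIDs
print(a)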