OxbridgeEconomics committed on
Commit
fba27b9
·
1 Parent(s): ca144fd
Files changed (1) hide show
  1. utils.py +48 -45
utils.py CHANGED
@@ -114,51 +114,54 @@ def isnot_substring(list_a, string_to_check):
114
  return True
115
 
116
  def extract_reference(row):
117
- pattern = next((elem for elem in patterns if elem['site'] == row['site']), None)
118
- extracted_text = extract_from_pdf_by_pattern(row['attachment'],pattern)
119
- reference_titles = re.findall(pattern['article_regex'], extracted_text)
120
- reference_dates = re.findall(pattern['date_regex'], extracted_text)
121
- reference_titles = [s.replace(' ', '') for s in reference_titles]
122
- reference_dates = [s.replace(' ', '') for s in reference_dates]
123
- print(reference_dates, reference_titles)
124
- if 'remove' in pattern:
125
- for remove_string in pattern['remove']:
126
- reference_titles = [s.replace(remove_string, '') for s in reference_titles]
127
- for title, date in zip(reference_titles, reference_dates):
128
- print(title, date)
129
- try:
130
- date = datetime.strptime(date, pattern['date_format'])
131
- except:
132
- date = datetime(2006, 1, 1)
133
- dates = []
134
- if 'date_range' in pattern:
135
- for i in range(pattern['date_range'] + 1):
136
- dates.append((date + timedelta(days=i)).strftime('%Y-%m-%d'))
137
- dates.append((date - timedelta(days=i)).strftime('%Y-%m-%d'))
138
- dates.append(date.strftime('%Y-%m-%d'))
139
- date = date.strftime('%Y-%m-%d')
140
- if 'split' in pattern:
141
- for split_item in pattern['split']:
142
- if 'exceptional_string' in split_item:
143
- if split_item['string'] in title and isnot_substring(split_item['exceptional_string'], title):
144
- title = re.split(split_item['string'], title)[split_item['index']]
145
- else:
146
- if split_item['string'] in title:
147
- title = title.split(split_item['string'])[split_item['index']]
148
- if len(data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]) == 0:
149
- print("------------ = 0 ------------")
150
- print(date, repr(title))
151
- elif len(data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]) > 1:
152
- print("------------ > 1 ------------")
153
- print(date, repr(title))
154
- else:
155
- print("------------ = 1 ------------")
156
- reference_df = data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]
157
- row['referenceID'] = reference_df.iloc[0]['id']
158
- row['link'] = reference_df.iloc[0]['link']
159
- row['sourceID'] = row['id_x']
160
- row['refID'] = uuid.uuid5(uuid.NAMESPACE_OID, str(row['sourceID'])+str(row['referenceID']))
161
- print(date, repr(title), row['sourceID'],row['referenceID'])
 
 
 
162
  # update_reference(row)
163
 
164
  def translate(text):
 
114
  return True
115
 
116
  def extract_reference(row):
117
+ try:
118
+ pattern = next((elem for elem in patterns if elem['site'] == row['site']), None)
119
+ extracted_text = extract_from_pdf_by_pattern(row['attachment'],pattern)
120
+ reference_titles = re.findall(pattern['article_regex'], extracted_text)
121
+ reference_dates = re.findall(pattern['date_regex'], extracted_text)
122
+ reference_titles = [s.replace(' ', '') for s in reference_titles]
123
+ reference_dates = [s.replace(' ', '') for s in reference_dates]
124
+ print(reference_dates, reference_titles)
125
+ if 'remove' in pattern:
126
+ for remove_string in pattern['remove']:
127
+ reference_titles = [s.replace(remove_string, '') for s in reference_titles]
128
+ for title, date in zip(reference_titles, reference_dates):
129
+ print(title, date)
130
+ try:
131
+ date = datetime.strptime(date, pattern['date_format'])
132
+ except:
133
+ date = datetime(2006, 1, 1)
134
+ dates = []
135
+ if 'date_range' in pattern:
136
+ for i in range(pattern['date_range'] + 1):
137
+ dates.append((date + timedelta(days=i)).strftime('%Y-%m-%d'))
138
+ dates.append((date - timedelta(days=i)).strftime('%Y-%m-%d'))
139
+ dates.append(date.strftime('%Y-%m-%d'))
140
+ date = date.strftime('%Y-%m-%d')
141
+ if 'split' in pattern:
142
+ for split_item in pattern['split']:
143
+ if 'exceptional_string' in split_item:
144
+ if split_item['string'] in title and isnot_substring(split_item['exceptional_string'], title):
145
+ title = re.split(split_item['string'], title)[split_item['index']]
146
+ else:
147
+ if split_item['string'] in title:
148
+ title = title.split(split_item['string'])[split_item['index']]
149
+ if len(data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]) == 0:
150
+ print("------------ = 0 ------------")
151
+ print(date, repr(title))
152
+ elif len(data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]) > 1:
153
+ print("------------ > 1 ------------")
154
+ print(date, repr(title))
155
+ else:
156
+ print("------------ = 1 ------------")
157
+ reference_df = data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]
158
+ row['referenceID'] = reference_df.iloc[0]['id']
159
+ row['link'] = reference_df.iloc[0]['link']
160
+ row['sourceID'] = row['id_x']
161
+ row['refID'] = uuid.uuid5(uuid.NAMESPACE_OID, str(row['sourceID'])+str(row['referenceID']))
162
+ print(date, repr(title), row['sourceID'],row['referenceID'])
163
+ except Exception as error:
164
+ print(error)
165
  # update_reference(row)
166
 
167
  def translate(text):