OxbridgeEconomics committed on
Commit
e47a6a0
·
unverified ·
1 Parent(s): 71294ad

Create pbc.py

Browse files
Files changed (1) hide show
  1. pbc.py +184 -0
pbc.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import uuid
3
+ import time
4
+ import urllib.request
5
+ from lxml import etree
6
+ from googletrans import Translator
7
+ from transformers import pipeline
8
+ from PyPDF2 import PdfReader
# FinBERT sentiment-analysis pipeline (HuggingFace transformers), loaded once
# at import time because model initialization is expensive.
analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")

# Shared googletrans client used to translate scraped Chinese text to English.
translator = Translator()
def datemodifier(date_string):
    """Normalize a 'YYYY-MM-DD HH:MM:SS' timestamp to 'YYYY-MM-DD'.

    Args:
        date_string: Timestamp string in the format "%Y-%m-%d %H:%M:%S".

    Returns:
        str | bool: The date portion as "YYYY-MM-DD", or False when the
        input does not match the expected format (callers treat False
        as "no usable date").
    """
    try:
        to_date = time.strptime(date_string, "%Y-%m-%d %H:%M:%S")
        return time.strftime("%Y-%m-%d", to_date)
    except (ValueError, TypeError):
        # Narrowed from a bare `except:` so unrelated exceptions (e.g.
        # KeyboardInterrupt) are no longer silently converted to False.
        # TypeError keeps the original False-on-None behavior.
        return False
def fetch_url(url, timeout=30):
    """Fetch *url* with HTTP GET and return the body, or None on non-200.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds before the request is abandoned. New keyword with a
            default, so existing callers are unaffected; previously the call
            had no timeout and a hung server could stall the whole crawl.

    Returns:
        str | None: Response text for HTTP 200, otherwise None.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None
def translist(infolist):
    """Strip each entry of *infolist* and drop the empty results.

    The original filter condition
    `s and (isinstance(s, str) or len(s.strip()) > 0)` contained dead logic:
    every element has already been .strip()-ped (and is therefore a str, so
    the isinstance test short-circuits the length check), reducing the whole
    condition to plain truthiness. This rewrite produces the exact same
    output without the confusion.

    Args:
        infolist: Iterable of strings (e.g. an lxml //text() node list).

    Returns:
        list[str]: Non-empty stripped strings, in their original order.
    """
    return [stripped for stripped in (item.strip() for item in infolist) if stripped]
def encode(content):
    """Flatten an lxml node list into one cleaned text string.

    NOTE(review): the slice `content[:1]` means only the FIRST node is ever
    processed despite the loop — confirm whether this truncation is
    intentional or a bug.
    """
    text = ''
    for element in content[:1]:
        if isinstance(element, etree._Element):
            # Re-parse the serialized element so //text() collects every
            # nested text node beneath it.
            subelement = etree.tostring(element).decode()
            subpage = etree.HTML(subelement)
            tree = subpage.xpath('//text()')
            # Join the non-empty fragments and remove all whitespace
            # (Chinese prose carries no meaningful spaces).
            line = ''.join(translist(tree)).\
            replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
        else:
            # Plain text nodes pass through untouched.
            line = element
        text += line
    # Truncate at the "打印本页" ("print this page") footer marker, if present.
    index = text.find('打印本页')
    if index != -1:
        text = text[:index]

    return text
def extract_from_pdf(url):
    """Download a PDF from *url* and return (original_text, english_text).

    Returns:
        tuple[str, str]: the concatenated page text, and its googletrans
        English rendering.

    NOTE(review): the PDF is written to a fixed path "downloaded_file.pdf"
    in the CWD — concurrent runs clobber each other and the file is never
    removed; consider tempfile or io.BytesIO.
    """
    # Send a GET request to the URL and retrieve the PDF content
    response = requests.get(url)
    pdf_content = response.content

    # Save the PDF content to a local file
    with open("downloaded_file.pdf", "wb") as f:
        f.write(pdf_content)

    # Open the downloaded PDF file and extract the text
    with open("downloaded_file.pdf", "rb") as f:
        pdf_reader = PdfReader(f)
        num_pages = len(pdf_reader.pages)
        extracted_text = ""
        extracted_text_eng = ""
        for page in range(num_pages):
            text = pdf_reader.pages[page].extract_text()
            # Drop a single leading page-number digit, if present.
            if text and text[0].isdigit():
                text = text[1:]
            # Keep the first line (newline replaced by a space, presumably a
            # heading) and strip newlines from the remainder.
            # NOTE(review): when the page has no '\n', find() returns -1 and
            # the "heading" slice is empty — the text still survives via the
            # second slice, but confirm the intended split.
            first_newline_index = text.find('\n')
            text = text[:first_newline_index+1].replace('\n', ' ') + text[first_newline_index+1:].replace('\n', '')
            extracted_text_eng += translator.translate(text, dest='en').text
            extracted_text += text
    return extracted_text, extracted_text_eng
# ---- Upload to DynamoDB: imports and credentials ----
from datetime import datetime, timedelta
from decimal import Decimal
import os

import boto3

# SECURITY: an AWS access key id and secret were previously hard-coded here
# (and printed to stdout). Both literals must be treated as leaked and the
# key rotated. Credentials are now read from the environment; an empty
# string fallback keeps the module importable when they are unset.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
def get_db_connection():
    """Build and return a boto3 DynamoDB resource handle.

    Uses the module-level AWS credentials and the us-east-1 region.
    """
    return boto3.resource(
        service_name='dynamodb',
        region_name='us-east-1',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
def upsert_content(report):
    """Upsert one article record into the 'article_test' DynamoDB table.

    Field notes: 'link' is sourced from report['url']; the sentiment score
    is stored as a 2-decimal string. Optional fields (author, attachment,
    authorID) are currently not written.
    """
    table = get_db_connection().Table('article_test')
    score_text = str(Decimal(report['sentimentScore']).quantize(Decimal('0.01')))
    item = {
        'id': str(report['id']),
        'site': report['site'],
        'title': report['title'],
        'originalSite': report['originalSite'],
        'originalTitle': report['originalTitle'],
        'originalContent': report['originalContent'],
        'category': report['category'],
        'content': report['content'],
        'publishDate': report['publishDate'],
        'link': report['url'],
        'sentimentScore': score_text,
        'sentimentLabel': report['sentimentLabel'],
        'LastModifiedDate': datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
    }
    print(table.put_item(Item=item))
# ---- Crawl the PBC "Policy Interpretation" listing and upsert each article ----
reportList = []
categoryu_urls = ["http://www.pbc.gov.cn/rmyh/3963412/index.html"]
for categoryu_url in categoryu_urls:
    response = requests.get(categoryu_url)
    page = etree.HTML(response.text)
    # Article links live in height-22 table cells whose anchors open a new tab.
    urls = page.xpath("//td[contains(@height,'22')]//a[contains(@target, '_blank')]/@href")
    urls = [item for item in urls if item.startswith("/rmyh/")]
    for url in urls:
        try:
            url = "http://www.pbc.gov.cn" + url
            print(url)
            article = {}
            response = requests.get(url)
            response.encoding = 'utf-8'
            page = etree.HTML(response.text)
            article['originalContent'] = encode(page.xpath("//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p"))
            # Translate sentence by sentence, splitting on the Chinese full stop.
            content_eng = ''
            for element in article['originalContent'].split("。"):
                content_eng += translator.translate(element, dest='en').text + ' '
            article['content'] = content_eng
            print(article['content'])
            article['site'] = "The People's Bank of China"
            article['originalSite'] = "中国人民银行"
            article['originalTitle'] = page.xpath("//title/text()")[0]
            print(article['originalTitle'])
            article['title'] = translator.translate(article['originalTitle'], dest='en').text
            article['url'] = url
            article['category']= "Policy Interpretation"
            # The "页面生成时间" (page generation time) meta tag carries the
            # publish timestamp; datemodifier reduces it to YYYY-MM-DD.
            article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0])
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d")), "%Y-%m-%d")
            # Skip articles older than ~6 months.
            if parsed_datetime < (datetime.today() - timedelta(days=180)):
                print(article['publishDate'])
                continue
            # Deterministic id: same title+date always maps to the same UUID,
            # so re-crawling overwrites rather than duplicates.
            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
            label_dict = {
                "positive": "+",
                "negative": "-",
                "neutral": "0",
            }
            sentiment_score = 0
            maximum_value = 0
            # FinBERT input is truncated: only the first 512 characters are scored.
            raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
            sentiment_label = None
            for sentiment_dict in raw_sentiment[0]:
                value = sentiment_dict["score"]
                # Track the highest-probability label for the final +/-/0 tag.
                if value > maximum_value:
                    sentiment_label = sentiment_dict["label"]
                    maximum_value = value
                if sentiment_dict["label"] == "positive":
                    sentiment_score = sentiment_score + value
                # NOTE(review): this else binds only to the "negative" test, so
                # a positive label also falls into the +0 branch — confirm the
                # intended score arithmetic.
                if sentiment_dict["label"] == "negative":
                    sentiment_score = sentiment_score - value
                else:
                    sentiment_score = sentiment_score + 0
            article['sentimentScore'] = sentiment_score
            article['sentimentLabel'] = label_dict[sentiment_label]
            print(article)
            upsert_content(article)
        except Exception as error:
            # Best-effort crawl: log the failure and continue with the next URL.
            print(error)