OxbridgeEconomics
committed on
Commit
·
d83e215
1
Parent(s):
42ba1cc
commit
Browse files- eastmoney.py +9 -9
- utils.py +13 -12
- xpath.json +1 -1
eastmoney.py
CHANGED
@@ -15,30 +15,30 @@ def crawl(url, article):
|
|
15 |
text = req.read()
|
16 |
html_text = text.decode("utf-8")
|
17 |
page = etree.HTML(html_text)
|
18 |
-
|
19 |
article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
|
20 |
article['link'] = url
|
21 |
if article['orgSName'] == "''":
|
22 |
article['site'] = translate(article['orgSName'])
|
23 |
else:
|
24 |
article['site'] = translate(article['orgName'])
|
25 |
-
article['
|
26 |
article['title'] = translate(article['title'])
|
27 |
article['author'] = translate(article['researcher'])
|
28 |
article['originAuthor'] = article['researcher']
|
29 |
-
article['
|
30 |
article['subtitle'] = translate(summary)
|
31 |
article['category'] = "Macroeconomic Research"
|
32 |
-
if len(article['
|
33 |
return None
|
34 |
CONTENT_ENG = ''
|
35 |
-
for element in
|
36 |
CONTENT_ENG += translate(element) + '\n'
|
37 |
-
article['content'] = repr(CONTENT_ENG)
|
38 |
article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
|
39 |
-
article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['
|
40 |
-
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['
|
41 |
-
article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(
|
42 |
upsert_content(article)
|
43 |
|
44 |
today = datetime.today().strftime('%Y-%m-%d')
|
|
|
15 |
text = req.read()
|
16 |
html_text = text.decode("utf-8")
|
17 |
page = etree.HTML(html_text)
|
18 |
+
contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
|
19 |
article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
|
20 |
article['link'] = url
|
21 |
if article['orgSName'] == "''":
|
22 |
article['site'] = translate(article['orgSName'])
|
23 |
else:
|
24 |
article['site'] = translate(article['orgName'])
|
25 |
+
article['titleCN'] = article['title']
|
26 |
article['title'] = translate(article['title'])
|
27 |
article['author'] = translate(article['researcher'])
|
28 |
article['originAuthor'] = article['researcher']
|
29 |
+
article['contentCN'] = repr(contentCN)[1:-1].strip()
|
30 |
article['subtitle'] = translate(summary)
|
31 |
article['category'] = "Macroeconomic Research"
|
32 |
+
if len(article['contentCN']) < 10:
|
33 |
return None
|
34 |
CONTENT_ENG = ''
|
35 |
+
for element in contentCN.split("\n"):
|
36 |
CONTENT_ENG += translate(element) + '\n'
|
37 |
+
article['content'] = repr(CONTENT_ENG)[1:-1].strip()
|
38 |
article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
|
39 |
+
article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
|
40 |
+
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
|
41 |
+
article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n",""))
|
42 |
upsert_content(article)
|
43 |
|
44 |
today = datetime.today().strftime('%Y-%m-%d')
|
utils.py
CHANGED
@@ -94,15 +94,16 @@ def encode_content(content):
|
|
94 |
replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
|
95 |
else:
|
96 |
line = element
|
97 |
-
|
98 |
-
|
|
|
99 |
index = text.find('打印本页')
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
return text, summary
|
107 |
|
108 |
def extract_from_pdf(url):
|
@@ -180,13 +181,13 @@ def crawl(url, article):
|
|
180 |
article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
|
181 |
else:
|
182 |
article['author'] = ""
|
183 |
-
article['contentCN'] = repr(contentCN)
|
184 |
if len(article['contentCN']) < 10:
|
185 |
return None
|
186 |
CONTENT_ENG = ''
|
187 |
for element in contentCN.split("\n"):
|
188 |
CONTENT_ENG += translate(element) + '\n'
|
189 |
-
article['content'] = repr(CONTENT_ENG)
|
190 |
if 'subtitle' in xpath_dict[domain]:
|
191 |
article['subtitle'] = translate(encode(page.xpath(xpath_dict[domain]['subtitle'])))
|
192 |
else:
|
@@ -201,7 +202,7 @@ def crawl(url, article):
|
|
201 |
def upsert_content(report):
|
202 |
"""Upsert the content records"""
|
203 |
dynamodb = get_db_connection()
|
204 |
-
table = dynamodb.Table('
|
205 |
# Define the item data
|
206 |
item = {
|
207 |
'id': str(report['id']),
|
@@ -261,4 +262,4 @@ def update_content(report):
|
|
261 |
}
|
262 |
)
|
263 |
print(response)
|
264 |
-
|
|
|
94 |
replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
|
95 |
else:
|
96 |
line = element
|
97 |
+
if line != '':
|
98 |
+
line = line + '\n'
|
99 |
+
text += line
|
100 |
index = text.find('打印本页')
|
101 |
+
if index != -1:
|
102 |
+
text = text[:index]
|
103 |
+
try:
|
104 |
+
summary = '\n'.join(text.split('\n')[:2])
|
105 |
+
except:
|
106 |
+
summary = text
|
107 |
return text, summary
|
108 |
|
109 |
def extract_from_pdf(url):
|
|
|
181 |
article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
|
182 |
else:
|
183 |
article['author'] = ""
|
184 |
+
article['contentCN'] = repr(contentCN)[1:-1].strip()
|
185 |
if len(article['contentCN']) < 10:
|
186 |
return None
|
187 |
CONTENT_ENG = ''
|
188 |
for element in contentCN.split("\n"):
|
189 |
CONTENT_ENG += translate(element) + '\n'
|
190 |
+
article['content'] = repr(CONTENT_ENG)[1:-1].strip()
|
191 |
if 'subtitle' in xpath_dict[domain]:
|
192 |
article['subtitle'] = translate(encode(page.xpath(xpath_dict[domain]['subtitle'])))
|
193 |
else:
|
|
|
202 |
def upsert_content(report):
|
203 |
"""Upsert the content records"""
|
204 |
dynamodb = get_db_connection()
|
205 |
+
table = dynamodb.Table('article_china')
|
206 |
# Define the item data
|
207 |
item = {
|
208 |
'id': str(report['id']),
|
|
|
262 |
}
|
263 |
)
|
264 |
print(response)
|
265 |
+
|
xpath.json
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
"data.eastmoney.com": {
|
3 |
"attachment": "//a[contains(@class, 'pdf-link')]/@href",
|
4 |
"content": "//div[contains(@class, 'ctx-content')]//p",
|
5 |
-
"
|
6 |
},
|
7 |
"gov.cn": {
|
8 |
"title": "//title/text()",
|
|
|
2 |
"data.eastmoney.com": {
|
3 |
"attachment": "//a[contains(@class, 'pdf-link')]/@href",
|
4 |
"content": "//div[contains(@class, 'ctx-content')]//p",
|
5 |
+
"datetime_format": "%Y-%m-%d %H:%M:%S.%f"
|
6 |
},
|
7 |
"gov.cn": {
|
8 |
"title": "//title/text()",
|