OxbridgeEconomics committed on
Commit
d83e215
·
1 Parent(s): 42ba1cc
Files changed (3) hide show
  1. eastmoney.py +9 -9
  2. utils.py +13 -12
  3. xpath.json +1 -1
eastmoney.py CHANGED
@@ -15,30 +15,30 @@ def crawl(url, article):
15
  text = req.read()
16
  html_text = text.decode("utf-8")
17
  page = etree.HTML(html_text)
18
- originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
19
  article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
20
  article['link'] = url
21
  if article['orgSName'] == "''":
22
  article['site'] = translate(article['orgSName'])
23
  else:
24
  article['site'] = translate(article['orgName'])
25
- article['originTitle'] = article['title']
26
  article['title'] = translate(article['title'])
27
  article['author'] = translate(article['researcher'])
28
  article['originAuthor'] = article['researcher']
29
- article['originContent'] = repr(originContent)
30
  article['subtitle'] = translate(summary)
31
  article['category'] = "Macroeconomic Research"
32
- if len(article['originContent']) < 10:
33
  return None
34
  CONTENT_ENG = ''
35
- for element in originContent.split("\n"):
36
  CONTENT_ENG += translate(element) + '\n'
37
- article['content'] = repr(CONTENT_ENG)
38
  article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
39
- article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime']['format_string'])
40
- article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
41
- article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n",""))
42
  upsert_content(article)
43
 
44
  today = datetime.today().strftime('%Y-%m-%d')
 
15
  text = req.read()
16
  html_text = text.decode("utf-8")
17
  page = etree.HTML(html_text)
18
+ contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
19
  article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
20
  article['link'] = url
21
  if article['orgSName'] == "''":
22
  article['site'] = translate(article['orgSName'])
23
  else:
24
  article['site'] = translate(article['orgName'])
25
+ article['titleCN'] = article['title']
26
  article['title'] = translate(article['title'])
27
  article['author'] = translate(article['researcher'])
28
  article['originAuthor'] = article['researcher']
29
+ article['contentCN'] = repr(contentCN)[1:-1].strip()
30
  article['subtitle'] = translate(summary)
31
  article['category'] = "Macroeconomic Research"
32
+ if len(article['contentCN']) < 10:
33
  return None
34
  CONTENT_ENG = ''
35
+ for element in contentCN.split("\n"):
36
  CONTENT_ENG += translate(element) + '\n'
37
+ article['content'] = repr(CONTENT_ENG)[1:-1].strip()
38
  article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
39
+ article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
40
+ article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
41
+ article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n",""))
42
  upsert_content(article)
43
 
44
  today = datetime.today().strftime('%Y-%m-%d')
utils.py CHANGED
@@ -94,15 +94,16 @@ def encode_content(content):
94
  replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
95
  else:
96
  line = element
97
- line = line + '\n'
98
- text += line
 
99
  index = text.find('打印本页')
100
- if index != -1:
101
- text = text[:index]
102
- try:
103
- summary = '\n'.join(text.split('\n')[:2])
104
- except:
105
- summary = text
106
  return text, summary
107
 
108
  def extract_from_pdf(url):
@@ -180,13 +181,13 @@ def crawl(url, article):
180
  article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
181
  else:
182
  article['author'] = ""
183
- article['contentCN'] = repr(contentCN)
184
  if len(article['contentCN']) < 10:
185
  return None
186
  CONTENT_ENG = ''
187
  for element in contentCN.split("\n"):
188
  CONTENT_ENG += translate(element) + '\n'
189
- article['content'] = repr(CONTENT_ENG)
190
  if 'subtitle' in xpath_dict[domain]:
191
  article['subtitle'] = translate(encode(page.xpath(xpath_dict[domain]['subtitle'])))
192
  else:
@@ -201,7 +202,7 @@ def crawl(url, article):
201
  def upsert_content(report):
202
  """Upsert the content records"""
203
  dynamodb = get_db_connection()
204
- table = dynamodb.Table('article_test')
205
  # Define the item data
206
  item = {
207
  'id': str(report['id']),
@@ -261,4 +262,4 @@ def update_content(report):
261
  }
262
  )
263
  print(response)
264
-
 
94
  replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
95
  else:
96
  line = element
97
+ if line != '':
98
+ line = line + '\n'
99
+ text += line
100
  index = text.find('打印本页')
101
+ if index != -1:
102
+ text = text[:index]
103
+ try:
104
+ summary = '\n'.join(text.split('\n')[:2])
105
+ except:
106
+ summary = text
107
  return text, summary
108
 
109
  def extract_from_pdf(url):
 
181
  article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
182
  else:
183
  article['author'] = ""
184
+ article['contentCN'] = repr(contentCN)[1:-1].strip()
185
  if len(article['contentCN']) < 10:
186
  return None
187
  CONTENT_ENG = ''
188
  for element in contentCN.split("\n"):
189
  CONTENT_ENG += translate(element) + '\n'
190
+ article['content'] = repr(CONTENT_ENG)[1:-1].strip()
191
  if 'subtitle' in xpath_dict[domain]:
192
  article['subtitle'] = translate(encode(page.xpath(xpath_dict[domain]['subtitle'])))
193
  else:
 
202
  def upsert_content(report):
203
  """Upsert the content records"""
204
  dynamodb = get_db_connection()
205
+ table = dynamodb.Table('article_china')
206
  # Define the item data
207
  item = {
208
  'id': str(report['id']),
 
262
  }
263
  )
264
  print(response)
265
+
xpath.json CHANGED
@@ -2,7 +2,7 @@
2
  "data.eastmoney.com": {
3
  "attachment": "//a[contains(@class, 'pdf-link')]/@href",
4
  "content": "//div[contains(@class, 'ctx-content')]//p",
5
- "datetime": "%Y-%m-%d %H:%M:%S.%f"
6
  },
7
  "gov.cn": {
8
  "title": "//title/text()",
 
2
  "data.eastmoney.com": {
3
  "attachment": "//a[contains(@class, 'pdf-link')]/@href",
4
  "content": "//div[contains(@class, 'ctx-content')]//p",
5
+ "datetime_format": "%Y-%m-%d %H:%M:%S.%f"
6
  },
7
  "gov.cn": {
8
  "title": "//title/text()",