Spaces:

Oxbridge-Economics
/

Data-Collection-China

Running

App Files Files Community

OxbridgeEconomics committed on Apr 17, 2024

Commit

d83e215

1 Parent(s): 42ba1cc

commit

Browse files

Files changed (3) hide show

eastmoney.py +9 -9
utils.py +13 -12
xpath.json +1 -1

eastmoney.py CHANGED Viewed

@@ -15,30 +15,30 @@ def crawl(url, article):
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
-    originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
     article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
     article['link'] = url
     if article['orgSName'] == "''":
         article['site'] = translate(article['orgSName'])
     else:
         article['site'] = translate(article['orgName'])
-    article['originTitle'] = article['title']
     article['title'] = translate(article['title'])
     article['author'] = translate(article['researcher'])
     article['originAuthor'] = article['researcher']
-    article['originContent'] = repr(originContent)
     article['subtitle'] = translate(summary)
     article['category'] = "Macroeconomic Research"
-    if len(article['originContent']) < 10:
         return None
     CONTENT_ENG = ''
-    for element in originContent.split("\n"):
         CONTENT_ENG += translate(element) + '\n'
-    article['content'] = repr(CONTENT_ENG)
     article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
-    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime']['format_string'])
-    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n",""))
     upsert_content(article)
 today = datetime.today().strftime('%Y-%m-%d')

     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
+    contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
     article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
     article['link'] = url
     if article['orgSName'] == "''":
         article['site'] = translate(article['orgSName'])
     else:
         article['site'] = translate(article['orgName'])
+    article['titleCN'] = article['title']
     article['title'] = translate(article['title'])
     article['author'] = translate(article['researcher'])
     article['originAuthor'] = article['researcher']
+    article['contentCN'] = repr(contentCN)[1:-1].strip()
     article['subtitle'] = translate(summary)
     article['category'] = "Macroeconomic Research"
+    if len(article['contentCN']) < 10:
         return None
     CONTENT_ENG = ''
+    for element in contentCN.split("\n"):
         CONTENT_ENG += translate(element) + '\n'
+    article['content'] = repr(CONTENT_ENG)[1:-1].strip()
     article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
+    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
+    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
+    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n",""))
     upsert_content(article)
 today = datetime.today().strftime('%Y-%m-%d')

utils.py CHANGED Viewed

@@ -94,15 +94,16 @@ def encode_content(content):
                 replace('\n','').replace('\t','').replace('\r','').replace('  ','').strip()
         else:
             line = element
-        line = line + '\n'
-        text += line
         index = text.find('打印本页')
-        if index != -1:
-          text = text[:index]
-        try:
-          summary = '\n'.join(text.split('\n')[:2])
-        except:
-          summary = text
     return text, summary
 def extract_from_pdf(url):
@@ -180,13 +181,13 @@ def crawl(url, article):
         article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
     else:
         article['author'] = ""
-    article['contentCN'] = repr(contentCN)
     if len(article['contentCN']) < 10:
         return None
     CONTENT_ENG = ''
     for element in contentCN.split("\n"):
         CONTENT_ENG += translate(element) + '\n'
-    article['content'] = repr(CONTENT_ENG)
     if 'subtitle' in xpath_dict[domain]:
         article['subtitle'] = translate(encode(page.xpath(xpath_dict[domain]['subtitle'])))
     else:
@@ -201,7 +202,7 @@ def crawl(url, article):
 def upsert_content(report):
     """Upsert the content records"""
     dynamodb = get_db_connection()
-    table = dynamodb.Table('article_test')
         # Define the item data
     item = {
         'id': str(report['id']),
@@ -261,4 +262,4 @@ def update_content(report):
                 }
             )
     print(response)

                 replace('\n','').replace('\t','').replace('\r','').replace('  ','').strip()
         else:
             line = element
+        if line != '':
+          line = line + '\n'
+          text += line
         index = text.find('打印本页')
+    if index != -1:
+      text = text[:index]
+    try:
+      summary = '\n'.join(text.split('\n')[:2])
+    except:
+      summary = text
     return text, summary
 def extract_from_pdf(url):
         article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
     else:
         article['author'] = ""
+    article['contentCN'] = repr(contentCN)[1:-1].strip()
     if len(article['contentCN']) < 10:
         return None
     CONTENT_ENG = ''
     for element in contentCN.split("\n"):
         CONTENT_ENG += translate(element) + '\n'
+    article['content'] = repr(CONTENT_ENG)[1:-1].strip()
     if 'subtitle' in xpath_dict[domain]:
         article['subtitle'] = translate(encode(page.xpath(xpath_dict[domain]['subtitle'])))
     else:
 def upsert_content(report):
     """Upsert the content records"""
     dynamodb = get_db_connection()
+    table = dynamodb.Table('article_china')
         # Define the item data
     item = {
         'id': str(report['id']),
                 }
             )
     print(response)

xpath.json CHANGED Viewed

@@ -2,7 +2,7 @@
     "data.eastmoney.com": {
         "attachment": "//a[contains(@class, 'pdf-link')]/@href",
         "content": "//div[contains(@class, 'ctx-content')]//p",
-        "datetime": "%Y-%m-%d %H:%M:%S.%f"
     },
     "gov.cn": {
         "title": "//title/text()",

     "data.eastmoney.com": {
         "attachment": "//a[contains(@class, 'pdf-link')]/@href",
         "content": "//div[contains(@class, 'ctx-content')]//p",
+        "datetime_format": "%Y-%m-%d %H:%M:%S.%f"
     },
     "gov.cn": {
         "title": "//title/text()",