import uuid
import json
import urllib.request
from urllib.parse import urlparse
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url, encode_content
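# This script crawls Eastmoney macro research reports, translates the Chinese fields,
# computes a sentiment score, and upserts each article via utils.upsert_content.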
with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)
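# xpath_dict is assumed to map a domain to its selectors, roughly (illustrative shape only):
# {"data.eastmoney.com": {"content": "<xpath>", "attachment": "<xpath>", "datetime_format": "<strptime format>"}}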
def crawl(url, article):
    """Fetch one report page, extract and translate its fields, then upsert it."""
    domain = urlparse(url).netloc
    req = urllib.request.urlopen(url)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    # Extract the Chinese content and summary using the domain-specific XPaths.
    contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
    article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
    article['link'] = url
    # Fall back to the full organisation name when the short name is empty ("''").
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])
    article['titleCN'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
    article['contentCN'] = repr(contentCN)[1:-1].strip()
    article['subtitle'] = translate(summary)
    article['category'] = "Macroeconomic Research"
    # Skip reports whose extracted content is too short to be meaningful.
    if len(article['contentCN']) < 10:
        return None
    # Translate the content line by line.
    CONTENT_ENG = ''
    for element in contentCN.split("\n"):
        CONTENT_ENG += translate(element) + '\n'
    article['content'] = repr(CONTENT_ENG)[1:-1].strip()
    # Deterministic IDs derived from the author name and from title + publish date.
    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n", ""))
    upsert_content(article)
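# Page through the Eastmoney report API, covering roughly the last six months (183 days).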
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')
i = 0
while i > -1:
    URL = "https://reportapi.eastmoney.com/report/jg"
    # The "cb" parameter makes the API return JSONP; the wrapper is stripped below.
    params = {
        "cb": "datatable8544623",
        "pageSize": "100",
        "beginTime": beginDate,
        "endTime": today,
        "pageNo": i,
        "qType": "3",
    }
    URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
    print(URL)
    content = fetch_url(URL)
    if content:
        # Strip the JSONP wrapper, e.g. datatable8544623(...), to get the JSON payload.
        start_index = content.find("(")
        if start_index != -1:
            result = content[start_index + 1: -1]
        else:
            result = content
        reportinfo = json.loads(result)
        if reportinfo["size"] > 0:
            i = i + 1
            for article in reportinfo['data']:
                try:
                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                    crawl(url, article)
                except Exception as error:
                    print(error)
        else:
            # No more results: print the final response and stop paging.
            print(reportinfo)
            i = -1
    else:
        print("Failed to fetch URL:", URL)