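# NOTE: the lines below are web-page residue captured together with the file;
# they are kept as comments so that the module remains valid Python.
# gavinzli's picture
# chore: Add script descriptions and improve code readability
# 39fe3d1
# raw
# history blame
# 2.3 kB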
"""
This script is used to crawl data from the website https://www.stats.gov.cn/sj/sjjd/.
It retrieves articles from the website and extracts relevant information from each article.
The script starts by iterating over the pages of the website, starting from the first page.
For each page, it retrieves the HTML content and parses it using the lxml library.
It then extracts the list of articles from the parsed HTML.
For each article, it extracts the publication date and checks if it is within the last 6 months.
If the article is within the last 6 months, it extracts the URL and crawls the article to extract additional information.
The extracted information is stored in a dictionary and can be further processed or saved as needed.
Note: This script requires the 'utils' module, which contains the 'encode' and 'crawl' functions.
"""
import time
import urllib.request
from datetime import datetime, timedelta
from urllib.parse import urljoin

from lxml import etree

from utils import encode, crawl
# Root of the listing: the first page lives at this URL, later pages at
# index_<n>.html directly beneath it.
BASE_URL = "https://www.stats.gov.cn/sj/sjjd/"
# Timeout (seconds) for blocking socket operations, so a stalled connection
# raises instead of hanging the crawl forever.
REQUEST_TIMEOUT = 30
# Articles dated before this moment are skipped; seeing one ends pagination.
# Computed once so that every article is compared against the same cutoff.
CUTOFF = datetime.today() - timedelta(days=183)

page_index = 0
reached_cutoff = False
while not reached_cutoff:
    if page_index == 0:
        CATEGORY_URL = BASE_URL
    else:
        CATEGORY_URL = f"{BASE_URL}index_{page_index}.html"
    page_index += 1

    # The context manager closes the HTTP response as soon as the body is read.
    with urllib.request.urlopen(CATEGORY_URL, timeout=REQUEST_TIMEOUT) as response:
        html_text = response.read().decode("utf-8")

    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
    if not articlelist:
        # A listing page without entries means there is nothing further to
        # paginate through; stop instead of requesting ever-higher indexes.
        break

    for item in articlelist:
        if not isinstance(item, etree._Element):
            continue

        # Re-parse the entry as a standalone document so the absolute XPath
        # queries below only see this entry's own nodes.
        subpage = etree.HTML(etree.tostring(item).decode())

        # NOTE(review): utils.encode is not visible here; it is presumed to
        # return the entry's date as a "YYYY-MM-DD" string - confirm.
        date = encode(subpage.xpath("//span"))
        parsed_datetime = datetime.strptime(date, "%Y-%m-%d")

        if parsed_datetime < CUTOFF:
            # Too old: finish scanning this page, then stop paginating.
            reached_cutoff = True
            continue

        for href in subpage.xpath("//a[@class='fl pc_1600']/@href"):
            try:
                article = {"category": "Data Interpretation"}
                # urljoin resolves "./x", "../x" and "/x" against the listing
                # page and leaves absolute URLs untouched.
                crawl(urljoin(CATEGORY_URL, href), article)
            except Exception as error:
                # Best effort: one failing article must not abort the crawl.
                print(error)