Spaces:

Oxbridge-Economics
/

Data-Collection-China

Running

App Files Files Community

OxbridgeEconomics committed on Mar 25, 2024

Commit

71294ad

unverified ·

1 Parent(s): a6d7194

Update ndrc.py

Browse files

Files changed (1) hide show

ndrc.py +48 -24

ndrc.py CHANGED Viewed

@@ -18,6 +18,14 @@ def datemodifier(date_string):
     except:
         return False
 def fetch_url(url):
     response = requests.get(url)
     if response.status_code == 200:
@@ -40,7 +48,7 @@ def encode(content):
             subpage = etree.HTML(subelement)
             tree = subpage.xpath('//text()')
             line = ''.join(translist(tree)).\
-                replace('\n','').replace('\t','').replace('\r','').replace('  ','').strip()
         else:
             line = element
         text += line
@@ -206,29 +214,46 @@ for categoryu_url in categoryu_urls:
                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                 for url in urls:
                     try:
-                        print(url)
                         article = {}
-                        url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
-                        url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
-                        print(url)
-                        req = urllib.request.urlopen(url)
-                        text = req.read()
-                        html_text = text.decode("utf-8")
-                        page = etree.HTML(html_text)
-                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                        split_text = article['originalContent'].split("。")
-                        half_length = len(split_text) // 2
-                        part1 = "。".join(split_text[:half_length])
-                        part2 = "。".join(split_text[half_length:])
-                        article['content'] = translator.translate(part1, dest='en').text + translator.translate(part2, dest='en').text
-                        print(len(article['originalContent']),article['content'])
-                        article['site'] = "National Development and Reform Commission"
-                        article['originalSite'] = "国家发展和改革委员会"
-                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                        article['title'] = translator.translate(article['originalTitle'], dest='en').text
-                        article['url'] = url
-                        article['category']= "Policy Interpretation"
-                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
                         label_dict = {
                             "positive": "+",
@@ -256,4 +281,3 @@ for categoryu_url in categoryu_urls:
                         upsert_content(article)
                     except Exception as error:
                         print(error)

     except:
         return False
+def datemodifier_gov(date_string):
+    """Date Modifier Function"""
+    try:
+        to_date = time.strptime(date_string,"%Y-%m-%d-%H:%M:%S")
+        return time.strftime("%Y-%m-%d",to_date)
+    except:
+        return False
 def fetch_url(url):
     response = requests.get(url)
     if response.status_code == 200:
             subpage = etree.HTML(subelement)
             tree = subpage.xpath('//text()')
             line = ''.join(translist(tree)).\
+                replace('\n','').replace('\t','').replace('\r','').replace('  ','').replace('\u3000',' ').replace('\xa0','').strip()
         else:
             line = element
         text += line
                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                 for url in urls:
                     try:
                         article = {}
+                        if "https://www.gov.cn" in url:
+                          print(url)
+                          req = urllib.request.urlopen(url)
+                          text = req.read()
+                          html_text = text.decode("utf-8")
+                          page = etree.HTML(html_text)
+                          article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
+                          content_eng = ''
+                          for element in article['originalContent'].split("。"):
+                            content_eng += translator.translate(element, dest='en').text + ' '
+                          article['content'] = content_eng
+                          print(article['content'])
+                          article['site'] = "State Council"
+                          article['originalSite'] = "国务院"
+                          article['originalTitle'] = page.xpath("//title/text()")[0]
+                          article['title'] = translator.translate(article['originalTitle'], dest='en').text
+                          article['url'] = url
+                          article['category']= "Policy Release"
+                          article['publishDate'] = datemodifier_gov(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0])
+                        else:
+                          url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                          url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                          req = urllib.request.urlopen(url)
+                          text = req.read()
+                          html_text = text.decode("utf-8")
+                          page = etree.HTML(html_text)
+                          article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+                          content_eng = ''
+                          for element in article['originalContent'].split("。"):
+                            content_eng += translator.translate(element, dest='en').text + ' '
+                          article['content'] = content_eng
+                          print(article['content'])
+                          article['site'] = "National Development and Reform Commission"
+                          article['originalSite'] = "国家发展和改革委员会"
+                          article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+                          article['title'] = translator.translate(article['originalTitle'], dest='en').text
+                          article['url'] = url
+                          article['category']= "Policy Interpretation"
+                          article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
                         label_dict = {
                             "positive": "+",
                         upsert_content(article)
                     except Exception as error:
                         print(error)