RaymondWongWL committed
Commit a13f43a · Parent: 236ef33

Add Data Source [CCEF]

Files changed (4):
  1. controllers/utils.py +13 -8
  2. main.py +2 -2
  3. source/sohu_ccef.py +160 -0
  4. xpath.json +9 -0
controllers/utils.py CHANGED
@@ -22,12 +22,12 @@ import PyPDF2
 from transformers import pipeline
 
 from controllers.summarizer import summarize
-from controllers.vectorizer import vectorize
+# from controllers.vectorizer import vectorize
 
 load_dotenv()
 
-AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
-AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
+# AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
+# AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
 
 analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
 
@@ -671,7 +671,11 @@ def crawl_by_url(url, article):
 
     """
     domain = '.'.join(urlparse(url).netloc.split('.')[1:])
-    req = urllib.request.urlopen(url, timeout=60)
+    headers = {'User-Agent':
+               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
+    # req = urllib.request.urlopen(url, timeout=60)
+    req = urllib.request.Request(url, headers=headers)
+    req = urllib.request.urlopen(req, timeout=60)
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
@@ -704,7 +708,8 @@ def crawl_by_url(url, article):
         print(f"An unexpected error occurred: {e}")
     article['content'] = repr(contenteng)[1:-1].strip()
     try:
-        article['subtitle'] = summarize(article['content'])
+        # article['subtitle'] = summarize(article['content'])
+        pass
     except (ValueError, KeyError, TypeError):
         article['subtitle'] = translate(summary)
     article['publishDate'] = datemodifier(
@@ -719,8 +724,8 @@ def crawl_by_url(url, article):
         article['titleCN'] + article['publishDate'])
     logging.info("%s - %s", article['id'], article['site'])
     article['referenceid'] = None
-    update_content(article)
-    vectorize(article)
+    # update_content(article)
+    # vectorize(article)
     # openai_vectorize(article)
 
-data = download_files_from_s3('data')
+# data = download_files_from_s3('data')
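The crawl_by_url change above swaps the bare urlopen call for a Request that carries a browser User-Agent, since sohu.com, like many news portals, tends to reject requests without one. A minimal standalone sketch of the same fetch pattern, using a hypothetical helper name (fetch_html) that is not part of the commit:

    import urllib.request

    def fetch_html(url, timeout=60):
        """Fetch a page as UTF-8 text, presenting a desktop-browser User-Agent."""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/91.0.4472.124 Safari/537.36'}
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8")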
main.py CHANGED
@@ -10,10 +10,9 @@ import os
 from dotenv import load_dotenv
 from prefect import flow
 
-from source import cbirc, csrc, eastmoney, gov, mofcom, ndrc, safe, stats, mof
+from source import cbirc, csrc, eastmoney, gov, mofcom, ndrc, safe, stats, mof, sohu_ccef
 # from glue import glue_job_run
 
-
 load_dotenv()
 
 logging.basicConfig(
@@ -40,6 +39,7 @@ def main():
     ndrc.crawl(delta)
     mof.crawl(delta)
     # glue_job_run()
+    sohu_ccef.crawl(delta)
 
 if __name__ == '__main__':
     main()
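On the orchestration side, the new source is wired in as one more task call inside the existing Prefect flow. A minimal sketch of that wiring, assuming main() is the @flow-decorated entry point and that delta (the look-back window in days) is supplied the same way as for the other sources; the flow decorator usage and the default value are illustrative only:

    from prefect import flow
    from source import sohu_ccef

    @flow  # main() is assumed to be the @flow-decorated entry point
    def main(delta: int = 1):  # delta: look-back window in days (illustrative default)
        # ... the other sources (cbirc, csrc, eastmoney, ...) crawl here ...
        sohu_ccef.crawl(delta)  # new CCEF source, runs as a Prefect task

    if __name__ == '__main__':
        main()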
source/sohu_ccef.py ADDED
"""Module to crawl the website 'https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20' to fetch and process articles."""

import json
import time
import urllib.error
import urllib.request
from datetime import datetime, timedelta

from prefect import task, get_run_logger
from lxml import etree

from controllers.utils import crawl_by_url

headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Content-Type": "application/json;charset=UTF-8",
    "Origin": "https://www.sohu.com",
    "Referer": "https://mp.sohu.com/profile?xpt=Y2NlZmNjZWZAc29odS5jb20",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
    "Cookie": "SUV=1741844737337odinqwkn; _ga=GA1.1.827128807.1741845527; IPLOC=CN; cityIpLocation=219.79.103.209;",
}

json_data_template = {
    "pvId": "1742131372391_ChMyB2V",
    "pageId": "1742131372927_1741844737337odi_MYH",
    "mainContent": {
        "productType": "13",
        "productId": "324",
        "secureScore": "5",
        "categoryId": "47",
        "adTags": "11111111",
        "authorId": 121135924,
    },
    "resourceList": [
        {
            "tplCompKey": "FeedSlideloadAuthor_2_0_pc_1655965929143_data2",
            "isServerRender": False,
            "configSource": "mp",
            "content": {
                "productId": "325",
                "productType": "13",
                "size": 20,
                "page": 1,  # updated dynamically in the crawl loop
                "requestId": "1742131372524LxjXrUY_324",
            },
            "adInfo": {},
            "context": {"mkey": "465450"},
        }
    ],
}


def parse_time_string(time_str):
    """
    Parse a time string into a datetime object.
    Supports formats such as "昨天18:41", "2天前" and "2025.03.10".
    """
    now = datetime.now()
    if "分钟前" in time_str:  # "N minutes ago"
        minutes = int(time_str.replace("分钟前", "").strip())
        return now - timedelta(minutes=minutes)
    if "小时前" in time_str:  # "N hours ago"
        hours = int(time_str.replace("小时前", "").strip())
        return now - timedelta(hours=hours)
    if "天前" in time_str:  # "N days ago"
        days = int(time_str.replace("天前", "").strip())
        return now - timedelta(days=days)
    if "昨天" in time_str:  # "yesterday HH:MM"
        time_part = time_str.replace("昨天", "").strip()
        return datetime.strptime(f"{(now - timedelta(days=1)).date()} {time_part}", "%Y-%m-%d %H:%M")
    if "前天" in time_str:  # "the day before yesterday HH:MM"
        time_part = time_str.replace("前天", "").strip()
        return datetime.strptime(f"{(now - timedelta(days=2)).date()} {time_part}", "%Y-%m-%d %H:%M")
    return datetime.strptime(time_str, "%Y.%m.%d")  # absolute date, e.g. "2025.03.10"


def extract_recent_urls(response_json, delta):
    """
    Extract article URLs from the response JSON, keeping only those published within the last `delta` days.

    Args:
        response_json (dict): The JSON response from the API.
        delta (int): The number of days to look back.

    Returns:
        list: A list of filtered URLs.
    """
    base_url = "http://www.sohu.com"
    cutoff_time = datetime.now() - timedelta(days=delta)
    recent_urls = []

    try:
        articles = response_json["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"].get("list", [])
        for article in articles:
            extra_info = article.get("extraInfoList", [])
            time_info = next((info["text"] for info in extra_info if info["image"] == "time"), None)
            if time_info:
                article_time = parse_time_string(time_info)
                if article_time >= cutoff_time:
                    url = article.get("url", "")
                    if url:
                        if not url.startswith("http"):
                            url = base_url + url
                        recent_urls.append(url)
    except KeyError as e:
        print(f"KeyError: {e}")

    return recent_urls


@task(name="Data Collection - sohu_ccef", log_prints=True)
def crawl(delta):
    """
    Crawl the paginated profile feed and process articles published in the last `delta` days.

    Args:
        delta (int): The number of days to look back.
    """
    logger = get_run_logger()
    logger.info("sohu.com")
    all_recent_urls = []
    url = "https://odin.sohu.com/odin/api/blockdata"

    page = 1
    while True:
        # Note: .copy() is shallow, so the page number is written into the shared
        # template's nested dict; harmless here because it is overwritten each iteration.
        json_data = json_data_template.copy()
        json_data["resourceList"][0]["content"]["page"] = page

        data = json.dumps(json_data).encode("utf-8")
        req = urllib.request.Request(url, data=data, headers=headers, method="POST")

        try:
            with urllib.request.urlopen(req) as response:
                response_data = response.read().decode("utf-8")
                response_json = json.loads(response_data)

                recent_urls = extract_recent_urls(response_json, delta)
                if not recent_urls:  # stop once a page yields no recent URLs
                    print(f"No recent data found on page {page}. Stopping.")
                    break

                all_recent_urls.extend(recent_urls)
                print(f"Page {page}: {len(recent_urls)} recent URLs extracted.")

        except urllib.error.HTTPError as e:
            print(f"HTTP error: {e.code} on page {page}")
            print(e.read().decode("utf-8"))
            break
        except urllib.error.URLError as e:
            print(f"URL error: {e.reason} on page {page}")
            break

        page += 1

    for url in all_recent_urls:
        print(url)
        article = {}
        if "http://www.sohu.com" in url:
            article['category'] = "Policy Interpretation"
            crawl_by_url(url, article)
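Since parse_time_string accepts both relative and absolute Chinese time strings, a short illustrative check of the mapping may help (hypothetical inputs, assuming the module and its dependencies are importable):

    from source.sohu_ccef import parse_time_string

    # Expected behaviour for typical feed values (relative values are measured from "now"):
    for sample in ["5分钟前", "3小时前", "2天前", "昨天18:41", "前天09:15", "2025.03.10"]:
        print(sample, "->", parse_time_string(sample))
    # "5分钟前"    -> now minus 5 minutes
    # "昨天18:41"  -> yesterday at 18:41
    # "2025.03.10" -> 2025-03-10 00:00:00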
xpath.json CHANGED
@@ -82,5 +82,14 @@
         "datetime_format": "%Y-%m-%d %H:%M:%S",
         "siteCN": "中国国家税务总局",
         "site": "State Taxation Administration of China"
+    },
+    "sohu.com":{
+        "title": "//title/text()",
+        "subtitle": "//meta[@name = 'description']/@content",
+        "publishdate": "//meta[@itemprop = 'datePublished']/@content",
+        "content": "//article[contains(@id, 'mp-editor')]//p",
+        "datetime_format": "%Y-%m-%d %H:%M",
+        "siteCN": "首席经济学家论坛",
+        "site": "China Chief Economist Forum"
     }
 }
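For context, this entry follows the same per-domain layout as the other xpath.json entries and is presumably consumed by crawl_by_url after it parses the page with etree.HTML (see controllers/utils.py). A simplified standalone sketch of how the "sohu.com" selectors would be evaluated; html_text is assumed to be the already-fetched page source, and the real field assembly in crawl_by_url may differ:

    import json
    from lxml import etree

    with open("xpath.json", encoding="utf-8") as f:
        xp = json.load(f)["sohu.com"]

    page = etree.HTML(html_text)  # html_text: page source fetched beforehand (assumed)
    title = page.xpath(xp["title"])               # e.g. ['<article title>']
    publish_date = page.xpath(xp["publishdate"])  # e.g. ['2025-03-14 10:30'], parsed with datetime_format
    paragraphs = page.xpath(xp["content"])        # <p> elements in the mp-editor article body
    content_cn = "\n".join(p.xpath("string(.)") for p in paragraphs)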