gavinzli committed on
Commit c39d841 · 1 Parent(s): c50caa9

Refactor error handling and improve logging in utils.py; update vectorization process in vectorizer.py; adjust variable naming in eastmoney.py
controllers/utils.py CHANGED
@@ -55,8 +55,8 @@ def datemodifier(date_string, date_format):
     try:
         to_date = time.strptime(date_string, date_format)
         return time.strftime("%Y-%m-%d", to_date)
-    except Exception as e:
-        logging.error(e)
+    except (ValueError, KeyError, TypeError) as error:
+        logging.error("ValueError: %s", error)
         return False
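The broad except Exception is narrowed to the three error types the date parsing and row lookups can actually raise, and the log call uses lazy %s formatting. A minimal, self-contained sketch of the refactored helper, assuming the module imports time and logging as below:

import logging
import time

def datemodifier(date_string, date_format):
    """Normalize a raw date string to YYYY-MM-DD, or return False on failure."""
    try:
        to_date = time.strptime(date_string, date_format)
        return time.strftime("%Y-%m-%d", to_date)
    except (ValueError, KeyError, TypeError) as error:
        # Lazy %s formatting defers interpolation until the record is emitted.
        # Note: the literal "ValueError:" prefix is the commit's own wording and
        # will also label KeyError/TypeError hits.
        logging.error("ValueError: %s", error)
        return False

print(datemodifier("2024/05/01", "%Y-%m-%d"))  # False (format mismatch is caught)
print(datemodifier("2024-05-01", "%Y-%m-%d"))  # 2024-05-01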
@@ -141,7 +141,7 @@ def fetch_url(url):
             return response.text
         else:
             return None
-    except requests.exceptions.RequestException or requests.exceptions.ReadTimeout as e:
+    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout) as e:
         logging.error(e)  # Optional: handle or log the error in some way
         return None
@@ -530,7 +530,7 @@ def extract_reference(row):
         logging.info("%s - %s - %s", repr(title), row['sourceID'],
                      row['referenceID'])
         update_reference(row)
-    except Exception as error:
+    except (ValueError, KeyError, TypeError) as error:
         logging.error(error)
         return None
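extract_reference gets the same narrowing; the row[...] lookups are the plausible raisers here. A hypothetical, runnable reduction, with update_reference stubbed out and the 'title' key assumed from the logged repr(title):

import logging

def update_reference(row):
    pass  # stand-in for the real persistence call in this module

def extract_reference(row):
    try:
        title = row['title']  # assumed source of the logged title
        logging.info("%s - %s - %s", repr(title), row['sourceID'],
                     row['referenceID'])
        update_reference(row)
    except (ValueError, KeyError, TypeError) as error:
        logging.error(error)
        return None

# A row missing 'referenceID' now logs the KeyError instead of propagating it:
extract_reference({'title': 'Example', 'sourceID': 1})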
controllers/vectorizer.py CHANGED
@@ -40,9 +40,11 @@ def vectorize(article):
     article['id'] = str(article['id'])
     df = pd.DataFrame(article)
     df = df[['id','site','title','titleCN','category','author','content',
-             'publishDate','link','attachment','sentimentScore','sentimentLabel']]
-    df['sentimentScore'] = df['sentimentScore'].round(2)
-    df['sentimentScore'] = df['sentimentScore'].astype(float)
+             'publishDate','link']]
+    df = df[['id', 'publishdate', 'author', 'category',
+             'content', 'referenceid', 'site', 'title', 'link']]
+    # df['sentimentScore'] = df['sentimentScore'].round(2)
+    # df['sentimentScore'] = df['sentimentScore'].astype(float)
     df['publishDate'] = pd.to_datetime(df['publishDate'])
     loader = DataFrameLoader(df, page_content_column="content")
     documents = loader.load()
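The hunk drops the sentiment columns from the metadata selection and comments out the rounding/casting steps. Note that the second df[[...]] selection asks for lowercase 'publishdate' and 'referenceid', which do not match the camelCase columns selected on the line above and would raise KeyError as written; the sketch below therefore keeps only the first selection. A minimal sketch of the metadata-selection step with a one-row illustrative article:

import pandas as pd
# Import path assumed; older code may use langchain.document_loaders instead.
from langchain_community.document_loaders import DataFrameLoader

article = {
    "id": ["1"], "site": ["example"], "title": ["Title"], "titleCN": ["标题"],
    "category": ["policy"], "author": ["a"], "content": ["Body text."],
    "publishDate": ["2024-05-01"], "link": ["https://example.com/1"],
}
df = pd.DataFrame(article)
df = df[["id", "site", "title", "titleCN", "category", "author", "content",
         "publishDate", "link"]]
df["publishDate"] = pd.to_datetime(df["publishDate"])

# Every column except `content` becomes Document.metadata; `content` becomes
# Document.page_content.
documents = DataFrameLoader(df, page_content_column="content").load()
print(documents[0].metadata["publishDate"])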
@@ -56,7 +58,7 @@ def vectorize(article):
     chunks = text_splitter.split_documents(documents)
     ids = []
     for chunk in chunks:
-        id = f"{chunk.metadata['id']}-{str(uuid.uuid5(uuid.NAMESPACE_OID,chunk.page_content))}"
-        ids.append(id)
+        _id = f"{chunk.metadata['id']}-{str(uuid.uuid5(uuid.NAMESPACE_OID,chunk.page_content))}"
+        ids.append(_id)
     inserted_ids = vstore.add_documents(chunks, ids=ids)
     logging.info(inserted_ids)
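Renaming the loop variable from id to _id stops shadowing the built-in id(). The id scheme itself is unchanged: uuid.uuid5 is a deterministic hash of namespace plus content, so the same chunk text always yields the same vector-store id and re-ingestion overwrites rather than duplicates. A standalone sketch, where Chunk is a stand-in for the Document objects the real splitter produces:

import uuid
from dataclasses import dataclass, field

@dataclass
class Chunk:
    page_content: str
    metadata: dict = field(default_factory=dict)

chunks = [Chunk("First passage.", {"id": "42"}),
          Chunk("Second passage.", {"id": "42"})]

ids = []
for chunk in chunks:
    # uuid5 hashes the namespace plus the chunk text, so identical content
    # always maps to the same id; `_id` avoids shadowing the built-in `id`.
    _id = f"{chunk.metadata['id']}-{uuid.uuid5(uuid.NAMESPACE_OID, chunk.page_content)}"
    ids.append(_id)

print(ids)  # stable across runs for identical content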
source/eastmoney.py CHANGED
@@ -45,7 +45,7 @@ def _crawl(url, article):
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
-    contentcn, _ = encode_content(
+    contentcn, summary = encode_content(
         page.xpath(xpath_dict[domain]['content']))
     article['attachment'] = encode(page.xpath(
         xpath_dict[domain]['attachment']))
 
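The unpacking change keeps the second value returned by encode_content instead of discarding it as _; given the name, it is presumably a summary derived from the content nodes. A sketch of the extraction step, with encode_content as a hypothetical stand-in for the repo's own helper and a literal XPath in place of xpath_dict[domain]['content']:

from lxml import etree

def encode_content(nodes):
    # Hypothetical stand-in: join the node text and treat the first
    # sentence as a summary, mirroring the (content, summary) unpack.
    text = "".join(n.xpath("string()") for n in nodes).strip()
    return text, text.split("。")[0]

html_text = "<html><body><div id='c'>First sentence。More text。</div></body></html>"
page = etree.HTML(html_text)

contentcn, summary = encode_content(page.xpath("//div[@id='c']"))
print(summary)  # First sentence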