gavinzli committed on
Commit c39d841 · 1 Parent(s): c50caa9

Refactor error handling and improve logging in utils.py; update vectorization process in vectorizer.py; adjust variable naming in eastmoney.py
controllers/utils.py CHANGED
@@ -55,8 +55,8 @@ def datemodifier(date_string, date_format):
     try:
         to_date = time.strptime(date_string, date_format)
         return time.strftime("%Y-%m-%d", to_date)
-    except Exception as e:
-        logging.error(e)
+    except (ValueError, KeyError, TypeError) as error:
+        logging.error("ValueError: %s", error)
         return False
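The broad except Exception is narrowed to the three error types the date parsing and row lookups can actually raise, and the log call uses lazy %s formatting. A minimal, self-contained sketch of the refactored helper, assuming the module imports time and logging as below:

import logging
import time

def datemodifier(date_string, date_format):
    """Normalize a raw date string to YYYY-MM-DD, or return False on failure."""
    try:
        to_date = time.strptime(date_string, date_format)
        return time.strftime("%Y-%m-%d", to_date)
    except (ValueError, KeyError, TypeError) as error:
        # Lazy %s formatting defers interpolation until the record is emitted.
        # Note: the literal "ValueError:" prefix is the commit's own wording and
        # will also label KeyError/TypeError hits.
        logging.error("ValueError: %s", error)
        return False

print(datemodifier("2024/05/01", "%Y-%m-%d"))  # False (format mismatch is caught)
print(datemodifier("2024-05-01", "%Y-%m-%d"))  # 2024-05-01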
@@ -141,7 +141,7 @@ def fetch_url(url):
             return response.text
         else:
             return None
-    except requests.exceptions.RequestException or requests.exceptions.ReadTimeout as e:
+    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout) as e:
         logging.error(e)  # Optional: handle or log the error in some way
         return None
@@ -530,7 +530,7 @@ def extract_reference(row):
         logging.info("%s - %s - %s", repr(title), row['sourceID'],
                      row['referenceID'])
         update_reference(row)
-    except Exception as error:
+    except (ValueError, KeyError, TypeError) as error:
         logging.error(error)
         return None
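extract_reference gets the same narrowing; the row[...] lookups are the plausible raisers here. A hypothetical, runnable reduction, with update_reference stubbed out and the 'title' key assumed from the logged repr(title):

import logging

def update_reference(row):
    pass  # stand-in for the real persistence call in this module

def extract_reference(row):
    try:
        title = row['title']  # assumed source of the logged title
        logging.info("%s - %s - %s", repr(title), row['sourceID'],
                     row['referenceID'])
        update_reference(row)
    except (ValueError, KeyError, TypeError) as error:
        logging.error(error)
        return None

# A row missing 'referenceID' now logs the KeyError instead of propagating it:
extract_reference({'title': 'Example', 'sourceID': 1})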
controllers/vectorizer.py CHANGED
@@ -40,9 +40,11 @@ def vectorize(article):
     article['id'] = str(article['id'])
     df = pd.DataFrame(article)
     df = df[['id','site','title','titleCN','category','author','content',
-             'publishDate','link','attachment','sentimentScore','sentimentLabel']]
-    df['sentimentScore'] = df['sentimentScore'].round(2)
-    df['sentimentScore'] = df['sentimentScore'].astype(float)
+             'publishDate','link']]
+    df = df[['id', 'publishdate', 'author', 'category',
+             'content', 'referenceid', 'site', 'title', 'link']]
+    # df['sentimentScore'] = df['sentimentScore'].round(2)
+    # df['sentimentScore'] = df['sentimentScore'].astype(float)
     df['publishDate'] = pd.to_datetime(df['publishDate'])
     loader = DataFrameLoader(df, page_content_column="content")
     documents = loader.load()
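The hunk drops the sentiment columns from the metadata selection and comments out the rounding/casting steps. Note that the second df[[...]] selection asks for lowercase 'publishdate' and 'referenceid', which do not match the camelCase columns selected on the line above and would raise KeyError as written; the sketch below therefore keeps only the first selection. A minimal sketch of the metadata-selection step with a one-row illustrative article:

import pandas as pd
# Import path assumed; older code may use langchain.document_loaders instead.
from langchain_community.document_loaders import DataFrameLoader

article = {
    "id": ["1"], "site": ["example"], "title": ["Title"], "titleCN": ["标题"],
    "category": ["policy"], "author": ["a"], "content": ["Body text."],
    "publishDate": ["2024-05-01"], "link": ["https://example.com/1"],
}
df = pd.DataFrame(article)
df = df[["id", "site", "title", "titleCN", "category", "author", "content",
         "publishDate", "link"]]
df["publishDate"] = pd.to_datetime(df["publishDate"])

# Every column except `content` becomes Document.metadata; `content` becomes
# Document.page_content.
documents = DataFrameLoader(df, page_content_column="content").load()
print(documents[0].metadata["publishDate"])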
@@ -56,7 +58,7 @@ def vectorize(article):
     chunks = text_splitter.split_documents(documents)
     ids = []
     for chunk in chunks:
-        id = f"{chunk.metadata['id']}-{str(uuid.uuid5(uuid.NAMESPACE_OID,chunk.page_content))}"
-        ids.append(id)
+        _id = f"{chunk.metadata['id']}-{str(uuid.uuid5(uuid.NAMESPACE_OID,chunk.page_content))}"
+        ids.append(_id)
     inserted_ids = vstore.add_documents(chunks, ids=ids)
     logging.info(inserted_ids)
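Renaming the loop variable from id to _id stops shadowing the built-in id(). The id scheme itself is unchanged: uuid.uuid5 is a deterministic hash of namespace plus content, so the same chunk text always yields the same vector-store id and re-ingestion overwrites rather than duplicates. A standalone sketch, where Chunk is a stand-in for the Document objects the real splitter produces:

import uuid
from dataclasses import dataclass, field

@dataclass
class Chunk:
    page_content: str
    metadata: dict = field(default_factory=dict)

chunks = [Chunk("First passage.", {"id": "42"}),
          Chunk("Second passage.", {"id": "42"})]

ids = []
for chunk in chunks:
    # uuid5 hashes the namespace plus the chunk text, so identical content
    # always maps to the same id; `_id` avoids shadowing the built-in `id`.
    _id = f"{chunk.metadata['id']}-{uuid.uuid5(uuid.NAMESPACE_OID, chunk.page_content)}"
    ids.append(_id)

print(ids)  # stable across runs for identical content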
source/eastmoney.py CHANGED
@@ -45,7 +45,7 @@ def _crawl(url, article):
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
-    contentcn, _ = encode_content(
+    contentcn, summary = encode_content(
         page.xpath(xpath_dict[domain]['content']))
     article['attachment'] = encode(page.xpath(
         xpath_dict[domain]['attachment']))
 
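The unpacking change keeps the second value returned by encode_content instead of discarding it as _; given the name, it is presumably a summary derived from the content nodes. A sketch of the extraction step, with encode_content as a hypothetical stand-in for the repo's own helper and a literal XPath in place of xpath_dict[domain]['content']:

from lxml import etree

def encode_content(nodes):
    # Hypothetical stand-in: join the node text and treat the first
    # sentence as a summary, mirroring the (content, summary) unpack.
    text = "".join(n.xpath("string()") for n in nodes).strip()
    return text, text.split("。")[0]

html_text = "<html><body><div id='c'>First sentence。More text。</div></body></html>"
page = etree.HTML(html_text)

contentcn, summary = encode_content(page.xpath("//div[@id='c']"))
print(summary)  # First sentence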