Refactor error handling and improve logging in utils.py; update vectorization process in vectorizer.py; adjust variable naming in eastmoney.py
- controllers/utils.py +4 -4
- controllers/vectorizer.py +7 -5
- source/eastmoney.py +1 -1
controllers/utils.py

@@ -55,8 +55,8 @@ def datemodifier(date_string, date_format):
     try:
         to_date = time.strptime(date_string, date_format)
         return time.strftime("%Y-%m-%d", to_date)
-    except
-        logging.error(
+    except (ValueError, KeyError, TypeError) as error:
+        logging.error("ValueError: %s", error)
         return False

@@ -141,7 +141,7 @@ def fetch_url(url):
             return response.text
         else:
             return None
-    except requests.exceptions.RequestException
+    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout) as e:
         logging.error(e)  # Optional: handle or log the error in some way
         return None

@@ -530,7 +530,7 @@ def extract_reference(row):
         logging.info("%s - %s - %s", repr(title), row['sourceID'],
                      row['referenceID'])
         update_reference(row)
-    except
+    except (ValueError, KeyError, TypeError) as error:
         logging.error(error)
         return None
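All three utils.py hunks follow the same pattern: replace a broad or bare except clause with a narrow tuple of exception types, log the error, and return a sentinel value instead of letting the exception propagate. Below is a minimal sketch of that pattern as it lands in fetch_url; the timeout value and the surrounding request/if-else scaffolding are assumptions filled in for the sketch, not taken from the diff.

import logging

import requests


def fetch_url(url):
    """Fetch a page, returning its text or None on any request failure (sketch)."""
    try:
        response = requests.get(url, timeout=30)  # assumed call shape; timeout is illustrative
        if response.status_code == 200:
            return response.text
        else:
            return None
    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout) as e:
        logging.error(e)  # log the failure and degrade to None instead of raising
        return None

Note that requests.exceptions.ReadTimeout is already a subclass of requests.exceptions.RequestException, so the second entry in the tuple is redundant, though harmless.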
controllers/vectorizer.py

@@ -40,9 +40,11 @@ def vectorize(article):
     article['id'] = str(article['id'])
     df = pd.DataFrame(article)
     df = df[['id','site','title','titleCN','category','author','content',
-             'publishDate','link'
-    df
-
+             'publishDate','link']]
+    df = df[['id', 'publishdate', 'author', 'category',
+             'content', 'referenceid', 'site', 'title', 'link']]
+    # df['sentimentScore'] = df['sentimentScore'].round(2)
+    # df['sentimentScore'] = df['sentimentScore'].astype(float)
     df['publishDate'] = pd.to_datetime(df['publishDate'])
     loader = DataFrameLoader(df, page_content_column="content")
     documents = loader.load()

@@ -56,7 +58,7 @@ def vectorize(article):
     chunks = text_splitter.split_documents(documents)
     ids = []
     for chunk in chunks:
-
-        ids.append(
+        _id = f"{chunk.metadata['id']}-{str(uuid.uuid5(uuid.NAMESPACE_OID,chunk.page_content))}"
+        ids.append(_id)
     inserted_ids = vstore.add_documents(chunks, ids=ids)
     logging.info(inserted_ids)
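The reworked loop in vectorizer.py derives each chunk's vector-store ID deterministically: the parent article's id from the chunk metadata, joined to a UUIDv5 hash of the chunk text. A self-contained sketch of that scheme follows; the helper name chunk_id is invented here for illustration.

import uuid


def chunk_id(article_id: str, chunk_text: str) -> str:
    # uuid5 is name-based and deterministic: the same namespace and text always
    # produce the same UUID, so re-ingesting an unchanged chunk reuses its ID.
    return f"{article_id}-{uuid.uuid5(uuid.NAMESPACE_OID, chunk_text)}"


assert chunk_id("42", "some chunk text") == chunk_id("42", "some chunk text")
print(chunk_id("42", "some chunk text"))

Whether a repeated ID overwrites the earlier vector or is rejected depends on the store behind add_documents, but either way re-runs of the same article no longer accumulate duplicate chunks under fresh random IDs.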
source/eastmoney.py

@@ -45,7 +45,7 @@ def _crawl(url, article):
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
-    contentcn,
+    contentcn, summary = encode_content(
         page.xpath(xpath_dict[domain]['content']))
     article['attachment'] = encode(page.xpath(
         xpath_dict[domain]['attachment']))
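The eastmoney.py change unpacks two values, contentcn and summary, from encode_content, where the removed line assigned only the (truncated) first target. The helper's implementation is not part of this diff; the following is a purely hypothetical stand-in with the same call shape, shown only to illustrate the two-value contract.

def encode_content(nodes):
    """Hypothetical stand-in; the real helper in this repository may differ."""
    # Join the text nodes returned by the XPath query into one body of text.
    content = "\n".join(n.strip() for n in nodes if n and n.strip())
    # Placeholder summary: the first 200 characters of the joined content.
    summary = content[:200]
    return content, summary


contentcn, summary = encode_content(["  第一段内容 ", "第二段内容"])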