Refactor translation error handling and remove debug print statements in vectorization

Browse files

Changed files:
- controllers/utils.py +1 -4
- controllers/vectorizer.py +0 -12
controllers/utils.py
CHANGED
@@ -164,8 +164,7 @@ def translate(text):
|
|
164 |
for i in range(3):
|
165 |
try:
|
166 |
return GoogleTranslator(source='auto', target='en').translate(text)
|
167 |
-
except exceptions.RequestError
|
168 |
-
print(f"Request failed: {e}. Retrying {i + 1}/{3}...")
|
169 |
time.sleep(2)
|
170 |
return GoogleTranslator(source='auto', target='en').translate(text)
|
171 |
return ""
|
@@ -288,7 +287,6 @@ def update_content(report):
|
|
288 |
}
|
289 |
})
|
290 |
vectorize(report)
|
291 |
-
print(response)
|
292 |
|
293 |
|
294 |
def update_reference(report):
|
@@ -450,7 +448,6 @@ def extract_reference(row):
|
|
450 |
reference_dates = re.findall(pattern['date_regex'], extracted_text)
|
451 |
reference_titles = [s.replace(' ', '') for s in reference_titles]
|
452 |
reference_dates = [s.replace(' ', '') for s in reference_dates]
|
453 |
-
print("%s - %s", reference_dates, reference_titles)
|
454 |
if 'remove' in pattern:
|
455 |
for remove_string in pattern['remove']:
|
456 |
reference_titles = [
|
|
|
164 |
for i in range(3):
|
165 |
try:
|
166 |
return GoogleTranslator(source='auto', target='en').translate(text)
|
167 |
+
except exceptions.RequestError:
|
|
|
168 |
time.sleep(2)
|
169 |
return GoogleTranslator(source='auto', target='en').translate(text)
|
170 |
return ""
|
|
|
287 |
}
|
288 |
})
|
289 |
vectorize(report)
|
|
|
290 |
|
291 |
|
292 |
def update_reference(report):
|
|
|
448 |
reference_dates = re.findall(pattern['date_regex'], extracted_text)
|
449 |
reference_titles = [s.replace(' ', '') for s in reference_titles]
|
450 |
reference_dates = [s.replace(' ', '') for s in reference_dates]
|
|
|
451 |
if 'remove' in pattern:
|
452 |
for remove_string in pattern['remove']:
|
453 |
reference_titles = [
|
controllers/vectorizer.py
CHANGED
@@ -37,20 +37,13 @@ def vectorize(article):
|
|
37 |
Returns:
|
38 |
None
|
39 |
"""
|
40 |
-
print("&"*50)
|
41 |
article['id'] = str(article['id'])
|
42 |
if isinstance(article, dict):
|
43 |
article = [article] # Convert single dictionary to list of dictionaries
|
44 |
df = pd.DataFrame(article)
|
45 |
df = df[['id','site','title','titleCN','category','author','content',
|
46 |
'publishDate','link']]
|
47 |
-
# df = df[['id', 'publishdate', 'author', 'category',
|
48 |
-
# 'content', 'referenceid', 'site', 'title', 'link']]
|
49 |
-
# df['sentimentScore'] = df['sentimentScore'].round(2)
|
50 |
-
# df['sentimentScore'] = df['sentimentScore'].astype(float)
|
51 |
df['publishDate'] = pd.to_datetime(df['publishDate'])
|
52 |
-
print(df.columns)
|
53 |
-
print(df['content'].values[0])
|
54 |
loader = DataFrameLoader(df, page_content_column="content")
|
55 |
documents = loader.load()
|
56 |
text_splitter = RecursiveCharacterTextSplitter(
|
@@ -61,14 +54,9 @@ def vectorize(article):
|
|
61 |
)
|
62 |
|
63 |
chunks = text_splitter.split_documents(documents)
|
64 |
-
for chunk in chunks:
|
65 |
-
print(chunk)
|
66 |
-
print("*"*50)
|
67 |
ids = []
|
68 |
for chunk in chunks:
|
69 |
_id = f"{chunk.metadata['id']}-{str(uuid.uuid5(uuid.NAMESPACE_OID,chunk.page_content))}"
|
70 |
-
print(_id)
|
71 |
-
print("-"*50)
|
72 |
ids.append(_id)
|
73 |
inserted_ids = vstore.add_documents(chunks, ids=ids)
|
74 |
print(inserted_ids)
|
|
|
37 |
Returns:
|
38 |
None
|
39 |
"""
|
|
|
40 |
article['id'] = str(article['id'])
|
41 |
if isinstance(article, dict):
|
42 |
article = [article] # Convert single dictionary to list of dictionaries
|
43 |
df = pd.DataFrame(article)
|
44 |
df = df[['id','site','title','titleCN','category','author','content',
|
45 |
'publishDate','link']]
|
|
|
|
|
|
|
|
|
46 |
df['publishDate'] = pd.to_datetime(df['publishDate'])
|
|
|
|
|
47 |
loader = DataFrameLoader(df, page_content_column="content")
|
48 |
documents = loader.load()
|
49 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
54 |
)
|
55 |
|
56 |
chunks = text_splitter.split_documents(documents)
|
|
|
|
|
|
|
57 |
ids = []
|
58 |
for chunk in chunks:
|
59 |
_id = f"{chunk.metadata['id']}-{str(uuid.uuid5(uuid.NAMESPACE_OID,chunk.page_content))}"
|
|
|
|
|
60 |
ids.append(_id)
|
61 |
inserted_ids = vstore.add_documents(chunks, ids=ids)
|
62 |
print(inserted_ids)
|