gavinzli committed on
Commit
0750507
·
1 Parent(s): 9853f17

Refactor translation error handling and remove debug print statements in vectorization

Browse files
Files changed (2) hide show
  1. controllers/utils.py +1 -4
  2. controllers/vectorizer.py +0 -12
controllers/utils.py CHANGED
@@ -164,8 +164,7 @@ def translate(text):
164
  for i in range(3):
165
  try:
166
  return GoogleTranslator(source='auto', target='en').translate(text)
167
- except exceptions.RequestError as e:
168
- print(f"Request failed: {e}. Retrying {i + 1}/{3}...")
169
  time.sleep(2)
170
  return GoogleTranslator(source='auto', target='en').translate(text)
171
  return ""
@@ -288,7 +287,6 @@ def update_content(report):
288
  }
289
  })
290
  vectorize(report)
291
- print(response)
292
 
293
 
294
  def update_reference(report):
@@ -450,7 +448,6 @@ def extract_reference(row):
450
  reference_dates = re.findall(pattern['date_regex'], extracted_text)
451
  reference_titles = [s.replace(' ', '') for s in reference_titles]
452
  reference_dates = [s.replace(' ', '') for s in reference_dates]
453
- print("%s - %s", reference_dates, reference_titles)
454
  if 'remove' in pattern:
455
  for remove_string in pattern['remove']:
456
  reference_titles = [
 
164
  for i in range(3):
165
  try:
166
  return GoogleTranslator(source='auto', target='en').translate(text)
167
+ except exceptions.RequestError:
 
168
  time.sleep(2)
169
  return GoogleTranslator(source='auto', target='en').translate(text)
170
  return ""
 
287
  }
288
  })
289
  vectorize(report)
 
290
 
291
 
292
  def update_reference(report):
 
448
  reference_dates = re.findall(pattern['date_regex'], extracted_text)
449
  reference_titles = [s.replace(' ', '') for s in reference_titles]
450
  reference_dates = [s.replace(' ', '') for s in reference_dates]
 
451
  if 'remove' in pattern:
452
  for remove_string in pattern['remove']:
453
  reference_titles = [
controllers/vectorizer.py CHANGED
@@ -37,20 +37,13 @@ def vectorize(article):
37
  Returns:
38
  None
39
  """
40
- print("&"*50)
41
  article['id'] = str(article['id'])
42
  if isinstance(article, dict):
43
  article = [article] # Convert single dictionary to list of dictionaries
44
  df = pd.DataFrame(article)
45
  df = df[['id','site','title','titleCN','category','author','content',
46
  'publishDate','link']]
47
- # df = df[['id', 'publishdate', 'author', 'category',
48
- # 'content', 'referenceid', 'site', 'title', 'link']]
49
- # df['sentimentScore'] = df['sentimentScore'].round(2)
50
- # df['sentimentScore'] = df['sentimentScore'].astype(float)
51
  df['publishDate'] = pd.to_datetime(df['publishDate'])
52
- print(df.columns)
53
- print(df['content'].values[0])
54
  loader = DataFrameLoader(df, page_content_column="content")
55
  documents = loader.load()
56
  text_splitter = RecursiveCharacterTextSplitter(
@@ -61,14 +54,9 @@ def vectorize(article):
61
  )
62
 
63
  chunks = text_splitter.split_documents(documents)
64
- for chunk in chunks:
65
- print(chunk)
66
- print("*"*50)
67
  ids = []
68
  for chunk in chunks:
69
  _id = f"{chunk.metadata['id']}-{str(uuid.uuid5(uuid.NAMESPACE_OID,chunk.page_content))}"
70
- print(_id)
71
- print("-"*50)
72
  ids.append(_id)
73
  inserted_ids = vstore.add_documents(chunks, ids=ids)
74
  print(inserted_ids)
 
37
  Returns:
38
  None
39
  """
 
40
  article['id'] = str(article['id'])
41
  if isinstance(article, dict):
42
  article = [article] # Convert single dictionary to list of dictionaries
43
  df = pd.DataFrame(article)
44
  df = df[['id','site','title','titleCN','category','author','content',
45
  'publishDate','link']]
 
 
 
 
46
  df['publishDate'] = pd.to_datetime(df['publishDate'])
 
 
47
  loader = DataFrameLoader(df, page_content_column="content")
48
  documents = loader.load()
49
  text_splitter = RecursiveCharacterTextSplitter(
 
54
  )
55
 
56
  chunks = text_splitter.split_documents(documents)
 
 
 
57
  ids = []
58
  for chunk in chunks:
59
  _id = f"{chunk.metadata['id']}-{str(uuid.uuid5(uuid.NAMESPACE_OID,chunk.page_content))}"
 
 
60
  ids.append(_id)
61
  inserted_ids = vstore.add_documents(chunks, ids=ids)
62
  print(inserted_ids)