gavinzli committed
Commit 29d3eca · 1 Parent(s): b4bd94d

Add validation for content length and enhance error handling in crawl_by_url function

Files changed (1): controllers/utils.py +4 -2
controllers/utils.py CHANGED
@@ -660,6 +660,8 @@ def crawl_by_url(url, article):
     page = etree.HTML(html_text)
     contentcn, summary = encode_content(
         page.xpath(xpath_dict[domain]['content']))
+    if contentcn is None or len(contentcn) < 10:
+        return
     article['originSite'] = xpath_dict[domain]['siteCN']
     article['site'] = xpath_dict[domain]['site']
     article['titleCN'] = encode(page.xpath(xpath_dict[domain]['title']))
@@ -679,7 +681,8 @@ def crawl_by_url(url, article):
         if detect(contenteng) != 'en':
             for element in contentcn.split("。"):
                 contenteng += translate(element) + '. '
-    except Exception as e:
+    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout,
+            PyPDF2.errors.PdfReadError, PyPDF2.errors.DependencyError) as e:
         print(f"An unexpected error occurred: {e}")
     article['content'] = repr(contenteng)[1:-1].strip()
     try:
@@ -702,5 +705,4 @@ def crawl_by_url(url, article):
     vectorize(article)
     openai_vectorize(article)
 
-
 data = download_files_from_s3('data')
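
The sketch below is a minimal, self-contained illustration of the two behaviours this commit adds to crawl_by_url: the early return on missing or very short content, and the narrowed except clause. build_article and translate_to_english are hypothetical stand-ins for the tail of crawl_by_url and its detect()/translate() loop, not code from the repository, and the except tuple assumes PyPDF2 2.x or later, where the PyPDF2.errors module is available.

import requests
import PyPDF2


def translate_to_english(contentcn):
    """Hypothetical stand-in for the detect()/translate() loop in crawl_by_url."""
    # Simulate the upstream translation call timing out.
    raise requests.exceptions.ReadTimeout("translation service timed out")


def build_article(contentcn, article):
    # Change 1: skip the article when the extracted Chinese content is
    # missing or shorter than 10 characters, mirroring the new guard.
    if contentcn is None or len(contentcn) < 10:
        return None
    contenteng = ""
    try:
        contenteng = translate_to_english(contentcn)
    # Change 2: catch only the expected network and PDF failure modes
    # instead of a bare `except Exception`, so real bugs still surface.
    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout,
            PyPDF2.errors.PdfReadError, PyPDF2.errors.DependencyError) as e:
        print(f"An unexpected error occurred: {e}")
    article["content"] = repr(contenteng)[1:-1].strip()
    return article


print(build_article("短", {}))                        # too short: returns None
print(build_article("这是一条足够长的测试内容。", {}))    # timeout is caught and logged

Since requests.exceptions.ReadTimeout is a subclass of RequestException, listing both is redundant but harmless; the practical effect of the narrower tuple is that unexpected errors such as KeyError or AttributeError now propagate instead of being silently printed.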