Add validation for content length and enhance error handling in crawl_by_url function
controllers/utils.py (+4, -2)

@@ -660,6 +660,8 @@ def crawl_by_url(url, article):
         page = etree.HTML(html_text)
         contentcn, summary = encode_content(
             page.xpath(xpath_dict[domain]['content']))
+        if contentcn is None or len(contentcn) < 10:
+            return
         article['originSite'] = xpath_dict[domain]['siteCN']
         article['site'] = xpath_dict[domain]['site']
         article['titleCN'] = encode(page.xpath(xpath_dict[domain]['title']))
@@ -679,7 +681,8 @@ def crawl_by_url(url, article):
         if detect(contenteng) != 'en':
             for element in contentcn.split("。"):
                 contenteng += translate(element) + '. '
-    except Exception as e:
+    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout,
+            PyPDF2.errors.PdfReadError, PyPDF2.errors.DependencyError) as e:
         print(f"An unexpected error occurred: {e}")
     article['content'] = repr(contenteng)[1:-1].strip()
     try:
@@ -702,5 +705,4 @@ def crawl_by_url(url, article):
     vectorize(article)
     openai_vectorize(article)
 
-
 data = download_files_from_s3('data')
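For context, a minimal, self-contained sketch of the early-exit guard added in the first hunk. The real code runs lxml xpath results through encode_content; the extract_content helper below is a hypothetical stand-in for that pipeline, and only the guard itself mirrors the diff.

# extract_content is a hypothetical stand-in for the encode_content/xpath
# pipeline in controllers/utils.py; only the guard mirrors the commit.
def extract_content(html_text):
    text = html_text.strip()
    return text or None

def crawl(html_text, article):
    contentcn = extract_content(html_text)
    # New validation: skip pages whose extracted body is missing or shorter
    # than 10 characters, instead of storing a near-empty article.
    if contentcn is None or len(contentcn) < 10:
        return None
    article['contentCN'] = contentcn
    return article

print(crawl("", {}))                              # None: nothing extracted
print(crawl("短文", {}))                          # None: body too short
print(crawl("这是一篇长度足够的文章正文内容。", {}))  # populated article dict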
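The second hunk replaces a blanket exception handler with a tuple of the specific failures the crawler actually hits: network errors from requests and parse failures from PyPDF2. Below is a hedged sketch of the same pattern in isolation; fetch_pdf_text and its body are illustrative, while the exception tuple and the log line come from the diff.

import io

import requests
import PyPDF2

def fetch_pdf_text(url):
    # Illustrative helper, not part of the repo: fetch a PDF and degrade
    # gracefully on the same failure classes the commit now catches.
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        reader = PyPDF2.PdfReader(io.BytesIO(resp.content))
        return "".join(page.extract_text() or "" for page in reader.pages)
    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout,
            PyPDF2.errors.PdfReadError, PyPDF2.errors.DependencyError) as e:
        print(f"An unexpected error occurred: {e}")
        return ""

Note that requests.exceptions.ReadTimeout already subclasses RequestException, so listing it separately is redundant but harmless. The meaningful shift is that programming errors such as KeyError or AttributeError are no longer swallowed and will surface normally.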