gavinzli committed
Commit 29d3eca · 1 Parent(s): b4bd94d

Add validation for content length and enhance error handling in crawl_by_url function

Files changed (1): controllers/utils.py +4 -2
controllers/utils.py CHANGED
@@ -660,6 +660,8 @@ def crawl_by_url(url, article):
     page = etree.HTML(html_text)
     contentcn, summary = encode_content(
         page.xpath(xpath_dict[domain]['content']))
+    if contentcn is None or len(contentcn) < 10:
+        return
     article['originSite'] = xpath_dict[domain]['siteCN']
     article['site'] = xpath_dict[domain]['site']
     article['titleCN'] = encode(page.xpath(xpath_dict[domain]['title']))
@@ -679,7 +681,8 @@ def crawl_by_url(url, article):
         if detect(contenteng) != 'en':
             for element in contentcn.split("。"):
                 contenteng += translate(element) + '. '
-    except Exception as e:
+    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout,
+            PyPDF2.errors.PdfReadError, PyPDF2.errors.DependencyError) as e:
         print(f"An unexpected error occurred: {e}")
     article['content'] = repr(contenteng)[1:-1].strip()
     try:
@@ -702,5 +705,4 @@ def crawl_by_url(url, article):
     vectorize(article)
     openai_vectorize(article)
 
-
 data = download_files_from_s3('data')
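
The sketch below is a minimal, self-contained illustration of the two behaviours this commit adds to crawl_by_url: the early return on missing or very short content, and the narrowed except clause. build_article and translate_to_english are hypothetical stand-ins for the tail of crawl_by_url and its detect()/translate() loop, not code from the repository, and the except tuple assumes PyPDF2 2.x or later, where the PyPDF2.errors module is available.

import requests
import PyPDF2


def translate_to_english(contentcn):
    """Hypothetical stand-in for the detect()/translate() loop in crawl_by_url."""
    # Simulate the upstream translation call timing out.
    raise requests.exceptions.ReadTimeout("translation service timed out")


def build_article(contentcn, article):
    # Change 1: skip the article when the extracted Chinese content is
    # missing or shorter than 10 characters, mirroring the new guard.
    if contentcn is None or len(contentcn) < 10:
        return None
    contenteng = ""
    try:
        contenteng = translate_to_english(contentcn)
    # Change 2: catch only the expected network and PDF failure modes
    # instead of a bare `except Exception`, so real bugs still surface.
    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout,
            PyPDF2.errors.PdfReadError, PyPDF2.errors.DependencyError) as e:
        print(f"An unexpected error occurred: {e}")
    article["content"] = repr(contenteng)[1:-1].strip()
    return article


print(build_article("短", {}))                        # too short: returns None
print(build_article("这是一条足够长的测试内容。", {}))    # timeout is caught and logged

Since requests.exceptions.ReadTimeout is a subclass of RequestException, listing both is redundant but harmless; the practical effect of the narrower tuple is that unexpected errors such as KeyError or AttributeError now propagate instead of being silently printed.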