gavinzli committed
Commit 693e166 · 1 Parent(s): 1269de7

Add reference ID extraction and implement retry logic for document addition

controllers/utils.py CHANGED
@@ -18,11 +18,11 @@ from dotenv import load_dotenv
 from deep_translator import GoogleTranslator, exceptions
 from langdetect import detect
 from lxml import etree
-from PyPDF2 import PdfReader
+import PyPDF2
 from transformers import pipeline
 
 from controllers.summarizer import summarize
-from controllers.vectorizer import vectorize
+from controllers.vectorizer import vectorize, openai_vectorize
 
 load_dotenv()
 
@@ -47,7 +47,7 @@ def datemodifier(date_string, date_format):
     """Date Modifier Function
 
     This function takes a date string and a date format as input and modifies the date string
-    according to the specified format. It returns the modified date string in the format 'YYYY-MM-DD'.
+    according to the specified format. It returns modified date string in the format 'YYYY-MM-DD'.
 
     Args:
         date_string (str): The date string to be modified.
@@ -70,7 +70,8 @@ def encode(content):
     Encodes the given content into a single string.
 
     Args:
-        content (list): A list of elements to be encoded. Each element can be either a string or an `etree._Element` object.
+        content (list): A list of elements to be encoded.
+        Each element can be either a string or an `etree._Element` object.
 
     Returns:
         str: The encoded content as a single string.
@@ -138,7 +139,8 @@ def fetch_url(url):
         otherwise None.
 
     Raises:
-        requests.exceptions.RequestException: If there is an error while making the request or if the response status code is not 200.
+        requests.exceptions.RequestException:
+            If there is an error while making the request or if the response status code is not 200.
     """
     try:
         response = requests.get(url, timeout=60)
@@ -161,7 +163,7 @@ def translate(text):
     Returns:
         str: The translated text in English.
     """
-    for i in range(5):
+    for _ in range(5):
         try:
             return GoogleTranslator(source='auto', target='en').translate(text)
         except exceptions.RequestError:
@@ -178,7 +180,9 @@ def sentiment_computation(content):
         content (str): The content for which sentiment needs to be computed.
 
     Returns:
-        tuple: A tuple containing the sentiment score and label. The sentiment score is a float representing the overall sentiment score of the content. The sentiment label is a string representing the sentiment label ('+', '-', or '0').
+        tuple: A tuple containing the sentiment score and label.
+        The sentiment score is a float representing the overall sentiment score of the content.
+        The sentiment label is a string representing the sentiment label ('+', '-', or '0').
 
     """
     label_dict = {
@@ -230,22 +234,29 @@ def update_content(report):
     """
     print("Updating content for %s", report['id'])
    dynamodb = get_client_connection()
-    response = dynamodb.update_item(
+    dynamodb.update_item(
        TableName="article_china",
        Key={
            'id': {
                'S': str(report['id'])
-            },
-            'site': {
-                'S': report['site']
            }
+            # 'site': {
+            #     'S': report['site']
+            # }
        },
        UpdateExpression=
-        'SET title = :title, titleCN = :titleCN, contentCN = :contentCN, category = :category, author = :author, content = :content, subtitle = :subtitle, publishDate = :publishDate, link = :link, attachment = :attachment, sentimentScore = :sentimentScore, sentimentLabel = :sentimentLabel, LastModifiedDate = :LastModifiedDate',
+        'SET title = :title, site = :site, titleCN = :titleCN, contentCN = :contentCN, \
+            category = :category, author = :author, content = :content, subtitle = :subtitle, \
+            publishDate = :publishDate, link = :link, attachment = :attachment, \
+            sentimentScore = :sentimentScore, sentimentLabel = :sentimentLabel, \
+            LastModifiedDate = :LastModifiedDate',
        ExpressionAttributeValues={
            ':title': {
                'S': report['title']
            },
+            ':site': {
+                'S': report['site']
+            },
            ':titleCN': {
                'S': report['titleCN']
            },
@@ -287,6 +298,7 @@ def update_content(report):
            }
        })
    vectorize(report)
+    openai_vectorize(report)
 
 
 def update_reference(report):
@@ -334,7 +346,7 @@ def download_files_from_s3(folder):
        folder (str): The folder in the S3 bucket to download files from.
 
    Returns:
-        pandas.DataFrame: A concatenated DataFrame containing the data from the downloaded Parquet files.
+        pandas.DataFrame: A concatenated DataFrame containing data from downloaded Parquet files.
    """
    if not os.path.exists(folder):
        os.makedirs(folder)
@@ -360,7 +372,7 @@ def extract_from_pdf_by_pattern(url, pattern):
 
    Args:
        url (str): The URL of the PDF file to extract text from.
-        pattern (dict): A dictionary containing the pattern to match and the pages to extract text from.
+        pattern (dict): A dictionary containing pattern to match and the pages to extract text from.
 
    Returns:
        str: The extracted text from the PDF file.
@@ -379,7 +391,7 @@ def extract_from_pdf_by_pattern(url, pattern):
 
        # Open the downloaded PDF file and extract the text
        with open("downloaded_file.pdf", "rb") as file:
-            pdf_reader = PdfReader(file)
+            pdf_reader = PyPDF2.PdfReader(file)
            extracted_text = ""
            if 'pages' in pattern:
                pages = pattern['pages']
@@ -392,7 +404,8 @@ def extract_from_pdf_by_pattern(url, pattern):
                    else:
                        text = text.strip()
                    extracted_text += text
-    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout) as e:
+    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout,
+            PyPDF2.errors.PdfReadError) as e:
        logging.error(e)
        extracted_text = ''
    return extracted_text.replace('?\n', '?-\n').replace(
@@ -423,7 +436,7 @@ def isnot_substring(list_a, string_to_check):
        string_to_check (str): The string to check for substrings.
 
    Returns:
-        bool: True if none of the strings in list_a are substrings of string_to_check, False otherwise.
+        bool: True if none of strings in list_a are substrings of string_to_check, False otherwise.
    """
    return all(s not in string_to_check for s in list_a)
 
@@ -454,6 +467,7 @@ def extract_reference(row):
                s.replace(remove_string, '') for s in reference_titles
            ]
            if len(reference_dates) > 0:
+                reference_ids = []
                for title, date in zip(reference_titles, reference_dates):
                    try:
                        date = datetime.strptime(date, pattern['date_format'])
@@ -497,6 +511,7 @@ def extract_reference(row):
                                            & (data['site'] == row['site']) &
                                            (data['publishdate'].isin(dates))]
                        row['referenceID'] = reference_df.iloc[0]['id']
+                        reference_ids.append(row['referenceID'])
                        row['link'] = reference_df.iloc[0]['link']
                        row['sourceID'] = row['id']
                        row['refID'] = uuid.uuid5(
@@ -505,7 +520,9 @@ def extract_reference(row):
                        logging.info("%s - %s - %s - %s",
                                     date, repr(title), row['sourceID'], row['referenceID'])
                        update_reference(row)
+                return reference_ids
            else:
+                reference_ids = []
                for title in reference_titles:
                    if 'split' in pattern:
                        for split_item in pattern['split']:
@@ -533,6 +550,7 @@ def extract_reference(row):
                        reference_df = data[(data['titleCN'].str.contains(title))
                                            & (data['site'] == row['site'])]
                        row['referenceID'] = reference_df.iloc[0]['id']
+                        reference_ids.append(row['referenceID'])
                        row['link'] = reference_df.iloc[0]['link']
                        row['sourceID'] = row['id']
                        row['refID'] = uuid.uuid5(
@@ -541,6 +559,7 @@ def extract_reference(row):
                        logging.info("%s - %s - %s", repr(title), row['sourceID'],
                                     row['referenceID'])
                        update_reference(row)
+                return reference_ids
    except (ValueError, KeyError, TypeError) as error:
        logging.error(error)
        return None
@@ -584,26 +603,25 @@ def extract_from_pdf(url):
        file.write(pdf_content)
 
    # Open the downloaded PDF file and extract the text
-    with open("downloaded_file.pdf", "rb") as file:
-        pdf_reader = PdfReader(file)
-        num_pages = len(pdf_reader.pages)
-        extracted_text = ""
-        for page in range(num_pages):
-            text = pdf_reader.pages[page].extract_text()
-            if text and text[0].isdigit():
-                text = text[1:]
-            # first_newline_index = text.find('。\n')
-            # text = text[:first_newline_index+1].replace('\n', '') + text[first_newline_index+1:]
-            text = text.replace('?\n', '?-\n').replace('!\n', '!-\n').replace(
-                '。\n', '。-\n').replace('\n', '').replace('?-', '?\n').replace(
-                '!-', '!\n').replace('。-', '。\n')
-            if text != '':
-                extracted_text += text
-    try:
-        summary = '\n'.join(extracted_text.split('\n')[:2])
-    except (ValueError, KeyError, TypeError) as e:
-        logging.error(e)
-        summary = extracted_text
+    extracted_text = ""
+    try:
+        with open("downloaded_file.pdf", "rb") as file:
+            pdf_reader = PyPDF2.PdfReader(file)
+            num_pages = len(pdf_reader.pages)
+            for page in range(num_pages):
+                text = pdf_reader.pages[page].extract_text()
+                if text and text[0].isdigit():
+                    text = text[1:]
+                # first_newline_index = text.find('。\n')
+                text = text.replace('?\n', '?-\n').replace('!\n', '!-\n').replace(
+                    '。\n', '。-\n').replace('\n', '').replace('?-', '?\n').replace(
+                    '!-', '!\n').replace('。-', '。\n')
+                if text != '':
+                    extracted_text += text
+        summary = '\n'.join(extracted_text.split('\n')[:2])
+    except (ValueError, KeyError, TypeError, PyPDF2.errors.PdfReadError) as e:
+        logging.error(e)
+        summary = extracted_text
    return extracted_text, summary
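The hunks above also change how PDF failures surface: a corrupt or truncated download now raises PyPDF2.errors.PdfReadError inside the helpers, where it is caught and logged, so a caller of extract_from_pdf receives whatever text was accumulated (possibly empty strings) instead of an uncaught exception. A minimal, hypothetical caller-side sketch (the URL is illustrative, not from the repository):

    # Hypothetical usage sketch, not part of the commit.
    from controllers.utils import extract_from_pdf

    text, summary = extract_from_pdf("https://example.com/some-report.pdf")  # illustrative URL
    if not text:
        # A PdfReadError (or request failure) was logged inside the helper and the
        # function returned empty strings rather than raising.
        print("PDF could not be parsed or contained no text")

The same file now writes site as a regular SET attribute in update_content instead of using it as part of the DynamoDB key, and extract_reference collects and returns the matched reference IDs rather than discarding them; source/eastmoney.py below consumes that return value.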
 
controllers/vectorizer.py CHANGED
@@ -2,12 +2,15 @@
 import os
 import logging
 import uuid
+import time
 
+import tiktoken
 import pandas as pd
 from langchain_astradb import AstraDBVectorStore
 from langchain_openai import AzureOpenAIEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import DataFrameLoader
+from astrapy.info import CollectionVectorServiceOptions
 
 logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
@@ -27,6 +30,94 @@ vstore = AstraDBVectorStore(embedding=embedding,
                             token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
                             api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"])
 
+openai_vstore = AstraDBVectorStore(
+    collection_vector_service_options=CollectionVectorServiceOptions(
+        provider="azureOpenAI",
+        model_name="text-embedding-3-small",
+        authentication={
+            "providerKey": "AZURE_OPENAI_API_KEY",
+        },
+        parameters={
+            "resourceName": "openai-oe",
+            "deploymentId": "text-embedding-3-small",
+        },
+    ),
+    namespace="default_keyspace",
+    collection_name="text_embedding_3_small",
+    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
+    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"])
+
+def token_length(text):
+    """
+    Calculates length of encoded text using the tokenizer for the "text-embedding-3-small" model.
+
+    Args:
+        text (str): The input text to be tokenized and measured.
+
+    Returns:
+        int: The length of the encoded text.
+    """
+    tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
+    return len(tokenizer.encode(text))
+
+def add_documents_with_retry(chunks, ids, max_retries=3):
+    """
+    Attempts to add documents to the vstore with a specified number of retries.
+
+    Parameters:
+        chunks (list): The list of document chunks to be added.
+        ids (list): The list of document IDs corresponding to the chunks.
+        max_retries (int, optional): The maximum number of retry attempts. Default is 3.
+
+    Raises:
+        Exception: If the operation fails after the maximum number of retries, the exception is logged.
+    """
+    for attempt in range(max_retries):
+        try:
+            vstore.add_documents(chunks, ids=ids)
+        except (ConnectionError, TimeoutError) as e:
+            logging.info("Attempt %d failed: %s", attempt + 1, e)
+            if attempt < max_retries - 1:
+                time.sleep(0.5)
+            else:
+                logging.error("Max retries reached. Operation failed.")
+                logging.error(ids)
+
+def openai_vectorize(article):
+    """
+    Process the given article.
+
+    Parameters:
+        article (DataFrame): The article to be processed.
+
+    Returns:
+        None
+    """
+    article['id'] = str(article['id'])
+    if isinstance(article, dict):
+        article = [article]  # Convert single dictionary to list of dictionaries
+    df = pd.DataFrame(article)
+    df = df[['id', 'publishDate', 'author', 'category',
+             'content', 'referenceid', 'site', 'title', 'link']]
+    df['publishDate'] = pd.to_datetime(df['publishDate'])
+    documents = DataFrameLoader(df, page_content_column="content").load()
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=token_length,
+        is_separator_regex=False,
+        separators=["\n\n", "\n", "\t", ".", "?"]  # Logical separators
+    )
+    chunks = text_splitter.split_documents(documents)
+    ids = []
+    for index, chunk in enumerate(chunks):
+        _id = f"{chunk.metadata['id']}-{str(index)}"
+        ids.append(_id)
+    try:
+        add_documents_with_retry(chunks, ids)
+    except (ConnectionError, TimeoutError, ValueError) as e:
+        logging.error("Failed to add documents: %s", e)
+
 def vectorize(article):
     """
     Process the given article.
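For orientation, a hedged usage sketch of the new embedding path (field names mirror the columns openai_vectorize selects; the values and the call site are invented, and the Astra DB / Azure OpenAI environment variables read at import time are assumed to be set):

    # Hypothetical example, not part of the commit; all values are placeholders.
    from controllers.vectorizer import openai_vectorize

    article = {
        'id': 'abc-123',
        'publishDate': '2024-06-01',
        'author': 'Research Desk',
        'category': 'Macroeconomy',
        'content': 'Full article text ...',
        'referenceid': [],           # filled by extract_reference() when a match is found
        'site': 'eastmoney',
        'title': 'Example title',
        'link': 'https://example.com/article',
    }
    openai_vectorize(article)        # splits the content into chunks of up to ~1000 tokens and
                                     # stores them under IDs of the form "<article id>-<chunk index>"

Note that, as committed, add_documents_with_retry does not break out of its loop after a successful vstore.add_documents call, so a success is still followed by the remaining attempts; a return or break on success would avoid the repeated writes.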
source/eastmoney.py CHANGED
@@ -91,7 +91,9 @@ def _crawl(url, article, retries=3):
                                    article['titleCN'] + article['publishDate'])
     article['sentimentScore'], article[
         'sentimentLabel'] = sentiment_computation(contentcn.replace("\n", ""))
-    extract_reference(article)
+    reference_id = extract_reference(article)
+    if reference_id:
+        article['referenceid'] = reference_id
     update_content(article)
 
 @task(name = "Data Collection - eastmoney", log_prints = True)
@@ -136,7 +138,8 @@ def crawl(delta):
             i = i + 1
             for article in reportinfo['data']:
                 try:
-                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
+                    domain = "https://data.eastmoney.com"
+                    url = f"{domain}/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                     _crawl(url, article)
                 except (urllib.error.URLError, json.JSONDecodeError, KeyError) as error:
                     logger.error(error)
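Taken together with the changes above, the reference IDs now flow end to end: _crawl stores the list returned by extract_reference under the lowercase referenceid key, which is one of the columns openai_vectorize reads back when update_content embeds the article. A condensed, hypothetical restatement of that flow (names come from the diffs; the body is simplified for illustration):

    # Simplified illustration only, not literal repository code.
    from controllers.utils import extract_reference, update_content

    def handle(article):
        reference_id = extract_reference(article)   # matched IDs; [] if none, None on parse errors
        if reference_id:                            # both None and [] are skipped
            article['referenceid'] = reference_id   # the column openai_vectorize() reads back
        update_content(article)                     # DynamoDB update, then vectorize() + openai_vectorize()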