File size: 2,020 Bytes
4e18ce3 62774df 4e18ce3 62774df 4e18ce3 62774df 4e18ce3 62774df 1177da7 62774df 4e18ce3 ba224f2 99b66e9 62774df 2dbc5e6 741bcee 62774df |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
"""Module to upsert data into AstraDB"""
import os
import logging
import pandas as pd
from langchain_astradb import AstraDBVectorStore
from langchain_openai import AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader
# Configure the root logger at import time so every log line carries a
# timestamp, the level and the name of the emitting function.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO)

# Read the AstraDB credentials once. A missing variable raises KeyError here,
# at import time, rather than later when the store is first used.
ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
ASTRA_DB_API_ENDPOINT = os.environ['ASTRA_DB_API_ENDPOINT']

# Embedding model used to vectorize document chunks.
# NOTE(review): no API key or deployment name is passed here; presumably they
# are picked up from the environment by the client -- confirm.
embedding = AzureOpenAIEmbeddings(
    api_version="2024-07-01-preview",
    azure_endpoint="https://openai-oe.openai.azure.com/")

# Vector store handle shared by the functions in this module. It reuses the
# constants above instead of looking the environment up a second time.
vstore = AstraDBVectorStore(
    embedding=embedding,
    namespace="default_keyspace",
    collection_name="finfast_china",
    token=ASTRA_DB_APPLICATION_TOKEN,
    api_endpoint=ASTRA_DB_API_ENDPOINT)
def vectorize(article):
    """
    Split an article into chunks and add them to the AstraDB vector store.

    The article's ``content`` becomes the document text; the other selected
    fields travel with each chunk as metadata. The ids returned by the store
    are logged at INFO level.

    Parameters:
    article (dict or DataFrame): A single article as a dict of field values,
        or a DataFrame with one row per article. It must provide every field
        listed in ``columns`` below. When a dict is passed, its ``id`` entry
        is converted to ``str`` in place.

    Returns:
    None

    Raises:
    KeyError: If a required field is missing from the article.
    """
    columns = ['id', 'site', 'title', 'titleCN', 'category', 'author',
               'content', 'publishDate', 'link', 'attachment',
               'sentimentScore', 'sentimentLabel']

    if isinstance(article, dict):
        # Kept from the original: the caller's dict ends up with a str id.
        article['id'] = str(article['id'])
        # Wrap in a list so the dict becomes exactly one row. A bare dict of
        # scalar values is rejected by the DataFrame constructor.
        df = pd.DataFrame([article])
    else:
        df = pd.DataFrame(article)

    # Work on an independent copy so the assignments below neither touch the
    # caller's frame nor trigger chained-assignment warnings.
    df = df[columns].copy()

    # Convert ids element-wise. str() on a whole column would produce the
    # printed representation of the Series rather than per-row ids.
    df['id'] = df['id'].astype(str)
    # Cast before rounding so rounding always operates on a float column,
    # whatever numeric type the scores arrived as.
    df['sentimentScore'] = df['sentimentScore'].astype(float).round(2)
    df['publishDate'] = pd.to_datetime(df['publishDate'])

    loader = DataFrameLoader(df, page_content_column="content")
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )
    docs = text_splitter.split_documents(documents)

    inserted_ids = vstore.add_documents(docs)
    logging.info(inserted_ids)
|