"""Module to upsert data into AstraDB"""
import os
import logging
import uuid

import pandas as pd
from langchain_astradb import AstraDBVectorStore
from langchain_openai import AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DataFrameLoader

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO)

ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
ASTRA_DB_API_ENDPOINT = os.environ['ASTRA_DB_API_ENDPOINT']

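# Embeddings are served by Azure OpenAI; the API key is picked up from the
# environment (AZURE_OPENAI_API_KEY) rather than passed explicitly here.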
embedding = AzureOpenAIEmbeddings(
    api_version="2024-07-01-preview",
    azure_endpoint="https://openai-oe.openai.azure.com/")

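# Vector store backed by the "FinFast_China" collection in the default
# keyspace; documents added here are embedded with the model configured above.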
vstore = AstraDBVectorStore(embedding=embedding,
                            namespace="default_keyspace",
                            collection_name="FinFast_China",
                            token=ASTRA_DB_APPLICATION_TOKEN,
                            api_endpoint=ASTRA_DB_API_ENDPOINT)

def vectorize(article):
    """
    Vectorize the given article and upsert the resulting chunks into AstraDB.

    Parameters:
    article (dict): The article record to be processed; converted to a
        DataFrame internally.

    Returns:
    None
    """
    article['id'] = str(article['id'])
    df = pd.DataFrame(article)
    df = df[['id', 'site', 'title', 'titleCN', 'category', 'author',
             'content', 'publishDate', 'link']]
    # df['sentimentScore'] = df['sentimentScore'].round(2)
    # df['sentimentScore'] = df['sentimentScore'].astype(float)
    df['publishDate'] = pd.to_datetime(df['publishDate'])
    loader = DataFrameLoader(df, page_content_column="content")
    documents = loader.load()
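    # Split the article body into ~800-character chunks with a 20-character
    # overlap so context carries across chunk boundaries.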
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )

    chunks = text_splitter.split_documents(documents)
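    # Build a deterministic ID per chunk (article id + UUIDv5 of the chunk
    # text) so re-running the pipeline upserts rather than inserting duplicates.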
    ids = []
    for chunk in chunks:
        _id = f"{chunk.metadata['id']}-{uuid.uuid5(uuid.NAMESPACE_OID, chunk.page_content)}"
        ids.append(_id)
    inserted_ids = vstore.add_documents(chunks, ids=ids)
    logging.info("Upserted %d chunks: %s", len(inserted_ids), inserted_ids)
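
if __name__ == "__main__":
    # Minimal smoke test; a sketch only. The record below is a hypothetical
    # placeholder, and running this performs a real upsert against the
    # configured AstraDB collection, so valid credentials are required.
    sample_article = {
        'id': 12345,  # cast to str inside vectorize()
        'site': ['Example Site'],
        'title': ['Sample headline'],
        'titleCN': ['示例标题'],
        'category': ['markets'],
        'author': ['Jane Doe'],
        'content': ['Placeholder article body used only to exercise the '
                    'chunking and upsert path.'],
        'publishDate': ['2024-01-01'],
        'link': ['https://example.com/articles/12345'],
    }
    vectorize(sample_article)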