File size: 788 Bytes
a409078
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import bs4
import os
from langchain_text_splitters import CharacterTextSplitter
import requests
import streamlit as st
import sys
from vectordb import add_image_to_index, add_pdf_to_index, update_vectordb

# Make this file's own directory importable by appending it to sys.path.
# NOTE(review): this runs after the `from vectordb import ...` line above, so
# it cannot help that import resolve — confirm whether it should come first.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))


def process_text(text: str, text_embedding_model):
    """Split *text* into chunks, embed them, and store each in the text index.

    The text is split on newline boundaries with a target chunk size of 1200
    characters and a 200-character overlap. All chunks are embedded in one
    ``encode`` call, then each (chunk, embedding) pair is passed to
    ``update_vectordb`` against ``"text_index.index"``.

    Args:
        text: Raw text to index.
        text_embedding_model: Object exposing ``encode(chunks)`` that yields
            one embedding per chunk, in order (presumably a
            sentence-transformers style model — confirm against callers).

    Returns:
        The value returned by ``update_vectordb`` for the last chunk, or
        ``None`` when *text* produces no chunks (e.g. empty input).
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1200,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = text_splitter.split_text(text)
    if not chunks:
        # Nothing to embed or store. Previously this path fell through to
        # `return index` with `index` unbound and raised UnboundLocalError.
        return None

    text_embeddings = text_embedding_model.encode(chunks)

    # Bound up front so the return is well-defined even if the encoder
    # unexpectedly yields fewer embeddings than chunks (zip would stop early).
    index = None
    for chunk, embedding in zip(chunks, text_embeddings):
        index = update_vectordb(
            index_path="text_index.index",
            embedding=embedding,
            text_content=chunk,
        )
    return index