|
|
|
|
|
|
|
import tiktoken

# Shared tokenizer for the 'cl100k_base' encoding. tiktoken_len() uses it to
# measure text length in tokens.
tokenizer = tiktoken.get_encoding('cl100k_base')
|
def tiktoken_len(text: str) -> int:
    """Return the number of tokens in *text*.

    The text is encoded with the module-level ``tokenizer`` and the length
    of the resulting token sequence is returned.
    """
    return len(tokenizer.encode(text))
|
|
|
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings

# Load the source PDF straight from its URL.
loader = PyPDFLoader('https://wdr.ubion.co.kr/wowpass/img/event/gsat_170823/gsat_170823.pdf')

# NOTE(review): load_and_split() already runs a splitter over the loaded
# pages, and the result is split again later with the token-based splitter.
# Presumably loader.load() would be enough here -- confirm before changing,
# since it may shift the chunk boundaries.
pages = loader.load_and_split()
|
|
|
|
|
# Chunk lengths are measured with tiktoken_len (tokens, not characters), so
# chunk_size and chunk_overlap below are token counts.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=80,
    length_function=tiktoken_len,
)

# Re-chunk the loaded pages into the documents that will be embedded.
sourceDocs = text_splitter.split_documents(pages)
|
|
|
|
|
|
|
|
|
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model: run on CPU, with normalized embeddings requested from the
# encoder.
model_huggingface = HuggingFaceEmbeddings(
    model_name='jhgan/ko-sroberta-multitask',
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

# Embed every chunk and index it in a Chroma vector store.
db = Chroma.from_documents(sourceDocs, model_huggingface)
|
|
|
|
|
# Query text, in Korean: "What are Samsung Electronics' main business areas?"
# NOTE(review): the literal was mis-encoded (UTF-8 Korean decoded through a
# Thai single-byte code page) and broken across two lines, which left an
# unterminated string. It is reconstructed here -- confirm the exact wording.
question = '삼성전자의 주요 사업영역은?'

# k=1 asks for a single result; each entry pairs a document with its
# relevance score.
docs3 = db.similarity_search_with_relevance_scores(question, k=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
# docs3 holds (document, score) pairs; only the document text is needed for
# the prompt context, so the scores are discarded here.
joinDoc = ' '.join(document.page_content for document, _score in docs3)
print(joinDoc)
|
|
|
|
|
|
|
|
|
from langchain_community.chat_models import ChatOllama

# Chat model served by an Ollama instance at the local URL below.
llm = ChatOllama(
    base_url='http://localhost:11434',
    model="phi3:mini",
)
|
|
|
from langchain_core.prompts import ChatPromptTemplate

# Two-message prompt: the system message carries the retrieved text via
# {document}; the user message carries the query via {question}.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Please answer the following question from the document: {document}"),
    ("user", "{question}"),
])
|
|
|
# Visual separator between the retrieved context and the model's answer.
print('-'*50)

# Pipe the filled-in prompt into the chat model.
chain = prompt | llm

# NOTE(review): this prints whatever object invoke() returns; presumably a
# message object whose text is in `.content` -- confirm if only the answer
# text should be shown.
print(chain.invoke({"question": question, "document": joinDoc}))