Update app.py
Browse files
app.py
CHANGED
@@ -10,6 +10,8 @@ from langchain.chat_models import ChatOpenAI
|
|
10 |
from typing import List
|
11 |
from together import Together
|
12 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
|
|
|
13 |
|
14 |
|
15 |
import streamlit as st
|
@@ -107,8 +109,8 @@ from transformers import AutoTokenizer, AutoModel
|
|
107 |
|
108 |
class HuggingFaceEmbeddings(Embeddings):
|
109 |
def __init__(self, model_name: str):
|
110 |
-
self.tokenizer = AutoTokenizer.from_pretrained(
|
111 |
-
self.model = AutoModel.from_pretrained(
|
112 |
|
113 |
def embed_documents(self, texts: List[str]) -> List[List[float]]:
|
114 |
embeddings = []
|
@@ -125,24 +127,27 @@ class HuggingFaceEmbeddings(Embeddings):
|
|
125 |
@st.cache_resource
|
126 |
def get_pdf_index():
|
127 |
with st.spinner('📄 در حال پردازش فایل PDF...'):
|
|
|
128 |
loader = PyPDFLoader('test1.pdf')
|
129 |
pages = loader.load()
|
|
|
|
|
130 |
full_text = "\n".join([page.page_content for page in pages])
|
|
|
|
|
131 |
text_splitter = RecursiveCharacterTextSplitter(
|
132 |
-
chunk_size=1024,
|
133 |
-
chunk_overlap=128
|
134 |
)
|
135 |
texts = text_splitter.split_text(full_text)
|
136 |
|
137 |
-
|
138 |
-
|
139 |
-
)
|
140 |
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
index = get_pdf_index()
|
147 |
|
148 |
llm = ChatOpenAI(
|
|
|
10 |
from typing import List
|
11 |
from together import Together
|
12 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
13 |
+
from transformers import AutoTokenizer, AutoModel
|
14 |
+
import torch
|
15 |
|
16 |
|
17 |
import streamlit as st
|
|
|
109 |
|
110 |
class HuggingFaceEmbeddings(Embeddings):
|
111 |
def __init__(self, model_name: str):
    """Load the tokenizer and model used to embed text.

    Args:
        model_name: Checkpoint identifier handed unchanged to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
    """
    # Tokenizer and model are loaded from the same checkpoint name.
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModel.from_pretrained(model_name)
    self.tokenizer = loaded_tokenizer
    self.model = loaded_model
|
114 |
|
115 |
def embed_documents(self, texts: List[str]) -> List[List[float]]:
|
116 |
embeddings = []
|
|
|
127 |
@st.cache_resource
def get_pdf_index(
    pdf_path: str = 'test1.pdf',
    *,
    model_name: str = "FacebookAI/xlm-roberta-large",
    chunk_size: int = 1024,
    chunk_overlap: int = 128,
):
    """Build a FAISS vector store from the text of a PDF file.

    The PDF is loaded with ``PyPDFLoader``, the text of all pages is joined
    with newlines, split into overlapping chunks, embedded with
    ``HuggingFaceEmbeddings`` and indexed with ``FAISS.from_texts``.

    Called with no arguments, this behaves exactly as the previous
    hard-coded version did; the parameters only expose values that were
    formerly literals in the body.

    Args:
        pdf_path: Path of the PDF file to index.
        model_name: Model name passed to ``HuggingFaceEmbeddings``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The vector store returned by ``FAISS.from_texts``.
    """
    # The spinner message is user-facing text and is kept verbatim.
    with st.spinner('📄 در حال پردازش فایل PDF...'):
        # Load the PDF file
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        # Merge the text of every page into one string so the splitter
        # can form chunks that cross page boundaries.
        full_text = "\n".join(page.page_content for page in pages)

        # Split the text into overlapping chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        texts = text_splitter.split_text(full_text)

        # Create embeddings
        embeddings = HuggingFaceEmbeddings(model_name=model_name)

        # Create FAISS vector store
        vector_store = FAISS.from_texts(texts, embeddings)

        return vector_store
|
|
|
151 |
index = get_pdf_index()
|
152 |
|
153 |
llm = ChatOpenAI(
|