Update app.py
Browse files
app.py
CHANGED
@@ -12,6 +12,7 @@ from together import Together
|
|
12 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
13 |
from transformers import AutoTokenizer, AutoModel
|
14 |
import torch
|
|
|
15 |
|
16 |
|
17 |
import streamlit as st
|
@@ -125,47 +126,11 @@ class HuggingFaceEmbeddings(Embeddings):
|
|
125 |
return self.embed_documents([text])[0]
|
126 |
|
127 |
@st.cache_resource
|
128 |
-
def
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
# تقسیم صفحات به دستههای ۵ تایی
|
135 |
-
batch_size = 338 # پردازش ۵ صفحه در هر بار
|
136 |
-
all_texts = []
|
137 |
-
|
138 |
-
progress = st.progress(0) # نوار پیشرفت
|
139 |
-
total_batches = len(pages) // batch_size + (1 if len(pages) % batch_size != 0 else 0) # تعداد دستهها
|
140 |
-
|
141 |
-
# پردازش هر دسته
|
142 |
-
for i in range(0, len(pages), batch_size):
|
143 |
-
batch = pages[i:i + batch_size]
|
144 |
-
batch_text = "\n".join([page.page_content for page in batch])
|
145 |
-
all_texts.append(batch_text)
|
146 |
-
|
147 |
-
# بهروزرسانی نوار پیشرفت
|
148 |
-
progress.progress((i // batch_size + 1) / total_batches)
|
149 |
-
|
150 |
-
time.sleep(0.5) # شبیهسازی زمان پردازش
|
151 |
-
|
152 |
-
# ترکیب تمام متنها برای پردازش بیشتر
|
153 |
-
full_text = "\n".join(all_texts)
|
154 |
-
|
155 |
-
# تقسیم متن به بخشها
|
156 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
157 |
-
chunk_size=1024, # اندازه هر بخش
|
158 |
-
chunk_overlap=128 # همپوشانی بین بخشها
|
159 |
-
)
|
160 |
-
texts = text_splitter.split_text(full_text)
|
161 |
-
|
162 |
-
# ایجاد انتشارات
|
163 |
-
embeddings = HuggingFaceEmbeddings(model_name="FacebookAI/xlm-roberta-large")
|
164 |
-
|
165 |
-
# ایجاد FAISS vector store
|
166 |
-
vector_store = FAISS.from_texts(texts, embeddings)
|
167 |
-
|
168 |
-
return vector_store
|
169 |
index = get_pdf_index()
|
170 |
|
171 |
llm = ChatOpenAI(
|
|
|
12 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
13 |
from transformers import AutoTokenizer, AutoModel
|
14 |
import torch
|
15 |
+
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
|
16 |
|
17 |
|
18 |
import streamlit as st
|
|
|
126 |
return self.embed_documents([text])[0]
|
127 |
|
128 |
@st.cache_resource
def get_pdf_index(pdf_docs='C:/Users/itel/Desktop/your work data.pdf'):
    """Build and cache a vector index over the given PDF file.

    Cached with ``st.cache_resource`` so the embedding model is loaded and
    the PDF is indexed only once per Streamlit session.

    Args:
        pdf_docs: Path to the PDF file to index. Defaults to a hard-coded
            local Windows path — NOTE(review): this will fail on any other
            machine; consider making it a required argument or an app config
            value.

    Returns:
        The index object produced by ``VectorstoreIndexCreator.from_loaders``
        (a LangChain vectorstore index wrapper).
    """
    # from_loaders expects a list of loaders, hence the single-element list.
    loaders = [PyPDFLoader(pdf_docs)]
    # Persian QA embedding model, matching the app's Persian document domain.
    embeddings = HuggingFaceInstructEmbeddings(
        model_name="SajjadAyoubi/xlm-roberta-large-fa-qa"
    )
    # Split pages into 300-character chunks with no overlap before embedding.
    index = VectorstoreIndexCreator(
        embedding=embeddings,
        text_splitter=RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0),
    ).from_loaders(loaders)
    return index


# BUG FIX: the commit renamed this builder to ``get_pdf_text`` but the call
# site below (``index = get_pdf_index()``) was not updated, which raises
# NameError at startup. The function keeps its original name ``get_pdf_index``
# to match the caller; the commit's new name is preserved as an alias.
get_pdf_text = get_pdf_index
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
index = get_pdf_index()
|
135 |
|
136 |
llm = ChatOpenAI(
|