Update app.py
Browse files
app.py
CHANGED
# ----------------- لود PDF و ساخت ایندکس -----------------
@st.cache_resource  # cache: build the index once per server process, not per rerun
def get_pdf_index():
    """Load 'test1.pdf', embed its text with Together embeddings, and return a vector-store index.

    Returns:
        The index object produced by ``VectorstoreIndexCreator.from_documents``,
        ready for retrieval/QA over the PDF's contents.

    Notes (review):
        - The previous revision stacked ``@st.cache_resource`` twice; once is enough.
        - It also called ``model.encode(...)`` (a sentence-transformers API) on a
          ``TogetherEmbeddings`` object, which has no such method, and passed the
          resulting array to ``from_texts(texts, embeddings=...)`` — not a valid
          ``VectorstoreIndexCreator`` signature. The creator drives splitting and
          embedding itself, so the manual batching loop, the ``np.array`` round
          trip, and the ``time.sleep(1)`` for a progress bar that no longer
          exists were all dead code and have been removed.
    """
    import os  # local import: key lookup only needed here

    with st.spinner('📄 در حال پردازش فایل PDF...'):
        # Load the PDF and split it into per-page Document objects.
        loader = PyPDFLoader('test1.pdf')
        documents = loader.load_and_split()

        # Embedding model.
        # SECURITY: a live Together API key was hard-coded here and committed;
        # it must be considered leaked and rotated. Supply it via the
        # environment (or st.secrets) instead of the source tree.
        model = TogetherEmbeddings(
            model_name="togethercomputer/m2-bert-80M-8k-retrieval",
            api_key=os.environ.get("TOGETHER_API_KEY", ""),
        )

        # Same chunking parameters as before: 300-char chunks, no overlap.
        splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0)

        # VectorstoreIndexCreator splits the documents with `splitter` and
        # embeds each chunk with `model` internally — TODO confirm the installed
        # langchain version uses the `embedding=` kwarg (older releases did not
        # accept `embedding_function=`).
        return VectorstoreIndexCreator(
            embedding=model,
            text_splitter=splitter,
        ).from_documents(documents)
|
144 |
# ----------------- تعریف LLM از Groq -----------------
|
145 |
llm = ChatOpenAI(
|