Update app.py
Browse files
app.py
CHANGED
@@ -107,23 +107,48 @@ class TogetherEmbeddings(Embeddings):
|
|
107 |
@st.cache_resource
def get_pdf_index():
    """Build the vector-store index for ``test1.pdf``.

    The function is wrapped in ``st.cache_resource``, so Streamlit builds
    the index once and reuses the returned object on later reruns.

    Returns:
        The index object produced by
        ``VectorstoreIndexCreator(...).from_loaders(...)``.

    Raises:
        RuntimeError: If the ``TOGETHER_API_KEY`` environment variable is
            missing or empty.
    """
    # Local import keeps this block self-contained; the file's import
    # section is not visible from here.
    import os

    # Credentials must not live in source control: read the key from the
    # environment (e.g. a deployment secret) and fail loudly if it is absent.
    api_key = os.environ.get("TOGETHER_API_KEY")
    if not api_key:
        raise RuntimeError(
            "TOGETHER_API_KEY is not set; provide it as an environment "
            "variable or deployment secret instead of hard-coding it."
        )

    with st.spinner('📄 در حال پردازش فایل PDF...'):
        loader = [PyPDFLoader('test1.pdf')]
        embeddings = TogetherEmbeddings(
            model_name="togethercomputer/m2-bert-80M-32k-retrieval",
            api_key=api_key,
        )
        return VectorstoreIndexCreator(
            embedding=embeddings,
            text_splitter=RecursiveCharacterTextSplitter(
                chunk_size=124,
                chunk_overlap=25,
            ),
        ).from_loaders(loader)
|
126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
index = get_pdf_index()
|
128 |
|
129 |
llm = ChatOpenAI(
|
|
|
107 |
@st.cache_resource
def get_pdf_index():
    """Load ``test1.pdf``, split it into chunks, and build a vector index.

    The function is wrapped in ``st.cache_resource``, so Streamlit builds
    the index once and reuses the returned object on later reruns.

    Steps:
        1. Load every page of the PDF through ``PyPDFLoader``.
        2. Split the pages into chunks of at most ``chunk_size`` characters.
        3. Embed the chunks with ``TogetherEmbeddings`` and index them via
           ``VectorstoreIndexCreator.from_documents``.

    Returns:
        The index object produced by
        ``VectorstoreIndexCreator(...).from_documents(...)``.

    Raises:
        RuntimeError: If the ``TOGETHER_API_KEY`` environment variable is
            missing or empty.
    """
    # Local import keeps this block self-contained; the file's import
    # section is not visible from here.
    import os

    # Single source of truth for the splitter settings (previously the same
    # literals were repeated in two separate splitter constructions).
    chunk_size = 2000
    chunk_overlap = 25

    # Credentials must not live in source control: read the key from the
    # environment (e.g. a deployment secret) and fail loudly if it is absent.
    api_key = os.environ.get("TOGETHER_API_KEY")
    if not api_key:
        raise RuntimeError(
            "TOGETHER_API_KEY is not set; provide it as an environment "
            "variable or deployment secret instead of hard-coding it."
        )

    with st.spinner('📄 در حال پردازش فایل PDF...'):
        # Load all pages from every configured PDF loader.
        loaders = [PyPDFLoader('test1.pdf')]
        pages = [page for pdf_loader in loaders for page in pdf_loader.load()]

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        # split_documents() copies each page's metadata (source, page number)
        # onto its chunks. The previous hand-rolled loop rebuilt documents
        # with page_content only, silently dropping that metadata for every
        # page longer than chunk_size.
        processed_pages = splitter.split_documents(pages)

        embeddings = TogetherEmbeddings(
            model_name="togethercomputer/m2-bert-80M-32k-retrieval",
            api_key=api_key,
        )

        # Build the index from the pre-split chunks. The creator is handed
        # the same splitter settings as before.
        # NOTE(review): from_documents is expected to run text_splitter over
        # its input again, which should be a no-op for chunks already within
        # chunk_size - confirm against the installed LangChain version.
        return VectorstoreIndexCreator(
            embedding=embeddings,
            text_splitter=splitter,
        ).from_documents(processed_pages)
|
150 |
+
|
151 |
+
# Build the PDF index at import time; get_pdf_index is wrapped in st.cache_resource.
|
152 |
index = get_pdf_index()
|
153 |
|
154 |
llm = ChatOpenAI(
|