Update app.py
Browse files
app.py
CHANGED
@@ -107,48 +107,58 @@ class TogetherEmbeddings(Embeddings):
|
|
107 |
@st.cache_resource
|
108 |
def get_pdf_index():
|
109 |
with st.spinner('📄 در حال پردازش فایل PDF...'):
|
110 |
-
# لود
|
111 |
loader = [PyPDFLoader('test1.pdf')]
|
112 |
-
|
113 |
-
# لود کردن همه صفحات
|
114 |
pages = []
|
115 |
for l in loader:
|
116 |
pages.extend(l.load())
|
117 |
|
118 |
-
#
|
119 |
-
|
120 |
-
|
121 |
-
chunk_size=2000,
|
122 |
chunk_overlap=25
|
123 |
)
|
124 |
-
|
|
|
125 |
for page in pages:
|
126 |
text = page.page_content
|
127 |
-
if len(text) >
|
128 |
-
|
129 |
-
chunks = splitter.split_text(text)
|
130 |
-
for chunk in chunks:
|
131 |
-
# ساختن شی جدید برای هر چانک
|
132 |
-
processed_pages.append(type(page)(page_content=chunk))
|
133 |
else:
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
|
|
137 |
embeddings = TogetherEmbeddings(
|
138 |
model_name="togethercomputer/m2-bert-80M-32k-retrieval",
|
139 |
api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
|
140 |
)
|
141 |
|
142 |
-
# ساختن
|
143 |
return VectorstoreIndexCreator(
|
144 |
embedding=embeddings,
|
145 |
text_splitter=RecursiveCharacterTextSplitter(
|
146 |
chunk_size=2000,
|
147 |
-
chunk_overlap=
|
148 |
)
|
149 |
-
).from_documents(
|
150 |
|
151 |
-
# فراخوانی
|
152 |
index = get_pdf_index()
|
153 |
|
154 |
llm = ChatOpenAI(
|
|
|
@st.cache_resource
def get_pdf_index():
    """Build (and cache) the vector index for ``test1.pdf``.

    Loads every page of the PDF, chunks the text in two passes, embeds the
    chunks with the Together m2-bert retrieval model, and returns the
    resulting ``VectorstoreIndexCreator`` index. Cached with
    ``st.cache_resource`` so the PDF is processed only once per server
    process.

    Returns:
        The index object produced by ``VectorstoreIndexCreator.from_documents``.
    """
    with st.spinner('📄 در حال پردازش فایل PDF...'):
        # Load the PDF file (list form kept so more PDFs can be appended later).
        loaders = [PyPDFLoader('test1.pdf')]

        pages = []
        for pdf_loader in loaders:
            pages.extend(pdf_loader.load())

        # First pass: split pages into small ~124-char chunks (overlap 25).
        splitter_initial = RecursiveCharacterTextSplitter(
            chunk_size=124,
            chunk_overlap=25
        )

        small_chunks = []
        for page in pages:
            text = page.page_content
            if len(text) > 124:
                small_chunks.extend(splitter_initial.split_text(text))
            else:
                small_chunks.append(text)

        # Second pass (defensive): re-split anything longer than 2000 chars.
        # NOTE(review): the first pass already caps chunks near 124 chars, so
        # this branch should never fire in practice; kept as a cheap guard in
        # case the initial splitter is relaxed later.
        final_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=200
        )

        final_chunks = []
        for chunk in small_chunks:
            if len(chunk) > 2000:
                final_chunks.extend(final_splitter.split_text(chunk))
            else:
                final_chunks.append(chunk)

        # Wrap the raw chunk strings as LangChain documents for the index.
        from langchain.schema import Document as LangchainDocument
        documents = [LangchainDocument(page_content=text) for text in final_chunks]

        # Build the embedding client.
        # SECURITY: a live Together API key was committed here. It must be
        # rotated immediately. Prefer TOGETHER_API_KEY in the environment (or
        # st.secrets); the literal below is only a backward-compatible
        # fallback until the key is revoked.
        import os
        embeddings = TogetherEmbeddings(
            model_name="togethercomputer/m2-bert-80M-32k-retrieval",
            api_key=os.environ.get(
                "TOGETHER_API_KEY",
                "0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979",
            )
        )

        # Build and return the vector index over the prepared documents.
        return VectorstoreIndexCreator(
            embedding=embeddings,
            text_splitter=RecursiveCharacterTextSplitter(
                chunk_size=2000,
                chunk_overlap=200
            )
        ).from_documents(documents)
161 |
|
|
|
162 |
index = get_pdf_index()
|
163 |
|
164 |
llm = ChatOpenAI(
|