Update app.py
Browse files
app.py
CHANGED
@@ -127,26 +127,35 @@ class HuggingFaceEmbeddings(Embeddings):
|
|
127 |
@st.cache_resource
def get_pdf_index():
    """Build and cache a FAISS vector index over the text of ``test1.pdf``.

    Loads the PDF, joins all page contents, splits the text into
    overlapping chunks, embeds each chunk with a multilingual
    XLM-RoBERTa model, and returns a FAISS vector store over the chunks.
    Cached by Streamlit so the index is built only once per session.

    Returns:
        FAISS: vector store containing the embedded text chunks.
    """
    with st.spinner('📄 در حال پردازش فایل PDF...'):
        # Load the PDF file
        loader = PyPDFLoader('test1.pdf')
        pages = loader.load()

        # BUG FIX: the original called split_text(full_text) but never
        # defined `full_text`, raising NameError on first run. Join the
        # loaded pages' text before splitting.
        full_text = "\n".join(page.page_content for page in pages)

        # Split the text into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1024,    # Chunk size
            chunk_overlap=128,  # Overlap between chunks
        )
        texts = text_splitter.split_text(full_text)

        # Create embeddings
        embeddings = HuggingFaceEmbeddings(model_name="FacebookAI/xlm-roberta-large")

        # Create FAISS vector store
        vector_store = FAISS.from_texts(texts, embeddings)

        return vector_store


index = get_pdf_index()
|
152 |
|
|
|
127 |
@st.cache_resource
def get_pdf_index():
    """Build and cache a FAISS vector index over the text of ``test1.pdf``.

    Loads the PDF, joins all page contents into one string, splits it
    into overlapping chunks, embeds each chunk with a multilingual
    XLM-RoBERTa model, and returns a FAISS vector store over the chunks.
    Cached by Streamlit so the index is built only once per session.

    Returns:
        FAISS: vector store containing the embedded text chunks.
    """
    with st.spinner('📄 در حال پردازش فایل PDF...'):
        # Load the PDF file
        loader = PyPDFLoader('test1.pdf')
        pages = loader.load()

        # NOTE(review): the previous "process pages in batches of 5" loop
        # saved no memory — loader.load() had already materialized every
        # page, `all_texts` kept every batch string alive, and joining
        # batch strings with "\n" is byte-identical to joining all pages
        # directly. Collapsed to a single pass with the same output.
        full_text = "\n".join(page.page_content for page in pages)

        # Split the text into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1024,    # Chunk size
            chunk_overlap=128,  # Overlap between chunks
        )
        texts = text_splitter.split_text(full_text)

        # Create embeddings
        embeddings = HuggingFaceEmbeddings(model_name="FacebookAI/xlm-roberta-large")

        # Create FAISS vector store
        vector_store = FAISS.from_texts(texts, embeddings)

        return vector_store


index = get_pdf_index()
|
161 |
|