M17idd committed (verified)
Commit 2b9cbaf · Parent: b5be236

Update app.py

Files changed (1): app.py (+17 −8)
app.py CHANGED
@@ -127,26 +127,35 @@ class HuggingFaceEmbeddings(Embeddings):
 @st.cache_resource
 def get_pdf_index():
     with st.spinner('📄 در حال پردازش فایل PDF...'):
-        # Load the PDF file
+        # Load the PDF file with chunks
         loader = PyPDFLoader('test1.pdf')
         pages = loader.load()
-
-        # Extract text from each page
-        full_text = "\n".join([page.page_content for page in pages])
-
+
+        # Instead of loading all pages, process them in batches
+        batch_size = 5  # Processing 5 pages at a time
+        all_texts = []
+
+        for i in range(0, len(pages), batch_size):
+            batch = pages[i:i + batch_size]
+            batch_text = "\n".join([page.page_content for page in batch])
+            all_texts.append(batch_text)
+
+        # Combine all texts for further processing
+        full_text = "\n".join(all_texts)
+
         # Split the text into chunks
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=1024,   # Chunk size
             chunk_overlap=128  # Overlap between chunks
         )
         texts = text_splitter.split_text(full_text)
-
+
         # Create embeddings
         embeddings = HuggingFaceEmbeddings(model_name="FacebookAI/xlm-roberta-large")
-
+
         # Create FAISS vector store
         vector_store = FAISS.from_texts(texts, embeddings)
-
+
         return vector_store
 index = get_pdf_index()
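For context, the spinner message "در حال پردازش فایل PDF..." is Persian for "Processing the PDF file...". Below is a minimal, standalone sketch of the batched extraction this commit introduces, without the Streamlit wrapper. The import paths assume a recent LangChain split (langchain-community / langchain-text-splitters) and may differ on older versions; test1.pdf, the batch size of 5, and the splitter settings are taken from the diff above.

# Sketch only: reproduces the commit's batched page extraction outside Streamlit.
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def extract_full_text(pdf_path: str, batch_size: int = 5) -> str:
    """Join page text in fixed-size batches, then merge the batches."""
    pages = PyPDFLoader(pdf_path).load()  # one Document per page
    all_texts = []
    for i in range(0, len(pages), batch_size):
        batch = pages[i:i + batch_size]
        all_texts.append("\n".join(page.page_content for page in batch))
    return "\n".join(all_texts)

full_text = extract_full_text('test1.pdf')

# Same splitter settings as the diff: 1024-character chunks with 128 overlap.
splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
texts = splitter.split_text(full_text)

Note that because the batches are merged with the same "\n" separator used within each batch, the resulting full_text is identical to joining all pages at once; the batching only bounds the size of the intermediate strings. Once the vector store is built, a standard LangChain lookup such as index.similarity_search(query, k=4) returns the chunks nearest the query.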