M17idd committed on
Commit
4dfc654
·
verified ·
1 Parent(s): 0628114

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -14
app.py CHANGED
@@ -105,17 +105,18 @@ st.markdown("""
105
  @st.cache_resource
106
  def get_pdf_index():
107
  with st.spinner('📄 در حال پردازش فایل PDF...'):
108
- loader = [PyPDFLoader('test1.pdf')]
 
 
109
 
 
110
  model_name = "togethercomputer/m2-bert-80M-8k-retrieval" # نام مدل
111
- model = SentenceTransformer(model_name, trust_remote_code=True) # استفاده از توکن
112
-
113
-
114
 
115
  splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0)
116
  texts = []
117
- for doc in loader:
118
- texts.extend(splitter.split_text(doc.page_content))
119
 
120
  progress_bar = st.progress(0)
121
  total_docs = len(texts)
@@ -123,7 +124,7 @@ def get_pdf_index():
123
  embeddings = []
124
  batch_size = 128
125
  for i in range(0, total_docs, batch_size):
126
- batch_texts = texts[i:i+batch_size]
127
  batch_embeddings = model.encode(batch_texts, convert_to_numpy=True)
128
  embeddings.extend(batch_embeddings)
129
 
@@ -133,13 +134,11 @@ def get_pdf_index():
133
  progress_bar.empty()
134
 
135
  embeddings = np.array(embeddings)
136
- index = faiss.IndexFlatL2(embeddings.shape[1])
137
  index.add(embeddings)
138
 
139
- return VectorstoreIndexCreator(
140
- embedding=model.encode,
141
- text_splitter=splitter
142
- ).from_loaders(loader)
143
 
144
 
145
  # ----------------- تعریف LLM از Groq -----------------
@@ -152,11 +151,11 @@ llm = ChatOpenAI(
152
 
153
  # ----------------- تعریف SimpleRetriever -----------------
154
  class SimpleRetriever(BaseRetriever):
155
- documents: List[Document] = Field(...)
156
  embeddings: List[np.ndarray] = Field(...)
157
  index: faiss.Index
158
 
159
- def _get_relevant_documents(self, query: str) -> List[Document]:
160
  sentence_model = SentenceTransformer("togethercomputer/m2-bert-80M-8k-retrieval", trust_remote_code=True)
161
  query_embedding = sentence_model.encode(query, convert_to_numpy=True)
162
 
@@ -170,6 +169,7 @@ class SimpleRetriever(BaseRetriever):
170
  documents, embeddings, index = get_pdf_index()
171
  retriever = SimpleRetriever(documents=documents, embeddings=embeddings, index=index)
172
 
 
173
  # ----------------- ساخت Chain -----------------
174
  chain = RetrievalQA.from_chain_type(
175
  llm=llm,
 
105
  @st.cache_resource
106
  def get_pdf_index():
107
  with st.spinner('📄 در حال پردازش فایل PDF...'):
108
+ # بارگذاری PDF
109
+ loader = PyPDFLoader('test1.pdf')
110
+ documents = loader.load_and_split() # اینجا متن PDF را استخراج می‌کنیم
111
 
112
+ # استفاده از مدل SentenceTransformer
113
  model_name = "togethercomputer/m2-bert-80M-8k-retrieval" # نام مدل
114
+ model = SentenceTransformer(model_name, trust_remote_code=True)
 
 
115
 
116
  splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0)
117
  texts = []
118
+ for doc in documents:
119
+ texts.extend(splitter.split_text(doc['text'])) # تقسیم متن به تکه‌های کوچک‌تر
120
 
121
  progress_bar = st.progress(0)
122
  total_docs = len(texts)
 
124
  embeddings = []
125
  batch_size = 128
126
  for i in range(0, total_docs, batch_size):
127
+ batch_texts = texts[i:i + batch_size]
128
  batch_embeddings = model.encode(batch_texts, convert_to_numpy=True)
129
  embeddings.extend(batch_embeddings)
130
 
 
134
  progress_bar.empty()
135
 
136
  embeddings = np.array(embeddings)
137
+ index = faiss.IndexFlatL2(embeddings.shape[1]) # ایجاد ایندکس با استفاده از faiss
138
  index.add(embeddings)
139
 
140
+ # Return the loaded documents, their embeddings and the FAISS index (VectorstoreIndexCreator is no longer used)
141
+ return documents, embeddings, index # بازگشت به اسناد و ایندکس
 
 
142
 
143
 
144
  # ----------------- تعریف LLM از Groq -----------------
 
151
 
152
  # ----------------- تعریف SimpleRetriever -----------------
153
  class SimpleRetriever(BaseRetriever):
154
+ documents: List[dict] = Field(...) # تغییر نوع مستند به dict
155
  embeddings: List[np.ndarray] = Field(...)
156
  index: faiss.Index
157
 
158
+ def _get_relevant_documents(self, query: str) -> List[dict]:
159
  sentence_model = SentenceTransformer("togethercomputer/m2-bert-80M-8k-retrieval", trust_remote_code=True)
160
  query_embedding = sentence_model.encode(query, convert_to_numpy=True)
161
 
 
169
  documents, embeddings, index = get_pdf_index()
170
  retriever = SimpleRetriever(documents=documents, embeddings=embeddings, index=index)
171
 
172
+
173
  # ----------------- ساخت Chain -----------------
174
  chain = RetrievalQA.from_chain_type(
175
  llm=llm,