Update app.py
app.py (changed)
@@ -105,17 +105,18 @@ st.markdown("""
 @st.cache_resource
 def get_pdf_index():
     with st.spinner('📄 در حال پردازش فایل PDF...'):
-
+        # Load the PDF
+        loader = PyPDFLoader('test1.pdf')
+        documents = loader.load_and_split()  # extract the PDF text here

+        # Use the SentenceTransformer model
         model_name = "togethercomputer/m2-bert-80M-8k-retrieval"  # model name
-        model = SentenceTransformer(model_name, trust_remote_code=True)
-
-
+        model = SentenceTransformer(model_name, trust_remote_code=True)

         splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0)
         texts = []
-        for doc in
-        texts.extend(splitter.split_text(doc
+        for doc in documents:
+            texts.extend(splitter.split_text(doc['text']))  # split the text into smaller chunks

         progress_bar = st.progress(0)
         total_docs = len(texts)
@@ -123,7 +124,7 @@ def get_pdf_index():
         embeddings = []
         batch_size = 128
         for i in range(0, total_docs, batch_size):
-            batch_texts = texts[i:i+batch_size]
+            batch_texts = texts[i:i + batch_size]
             batch_embeddings = model.encode(batch_texts, convert_to_numpy=True)
             embeddings.extend(batch_embeddings)

@@ -133,13 +134,11 @@ def get_pdf_index():
         progress_bar.empty()

         embeddings = np.array(embeddings)
-        index = faiss.IndexFlatL2(embeddings.shape[1])
+        index = faiss.IndexFlatL2(embeddings.shape[1])  # create the index using faiss
         index.add(embeddings)

-
-
-            text_splitter=splitter
-        ).from_loaders(loader)
+        # using VectorstoreIndexCreator
+        return documents, embeddings, index  # return the documents and the index


 # ----------------- Define the LLM from Groq -----------------
@@ -152,11 +151,11 @@ llm = ChatOpenAI(

 # ----------------- Define SimpleRetriever -----------------
 class SimpleRetriever(BaseRetriever):
-    documents: List[
+    documents: List[dict] = Field(...)  # document type changed to dict
     embeddings: List[np.ndarray] = Field(...)
     index: faiss.Index

-    def _get_relevant_documents(self, query: str) -> List[
+    def _get_relevant_documents(self, query: str) -> List[dict]:
         sentence_model = SentenceTransformer("togethercomputer/m2-bert-80M-8k-retrieval", trust_remote_code=True)
         query_embedding = sentence_model.encode(query, convert_to_numpy=True)

@@ -170,6 +169,7 @@ class SimpleRetriever(BaseRetriever):
 documents, embeddings, index = get_pdf_index()
 retriever = SimpleRetriever(documents=documents, embeddings=embeddings, index=index)

+
 # ----------------- Build the Chain -----------------
 chain = RetrievalQA.from_chain_type(
     llm=llm,
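For reference, below is a minimal sketch of how get_pdf_index() reads after this change. It assumes a typical LangChain setup in which PyPDFLoader.load_and_split() returns Document objects, so the chunking loop reads doc.page_content (subscripting a Document as doc['text'], as in the committed line, would normally raise an error). It also returns the chunk list rather than the page-level documents, so the rows of the FAISS index line up with the texts the retriever hands back; treat that choice, the import paths, and the progress update as assumptions about intent, not as what the commit does.

import faiss
import numpy as np
import streamlit as st
from langchain_community.document_loaders import PyPDFLoader  # import path depends on the installed LangChain version
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

@st.cache_resource
def get_pdf_index():
    with st.spinner('📄 در حال پردازش فایل PDF...'):
        # Load the PDF and split it into page-level Documents
        loader = PyPDFLoader('test1.pdf')
        documents = loader.load_and_split()

        # Embedding model used for both indexing and querying
        model_name = "togethercomputer/m2-bert-80M-8k-retrieval"
        model = SentenceTransformer(model_name, trust_remote_code=True)

        # Chunk each page; Document text lives on .page_content
        # (assumption: the committed doc['text'] would fail on a Document object)
        splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0)
        texts = []
        for doc in documents:
            texts.extend(splitter.split_text(doc.page_content))

        # Embed the chunks in batches, reporting progress in the Streamlit UI
        progress_bar = st.progress(0)
        total_docs = len(texts)
        embeddings = []
        batch_size = 128
        for i in range(0, total_docs, batch_size):
            batch_texts = texts[i:i + batch_size]
            embeddings.extend(model.encode(batch_texts, convert_to_numpy=True))
            progress_bar.progress(min((i + batch_size) / max(total_docs, 1), 1.0))
        progress_bar.empty()

        # Exact L2 index over the chunk embeddings
        embeddings = np.array(embeddings)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)

        # Return the chunks (not the page Documents) so index rows and texts align
        return texts, embeddings, index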
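The SimpleRetriever hunk stops right after the query embedding is computed, so the FAISS lookup in the lines that follow is not visible here. The sketch below shows one common way that search is completed: a top-k query against the IndexFlatL2 built above, with hits mapped back to the stored chunks and wrapped as LangChain Document objects (BaseRetriever implementations are generally expected to return Documents rather than dicts, and newer LangChain versions also pass a run_manager keyword). The k=4 value, the module-level encoder (instead of re-instantiating the SentenceTransformer on every call, as the committed code does), and the pydantic Config are assumptions, not lines from this commit.

from typing import List

import faiss
import numpy as np
from langchain.schema import BaseRetriever, Document  # import path depends on the LangChain version
from pydantic import Field
from sentence_transformers import SentenceTransformer

# Loading the encoder once avoids re-initialising it on every query.
_ENCODER = SentenceTransformer("togethercomputer/m2-bert-80M-8k-retrieval",
                               trust_remote_code=True)

class SimpleRetriever(BaseRetriever):
    documents: List[str] = Field(...)          # chunk texts, aligned with the index rows
    embeddings: List[np.ndarray] = Field(...)
    index: faiss.Index

    class Config:
        arbitrary_types_allowed = True         # faiss.Index is not a pydantic-native type

    def _get_relevant_documents(self, query: str) -> List[Document]:
        # Embed the query and search the flat L2 index for the closest chunks
        query_embedding = _ENCODER.encode(query, convert_to_numpy=True)
        query_embedding = np.asarray([query_embedding], dtype="float32")  # shape (1, dim)
        distances, indices = self.index.search(query_embedding, 4)
        # Map hit positions back to the stored chunk texts (-1 means "no hit")
        return [Document(page_content=self.documents[i]) for i in indices[0] if i != -1]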
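The final hunk cuts off after the first argument of RetrievalQA.from_chain_type. A hypothetical completion of that wiring is sketched below; chain_type="stuff", the text_input prompt, and the chain.run call are illustrative assumptions, since only llm=llm and the retriever construction appear in the diff.

documents, embeddings, index = get_pdf_index()
retriever = SimpleRetriever(documents=documents, embeddings=embeddings, index=index)

# ----------------- Build the Chain -----------------
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",      # concatenate the retrieved chunks into a single prompt
    retriever=retriever,
)

question = st.text_input("Ask a question about the PDF:")
if question:
    st.write(chain.run(question))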