Update app.py
Browse files
app.py
CHANGED
@@ -2,13 +2,10 @@ import os
|
|
2 |
import time
|
3 |
import streamlit as st
|
4 |
from langchain.chat_models import ChatOpenAI
|
5 |
-
|
6 |
-
from transformers import AutoTokenizer, AutoModel
|
7 |
from langchain.document_loaders import PyPDFLoader
|
8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
from langchain.schema import Document as LangchainDocument
|
10 |
from langchain.chains import RetrievalQA
|
11 |
-
from langchain.llms import OpenAI
|
12 |
import torch
|
13 |
from langchain_core.retrievers import BaseRetriever
|
14 |
from langchain_core.documents import Document
|
@@ -158,24 +155,21 @@ llm = ChatOpenAI(
|
|
158 |
# ----------------- تعریف SimpleRetriever -----------------
|
159 |
class SimpleRetriever(BaseRetriever):
|
160 |
documents: List[Document] = Field(...)
|
161 |
-
embeddings: List = Field(...)
|
162 |
|
163 |
def _get_relevant_documents(self, query: str) -> List[Document]:
|
164 |
-
#
|
165 |
sentence_model = SentenceTransformer("aubmindlab/bert-base-arabert")
|
166 |
query_embedding = sentence_model.encode(query, convert_to_numpy=True)
|
167 |
|
168 |
-
|
169 |
-
|
170 |
-
similarity = (query_embedding * doc_embedding).sum()
|
171 |
-
similarities.append(similarity)
|
172 |
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
return [doc for _, doc in ranked_docs[:5]]
|
179 |
|
180 |
# ----------------- ساخت Index -----------------
|
181 |
documents, embeddings = build_pdf_index()
|
|
|
2 |
import time
|
3 |
import streamlit as st
|
4 |
from langchain.chat_models import ChatOpenAI
|
|
|
|
|
5 |
from langchain.document_loaders import PyPDFLoader
|
6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
7 |
from langchain.schema import Document as LangchainDocument
|
8 |
from langchain.chains import RetrievalQA
|
|
|
9 |
import torch
|
10 |
from langchain_core.retrievers import BaseRetriever
|
11 |
from langchain_core.documents import Document
|
|
|
155 |
# ----------------- تعریف SimpleRetriever -----------------
|
156 |
# Sentence-transformers checkpoint used to embed incoming queries.
_QUERY_ENCODER_NAME = "aubmindlab/bert-base-arabert"

# Cache of loaded encoders keyed by checkpoint name, so the (expensive) model
# load happens at most once per name instead of once per query.
# NOTE(review): if the hosting framework re-executes this script per
# interaction, module globals may be reset each run -- confirm, and consider a
# framework-level resource cache if so.
_QUERY_ENCODER_CACHE = {}


def _get_query_encoder(name=_QUERY_ENCODER_NAME):
    """Return the cached SentenceTransformer for *name*, loading it on first use."""
    encoder = _QUERY_ENCODER_CACHE.get(name)
    if encoder is None:
        encoder = SentenceTransformer(name)
        _QUERY_ENCODER_CACHE[name] = encoder
    return encoder


class SimpleRetriever(BaseRetriever):
    """Rank pre-embedded documents by dot product with the query embedding.

    Attributes:
        documents: Candidate documents to retrieve from.
        embeddings: One vector per document; ``embeddings[i]`` is assumed to
            belong to ``documents[i]`` (TODO confirm against the index builder).
        top_k: Maximum number of documents to return (default 5, matching the
            previously hard-coded limit).
    """

    documents: List[Document] = Field(...)
    embeddings: List[np.ndarray] = Field(...)
    top_k: int = 5

    def _get_relevant_documents(self, query: str) -> List[Document]:
        """Return up to ``top_k`` documents, highest similarity first.

        Args:
            query: Free-text query to embed and compare against the index.

        Returns:
            The best-scoring documents in descending score order; an empty
            list when the index is empty or ``top_k`` is not positive.
        """
        # An empty index would make np.dot fail on mismatched shapes, so
        # short-circuit instead of raising.
        if self.top_k <= 0 or not self.documents or len(self.embeddings) == 0:
            return []

        # Convert the query to a vector with the shared (cached) encoder.
        query_embedding = _get_query_encoder().encode(query, convert_to_numpy=True)

        # Compute the similarity of the query against every document at once.
        # NOTE(review): this is a raw dot product, not cosine similarity; it
        # is only scale-independent if the stored embeddings are normalized.
        similarities = np.dot(np.asarray(self.embeddings), query_embedding)

        # Order document indices from most to least similar.
        ranked_indices = np.argsort(similarities)[::-1]

        # Return the top-ranked documents.
        return [self.documents[i] for i in ranked_indices[: self.top_k]]
|
|
|
173 |
|
174 |
# ----------------- ساخت Index -----------------
|
175 |
# Build the index at module load; build_pdf_index() is defined elsewhere in
# this file and its result is unpacked into two names.
# NOTE(review): presumably the document chunks and their matching embedding
# vectors, in the same order -- confirm against build_pdf_index.
documents, embeddings = build_pdf_index()
|