Update app.py
Browse files
app.py
CHANGED
@@ -102,14 +102,19 @@ st.markdown("""
|
|
102 |
""", unsafe_allow_html=True)
|
103 |
|
104 |
|
105 |
-
class
|
106 |
-
def __init__(self, model_name: str
|
107 |
-
self.
|
108 |
-
self.
|
109 |
|
110 |
def embed_documents(self, texts: List[str]) -> List[List[float]]:
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
def embed_query(self, text: str) -> List[float]:
|
115 |
return self.embed_documents([text])[0]
|
@@ -118,25 +123,21 @@ class TogetherEmbeddings(Embeddings):
|
|
118 |
def get_pdf_index():
|
119 |
with st.spinner('📄 در حال پردازش فایل PDF...'):
|
120 |
loader = PyPDFLoader('test1.pdf')
|
121 |
-
pages = loader.load()
|
122 |
full_text = "\n".join([page.page_content for page in pages])
|
123 |
text_splitter = RecursiveCharacterTextSplitter(
|
124 |
-
chunk_size=
|
125 |
-
chunk_overlap=
|
126 |
)
|
127 |
texts = text_splitter.split_text(full_text)
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
)
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
|
|
140 |
|
141 |
index = get_pdf_index()
|
142 |
|
@@ -191,4 +192,3 @@ if st.session_state.pending_prompt:
|
|
191 |
placeholder.markdown(full_response)
|
192 |
st.session_state.messages.append({'role': 'ai', 'content': full_response})
|
193 |
st.session_state.pending_prompt = None
|
194 |
-
|
|
|
102 |
""", unsafe_allow_html=True)
|
103 |
|
104 |
|
105 |
+
class HuggingFaceEmbeddings(Embeddings):
    """Embeddings adapter backed by a Hugging Face tokenizer/model pair.

    Each text is tokenized and run through the model on its own; the
    embedding is the model's ``last_hidden_state`` averaged over dim 1.
    """

    def __init__(self, model_name: str):
        """Load the tokenizer and the model identified by *model_name*."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def _embed_one(self, text: str) -> List[float]:
        """Embed a single text and return the pooled vector as a list."""
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
        )
        # Inference only: no autograd graph is needed for embeddings.
        with torch.no_grad():
            result = self.model(**encoded)
            # dim=1 is presumably the token axis of a (1, seq, hidden)
            # tensor -- TODO confirm; squeeze() then drops size-1 dims.
            pooled = result.last_hidden_state.mean(dim=1)
            vector = pooled.squeeze().tolist()
        return vector

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding per entry of *texts*, in input order."""
        return [self._embed_one(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string via :meth:`embed_documents`."""
        return self.embed_documents([text])[0]
|
|
|
123 |
def get_pdf_index():
    """Build and return the vector index over the text of ``test1.pdf``.

    The PDF is loaded, the text of all pages is joined with newlines,
    split into chunks of 1024 characters with an overlap of 128, and the
    chunks are handed to ``VectorstoreIndexCreator`` together with a
    ``HuggingFaceEmbeddings`` instance using ``bert-base-uncased``.

    Returns:
        Whatever ``VectorstoreIndexCreator(...).from_texts`` returns.
    """
    with st.spinner('📄 در حال پردازش فایل PDF...'):
        loader = PyPDFLoader('test1.pdf')
        # The pages must be loaded before they are joined below; without
        # this line ``pages`` is undefined and the function raises
        # NameError on first call.
        pages = loader.load()
        full_text = "\n".join(page.page_content for page in pages)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1024,
            chunk_overlap=128
        )
        texts = text_splitter.split_text(full_text)

        embeddings = HuggingFaceEmbeddings(
            model_name="bert-base-uncased"
        )

        # NOTE(review): confirm that the installed LangChain version
        # exposes ``from_texts`` on VectorstoreIndexCreator; some releases
        # only provide ``from_loaders`` / ``from_documents``.
        return VectorstoreIndexCreator(
            embedding=embeddings,
            text_splitter=text_splitter
        ).from_texts(texts)
|
141 |
|
142 |
index = get_pdf_index()
|
143 |
|
|
|
192 |
placeholder.markdown(full_response)
|
193 |
st.session_state.messages.append({'role': 'ai', 'content': full_response})
|
194 |
st.session_state.pending_prompt = None
|
|