M17idd committed on
Commit
6f50fa8
·
verified ·
1 Parent(s): 658735d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -21
app.py CHANGED
@@ -102,14 +102,19 @@ st.markdown("""
102
  """, unsafe_allow_html=True)
103
 
104
 
105
class TogetherEmbeddings(Embeddings):
    """Embeddings adapter that delegates to the Together embeddings endpoint.

    Instance attributes (set in ``__init__``):
        model_name: identifier sent as ``model`` on every embeddings request.
        client: ``Together`` client built from the supplied API key.
    """

    def __init__(self, model_name: str, api_key: str):
        """Remember the model identifier and build the Together client."""
        self.model_name = model_name
        self.client = Together(api_key=api_key)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` in one request.

        Returns the ``embedding`` attribute of every item in the response's
        ``data`` sequence, in the order the response lists them.
        """
        reply = self.client.embeddings.create(
            input=texts,
            model=self.model_name,
        )
        vectors = [entry.embedding for entry in reply.data]
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Embed a single string by sending it as a one-element batch."""
        single_batch = [text]
        return self.embed_documents(single_batch)[0]
@@ -118,25 +123,21 @@ class TogetherEmbeddings(Embeddings):
118
  def get_pdf_index():
119
  with st.spinner('📄 در حال پردازش فایل PDF...'):
120
  loader = PyPDFLoader('test1.pdf')
121
- pages = loader.load()
122
  full_text = "\n".join([page.page_content for page in pages])
123
  text_splitter = RecursiveCharacterTextSplitter(
124
- chunk_size=2048,
125
- chunk_overlap=256
126
  )
127
  texts = text_splitter.split_text(full_text)
128
- embeddings = TogetherEmbeddings(
129
- model_name="togethercomputer/m2-bert-80M-8k-retrieval",
130
- api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
131
-
132
-
133
-
134
-
135
  )
136
-
137
- vectorstore = FAISS.from_texts(texts, embeddings)
138
-
139
- return vectorstore
 
140
 
141
  index = get_pdf_index()
142
 
@@ -191,4 +192,3 @@ if st.session_state.pending_prompt:
191
  placeholder.markdown(full_response)
192
  st.session_state.messages.append({'role': 'ai', 'content': full_response})
193
  st.session_state.pending_prompt = None
194
-
 
102
  """, unsafe_allow_html=True)
103
 
104
 
105
class HuggingFaceEmbeddings(Embeddings):
    """Embeddings adapter that runs a Hugging Face model in-process.

    Every text is tokenized on its own, passed through the model with gradient
    tracking disabled, and reduced to one vector by averaging
    ``last_hidden_state`` over ``dim=1`` (presumably the token axis).
    """

    def __init__(self, model_name: str):
        """Load the tokenizer and the model registered under ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def _embed_one(self, text: str) -> List[float]:
        """Return the mean-pooled hidden state of one text as a plain list."""
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
        )
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            result = self.model(**encoded)
        pooled = result.last_hidden_state.mean(dim=1)
        return pooled.squeeze().tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each entry of ``texts`` independently, preserving order."""
        return [self._embed_one(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Embed a single string by treating it as a one-element batch."""
        single_batch = [text]
        return self.embed_documents(single_batch)[0]
 
123
  def get_pdf_index():
124
  with st.spinner('📄 در حال پردازش فایل PDF...'):
125
  loader = PyPDFLoader('test1.pdf')
 
126
  full_text = "\n".join([page.page_content for page in pages])
127
  text_splitter = RecursiveCharacterTextSplitter(
128
+ chunk_size=1024,
129
+ chunk_overlap=128
130
  )
131
  texts = text_splitter.split_text(full_text)
132
+
133
+ embeddings = HuggingFaceEmbeddings(
134
+ model_name="bert-base-uncased"
 
 
 
 
135
  )
136
+
137
+ return VectorstoreIndexCreator(
138
+ embedding=embeddings,
139
+ text_splitter=text_splitter
140
+ ).from_texts(texts)
141
 
142
  index = get_pdf_index()
143
 
 
192
  placeholder.markdown(full_response)
193
  st.session_state.messages.append({'role': 'ai', 'content': full_response})
194
  st.session_state.pending_prompt = None