M17idd committed on
Commit b877134 · verified · 1 Parent(s): 9923628

Update app.py

Files changed (1)
  1. app.py +7 -16
app.py CHANGED
@@ -11,11 +11,7 @@ from langchain.chat_models import ChatOpenAI
 from typing import List
 from together import Together
 
-import streamlit as st
-import tiktoken
-from langchain.document_loaders import PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import VectorstoreIndexCreator
+
 from langchain.embeddings import TogetherEmbeddings
 from langchain.schema import Document as LangchainDocument
 
@@ -140,16 +136,14 @@ def get_pdf_index():
         else:
             small_chunks.append(text)
 
-    # now the real check, based on token count
     final_chunks = []
-    max_tokens = 2000  # maximum tokens allowed for Together
+    max_tokens = 2000
 
     for chunk in small_chunks:
         token_count = count_tokens(chunk, model_name="gpt-3.5-turbo")
         if token_count > max_tokens:
-            # if the chunk is too big, split it up
             splitter_token_safe = RecursiveCharacterTextSplitter(
-                chunk_size=1000,  # or anything that keeps the token count under 2000
+                chunk_size=1000,
                 chunk_overlap=100
             )
             smaller_chunks = splitter_token_safe.split_text(chunk)
@@ -164,13 +158,10 @@ def get_pdf_index():
         api_key="0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979"
     )
 
-    return VectorstoreIndexCreator(
-        embedding=embeddings,
-        text_splitter=RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=100
-        )
-    ).from_documents(documents)
+    # now we build the vectorstore directly with FAISS
+    vectordb = FAISS.from_documents(documents, embedding=embeddings)
+
+    return vectordb
 
 index = get_pdf_index()
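The token-count guard in the second hunk calls a count_tokens helper that this commit doesn't touch, so its definition isn't visible here. A minimal sketch of what such a helper could look like, assuming it wraps tiktoken (note the commit removes the top-level `import tiktoken`, so the helper would have to import it itself or live in another module):

    import tiktoken

    def count_tokens(text: str, model_name: str = "gpt-3.5-turbo") -> int:
        # Hypothetical helper, not shown in the diff. encoding_for_model
        # resolves the tokenizer for the given model (cl100k_base for
        # gpt-3.5-turbo); the length of the encoded ids is the token count
        # the 2000-token guard compares against.
        encoding = tiktoken.encoding_for_model(model_name)
        return len(encoding.encode(text))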
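The last hunk swaps VectorstoreIndexCreator for a FAISS vectorstore built directly from the documents. A minimal sketch of the resulting flow, assuming the file gains a `from langchain.vectorstores import FAISS` import (not shown in this diff) and that `documents` wraps the token-safe chunks; the placeholder key is illustrative:

    from langchain.vectorstores import FAISS
    from langchain.embeddings import TogetherEmbeddings
    from langchain.schema import Document as LangchainDocument

    embeddings = TogetherEmbeddings(
        api_key="YOUR_TOGETHER_API_KEY"  # placeholder; prefer an env var to a hardcoded key
    )

    # Wrap each token-safe chunk in a Document, then embed and index them all.
    documents = [LangchainDocument(page_content=chunk) for chunk in final_chunks]
    vectordb = FAISS.from_documents(documents, embedding=embeddings)

    # A raw vectorstore is queried with similarity_search, unlike the
    # .query(...) interface VectorstoreIndexCreator used to expose.
    results = vectordb.similarity_search("example query", k=4)

One behavioral difference worth noting: VectorstoreIndexCreator re-split the input with its own RecursiveCharacterTextSplitter before embedding, while FAISS.from_documents indexes the chunks exactly as passed in, so the token-safe splitting above now fully determines chunk size.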