M17idd commited on
Commit
e3f5de5
·
verified ·
1 Parent(s): d5531f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -37
app.py CHANGED
@@ -13,6 +13,8 @@ from typing import List
13
  from pydantic import Field
14
  from sentence_transformers import SentenceTransformer
15
  import numpy as np
 
 
16
  import faiss
17
 
18
  # ----------------- تنظیمات صفحه -----------------
@@ -99,47 +101,22 @@ st.markdown("""
99
  # ----------------- لود PDF و ساخت ایندکس -----------------
100
 
101
  @st.cache_resource
102
- def build_pdf_index():
103
- with st.spinner('📄 در حال پردازش فایل ...'):
104
- loader = PyPDFLoader("test1.pdf")
105
- pages = loader.load()
106
-
107
- splitter = RecursiveCharacterTextSplitter(
108
- chunk_size=128,
109
- chunk_overlap=50
110
- )
111
-
112
- texts = []
113
- for page in pages:
114
- texts.extend(splitter.split_text(page.page_content))
115
-
116
- documents = [LangchainDocument(page_content=t) for t in texts]
117
-
118
- sentence_model = SentenceTransformer("togethercomputer/m2-bert-80M-8k-retrieval", trust_remote_code=True)
119
-
120
- progress_bar = st.progress(0)
121
- total_docs = len(documents)
122
-
123
- texts_to_encode = [doc.page_content for doc in documents]
124
-
125
- batch_size = 128
126
- embeddings = []
127
- for i in range(0, total_docs, batch_size):
128
- batch_texts = texts_to_encode[i:i+batch_size]
129
- batch_embeddings = sentence_model.encode(batch_texts, convert_to_numpy=True)
130
- embeddings.extend(batch_embeddings)
131
 
132
- progress_bar.progress(min((i + batch_size) / total_docs, 1.0))
 
133
 
134
-
135
- time.sleep(5)
136
- progress_bar.empty()
137
- embeddings = np.array(embeddings)
138
 
139
- index = faiss.IndexFlatL2(embeddings.shape[1])
140
- index.add(embeddings)
 
 
141
 
142
- return documents, embeddings, index
143
 
144
  # ----------------- تعریف LLM از Groq -----------------
145
  llm = ChatOpenAI(
 
13
  from pydantic import Field
14
  from sentence_transformers import SentenceTransformer
15
  import numpy as np
16
+ from langchain.vectorstores import VectorstoreIndexCreator
17
+ from sentence_transformers import SentenceTransformer
18
  import faiss
19
 
20
  # ----------------- تنظیمات صفحه -----------------
 
101
  # ----------------- لود PDF و ساخت ایندکس -----------------
102
 
103
  @st.cache_resource
104
+ @st.cache_resource
105
+ def get_pdf_index():
106
+ with st.spinner('📄 در حال پردازش فایل PDF...'):
107
+ loader = [PyPDFLoader('test1.pdf')]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
+ model_name = "togethercomputer/m2-bert-80M-8k-retrieval"
110
+ model = SentenceTransformer(model_name)
111
 
112
+ embeddings = model.encode
 
 
 
113
 
114
+ index_creator = VectorstoreIndexCreator(
115
+ embedding=embeddings,
116
+ text_splitter=RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0)
117
+ )
118
 
119
+ return index_creator.from_loaders(loader)
120
 
121
  # ----------------- تعریف LLM از Groq -----------------
122
  llm = ChatOpenAI(