M17idd committed on
Commit
324f762
·
verified ·
1 Parent(s): 99ed84f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -45
app.py CHANGED
@@ -6,67 +6,43 @@ from langchain_together import TogetherEmbeddings
6
  from langchain.vectorstores import FAISS
7
  from langchain.chat_models import ChatOpenAI
8
  from langchain.chains import RetrievalQA
 
9
 
10
# --- Embed documents in fixed-size batches (default: 50 per API call).
def batch_embed(texts, embeddings_model, batch_size=50):
    """Return one embedding vector per document in *texts*.

    The documents are sent to the embedding model in slices of
    *batch_size* so that each API request stays small.

    Args:
        texts: sequence of document objects exposing ``page_content``.
        embeddings_model: object with an ``embed_documents(list[str])`` method.
        batch_size: number of documents per embedding request.

    Returns:
        list: embedding vectors, in the same order as *texts*.
    """
    vectors = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        vectors.extend(
            embeddings_model.embed_documents([d.page_content for d in chunk])
        )
    return vectors
18
 
19
@st.cache_resource
def load_chunks_and_embeddings():
    """Load 'test1.pdf', split it into 300-character chunks, embed the
    chunks in batches of 50, and build a FAISS vector store incrementally,
    updating a Streamlit progress bar along the way.

    Cached with ``st.cache_resource`` so the PDF is processed only once
    per server session.

    Returns:
        FAISS vector store over all chunks, or None if the PDF yields
        no chunks.
    """
    import os

    pdf_loader = PyPDFLoader('test1.pdf')
    pages = pdf_loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0)
    docs = text_splitter.split_documents(pages)

    # SECURITY FIX: the API key was hard-coded (and therefore leaked in the
    # repository history — it must be revoked). Read it from the environment.
    embeddings = TogetherEmbeddings(api_key=os.environ["TOGETHER_API_KEY"])

    vectorstore = None  # created lazily from the first embedded batch

    # Progress bar over the embedding batches.
    progress = st.progress(0, text="🔄 در حال پردازش چانک‌ها...")
    total = len(docs)

    batch_size = 50
    for i in range(0, total, batch_size):
        batch_docs = docs[i:i + batch_size]
        batch_texts = [doc.page_content for doc in batch_docs]
        embs = embeddings.embed_documents(batch_texts)

        # BUG FIX: FAISS.from_embeddings / add_embeddings expect an iterable
        # of (text, embedding) pairs — the original passed the raw embedding
        # list and the Document objects positionally, which mismatches the
        # API (Documents ended up where the embedding function belongs).
        pairs = list(zip(batch_texts, embs))
        metadatas = [doc.metadata for doc in batch_docs]
        if vectorstore is None:
            vectorstore = FAISS.from_embeddings(pairs, embeddings, metadatas=metadatas)
        else:
            vectorstore.add_embeddings(pairs, metadatas=metadatas)

        progress.progress(min((i + batch_size) / total, 1.0))

    progress.empty()
    return vectorstore
50
-
51
# --- Prepare the vector database (cached, so this is fast on reruns).
with st.spinner("📚 در حال بارگذاری فایل و ساخت امبدینگ‌ها... لطفا صبور باشید"):
    vectorstore = load_chunks_and_embeddings()
54
-
55
# --- LLM setup: Together-hosted Llama 3 via the OpenAI-compatible endpoint.
import os

llm = ChatOpenAI(
    base_url="https://api.together.xyz/v1",
    # SECURITY FIX: the key was hard-coded in source (and therefore leaked —
    # it must be revoked). Read it from the environment instead.
    api_key=os.environ["TOGETHER_API_KEY"],
    model="meta-llama/Llama-3-70B-Instruct-Turbo-Free"
)

# Retrieve the 10 most similar chunks for each question.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 10})
63
 
64
# Question-answering chain: retrieved chunks are "stuffed" into a single
# prompt; callers pass the query under the 'question' key.
chain = RetrievalQA.from_chain_type(
    llm=llm, retriever=retriever, chain_type='stuff', input_key='question'
)
70
 
71
  # --- 💬 چت بات
72
  if 'messages' not in st.session_state:
 
6
  from langchain.vectorstores import FAISS
7
  from langchain.chat_models import ChatOpenAI
8
  from langchain.chains import RetrievalQA
9
+ from langchain.indexes import VectorstoreIndexCreator
10
 
 
 
 
 
 
 
 
 
11
 
12
@st.cache_resource
def load_chunks_and_embeddings():
    """Build a vector index over 'test1.pdf' with VectorstoreIndexCreator,
    showing a spinner and a staged progress bar.

    Cached with ``st.cache_resource`` so the PDF is indexed only once per
    server session.

    Returns:
        The VectorstoreIndexWrapper produced by ``from_loaders`` (its
        ``.vectorstore`` attribute is used by the caller).
    """
    import os

    with st.spinner("در حال بارگذاری فایل و آماده‌سازی... لطفاً صبور باشید 🙏"):
        progress_bar = st.progress(0, text="در حال بارگذاری فایل PDF...")

        pdf_loader = PyPDFLoader('test1.pdf')
        # BUG FIX: the original called pdf_loader.load() here and discarded
        # the result — from_loaders() below loads the PDF itself, so the
        # extra call only doubled the parsing work.
        progress_bar.progress(30, text="صفحات PDF بارگذاری شد. در حال ایجاد مدل برداری...")

        # SECURITY FIX: the API key was hard-coded (and therefore leaked in
        # the repository — it must be revoked). Read it from the environment.
        embeddings = TogetherEmbeddings(api_key=os.environ["TOGETHER_API_KEY"])
        progress_bar.progress(60, text="مدل Embedding ساخته شد. در حال ایجاد ایندکس...")

        index = VectorstoreIndexCreator(
            embedding=embeddings,
            text_splitter=RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0)
        ).from_loaders([pdf_loader])

        progress_bar.progress(100, text="بارگذاری کامل شد! ✅")
        # BUG FIX: remove the finished progress bar instead of leaving it
        # on screen permanently.
        progress_bar.empty()
        return index
 
 
 
 
 
 
 
 
 
 
 
 
 
33
# --- LLM setup: Together-hosted Llama 3 via the OpenAI-compatible endpoint.
import os

llm = ChatOpenAI(
    base_url="https://api.together.xyz/v1",
    # SECURITY FIX: the key was hard-coded in source (and therefore leaked —
    # it must be revoked). Read it from the environment instead.
    api_key=os.environ["TOGETHER_API_KEY"],
    model="meta-llama/Llama-3-70B-Instruct-Turbo-Free"
)
38
 
39
# Build the index once (cached across reruns) and wire it into a
# RetrievalQA chain; queries are passed under the 'question' key.
index = load_chunks_and_embeddings()

doc_retriever = index.vectorstore.as_retriever()
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=doc_retriever,
    input_key='question',
)
45
+
 
46
 
47
  # --- 💬 چت بات
48
  if 'messages' not in st.session_state: