Update app.py
app.py
CHANGED
@@ -25,12 +25,24 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModel.from_pretrained(model_name)
 
 # ----------------- Load the PDF and build the index -----------------
+import os
+import streamlit as st
+import torch
+from transformers import AutoTokenizer, AutoModel
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.schema import Document as LangchainDocument
+from sentence_transformers import SentenceTransformer
+import numpy as np
+
 @st.cache_resource
 def build_pdf_index():
     with st.spinner('📄 Processing the PDF file...'):
+        # Load the file
         loader = PyPDFLoader("test1.pdf")
         pages = loader.load()
 
+        # Chunk the text
         splitter = RecursiveCharacterTextSplitter(
             chunk_size=500,
             chunk_overlap=50
@@ -42,15 +54,41 @@ def build_pdf_index():
 
         documents = [LangchainDocument(page_content=t) for t in texts]
 
+        # Embedding models
+        tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-zwnj-base")
+        bert_model = AutoModel.from_pretrained("HooshvareLab/bert-fa-zwnj-base")
+
+        sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
+
         embeddings = []
-
-
-
-
-
+        batch_size = 16
+
+        for i in range(0, len(documents), batch_size):
+            batch_docs = documents[i:i+batch_size]
+            batch_texts = [doc.page_content for doc in batch_docs]
+
+            # Try the SentenceTransformer model first (it is much faster)
+            try:
+                batch_embeddings = sentence_model.encode(batch_texts, batch_size=batch_size, convert_to_numpy=True)
+            except Exception as e:
+                st.error(f"❌ SentenceTransformer error: {e}")
+                batch_embeddings = []
+
+            # If that failed, fall back to BERT
+            if len(batch_embeddings) == 0:
+                inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
+                with torch.no_grad():
+                    outputs = bert_model(**inputs)
+                batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
+
+            embeddings.extend(batch_embeddings)
+
+        # Make sure the result is a NumPy array
+        embeddings = np.array(embeddings)
 
     return documents, embeddings
 
+
 # ----------------- Define the LLM from Groq -----------------
 groq_api_key = "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"
 
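One subtlety in the BERT fallback above: outputs.last_hidden_state.mean(dim=1) averages over every token position, including the [PAD] tokens that padding=True appends to the shorter texts in a batch, which dilutes their vectors. Below is a minimal sketch of mask-aware mean pooling, reusing the inputs/outputs names from the commit; the mean_pool helper itself is our illustration, not part of the commit.

import torch

def mean_pool(last_hidden_state, attention_mask):
    # Zero out padding positions, then average only over real tokens.
    mask = attention_mask.unsqueeze(-1).float()       # (batch, seq_len, 1)
    summed = (last_hidden_state * mask).sum(dim=1)    # sum of real-token vectors
    counts = mask.sum(dim=1).clamp(min=1e-9)          # number of real tokens per text
    return summed / counts

# In the fallback branch this would replace the plain mean:
# batch_embeddings = mean_pool(outputs.last_hidden_state, inputs["attention_mask"]).cpu().numpy()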
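build_pdf_index() returns the chunks and their vectors but no search structure, so retrieval is left to the caller. Below is a sketch of a cosine-similarity lookup over the returned arrays; the top_chunks helper and its top_k parameter are our illustration, and it assumes every chunk was embedded by the SentenceTransformer path. That assumption matters: paraphrase-multilingual-MiniLM-L12-v2 produces 384-dimensional vectors while bert-fa-zwnj-base produces 768-dimensional ones, so a query must be encoded with whichever model actually produced the chunk vectors, and mixing the two paths across batches would leave np.array(embeddings) with ragged rows.

import numpy as np
from sentence_transformers import SentenceTransformer

# Same model the commit uses for the chunks.
query_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

def top_chunks(query, documents, embeddings, top_k=3):
    # Encode the query, then rank chunks by cosine similarity.
    q = query_model.encode([query], convert_to_numpy=True)[0]
    scores = embeddings @ q / (np.linalg.norm(embeddings, axis=1) * np.linalg.norm(q) + 1e-9)
    best = np.argsort(scores)[::-1][:top_k]
    return [documents[i].page_content for i in best]

documents, embeddings = build_pdf_index()
context = "\n\n".join(top_chunks("your question here", documents, embeddings))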
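Note also that the commit checks the Groq API key into source, where anyone browsing the repository can read it. A common alternative, already enabled by the import os the commit adds, is to read the key from the environment. A sketch follows; the GROQ_API_KEY variable name is our choice, not anything the commit defines.

import os
import streamlit as st

# Read the key at startup and fail fast with a visible message.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    st.error("GROQ_API_KEY is not set")
    st.stop()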