M17idd committed
Commit 41af8de · verified · 1 Parent(s): dce6443

Update app.py

Files changed (1):
  1. app.py +43 -5
app.py CHANGED
@@ -25,12 +25,24 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModel.from_pretrained(model_name)
 
 # ----------------- Load the PDF and build the index -----------------
+import os
+import streamlit as st
+import torch
+from transformers import AutoTokenizer, AutoModel
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.schema import Document as LangchainDocument
+from sentence_transformers import SentenceTransformer
+import numpy as np
+
 @st.cache_resource
 def build_pdf_index():
     with st.spinner('📄 Processing the PDF file...'):
+        # Load the file
         loader = PyPDFLoader("test1.pdf")
         pages = loader.load()
 
+        # Split the text into chunks
         splitter = RecursiveCharacterTextSplitter(
             chunk_size=500,
             chunk_overlap=50
@@ -42,15 +54,41 @@ def build_pdf_index():
 
         documents = [LangchainDocument(page_content=t) for t in texts]
 
+        # Embedding models
+        tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-zwnj-base")
+        bert_model = AutoModel.from_pretrained("HooshvareLab/bert-fa-zwnj-base")
+
+        sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
+
         embeddings = []
-        for doc in documents:
-            inputs = tokenizer(doc.page_content, return_tensors="pt", padding=True, truncation=True)
-            with torch.no_grad():
-                outputs = model(**inputs)
-            embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy())
+        batch_size = 16
+
+        for i in range(0, len(documents), batch_size):
+            batch_docs = documents[i:i+batch_size]
+            batch_texts = [doc.page_content for doc in batch_docs]
+
+            # Try the SentenceTransformer model first (much faster)
+            try:
+                batch_embeddings = sentence_model.encode(batch_texts, batch_size=batch_size, convert_to_numpy=True)
+            except Exception as e:
+                st.error(f"❌ SentenceTransformer error: {e}")
+                batch_embeddings = []
+
+            # If that failed, fall back to BERT
+            if len(batch_embeddings) == 0:
+                inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
+                with torch.no_grad():
+                    outputs = bert_model(**inputs)
+                batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
+
+            embeddings.extend(batch_embeddings)
+
+        # Make sure the output is a NumPy array
+        embeddings = np.array(embeddings)
 
         return documents, embeddings
 
+
 # ----------------- Define the LLM from Groq -----------------
 groq_api_key = "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"
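A note on consuming the new return value: build_pdf_index now returns documents alongside a (num_chunks, dim) NumPy array, so retrieval reduces to a cosine-similarity lookup. Below is a minimal sketch under the assumption that the query is embedded with the same model that embedded the chunks; the retrieve helper and its top_k parameter are illustrative, not part of this commit:

import numpy as np

def retrieve(query, documents, embeddings, sentence_model, top_k=3):
    # Embed the question with the same model that embedded the chunks.
    q = sentence_model.encode([query], convert_to_numpy=True)[0]
    # Cosine similarity between the query vector and every chunk vector.
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(q)
    scores = embeddings @ q / np.clip(norms, 1e-12, None)
    # Highest-scoring chunk indices first.
    best = np.argsort(scores)[::-1][:top_k]
    return [documents[i] for i in best]

Since sentence_model is local to build_pdf_index in the diff, a caller would have to re-instantiate it (or have the function return it); embedding the query with a different model than the chunks would make the scores meaningless.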
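One caveat carried over from the old code into the BERT fallback: outputs.last_hidden_state.mean(dim=1) averages padding positions along with real tokens, which skews vectors for short chunks in a padded batch. A mask-aware mean-pooling sketch (an alternative, not what the commit does):

import torch

def masked_mean_pool(texts, tokenizer, bert_model):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Zero out hidden states at padding positions before averaging.
    mask = inputs["attention_mask"].unsqueeze(-1).float()   # (batch, seq, 1)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)  # (batch, hidden)
    counts = mask.sum(dim=1).clamp(min=1e-9)                # (batch, 1)
    return (summed / counts).cpu().numpy()

Note also that the two models disagree on vector size (MiniLM: 384, bert-fa-zwnj-base: 768), so the per-batch fallback can mix dimensions inside embeddings and turn np.array(embeddings) into a ragged object array; picking one model for the whole corpus avoids this.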