ramysaidagieb committed
Commit 3242064 · verified · 1 Parent(s): a4a047b

Update app.py

Files changed (1)
  1. app.py +78 -69
app.py CHANGED
@@ -1,89 +1,98 @@
-
  import gradio as gr
  import os
  import tempfile
  import shutil
- import pdfminer.high_level
- import docx
- import faiss
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
- from sentence_transformers import SentenceTransformer
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain.vectorstores import FAISS
- device = "cuda" if torch.cuda.is_available() else "cpu"

- # Load the models
- embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', device=device)
- qa_model_name = "aubmindlab/aragpt2-base"
- qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
- qa_model = AutoModelForCausalLM.from_pretrained(qa_model_name).to(device)
- # Set up the in-memory vector store
- index = None
- docs = []

- def extract_text(file_path):
-     if file_path.endswith('.pdf'):
-         with open(file_path, 'rb') as f:
-             return pdfminer.high_level.extract_text(f)
-     elif file_path.endswith('.docx') or file_path.endswith('.doc'):
-         doc = docx.Document(file_path)
-         return "\n".join([para.text for para in doc.paragraphs])
      else:
-         raise ValueError("صيغة ملف غير مدعومة")  # "Unsupported file format"

- def process_files(files):
-     global index, docs
-     all_text = ""
-     for file in files:
-         text = extract_text(file.name)
-         all_text += text + "\n"

-     # Split the text into chunks
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-     texts = text_splitter.split_text(all_text)

-     # Embed the chunks and index them with FAISS
-     embeddings = embedding_model.encode(texts, show_progress_bar=True, convert_to_tensor=True)
-     index = faiss.IndexFlatL2(embeddings.shape[1])
-     index.add(embeddings.cpu().numpy())
-     docs = texts
-     return "✅ تم تحميل الكتب واستيعاب الأفكار! النظام جاهز للإجابة."  # "Books loaded and ideas absorbed! The system is ready to answer."
- def generate_answer(question):
-     global index, docs
-     if index is None:
-         return "❌ الرجاء رفع الكتب أولاً."  # "Please upload the books first."

-     q_emb = embedding_model.encode([question])
-     D, I = index.search(q_emb, k=3)
-     context = "\n".join([docs[i] for i in I[0]])

-     # Prepare the model input
-     prompt = f"سؤال: {question}\n\nمحتوى ذو صلة:\n{context}\n\nالإجابة:"  # "Question: ... Relevant content: ... Answer:"
-     inputs = qa_tokenizer(prompt, return_tensors='pt').to(device)
-     outputs = qa_model.generate(**inputs, max_new_tokens=300, pad_token_id=qa_tokenizer.eos_token_id)
      answer = qa_tokenizer.decode(outputs[0], skip_special_tokens=True)
-     return answer.split("الإجابة:")[-1].strip()
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown("""
-     # 🚀 أهلاً بك في محاكاة عقل المؤلف
-     ارفع كتبك واستعد للانطلاق في رحلة استكشاف الأفكار العميقة!
-     """)  # "Welcome to the author's mind simulation / Upload your books and set off exploring deep ideas!"
-     with gr.Tab("📚 رفع الكتب للتدريب"):  # "Upload books for training"
-         upload = gr.File(file_types=['.pdf', '.docx', '.doc'], file_count='multiple')
-         train_button = gr.Button("🚀 ابدأ التدريب!")
-         train_output = gr.Textbox(label="🔵 حالة التدريب", interactive=False)
-     with gr.Tab("❓ اسأل الكتاب"):  # "Ask the book"
-         question = gr.Textbox(label="اكتب سؤالك هنا...")
-         answer = gr.Textbox(label="الإجابة", interactive=False)
-         ask_button = gr.Button("✉️ أرسل السؤال!")

-     train_button.click(process_files, inputs=[upload], outputs=[train_output])
-     ask_button.click(generate_answer, inputs=[question], outputs=[answer])

- demo.launch()
  import gradio as gr
  import os
  import tempfile
  import shutil
+ from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredWordDocumentLoader
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch

+ EMBEDDING_MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
+ QA_MODEL_NAME = "mosaicml/mpt-7b-storywriter"

+ embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME, trust_remote_code=True)
+ # MPT is a decoder-only model, so it must be loaded with AutoModelForCausalLM;
+ # AutoModelForSeq2SeqLM does not support this architecture.
+ qa_model = AutoModelForCausalLM.from_pretrained(QA_MODEL_NAME, trust_remote_code=True).to(device)
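+ # Note: mpt-7b-storywriter has roughly 7B parameters (about 28 GB in float32),
+ # so in practice a GPU, half precision, or a smaller model is advisable.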
 
+ # Global vector store, populated once files have been uploaded and indexed
+ vectordb = None
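+ # A module-level global is the simplest way to share the index between the two
+ # Gradio callbacks; gr.State would be needed for per-session isolation.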
 
 
+ def load_document(file_path):
+     ext = os.path.splitext(file_path)[1].lower()
+     if ext == ".pdf":
+         loader = PyMuPDFLoader(file_path)
+     elif ext in [".doc", ".docx"]:
+         loader = UnstructuredWordDocumentLoader(file_path)
      else:
+         raise ValueError("صيغة الملف غير مدعومة.")  # "Unsupported file format."
+     return loader.load()
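+ # Each loader returns a list of langchain Document objects carrying page_content
+ # plus metadata (source path, page number), which FAISS.from_documents preserves.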
 
 
 
 
 
 
+ def train_from_documents(documents):
+     splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+     texts = splitter.split_documents(documents)
+     vectorstore = FAISS.from_documents(texts, embedding_model)
+     return vectorstore
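+ # chunk_size=500 with chunk_overlap=50 keeps each indexed chunk small enough for
+ # focused retrieval, while the overlap avoids losing sentences that would
+ # otherwise be cut at chunk boundaries.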
 
+ def upload_files(files):
+     global vectordb
+     temp_dir = tempfile.mkdtemp()
+     all_docs = []
+
+     for file in files:
+         # Gradio hands each upload over as a temp file on disk; copy it by path
+         # rather than calling file.read(), which Gradio file objects do not
+         # reliably expose. os.path.basename avoids joining an absolute path.
+         file_path = os.path.join(temp_dir, os.path.basename(file.name))
+         shutil.copy(file.name, file_path)
+         docs = load_document(file_path)
+         all_docs.extend(docs)
+
+     vectordb = train_from_documents(all_docs)
+     shutil.rmtree(temp_dir)
+     return "✅ النظام جاهز للإجابة على أسئلتك!"  # "The system is ready to answer your questions!"
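+ # Note: every upload rebuilds the index from scratch; to accumulate books across
+ # uploads, vectordb.add_documents(...) could be used instead.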
 
+ def answer_question(question):
+     if vectordb is None:
+         return "⚠️ الرجاء رفع الملفات أولاً."  # "Please upload the files first."
+
+     # Query the FAISS index directly; a RetrievalQA chain is unnecessary here
+     # (it would require an LLM of its own) since generation is done manually below.
+     retriever = vectordb.as_retriever(search_kwargs={"k": 5})
+     relevant_docs = retriever.get_relevant_documents(question)
+     context = "\n".join(doc.page_content for doc in relevant_docs)
+
+     inputs = qa_tokenizer(
+         f"أجب بالعربية فقط بناءً على السياق التالي:\n{context}\nالسؤال: {question}",  # "Answer in Arabic only, based on the following context: ... Question: ..."
+         return_tensors="pt",
+         truncation=True,
+         max_length=1024
+     ).to(device)
+
+     with torch.no_grad():
+         # max_new_tokens bounds only the continuation; max_length would also count
+         # the prompt tokens and can cut generation off entirely on long prompts.
+         outputs = qa_model.generate(**inputs, max_new_tokens=300)
+     # Slice off the echoed prompt so only the newly generated answer is decoded.
+     answer = qa_tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+     return answer
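+ # Caveat: with truncation at max_length=1024 the prompt is cut from the right,
+ # and the question sits at the end of the prompt, so a very long retrieved
+ # context can push the question itself out of the model's input.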
 
+ with gr.Blocks(title="محاكاة دماغ المؤلف") as demo:  # "Simulating the author's brain"
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("## 📚 ارفع كتبك هنا")  # "Upload your books here"
+             file_uploader = gr.File(file_types=[".pdf", ".doc", ".docx"], file_count="multiple")
+             upload_button = gr.Button("🚀 ابدأ التدريب")  # "Start training"
+             training_status = gr.Textbox(label="حالة التدريب", interactive=False)
+
+         with gr.Column():
+             gr.Markdown("## اطرح سؤالك")  # "Ask your question"
+             question_input = gr.Textbox(label="سؤالك", placeholder="اكتب سؤالك هنا...")
+             ask_button = gr.Button("✉️ أرسل السؤال!")  # "Send the question!"
+             answer_output = gr.Textbox(label="الإجابة", interactive=False)  # "Answer"

+     upload_button.click(upload_files, inputs=[file_uploader], outputs=[training_status])
+     ask_button.click(answer_question, inputs=[question_input], outputs=[answer_output])

+ demo.launch(share=True)
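+ # share=True requests a public gradio.live tunnel; when the app runs on
+ # Hugging Face Spaces this is redundant, as Spaces already serves a public URL.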