import gradio as gr
import os
from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Arabic BERT model for embeddings; MPT-7B-StoryWriter for answer generation.
EMBEDDING_MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
QA_MODEL_NAME = "mosaicml/mpt-7b-storywriter"

embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME, trust_remote_code=True)
# MPT-7B-StoryWriter is a decoder-only (causal) model, so it must be loaded with
# AutoModelForCausalLM; AutoModelForSeq2SeqLM would raise at load time.
# Note: in float32 this 7B model needs roughly 26 GB of memory.
qa_model = AutoModelForCausalLM.from_pretrained(QA_MODEL_NAME, trust_remote_code=True).to(device)

vectordb = None  # FAISS index, built once documents are uploaded


def load_document(file_path):
    """Load a single PDF or Word document into LangChain Document objects."""
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        loader = PyMuPDFLoader(file_path)
    elif ext in (".doc", ".docx"):
        loader = UnstructuredWordDocumentLoader(file_path)
    else:
        raise ValueError("صيغة الملف غير مدعومة.")  # "Unsupported file format."
    return loader.load()


def train_from_documents(documents):
    """Split documents into overlapping chunks and index them in FAISS."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = splitter.split_documents(documents)
    return FAISS.from_documents(texts, embedding_model)


def upload_files(files):
    global vectordb
    if not files:
        return "⚠️ الرجاء رفع الملفات أولاً."  # "Please upload the files first."
    all_docs = []
    for file in files:
        # Gradio has already written each upload to a temp file on disk; use that
        # path directly instead of re-reading and re-writing the bytes. (Gradio 3
        # passes objects with a .name path, Gradio 4 passes plain path strings;
        # the original os.path.join/file.read() round-trip fails in both cases.)
        file_path = file.name if hasattr(file, "name") else file
        all_docs.extend(load_document(file_path))
    vectordb = train_from_documents(all_docs)
    return "✅ النظام جاهز للإجابة على أسئلتك!"  # "The system is ready to answer your questions!"


def answer_question(question):
    if vectordb is None:
        return "⚠️ الرجاء رفع الملفات أولاً."  # "Please upload the files first."
    # Retrieve the 5 most similar chunks and splice them into the prompt.
    # (The original built a RetrievalQA chain with llm=None, which fails
    # validation, and then only used its retriever anyway; calling the
    # retriever directly is equivalent and actually runs.)
    retriever = vectordb.as_retriever(search_kwargs={"k": 5})
    relevant_docs = retriever.get_relevant_documents(question)
    context = "\n".join(doc.page_content for doc in relevant_docs)
    # Prompt (Arabic): "Answer in Arabic only, based on the following context: ... Question: ..."
    prompt = f"أجب بالعربية فقط بناءً على السياق التالي:\n{context}\nالسؤال: {question}"
    inputs = qa_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    with torch.no_grad():
        # max_new_tokens rather than max_length: with max_length=300 a prompt
        # near the 1024-token cap would leave no budget for generated tokens.
        outputs = qa_model.generate(**inputs, max_new_tokens=300)
    # A causal LM echoes the prompt in its output; decode only the new tokens.
    answer = qa_tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
    return answer


# UI labels are in Arabic; English translations are given in the comments.
with gr.Blocks(title="محاكاة دماغ المؤلف") as demo:  # "Simulating the Author's Brain"
    with gr.Row():
        with gr.Column():
            gr.Markdown("## 📚 ارفع كتبك هنا")  # "Upload your books here"
            file_uploader = gr.File(file_types=[".pdf", ".doc", ".docx"], file_count="multiple")
            upload_button = gr.Button("🚀 ابدأ التدريب")  # "Start training"
            training_status = gr.Textbox(label="حالة التدريب", interactive=False)  # "Training status"
        with gr.Column():
            gr.Markdown("## ❓ اطرح سؤالك")  # "Ask your question"
            question_input = gr.Textbox(label="سؤالك", placeholder="اكتب سؤالك هنا...")  # "Your question" / "Type your question here..."
            ask_button = gr.Button("✉️ أرسل السؤال!")  # "Send the question!"
            answer_output = gr.Textbox(label="الإجابة", interactive=False)  # "The answer"

    upload_button.click(upload_files, inputs=[file_uploader], outputs=[training_status])
    ask_button.click(answer_question, inputs=[question_input], outputs=[answer_output])

demo.launch(share=True)
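
# ---------------------------------------------------------------------------
# Setup sketch. The package names below are inferred from the imports above;
# the original script does not pin its dependencies, so treat this list as an
# assumption, not an official requirements file. HuggingFaceEmbeddings needs
# sentence-transformers, and MPT's trust_remote_code path needs einops.
#
#   pip install gradio langchain langchain-community faiss-cpu pymupdf \
#       "unstructured[docx]" transformers torch sentence-transformers einops
#
# Quick smoke test of the indexing path without launching the UI
# ("sample.pdf" is a hypothetical file name):
#
#   docs = load_document("sample.pdf")
#   db = train_from_documents(docs)
#   print(db.similarity_search("سؤال تجريبي", k=2))  # "a test question"
# ---------------------------------------------------------------------------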