# app.py
import os

import docx
import faiss
import gradio as gr
import pdfminer.high_level
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer

# ====== Settings ======
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
GENERATION_MODEL_NAME = "aubmindlab/aragpt2-small"
CHUNK_SIZE = 500      # words per chunk
CHUNK_OVERLAP = 50    # words shared between consecutive chunks
TOP_K = 5             # chunks retrieved per question

# ====== Load Models ======
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
gen_tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
gen_model = AutoModelForCausalLM.from_pretrained(GENERATION_MODEL_NAME)

# ====== Globals ======
index = None   # FAISS index, built after upload
chunks = []    # text chunks backing the index

# ====== Helpers ======
def extract_text_from_pdf(file_path):
    with open(file_path, "rb") as f:
        return pdfminer.high_level.extract_text(f)


def extract_text_from_docx(file_path):
    # python-docx reads .docx only; legacy .doc files will raise an error here.
    doc = docx.Document(file_path)
    return "\n".join(para.text for para in doc.paragraphs)


def chunk_text(text):
    """Split text into overlapping windows of CHUNK_SIZE words."""
    words = text.split()
    step = CHUNK_SIZE - CHUNK_OVERLAP
    return [" ".join(words[i:i + CHUNK_SIZE]) for i in range(0, len(words), step)]


def build_vector_store(text_chunks):
    """Embed every chunk and index the vectors in a flat L2 FAISS index."""
    vectors = embedder.encode(text_chunks)  # float32 numpy array, as FAISS expects
    idx = faiss.IndexFlatL2(vectors.shape[1])
    idx.add(vectors)
    return idx, vectors


def retrieve_relevant_chunks(question, idx, text_chunks):
    q_vec = embedder.encode([question])
    _, I = idx.search(q_vec, TOP_K)
    # FAISS pads missing results with -1, so keep only valid indices.
    return [text_chunks[i] for i in I[0] if 0 <= i < len(text_chunks)]


def generate_answer(context_chunks, question):
    context = "\n".join(context_chunks)
    # Prompt template (Arabic): "Question: ...\nRelevant content: ...\nAnswer:"
    prompt = f"سؤال: {question}\nمحتوى ذو صلة: {context}\nجواب:"
    inputs = gen_tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = gen_model.generate(**inputs, max_new_tokens=100)
    answer = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text after the final "Answer:" marker.
    return answer.split("جواب:")[-1].strip()


# ====== Gradio Functions ======
def upload_and_train(files):
    global index, chunks
    all_text = ""
    for file in files:
        # Gradio has already saved each upload to a temp file; `file.name` is its
        # path, so no extra NamedTemporaryFile round-trip is needed.
        path = file.name
        suffix = os.path.splitext(path)[-1].lower()
        if suffix == ".pdf":
            all_text += extract_text_from_pdf(path) + "\n"
        elif suffix in (".docx", ".doc"):
            all_text += extract_text_from_docx(path) + "\n"
    chunks = chunk_text(all_text)
    index, _ = build_vector_store(chunks)
    return "✅ النظام جاهز للإجابة على أسئلتك"  # "The system is ready to answer your questions"


def ask_question(user_question):
    if index is None:
        return "الرجاء رفع الكتب أولاً وتدريب النظام."  # "Please upload the books and train the system first."
    rel_chunks = retrieve_relevant_chunks(user_question, index, chunks)
    return generate_answer(rel_chunks, user_question)


# ====== Gradio Interface ======
with gr.Blocks() as demo:
    # Title: "Simulating the author's brain - a smart system for answering questions from your books in Arabic"
    gr.Markdown("# 🧠 محاكاة دماغ المؤلف - نظام ذكي للإجابة على الأسئلة من كتبك بالعربية")
    upload = gr.File(file_types=[".pdf", ".doc", ".docx"], file_count="multiple")
    train_btn = gr.Button("ابدأ التدريب")  # "Start training"
    train_output = gr.Textbox()
    question_input = gr.Textbox(placeholder="اكتب سؤالك هنا باللغة العربية")  # "Write your question here in Arabic"
    ask_btn = gr.Button("أرسل السؤال")  # "Send the question"
    answer_output = gr.Textbox()

    train_btn.click(upload_and_train, inputs=[upload], outputs=[train_output])
    ask_btn.click(ask_question, inputs=[question_input], outputs=[answer_output])

# Launch
if __name__ == "__main__":
    demo.launch()
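
# ---- Running the app (a minimal sketch; exact package versions are assumptions) ----
# The imports above map to these PyPI packages:
#   pip install gradio pdfminer.six python-docx sentence-transformers transformers torch faiss-cpu
# Then:
#   python app.py
# Gradio prints a local URL; open it, upload your .pdf/.docx books, click
# "ابدأ التدريب" (Start training), and ask questions in Arabic.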