# app.py
import gradio as gr
import os
import pdfminer.high_level
import docx
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import faiss
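# Required packages (PyPI names): gradio, pdfminer.six, python-docx,
# sentence-transformers, transformers, torch, faiss-cpu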
# ====== Settings ======
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
GENERATION_MODEL_NAME = "aubmindlab/aragpt2-small"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
TOP_K = 5
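# CHUNK_SIZE and CHUNK_OVERLAP are measured in whitespace-split words; the overlap
# keeps sentences that straddle a chunk boundary retrievable from either side.
# TOP_K is the number of chunks retrieved per question.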
# ====== Load Models ======
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
gen_tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
gen_model = AutoModelForCausalLM.from_pretrained(GENERATION_MODEL_NAME)
# ====== Globals ======
index = None
chunks = []
# ====== Helpers ======
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as f:
        return pdfminer.high_level.extract_text(f)

def extract_text_from_docx(file_path):
    # Note: python-docx reads .docx only; legacy binary .doc files are not supported.
    doc = docx.Document(file_path)
    return "\n".join(para.text for para in doc.paragraphs)
def chunk_text(text):
    words = text.split()
    chunk_list = []
    for i in range(0, len(words), CHUNK_SIZE - CHUNK_OVERLAP):
        chunk_list.append(" ".join(words[i:i + CHUNK_SIZE]))
    return chunk_list
def build_vector_store(chunks):
    vectors = embedder.encode(chunks, convert_to_numpy=True)
    dim = vectors.shape[1]
    idx = faiss.IndexFlatL2(dim)
    idx.add(vectors)
    return idx, vectors
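# IndexFlatL2 ranks by Euclidean distance. Since encode() does not normalize
# embeddings by default, cosine similarity (normalize, then IndexFlatIP) is a
# common alternative; exact L2 search is a reasonable first pass here.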
def retrieve_relevant_chunks(question, idx, chunks):
    q_vec = embedder.encode([question], convert_to_numpy=True)
    D, I = idx.search(q_vec, TOP_K)
    # FAISS pads with -1 when the store holds fewer than TOP_K vectors,
    # so filter out-of-range indices on both ends.
    return [chunks[i] for i in I[0] if 0 <= i < len(chunks)]
def generate_answer(context_chunks, question):
    context = " \n".join(context_chunks)
    # Prompt (Arabic): "Question: ... / Relevant content: ... / Answer:"
    prompt = f"سؤال: {question}\nمحتوى ذو صلة: {context}\nجواب:"
    inputs = gen_tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    with torch.no_grad():
        outputs = gen_model.generate(
            **inputs,
            max_new_tokens=100,
            pad_token_id=gen_tokenizer.eos_token_id,  # GPT-2-style models define no pad token
        )
    answer = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # generate() echoes the prompt; keep only the text after the final "جواب:" ("Answer:").
    return answer.split("جواب:")[-1].strip()
# ====== Gradio Functions ======
def upload_and_train(files):
    global index, chunks
    all_text = ""
    for file in files:
        # gr.File already saves uploads to temp files on disk; depending on the
        # Gradio version each item is a path string or an object with a .name path,
        # so read the file in place instead of copying its bytes to another temp file.
        file_path = file if isinstance(file, str) else file.name
        suffix = os.path.splitext(file_path)[-1].lower()
        if suffix == ".pdf":
            all_text += extract_text_from_pdf(file_path) + "\n"
        elif suffix == ".docx":
            # python-docx cannot parse legacy .doc files, so only .docx is handled.
            all_text += extract_text_from_docx(file_path) + "\n"
    chunks = chunk_text(all_text)
    if not chunks:
        return "لم يتم استخراج أي نص من الملفات المرفوعة."  # "No text could be extracted from the uploaded files."
    index, _ = build_vector_store(chunks)
    return "✅ النظام جاهز للإجابة على أسئلتك"  # "The system is ready to answer your questions."
def ask_question(user_question):
    if index is None:
        return "الرجاء رفع الكتب أولاً وتدريب النظام."  # "Please upload the books first and train the system."
    rel_chunks = retrieve_relevant_chunks(user_question, index, chunks)
    return generate_answer(rel_chunks, user_question)
# ====== Gradio Interface ======
with gr.Blocks() as demo:
    # Title (Arabic): "Simulating the author's brain - a smart system for answering questions from your Arabic books"
    gr.Markdown("# 🧠 محاكاة دماغ المؤلف - نظام ذكي للإجابة على الأسئلة من كتبك بالعربية")
    upload = gr.File(file_types=[".pdf", ".docx"], file_count="multiple")
    train_btn = gr.Button("ابدأ التدريب")  # "Start training"
    train_output = gr.Textbox()
    question_input = gr.Textbox(placeholder="اكتب سؤالك هنا باللغة العربية")  # "Write your question here in Arabic"
    ask_btn = gr.Button("أرسل السؤال")  # "Send the question"
    answer_output = gr.Textbox()
    train_btn.click(upload_and_train, inputs=[upload], outputs=[train_output])
    ask_btn.click(ask_question, inputs=[question_input], outputs=[answer_output])
# Launch
if __name__ == "__main__":
    demo.launch()