# app.py
import gradio as gr
import os
import pdfminer.high_level
import docx
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import faiss
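# Likely requirements (assumed, not pinned anywhere in this file): gradio,
# pdfminer.six, python-docx, sentence-transformers, transformers, torch, faiss-cpu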
# ====== Settings ======
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
GENERATION_MODEL_NAME = "aubmindlab/aragpt2-small"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
TOP_K = 5
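# CHUNK_SIZE and CHUNK_OVERLAP are measured in words, not tokens; TOP_K is the
# number of chunks retrieved per question.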
# ====== Load Models ======
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
gen_tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
gen_model = AutoModelForCausalLM.from_pretrained(GENERATION_MODEL_NAME)
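# Both models are downloaded from the Hugging Face Hub on first run. The MiniLM
# embedder is multilingual (it covers Arabic) and produces 384-dimensional
# vectors; aragpt2-small is a small Arabic GPT-2 used for generation.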
# ====== Globals ======
index = None
chunks = []
# ====== Helpers ======
def extract_text_from_pdf(file_path):
    # pdfminer's extract_text accepts a path or a binary file object
    with open(file_path, 'rb') as f:
        return pdfminer.high_level.extract_text(f)
def extract_text_from_docx(file_path):
    # note: python-docx reads .docx only; legacy .doc files will raise an error
    doc = docx.Document(file_path)
    return "\n".join(para.text for para in doc.paragraphs)
def chunk_text(text):
    words = text.split()
    chunks = []
    # step by CHUNK_SIZE - CHUNK_OVERLAP so consecutive chunks share CHUNK_OVERLAP words
    for i in range(0, len(words), CHUNK_SIZE - CHUNK_OVERLAP):
        chunks.append(" ".join(words[i:i + CHUNK_SIZE]))
    return chunks
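# e.g. with CHUNK_SIZE=500 and CHUNK_OVERLAP=50, a 1,000-word text yields
# chunks starting at words 0, 450, and 900.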
def build_vector_store(chunks):
    vectors = embedder.encode(chunks)  # float32 numpy array, one row per chunk
    dim = vectors.shape[1]
    idx = faiss.IndexFlatL2(dim)
    idx.add(vectors)
    return idx, vectors
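# IndexFlatL2 does exact (brute-force) L2 search, which is fine at this scale;
# for cosine similarity instead, one could normalize the vectors and use IndexFlatIP.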
def retrieve_relevant_chunks(question, idx, chunks):
    q_vec = embedder.encode([question])
    D, I = idx.search(q_vec, TOP_K)
    # faiss pads missing results with -1, so guard both bounds
    return [chunks[i] for i in I[0] if 0 <= i < len(chunks)]
def generate_answer(context_chunks, question):
    context = "\n".join(context_chunks)
    # Arabic prompt: "Question: {question}\nRelevant content: {context}\nAnswer:"
    prompt = f"سؤال: {question}\nمحتوى ذو صلة: {context}\nجواب:"
    # leave room in GPT-2's 1024-token context window for the generated tokens
    inputs = gen_tokenizer(prompt, return_tensors="pt", max_length=1024 - 100, truncation=True)
    outputs = gen_model.generate(
        **inputs,
        max_new_tokens=100,
        pad_token_id=gen_tokenizer.eos_token_id,  # GPT-2 has no pad token
    )
    answer = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # keep only the text generated after the final "جواب:" ("Answer:") marker
    return answer.split("جواب:")[-1].strip()
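# This is a minimal retrieval-augmented generation (RAG) loop: the retrieved
# chunks are stuffed into the prompt and the language model completes the answer.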
# ====== Gradio Functions ======
def upload_and_train(files):
    global index, chunks
    all_text = ""
    for file in files:
        # Gradio supplies uploads as file paths (or objects carrying a .name path),
        # so read straight from that path instead of copying to a new temp file
        file_path = file if isinstance(file, str) else file.name
        suffix = os.path.splitext(file_path)[-1].lower()
        if suffix == ".pdf":
            all_text += extract_text_from_pdf(file_path) + "\n"
        elif suffix in [".docx", ".doc"]:
            all_text += extract_text_from_docx(file_path) + "\n"
    chunks = chunk_text(all_text)
    if not chunks:
        return "لم يتم استخراج أي نص من الملفات المرفوعة"  # "No text was extracted from the uploaded files"
    index, _ = build_vector_store(chunks)
    return "✅ النظام جاهز للإجابة على أسئلتك"  # "The system is ready to answer your questions"
def ask_question(user_question):
    if index is None:
        return "الرجاء رفع الكتب أولاً وتدريب النظام."  # "Please upload the books and train the system first."
    rel_chunks = retrieve_relevant_chunks(user_question, index, chunks)
    return generate_answer(rel_chunks, user_question)
# ====== Gradio Interface ======
upload = gr.File(file_types=[".pdf", ".doc", ".docx"], file_count="multiple")
train_btn = gr.Button("ابدأ التدريب")  # "Start training"
train_output = gr.Textbox()
question_input = gr.Textbox(placeholder="اكتب سؤالك هنا باللغة العربية")  # "Type your question here in Arabic"
answer_output = gr.Textbox()
ask_btn = gr.Button("أرسل السؤال")  # "Send the question"
with gr.Blocks() as demo:
    # heading: "Author-brain simulation - a smart system for answering questions from your Arabic books"
    gr.Markdown("# 🧠 محاكاة دماغ المؤلف - نظام ذكي للإجابة على الأسئلة من كتبك بالعربية")
    upload.render()
    train_btn.render()
    train_output.render()
    question_input.render()
    ask_btn.render()
    answer_output.render()
    train_btn.click(upload_and_train, inputs=[upload], outputs=[train_output])
    ask_btn.click(ask_question, inputs=[question_input], outputs=[answer_output])
# Launch
if __name__ == "__main__":
    demo.launch()
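# On a Hugging Face Space this file runs as-is; locally, `python app.py`
# starts the Gradio server.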