File size: 3,867 Bytes
b278fd0
 
 
 
3242064
b278fd0
ed001a0
3242064
 
 
 
b278fd0
3242064
 
b278fd0
3242064
 
 
 
b278fd0
3242064
b278fd0
3242064
 
 
 
 
 
b278fd0
3242064
 
b278fd0
3242064
 
 
 
 
b278fd0
3242064
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b278fd0
3242064
 
 
b278fd0
3242064
 
 
 
 
 
b278fd0
3242064
 
 
 
 
 
 
 
 
 
 
 
b278fd0
3242064
b278fd0
3242064
 
 
 
 
 
 
 
 
 
 
 
 
b278fd0
3242064
 
b278fd0
3242064
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import shutil
import tempfile

import gradio as gr
import torch
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredWordDocumentLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer

# Model identifiers used throughout the app.
EMBEDDING_MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
QA_MODEL_NAME = "mosaicml/mpt-7b-storywriter"

# Embedding model used to index document chunks (Arabic BERT variant).
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME, trust_remote_code=True)
# BUG FIX: MPT models are decoder-only causal LMs; loading one through
# AutoModelForSeq2SeqLM raises "Unrecognized configuration class", so the
# causal-LM auto class must be used instead.
# NOTE(review): mpt-7b-storywriter is an English storytelling model — it is
# unlikely to answer Arabic questions well; confirm the model choice.
qa_model = AutoModelForCausalLM.from_pretrained(QA_MODEL_NAME, trust_remote_code=True).to(device)

# FAISS index; built lazily by upload_files() after documents are uploaded.
vectordb = None

def load_document(file_path):
    """Load a single PDF or Word file into LangChain documents.

    Args:
        file_path: Path to a .pdf, .doc, or .docx file on disk.

    Returns:
        The list of documents produced by the matching LangChain loader.

    Raises:
        ValueError: if the file extension is not one of the supported ones.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix == ".pdf":
        return PyMuPDFLoader(file_path).load()
    if suffix in (".doc", ".docx"):
        return UnstructuredWordDocumentLoader(file_path).load()
    raise ValueError("صيغة الملف غير مدعومة.")

def train_from_documents(documents):
    """Chunk *documents* and index the chunks in a new FAISS vector store.

    Args:
        documents: LangChain documents produced by load_document().

    Returns:
        A FAISS vector store built over 500-character chunks (50 overlap).
    """
    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = chunker.split_documents(documents)
    return FAISS.from_documents(chunks, embedding_model)

def upload_files(files):
    """Ingest uploaded files, build the global FAISS index, and report status.

    Args:
        files: Uploaded file objects from the Gradio File component; each is
            expected to expose ``.name`` and ``.read()``.

    Returns:
        A status string shown in the UI once indexing succeeds.
    """
    global vectordb
    temp_dir = tempfile.mkdtemp()
    all_docs = []
    try:
        for file in files:
            # BUG FIX: Gradio typically reports an absolute temp path in
            # file.name, and os.path.join(temp_dir, <absolute path>) silently
            # discards temp_dir — keep only the base name so the copy really
            # lands inside the scratch directory.
            file_path = os.path.join(temp_dir, os.path.basename(file.name))
            with open(file_path, "wb") as f:
                f.write(file.read())
            all_docs.extend(load_document(file_path))
        vectordb = train_from_documents(all_docs)
    finally:
        # BUG FIX: always remove the scratch directory, even when loading or
        # indexing raises; the original leaked it on any failure.
        shutil.rmtree(temp_dir, ignore_errors=True)
    return "✅ النظام جاهز للإجابة على أسئلتك!"

def answer_question(question):
    """Answer *question* from the indexed documents using the local QA model.

    Args:
        question: User question in Arabic.

    Returns:
        The generated answer, or a warning string when no documents have been
        indexed yet.
    """
    if vectordb is None:
        return "⚠️ الرجاء رفع الملفات أولاً."

    # BUG FIX: the original constructed RetrievalQA.from_chain_type(llm=None,
    # ...) only to reach its retriever — that construction fails with a None
    # LLM, and the chain was never used to answer. Query the retriever
    # directly instead.
    retriever = vectordb.as_retriever(search_kwargs={"k": 5})
    relevant_docs = retriever.get_relevant_documents(question)
    context = "\n".join(doc.page_content for doc in relevant_docs)

    inputs = qa_tokenizer(
        f"أجب بالعربية فقط بناءً على السياق التالي:\n{context}\nالسؤال: {question}",
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(device)

    with torch.no_grad():
        # BUG FIX: max_length=300 bounds the TOTAL sequence, so any prompt
        # longer than 300 tokens made generation impossible; max_new_tokens
        # bounds only the generated continuation.
        outputs = qa_model.generate(**inputs, max_new_tokens=300)
    answer = qa_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# --- Gradio UI: two columns — upload/train on the left, Q&A on the right. ---
with gr.Blocks(title="محاكاة دماغ المؤلف") as demo:
    with gr.Row():
        # Left column: upload documents and build the vector index.
        with gr.Column():
            gr.Markdown("## 📚 ارفع كتبك هنا")
            book_files = gr.File(file_types=[".pdf", ".doc", ".docx"], file_count="multiple")
            train_btn = gr.Button("🚀 ابدأ التدريب")
            status_box = gr.Textbox(label="حالة التدريب", interactive=False)

        # Right column: ask questions against the indexed documents.
        with gr.Column():
            gr.Markdown("## ❓ اطرح سؤالك")
            question_box = gr.Textbox(label="سؤالك", placeholder="اكتب سؤالك هنا...")
            submit_btn = gr.Button("✉️ أرسل السؤال!")
            answer_box = gr.Textbox(label="الإجابة", interactive=False)

    # Wire the buttons to the backend handlers.
    train_btn.click(upload_files, inputs=[book_files], outputs=[status_box])
    submit_btn.click(answer_question, inputs=[question_box], outputs=[answer_box])

demo.launch(share=True)