# NOTE: "Spaces: Sleeping" below was Hugging Face Spaces page residue captured
# during extraction — it is not part of the program source.
import os
import shutil
import tempfile

import gradio as gr
import torch
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyMuPDFLoader,
    UnstructuredWordDocumentLoader,
)
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# --- Model configuration ------------------------------------------------------
# Arabic-tuned BERT used only to embed document chunks for retrieval.
EMBEDDING_MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
# BUGFIX: the original value, "mosaicml/mpt-7b-storywriter", is a decoder-only
# (causal) English model — AutoModelForSeq2SeqLM.from_pretrained raises for it,
# so the app crashed before the UI ever started. mT5 is a genuine multilingual
# encoder-decoder model that the Seq2Seq auto-class can load and that can
# generate Arabic text.
QA_MODEL_NAME = "google/mt5-base"

embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Prefer GPU when available; CPU generation works but is slow.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# trust_remote_code=True removed: mT5 ships with transformers itself, so there
# is no reason to execute model-repo code.
qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME)
qa_model = AutoModelForSeq2SeqLM.from_pretrained(QA_MODEL_NAME).to(device)

# FAISS index, built lazily after the user uploads documents (see upload_files).
vectordb = None
def load_document(file_path):
    """Load one document from disk and return its pages/elements.

    Dispatches on the file extension: PDFs go through PyMuPDFLoader, Word
    files through UnstructuredWordDocumentLoader.

    Raises:
        ValueError: if the extension is not .pdf, .doc, or .docx.
    """
    extension = os.path.splitext(file_path)[1].lower()
    # Guard clause: reject unsupported formats before touching any loader.
    if extension not in (".pdf", ".doc", ".docx"):
        raise ValueError("صيغة الملف غير مدعومة.")
    if extension == ".pdf":
        return PyMuPDFLoader(file_path).load()
    return UnstructuredWordDocumentLoader(file_path).load()
def train_from_documents(documents):
    """Chunk *documents* and index the chunks in a fresh FAISS store.

    Uses 500-character chunks with 50-character overlap, embedded with the
    module-level Arabic embedding model.
    """
    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = chunker.split_documents(documents)
    return FAISS.from_documents(chunks, embedding_model)
def upload_files(files):
    """Gradio callback: ingest the uploaded files and (re)build the index.

    Copies each uploaded file into a private temp directory, loads and chunks
    it, and rebuilds the module-level ``vectordb`` from scratch.

    Returns a status string shown in the UI.
    """
    global vectordb
    temp_dir = tempfile.mkdtemp()
    try:
        all_docs = []
        for file in files:
            # BUGFIX: gradio file objects expose an *absolute* path in .name;
            # os.path.join(temp_dir, <absolute path>) silently discards
            # temp_dir. Keep only the base name so the copy lands inside the
            # temp directory.
            file_path = os.path.join(temp_dir, os.path.basename(file.name))
            with open(file_path, "wb") as f:
                # assumes gradio hands back a binary-readable object — TODO
                # confirm against the installed gradio version.
                f.write(file.read())
            all_docs.extend(load_document(file_path))
        vectordb = train_from_documents(all_docs)
    finally:
        # BUGFIX: previously the temp dir leaked whenever loading or indexing
        # raised; always clean up.
        shutil.rmtree(temp_dir, ignore_errors=True)
    return "✅ النظام جاهز للإجابة على أسئلتك!"
def answer_question(question):
    """Gradio callback: answer *question* from the indexed documents.

    Retrieves the 5 most similar chunks from the FAISS index, stuffs them
    into an Arabic prompt, and generates an answer with the local seq2seq
    model.

    Returns the generated answer, or a warning string when no documents have
    been indexed yet.
    """
    if vectordb is None:
        return "⚠️ الرجاء رفع الملفات أولاً."
    # BUGFIX: the original built RetrievalQA.from_chain_type(llm=None, ...),
    # which raises during chain validation (an LLM is required) and whose
    # result was never used for answering anyway. Query the retriever
    # directly instead.
    retriever = vectordb.as_retriever(search_kwargs={"k": 5})
    relevant_docs = retriever.get_relevant_documents(question)
    context = "\n".join(doc.page_content for doc in relevant_docs)
    prompt = f"أجب بالعربية فقط بناءً على السياق التالي:\n{context}\nالسؤال: {question}"
    inputs = qa_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,  # cap the prompt at the model's usable window
        max_length=1024,
    ).to(device)
    with torch.no_grad():
        outputs = qa_model.generate(**inputs, max_length=300)
    return qa_tokenizer.decode(outputs[0], skip_special_tokens=True)
# --- UI -----------------------------------------------------------------------
with gr.Blocks(title="محاكاة دماغ المؤلف") as demo:
    with gr.Row():
        # Left column: document upload / indexing controls.
        with gr.Column():
            gr.Markdown("## 📚 ارفع كتبك هنا")
            file_uploader = gr.File(
                file_types=[".pdf", ".doc", ".docx"],
                file_count="multiple",
            )
            upload_button = gr.Button("🚀 ابدأ التدريب")
            training_status = gr.Textbox(label="حالة التدريب", interactive=False)
        # Right column: question answering.
        with gr.Column():
            gr.Markdown("## ❓ اطرح سؤالك")
            question_input = gr.Textbox(
                label="سؤالك",
                placeholder="اكتب سؤالك هنا...",
            )
            ask_button = gr.Button("✉️ أرسل السؤال!")
            answer_output = gr.Textbox(label="الإجابة", interactive=False)

    # Wire the buttons to the module-level callbacks.
    upload_button.click(upload_files, inputs=[file_uploader], outputs=[training_status])
    ask_button.click(answer_question, inputs=[question_input], outputs=[answer_output])

demo.launch(share=True)