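# app.py — "Author Brain Simulation" (محاكاة دماغ المؤلف): a Gradio Space that
# indexes uploaded PDF/Word books into a FAISS vector store using Arabic BERT
# embeddings, then answers questions from the retrieved passages.
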
import gradio as gr
import os
import tempfile
import shutil
from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
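
# Model choices as in the original Space. CAMeL BERT supplies the Arabic
# embeddings for retrieval; note that mosaicml/mpt-7b-storywriter is an
# English, decoder-only LM, so Arabic generation quality is not guaranteed.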
EMBEDDING_MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
QA_MODEL_NAME = "mosaicml/mpt-7b-storywriter"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME, trust_remote_code=True)
# MPT is a causal (decoder-only) model, so it must be loaded with
# AutoModelForCausalLM rather than AutoModelForSeq2SeqLM.
qa_model = AutoModelForCausalLM.from_pretrained(QA_MODEL_NAME, trust_remote_code=True).to(device)
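
# Global FAISS index; built in upload_files() and queried in answer_question().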
vectordb = None

def load_document(file_path):
    """Load a PDF or Word document into LangChain Documents."""
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        loader = PyMuPDFLoader(file_path)
    elif ext in [".doc", ".docx"]:
        loader = UnstructuredWordDocumentLoader(file_path)
    else:
        raise ValueError("صيغة الملف غير مدعومة.")  # "Unsupported file format."
    return loader.load()

def train_from_documents(documents):
    """Split documents into overlapping chunks and index them in FAISS."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = splitter.split_documents(documents)
    vectorstore = FAISS.from_documents(texts, embedding_model)
    return vectorstore

def upload_files(files):
    global vectordb
    temp_dir = tempfile.mkdtemp()
    all_docs = []
    for file in files:
        # Gradio passes uploads as temp-file objects whose .name is a path on
        # disk (the exact object shape varies across Gradio versions), so copy
        # by path instead of calling .read() on the object.
        file_path = os.path.join(temp_dir, os.path.basename(file.name))
        shutil.copy(file.name, file_path)
        docs = load_document(file_path)
        all_docs.extend(docs)
    vectordb = train_from_documents(all_docs)
    shutil.rmtree(temp_dir)
    return "✅ النظام جاهز للإجابة على أسئلتك!"  # "The system is ready to answer your questions!"

def answer_question(question):
    if vectordb is None:
        return "⚠️ الرجاء رفع الملفات أولاً."  # "Please upload the files first."
    # Fetch the top-k relevant chunks directly from the retriever; a
    # RetrievalQA chain is unnecessary here (and fails with llm=None)
    # because generation is done manually below.
    retriever = vectordb.as_retriever(search_kwargs={"k": 5})
    relevant_docs = retriever.get_relevant_documents(question)
    context = "\n".join(doc.page_content for doc in relevant_docs)
    # Prompt (Arabic): "Answer in Arabic only, based on the following context:
    # <context> Question: <question>"
    inputs = qa_tokenizer(
        f"أجب بالعربية فقط بناءً على السياق التالي:\n{context}\nالسؤال: {question}",
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(device)
    with torch.no_grad():
        # max_new_tokens bounds only the continuation; with a causal LM,
        # max_length would also count the prompt tokens.
        outputs = qa_model.generate(**inputs, max_new_tokens=300)
    # A causal LM echoes the prompt, so decode only the newly generated tokens.
    answer = qa_tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    return answer
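
# --- Gradio UI: upload column on the left, Q&A column on the right ---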
with gr.Blocks(title="محاكاة دماغ المؤلف") as demo:  # "Author Brain Simulation"
    with gr.Row():
        with gr.Column():
            gr.Markdown("## 📚 ارفع كتبك هنا")  # "Upload your books here"
            file_uploader = gr.File(file_types=[".pdf", ".doc", ".docx"], file_count="multiple")
            upload_button = gr.Button("🚀 ابدأ التدريب")  # "Start training"
            training_status = gr.Textbox(label="حالة التدريب", interactive=False)  # "Training status"
        with gr.Column():
            gr.Markdown("## ❓ اطرح سؤالك")  # "Ask your question"
            question_input = gr.Textbox(label="سؤالك", placeholder="اكتب سؤالك هنا...")  # "Your question" / "Type your question here..."
            ask_button = gr.Button("✉️ أرسل السؤال!")  # "Send the question!"
            answer_output = gr.Textbox(label="الإجابة", interactive=False)  # "Answer"
    upload_button.click(upload_files, inputs=[file_uploader], outputs=[training_status])
    ask_button.click(answer_question, inputs=[question_input], outputs=[answer_output])

demo.launch(share=True)