# project1/app.py
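"""Arabic retrieval-augmented QA demo ("author brain simulation").

Users upload PDF/Word books, which are chunked and indexed in FAISS with
CAMeLBERT embeddings; questions are then answered by a generative model
conditioned on the top retrieved chunks.
"""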
import os
import shutil
import tempfile

import gradio as gr
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredWordDocumentLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoTokenizer
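# The file pins no dependencies; inferred from the imports (an assumption, not
# stated in the source), a requirements.txt would likely need: gradio, torch,
# transformers, langchain, langchain-community, sentence-transformers (backs
# HuggingFaceEmbeddings), faiss-cpu, pymupdf, unstructured plus python-docx
# (Word loader), and einops (pulled in by MPT's trust_remote_code modeling code).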
# Arabic-capable BERT checkpoint used to embed chunks for retrieval.
EMBEDDING_MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
# Generative model that composes answers from the retrieved context.
QA_MODEL_NAME = "mosaicml/mpt-7b-storywriter"

embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME, trust_remote_code=True)
# MPT-7B is a decoder-only (causal) architecture, so it must be loaded with
# AutoModelForCausalLM rather than AutoModelForSeq2SeqLM.
qa_model = AutoModelForCausalLM.from_pretrained(QA_MODEL_NAME, trust_remote_code=True).to(device)

# Global FAISS index, populated once documents are uploaded.
vectordb = None
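# Footprint note: a 7B-parameter model needs on the order of 26 GB in float32
# (roughly half that in bfloat16/float16 via the torch_dtype argument to
# from_pretrained), so this load is unlikely to fit on a basic CPU instance.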
def load_document(file_path):
    """Load a single PDF or Word file into a list of LangChain documents."""
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        loader = PyMuPDFLoader(file_path)
    elif ext in (".doc", ".docx"):
        loader = UnstructuredWordDocumentLoader(file_path)
    else:
        raise ValueError("صيغة الملف غير مدعومة.")  # "Unsupported file format."
    return loader.load()
def train_from_documents(documents):
    """Split the documents into overlapping chunks and index them in FAISS."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = splitter.split_documents(documents)
    return FAISS.from_documents(texts, embedding_model)
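# Optional extension (a sketch, not something this app does): LangChain's FAISS
# wrapper can persist the index between restarts instead of rebuilding it on
# every upload, e.g.
#   vectordb.save_local("faiss_index")
#   vectordb = FAISS.load_local("faiss_index", embedding_model,
#                               allow_dangerous_deserialization=True)
# (the last flag is required by recent langchain-community releases).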
def upload_files(files):
    """Copy the uploads into a temp dir, load them, and (re)build the index."""
    global vectordb
    temp_dir = tempfile.mkdtemp()
    all_docs = []
    try:
        for file in files:
            # Depending on the Gradio version, an upload arrives either as a
            # plain path string or as a temp-file object with a .name path.
            src = file if isinstance(file, str) else file.name
            # Join on the basename: src is absolute, and os.path.join discards
            # temp_dir when its second argument is an absolute path.
            file_path = os.path.join(temp_dir, os.path.basename(src))
            shutil.copy(src, file_path)
            all_docs.extend(load_document(file_path))
        vectordb = train_from_documents(all_docs)
    finally:
        shutil.rmtree(temp_dir)
    return "✅ النظام جاهز للإجابة على أسئلتك!"  # "The system is ready to answer your questions!"
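# Design note: because vectordb is module-global, every visitor shares one
# index and any new upload replaces it for everyone; per-session state
# (e.g. gr.State) would be the usual fix if that matters for this deployment.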
def answer_question(question):
    """Retrieve the most relevant chunks and generate an Arabic answer."""
    if vectordb is None:
        return "⚠️ الرجاء رفع الملفات أولاً."  # "Please upload the files first."
    # Query the retriever directly; generation is done manually below, so no
    # LLM-bearing RetrievalQA chain is needed.
    retriever = vectordb.as_retriever(search_kwargs={"k": 5})
    relevant_docs = retriever.get_relevant_documents(question)
    context = "\n".join(doc.page_content for doc in relevant_docs)
    # Prompt (Arabic): "Answer in Arabic only, based on the following context:
    # {context} / Question: {question}"
    inputs = qa_tokenizer(
        f"أجب بالعربية فقط بناءً على السياق التالي:\n{context}\nالسؤال: {question}",
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(device)
    with torch.no_grad():
        # max_new_tokens budgets the answer itself rather than prompt + answer.
        outputs = qa_model.generate(**inputs, max_new_tokens=300)
    # A causal LM echoes its prompt, so decode only the newly generated tokens.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return qa_tokenizer.decode(new_tokens, skip_special_tokens=True)
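# Caveat: truncation at 1024 tokens cuts from the end, and the question sits
# after the context, so a long context can truncate the question away; putting
# the question first (or trimming the context) would avoid that.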
with gr.Blocks(title="محاكاة دماغ المؤلف") as demo:  # Title: "Simulating the Author's Brain"
    with gr.Row():
        with gr.Column():
            gr.Markdown("## 📚 ارفع كتبك هنا")  # "Upload your books here"
            file_uploader = gr.File(file_types=[".pdf", ".doc", ".docx"], file_count="multiple")
            upload_button = gr.Button("🚀 ابدأ التدريب")  # "Start training"
            training_status = gr.Textbox(label="حالة التدريب", interactive=False)  # "Training status"
        with gr.Column():
            gr.Markdown("## ❓ اطرح سؤالك")  # "Ask your question"
            question_input = gr.Textbox(label="سؤالك", placeholder="اكتب سؤالك هنا...")  # "Your question" / "Type your question here..."
            ask_button = gr.Button("✉️ أرسل السؤال!")  # "Send the question!"
            answer_output = gr.Textbox(label="الإجابة", interactive=False)  # "Answer"

    # Wire the buttons to their handlers (must be inside the Blocks context).
    upload_button.click(upload_files, inputs=[file_uploader], outputs=[training_status])
    ask_button.click(answer_question, inputs=[question_input], outputs=[answer_output])

demo.launch(share=True)
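# Note: on Hugging Face Spaces, share=True is ignored (the Space is already
# served publicly); the flag only matters for local runs.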