ramysaidagieb committed on
Commit
a1f3bda
·
verified ·
1 Parent(s): f396372

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -85
app.py CHANGED
@@ -1,98 +1,45 @@
 
1
  import gradio as gr
2
- import os
3
- import tempfile
4
- import shutil
5
- from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredWordDocumentLoader
6
- from langchain.text_splitter import RecursiveCharacterTextSplitter
7
- from langchain_community.embeddings import HuggingFaceEmbeddings
8
- from langchain_community.vectorstores import FAISS
9
- from langchain.chains import RetrievalQA
10
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
11
  import torch
12
 
13
- EMBEDDING_MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
14
- QA_MODEL_NAME = "mosaicml/mpt-7b-storywriter"
15
-
16
- embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
17
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
- qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME, trust_remote_code=True)
19
- qa_model = AutoModelForSeq2SeqLM.from_pretrained(QA_MODEL_NAME, trust_remote_code=True).to(device)
20
-
21
- vectordb = None
22
-
23
- def load_document(file_path):
24
- ext = os.path.splitext(file_path)[1].lower()
25
- if ext == ".pdf":
26
- loader = PyMuPDFLoader(file_path)
27
- elif ext in [".doc", ".docx"]:
28
- loader = UnstructuredWordDocumentLoader(file_path)
29
- else:
30
- raise ValueError("صيغة الملف غير مدعومة.")
31
- return loader.load()
32
-
33
- def train_from_documents(documents):
34
- splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
35
- texts = splitter.split_documents(documents)
36
- vectorstore = FAISS.from_documents(texts, embedding_model)
37
- return vectorstore
38
-
39
- def upload_files(files):
40
- global vectordb
41
- temp_dir = tempfile.mkdtemp()
42
- all_docs = []
43
-
44
- for file in files:
45
- file_path = os.path.join(temp_dir, file.name)
46
- with open(file_path, "wb") as f:
47
- f.write(file.read())
48
- docs = load_document(file_path)
49
- all_docs.extend(docs)
50
-
51
- vectordb = train_from_documents(all_docs)
52
- shutil.rmtree(temp_dir)
53
- return "✅ النظام جاهز للإجابة على أسئلتك!"
54
 
55
- def answer_question(question):
56
- if vectordb is None:
57
- return "⚠️ الرجاء رفع الملفات أولاً."
58
-
59
- retriever = vectordb.as_retriever(search_kwargs={"k": 5})
60
- qa_chain = RetrievalQA.from_chain_type(
61
- llm=None,
62
- retriever=retriever,
63
- return_source_documents=True
64
- )
65
 
66
- relevant_docs = qa_chain.retriever.get_relevant_documents(question)
67
- context = "\n".join(doc.page_content for doc in relevant_docs)
 
 
 
 
 
 
 
 
 
68
 
69
- inputs = qa_tokenizer(
70
- f"أجب بالعربية فقط بناءً على السياق التالي:\n{context}\nالسؤال: {question}",
71
- return_tensors="pt",
72
- truncation=True,
73
- max_length=1024
74
- ).to(device)
75
 
76
- with torch.no_grad():
77
- outputs = qa_model.generate(**inputs, max_length=300)
78
- answer = qa_tokenizer.decode(outputs[0], skip_special_tokens=True)
79
- return answer
80
 
81
- with gr.Blocks(title="محاكاة دماغ المؤلف") as demo:
82
  with gr.Row():
83
- with gr.Column():
84
- gr.Markdown("## 📚 ارفع كتبك هنا")
85
- file_uploader = gr.File(file_types=[".pdf", ".doc", ".docx"], file_count="multiple")
86
- upload_button = gr.Button("🚀 ابدأ التدريب")
87
- training_status = gr.Textbox(label="حالة التدريب", interactive=False)
88
-
89
- with gr.Column():
90
- gr.Markdown("## ❓ اطرح سؤالك")
91
- question_input = gr.Textbox(label="سؤالك", placeholder="اكتب سؤالك هنا...")
92
- ask_button = gr.Button("✉️ أرسل السؤال!")
93
- answer_output = gr.Textbox(label="الإجابة", interactive=False)
94
 
95
- upload_button.click(upload_files, inputs=[file_uploader], outputs=[training_status])
96
- ask_button.click(answer_question, inputs=[question_input], outputs=[answer_output])
97
 
98
- demo.launch(share=True)
 
1
+ # app.py
2
  import gradio as gr
 
 
 
 
 
 
 
 
3
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
  import torch
5
 
6
# Load the model and tokenizer (translated from Arabic: "تحميل النموذج والمحول").
# Presumably an mT5-small model fine-tuned for Arabic QA — inferred from the
# checkpoint name only; verify on the model card.
model_name = "csebuetnlp/mT5_small_arabic_qa"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Prefer GPU when available; inputs built in generate_answer are moved to the
# same device so tensors and weights never mix devices.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
def generate_answer(question, context):
    """Generate an Arabic answer for *question* grounded in *context*.

    Builds the "سؤال: ... سياق: ..." prompt the QA model was trained on,
    runs seq2seq generation, and decodes the first (only) sequence.

    Args:
        question: The user's question (Arabic expected, not enforced).
        context: The source passage the answer should be drawn from.

    Returns:
        The decoded answer string with special tokens stripped.
    """
    input_text = f"سؤال: {question} سياق: {context}"
    # truncation=True caps the prompt at the model's maximum input length;
    # without it, a long pasted passage overflows the model's positional
    # limit and generation fails (or silently degrades) at runtime.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)
    output = model.generate(**inputs, max_length=256)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer
 
 
 
 
19
 
20
def ask_question(question, context):
    """Validate the question, then delegate to generate_answer.

    Returns a user-facing status string: an error prompt when the question
    is blank or whitespace-only, otherwise the generated answer prefixed
    with a success marker.
    """
    # Guard clause: refuse to run the model on an empty question.
    if question.strip() == "":
        return "❌ الرجاء كتابة سؤال."
    return "✅ الإجابة: " + generate_answer(question, context)
25
+
26
# --- Gradio UI ------------------------------------------------------------
# Two text inputs (question + pasted book passage), a submit button, and a
# read-back textbox wired to the QA pipeline defined above.
with gr.Blocks(title="سؤال وجواب من الكتب") as demo:
    gr.Markdown("""
    # 📚 اسأل كتبك!
    اطرح أي سؤال وسنبحث لك عن الجواب من محتوى الكتب بدقة وفهم!
    """)

    with gr.Row():
        question_box = gr.Textbox(
            label="✍️ اكتب سؤالك هنا:",
            placeholder="مثال: ما معنى الذكاء الاصطناعي؟",
        )

    with gr.Row():
        context_box = gr.Textbox(
            label="📖 اكتب أو الصق نص من كتابك هنا:",
            placeholder="انسخ فقرة أو أكثر من الكتاب...",
        )

    with gr.Row():
        submit_btn = gr.Button("🔍 احصل على الإجابة")

    answer_box = gr.Textbox(label="💬 الإجابة:")

    # Button click runs ask_question(question, context) -> answer textbox.
    submit_btn.click(
        fn=ask_question,
        inputs=[question_box, context_box],
        outputs=answer_box,
    )

demo.launch()