ramysaidagieb committed · verified
Commit b278fd0 · 1 Parent(s): 0cb59dc

Upload 4 files

Files changed (4)
  1. .gitignore +2 -0
  2. README.md +19 -0
  3. app.py +89 -0
  4. requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ *.pyc
README.md ADDED
@@ -0,0 +1,19 @@
+
+ ---
+ title: Arabic Author Brain QA
+ emoji: "📚"
+ colorFrom: blue
+ colorTo: blue
+ sdk: gradio
+ sdk_version: "4.25.0"
+ app_file: app.py
+ pinned: false
+ ---
+
+ # Arabic Author Brain QA
+
+ Upload Arabic books in PDF, DOCX, or DOC format, and build an interactive Q&A bot that understands the ideas inside the books.
+
+ ## بالعربية
+ ارفع كتبك بالعربية ودع النظام يتعلم منها ليجيب عن أسئلتك بناءً على الأفكار، وليس مجرد تكرار النصوص!
+
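As a usage note (not part of the committed files): once the Space is running, the Q&A tab can also be driven programmatically with gradio_client. This is a hypothetical sketch: the Space id is a placeholder, and fn_index=1 assumes the question handler is the second event app.py registers.

```python
# Hypothetical client-side sketch; requires `pip install gradio_client`.
from gradio_client import Client

# Placeholder Space id -- replace with the real "user/space-name".
client = Client("ramysaidagieb/arabic-author-brain-qa")

# app.py wires train_button.click first and ask_button.click second,
# so fn_index=1 is assumed to reach generate_answer(question) -> answer.
answer = client.predict("ما الفكرة الرئيسية للكتاب؟", fn_index=1)
print(answer)
```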
app.py ADDED
@@ -0,0 +1,89 @@
+
+ import gradio as gr
+ import pdfminer.high_level
+ import docx
+ import faiss
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from sentence_transformers import SentenceTransformer
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Load the models
+ embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', device=device)
+ qa_model_name = "aubmindlab/aragpt2-base"
+ qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
+ qa_model = AutoModelForCausalLM.from_pretrained(qa_model_name).to(device)
+
+ # Set up the (in-memory) vector database
+ index = None
+ docs = []
+
+ def extract_text(file_path):
+     if file_path.endswith('.pdf'):
+         with open(file_path, 'rb') as f:
+             return pdfminer.high_level.extract_text(f)
+     elif file_path.endswith('.docx') or file_path.endswith('.doc'):
+         # python-docx reads .docx; legacy binary .doc files may fail to open
+         doc = docx.Document(file_path)
+         return "\n".join([para.text for para in doc.paragraphs])
+     else:
+         raise ValueError("صيغة ملف غير مدعومة")
+
+ def process_files(files):
+     global index, docs
+     all_text = ""
+     for file in files:
+         text = extract_text(file.name)
+         all_text += text + "\n"
+
+     # Split the text into chunks
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+     texts = text_splitter.split_text(all_text)
+
+     # Create the embeddings and build the FAISS index
+     embeddings = embedding_model.encode(texts, show_progress_bar=True, convert_to_tensor=True)
+     index = faiss.IndexFlatL2(embeddings.shape[1])
+     index.add(embeddings.cpu().numpy())
+     docs = texts
+     return "✅ تم تحميل الكتب واستيعاب الأفكار! النظام جاهز للإجابة."
+
+ def generate_answer(question):
+     global index, docs
+     if index is None:
+         return "❌ الرجاء رفع الكتب أولاً."
+
+     q_emb = embedding_model.encode([question])
+     D, I = index.search(q_emb, k=3)
+     context = "\n".join([docs[i] for i in I[0]])
+
+     # Prepare the input for the model
+     prompt = f"سؤال: {question}\n\nمحتوى ذو صلة:\n{context}\n\nالإجابة:"
+     inputs = qa_tokenizer(prompt, return_tensors='pt').to(device)
+     outputs = qa_model.generate(**inputs, max_new_tokens=300, pad_token_id=qa_tokenizer.eos_token_id)
+     answer = qa_tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return answer.split("الإجابة:")[-1].strip()
+
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+     # 🚀 أهلاً بك في محاكاة عقل المؤلف
+     ارفع كتبك واستعد للانطلاق في رحلة استكشاف الأفكار العميقة!
+     """)
+     with gr.Tab("📚 رفع الكتب للتدريب"):
+         upload = gr.File(file_types=['.pdf', '.docx', '.doc'], file_count='multiple')
+         train_button = gr.Button("🚀 ابدأ التدريب!")
+         train_output = gr.Textbox(label="🔵 حالة التدريب", interactive=False)
+     with gr.Tab("❓ اسأل الكتاب"):
+         question = gr.Textbox(label="اكتب سؤالك هنا...")
+         answer = gr.Textbox(label="الإجابة", interactive=False)
+         ask_button = gr.Button("✉️ أرسل السؤال!")
+
+     train_button.click(process_files, inputs=[upload], outputs=[train_output])
+     ask_button.click(generate_answer, inputs=[question], outputs=[answer])
+
+ demo.launch()
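For readers skimming the diff: the retrieval core of process_files and generate_answer reduces to embedding text chunks with the multilingual SentenceTransformer, adding them to a flat L2 FAISS index, and searching with the embedded question. A minimal standalone sketch under those assumptions (the sample chunks and question are placeholders):

```python
# Minimal retrieval sketch mirroring app.py's index-building and search; CPU-only.
import faiss
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

chunks = [
    "placeholder chunk about the author's main idea",
    "placeholder chunk about the method",
    "placeholder chunk with the conclusion",
]
emb = model.encode(chunks)                        # float32 array, shape (3, 384)

index = faiss.IndexFlatL2(emb.shape[1])           # exact L2 search, no training step
index.add(emb)

q_emb = model.encode(["what is the main idea?"])  # placeholder question
distances, ids = index.search(q_emb, 3)           # indices of the 3 nearest chunks
context = "\n".join(chunks[i] for i in ids[0])
print(context)
```

In app.py the retrieved context is then pasted into a prompt that aubmindlab/aragpt2-base completes.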
requirements.txt ADDED
@@ -0,0 +1,9 @@
+
+ gradio
+ transformers
+ sentence-transformers
+ pdfminer.six
+ python-docx
+ faiss-cpu
+ langchain
+ torch