ramysaidagieb committed
Commit a8ba66a · verified · 1 Parent(s): 4141758

Update app.py

Files changed (1):
  1. app.py +109 -114

app.py CHANGED
@@ -1,122 +1,117 @@
 import gradio as gr
 import os
-import tempfile
-import faiss
-import torch
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import FAISS
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.prompts import PromptTemplate
-from langchain.chains import RetrievalQA
-from langchain.llms import HuggingFacePipeline
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-from pdfminer.high_level import extract_text as extract_pdf_text
 import docx
-import nltk
-
-nltk.download('punkt')
-from nltk.tokenize import sent_tokenize
-
-uploaded_texts = []
-vector_store = None
-qa_chain = None
-
-embedding_model_name = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
-embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
-
-model_name = "csebuetnlp/mT5_small"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-
-pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
-llm = HuggingFacePipeline(pipeline=pipe)
-
-ARABIC_PROMPT_TEMPLATE = """
-أنت نظام ذكي يجيب بناءً فقط على المعلومات المستخرجة من الكتب.
-لا تستخدم أي معلومات خارجية.
-السؤال: {question}
-الإجابة:
-"""
-
-def format_arabic_prompt(question):
-    return ARABIC_PROMPT_TEMPLATE.format(question=question)
-
-def extract_text_from_file(file_path):
-    if file_path.endswith(".pdf"):
-        return extract_pdf_text(file_path)
-    elif file_path.endswith(".docx") or file_path.endswith(".doc"):
-        doc = docx.Document(file_path)
-        return "\n".join([para.text for para in doc.paragraphs])
-    else:
-        raise ValueError("Unsupported file format")
-
-def arabic_split_text(text):
-    sentences = sent_tokenize(text, language='arabic')
     chunks = []
-    chunk = ""
-    for sentence in sentences:
-        if len(chunk) + len(sentence) <= 500:
-            chunk += " " + sentence
-        else:
-            chunks.append(chunk.strip())
-            chunk = sentence
-    if chunk:
-        chunks.append(chunk.strip())
     return chunks

-def train_from_texts(texts):
-    global vector_store, qa_chain
-
-    splitter = RecursiveCharacterTextSplitter(
-        chunk_size=500,
-        chunk_overlap=100,
-        length_function=len,
-    )
-
-    all_chunks = []
-    for text in texts:
-        chunks = arabic_split_text(text)
-        all_chunks.extend(chunks)
-
-    vectors = embeddings.embed_documents(all_chunks)
-    dimension = len(vectors[0])
-    index = faiss.IndexFlatL2(dimension)
-    vector_store = FAISS(embedding_function=embeddings, index=index, documents=all_chunks)
-
-    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 10})
-    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
-
-def upload_book(file, progress=gr.Progress()):
-    with tempfile.NamedTemporaryFile(delete=False) as tmp:
-        tmp.write(file.read())
-        tmp_path = tmp.name
-
-    progress(0.2, desc="تحميل الملف...")
-    extracted_text = extract_text_from_file(tmp_path)
-    uploaded_texts.append(extracted_text)
-    progress(0.5, desc="معالجة النص...")
-
-    train_from_texts(uploaded_texts)
-    progress(1.0, desc="اكتمل التدريب!")
-    return "النظام جاهز للإجابة على أسئلتك"
-
-def answer_question(user_question):
-    if qa_chain is None:
-        return "الرجاء رفع كتاب أولاً."
-    prompt = format_arabic_prompt(user_question)
-    result = qa_chain.run(prompt)
-    return result

 with gr.Blocks() as demo:
-    with gr.Tab("تحميل الكتب"):
-        upload_button = gr.File(label="ارفع كتابك (.pdf .docx .doc)", file_types=[".pdf", ".docx", ".doc"])
-        upload_output = gr.Textbox(label="حالة النظام")
-        upload_button.upload(upload_book, inputs=upload_button, outputs=upload_output)
-
-    with gr.Tab("اسأل الكتاب"):
-        question = gr.Textbox(label="اكتب سؤالك بالعربية")
-        answer = gr.Textbox(label="الإجابة")
-        ask_button = gr.Button("إرسال السؤال")
-        ask_button.click(answer_question, inputs=question, outputs=answer)
-
-demo.launch(share=True)
+# app.py
+
 import gradio as gr
 import os
+import pdfminer.high_level
 import docx
+from sentence_transformers import SentenceTransformer
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+import faiss
+import tempfile
+
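+# Settings. The embedder is a multilingual MiniLM (its supported languages
+# include Arabic) and the generator is AraGPT2; the chunk sizes below are
+# counted in words, not tokens.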
+# ====== Settings ======
+EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+GENERATION_MODEL_NAME = "aubmindlab/aragpt2-small"
+CHUNK_SIZE = 500
+CHUNK_OVERLAP = 50
+TOP_K = 5
+
+# ====== Load Models ======
+embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
+gen_tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
+gen_model = AutoModelForCausalLM.from_pretrained(GENERATION_MODEL_NAME)
+
+# ====== Globals ======
+index = None
+chunks = []
+
+# ====== Helpers ======
+def extract_text_from_pdf(file_path):
+    with open(file_path, 'rb') as f:
+        return pdfminer.high_level.extract_text(f)
+
+def extract_text_from_docx(file_path):
+    doc = docx.Document(file_path)
+    return "\n".join([para.text for para in doc.paragraphs])
+
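+# Sliding-window chunking: windows of CHUNK_SIZE words that advance by
+# CHUNK_SIZE - CHUNK_OVERLAP words, so consecutive chunks share CHUNK_OVERLAP words.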
+def chunk_text(text):
+    words = text.split()
     chunks = []
+    for i in range(0, len(words), CHUNK_SIZE - CHUNK_OVERLAP):
+        chunk = " ".join(words[i:i+CHUNK_SIZE])
+        chunks.append(chunk)
     return chunks

+def build_vector_store(chunks):
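+    # Exact (brute-force) L2 index with one embedding per chunk;
+    # embedder.encode returns a float32 numpy array, which FAISS accepts as-is.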
+    vectors = embedder.encode(chunks)
+    dim = vectors.shape[1]
+    idx = faiss.IndexFlatL2(dim)
+    idx.add(vectors)
+    return idx, vectors
+
+def retrieve_relevant_chunks(question, idx, chunks, vectors):
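+    # Embed the question with the same model and take the TOP_K nearest chunks.
+    # idx.search returns (distances, indices); FAISS pads missing hits with -1
+    # when the index holds fewer than TOP_K vectors.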
+    q_vec = embedder.encode([question])
+    D, I = idx.search(q_vec, TOP_K)
+    return [chunks[i] for i in I[0] if 0 <= i < len(chunks)]
+
+def generate_answer(context_chunks, question):
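+    # The prompt is truncated to 1024 tokens. AraGPT2 follows GPT-2's
+    # 1024-position budget, so a prompt that fills the window plus
+    # max_new_tokens=100 can overflow it; a smaller max_length leaves headroom.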
+    context = " \n".join(context_chunks)
+    prompt = f"سؤال: {question}\nمحتوى ذو صلة: {context}\nجواب:"
+    inputs = gen_tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
+    outputs = gen_model.generate(**inputs, max_new_tokens=100)
+    answer = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
+    answer = answer.split("جواب:")[-1].strip()
+    return answer
+
+# ====== Gradio Functions ======
+def upload_and_train(files):
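+    # Each upload is copied to a real temp file so pdfminer/python-docx can
+    # read it from disk. (This assumes Gradio passes file-like objects exposing
+    # .read() and .name; newer Gradio versions may hand over plain file paths,
+    # in which case the upload can be parsed directly without the copy.)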
+    global index, chunks
+
+    all_text = ""
+    for file in files:
+        suffix = os.path.splitext(file.name)[-1].lower()
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+            tmp.write(file.read())
+            tmp_path = tmp.name
+        if suffix == ".pdf":
+            all_text += extract_text_from_pdf(tmp_path) + "\n"
+        elif suffix in [".docx", ".doc"]:
+            all_text += extract_text_from_docx(tmp_path) + "\n"
+        os.unlink(tmp_path)
+
+    chunks = chunk_text(all_text)
+    index, vectors = build_vector_store(chunks)
+
+    return "✅ النظام جاهز للإجابة على أسئلتك"
+
+def ask_question(user_question):
+    if index is None:
+        return "الرجاء رفع الكتب أولاً وتدريب النظام."
+    rel_chunks = retrieve_relevant_chunks(user_question, index, chunks, None)
+    answer = generate_answer(rel_chunks, user_question)
+    return answer
+
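+# Components are created up front and attached to the layout later with
+# .render() inside the Blocks context (a supported Gradio pattern).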
+# ====== Gradio Interface ======
+upload = gr.File(file_types=[".pdf", ".doc", ".docx"], file_count="multiple")
+train_btn = gr.Button("ابدأ التدريب")
+train_output = gr.Textbox()
+question_input = gr.Textbox(placeholder="اكتب سؤالك هنا باللغة العربية")
+answer_output = gr.Textbox()
+ask_btn = gr.Button("أرسل السؤال")

 with gr.Blocks() as demo:
+    gr.Markdown("# 🧠 محاكاة دماغ المؤلف - نظام ذكي للإجابة على الأسئلة من كتبك بالعربية")
+    upload.render()
+    train_btn.render()
+    train_output.render()
+    question_input.render()
+    ask_btn.render()
+    answer_output.render()
+
+    train_btn.click(upload_and_train, inputs=[upload], outputs=[train_output])
+    ask_btn.click(ask_question, inputs=[question_input], outputs=[answer_output])
+
+# Launch
+if __name__ == "__main__":
+    demo.launch()