ramysaidagieb committed · verified
Commit 4141758 · 1 Parent(s): 4cdd12b

Upload 5 files
Files changed (5):
  1. .gitignore +8 -0
  2. README.md +23 -0
  3. app.py +122 -0
  4. requirements.txt +9 -0
  5. space.yaml +10 -0
.gitignore ADDED
@@ -0,0 +1,8 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .env
+ tmp/
+ *.tmp
+ *.log
README.md ADDED
@@ -0,0 +1,23 @@
+ # Arabic Book Brain AI - Hugging Face Space
+
+ ## 📚 Description (English)
+
+ Upload Arabic books (.pdf, .docx, .doc), train a system to understand the books' ideas, and answer Arabic questions based strictly on the uploaded content.
+
+ ## 📚 الوصف (بالعربية)
+
+ رفع كتب باللغة العربية (بصيغة PDF أو DOCX أو DOC)، تدريب نظام ذكي لفهم أفكار الكتب، ثم الإجابة عن الأسئلة باللغة العربية بناءً فقط على المحتوى.
+
+ ---
+
+ ## 🛠 Installation
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ## 🧠 Start App
+
+ ```bash
+ python app.py
+ ```
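
Once the app is running, it can also be queried programmatically. Below is a minimal sketch using `gradio_client` (installed as a dependency of `gradio`); the local URL and the `/answer_question` endpoint name are assumptions based on Gradio's default of naming endpoints after the handler function in `app.py`:

```python
from gradio_client import Client

# Assumes `python app.py` is serving on Gradio's default local port.
client = Client("http://127.0.0.1:7860")

# Gradio names the endpoint after the handler function by default,
# so ask_button.click(answer_question, ...) exposes /answer_question.
answer = client.predict(
    "ما الفكرة الرئيسية للكتاب؟",  # "What is the main idea of the book?"
    api_name="/answer_question",
)
print(answer)
```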
app.py ADDED
@@ -0,0 +1,122 @@
+ import gradio as gr
+ import re
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import RetrievalQA
+ from langchain.llms import HuggingFacePipeline
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ from pdfminer.high_level import extract_text as extract_pdf_text
+ import docx
+
+ uploaded_texts = []
+ vector_store = None
+ qa_chain = None
+
+ # Arabic-capable encoder used for the retrieval embeddings.
+ embedding_model_name = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
+ embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
+
+ # Small multilingual seq2seq model used to generate answers
+ # (mT5-small, published on the Hub as google/mt5-small).
+ model_name = "google/mt5-small"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+ pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
+ llm = HuggingFacePipeline(pipeline=pipe)
+
+ # Arabic prompt: "You are an intelligent system that answers based only on the
+ # information extracted from the books. Do not use any external information."
+ # The {context} variable lets RetrievalQA inject the retrieved chunks.
+ ARABIC_PROMPT_TEMPLATE = """
+ أنت نظام ذكي يجيب بناءً فقط على المعلومات المستخرجة من الكتب.
+ لا تستخدم أي معلومات خارجية.
+ السياق: {context}
+ السؤال: {question}
+ الإجابة:
+ """
+ ARABIC_PROMPT = PromptTemplate(
+     template=ARABIC_PROMPT_TEMPLATE, input_variables=["context", "question"]
+ )
+
+ def extract_text_from_file(file_path):
+     if file_path.endswith(".pdf"):
+         return extract_pdf_text(file_path)
+     elif file_path.endswith((".docx", ".doc")):
+         # Note: python-docx parses only the OOXML (.docx) format; legacy
+         # binary .doc files will fail to open.
+         doc = docx.Document(file_path)
+         return "\n".join(para.text for para in doc.paragraphs)
+     else:
+         raise ValueError("Unsupported file format")
+
+ def arabic_split_text(text):
+     # NLTK's punkt tokenizer ships no Arabic model, so split on Arabic and
+     # Latin sentence-ending punctuation, then pack sentences into ~500-char chunks.
+     sentences = [s.strip() for s in re.split(r"(?<=[.!?؟…])\s+", text) if s.strip()]
+     chunks = []
+     chunk = ""
+     for sentence in sentences:
+         if len(chunk) + len(sentence) <= 500:
+             chunk += " " + sentence
+         else:
+             chunks.append(chunk.strip())
+             chunk = sentence
+     if chunk:
+         chunks.append(chunk.strip())
+     return chunks
+
+ def train_from_texts(texts):
+     global vector_store, qa_chain
+
+     all_chunks = []
+     for text in texts:
+         all_chunks.extend(arabic_split_text(text))
+
+     # FAISS.from_texts embeds the chunks and builds the index and docstore in one step.
+     vector_store = FAISS.from_texts(all_chunks, embeddings)
+
+     retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 10})
+     qa_chain = RetrievalQA.from_chain_type(
+         llm=llm, retriever=retriever, chain_type_kwargs={"prompt": ARABIC_PROMPT}
+     )
+
+ def upload_book(file, progress=gr.Progress()):
+     # gr.File already saves the upload to disk; depending on the Gradio version
+     # the handler receives either a path string or an object with a .name path.
+     file_path = file if isinstance(file, str) else file.name
+
+     progress(0.2, desc="تحميل الملف...")  # "Loading the file..."
+     extracted_text = extract_text_from_file(file_path)
+     uploaded_texts.append(extracted_text)
+     progress(0.5, desc="معالجة النص...")  # "Processing the text..."
+
+     train_from_texts(uploaded_texts)
+     progress(1.0, desc="اكتمل التدريب!")  # "Training complete!"
+     return "النظام جاهز للإجابة على أسئلتك"  # "The system is ready to answer your questions"
+
+ def answer_question(user_question):
+     if qa_chain is None:
+         return "الرجاء رفع كتاب أولاً."  # "Please upload a book first."
+     return qa_chain.run(user_question)
+
+ with gr.Blocks() as demo:
+     with gr.Tab("تحميل الكتب"):  # "Upload books"
+         upload_button = gr.File(label="ارفع كتابك (.pdf .docx .doc)", file_types=[".pdf", ".docx", ".doc"])
+         upload_output = gr.Textbox(label="حالة النظام")  # "System status"
+         upload_button.upload(upload_book, inputs=upload_button, outputs=upload_output)
+
+     with gr.Tab("اسأل الكتاب"):  # "Ask the book"
+         question = gr.Textbox(label="اكتب سؤالك بالعربية")  # "Write your question in Arabic"
+         answer = gr.Textbox(label="الإجابة")  # "Answer"
+         ask_button = gr.Button("إرسال السؤال")  # "Send question"
+         ask_button.click(answer_question, inputs=question, outputs=answer)
+
+ demo.launch(share=True)
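
For reference, the indexing path that `train_from_texts` builds can be exercised in isolation. A minimal sketch; the sample chunks and the query are illustrative stand-ins for real book text, not content from the app:

```python
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="CAMeL-Lab/bert-base-arabic-camelbert-mix")

# Toy chunks standing in for the output of arabic_split_text().
chunks = [
    "الكتاب يتناول تاريخ العلوم عند العرب.",  # "The book covers the history of science among the Arabs."
    "الفصل الثاني يناقش حركة الترجمة في العصر العباسي.",  # "Chapter two discusses the Abbasid-era translation movement."
]

# Embed the chunks, build the FAISS index, and run one similarity lookup.
store = FAISS.from_texts(chunks, embeddings)
docs = store.similarity_search("ما موضوع الفصل الثاني؟", k=1)  # "What is chapter two about?"
print(docs[0].page_content)
```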
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio
+ transformers
+ sentence-transformers
+ langchain
+ faiss-cpu
+ torch
+ pdfminer.six
+ python-docx
+ nltk
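
All nine dependencies are unpinned, so a rebuild of the Space can pick up breaking upstream releases (the `langchain.embeddings` import paths used in app.py are the pre-0.1 ones, for example). One conventional way to freeze a known-good environment once it works locally:

```bash
pip freeze > requirements.txt
```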
space.yaml ADDED
@@ -0,0 +1,10 @@
+ # Space metadata
+ title: Arabic Book Brain AI
+ emoji: 📚
+ colorFrom: purple
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 4.14.0
+ app_file: app.py
+ python_version: "3.10"
+ license: apache-2.0
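
One caveat on this file: Hugging Face Spaces reads this metadata from a YAML front-matter block at the top of README.md, not from a standalone space.yaml. A sketch of the equivalent front matter (same keys and values, just relocated; note python_version is quoted so YAML keeps it as the string "3.10" rather than the number 3.1):

```yaml
---
title: Arabic Book Brain AI
emoji: 📚
colorFrom: purple
colorTo: indigo
sdk: gradio
sdk_version: 4.14.0
app_file: app.py
python_version: "3.10"
license: apache-2.0
---
```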