ramysaidagieb committed on
Commit
4056d0b
·
verified ·
1 Parent(s): 23e3081

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -117
app.py DELETED
@@ -1,117 +0,0 @@
1
- # app.py
2
-
3
- import gradio as gr
4
- import os
5
- import pdfminer.high_level
6
- import docx
7
- from sentence_transformers import SentenceTransformer
8
- from transformers import AutoModelForCausalLM, AutoTokenizer
9
- import torch
10
- import faiss
11
- import tempfile
12
-
13
# ====== Settings ======
# Name of the sentence-embedding model used to vectorize both the document
# chunks (build_vector_store) and the user's question (retrieve_relevant_chunks).
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Name of the causal language model used by generate_answer.
GENERATION_MODEL_NAME = "aubmindlab/aragpt2-small"
# Chunk length, measured in whitespace-separated words (see chunk_text).
CHUNK_SIZE = 500
# Number of words shared between two consecutive chunks.
CHUNK_OVERLAP = 50
# Number of nearest chunks requested from the index for each question.
TOP_K = 5

# ====== Load Models ======
# These objects are created when the module is imported, so importing this
# file loads all three pretrained artifacts before the UI is built.
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
gen_tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
gen_model = AutoModelForCausalLM.from_pretrained(GENERATION_MODEL_NAME)

# ====== Globals ======
# Vector index over `chunks`; stays None until upload_and_train has run,
# which is how ask_question detects an untrained system.
index = None
# Text chunks in the same order as the vectors added to `index`.
chunks = []
28
-
29
- # ====== Helpers ======
30
def extract_text_from_pdf(file_path):
    """Return the text that pdfminer extracts from the PDF at *file_path*.

    The file is opened in binary mode and handed to
    ``pdfminer.high_level.extract_text`` as a stream; the handle is closed
    before the text is returned.
    """
    with open(file_path, mode="rb") as pdf_stream:
        extracted = pdfminer.high_level.extract_text(pdf_stream)
    return extracted
33
-
34
def extract_text_from_docx(file_path):
    """Return the paragraphs of the Word document at *file_path* as one string.

    Paragraph texts are joined with a single newline, in document order.
    """
    document = docx.Document(file_path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(paragraph_texts)
37
-
38
def chunk_text(text, chunk_size=None, chunk_overlap=None):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. Defaults to the
            module-level ``CHUNK_SIZE``.
        chunk_overlap: Number of words repeated at the start of the next
            chunk. Defaults to the module-level ``CHUNK_OVERLAP``.

    Returns:
        A list of chunk strings (words re-joined with single spaces); an
        empty list when *text* contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
            is not in ``[0, chunk_size)``.
    """
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
    if size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < size:
        # A non-positive step would make range() fail or never advance.
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    words = text.split()
    step = size - overlap
    pieces = []
    for start in range(0, len(words), step):
        pieces.append(" ".join(words[start:start + size]))
        # Once a window reaches the last word, any further window would be
        # a strict suffix of this one and add only duplicate content.
        if start + size >= len(words):
            break
    return pieces
45
-
46
def build_vector_store(chunks):
    """Embed *chunks* and load the embeddings into a new flat L2 FAISS index.

    Returns:
        A ``(index, vectors)`` tuple: the populated ``faiss.IndexFlatL2`` and
        the embedding matrix produced by ``embedder.encode``.
    """
    embeddings = embedder.encode(chunks)
    # The index dimensionality is taken from the second axis of the matrix.
    store = faiss.IndexFlatL2(embeddings.shape[1])
    store.add(embeddings)
    return store, embeddings
52
-
53
def retrieve_relevant_chunks(question, idx, chunks, vectors):
    """Return the stored chunks closest to *question* in embedding space.

    Args:
        question: The user's question text.
        idx: Index object exposing ``search(query_vectors, k)``.
        chunks: Chunk texts, in the order their vectors were added to *idx*.
        vectors: Unused; kept so existing callers keep working.

    Returns:
        Up to ``TOP_K`` chunk strings, nearest first. Empty when *chunks*
        is empty.
    """
    if not chunks:
        return []

    q_vec = embedder.encode([question])
    # Never ask for more neighbours than there are chunks.
    k = min(TOP_K, len(chunks))
    _, neighbor_ids = idx.search(q_vec, k)
    # FAISS reports missing neighbours as -1; without the lower bound that
    # value would silently index the last chunk.
    return [chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)]
57
-
58
def generate_answer(context_chunks, question, max_new_tokens=100, max_total_tokens=1024):
    """Generate an answer to *question* from the retrieved context chunks.

    The prompt has the form ``سؤال: <question>\\nمحتوى ذو صلة: <context>\\nجواب:``.
    Only the context part is shortened when the prompt is too long, so the
    question and the trailing answer cue always reach the model.

    Args:
        context_chunks: Chunk strings to include as context.
        question: The user's question text.
        max_new_tokens: Maximum number of tokens to generate.
        max_total_tokens: Upper bound on prompt plus generated tokens.
            NOTE(review): 1024 mirrors the limit the previous implementation
            used; confirm it matches the model's context window.

    Returns:
        The generated continuation, stripped of surrounding whitespace.
    """
    context = " \n".join(context_chunks)

    def encode(piece, **kwargs):
        # Pieces are concatenated by hand, so no special tokens are wanted.
        return gen_tokenizer(piece, add_special_tokens=False, **kwargs)["input_ids"]

    head_ids = encode(f"سؤال: {question}\nمحتوى ذو صلة:")
    tail_ids = encode("\nجواب:")

    # Reserve room for the generated tokens, then give the context whatever
    # is left after the fixed parts of the prompt.
    prompt_budget = max(1, max_total_tokens - max_new_tokens)
    context_budget = prompt_budget - len(head_ids) - len(tail_ids)
    if context_budget > 0:
        # The leading space keeps the separator that followed the colon in
        # the single-string prompt.
        context_ids = encode(" " + context, truncation=True, max_length=context_budget)
    else:
        context_ids = []

    # If the question alone exceeds the budget, keep the end of the prompt
    # so the answer cue is never dropped.
    prompt_ids = (head_ids + context_ids + tail_ids)[-prompt_budget:]

    input_ids = torch.tensor([prompt_ids], dtype=torch.long)
    outputs = gen_model.generate(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=max_new_tokens,
        pad_token_id=gen_tokenizer.eos_token_id,
    )

    # Decode only what the model appended instead of searching the decoded
    # text for the cue, which may also occur inside the context.
    generated_ids = outputs[0][input_ids.shape[1]:]
    return gen_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
66
-
67
- # ====== Gradio Functions ======
68
def upload_and_train(files):
    """Extract text from the uploaded books and rebuild the search index.

    Args:
        files: Value delivered by the file-upload input: ``None``, a single
            item, or a list of items. Each item is either a path string or
            an object whose ``name`` attribute holds the path on disk.

    Returns:
        A status message (in Arabic) for the training output textbox.

    Side effects:
        On success, replaces the module-level ``chunks`` and ``index``
        together. When nothing usable was uploaded, both are left untouched.
    """
    global index, chunks

    if not files:
        return "الرجاء رفع الكتب أولاً وتدريب النظام."
    if not isinstance(files, (list, tuple)):
        files = [files]

    texts = []
    for file in files:
        # Read straight from the path of the upload rather than copying it
        # to a second temp file: this works whether the item is a file-like
        # wrapper or a plain path string, and leaves nothing to clean up.
        path = getattr(file, "name", file)
        suffix = os.path.splitext(path)[-1].lower()
        if suffix == ".pdf":
            texts.append(extract_text_from_pdf(path))
        elif suffix in (".docx", ".doc"):
            # NOTE(review): python-docx is documented for .docx; legacy
            # binary .doc files will probably raise here -- confirm.
            texts.append(extract_text_from_docx(path))

    new_chunks = chunk_text("\n".join(texts))
    if not new_chunks:
        # An empty chunk list cannot be embedded or indexed.
        return "⚠️ لم يتم العثور على نص قابل للقراءة في الملفات المرفوعة."

    new_index, _ = build_vector_store(new_chunks)
    # Publish both globals only after the index was built, so a failure
    # above never leaves `chunks` and `index` describing different data.
    chunks, index = new_chunks, new_index

    return "✅ النظام جاهز للإجابة على أسئلتك"
87
-
88
def ask_question(user_question):
    """Answer *user_question* from the indexed books.

    Returns a prompt to upload and train first when no index has been built
    yet; otherwise retrieves the closest chunks and returns the generated
    answer.
    """
    trained = index is not None
    if not trained:
        return "الرجاء رفع الكتب أولاً وتدريب النظام."

    # The vectors argument is not needed for retrieval, so None is passed.
    return generate_answer(
        retrieve_relevant_chunks(user_question, index, chunks, None),
        user_question,
    )
94
-
95
# ====== Gradio Interface ======
# Components are created inside the Blocks context, top to bottom, in the
# order they appear on the page.
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 محاكاة دماغ المؤلف - نظام ذكي للإجابة على الأسئلة من كتبك بالعربية")

    # Training section: upload books, then build the index.
    upload = gr.File(file_types=[".pdf", ".doc", ".docx"], file_count="multiple")
    train_btn = gr.Button("ابدأ التدريب")
    train_output = gr.Textbox()

    # Question section: ask in Arabic and read the generated answer.
    question_input = gr.Textbox(placeholder="اكتب سؤالك هنا باللغة العربية")
    ask_btn = gr.Button("أرسل السؤال")
    answer_output = gr.Textbox()

    train_btn.click(fn=upload_and_train, inputs=[upload], outputs=[train_output])
    ask_btn.click(fn=ask_question, inputs=[question_input], outputs=[answer_output])

# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()