Commit 5224f4e (verified) by abdull4h — parent: 30d051c
"Create app.py" — files changed: 1 (app.py, +136 / -0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Vision 2030 Virtual Assistant with Arabic (ALLaM-7B) and English (Mistral-7B-Instruct) + RAG + Improved Prompting

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langdetect import detect
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# ----------------------------
# Load Arabic Model (ALLaM-7B)
# ----------------------------
print("Loading ALLaM-7B-Instruct-preview for Arabic...")
arabic_model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"
arabic_tokenizer = AutoTokenizer.from_pretrained(arabic_model_id)
arabic_model = AutoModelForCausalLM.from_pretrained(arabic_model_id, device_map="auto")
arabic_pipe = pipeline("text-generation", model=arabic_model, tokenizer=arabic_tokenizer)

# ----------------------------
# Load English Model (Mistral-7B-Instruct)
# ----------------------------
print("Loading Mistral-7B-Instruct-v0.2 for English...")
english_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
english_tokenizer = AutoTokenizer.from_pretrained(english_model_id)
english_model = AutoModelForCausalLM.from_pretrained(english_model_id, device_map="auto")
english_pipe = pipeline("text-generation", model=english_model, tokenizer=english_tokenizer)

# ----------------------------
# Load Embedding Models for Retrieval
# ----------------------------
print("Loading Embedding Models for Retrieval...")
arabic_embedder = SentenceTransformer('CAMeL-Lab/bert-base-arabic-camelbert-ca')
english_embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# ----------------------------
# Prepare FAISS Index (dummy example)
# ----------------------------
# In a real scenario, load the Vision 2030 documents, preprocess and embed
# them. Here we create dummy data for demonstration.
documents = [
    {"text": "Vision 2030 aims to diversify the Saudi economy.", "lang": "en"},
    {"text": "رؤية 2030 تهدف إلى تنويع الاقتصاد السعودي.", "lang": "ar"},
]

# Embed documents, bucketed per language so each language gets its own index.
english_vectors = []
arabic_vectors = []
english_texts = []
arabic_texts = []

for doc in documents:
    if doc["lang"] == "en":
        english_vectors.append(english_embedder.encode(doc["text"]))
        english_texts.append(doc["text"])
    else:
        arabic_vectors.append(arabic_embedder.encode(doc["text"]))
        arabic_texts.append(doc["text"])

# FAISS indexes. faiss requires contiguous float32 input, so convert with an
# explicit dtype instead of relying on whatever dtype the embedder returned;
# the index dimensionality is taken from the stacked matrix shape.
english_matrix = np.asarray(english_vectors, dtype="float32")
english_index = faiss.IndexFlatL2(english_matrix.shape[1])
english_index.add(english_matrix)

arabic_matrix = np.asarray(arabic_vectors, dtype="float32")
arabic_index = faiss.IndexFlatL2(arabic_matrix.shape[1])
arabic_index.add(arabic_matrix)
68
+
69
+ # ----------------------------
70
+ # Define the RAG response function with Improved Prompting
71
+ # ----------------------------
72
+ def retrieve_and_generate(user_input):
73
+ try:
74
+ lang = detect(user_input)
75
+ except:
76
+ lang = "en" # Default fallback
77
+
78
+ if lang == "ar":
79
+ print("Detected Arabic input")
80
+ query_vec = arabic_embedder.encode(user_input)
81
+ D, I = arabic_index.search(np.array([query_vec]), k=1)
82
+ context = arabic_texts[I[0][0]] if I[0][0] >= 0 else ""
83
+ # Improved Arabic Prompt
84
+ input_text = (
85
+ f"أنت خبير في رؤية السعودية 2030.\n"
86
+ f"إليك بعض المعلومات المهمة:\n{context}\n\n"
87
+ f"مثال:\n"
88
+ f"السؤال: ما هي ركائز رؤية 2030؟\n"
89
+ f"الإجابة: ركائز رؤية 2030 هي مجتمع حيوي، اقتصاد مزدهر، ووطن طموح.\n\n"
90
+ f"أجب عن سؤال المستخدم بشكل واضح ودقيق.\n"
91
+ f"السؤال: {user_input}\n"
92
+ f"الإجابة:"
93
+ )
94
+ response = arabic_pipe(input_text, max_new_tokens=256, do_sample=True, temperature=0.7)
95
+ reply = response[0]['generated_text']
96
+
97
+ else:
98
+ print("Detected English input")
99
+ query_vec = english_embedder.encode(user_input)
100
+ D, I = english_index.search(np.array([query_vec]), k=1)
101
+ context = english_texts[I[0][0]] if I[0][0] >= 0 else ""
102
+ # Improved English Prompt
103
+ input_text = (
104
+ f"You are an expert on Saudi Arabia's Vision 2030.\n"
105
+ f"Here is some relevant information:\n{context}\n\n"
106
+ f"Example:\n"
107
+ f"Question: What are the key pillars of Vision 2030?\n"
108
+ f"Answer: The key pillars are a vibrant society, a thriving economy, and an ambitious nation.\n\n"
109
+ f"Answer the user's question clearly and accurately.\n"
110
+ f"Question: {user_input}\n"
111
+ f"Answer:"
112
+ )
113
+ response = english_pipe(input_text, max_new_tokens=256, do_sample=True, temperature=0.7)
114
+ reply = response[0]['generated_text']
115
+
116
+ return reply
117
+
# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Vision 2030 Virtual Assistant 🌍\n\nSupports Arabic & English queries about Vision 2030 (with RAG retrieval and improved prompting).")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Ask me anything about Vision 2030")
    clear = gr.Button("Clear")

    def chat(message, history):
        # Run one RAG turn, record the (user, assistant) pair, and clear the textbox.
        answer = retrieve_and_generate(message)
        history.append((message, answer))
        return history, ""

    msg.submit(chat, [msg, chatbot], [chatbot, msg])
    clear.click(lambda: None, None, chatbot, queue=False)

# Launching the space
demo.launch()