vanaraj07 committed
Commit b865f35 · verified · 1 parent: 1c65196
Files changed (1)
app.py +120 -121
app.py CHANGED
@@ -1,122 +1,121 @@
  import gradio as gr
- from google.colab import drive
  import os
  import pandas as pd
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
  import numpy as np
  import faiss
  from sentence_transformers import SentenceTransformer, CrossEncoder
  import openai

  csv_path = 'train_data.csv'
  if not os.path.isfile(csv_path):
      raise FileNotFoundError(f"Could not find CSV at {csv_path}")

  df = pd.read_csv(csv_path, on_bad_lines='skip').dropna()
  df.columns = ['Question', 'Answer']

  # STEP 3: Build TF-IDF structures (same)
  questions = df['Question'].tolist()
  answers = df['Answer'].tolist()
  qa_pairs = [f"Q: {q}\nA: {a}" for q, a in zip(questions, answers)]
  tfidf = TfidfVectorizer(max_features=5000).fit(questions)
  tfidf_matrix = tfidf.transform(questions)

  # STEP 4: Enhanced Embedding of Q+A pairs
  embedder = SentenceTransformer("all-mpnet-base-v2")
  qa_embeddings = embedder.encode(qa_pairs, convert_to_numpy=True)
  dim = qa_embeddings.shape[1]
  index = faiss.IndexHNSWFlat(dim, 32)
  index.hnsw.efConstruction = 200
  index.add(qa_embeddings)

  # STEP 5: Together AI Setup (same)
  openai.api_key = "cfbafb6a338787841b0295fa7fbe0e4acca77b70ccc3d92bafea2004783b93a3"
  openai.api_base = "https://api.together.xyz/v1"

  # STEP 6: Smarter Hybrid Context Retriever
  cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")

  def get_top_k_matches(query, lex_n=50, sem_k=20, ce_k=5):
      # Lexical filter
      q_tfidf = tfidf.transform([query])
      lex_scores = cosine_similarity(q_tfidf, tfidf_matrix).flatten()
      lex_idxs = np.argsort(lex_scores)[-lex_n:][::-1]

      # Embed query
      q_emb = embedder.encode([query], convert_to_numpy=True)
      sub_embs = qa_embeddings[lex_idxs]
      dists = np.linalg.norm(sub_embs - q_emb, axis=1)
      top_sem_idxs = np.argsort(dists)[:sem_k]
      cand_idxs = [lex_idxs[i] for i in top_sem_idxs]

      # Cross-encoder for precision rerank
      candidates = [qa_pairs[i] for i in cand_idxs]
      pairs = [[query, cand] for cand in candidates]
      ce_scores = cross_encoder.predict(pairs)
      scored = sorted(zip(ce_scores, candidates), reverse=True)
      top_contexts = [ctx for _, ctx in scored[:ce_k]]
      return top_contexts

  # STEP 7: Smart Prompt Generator (unchanged)
  def generate_prompt(user_query, context):
      return f"""
  You are a smart and friendly assistant helping students with academic-related queries.

  Below is a question from a student. You have been given multiple pieces of relevant academic context pulled from the official college documentation. Carefully analyze all the given Q&A context and generate the most accurate, clear, and helpful answer for the student.

  ### Student's Question:
  {user_query}

  ### Top Contexts:
  {context}

  ### Instructions:
  - Use all relevant context to form your answer.
  - Avoid repeating the same sentences. Summarize smartly.
  - Keep your answer polite and student-friendly.
  - If not found, reply: "I'm sorry, I couldn't find this information in the provided academic context."

  ### Your Final Answer:
  """

  # STEP 8: Ask a question and get response (unchanged)
  def ask_bot(question):
      context = get_top_k_matches(question)
      prompt = generate_prompt(question, context)
      response = openai.ChatCompletion.create(
          model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
          messages=[{"role":"user","content":prompt}],
          temperature=0.5, max_tokens=1024
      )
      return response.choices[0].message.content


  # Define query function
  def qa_pipeline(query, history=[]):
      try:
          response = ask_bot(query)
          history.append((query, response))
          return "", history
      except Exception as e:
          history.append((query, f"⚠️ Error: {str(e)}"))
          return "", history

  # Launch UI with blocks
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
      gr.Markdown("## 🤖 KCT Smart Chatbot")
      gr.Markdown("Ask academic or college-related questions. Powered by your custom dataset.")

      chatbot = gr.Chatbot(label="KCT Chatbot", height=400)
      msg = gr.Textbox(label="Enter your question here")
      clear = gr.Button("🧹 Clear Chat")

      # On send
      def user_submit(user_input, chat_history):
          return qa_pipeline(user_input, chat_history)

      msg.submit(user_submit, [msg, chatbot], [msg, chatbot])
      clear.click(lambda: None, None, chatbot, queue=False)

  demo.launch(share=True)
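
Note: the HNSW index built in STEP 4 is never actually queried; get_top_k_matches re-computes L2 distances with NumPy over the TF-IDF-filtered candidates instead. For illustration only, a minimal sketch of a direct lookup against that same index (the function name faiss_top_k and the efSearch value are assumptions, not code from this commit; it reuses the module-level index, embedder, and qa_pairs from app.py):

def faiss_top_k(query, k=5):
    # efSearch is the HNSW query-time beam width: larger values are more
    # accurate but slower. 64 is an assumed setting, not from the commit.
    index.hnsw.efSearch = 64
    # FAISS expects a float32 array of shape (n_queries, dim).
    q_emb = embedder.encode([query], convert_to_numpy=True).astype("float32")
    distances, idxs = index.search(q_emb, k)  # both results have shape (1, k)
    return [qa_pairs[i] for i in idxs[0]]

Unlike the hybrid retriever in STEP 6, this searches the whole corpus rather than the lexical candidate pool; it only shows how the otherwise-unused index could be wired into retrieval.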
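Note: openai.api_base and openai.ChatCompletion.create belong to the pre-1.0 openai package; that call path was removed in openai>=1.0, so the Space needs openai pinned below 1.0. A rough 1.x-client equivalent, for reference (the ask_llm name and the environment-variable lookup are illustrative assumptions; the commit itself hardcodes the key):

import os
from openai import OpenAI

# Together AI exposes an OpenAI-compatible endpoint; base_url replaces the
# old module-level openai.api_base setting.
client = OpenAI(
    api_key=os.environ["TOGETHER_API_KEY"],  # assumed env var, not in the commit
    base_url="https://api.together.xyz/v1",
)

def ask_llm(prompt):
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5,
        max_tokens=1024,
    )
    return response.choices[0].message.content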