vanaraj07 committed on
Commit
1c65196
·
verified ·
1 Parent(s): 8ba62ab

Upload 3 files

Files changed (3)
  1. app.py +122 -0
  2. requirements.txt +7 -0
  3. train_data.csv +0 -0
app.py ADDED
@@ -0,0 +1,122 @@
+ import os
+
+ import faiss
+ import gradio as gr
+ import numpy as np
+ import openai
+ import pandas as pd
+ from sentence_transformers import SentenceTransformer, CrossEncoder
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # STEP 1: Load and clean the Q&A dataset
+ csv_path = 'train_data.csv'
+ if not os.path.isfile(csv_path):
+     raise FileNotFoundError(f"Could not find CSV at {csv_path}")
+
+ df = pd.read_csv(csv_path, on_bad_lines='skip').dropna()
+ df.columns = ['Question', 'Answer']
+
+ # STEP 2: Build TF-IDF structures for the fast lexical filter
+ questions = df['Question'].tolist()
+ answers = df['Answer'].tolist()
+ qa_pairs = [f"Q: {q}\nA: {a}" for q, a in zip(questions, answers)]
+ tfidf = TfidfVectorizer(max_features=5000).fit(questions)
+ tfidf_matrix = tfidf.transform(questions)
+
+ # STEP 3: Embed the Q+A pairs and build a FAISS HNSW index
+ embedder = SentenceTransformer("all-mpnet-base-v2")
+ qa_embeddings = embedder.encode(qa_pairs, convert_to_numpy=True)
+ dim = qa_embeddings.shape[1]
+ index = faiss.IndexHNSWFlat(dim, 32)  # 32 neighbours per HNSW node
+ index.hnsw.efConstruction = 200
+ # Note: the retriever below reranks lexical candidates directly; the index is kept for optional pure-semantic search
+ index.add(qa_embeddings)
+
+ # STEP 4: Together AI setup (OpenAI-compatible endpoint)
+ # Read the key from the environment rather than hardcoding a secret in source control
+ client = openai.OpenAI(
+     api_key=os.environ.get("TOGETHER_API_KEY"),
+     base_url="https://api.together.xyz/v1",
+ )
+
+ # STEP 5: Hybrid context retriever: TF-IDF filter -> embedding shortlist -> cross-encoder rerank
+ cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")
+
+ def get_top_k_matches(query, lex_n=50, sem_k=20, ce_k=5):
+     # Lexical filter: keep the lex_n questions with the highest TF-IDF cosine similarity
+     q_tfidf = tfidf.transform([query])
+     lex_scores = cosine_similarity(q_tfidf, tfidf_matrix).flatten()
+     lex_idxs = np.argsort(lex_scores)[-lex_n:][::-1]
+
+     # Semantic shortlist: embed the query and keep the sem_k nearest candidates by L2 distance
+     q_emb = embedder.encode([query], convert_to_numpy=True)
+     sub_embs = qa_embeddings[lex_idxs]
+     dists = np.linalg.norm(sub_embs - q_emb, axis=1)
+     top_sem_idxs = np.argsort(dists)[:sem_k]
+     cand_idxs = [lex_idxs[i] for i in top_sem_idxs]
+
+     # Precision rerank: score (query, candidate) pairs with the cross-encoder
+     candidates = [qa_pairs[i] for i in cand_idxs]
+     pairs = [[query, cand] for cand in candidates]
+     ce_scores = cross_encoder.predict(pairs)
+     scored = sorted(zip(ce_scores, candidates), key=lambda t: t[0], reverse=True)
+     return [ctx for _, ctx in scored[:ce_k]]
+
+ # STEP 6: Prompt generator
+ def generate_prompt(user_query, contexts):
+     # Join the retrieved snippets; interpolating the raw list would print Python list syntax
+     context_block = "\n\n".join(contexts)
+     return f"""
+ You are a smart and friendly assistant helping students with academic queries.
+
+ Below is a question from a student, together with several pieces of relevant academic context drawn from the official college documentation. Carefully analyze all of the given Q&A context and generate the most accurate, clear, and helpful answer for the student.
+
+ ### Student's Question:
+ {user_query}
+
+ ### Top Contexts:
+ {context_block}
+
+ ### Instructions:
+ - Use all relevant context to form your answer.
+ - Avoid repeating the same sentences; summarize smartly.
+ - Keep your answer polite and student-friendly.
+ - If the answer is not in the context, reply: "I'm sorry, I couldn't find this information in the provided academic context."
+
+ ### Your Final Answer:
+ """
+
+ # STEP 7: Ask a question and get a response from the LLM
+ def ask_bot(question):
+     contexts = get_top_k_matches(question)
+     prompt = generate_prompt(question, contexts)
+     response = client.chat.completions.create(
+         model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+         messages=[{"role": "user", "content": prompt}],
+         temperature=0.5,
+         max_tokens=1024,
+     )
+     return response.choices[0].message.content
+
+
+ # STEP 8: Query function for the UI (history=None avoids a shared mutable default)
+ def qa_pipeline(query, history=None):
+     history = history or []
+     try:
+         history.append((query, ask_bot(query)))
+     except Exception as e:
+         history.append((query, f"⚠️ Error: {e}"))
+     return "", history
+
+ # Launch the UI with Blocks
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("## 🤖 KCT Smart Chatbot")
+     gr.Markdown("Ask academic or college-related questions. Powered by your custom dataset.")
+
+     chatbot = gr.Chatbot(label="KCT Chatbot", height=400)
+     msg = gr.Textbox(label="Enter your question here")
+     clear = gr.Button("🧹 Clear Chat")
+
+     # On send: run the message through the QA pipeline and clear the textbox
+     msg.submit(qa_pipeline, [msg, chatbot], [msg, chatbot])
+     clear.click(lambda: None, None, chatbot, queue=False)
+
+ if __name__ == "__main__":
+     demo.launch()
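A quick way to exercise the pipeline without the UI is to import the module and call the retriever and the bot directly. The following is a minimal smoke-test sketch, not part of the commit; it assumes the dependencies from requirements.txt are installed, train_data.csv sits in the working directory, TOGETHER_API_KEY is exported, and the sample question is hypothetical.

```python
# smoke_test.py — minimal check of the retrieval pipeline in app.py.
# Importing app runs the indexing steps, so the first import takes a while.
import app

# Hypothetical sample question; substitute one that matches train_data.csv.
question = "What is the minimum attendance requirement?"

# Inspect the top reranked contexts returned by the hybrid retriever.
for i, ctx in enumerate(app.get_top_k_matches(question), 1):
    print(f"--- context {i} ---\n{ctx}\n")

# End-to-end answer; this calls the Together AI endpoint and needs the API key.
print(app.ask_bot(question))
```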
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ pandas
+ numpy
+ faiss-cpu
+ scikit-learn
+ sentence-transformers
+ openai>=1.0
+ gradio
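Since app.py uses the OpenAI 1.x client interface (openai.OpenAI) against the Together endpoint, the openai pin matters. A small sanity check of the installed environment might look like the sketch below; the file name is hypothetical.

```python
# env_check.py — verify the installed packages match what app.py expects.
import faiss  # provided by the faiss-cpu wheel
import gradio
import openai
import pandas
import sentence_transformers
import sklearn

# app.py uses the 1.x client API (openai.OpenAI), hence the >=1.0 pin above.
assert int(openai.__version__.split(".")[0]) >= 1, "need openai>=1.0"
print("environment looks OK")
```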
train_data.csv ADDED
The diff for this file is too large to render. See raw diff