Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
5 |
+
import openai
|
6 |
+
import os
|
7 |
+
|
8 |
+
# Read the OpenAI API key from the OPENAI_API_KEY environment variable
# (the value is None when the variable is not set).
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Load the course data; the relative path is resolved against the
# current working directory.
df = pd.read_csv("course_data.csv")
|
13 |
+
|
14 |
+
# Combine relevant columns into one text representation for each course
|
15 |
+
def combine_course_text(row):
    """Build the single text string that represents one course.

    Joins the 'Course Title', 'Description', 'All Chapter Titles' and
    'All Lessons' fields of ``row`` with single spaces, in that order.
    Missing values (``None``/NaN, e.g. empty CSV cells) are skipped so the
    literal text "nan" never ends up in the text sent for embedding.

    Args:
        row: A mapping-like object (for example a DataFrame row) that can
            be indexed by the four column names above.

    Returns:
        str: The space-joined text of the non-missing fields.

    Raises:
        KeyError: If one of the four columns is absent from ``row``.
    """
    columns = ('Course Title', 'Description', 'All Chapter Titles', 'All Lessons')
    values = (row[column] for column in columns)
    # pd.isna covers None, float NaN and pandas' own missing-value markers.
    return " ".join(str(value) for value in values if not pd.isna(value))
|
17 |
+
|
18 |
+
# Store the per-course text in a new 'combined_text' column; axis=1 makes
# apply() call combine_course_text once per row.
df['combined_text'] = df.apply(combine_course_text, axis=1)
|
19 |
+
|
20 |
+
# Function to get embeddings for text using OpenAI's API
|
21 |
+
def get_embedding(text):
    """Fetch the OpenAI embedding vector for ``text``.

    Sends ``text`` to the "text-embedding-ada-002" model and returns the
    'embedding' entry of the first item in the response's 'data' list.

    Any exception raised along the way (request failure or unexpected
    response layout) is printed and converted into a ``None`` return, so
    callers must check the result for ``None``.
    """
    try:
        api_reply = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=text,
        )
        return api_reply['data'][0]['embedding']
    except Exception as e:
        # Best-effort by design: report the problem and signal failure
        # through the None return below.
        print(f"Error while getting embedding: {e}")
    return None
|
32 |
+
|
33 |
+
# Pre-compute embeddings for all courses in the dataset. Titles and
# embeddings are collected in lockstep so that row i of `course_embeddings`
# always belongs to `course_titles[i]`, even when some embedding calls fail.
course_embeddings = []
course_titles = []

for title, text in zip(df['Course Title'].tolist(), df['combined_text']):
    embedding = get_embedding(text)

    if embedding is None:
        # Drop the course entirely: keeping its title would shift every
        # later title out of alignment with the embedding matrix.
        print(f"Failed to generate embedding for: {text}")
        continue

    course_embeddings.append(embedding)
    course_titles.append(title)

# Convert embeddings to numpy array (for cosine_similarity to work properly)
course_embeddings = np.array(course_embeddings)
|
48 |
+
|
49 |
+
# Function to search courses based on a query
|
50 |
+
def search_courses(query, top_k=3):
    """Return the titles of the courses most similar to ``query``.

    The query is embedded with ``get_embedding`` and compared against the
    pre-computed ``course_embeddings`` using cosine similarity; titles are
    looked up in ``course_titles`` by the same row index.

    Args:
        query: Free-text search string.
        top_k: Maximum number of titles to return (default 3, the value
            that was previously hard-coded). Values below 0 are treated
            as 0.

    Returns:
        list: Up to ``top_k`` course titles, best match first. Empty when
            no course embeddings are available.
        str: The message "Error in generating query embedding." when the
            query could not be embedded.
    """
    # Nothing was indexed (for example every course embedding failed at
    # start-up): cosine_similarity would raise on an empty matrix, and
    # there is no point spending an API call on the query.
    if len(course_embeddings) == 0:
        return []

    query_embedding = get_embedding(query)

    if query_embedding is None:
        return "Error in generating query embedding."

    # cosine_similarity expects 2-D input, hence the one-row wrapper; the
    # result has a single row holding one score per course.
    similarities = cosine_similarity([query_embedding], course_embeddings)[0]

    # argsort is ascending, so reverse it to get the best matches first.
    ranked_indices = np.argsort(similarities)[::-1]

    return [course_titles[i] for i in ranked_indices[:max(top_k, 0)]]
|
67 |
+
|
68 |
+
# Gradio Interface
|
69 |
+
def gradio_search(query):
    """Gradio callback: format the top three course matches for ``query``.

    Args:
        query: Text entered by the user; may be empty or ``None``.

    Returns:
        str: One of
            * a prompt asking for input when ``query`` is blank,
            * the error message returned by ``search_courses`` when it
              reports a failure as a string,
            * otherwise a numbered three-line list of course titles,
              padded with "No results" when fewer than three were found.
    """
    if not query or not query.strip():
        return "Please enter a search query."

    results = search_courses(query)

    # search_courses signals an embedding failure with a plain string;
    # show it as-is instead of indexing into it character by character.
    if isinstance(results, str):
        return results

    # Work on a copy so the list returned by search_courses is not mutated.
    top_three = list(results)[:3]
    top_three.extend(["No results"] * (3 - len(top_three)))

    numbered = "\n".join(
        f"{rank}. {title}" for rank, title in enumerate(top_three, start=1)
    )
    return "Top relevant courses:\n" + numbered
|
75 |
+
|
76 |
+
# Wire gradio_search into a Gradio UI with a text input and a text output.
interface = gr.Interface(
    title="Smart Course Search",
    description="Enter a query and get the most relevant courses from the dataset.",
    fn=gradio_search,
    inputs="text",
    outputs="text",
)

# Start serving the UI; share=True is passed through to launch() unchanged.
interface.launch(share=True)
|