aswin-10 committed on
Commit
586e301
·
verified ·
1 Parent(s): ae4467e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -82
app.py CHANGED
@@ -1,85 +1,85 @@
1
- import gradio as gr
2
  import pandas as pd
3
- import numpy as np
4
- from sklearn.metrics.pairwise import cosine_similarity
5
- import openai
 
6
  import os
7
 
8
- # Set up your OpenAI API key (consider using environment variables for security)
9
- openai.api_key = os.getenv("OPENAI_API_KEY")
10
-
11
- # Load course data from CSV file
12
- df = pd.read_csv('course_data.csv')
13
-
14
- # Combine relevant columns into one text representation for each course
15
- def combine_course_text(row):
16
- return f"{row['Course Title']} {row['Description']} {row['All Chapter Titles']} {row['All Lessons']}"
17
-
18
- df['combined_text'] = df.apply(combine_course_text, axis=1)
19
-
20
- # Function to get embeddings for text using OpenAI's API
21
- def get_embedding(text):
22
- try:
23
- response = openai.Embedding.create(
24
- input=text,
25
- model="text-embedding-ada-002"
26
- )
27
- embedding = response['data'][0]['embedding']
28
- return embedding
29
- except Exception as e:
30
- print(f"Error while getting embedding: {e}")
31
- return None
32
-
33
- # Pre-compute embeddings for all courses in the dataset
34
- course_embeddings = []
35
- course_titles = df['Course Title'].tolist()
36
-
37
- for text in df['combined_text']:
38
- embedding = get_embedding(text)
39
-
40
- if embedding is not None:
41
- course_embeddings.append(embedding)
42
-
43
- else:
44
- print(f"Failed to generate embedding for: {text}")
45
-
46
- # Convert embeddings to numpy array (for cosine_similarity to work properly)
47
- course_embeddings = np.array(course_embeddings)
48
-
49
- # Function to search courses based on a query
50
- def search_courses(query):
51
- # Get embedding for query
52
- query_embedding = get_embedding(query)
53
-
54
- if query_embedding is None:
55
- return "Error in generating query embedding."
56
-
57
- # Compute cosine similarity between query embedding and course embeddings
58
- similarities = cosine_similarity([query_embedding], course_embeddings)
59
-
60
- # Sort by similarity
61
- sorted_indices = np.argsort(similarities[0])[::-1]
62
-
63
- # Get top 3 courses based on similarity
64
- top_courses = [course_titles[i] for i in sorted_indices[:3]]
65
-
66
- return top_courses
67
-
68
- # Gradio Interface
69
- def gradio_search(query):
70
- if query.strip():
71
- results = search_courses(query)
72
- if len(results) < 3:
73
- results.extend(["No results"] * (3 - len(results)))
74
- return "Top relevant courses:\n1. " + results[0] + "\n2. " + results[1] + "\n3. " + results[2]
75
-
76
- # Create Gradio interface
77
- interface = gr.Interface(
78
- fn=gradio_search,
79
- inputs="text",
80
- outputs="text",
81
- title="Smart Course Search",
82
- description="Enter a query and get the most relevant courses from the dataset.")
83
-
84
- # Launch the Gradio interface
85
- interface.launch(share=True)
 
 
1
  import pandas as pd
2
+ from sentence_transformers import SentenceTransformer, util
3
+ from transformers import pipeline
4
+ import torch
5
+ import gradio as gr
6
  import os
7
 
8
# Read the course catalogue CSV that sits in the current working directory.
csv_file_path = os.path.join(os.getcwd(), 'Analytics_Vidhya_Free_Course_data.csv')
df = pd.read_csv(csv_file_path, encoding='Windows-1252')
# Blank out missing cells in place so the string concatenation below never meets NaN.
df.fillna('', inplace=True)

# Sentence-embedding model shared by the catalogue rows and by incoming queries.
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# One searchable string per course: the first two columns (presumably title and
# description -- confirm against the CSV), then instructor, rating and category.
df['full_text'] = (
    df.iloc[:, 0]
    + " " + df.iloc[:, 1]
    + " " + df['Instructor Name']
    + " " + df['Rating'].astype(str)
    + " " + df['Category']
)

# Embed every course once at start-up; row i of the tensor corresponds to row i of df.
course_embeddings = model.encode(df['full_text'].tolist(), convert_to_tensor=True)

# Text2text pipeline (BART CNN checkpoint) used to write the per-result blurb.
generator = pipeline('text2text-generation', model='facebook/bart-large-cnn')
24
+
25
# Shared paraphrasing pipeline, created on first use (see _get_paraphraser).
_PARAPHRASER = None


def _get_paraphraser():
    """Return the shared paraphrasing pipeline, building it on first use."""
    global _PARAPHRASER
    if _PARAPHRASER is None:
        _PARAPHRASER = pipeline('text2text-generation', model='Vamsi/T5_Paraphrase_Paws')
    return _PARAPHRASER


def expand_query(query):
    """Return three sampled paraphrases of *query*.

    The paraphrasing pipeline is created once and reused; building it inside
    every call would reload the model on each search.

    Args:
        query: The user's search text.

    Returns:
        A list with the ``generated_text`` of each returned sequence. Sampling
        is enabled, so the wording can differ between calls.
    """
    expanded_queries = _get_paraphraser()(
        query,
        num_return_sequences=3,
        max_length=50,
        do_sample=True,
    )
    return [q['generated_text'] for q in expanded_queries]
29
+
30
def generate_description(query):
    """Generate a short blurb for *query* with the module-level ``generator``.

    Args:
        query: Text to condense (the caller passes title + description).

    Returns:
        The ``generated_text`` of the single returned sequence, or ``""`` when
        *query* is empty or whitespace-only (the model is not called then).
    """
    # Nothing to describe: skip the model call entirely.
    if not query or not query.strip():
        return ""
    # truncation=True clips over-long inputs to the model's maximum input
    # length instead of letting the call fail on them.
    response = generator(query, max_length=100, num_return_sequences=1, truncation=True)
    return response[0]['generated_text']
33
+
34
def _is_active_filter(value):
    """Return True when a dropdown value should actually restrict the results.

    ``None``, the empty string and the "Nil"/"NIL" placeholders (any casing)
    all mean "no filter".
    """
    return bool(value) and str(value).strip().lower() != "nil"


def search_courses(query, level_filter=None, category_filter=None, top_k=3):
    """Rank courses against *query* and render the best matches as HTML.

    Args:
        query: Free-text search string.
        level_filter: Exact 'Level of Difficulty' value to keep, or a
            "Nil"-style placeholder / ``None`` for no restriction.
        category_filter: Exact 'Category' value to keep, or a "Nil"-style
            placeholder / ``None`` for no restriction.
        top_k: Maximum number of courses to return.

    Returns:
        An ``<ol>`` HTML string with one ``<li>`` per course, or a ``<p>``
        message when the query is blank or no course passes the filters.
    """
    import html  # function-scope import: only this function needs it

    if not query or not query.strip():
        return "<p>Please enter a search query.</p>"

    # Filter first so no model work is spent when nothing can match.
    keep = pd.Series(True, index=df.index)
    if _is_active_filter(level_filter):
        keep &= df['Level of Difficulty'] == level_filter
    if _is_active_filter(category_filter):
        keep &= df['Category'] == category_filter
    filtered_df = df[keep]
    if filtered_df.empty:
        return "<p>No matching courses found.</p>"

    # Score every course against each paraphrase and keep the best score per course.
    expanded_queries = expand_query(query) or [query]
    all_similarities = [
        util.pytorch_cos_sim(model.encode(text, convert_to_tensor=True), course_embeddings)[0]
        for text in expanded_queries
    ]
    aggregated_similarities = torch.stack(all_similarities).max(dim=0).values

    # Select scores by row *position* (course_embeddings is in df row order),
    # using an explicit index tensor rather than a pandas Index.
    row_positions = torch.as_tensor(
        keep.to_numpy().nonzero()[0],
        device=aggregated_similarities.device,
    )
    filtered_similarities = aggregated_similarities[row_positions]
    top_results = filtered_similarities.topk(k=min(top_k, len(filtered_df)))

    # The URL is taken from the last column of the data. 'full_text' is a
    # helper column appended after loading, so it has to be skipped -- a plain
    # iloc[..., -1] would return the concatenated search text instead.
    url_position = max(
        position
        for position, name in enumerate(filtered_df.columns)
        if name != 'full_text'
    )

    items = []
    for position in top_results.indices.tolist():
        # Positions from topk index into filtered_similarities, which is
        # aligned row-for-row with filtered_df.
        row = filtered_df.iloc[position]
        course_title = str(row['Course Title'])
        course_description = str(row.iloc[1])
        course_url = str(row.iloc[url_position])
        generated_description = generate_description(course_title + " " + course_description)

        # Escape everything interpolated into markup: CSV cells and generated
        # text may contain <, >, & or quotes.
        course_link = (
            f'<a href="{html.escape(course_url, quote=True)}" target="_blank">'
            f'{html.escape(course_title)}</a>'
        )
        items.append(
            f"<li><strong>{course_link}</strong><br>{html.escape(course_description)}"
            f"<br>{html.escape(generated_description)}<br><br></li>"
        )

    return "<ol>" + "".join(items) + "</ol>"
68
+
69
def create_gradio_interface():
    """Assemble the search page and return the ``gr.Blocks`` app.

    The page has a query textbox, a collapsed accordion holding the level and
    category dropdowns, a Search button and an HTML results pane. Clicking the
    button calls ``search_courses`` with the three input values.
    """
    level_choices = ["Beginner", "Intermediate", "Advanced", "Nil"]
    category_choices = ["Data Science", "Machine Learning", "Deep Learning", "AI", "NLP", "NIL"]

    with gr.Blocks() as app:
        gr.Markdown("# Analytics Vidhya Free Courses")
        gr.Markdown("Enter your query and use filters to narrow down the search.")
        query_box = gr.Textbox(
            label=" Search for a course",
            placeholder="Enter course topic or description",
        )
        # NOTE(review): only the two dropdowns are assumed to live inside the
        # accordion; confirm the nesting against the original layout.
        with gr.Accordion(" Filters", open=False):
            level_dropdown = gr.Dropdown(
                choices=level_choices,
                label=" Course Level",
                multiselect=False,
            )
            category_dropdown = gr.Dropdown(
                choices=category_choices,
                label=" Category",
                multiselect=False,
            )
        run_button = gr.Button("Search")
        results_pane = gr.HTML(label="Search Results")
        run_button.click(
            fn=search_courses,
            inputs=[query_box, level_dropdown, category_dropdown],
            outputs=results_pane,
        )

    return app


# Build the UI at module level and start serving it.
demo = create_gradio_interface()
demo.launch()