aswin-10 committed on
Commit
47e6638
·
verified ·
1 Parent(s): 73233e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -24
app.py CHANGED
@@ -3,83 +3,107 @@ from sentence_transformers import SentenceTransformer, util
3
  from transformers import pipeline
4
  import torch
5
  import gradio as gr
6
- import os
7
 
8
- # Load the dataset
9
  csv_file_path = os.path.join(os.getcwd(), 'Analytics_Vidhya_Free_Course_data.csv')
 
 
10
  df = pd.read_csv(csv_file_path, encoding='Windows-1252')
11
- df.fillna('', inplace=True)
12
 
13
- # Load the pre-trained model for embeddings
14
  model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
15
 
16
  # Combine title and description to create a full text for each course
17
- df['full_text'] = df.iloc[:, 0] + " " + df.iloc[:, 1] + " " + df['Instructor Name'] + " " + df['Rating'].astype(str) + " " + df['Category']
18
 
19
  # Convert full course texts into embeddings
20
  course_embeddings = model.encode(df['full_text'].tolist(), convert_to_tensor=True)
21
 
22
- # Load a model for text generation (e.g., BART)
23
- generator = pipeline('text2text-generation', model='facebook/bart-large-cnn')
24
-
25
  def expand_query(query):
26
  paraphraser = pipeline('text2text-generation', model='Vamsi/T5_Paraphrase_Paws')
27
  expanded_queries = paraphraser(query, num_return_sequences=3, max_length=50, do_sample=True)
28
  return [q['generated_text'] for q in expanded_queries]
29
 
30
- def generate_description(query):
31
- response = generator(query, max_length=100, num_return_sequences=1)
32
- return response[0]['generated_text']
33
-
34
  def search_courses(query, level_filter=None, category_filter=None, top_k=3):
 
35
  expanded_queries = expand_query(query)
 
 
36
  all_similarities = []
37
 
38
  for expanded_query in expanded_queries:
 
39
  query_embedding = model.encode(expanded_query, convert_to_tensor=True)
 
 
40
  similarities = util.pytorch_cos_sim(query_embedding, course_embeddings)[0]
 
 
41
  all_similarities.append(similarities)
42
 
 
43
  aggregated_similarities = torch.max(torch.stack(all_similarities), dim=0)[0]
44
- filtered_df = df.copy()
45
 
46
- if level_filter and level_filter != "Nil":
 
 
47
  filtered_df = filtered_df[filtered_df['Level of Difficulty'] == level_filter]
48
- if category_filter and category_filter != "NIL":
49
  filtered_df = filtered_df[filtered_df['Category'] == category_filter]
50
-
51
  if filtered_df.empty:
52
  return "<p>No matching courses found.</p>"
53
-
 
54
  filtered_similarities = aggregated_similarities[filtered_df.index]
 
 
55
  top_results = filtered_similarities.topk(k=min(top_k, len(filtered_similarities)))
56
 
 
57
  results = []
58
  for idx in top_results.indices:
59
  idx = int(idx)
60
  course_title = filtered_df.iloc[idx]['Course Title']
61
- course_description = filtered_df.iloc[idx, 1]
62
- course_url = filtered_df.iloc[idx, -1]
63
- generated_description = generate_description(course_title + " " + course_description)
 
 
64
  course_link = f'<a href="{course_url}" target="_blank">{course_title}</a>'
65
- results.append(f"<strong>{course_link}</strong><br>{course_description}<br>{generated_description}<br><br>")
66
 
 
67
  return "<ol>" + "".join([f"<li>{result}</li>" for result in results]) + "</ol>"
68
 
 
69
  def create_gradio_interface():
70
  with gr.Blocks() as demo:
71
  gr.Markdown("# Analytics Vidhya Free Courses")
72
  gr.Markdown("Enter your query and use filters to narrow down the search.")
 
 
73
  query = gr.Textbox(label=" Search for a course", placeholder="Enter course topic or description")
 
 
74
  with gr.Accordion(" Filters", open=False):
75
- level_filter = gr.Dropdown(choices=["Beginner", "Intermediate", "Advanced", "Nil"], label=" Course Level", multiselect=False)
76
- category_filter = gr.Dropdown(choices=["Data Science", "Machine Learning", "Deep Learning", "AI", "NLP", "NIL"], label=" Category", multiselect=False)
 
 
77
  search_button = gr.Button("Search")
 
 
78
  output = gr.HTML(label="Search Results")
 
 
79
  search_button.click(fn=search_courses, inputs=[query, level_filter, category_filter], outputs=output)
80
 
81
  return demo
82
 
83
  # Launch Gradio interface
84
  demo = create_gradio_interface()
85
- demo.launch()
 
3
  from transformers import pipeline
4
  import torch
5
  import gradio as gr
6
+ import os
7
 
8
+ # Use the relative path where the CSV is uploaded
9
  csv_file_path = os.path.join(os.getcwd(), 'Analytics_Vidhya_Free_Course_data.csv')
10
+
11
+ # Load the dataset
12
  df = pd.read_csv(csv_file_path, encoding='Windows-1252')
 
13
 
14
+ # Load the pre-trained model for embeddings (using SentenceTransformers)
15
  model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
16
 
17
  # Combine title and description to create a full text for each course
18
+ df['full_text'] = df.iloc[:,0] + " " + df.iloc[:,1] + " " + df['Instructor Name'] + " " + str(df['Rating']) + " " + df['Category']
19
 
20
  # Convert full course texts into embeddings
21
  course_embeddings = model.encode(df['full_text'].tolist(), convert_to_tensor=True)
22
 
23
+ # Function to expand the query using paraphrasing
 
 
24
  def expand_query(query):
25
  paraphraser = pipeline('text2text-generation', model='Vamsi/T5_Paraphrase_Paws')
26
  expanded_queries = paraphraser(query, num_return_sequences=3, max_length=50, do_sample=True)
27
  return [q['generated_text'] for q in expanded_queries]
28
 
29
+ # Function to search for the most relevant courses
 
 
 
30
  def search_courses(query, level_filter=None, category_filter=None, top_k=3):
31
+ # Step 1: Expand the query using paraphrasing
32
  expanded_queries = expand_query(query)
33
+
34
+ # Step 2: Initialize an array to store all similarities
35
  all_similarities = []
36
 
37
  for expanded_query in expanded_queries:
38
+ # Convert each expanded query into an embedding
39
  query_embedding = model.encode(expanded_query, convert_to_tensor=True)
40
+
41
+ # Compute cosine similarities between the query embedding and course embeddings
42
  similarities = util.pytorch_cos_sim(query_embedding, course_embeddings)[0]
43
+
44
+ # Append to the list of all similarities
45
  all_similarities.append(similarities)
46
 
47
+ # Step 3: Convert the list of tensors to a single tensor by taking the maximum similarity for each course
48
  aggregated_similarities = torch.max(torch.stack(all_similarities), dim=0)[0]
 
49
 
50
+ # Step 4: Apply filters
51
+ filtered_df = df.copy()
52
+ if level_filter:
53
  filtered_df = filtered_df[filtered_df['Level of Difficulty'] == level_filter]
54
+ if category_filter:
55
  filtered_df = filtered_df[filtered_df['Category'] == category_filter]
56
+
57
  if filtered_df.empty:
58
  return "<p>No matching courses found.</p>"
59
+
60
+ # Recalculate similarities for the filtered data
61
  filtered_similarities = aggregated_similarities[filtered_df.index]
62
+
63
+ # Step 5: Get top_k most similar courses
64
  top_results = filtered_similarities.topk(k=min(top_k, len(filtered_similarities)))
65
 
66
+ # Prepare the output as clickable links
67
  results = []
68
  for idx in top_results.indices:
69
  idx = int(idx)
70
  course_title = filtered_df.iloc[idx]['Course Title']
71
+ course_description = filtered_df.iloc[idx,1]
72
+ course_url = filtered_df.iloc[idx,-1]
73
+
74
+
75
+ # Format the result as a clickable hyperlink using raw HTML
76
  course_link = f'<a href="{course_url}" target="_blank">{course_title}</a>'
77
+ results.append(f"<strong>{course_link}</strong><br>{course_description}<br><br>")
78
 
79
+ # Combine all results into an HTML formatted list
80
  return "<ol>" + "".join([f"<li>{result}</li>" for result in results]) + "</ol>"
81
 
82
+ # Create Gradio UI
83
  def create_gradio_interface():
84
  with gr.Blocks() as demo:
85
  gr.Markdown("# Analytics Vidhya Free Courses")
86
  gr.Markdown("Enter your query and use filters to narrow down the search.")
87
+
88
+ # Input elements
89
  query = gr.Textbox(label=" Search for a course", placeholder="Enter course topic or description")
90
+
91
+ # Filters (in a collapsible form)
92
  with gr.Accordion(" Filters", open=False):
93
+ level_filter = gr.Dropdown(choices=["Beginner", "Intermediate", "Advanced"], label=" Course Level", multiselect=False)
94
+ category_filter = gr.Dropdown(choices=["Data Science", "Machine Learning", "Deep Learning", "AI", "NLP"], label=" Category", multiselect=False)
95
+
96
+ # Search button
97
  search_button = gr.Button("Search")
98
+
99
+ # Output HTML for displaying results
100
  output = gr.HTML(label="Search Results")
101
+
102
+ # On button click, trigger the search function
103
  search_button.click(fn=search_courses, inputs=[query, level_filter, category_filter], outputs=output)
104
 
105
  return demo
106
 
107
  # Launch Gradio interface
108
  demo = create_gradio_interface()
109
+ demo.launch(share=True, debug=True)