Spaces:

AzizTh
/

Hotel-Semantic-Search

Running

App Files Community

AzizTh committed on Aug 11, 2024

Commit

d4f5493

verified ·

1 Parent(s): b72263b

Rename GroupZero_Week2Assesment.py to app.py

Browse files

Files changed (1) hide show

GroupZero_Week2Assesment.py → app.py +26 -205

GroupZero_Week2Assesment.py → app.py RENAMED Viewed

@@ -1,183 +1,37 @@
-from IPython.display import HTML, display
-def set_css():
-  display(HTML('''
-  <style>
-    pre {
-        white-space: pre-wrap;
-    }
-  </style>
-  '''))
-get_ipython().events.register('pre_run_cell', set_css)
-!pip install huggingface datasets
-!pip install sentence_transformers
 import pandas as pd
-from datasets import load_dataset
-import numpy as np
 from sentence_transformers import SentenceTransformer
-import torch
-import scipy.spatial
-dataset = load_dataset("traversaal-ai-hackathon/hotel_datasets")
-df=pd.DataFrame(dataset['train'])
-df.head()
-df.rate.value_counts()
-df.groupby('hotel_name')['rate'].apply(lambda x: x.isnull().sum()).sort_values(ascending=False)[1:40]
-df['hotel_description'].isnull().sum()
-df['hotel_name'].value_counts()
-hotel_rates = df.groupby('hotel_name')['rate'].first().to_dict()
-len(hotel_rates)
-# prompt: check unique values of hotel_rates and how many times each value repeated
-unique_rates, rate_counts = np.unique(list(hotel_rates.values()), return_counts=True)
-for rate, count in zip(unique_rates, rate_counts):
-  print(f"Rate: {rate}, Count: {count}")
-# Define the function to fill missing rates
-def fill_rate(row):
-    if pd.isna(row['rate']):
-        return hotel_rates.get(row['hotel_name'], row['rate'])  # Return the matched rate or leave it as NaN if no match
-    else:
-        return row['rate']
-# Apply the function to each row in the DataFrame
-df['rate'] = df.apply(fill_rate, axis=1)
-df['rate'].isnull().sum()
-df['locality'].value_counts()
-# Assuming df is your DataFrame
-# Create a mapping for rating_value
-rating_value_map = {
-    5.0: 'Very Satisfied Customer',
-    4.5: 'Satisfied Customer',
-    4.0: 'Moderately Satisfied Customer',
-    3.5: 'Neutral Customer',
-    3.0: 'Dissatisfied Customer'
-}
-# Create a mapping for price_range
-price_range_map = {
-    '$ (Based on Average Nightly Rates for a Standard Room from our Partners)': 'Economical',
-    '$$ (Based on Average Nightly Rates for a Standard Room from our Partners)': 'Moderate',
-    '$$$ (Based on Average Nightly Rates for a Standard Room from our Partners)': 'Expensive'
-}
-# Create a mapping for rate
-rate_map = {
-    5.0: '5 Stars',
-    4.0: '4 Stars',
-    3.0: '3 Stars',
-    2.0: '2 Stars',
-    1.0: '1 Star',
-    float('nan'): 'not known how many stars'
-}
-# Apply the mappings to the DataFrame
-df['rating_value'] = df['rating_value'].map(rating_value_map)
-df['price_range'] = df['price_range'].map(price_range_map)
-df['rate'] = df['rate'].map(rate_map)
-df.head()
-from sentence_transformers import SentenceTransformer
 model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
-def create_combined_embedding(row):
-    # Use empty strings for None values
-    description = (row['hotel_description'] or "").strip()
-    rate = (row['rate'] or "").strip()
-    price_range = (row['price_range'] or "").strip()
-    combined_text = f"hotel_description: {description}; hotel star rate: {rate}; price range: {price_range}"
-    embedding = model.encode(combined_text)
-    return embedding.tolist()  # Convert the embedding to a list
-# Assuming df is your DataFrame
-df["hotel_combined_embedding"] = df.apply(create_combined_embedding, axis=1)
-df.head()
-df['rating_value']
-customer_rating = df['rating_value'].tolist()
-customer_rating_embeddings = model.encode(customer_rating, show_progress_bar=True)#A
-print(f"customer_rating_embeddings shape: {customer_rating_embeddings.shape}")
-len(customer_rating_embeddings)
-# Convert embeddings to a list of lists (each embedding is a list)
-embedding_list = [embedding.tolist() for embedding in customer_rating_embeddings]
-# Add the embeddings as a new column to the original DataFrame
-df['rating_value_embedding'] = embedding_list
-df.head()
-df.to_csv('df.csv', index=True)
 df_new = pd.read_csv('last_df.csv')
-df_new.head()
-df_new['country'].unique()
 df_new['country'] = df_new['country'].replace('Türkiye', 'Turkey')
-df_new['country'].unique()
-df_new.head()
-!python -m spacy download en_core_web_trf
-import spacy
-import pandas as pd
-nlp = spacy.load("en_core_web_trf")
-# Function to extract city name from the query
-def get_city_name(query):
-    text_query = nlp(query)
-    for city in text_query.ents:
-        if city.label_ == "GPE":
-            return city.text.lower()
-    return None
-# Function to filter DataFrame by location
-def filter_by_loc(query):
-    city_name = get_city_name(query)
-    if city_name in df_new['locality'].str.lower().unique():
-        filtered_df = df_new[df_new['locality'].str.lower() == city_name.lower()]
-        return filtered_df
-    else:
-        return df_new
-query = "cheap hotel in Istanbul"
-query_embedding = model.encode(query)
-query_embedding.shape
 import torch.nn as nn
 import torch
@@ -206,17 +60,18 @@ def process_query(query):
     query_embedding = model.encode(query)
     # Filter DataFrame by location
-    filtered_data = filter_by_loc(query)
     # Convert query_embedding to a tensor if it is not already
     query_embedding_tensor = torch.tensor(query_embedding)
     # Apply the similarity function to the filtered DataFrame
-    filtered_data['similarity_score'] = filtered_data.apply(lambda row: get_similarity_score(row, query_embedding_tensor), axis=1)
-    top_similar = filtered_data.sort_values('similarity_score', ascending=False).head(1)
     hotel_name = top_similar['hotel_name'].values[0]
@@ -240,42 +95,8 @@ def process_query(query):
     return result
-# here is the returned df
-result_df = process_query(query)
-result_df
-# Extract the relevant information from the top similar hotel
-hotel_name = top_similar['hotel_name'].values[0]
-hotel_description = top_similar['hotel_description'].values[0]
-hotel_rate = top_similar['rate'].values[0]
-hotel_price_range = top_similar['price_range'].values[0]
-hotel_review = top_similar['review_title'].values[0]
-hotel_city = top_similar['locality'].values[0]
-hotel_country = top_similar['country'].values[0]
-# Print the information in an ordered fashion
-print("query: ",query)
-print("-" * 30)
-print("Here's the most similar hotel we found:")
-print("-" * 30)
-print(f"Hotel Name: {hotel_name}")
-print("City:", hotel_city)
-print("Country:", hotel_country)
-# print(f"Description: {hotel_description}")
-print(f"Star Rating: {hotel_rate}")
-print(f"Price Range: {hotel_price_range}")
-!pip install gradio
-import gradio as gr
 ui = gr.Interface(
     fn=process_query,

 import pandas as pd
 from sentence_transformers import SentenceTransformer
+import gradio as gr
+import spacy
 model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
 df_new = pd.read_csv('last_df.csv')
 df_new['country'] = df_new['country'].replace('Türkiye', 'Turkey')
+#
+#
+# nlp = spacy.load("en_core_web_trf")
+#
+# # Function to extract city name from the query
+# def get_city_name(query):
+#     text_query = nlp(query)
+#     for city in text_query.ents:
+#         if city.label_ == "GPE":
+#             return city.text.lower()
+#     return None
+#
+# # Function to filter DataFrame by location
+# def filter_by_loc(query):
+#     city_name = get_city_name(query)
+#     if city_name in df_new['locality'].str.lower().unique():
+#         filtered_df = df_new[df_new['locality'].str.lower() == city_name.lower()]
+#         return filtered_df
+#     else:
+#         return df_new
 import torch.nn as nn
 import torch
     query_embedding = model.encode(query)
     # Filter DataFrame by location
+    # filtered_data = filter_by_loc(query)
     # Convert query_embedding to a tensor if it is not already
     query_embedding_tensor = torch.tensor(query_embedding)
     # Apply the similarity function to the filtered DataFrame
+    # filtered_data['similarity_score'] = filtered_data.apply(lambda row: get_similarity_score(row, query_embedding_tensor), axis=1)
+    df_new['similarity_score'] = df_new.apply(lambda row: get_similarity_score(row, query_embedding_tensor), axis=1)
+    top_similar = df_new.sort_values('similarity_score', ascending=False).head(1)
     hotel_name = top_similar['hotel_name'].values[0]
     return result
 ui = gr.Interface(
     fn=process_query,