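# Streamlit app for CTR (click-through-rate) prediction: the search query,
# page title, meta description, and URL are embedded with FastText,
# concatenated into one feature vector, and scored by a trained XGBoost model.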
import pandas as pd
import numpy as np
import xgboost as xgb
import streamlit as st
import requests
from bs4 import BeautifulSoup
from gensim.models import FastText
import joblib

# Load the trained FastText model
try:
    fasttext_model = FastText.load('fasttext_model.bin')
except FileNotFoundError:
    st.error("The FastText model file was not found. Please ensure 'fasttext_model.bin' and its associated files are in the correct directory.")
    st.stop()

# Load the trained XGBoost model for the combined features
try:
    model = joblib.load('model.pkl')
except FileNotFoundError:
    st.error("The XGBoost model file was not found. Please ensure 'model.pkl' is in the correct directory.")
    st.stop()

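# Tokenization and embedding helpers: each text is represented as the mean
# of its tokens' FastText vectors, so every field maps to a fixed-length
# vector regardless of its length.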
def tokenize(text):
    if isinstance(text, str):
        return text.split()
    return []

def embed_text(text_series, fasttext_model):
    embeddings = []
    for text in text_series:
        tokens = tokenize(text)
        vectors = [fasttext_model.wv[token] for token in tokens if token in fasttext_model.wv]
        if vectors:
            embeddings.append(np.mean(vectors, axis=0))
        else:
            embeddings.append(np.zeros(fasttext_model.vector_size))
    return np.array(embeddings)

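# Each of the four fields is embedded separately and the embeddings are
# concatenated, so a row has 4 * fasttext_model.vector_size features
# (for example, 400 features if the model was trained with 100-dimensional
# vectors; the actual size depends on how the FastText model was trained).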
def preprocess_input(query, title, description, url, fasttext_model):
    query = str(query) if pd.notna(query) else ''
    title = str(title) if pd.notna(title) else ''
    description = str(description) if pd.notna(description) else ''
    url = str(url) if pd.notna(url) else ''
    query_ft = embed_text(pd.Series([query]), fasttext_model)
    title_ft = embed_text(pd.Series([title]), fasttext_model)
    description_ft = embed_text(pd.Series([description]), fasttext_model)
    url_ft = embed_text(pd.Series([url]), fasttext_model)
    combined_features = np.hstack([query_ft, title_ft, description_ft, url_ft])
    dmatrix = xgb.DMatrix(combined_features)
    return dmatrix

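# Scrape the <title> tag and the meta description from the target page so
# the user only has to supply a URL. Failures (timeouts, bad status codes,
# unparseable pages) degrade to placeholder strings instead of crashing the app.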
def extract_title_description(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.title.string if soup.title and soup.title.string else 'No title found'
        description_tag = soup.find('meta', attrs={'name': 'description'})
        description = description_tag.get('content', 'No description found') if description_tag else 'No description found'
        return title, description
    except Exception:
        return 'Error extracting title', 'Error extracting description'

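# Score one (query, title, description, url) tuple: the model returns a
# click probability, which is thresholded at 0.5 for the binary label.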
def predict(query, title, description, url, fasttext_model):
    dmatrix = preprocess_input(query, title, description, url, fasttext_model)
    probability = model.predict(dmatrix, validate_features=False)[0]
    binary_prediction = int(probability >= 0.5)
    return binary_prediction, probability

# Streamlit interface
st.title('CTR Prediction Inference')
tab1, tab2, tab3 = st.tabs(["Single Entry", "Batch Entry", "A/B Test"])

with tab1:
    st.header('Single Entry Inference')
    query = st.text_input('Query')
    url = st.text_input('URL')
    if st.button('Predict'):
        if query and url:
            title, description = extract_title_description(url)
            st.write(f'Extracted Title: {title}')
            st.write(f'Extracted Description: {description}')
            binary_result, confidence = predict(query, title, description, url, fasttext_model)
            st.write(f'Predicted +/-: {binary_result}')
            st.write(f'Conf.: {confidence:.2%}')
            st.progress(int(confidence * 100))
        else:
            st.write('Please enter both a query and a URL.')

with tab2:
    st.header('Batch Entry Inference')
    uploaded_file = st.file_uploader("Upload CSV", type="csv")
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file)
        required_columns = ['Query', 'Title', 'Description', 'URL']
        if set(required_columns).issubset(df.columns):
            predictions = []
            confidences = []
            for _, row in df.iterrows():
                binary_result, confidence = predict(row['Query'], row['Title'], row['Description'], row['URL'], fasttext_model)
                predictions.append(binary_result)
                confidences.append(confidence)
            # Put the prediction columns first so they are visible without scrolling.
            df['+/-'] = predictions
            df['Conf.'] = [f"{conf:.2%}" for conf in confidences]
            cols = ['+/-', 'Conf.'] + [col for col in df.columns if col not in ['+/-', 'Conf.']]
            df = df[cols]
            st.write(df)
            st.download_button("Download Predictions", df.to_csv(index=False), "predictions.csv")
        else:
            st.write('CSV must contain Query, Title, Description, and URL columns.')

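# A minimal input CSV for the batch tab (hypothetical values):
#   Query,Title,Description,URL
#   running shoes,Best Running Shoes,Compare this season's trainers,https://example.com/shoes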
with tab3:
    st.header('A/B Test Inference')
    query = st.text_input('Query for A/B Test')
    url = st.text_input('URL for A/B Test')
    if 'step' not in st.session_state:
        st.session_state.step = 0
    if st.button('Scrape A/B'):
        # Variant A is the live page; keep it in session state so it
        # survives Streamlit reruns.
        title_A, description_A = extract_title_description(url)
        st.session_state['title_A'] = title_A
        st.session_state['description_A'] = description_A
        st.session_state.step = 1
    if st.session_state.step == 1:
        # Variant B starts as an editable copy of A.
        title_B = st.text_input('Title B', value=st.session_state.get('title_A', ''))
        description_B = st.text_area('Description B', value=st.session_state.get('description_A', ''))
        if st.button('Predict A/B'):
            if query and url:
                binary_result_A, confidence_A = predict(query, st.session_state['title_A'], st.session_state['description_A'], url, fasttext_model)
                binary_result_B, confidence_B = predict(query, title_B, description_B, url, fasttext_model)
                st.write(f'Results for A: Predicted +/-: {binary_result_A}, Conf.: {confidence_A:.2%}')
                st.write(f'Results for B: Predicted +/-: {binary_result_B}, Conf.: {confidence_B:.2%}')
                # Compare the binary labels; matching labels are reported as a tie.
                if binary_result_A == 1 and binary_result_B == 0:
                    st.write("B is worse than A")
                elif binary_result_A == 0 and binary_result_B == 1:
                    st.write("B is better than A")
                else:
                    st.write("B is the same as A")
            else:
                st.write('Please enter both a query and a URL.')