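"""Streamlit app for exploring near-duplicate problems in a dataset.

Embeds each problem statement with a SentenceTransformer model, groups
rows whose cosine similarity exceeds a user-chosen threshold, and shows
a column-by-column diff for every candidate duplicate pair.
"""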
import json
import zipfile

import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer, util
# ================== CONFIGURATION ==================
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Pre-trained embedding model, cached so Streamlit reruns don't reload the weights
MODEL_NAME = "all-MiniLM-L6-v2"

@st.cache_resource
def load_model():
    return SentenceTransformer(MODEL_NAME)

model = load_model()
# Load the preloaded dataset (a zipped CSV shipped with the app)
@st.cache_data
def load_data():
    file_path = "data/merged_dataset.csv.zip"
    with zipfile.ZipFile(file_path, "r") as zip_ref:
        zip_ref.extractall("data/extracted")
    return pd.read_csv("data/extracted/merged_dataset.csv")

df = load_data()
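# NOTE: the code below assumes the CSV provides at least a "uuid" column
# (row identifier) and a "problem" column (text to deduplicate); every
# other column is compared field-by-field when reporting duplicates.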
# ================== FUNCTION DEFINITIONS ==================
def compute_embeddings(problems):
    """Compute L2-normalized sentence embeddings, so cosine similarity
    reduces to a dot product."""
    return model.encode(problems, normalize_embeddings=True)

def find_similar_problems(df, similarity_threshold=0.9):
    """Find similar problems using cosine similarity.

    Returns a dict mapping a problem's uuid to (uuid, similarity) pairs
    for every later row above the threshold. Only the upper triangle of
    the similarity matrix is scanned, so each pair is reported once.
    """
    embeddings = compute_embeddings(df["problem"].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
    clusters = {}
    for i in range(len(df)):
        current_uuid = df["uuid"][i]
        similar_items = [
            (df["uuid"][j], float(similarity_matrix[i][j]))  # float32 -> float for JSON
            for j in range(i + 1, len(df))
            if similarity_matrix[i][j] > similarity_threshold
        ]
        if similar_items:
            clusters[current_uuid] = similar_items
    return clusters
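# A minimal sketch of the return shape (uuids and scores are made up):
#
#   {
#       "uuid-a": [("uuid-b", 0.97), ("uuid-c", 0.93)],
#       "uuid-d": [("uuid-e", 0.91)],
#   }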
def analyze_clusters(df, similarity_threshold=0.9):
    """Analyze duplicate problem clusters."""
    clusters = find_similar_problems(df, similarity_threshold)
    detailed_analysis = {}
    for key, values in clusters.items():
        base_row = df[df["uuid"] == key].iloc[0]
        cluster_details = []
        for val, score in values:
            comparison_row = df[df["uuid"] == val].iloc[0]
            column_differences = {}
            for col in df.columns:
                if col != "uuid":
                    base_val = base_row[col]
                    comp_val = comparison_row[col]
                    # Convert numpy types to native Python types
                    if hasattr(base_val, "item"):
                        base_val = base_val.item()
                    if hasattr(comp_val, "item"):
                        comp_val = comp_val.item()
                    column_differences[col] = {
                        "base": base_val,
                        "comparison": comp_val,
                        "match": bool(base_val == comp_val),  # numpy bool -> Python bool
                    }
            cluster_details.append({
                "uuid": val,
                "similarity_score": float(score),  # float32 -> float
                "column_differences": column_differences,
            })
        detailed_analysis[key] = cluster_details
    return detailed_analysis
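# Sketch of one entry in the returned dict (all values illustrative):
#
#   detailed_analysis["uuid-a"] == [{
#       "uuid": "uuid-b",
#       "similarity_score": 0.97,
#       "column_differences": {
#           "problem": {"base": "...", "comparison": "...", "match": False},
#       },
#   }]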
# ================== STREAMLIT UI ==================
st.title("Problem Deduplication Explorer")

st.sidebar.header("Settings")
similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)

if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        results = analyze_clusters(df, similarity_threshold)
    st.success("Analysis Complete!")

    st.subheader("Duplicate Problem Clusters")
    for base_uuid, cluster in results.items():
        base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
        st.markdown(f"### Problem: {base_problem}")
        for entry in cluster:
            similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
            st.write(f"**Similar to:** {similar_problem}")
            st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
            with st.expander("Show Column Differences"):
                st.json(entry["column_differences"])
            st.markdown("---")

    # Export results; the button lives inside this branch so that
    # `results` is guaranteed to exist when it is rendered.
    st.sidebar.download_button(
        label="Download Results as JSON",
        data=json.dumps(results, indent=2),
        file_name="deduplication_results.json",
        mime="application/json",
    )
# ================== DATAFRAME DISPLAY ==================
st.subheader("Explore the Dataset")
st.dataframe(df)
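# To run locally (assuming this file is saved as app.py and the zipped
# dataset exists at data/merged_dataset.csv.zip):
#
#   streamlit run app.py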