import streamlit as st
import torch
import torch.nn.functional as F
import re
from transformers import AutoTokenizer, AutoModel
import warnings
import zipfile
import os
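
# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py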

# Set up page config
st.set_page_config(
    page_title="Java Code Clone Detector (IJaDataset 2.1)",
    page_icon="πŸ”",
    layout="wide"
)

# Suppress warnings
warnings.filterwarnings("ignore")

# Constants
MODEL_NAME = "microsoft/codebert-base"
MAX_LENGTH = 512  # CodeBERT's maximum input length in tokens
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATASET_PATH = "archive (1).zip"  # Kaggle download of IJaDataset 2.1, expected in the working directory

# Initialize models with caching
@st.cache_resource
def load_models():
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
        return tokenizer, model
    except Exception as e:
        st.error(f"Failed to load models: {str(e)}")
        return None, None

@st.cache_resource
def load_dataset():
    try:
        if not os.path.exists("Subject_CloneTypes_Directories"):
            with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
                zip_ref.extractall(".")
        
        clone_pairs = []
        base_path = "Subject_CloneTypes_Directories"
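        # Assumes the Kaggle archive layout: one directory per clone type,
        # each holding subfolders of related Java files.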
        
        for clone_type in ["Clone_Type1", "Clone_Type2", "Clone_Type3 - ST"]:
            type_path = os.path.join(base_path, clone_type)
            if os.path.exists(type_path):
                # Take the first subdirectory with at least two files and
                # pair its first two (sorted) files as the example for this type.
                for root, _, files in os.walk(type_path):
                    if len(files) >= 2:
                        files = sorted(files)
                        with open(os.path.join(root, files[0]), 'r', encoding='utf-8', errors='ignore') as f1:
                            code1 = f1.read()
                        with open(os.path.join(root, files[1]), 'r', encoding='utf-8', errors='ignore') as f2:
                            code2 = f2.read()
                        clone_pairs.append({
                            "type": clone_type,
                            "code1": code1,
                            "code2": code2
                        })
                        break
        
        return clone_pairs[:10]  # cap the number of preloaded examples
    except Exception as e:
        st.error(f"Error loading dataset: {str(e)}")
        return []

tokenizer, code_model = load_models()
if tokenizer is None or code_model is None:
    st.stop()  # load_models already reported the error
dataset_pairs = load_dataset()
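
# Pipeline: normalized Java source -> CodeBERT token embeddings ->
# masked mean pooling -> cosine similarity against a user-chosen threshold.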

def normalize_code(code):
    try:
        # Strip block comments before line comments: a '//' inside /* ... */
        # would otherwise delete the closing '*/' and break block removal.
        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
        code = re.sub(r'//.*', '', code)
        code = re.sub(r'\s+', ' ', code).strip()
        return code
    except Exception:
        return code

def get_embedding(code):
    try:
        code = normalize_code(code)
        inputs = tokenizer(
            code,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
            padding='max_length'
        ).to(DEVICE)
        
        with torch.no_grad():
            outputs = code_model(**inputs)
        
        # Mean-pool only over real tokens: with padding='max_length', a plain
        # mean over dim=1 would be dominated by [PAD] embeddings.
        mask = inputs['attention_mask'].unsqueeze(-1).float()
        summed = (outputs.last_hidden_state * mask).sum(dim=1)
        return summed / mask.sum(dim=1).clamp(min=1e-9)
    except Exception as e:
        st.error(f"Error processing code: {str(e)}")
        return None

def compare_code(code1, code2):
    if not code1 or not code2:
        return None
    
    with st.spinner('Analyzing code...'):
        emb1 = get_embedding(code1)
        emb2 = get_embedding(code2)
        
        if emb1 is None or emb2 is None:
            return None
        
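        # Cosine similarity of the pooled embeddings: values near 1.0 mean
        # the two snippets are semantically close.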
        with torch.no_grad():
            similarity = F.cosine_similarity(emb1, emb2).item()
        
        return similarity

# UI Elements
st.title("πŸ” Java Code Clone Detector (IJaDataset 2.1)")
st.markdown("Compare Java code snippets from the IJaDataset 2.1 using CodeBERT embeddings.")

# Dataset selector
selected_pair = None
if dataset_pairs:
    pair_options = {f"{i+1}: {pair['type']}": pair for i, pair in enumerate(dataset_pairs)}
    selected_option = st.selectbox("Select a preloaded example pair:", list(pair_options.keys()))
    selected_pair = pair_options[selected_option]

# Layout
col1, col2 = st.columns(2)

with col1:
    code1 = st.text_area(
        "First Java Code", 
        height=300,
        value=selected_pair["code1"] if selected_pair else "",
        help="Enter the first Java code snippet"
    )

with col2:
    code2 = st.text_area(
        "Second Java Code", 
        height=300,
        value=selected_pair["code2"] if selected_pair else "",
        help="Enter the second Java code snippet"
    )

threshold = st.slider(
    "Clone Detection Threshold",
    min_value=0.50,
    max_value=1.00,
    value=0.75,
    step=0.01,
    help="Similarity score needed to consider code as cloned (0.5-1.0)"
)

# Only perform comparison when button is clicked
if st.button("Compare Code"):
    similarity = compare_code(code1, code2)
    
    if similarity is not None:
        is_clone = similarity >= threshold
        
        st.subheader("Results")
        cols = st.columns(3)
        cols[0].metric("Similarity Score", f"{similarity:.3f}")
        cols[1].metric("Current Threshold", f"{threshold:.3f}")
        cols[2].metric(
            "Verdict", 
            "βœ… CLONE" if is_clone else "❌ NOT CLONE",
            delta=f"{similarity-threshold:+.3f}",
            help=f"Score {'β‰₯' if is_clone else '<'} threshold"
        )
        
        # st.progress expects a value in [0.0, 1.0]; cosine similarity can be
        # slightly negative, so clamp before displaying.
        st.progress(max(0.0, min(1.0, similarity)))
        
        with st.expander("Interpretation Guide"):
            st.markdown("""
            - **> 0.95**: Nearly identical (Type-1 clone)
            - **0.85-0.95**: Very similar (Type-2 clone) 
            - **0.70-0.85**: Similar structure (Type-3 clone)
            - **< 0.70**: Different code
            """)

        with st.expander("Show normalized code"):
            tab1, tab2 = st.tabs(["First Code", "Second Code"])
            with tab1:
                st.code(normalize_code(code1))
            with tab2:
                st.code(normalize_code(code2))

st.markdown("---")
st.markdown("""
**Dataset Information**:
- Using IJaDataset 2.1 from Kaggle
- Contains 100K Java files with clone annotations
- Clone types: Type-1, Type-2, and Type-3 clones
""")