import streamlit as st
import javalang
import torch
import torch.nn.functional as F
import re
from transformers import AutoTokenizer, AutoModel
import warnings
import pandas as pd
import zipfile
import os

# Set up page config (kept as the first Streamlit call in the script).
st.set_page_config(
    page_title="Java Code Clone Detector (IJaDataset 2.1)",
    page_icon="🔍",
    layout="wide"
)

# Suppress warnings
warnings.filterwarnings("ignore")

# Constants
MODEL_NAME = "microsoft/codebert-base"
MAX_LENGTH = 512
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATASET_PATH = "archive (1).zip"
DATASET_ROOT = "Subject_CloneTypes_Directories"
CLONE_TYPE_DIRS = ("Clone_Type1", "Clone_Type2", "Clone_Type3 - ST")

# One left-to-right scan over literals and comments. Matching string/char
# literals first means a "//" or "/*" inside a literal is never mistaken for
# the start of a comment, and a "//" inside a block comment cannot eat its
# closing "*/".
_JAVA_COMMENT_OR_LITERAL = re.compile(
    r'"(?:\\.|[^"\\\n])*"'      # string literal
    r"|'(?:\\.|[^'\\\n])*'"     # char literal
    r"|//[^\n]*"                # line comment
    r"|/\*.*?\*/",              # block comment
    re.DOTALL,
)
_WHITESPACE_RUN = re.compile(r"\s+")


# Initialize models with caching
@st.cache_resource
def load_models():
    """Load the CodeBERT tokenizer and model onto ``DEVICE``.

    Returns:
        ``(tokenizer, model)`` on success. On any failure the error is shown
        with ``st.error`` and ``(None, None)`` is returned.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
        return tokenizer, model
    except Exception as e:
        st.error(f"Failed to load models: {str(e)}")
        return None, None


def _read_source(path):
    """Return the text of *path*, replacing bytes that are not valid UTF-8."""
    # errors="replace": a single undecodable byte should not discard every
    # preloaded example.
    with open(path, 'r', encoding='utf-8', errors='replace') as handle:
        return handle.read()


@st.cache_resource
def load_dataset():
    """Collect a few example clone pairs from the extracted dataset.

    Extracts ``DATASET_PATH`` into the working directory when the dataset
    folder is missing, then takes one pair per clone-type directory: the
    first two files (in sorted name order) of the first walked directory
    that holds at least two files.

    Returns:
        A list of at most 10 dicts with keys ``"type"``, ``"code1"`` and
        ``"code2"``. On any failure the error is shown with ``st.error``
        and an empty list is returned.
    """
    try:
        if not os.path.exists(DATASET_ROOT):
            with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
                zip_ref.extractall(".")

        clone_pairs = []
        for clone_type in CLONE_TYPE_DIRS:
            type_path = os.path.join(DATASET_ROOT, clone_type)
            if not os.path.exists(type_path):
                continue
            for root, _, files in os.walk(type_path):
                if len(files) < 2:
                    continue
                # os.walk yields names in arbitrary order; sort so the same
                # pair is picked on every machine.
                first, second = sorted(files)[:2]
                clone_pairs.append({
                    "type": clone_type,
                    "code1": _read_source(os.path.join(root, first)),
                    "code2": _read_source(os.path.join(root, second)),
                })
                # One example per clone type is enough.
                break
        return clone_pairs[:10]
    except Exception as e:
        st.error(f"Error loading dataset: {str(e)}")
        return []


tokenizer, code_model = load_models()
dataset_pairs = load_dataset()


def _drop_comment(match):
    """Return '' for a comment match and the text unchanged for a literal."""
    token = match.group(0)
    return "" if token.startswith("/") else token


def normalize_code(code):
    """Strip Java comments from *code* and collapse whitespace runs.

    Comment markers inside string or character literals are left alone.
    All whitespace runs (including those inside literals) are collapsed to a
    single space and the result is stripped.

    Returns:
        The normalized string, or *code* unchanged when it is not a ``str``.
    """
    try:
        without_comments = _JAVA_COMMENT_OR_LITERAL.sub(_drop_comment, code)
        return _WHITESPACE_RUN.sub(" ", without_comments).strip()
    except TypeError:
        # Non-string input: keep the original best-effort pass-through.
        return code


def get_embedding(code):
    """Embed *code* with CodeBERT using attention-mask-weighted mean pooling.

    The input is normalized, tokenized (truncated and padded to
    ``MAX_LENGTH``) and run through the model without gradients.

    Returns:
        A tensor with one pooled row per input (a single row here), or
        ``None`` after reporting the failure with ``st.error``.
    """
    try:
        code = normalize_code(code)
        inputs = tokenizer(
            code,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
            padding='max_length'
        ).to(DEVICE)

        with torch.no_grad():
            outputs = code_model(**inputs)

        hidden = outputs.last_hidden_state
        # Inputs are padded to MAX_LENGTH, so a plain mean over dim=1 would
        # average in the padding positions. Weight by the attention mask so
        # only real tokens contribute.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_count = mask.sum(dim=1).clamp(min=1)
        return (hidden * mask).sum(dim=1) / token_count
    except Exception as e:
        st.error(f"Error processing code: {str(e)}")
        return None


def compare_code(code1, code2):
    """Return the cosine similarity of the two snippets' embeddings.

    Returns:
        A float similarity, or ``None`` when either snippet is empty or
        either embedding could not be computed.
    """
    if not code1 or not code2:
        return None

    with st.spinner('Analyzing code...'):
        emb1 = get_embedding(code1)
        emb2 = get_embedding(code2)

        if emb1 is None or emb2 is None:
            return None

        with torch.no_grad():
            similarity = F.cosine_similarity(emb1, emb2).item()

        return similarity


# UI Elements
st.title("🔍 Java Code Clone Detector (IJaDataset 2.1)")
st.markdown("Compare Java code snippets from the IJaDataset 2.1 using CodeBERT embeddings.")

# Dataset selector
selected_pair = None
if dataset_pairs:
    pair_options = {
        f"{i+1}: {pair['type']}": pair
        for i, pair in enumerate(dataset_pairs)
    }
    selected_option = st.selectbox(
        "Select a preloaded example pair:",
        list(pair_options.keys())
    )
    selected_pair = pair_options[selected_option]

# Layout
col1, col2 = st.columns(2)

with col1:
    code1 = st.text_area(
        "First Java Code",
        height=300,
        value=selected_pair["code1"] if selected_pair else "",
        help="Enter the first Java code snippet"
    )

with col2:
    code2 = st.text_area(
        "Second Java Code",
        height=300,
        value=selected_pair["code2"] if selected_pair else "",
        help="Enter the second Java code snippet"
    )

threshold = st.slider(
    "Clone Detection Threshold",
    min_value=0.50,
    max_value=1.00,
    value=0.75,
    step=0.01,
    help="Similarity score needed to consider code as cloned (0.5-1.0)"
)

# Only perform comparison when button is clicked
if st.button("Compare Code"):
    similarity = compare_code(code1, code2)

    if similarity is not None:
        is_clone = similarity >= threshold

        st.subheader("Results")
        cols = st.columns(3)
        cols[0].metric("Similarity Score", f"{similarity:.3f}")
        cols[1].metric("Current Threshold", f"{threshold:.3f}")
        cols[2].metric(
            "Verdict",
            "✅ CLONE" if is_clone else "❌ NOT CLONE",
            delta=f"{similarity-threshold:+.3f}",
            help=f"Score {'≥' if is_clone else '<'} threshold"
        )

        # Cosine similarity can be negative or, through float rounding,
        # marginally above 1.0; st.progress rejects floats outside [0, 1].
        st.progress(min(max(similarity, 0.0), 1.0))

        with st.expander("Interpretation Guide"):
            st.markdown(
                "- **> 0.95**: Nearly identical (Type-1 clone)\n"
                "- **0.85-0.95**: Very similar (Type-2 clone)\n"
                "- **0.70-0.85**: Similar structure (Type-3 clone)\n"
                "- **< 0.70**: Different code"
            )

        with st.expander("Show normalized code"):
            tab1, tab2 = st.tabs(["First Code", "Second Code"])
            with tab1:
                st.code(normalize_code(code1))
            with tab2:
                st.code(normalize_code(code2))

st.markdown("---")
st.markdown(
    "**Dataset Information**:\n"
    "- Using IJaDataset 2.1 from Kaggle\n"
    "- Contains 100K Java files with clone annotations\n"
    "- Clone types: Type-1, Type-2, and Type-3 clones"
)