# NOTE(review): this header contained extraction residue (a byte-count line,
# git-blame commit hashes, and a line-number gutter) — not Python source; removed.
import streamlit as st
import javalang
import torch
import torch.nn.functional as F
import re
from transformers import AutoTokenizer, AutoModel
import warnings
import pandas as pd
import zipfile
import os
# --- Page configuration -------------------------------------------------
# NOTE: st.set_page_config must be the first Streamlit command in the script.
st.set_page_config(
    page_title="Java Code Clone Detector (IJaDataset 2.1)",
    page_icon="π",  # NOTE(review): "π" looks like a mojibake'd emoji — confirm intended icon
    layout="wide"
)
# Silence library warnings (e.g. transformers/torch deprecation notices) in the UI.
warnings.filterwarnings("ignore")

# --- Constants ----------------------------------------------------------
MODEL_NAME = "microsoft/codebert-base"  # HF checkpoint used for code embeddings
MAX_LENGTH = 512                        # CodeBERT max sequence length (tokens)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATASET_PATH = "archive (1).zip"        # zipped IJaDataset subset, relative to CWD
# Initialize models with caching
@st.cache_resource
def load_models():
    """Load the CodeBERT tokenizer and encoder once per Streamlit session.

    Returns:
        (tokenizer, model) on success, or (None, None) if loading fails;
        the failure is surfaced in the UI via st.error.
    """
    try:
        tok = AutoTokenizer.from_pretrained(MODEL_NAME)
        encoder = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
    except Exception as exc:
        st.error(f"Failed to load models: {str(exc)}")
        return None, None
    return tok, encoder
@st.cache_resource
def load_dataset():
    """Extract the zipped IJaDataset subset (once) and load sample clone pairs.

    For each clone-type directory, the first folder containing at least two
    files contributes one (code1, code2) pair.

    Returns:
        A list (at most 10 entries) of dicts with keys "type", "code1",
        "code2". On a fatal error an st.error is shown and [] is returned.
    """
    try:
        if not os.path.exists("Subject_CloneTypes_Directories"):
            with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
                zip_ref.extractall(".")

        clone_pairs = []
        base_path = "Subject_CloneTypes_Directories"
        for clone_type in ["Clone_Type1", "Clone_Type2", "Clone_Type3 - ST"]:
            type_path = os.path.join(base_path, clone_type)
            if not os.path.exists(type_path):
                continue
            for root, _, files in os.walk(type_path):
                if len(files) < 2:
                    continue
                # BUGFIX: os.walk yields files in arbitrary order; sort so the
                # chosen pair is deterministic across runs/platforms.
                first, second = sorted(files)[:2]
                try:
                    with open(os.path.join(root, first), 'r',
                              encoding='utf-8', errors='replace') as f1:
                        code1 = f1.read()
                    with open(os.path.join(root, second), 'r',
                              encoding='utf-8', errors='replace') as f2:
                        code2 = f2.read()
                except OSError:
                    # BUGFIX: previously one unreadable file aborted the whole
                    # dataset load; now we just try the next folder.
                    continue
                clone_pairs.append({
                    "type": clone_type,
                    "code1": code1,
                    "code2": code2
                })
                break  # one example pair per clone type is enough
        return clone_pairs[:10]
    except Exception as e:
        st.error(f"Error loading dataset: {str(e)}")
        return []
# Load the (cached) model and dataset pairs at import time so the
# widgets defined below can reference them.
tokenizer, code_model = load_models()
dataset_pairs = load_dataset()
def normalize_code(code):
    """Strip Java comments and collapse all whitespace to single spaces.

    Best-effort: if anything goes wrong the input is returned unchanged.
    Note this is a regex pass, not a parser, so // and /* ... */ sequences
    inside string literals are stripped as well.
    """
    try:
        no_line_comments = re.sub(r'//.*', '', code)
        no_comments = re.sub(r'/\*.*?\*/', '', no_line_comments,
                             flags=re.DOTALL)
        return re.sub(r'\s+', ' ', no_comments).strip()
    except Exception:
        return code
def get_embedding(code):
    """Encode a Java snippet into a single CodeBERT vector.

    The snippet is normalized, tokenized (truncated/padded to MAX_LENGTH),
    and passed through the encoder; token embeddings are mean-pooled into
    one (1, hidden) tensor.

    Returns:
        The pooled tensor, or None on failure (error shown via st.error).
    """
    try:
        cleaned = normalize_code(code)
        encoded = tokenizer(
            cleaned,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
            padding='max_length'
        ).to(DEVICE)
        with torch.no_grad():
            hidden = code_model(**encoded)
        # Mean-pool over the sequence dimension -> one vector per snippet.
        return hidden.last_hidden_state.mean(dim=1)
    except Exception as e:
        st.error(f"Error processing code: {str(e)}")
        return None
def compare_code(code1, code2):
    """Return the cosine similarity of two snippets, or None if unavailable.

    None is returned when either input is empty or either embedding fails.
    """
    if not (code1 and code2):
        return None
    with st.spinner('Analyzing code...'):
        first = get_embedding(code1)
        second = get_embedding(code2)
        if first is None or second is None:
            return None
        with torch.no_grad():
            return F.cosine_similarity(first, second).item()
# --- UI elements --------------------------------------------------------
st.title("π Java Code Clone Detector (IJaDataset 2.1)")  # NOTE(review): "π" looks like a mojibake'd emoji — confirm intended icon
st.markdown("Compare Java code snippets from the IJaDataset 2.1 using CodeBERT embeddings.")

# Dataset selector: offer preloaded example pairs (when the dataset loaded)
# to prefill the two editors below.
selected_pair = None
if dataset_pairs:
    pair_options = {f"{i+1}: {pair['type']}": pair for i, pair in enumerate(dataset_pairs)}
    selected_option = st.selectbox("Select a preloaded example pair:", list(pair_options.keys()))
    selected_pair = pair_options[selected_option]

# Layout: two side-by-side editors for the snippets being compared.
col1, col2 = st.columns(2)
with col1:
    code1 = st.text_area(
        "First Java Code",
        height=300,
        value=selected_pair["code1"] if selected_pair else "",
        help="Enter the first Java code snippet"
    )
with col2:
    code2 = st.text_area(
        "Second Java Code",
        height=300,
        value=selected_pair["code2"] if selected_pair else "",
        help="Enter the second Java code snippet"
    )

# Similarity score at or above which the pair is reported as a clone.
threshold = st.slider(
    "Clone Detection Threshold",
    min_value=0.50,
    max_value=1.00,
    value=0.75,
    step=0.01,
    help="Similarity score needed to consider code as cloned (0.5-1.0)"
)
# Only perform comparison when the button is clicked.
if st.button("Compare Code"):
    similarity = compare_code(code1, code2)
    if similarity is not None:
        is_clone = similarity >= threshold
        st.subheader("Results")
        cols = st.columns(3)
        cols[0].metric("Similarity Score", f"{similarity:.3f}")
        cols[1].metric("Current Threshold", f"{threshold:.3f}")
        cols[2].metric(
            "Verdict",
            # BUGFIX: the original literal was mojibake ("β") broken across a
            # raw newline inside the string (a SyntaxError as written);
            # restored the intended labels and the ">=" glyph in the help text.
            "✅ CLONE" if is_clone else "❌ NOT CLONE",
            delta=f"{similarity-threshold:+.3f}",
            help=f"Score {'≥' if is_clone else '<'} threshold"
        )
        # BUGFIX: st.progress requires a value in [0, 1]; cosine similarity
        # can be negative, so clamp before displaying.
        st.progress(max(0.0, min(1.0, similarity)))
        with st.expander("Interpretation Guide"):
            st.markdown("""
            - **> 0.95**: Nearly identical (Type-1 clone)
            - **0.85-0.95**: Very similar (Type-2 clone)
            - **0.70-0.85**: Similar structure (Type-3 clone)
            - **< 0.70**: Different code
            """)
        with st.expander("Show normalized code"):
            tab1, tab2 = st.tabs(["First Code", "Second Code"])
            with tab1:
                st.code(normalize_code(code1))
            with tab2:
                st.code(normalize_code(code2))
# --- Footer -------------------------------------------------------------
st.markdown("---")
st.markdown("""
**Dataset Information**:
- Using IJaDataset 2.1 from Kaggle
- Contains 100K Java files with clone annotations
- Clone types: Type-1, Type-2, and Type-3 clones
""")
# BUGFIX: removed a stray trailing "|" token (text-extraction residue) that
# followed the closing parenthesis and made the file a SyntaxError.