import re
import warnings

import streamlit as st
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# set_page_config must be the first Streamlit call in the script.
st.set_page_config(
    page_title="Java Code Clone Detector",
    page_icon="🔍",
    layout="wide"
)

warnings.filterwarnings("ignore")

# Model configuration
MODEL_NAME = "microsoft/codebert-base"
MAX_LENGTH = 512  # CodeBERT is RoBERTa-based and accepts at most 512 tokens
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


@st.cache_resource
def load_models():
    """Load the CodeBERT tokenizer and encoder once; Streamlit caches them across reruns."""
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
        return tokenizer, model
    except Exception as e:
        st.error(f"Failed to load models: {str(e)}")
        return None, None


tokenizer, code_model = load_models()
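# Note: the first run downloads the CodeBERT weights (on the order of 500 MB)
# from the Hugging Face Hub; later reruns reuse the cached in-memory objects.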


st.title("🔍 Java Code Clone Detector")
st.markdown("""
Compare two Java code snippets to detect potential clones using CodeBERT embeddings.
Scores close to 1 indicate nearly identical code; lower scores indicate increasingly different code.
""")

EXAMPLE_1 = """public class Hello {
    public static void main(String[] args) {
        System.out.println("Hello, World!");
    }
}"""

EXAMPLE_2 = """public class Greet {
    public static void main(String[] args) {
        System.out.println("Hello, World!");
    }
}"""


col1, col2 = st.columns(2)

with col1:
    code1 = st.text_area(
        "First Java Code",
        height=300,
        value=EXAMPLE_1,
        help="Enter the first Java code snippet"
    )

with col2:
    code2 = st.text_area(
        "Second Java Code",
        height=300,
        value=EXAMPLE_2,
        help="Enter the second Java code snippet"
    )

threshold = st.slider(
    "Clone Detection Threshold",
    min_value=0.5,
    max_value=1.0,
    value=0.85,
    step=0.01,
    help="Adjust the similarity threshold for clone detection"
)


def normalize_code(code):
    """Strip comments and collapse whitespace so formatting differences don't skew similarity."""
    try:
        code = re.sub(r'//.*', '', code)                        # line comments
        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)  # block comments
        code = re.sub(r'\s+', ' ', code).strip()                # collapse runs of whitespace
        return code
    except Exception:
        return code

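# Illustrative behavior (follows directly from the regexes above):
#   normalize_code("int x = 1; // counter")    -> "int x = 1;"
#   normalize_code("int  x =\n    1; /* c */") -> "int x = 1;"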


def get_embedding(code):
    """Encode a snippet with CodeBERT and mean-pool the token states into one vector."""
    try:
        code = normalize_code(code)
        # A single sequence needs no padding; padding to max_length would let
        # pad positions dilute the mean pooling below.
        inputs = tokenizer(
            code,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH
        ).to(DEVICE)

        with torch.no_grad():
            outputs = code_model(**inputs)

        # Average over the sequence dimension: (1, seq_len, hidden) -> (1, hidden)
        return outputs.last_hidden_state.mean(dim=1)
    except Exception as e:
        st.error(f"Error processing code: {str(e)}")
        return None

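# Alternative pooling (a sketch, not what the app uses): take the first token's
# hidden state, which for RoBERTa-style models is the <s>/[CLS] summary position.
#
#     def get_cls_embedding(code):
#         inputs = tokenizer(normalize_code(code), return_tensors="pt",
#                            truncation=True, max_length=MAX_LENGTH).to(DEVICE)
#         with torch.no_grad():
#             outputs = code_model(**inputs)
#         return outputs.last_hidden_state[:, 0, :]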


def compare_code(code1, code2):
    """Embed both snippets and return their cosine similarity, or None on failure."""
    if not code1 or not code2:
        return None

    with st.spinner('Analyzing code...'):
        emb1 = get_embedding(code1)
        emb2 = get_embedding(code2)

        if emb1 is None or emb2 is None:
            return None

        with torch.no_grad():
            similarity = F.cosine_similarity(emb1, emb2).item()

        return similarity

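# For (1, hidden) tensors, F.cosine_similarity(emb1, emb2) is equivalent
# (up to a small numerical eps) to
#     (emb1 @ emb2.T) / (emb1.norm() * emb2.norm())
# i.e. the dot product of the two embeddings after length normalization.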


if st.button("Compare Code", type="primary"):
    if tokenizer is None or code_model is None:
        st.error("Models failed to load. Please check the logs.")
    else:
        similarity = compare_code(code1, code2)

        if similarity is not None:
            st.subheader("Results")

            # st.progress expects a value in [0, 1]; cosine similarity can in
            # principle dip below 0, so clamp before displaying.
            st.progress(min(max(similarity, 0.0), 1.0))

            col1, col2, col3 = st.columns(3)

            with col1:
                st.metric("Similarity Score", f"{similarity:.3f}")

            with col2:
                st.metric("Threshold", f"{threshold:.3f}")

            with col3:
                is_clone = similarity >= threshold
                st.metric(
                    "Clone Detection",
                    "✅ Clone" if is_clone else "❌ Not a Clone",
                    delta=f"{similarity - threshold:+.3f}"
                )

            # Interpretation bands follow the usual clone taxonomy: Type-1 is
            # identical up to whitespace/comments, Type-2 adds renamed
            # identifiers, Type-3 adds statement-level edits.
            if similarity > 0.95:
                st.success("The code snippets are nearly identical (potential Type-1 clone)")
            elif similarity > 0.85:
                st.success("The code snippets are very similar (potential Type-2 clone)")
            elif similarity > 0.7:
                st.warning("The code snippets show some similarity (potential Type-3 clone)")
            else:
                st.info("The code snippets are significantly different")

            with st.expander("Show normalized code"):
                tab1, tab2 = st.tabs(["First Code", "Second Code"])

                with tab1:
                    st.code(normalize_code(code1), language="java")

                with tab2:
                    st.code(normalize_code(code2), language="java")


st.markdown("---")
st.markdown("""
**How it works**:
1. Code is normalized (comments removed, whitespace standardized)
2. CodeBERT generates an embedding for each snippet
3. Cosine similarity is calculated between the two embeddings
4. The score is compared against your threshold
""")