import streamlit as st
import torch
import torch.nn.functional as F
import re
from transformers import AutoTokenizer, AutoModel
import warnings
import zipfile
import os
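
# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py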

# Set up page config
st.set_page_config(
    page_title="Java Code Clone Detector (IJaDataset 2.1)",
    page_icon="πŸ”",
    layout="wide"
)

# Suppress warnings
warnings.filterwarnings("ignore")

# Constants
MODEL_NAME = "microsoft/codebert-base"
MAX_LENGTH = 512  # CodeBERT's maximum input length in tokens
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATASET_PATH = "archive (1).zip"  # Kaggle download of IJaDataset 2.1, expected in the working directory

# Initialize models with caching
@st.cache_resource
def load_models():
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
        return tokenizer, model
    except Exception as e:
        st.error(f"Failed to load models: {str(e)}")
        return None, None

@st.cache_resource
def load_dataset():
    try:
        if not os.path.exists("Subject_CloneTypes_Directories"):
            with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
                zip_ref.extractall(".")
        
        clone_pairs = []
        base_path = "Subject_CloneTypes_Directories"
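        # Assumes the Kaggle archive layout: one directory per clone type,
        # each holding subfolders of related Java files.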
        
        for clone_type in ["Clone_Type1", "Clone_Type2", "Clone_Type3 - ST"]:
            type_path = os.path.join(base_path, clone_type)
            if os.path.exists(type_path):
                # Take the first subdirectory with at least two files and
                # pair its first two (sorted) files as the example for this type.
                for root, _, files in os.walk(type_path):
                    if len(files) >= 2:
                        files = sorted(files)
                        with open(os.path.join(root, files[0]), 'r', encoding='utf-8', errors='ignore') as f1:
                            code1 = f1.read()
                        with open(os.path.join(root, files[1]), 'r', encoding='utf-8', errors='ignore') as f2:
                            code2 = f2.read()
                        clone_pairs.append({
                            "type": clone_type,
                            "code1": code1,
                            "code2": code2
                        })
                        break
        
        return clone_pairs[:10]  # cap the number of preloaded examples
    except Exception as e:
        st.error(f"Error loading dataset: {str(e)}")
        return []

tokenizer, code_model = load_models()
if tokenizer is None or code_model is None:
    st.stop()  # load_models already reported the error
dataset_pairs = load_dataset()
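
# Pipeline: normalized Java source -> CodeBERT token embeddings ->
# masked mean pooling -> cosine similarity against a user-chosen threshold.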

def normalize_code(code):
    try:
        # Strip block comments before line comments: a '//' inside /* ... */
        # would otherwise delete the closing '*/' and break block removal.
        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
        code = re.sub(r'//.*', '', code)
        code = re.sub(r'\s+', ' ', code).strip()
        return code
    except Exception:
        return code

def get_embedding(code):
    try:
        code = normalize_code(code)
        inputs = tokenizer(
            code,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
            padding='max_length'
        ).to(DEVICE)
        
        with torch.no_grad():
            outputs = code_model(**inputs)
        
        # Mean-pool only over real tokens: with padding='max_length', a plain
        # mean over dim=1 would be dominated by [PAD] embeddings.
        mask = inputs['attention_mask'].unsqueeze(-1).float()
        summed = (outputs.last_hidden_state * mask).sum(dim=1)
        return summed / mask.sum(dim=1).clamp(min=1e-9)
    except Exception as e:
        st.error(f"Error processing code: {str(e)}")
        return None

def compare_code(code1, code2):
    if not code1 or not code2:
        return None
    
    with st.spinner('Analyzing code...'):
        emb1 = get_embedding(code1)
        emb2 = get_embedding(code2)
        
        if emb1 is None or emb2 is None:
            return None
        
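        # Cosine similarity of the pooled embeddings: values near 1.0 mean
        # the two snippets are semantically close.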
        with torch.no_grad():
            similarity = F.cosine_similarity(emb1, emb2).item()
        
        return similarity

# UI Elements
st.title("πŸ” Java Code Clone Detector (IJaDataset 2.1)")
st.markdown("Compare Java code snippets from the IJaDataset 2.1 using CodeBERT embeddings.")

# Dataset selector
selected_pair = None
if dataset_pairs:
    pair_options = {f"{i+1}: {pair['type']}": pair for i, pair in enumerate(dataset_pairs)}
    selected_option = st.selectbox("Select a preloaded example pair:", list(pair_options.keys()))
    selected_pair = pair_options[selected_option]

# Layout
col1, col2 = st.columns(2)

with col1:
    code1 = st.text_area(
        "First Java Code", 
        height=300,
        value=selected_pair["code1"] if selected_pair else "",
        help="Enter the first Java code snippet"
    )

with col2:
    code2 = st.text_area(
        "Second Java Code", 
        height=300,
        value=selected_pair["code2"] if selected_pair else "",
        help="Enter the second Java code snippet"
    )

threshold = st.slider(
    "Clone Detection Threshold",
    min_value=0.50,
    max_value=1.00,
    value=0.75,
    step=0.01,
    help="Similarity score needed to consider code as cloned (0.5-1.0)"
)

# Only perform comparison when button is clicked
if st.button("Compare Code"):
    similarity = compare_code(code1, code2)
    
    if similarity is not None:
        is_clone = similarity >= threshold
        
        st.subheader("Results")
        cols = st.columns(3)
        cols[0].metric("Similarity Score", f"{similarity:.3f}")
        cols[1].metric("Current Threshold", f"{threshold:.3f}")
        cols[2].metric(
            "Verdict", 
            "βœ… CLONE" if is_clone else "❌ NOT CLONE",
            delta=f"{similarity-threshold:+.3f}",
            help=f"Score {'β‰₯' if is_clone else '<'} threshold"
        )
        
        # st.progress expects a value in [0.0, 1.0]; cosine similarity can be
        # slightly negative, so clamp before displaying.
        st.progress(max(0.0, min(1.0, similarity)))
        
        with st.expander("Interpretation Guide"):
            st.markdown("""
            - **> 0.95**: Nearly identical (Type-1 clone)
            - **0.85-0.95**: Very similar (Type-2 clone) 
            - **0.70-0.85**: Similar structure (Type-3 clone)
            - **< 0.70**: Different code
            """)

        with st.expander("Show normalized code"):
            tab1, tab2 = st.tabs(["First Code", "Second Code"])
            with tab1:
                st.code(normalize_code(code1))
            with tab2:
                st.code(normalize_code(code2))

st.markdown("---")
st.markdown("""
**Dataset Information**:
- Using IJaDataset 2.1 from Kaggle
- Contains 100K Java files with clone annotations
- Clone types: Type-1, Type-2, and Type-3 clones
""")