rahideer committed on
Commit
49a4932
·
verified ·
1 Parent(s): 38e7b4d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +216 -0
app.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import javalang
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import re
6
+ from transformers import AutoTokenizer, AutoModel
7
+ import warnings
8
+ import pandas as pd
9
+ import zipfile
10
+ import os
11
+
12
# Configure the browser tab title, icon and page layout.
st.set_page_config(
    page_title="Java Code Clone Detector (IJaDataset 2.1)",
    # The icon was a mis-encoded (mojibake) byte sequence rather than an emoji;
    # restored to the magnifying-glass emoji it was meant to be.
    page_icon="🔍",
    layout="wide",
)

# Suppress warnings
warnings.filterwarnings("ignore")

# Constants
MODEL_NAME = "microsoft/codebert-base"  # checkpoint loaded by load_models()
MAX_LENGTH = 512  # tokenizer truncation/padding length used by get_embedding()
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATASET_PATH = "ijadataset2-1.zip"  # Update this path if needed
27
+
28
+ # Initialize models with caching
29
@st.cache_resource
def _load_models_cached():
    """Load the tokenizer and encoder named by MODEL_NAME, raising on failure.

    Kept separate from load_models() so that a failed load leaves this cached
    function by exception instead of being stored as a cached return value.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
    return tokenizer, model


def load_models():
    """Return ``(tokenizer, model)`` for MODEL_NAME, moved to DEVICE.

    Returns:
        A ``(tokenizer, model)`` tuple on success. On any loading error the
        error is shown with ``st.error`` and ``(None, None)`` is returned.

    The successful result is cached by ``_load_models_cached``. The
    ``(None, None)`` failure result is produced outside the cached function,
    so a transient failure (e.g. a network error while downloading the
    checkpoint) is retried on the next script run rather than being served
    from the cache.
    """
    try:
        return _load_models_cached()
    except Exception as e:
        st.error(f"Failed to load models: {str(e)}")
        return None, None
38
+
39
@st.cache_resource
def load_dataset():
    """Load a small set of example clone pairs from the extracted dataset.

    Extracts DATASET_PATH into the current directory when the
    ``Diverse_100K_Dataset`` directory is not present, then, for each known
    clone-type directory, takes the first two files of the first directory
    (in sorted order) that holds at least two files.

    Returns:
        A list (at most 10 entries) of dicts with keys ``"type"``, ``"code1"``
        and ``"code2"``. On any error the error is shown with ``st.error`` and
        an empty list is returned.
    """
    try:
        # Extract dataset if needed
        if not os.path.exists("Diverse_100K_Dataset"):
            with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
                zip_ref.extractall(".")

        # Load sample pairs (modify this based on your dataset structure)
        clone_pairs = []
        base_path = "Diverse_100K_Dataset/Subject_CloneTypes_Directories"

        # Example: Load one pair from each clone type
        for clone_type in ["Clone_Type1", "Clone_Type2", "Clone_Type3 - ST"]:
            type_path = os.path.join(base_path, clone_type)
            if not os.path.exists(type_path):
                continue

            for root, dirs, files in os.walk(type_path):
                # os.walk yields entries in arbitrary filesystem order; sorting
                # (dirs in place, so the traversal itself is ordered) makes the
                # chosen demo pair the same on every machine.
                dirs.sort()
                files = sorted(files)

                # Take first two files as a pair
                if len(files) < 2:
                    continue

                sources = []
                for file_name in files[:2]:
                    # errors="replace": a single non-UTF-8 source file should
                    # not abort loading of the whole example set.
                    with open(os.path.join(root, file_name), 'r',
                              encoding='utf-8', errors='replace') as handle:
                        sources.append(handle.read())

                clone_pairs.append({
                    "type": clone_type,
                    "code1": sources[0],
                    "code2": sources[1]
                })
                break  # Just take one pair per type for demo

        return clone_pairs[:10]  # Return first 10 pairs for demo

    except Exception as e:
        st.error(f"Error loading dataset: {str(e)}")
        return []
75
+
76
# Resolve the shared resources at script start. Both loaders are backed by
# st.cache_resource, so Streamlit reruns reuse the already-loaded objects.
(tokenizer, code_model) = load_models()
dataset_pairs = load_dataset()
78
+
79
+ # Normalization function
80
# One alternation, tried left to right at each position: string literal,
# char literal, line comment, block comment. Matching literals first stops
# comment markers that appear inside quotes from being treated as comments.
_JAVA_LITERAL_OR_COMMENT = re.compile(
    r'"(?:\\.|[^"\\\n])*"'
    r"|'(?:\\.|[^'\\\n])*'"
    r"|//[^\n]*"
    r"|/\*.*?\*/",
    re.DOTALL,
)


def normalize_code(code):
    """Strip Java comments from *code* and collapse all whitespace runs.

    Args:
        code: Java source text.

    Returns:
        The source with ``//`` and ``/* ... */`` comments removed, every run of
        whitespace replaced by a single space, and leading/trailing whitespace
        stripped. If *code* is not a string it is returned unchanged.

    String and character literals are left intact, so text such as
    ``"http://host"`` is not truncated at the ``//``. Each comment is replaced
    by a space rather than nothing so that tokens separated only by a comment
    (``int/* c */x``) do not fuse together.
    """
    def _drop_comment(match):
        token = match.group(0)
        # Only the two comment alternatives can start with "/".
        return " " if token.startswith("/") else token

    try:
        code = _JAVA_LITERAL_OR_COMMENT.sub(_drop_comment, code)
        code = re.sub(r'\s+', ' ', code).strip()  # Normalize whitespace
        return code
    except TypeError:
        # Non-string input (e.g. None): hand it back untouched.
        return code
88
+
89
+ # Embedding generation
90
def get_embedding(code):
    """Return a mean-pooled embedding tensor for a Java snippet.

    The snippet is normalized with ``normalize_code``, tokenized (truncated and
    padded to MAX_LENGTH), run through ``code_model`` without gradients, and
    the token vectors are averaged.

    Args:
        code: Java source text.

    Returns:
        The pooled tensor (sequence dimension reduced away), or ``None`` if
        anything fails; the failure is shown with ``st.error``.

    Only positions whose attention mask is 1 contribute to the average.
    Because the input is padded to MAX_LENGTH, a plain ``mean(dim=1)`` would
    average over the padding positions as well, diluting short snippets.
    NOTE(review): similarity scores are lower and more spread out with masked
    pooling, so the default clone threshold may need recalibrating.
    """
    try:
        code = normalize_code(code)
        inputs = tokenizer(
            code,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
            padding='max_length'
        ).to(DEVICE)

        with torch.no_grad():
            outputs = code_model(**inputs)

        hidden = outputs.last_hidden_state
        # Broadcast the (batch, seq) mask over the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        # clamp guards against division by zero if every position is masked.
        token_count = mask.sum(dim=1).clamp(min=1.0)
        return token_sum / token_count  # Pooled embedding
    except Exception as e:
        st.error(f"Error processing code: {str(e)}")
        return None
108
+
109
+ # Comparison function
110
def compare_code(code1, code2):
    """Return the cosine similarity of two snippets' embeddings.

    Args:
        code1: First Java snippet.
        code2: Second Java snippet.

    Returns:
        The similarity as a Python float, or ``None`` when either snippet is
        empty or either embedding could not be computed.
    """
    if not (code1 and code2):
        return None

    with st.spinner('Analyzing code...'):
        first_vec = get_embedding(code1)
        second_vec = get_embedding(code2)

        # get_embedding signals failure with None.
        if any(vec is None for vec in (first_vec, second_vec)):
            return None

        with torch.no_grad():
            score = F.cosine_similarity(first_vec, second_vec)
            similarity = score.item()

    return similarity
125
+
126
# UI Elements
# The leading emoji in the title was mis-encoded (mojibake); restored to the
# magnifying-glass emoji.
st.title("🔍 Java Code Clone Detector (IJaDataset 2.1)")
st.markdown("""
Compare Java code snippets from the IJaDataset 2.1 using CodeBERT embeddings.
""")
131
+
132
# Optional picker for one of the preloaded dataset pairs; labels are
# "<1-based index>: <clone type>".
selected_pair = None
if dataset_pairs:
    pair_options = dict(
        (f"{number}: {entry['type']}", entry)
        for number, entry in enumerate(dataset_pairs, start=1)
    )
    selected_option = st.selectbox("Select a preloaded example pair:", list(pair_options))
    selected_pair = pair_options[selected_option]

# The selected pair (if any) pre-fills the two editors; otherwise they start empty.
if selected_pair:
    initial_code1, initial_code2 = selected_pair["code1"], selected_pair["code2"]
else:
    initial_code1, initial_code2 = "", ""

# Two side-by-side editors.
col1, col2 = st.columns(2)

code1 = col1.text_area(
    "First Java Code",
    height=300,
    value=initial_code1,
    help="Enter the first Java code snippet",
)

code2 = col2.text_area(
    "Second Java Code",
    height=300,
    value=initial_code2,
    help="Enter the second Java code snippet",
)

# Similarity at or above this value is reported as a clone.
threshold = st.slider(
    "Clone Detection Threshold",
    min_value=0.5,
    max_value=1.0,
    value=0.85,
    step=0.01,
    help="Adjust the similarity threshold for clone detection",
)
167
+
168
# Compare button: compute the similarity of the two editors' contents and
# render the score, the threshold and the clone verdict.
if st.button("Compare Code", type="primary"):
    if tokenizer is None or code_model is None:
        st.error("Models failed to load. Please check the logs.")
    else:
        similarity = compare_code(code1, code2)

        if similarity is None:
            # compare_code returns None for empty input (embedding failures
            # are already reported by get_embedding); tell the user why
            # nothing was rendered instead of silently doing nothing.
            if not code1 or not code2:
                st.warning("Please enter both code snippets before comparing.")
        else:
            # Display results
            st.subheader("Results")

            # Progress bar for visualization. st.progress only accepts floats
            # in [0.0, 1.0], while cosine similarity can be negative or exceed
            # 1.0 by floating-point rounding, so clamp for display only.
            st.progress(min(max(similarity, 0.0), 1.0))

            # Metrics columns (distinct names so the editor columns above are
            # not rebound).
            score_col, threshold_col, verdict_col = st.columns(3)

            with score_col:
                st.metric("Similarity Score", f"{similarity:.3f}")

            with threshold_col:
                st.metric("Threshold", f"{threshold:.3f}")

            with verdict_col:
                # The verdict uses the unclamped score.
                is_clone = similarity >= threshold
                st.metric(
                    "Clone Detection",
                    # The check-mark emoji was mis-encoded (mojibake); restored.
                    "✅ Clone" if is_clone else "❌ Not a Clone",
                    delta=f"{similarity-threshold:+.3f}"
                )

            # Show normalized code for debugging
            with st.expander("Show normalized code"):
                tab1, tab2 = st.tabs(["First Code", "Second Code"])

                with tab1:
                    st.code(normalize_code(code1))

                with tab2:
                    st.code(normalize_code(code2))
208
+
209
# Footer: a horizontal rule followed by a short description of the dataset.
footer_sections = (
    "---",
    """
**Dataset Information**:
- Using IJaDataset 2.1 from Kaggle
- Contains 100K Java files with clone annotations
- Clone types: Type-1, Type-2, and Type-3 clones
""",
)
for footer_section in footer_sections:
    st.markdown(footer_section)