rahideer committed on
Commit
e39d081
·
verified ·
1 Parent(s): afcc7a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +173 -83
app.py CHANGED
@@ -1,102 +1,192 @@
 
1
  import javalang
2
  import torch
3
- import torch.nn as nn
4
  import torch.nn.functional as F
5
  import re
6
- import gradio as gr
7
  from transformers import AutoTokenizer, AutoModel
8
- from pathlib import Path
9
 
10
- # Configuration
11
- MAX_FILE_SIZE = 5000
12
- EMBEDDING_DIM = 128
13
- DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
14
-
15
- # Initialize models once at startup
16
- tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
17
- code_model = AutoModel.from_pretrained("microsoft/codebert-base").to(DEVICE)
18
-
19
- # Simplified model architecture
20
- class CloneDetector(nn.Module):
21
- def __init__(self, hidden_dim):
22
- super().__init__()
23
- self.classifier = nn.Sequential(
24
- nn.Linear(hidden_dim * 2, hidden_dim),
25
- nn.ReLU(),
26
- nn.Linear(hidden_dim, 2))
27
-
28
- def forward(self, emb1, emb2):
29
- combined = torch.cat([emb1, emb2], dim=-1)
30
- return self.classifier(combined)
31
 
32
- model = CloneDetector(768).to(DEVICE) # 768 is CodeBERT's hidden size
 
33
 
34
- def get_code_embedding(code):
35
- """Get embedding for a single code snippet"""
36
- try:
37
- # Normalize code
38
- code = re.sub(r'//.*', '', code)
39
- code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
40
- code = ' '.join(code.split())
41
-
42
- # Tokenize and get embedding
43
- inputs = tokenizer(code, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
44
- with torch.no_grad():
45
- outputs = code_model(**inputs)
46
- return outputs.last_hidden_state.mean(dim=1) # Pooled representation
47
- except Exception:
48
- return torch.zeros(1, 768).to(DEVICE)
49
 
50
- def predict_clone(code1, code2):
51
- """Compare two code snippets"""
 
52
  try:
53
- # Get embeddings
54
- emb1 = get_code_embedding(code1)
55
- emb2 = get_code_embedding(code2)
56
-
57
- # Calculate similarity
58
- with torch.no_grad():
59
- sim_score = F.cosine_similarity(emb1, emb2).item()
60
- logits = model(emb1, emb2)
61
- prob = F.softmax(logits, dim=-1)[0, 1].item()
62
-
63
- return {
64
- "Similarity Score": f"{sim_score:.3f}",
65
- "Clone Probability": f"{prob:.3f}",
66
- "Prediction": "Clone" if prob > 0.5 else "Not Clone"
67
- }
68
  except Exception as e:
69
- return {"Error": str(e)}
70
-
71
- # Gradio Interface
72
- demo = gr.Interface(
73
- fn=predict_clone,
74
- inputs=[
75
- gr.Textbox(label="First Java Code", lines=10),
76
- gr.Textbox(label="Second Java Code", lines=10)
77
- ],
78
- outputs=gr.JSON(label="Results"),
79
- examples=[
80
- ["""public class Hello {
 
 
81
  public static void main(String[] args) {
82
  System.out.println("Hello, World!");
83
  }
84
- }""",
85
- """public class Greet {
 
86
  public static void main(String[] args) {
87
  System.out.println("Hello, World!");
88
  }
89
- }"""],
90
- ["""public int add(int a, int b) {
91
- return a + b;
92
- }""",
93
- """public int sum(int x, int y) {
94
- return x + y;
95
- }"""]
96
- ],
97
- title="Java Code Clone Detector",
98
- description="Compare two Java code snippets to detect potential clones"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  )
100
 
101
- if __name__ == "__main__":
102
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
  import javalang
3
  import torch
 
4
  import torch.nn.functional as F
5
  import re
 
6
  from transformers import AutoTokenizer, AutoModel
7
+ import warnings
8
 
9
+ # Set up page config
10
+ st.set_page_config(
11
+ page_title="Java Code Clone Detector",
12
+ page_icon="🔍",
13
+ layout="wide"
14
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
+ # Suppress warnings
17
+ warnings.filterwarnings("ignore")
18
 
19
+ # Constants
20
+ MODEL_NAME = "microsoft/codebert-base"
21
+ MAX_LENGTH = 512
22
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 
 
 
 
 
 
 
 
 
 
23
 
24
+ # Initialize models with caching
25
@st.cache_resource
def _load_models_cached():
    """Instantiate the tokenizer and encoder for MODEL_NAME on DEVICE.

    This helper deliberately lets exceptions propagate. Only a successful
    return value is stored by the resource cache, so a failed attempt is
    retried on the next rerun instead of being remembered.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
    return tokenizer, model


def load_models():
    """Return ``(tokenizer, model)``, or ``(None, None)`` if loading fails.

    Successful loads are served from the resource cache. Failures are
    reported in the UI and are not cached, so a transient download or
    initialisation error does not permanently disable the app.
    """
    try:
        return _load_models_cached()
    except Exception as e:
        st.error(f"Failed to load models: {str(e)}")
        return None, None


# Keep the cache-clearing hook that the decorated original exposed.
load_models.clear = _load_models_cached.clear
34
+
35
+ tokenizer, code_model = load_models()
36
+
37
+ # UI Elements
38
+ st.title("🔍 Java Code Clone Detector")
39
+ st.markdown("""
40
+ Compare two Java code snippets to detect potential clones using CodeBERT embeddings.
41
+ The similarity score ranges from 0 (completely different) to 1 (identical).
42
+ """)
43
+
44
+ # Example code
45
+ EXAMPLE_1 = """public class Hello {
46
  public static void main(String[] args) {
47
  System.out.println("Hello, World!");
48
  }
49
+ }"""
50
+
51
+ EXAMPLE_2 = """public class Greet {
52
  public static void main(String[] args) {
53
  System.out.println("Hello, World!");
54
  }
55
+ }"""
56
+
57
+ # Layout
58
+ col1, col2 = st.columns(2)
59
+
60
+ with col1:
61
+ code1 = st.text_area(
62
+ "First Java Code",
63
+ height=300,
64
+ value=EXAMPLE_1,
65
+ help="Enter the first Java code snippet"
66
+ )
67
+
68
+ with col2:
69
+ code2 = st.text_area(
70
+ "Second Java Code",
71
+ height=300,
72
+ value=EXAMPLE_2,
73
+ help="Enter the second Java code snippet"
74
+ )
75
+
76
+ # Threshold slider
77
+ threshold = st.slider(
78
+ "Clone Detection Threshold",
79
+ min_value=0.5,
80
+ max_value=1.0,
81
+ value=0.85,
82
+ step=0.01,
83
+ help="Adjust the similarity threshold for clone detection"
84
  )
85
 
86
+ # Normalization function
87
# Matches, in priority order: a double-quoted string literal, a single-quoted
# char literal, a // line comment, or a /* ... */ block comment. Literals are
# listed first so comment markers inside them are consumed as part of the
# literal and never treated as the start of a comment.
_JAVA_COMMENT_OR_LITERAL = re.compile(
    r'"(?:\\.|[^"\\\n])*"'
    r"|'(?:\\.|[^'\\\n])*'"
    r'|//[^\n]*'
    r'|/\*.*?\*/',
    re.DOTALL,
)


def normalize_code(code):
    """Strip Java comments and collapse whitespace in *code*.

    Comment markers that appear inside string or char literals (for example
    ``"http://example.com"``) are left intact. Each removed comment is
    replaced by a single space so that adjacent tokens such as
    ``int/**/a`` do not fuse together.

    Args:
        code: Source text to normalize.

    Returns:
        The normalized text with leading/trailing whitespace removed and
        every whitespace run collapsed to one space. A non-string input is
        returned unchanged.
    """
    if not isinstance(code, str):
        return code

    def _keep_literal_drop_comment(match):
        token = match.group(0)
        # Both comment forms start with '/'; literals start with a quote.
        return " " if token.startswith("/") else token

    without_comments = _JAVA_COMMENT_OR_LITERAL.sub(_keep_literal_drop_comment, code)
    return re.sub(r'\s+', ' ', without_comments).strip()
95
+
96
+ # Embedding generation
97
def get_embedding(code):
    """Return a mean-pooled encoder embedding for one code snippet.

    The snippet is normalized, tokenized (truncated to MAX_LENGTH tokens)
    and passed through the encoder without gradient tracking. The token
    states are averaged using the attention mask, so only real tokens
    contribute to the result.

    Args:
        code: Source text of the snippet.

    Returns:
        A tensor with one row per input (a single row here), or ``None``
        if any step fails; the failure is reported in the UI.
    """
    try:
        code = normalize_code(code)
        # No padding: a single sequence needs none, and padding to
        # MAX_LENGTH would add pad positions to the pooled average.
        inputs = tokenizer(
            code,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
        ).to(DEVICE)

        with torch.no_grad():
            outputs = code_model(**inputs)

        hidden = outputs.last_hidden_state
        # Weight each position by its attention-mask value so that masked
        # positions (if any) do not dilute the mean.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_count = mask.sum(dim=1).clamp(min=1.0)
        return (hidden * mask).sum(dim=1) / token_count
    except Exception as e:
        st.error(f"Error processing code: {str(e)}")
        return None
115
+
116
+ # Comparison function
117
def compare_code(code1, code2):
    """Compute the cosine similarity between the embeddings of two snippets.

    Args:
        code1: First snippet.
        code2: Second snippet.

    Returns:
        The similarity as a Python float, or ``None`` when either snippet
        is empty or its embedding could not be produced.
    """
    if not (code1 and code2):
        return None

    with st.spinner('Analyzing code...'):
        first_vec, second_vec = get_embedding(code1), get_embedding(code2)

        if first_vec is None or second_vec is None:
            return None

        with torch.no_grad():
            score = F.cosine_similarity(first_vec, second_vec)

        return score.item()
132
+
133
+ # Compare button
134
# Runs when the user clicks the button: validate inputs, compute the
# similarity, then render the score, verdict and a debugging view.
if st.button("Compare Code", type="primary"):
    if tokenizer is None or code_model is None:
        st.error("Models failed to load. Please check the logs.")
    elif not code1.strip() or not code2.strip():
        # Give explicit feedback instead of silently rendering nothing.
        st.warning("Please enter Java code in both boxes before comparing.")
    else:
        similarity = compare_code(code1, code2)

        if similarity is not None:
            st.subheader("Results")

            # st.progress only accepts floats in [0.0, 1.0]; cosine
            # similarity can be negative or exceed 1.0 by rounding error.
            st.progress(min(max(similarity, 0.0), 1.0))

            # Distinct names so the input layout columns are not shadowed.
            score_col, threshold_col, verdict_col = st.columns(3)

            with score_col:
                st.metric("Similarity Score", f"{similarity:.3f}")

            with threshold_col:
                st.metric("Threshold", f"{threshold:.3f}")

            with verdict_col:
                is_clone = similarity >= threshold
                st.metric(
                    "Clone Detection",
                    "✅ Clone" if is_clone else "❌ Not a Clone",
                    delta=f"{similarity-threshold:+.3f}"
                )

            # Coarse interpretation bands, independent of the slider value.
            if similarity > 0.95:
                st.success("The code snippets are nearly identical (potential Type-1 clone)")
            elif similarity > 0.85:
                st.success("The code snippets are very similar (potential Type-2 clone)")
            elif similarity > 0.7:
                st.warning("The code snippets show some similarity (potential Type-3 clone)")
            else:
                st.info("The code snippets are significantly different")

            # Show exactly what was fed to the tokenizer, for debugging.
            with st.expander("Show normalized code"):
                first_tab, second_tab = st.tabs(["First Code", "Second Code"])

                with first_tab:
                    st.code(normalize_code(code1))

                with second_tab:
                    st.code(normalize_code(code2))
183
+
184
+ # Footer
185
+ st.markdown("---")
186
+ st.markdown("""
187
+ **How it works**:
188
+ 1. Code is normalized (comments removed, whitespace standardized)
189
+ 2. CodeBERT generates embeddings for each snippet
190
+ 3. Cosine similarity is calculated between embeddings
191
+ 4. Results are compared against your threshold
192
+ """)