Spaces:

rahideer
/

CCD

Sleeping

App Files Files Community

rahideer committed 29 days ago

Commit

e39d081

verified ·

1 Parent(s): afcc7a1

Update app.py

Browse files

Files changed (1) hide show

app.py +173 -83

app.py CHANGED Viewed

@@ -1,102 +1,192 @@
 import javalang
 import torch
-import torch.nn as nn
 import torch.nn.functional as F
 import re
-import gradio as gr
 from transformers import AutoTokenizer, AutoModel
-from pathlib import Path
-# Configuration
-MAX_FILE_SIZE = 5000
-EMBEDDING_DIM = 128
-DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-# Initialize models once at startup
-tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
-code_model = AutoModel.from_pretrained("microsoft/codebert-base").to(DEVICE)
-# Simplified model architecture
-class CloneDetector(nn.Module):
-    def __init__(self, hidden_dim):
-        super().__init__()
-        self.classifier = nn.Sequential(
-            nn.Linear(hidden_dim * 2, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, 2))
-    def forward(self, emb1, emb2):
-        combined = torch.cat([emb1, emb2], dim=-1)
-        return self.classifier(combined)
-model = CloneDetector(768).to(DEVICE)  # 768 is CodeBERT's hidden size
-def get_code_embedding(code):
-    """Get embedding for a single code snippet"""
-    try:
-        # Normalize code
-        code = re.sub(r'//.*', '', code)
-        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
-        code = ' '.join(code.split())
-        # Tokenize and get embedding
-        inputs = tokenizer(code, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
-        with torch.no_grad():
-            outputs = code_model(**inputs)
-        return outputs.last_hidden_state.mean(dim=1)  # Pooled representation
-    except Exception:
-        return torch.zeros(1, 768).to(DEVICE)
-def predict_clone(code1, code2):
-    """Compare two code snippets"""
     try:
-        # Get embeddings
-        emb1 = get_code_embedding(code1)
-        emb2 = get_code_embedding(code2)
-        # Calculate similarity
-        with torch.no_grad():
-            sim_score = F.cosine_similarity(emb1, emb2).item()
-            logits = model(emb1, emb2)
-            prob = F.softmax(logits, dim=-1)[0, 1].item()
-        return {
-            "Similarity Score": f"{sim_score:.3f}",
-            "Clone Probability": f"{prob:.3f}",
-            "Prediction": "Clone" if prob > 0.5 else "Not Clone"
-        }
     except Exception as e:
-        return {"Error": str(e)}
-# Gradio Interface
-demo = gr.Interface(
-    fn=predict_clone,
-    inputs=[
-        gr.Textbox(label="First Java Code", lines=10),
-        gr.Textbox(label="Second Java Code", lines=10)
-    ],
-    outputs=gr.JSON(label="Results"),
-    examples=[
-        ["""public class Hello {
     public static void main(String[] args) {
         System.out.println("Hello, World!");
     }
-}""",
-"""public class Greet {
     public static void main(String[] args) {
         System.out.println("Hello, World!");
     }
-}"""],
-        ["""public int add(int a, int b) {
-    return a + b;
-}""",
-"""public int sum(int x, int y) {
-    return x + y;
-}"""]
-    ],
-    title="Java Code Clone Detector",
-    description="Compare two Java code snippets to detect potential clones"
 )
-if __name__ == "__main__":
-    demo.launch()

+import streamlit as st
 import javalang
 import torch
 import torch.nn.functional as F
 import re
 from transformers import AutoTokenizer, AutoModel
+import warnings
+# Page-level Streamlit settings: browser tab title/icon and the wide layout.
+st.set_page_config(
+    page_title="Java Code Clone Detector",
+    page_icon="🔍",
+    layout="wide"
+)
+# Install a blanket "ignore" filter: no category is given, so every Python
+# warning is silenced, not only those raised by the model libraries.
+warnings.filterwarnings("ignore")
+# Constants
+# Hugging Face model id loaded by load_models().
+MODEL_NAME = "microsoft/codebert-base"
+# Token limit per snippet; passed to the tokenizer together with truncation=True.
+MAX_LENGTH = 512
+# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# Initialize models with caching
+@st.cache_resource
+def _load_models_cached():
+    """Load the CodeBERT tokenizer and encoder and keep them cached.
+
+    Exceptions are deliberately allowed to propagate: only return values are
+    cached, so a failed download is retried on the next rerun instead of
+    being remembered as a permanent ``(None, None)`` result.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
+    return tokenizer, model
+def load_models():
+    """Return ``(tokenizer, model)``, or ``(None, None)`` when loading fails.
+
+    The heavy lifting is delegated to the cached helper; this wrapper is not
+    cached itself, so the error banner is shown on every rerun in which the
+    models are still unavailable and a later successful load is picked up.
+    """
+    try:
+        return _load_models_cached()
+    except Exception as e:
+        st.error(f"Failed to load models: {str(e)}")
+        return None, None
+tokenizer, code_model = load_models()
+# UI Elements
+st.title("🔍 Java Code Clone Detector")
+# Cosine similarity is bounded by -1 and 1 (not 0 and 1), so the intro text
+# states the real range of the score that is reported below.
+st.markdown("""
+Compare two Java code snippets to detect potential clones using CodeBERT embeddings.
+The similarity score is a cosine similarity between -1 and 1; values close to 1 mean the snippets are nearly identical.
+""")
+# Example code: default contents of the two text areas. Both snippets are
+# the same program; only the class name differs (Hello vs. Greet).
+EXAMPLE_1 = """public class Hello {
     public static void main(String[] args) {
         System.out.println("Hello, World!");
     }
+}"""
+EXAMPLE_2 = """public class Greet {
     public static void main(String[] args) {
         System.out.println("Hello, World!");
     }
+}"""
+# Layout: two side-by-side columns, one text area per snippet, each
+# pre-filled with one of the example programs.
+col1, col2 = st.columns(2)
+with col1:
+    code1 = st.text_area(
+        "First Java Code",
+        height=300,
+        value=EXAMPLE_1,
+        help="Enter the first Java code snippet"
+    )
+with col2:
+    code2 = st.text_area(
+        "Second Java Code",
+        height=300,
+        value=EXAMPLE_2,
+        help="Enter the second Java code snippet"
+    )
+# Threshold slider: a similarity at or above the selected value is reported
+# as a clone. Selectable from 0.5 to 1.0 in steps of 0.01, 0.85 by default.
+threshold = st.slider(
+    "Clone Detection Threshold",
+    min_value=0.5,
+    max_value=1.0,
+    value=0.85,
+    step=0.01,
+    help="Adjust the similarity threshold for clone detection"
 )
+# Normalization function
+# One alternation so that quoted literals are consumed before comment
+# markers inside them can be mistaken for comments.
+_JAVA_LITERAL_OR_COMMENT = re.compile(
+    r'"(?:\\.|[^"\\\n])*"'      # double-quoted string literal
+    r"|'(?:\\.|[^'\\\n])*'"     # character literal
+    r'|//[^\n]*'                # single-line comment
+    r'|/\*.*?\*/',              # multi-line comment
+    re.DOTALL,
+)
+def normalize_code(code):
+    """Remove Java comments from ``code`` and collapse its whitespace.
+
+    Comments are found in a single left-to-right pass that also recognises
+    string and character literals, so ``//`` or ``/*`` inside a literal
+    (for example a URL) is kept, and a ``//`` inside a block comment cannot
+    swallow the code that follows the comment. Each comment is replaced by a
+    space rather than nothing, so the tokens on either side are not fused.
+
+    Args:
+        code: Java source text.
+
+    Returns:
+        The source without comments, with every whitespace run reduced to a
+        single space and the ends stripped. A non-string argument is
+        returned unchanged.
+    """
+    if not isinstance(code, str):
+        return code
+    def _drop_comment(match):
+        token = match.group(0)
+        # Only comments start with '/'; literals start with a quote.
+        return ' ' if token.startswith('/') else token
+    without_comments = _JAVA_LITERAL_OR_COMMENT.sub(_drop_comment, code)
+    return re.sub(r'\s+', ' ', without_comments).strip()  # Normalize whitespace
+# Embedding generation
+def get_embedding(code):
+    """Return a mean-pooled CodeBERT embedding for one Java snippet.
+
+    The snippet is normalized, tokenized (truncated to ``MAX_LENGTH``
+    tokens) and encoded without gradient tracking. The token vectors are
+    averaged using the attention mask as weights, so only real tokens
+    contribute. The sequence is no longer padded to ``MAX_LENGTH``: averaging
+    over hundreds of padding positions skewed the embedding of every short
+    snippet.
+
+    Args:
+        code: Java source text.
+
+    Returns:
+        The pooled embedding tensor (one row for the single input), or
+        ``None`` after showing an error in the UI if anything fails.
+    """
+    try:
+        code = normalize_code(code)
+        inputs = tokenizer(
+            code,
+            return_tensors="pt",
+            truncation=True,
+            max_length=MAX_LENGTH
+        ).to(DEVICE)
+        with torch.no_grad():
+            outputs = code_model(**inputs)
+        hidden = outputs.last_hidden_state
+        # Weight each position by its attention-mask entry (1 = real token,
+        # 0 = padding) so the pooling stays correct even if padding appears.
+        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
+        token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
+        return (hidden * mask).sum(dim=1) / token_count  # Pooled embedding
+    except Exception as e:
+        st.error(f"Error processing code: {str(e)}")
+        return None
+# Comparison function
+def compare_code(code1, code2):
+    """Return the cosine similarity of two snippets as a float.
+
+    Returns ``None`` when either snippet is empty or when an embedding could
+    not be produced. A spinner is shown while the embeddings are computed.
+    """
+    if not (code1 and code2):
+        return None
+    with st.spinner('Analyzing code...'):
+        # Embed both snippets before checking for failures, so each failure
+        # reports its own error.
+        vectors = [get_embedding(code1), get_embedding(code2)]
+        if any(vector is None for vector in vectors):
+            return None
+        with torch.no_grad():
+            score = F.cosine_similarity(*vectors)
+        return score.item()
+# Compare button
+if st.button("Compare Code", type="primary"):
+    if tokenizer is None or code_model is None:
+        st.error("Models failed to load. Please check the logs.")
+    else:
+        similarity = compare_code(code1, code2)
+        if similarity is not None:
+            # Display results
+            st.subheader("Results")
+            # Progress bar for visualization. st.progress only accepts floats
+            # in [0.0, 1.0], but a cosine similarity can be negative and float
+            # rounding can push identical snippets marginally above 1.0, so
+            # the bar shows a clamped value. The reported score is unclamped.
+            st.progress(min(max(similarity, 0.0), 1.0))
+            # Metrics columns
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.metric("Similarity Score", f"{similarity:.3f}")
+            with col2:
+                st.metric("Threshold", f"{threshold:.3f}")
+            with col3:
+                # The verdict uses the slider threshold; the delta shows how
+                # far the score lies above (+) or below (-) that threshold.
+                is_clone = similarity >= threshold
+                st.metric(
+                    "Clone Detection",
+                    "✅ Clone" if is_clone else "❌ Not a Clone",
+                    delta=f"{similarity-threshold:+.3f}"
+                )
+            # Interpretation: fixed similarity bands, independent of the
+            # user-selected threshold.
+            if similarity > 0.95:
+                st.success("The code snippets are nearly identical (potential Type-1 clone)")
+            elif similarity > 0.85:
+                st.success("The code snippets are very similar (potential Type-2 clone)")
+            elif similarity > 0.7:
+                st.warning("The code snippets show some similarity (potential Type-3 clone)")
+            else:
+                st.info("The code snippets are significantly different")
+            # Show normalized code for debugging
+            with st.expander("Show normalized code"):
+                tab1, tab2 = st.tabs(["First Code", "Second Code"])
+                with tab1:
+                    st.code(normalize_code(code1))
+                with tab2:
+                    st.code(normalize_code(code2))
+# Footer: a horizontal rule followed by a static summary of the pipeline.
+st.markdown("---")
+st.markdown("""
+**How it works**:
+1. Code is normalized (comments removed, whitespace standardized)
+2. CodeBERT generates embeddings for each snippet
+3. Cosine similarity is calculated between embeddings
+4. Results are compared against your threshold
+""")