import os
import streamlit as st
import javalang
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
import numpy as np
import networkx as nx
from transformers import AutoTokenizer, AutoModel, AutoConfig
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import warnings
import pandas as pd
import zipfile
from collections import defaultdict
# Configuration
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")
# Constants
MODEL_NAME = "microsoft/codebert-base"
MAX_LENGTH = 512
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
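# DATASET_PATH is the bundled clone-pair archive; CACHE_DIR caches the
# downloaded CodeBERT weights so later runs skip the network fetch.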
DATASET_PATH = "ijadataset2-1.zip"
CACHE_DIR = "./model_cache"
# Set up page config
st.set_page_config(
page_title="Advanced Java Code Clone Detector",
page_icon="πŸ”",
layout="wide"
)
# Model Definitions
class RNNModel(nn.Module):
def __init__(self, input_size, hidden_size, num_layers):
super().__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
self.fc = nn.Linear(hidden_size, 1)
def forward(self, x):
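        # x: (batch, seq_len, input_size); run the RNN from a zero initial
        # hidden state and score only the final time step.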
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(DEVICE)
out, _ = self.rnn(x, h0)
return self.fc(out[:, -1, :])
class GNNModel(nn.Module):
def __init__(self, node_features):
super().__init__()
self.conv1 = GCNConv(node_features, 128)
self.conv2 = GCNConv(128, 64)
self.fc = nn.Linear(64, 1)
def forward(self, data):
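        # Two GCN layers over the AST graph, then mean-pool per-node scores
        # into a single scalar squashed to (0, 1).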
x, edge_index = data.x, data.edge_index
x = F.relu(self.conv1(x, edge_index))
x = F.dropout(x, training=self.training)
x = self.conv2(x, edge_index)
return torch.sigmoid(self.fc(x).mean())
# Model Loading with Cache
@st.cache_resource(show_spinner=False)
def load_models():
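    # Only CodeBERT comes with pretrained weights; the RNN and GNN heads below
    # are freshly initialized (no checkpoint is loaded), so their similarity
    # scores are illustrative rather than trained.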
try:
with st.spinner('Loading models (first run may take a few minutes)...'):
config = AutoConfig.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)
model = AutoModel.from_pretrained(MODEL_NAME, config=config, cache_dir=CACHE_DIR).to(DEVICE)
rnn_model = RNNModel(input_size=768, hidden_size=256, num_layers=2).to(DEVICE)
gnn_model = GNNModel(node_features=128).to(DEVICE)
return tokenizer, model, rnn_model, gnn_model
except Exception as e:
st.error(f"Model loading failed: {str(e)}")
return None, None, None, None
# Dataset Loading
@st.cache_resource
def load_dataset():
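    # Extract the bundled archive on first run, then pick one example file
    # pair from each clone-type directory (capped at 10 pairs).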
try:
if not os.path.exists("Diverse_100K_Dataset"):
with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
zip_ref.extractall(".")
clone_pairs = []
base_path = "Diverse_100K_Dataset/Subject_CloneTypes_Directories"
for clone_type in ["Clone_Type1", "Clone_Type2", "Clone_Type3 - ST", "Clone_Type4"]:
type_path = os.path.join(base_path, clone_type)
if os.path.exists(type_path):
for root, _, files in os.walk(type_path):
                    if len(files) >= 2:
with open(os.path.join(root, files[0]), 'r', encoding='utf-8') as f1, \
open(os.path.join(root, files[1]), 'r', encoding='utf-8') as f2:
clone_pairs.append({
"type": clone_type,
"code1": f1.read(),
"code2": f2.read()
})
break
return clone_pairs[:10]
except Exception as e:
st.error(f"Dataset error: {str(e)}")
return []
# AST Processing
def parse_ast(code):
    try:
        return javalang.parse.parse(code)
    except Exception:
        # javalang raises JavaSyntaxError (and lexer errors) on malformed input
        return None
def build_ast_graph(ast_tree):
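    # Flatten the javalang AST into a directed NetworkX graph: one node per
    # AST node (labelled with its type name), one edge per parent-child link.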
if not ast_tree: return None
G = nx.DiGraph()
node_id = 0
def traverse(node, parent=None):
nonlocal node_id
current = node_id
G.add_node(current, type=type(node).__name__)
if parent is not None:
G.add_edge(parent, current)
node_id += 1
for child in getattr(node, 'children', []):
if isinstance(child, javalang.ast.Node):
traverse(child, current)
elif isinstance(child, (list, tuple)):
for item in child:
if isinstance(item, javalang.ast.Node):
traverse(item, current)
traverse(ast_tree)
return G
def ast_to_pyg_data(ast_graph, feature_dim=128):
    if not ast_graph or ast_graph.number_of_edges() == 0:
        return None
    node_types = list(nx.get_node_attributes(ast_graph, 'type').values())
    # Hash each AST node type into a fixed-width one-hot vector so the feature
    # size always matches the GNN's expected input (128), regardless of how
    # many distinct types a given tree contains. A simple deterministic hash
    # of the type name keeps encodings stable across runs.
    x = torch.zeros(len(node_types), feature_dim)
    for i, t in enumerate(node_types):
        x[i, sum(map(ord, t)) % feature_dim] = 1
    edge_index = torch.tensor(list(ast_graph.edges()), dtype=torch.long).t().contiguous()
    return Data(x=x.to(DEVICE), edge_index=edge_index.to(DEVICE))
# Feature Extraction
def normalize_code(code):
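    # Strip // line comments and /* ... */ block comments, then collapse all
    # whitespace so formatting differences don't affect similarity.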
code = re.sub(r'//.*?$', '', code, flags=re.MULTILINE)
code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
return re.sub(r'\s+', ' ', code).strip()
def get_embedding(code, tokenizer, model):
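    # Tokenize the normalized code (truncated/padded to MAX_LENGTH tokens) and
    # mean-pool CodeBERT's last hidden states into a single 768-dim vector.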
try:
inputs = tokenizer(
normalize_code(code),
return_tensors="pt",
truncation=True,
max_length=MAX_LENGTH,
padding='max_length'
).to(DEVICE)
with torch.no_grad():
return model(**inputs).last_hidden_state.mean(dim=1)
    except Exception:
        return None
# Similarity Calculations
def calculate_similarities(code1, code2, models):
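    # Fuse three signals: cosine similarity of CodeBERT embeddings, an RNN
    # score over the embedding pair, and cosine similarity of GNN outputs
    # computed on each program's AST.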
tokenizer, code_model, rnn_model, gnn_model = models
# Get embeddings
emb1 = get_embedding(code1, tokenizer, code_model)
emb2 = get_embedding(code2, tokenizer, code_model)
# Parse ASTs
ast1 = build_ast_graph(parse_ast(code1))
ast2 = build_ast_graph(parse_ast(code2))
# Calculate similarities
codebert_sim = F.cosine_similarity(emb1, emb2).item() if emb1 is not None and emb2 is not None else 0
rnn_sim = 0
if emb1 is not None and emb2 is not None:
with torch.no_grad():
rnn_input = torch.stack([emb1.squeeze(), emb2.squeeze()])
rnn_sim = torch.sigmoid(rnn_model(rnn_input.unsqueeze(0))).item()
gnn_sim = 0
if ast1 and ast2:
data1 = ast_to_pyg_data(ast1)
data2 = ast_to_pyg_data(ast2)
if data1 and data2:
with torch.no_grad():
gnn_sim = F.cosine_similarity(
gnn_model(data1).unsqueeze(0),
gnn_model(data2).unsqueeze(0)
).item()
return {
'codebert': codebert_sim,
'rnn': rnn_sim,
'gnn': gnn_sim,
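        # Fixed fusion weights (0.4 / 0.3 / 0.3); heuristic, not learned.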
'combined': 0.4*codebert_sim + 0.3*rnn_sim + 0.3*gnn_sim
}
# UI Components
def main():
st.title("πŸ” Advanced Java Code Clone Detector")
    st.markdown("Detect clone types 1-4 using hybrid CodeBERT, RNN, and AST-GNN analysis")
# Load resources
models = load_models()
dataset_pairs = load_dataset()
# Code input
selected_pair = None
if dataset_pairs:
pair_options = {f"{i+1}: {pair['type']}": pair for i, pair in enumerate(dataset_pairs)}
selected_option = st.selectbox("Select example pair:", list(pair_options.keys()))
selected_pair = pair_options[selected_option]
col1, col2 = st.columns(2)
with col1:
code1 = st.text_area("Code 1", height=300, value=selected_pair["code1"] if selected_pair else "")
with col2:
code2 = st.text_area("Code 2", height=300, value=selected_pair["code2"] if selected_pair else "")
# Thresholds
st.subheader("Detection Thresholds")
cols = st.columns(3)
with cols[0]:
t1 = st.slider("Type 1/2", 0.85, 1.0, 0.95)
with cols[1]:
t3 = st.slider("Type 3", 0.7, 0.9, 0.8)
with cols[2]:
t4 = st.slider("Type 4", 0.5, 0.8, 0.65)
# Analysis
if st.button("Analyze", type="primary") and models[0]:
with st.spinner("Analyzing..."):
sims = calculate_similarities(code1, code2, models)
# Determine clone type
clone_type = "No Clone"
if sims['combined'] >= t1:
clone_type = "Type 1/2 Clone"
elif sims['combined'] >= t3:
clone_type = "Type 3 Clone"
elif sims['combined'] >= t4:
clone_type = "Type 4 Clone"
# Display results
st.subheader("Results")
cols = st.columns(4)
cols[0].metric("Combined", f"{sims['combined']:.2f}")
cols[1].metric("CodeBERT", f"{sims['codebert']:.2f}")
cols[2].metric("RNN", f"{sims['rnn']:.2f}")
cols[3].metric("GNN", f"{sims['gnn']:.2f}")
        # Clamp to [0, 1]: cosine similarity can be negative, and st.progress
        # rejects out-of-range values.
        st.progress(min(max(sims['combined'], 0.0), 1.0))
st.metric("Detection Result", clone_type)
# Show details
with st.expander("Details"):
st.json(sims)
st.code(f"Normalized Code 1:\n{normalize_code(code1)}")
st.code(f"Normalized Code 2:\n{normalize_code(code2)}")
if __name__ == "__main__":
main()