Spaces:

Circhastic
/

Code-Readability-Classifier

Sleeping

App Files Files Community

Code-Readability-Classifier / app.py

Circhastic

updated with new model

a13a8c8 2 months ago

raw

history blame

3.29 kB

	import gradio as gr
	import numpy as np
	import tensorflow as tf
	import re
	from tree_sitter import Language, Parser
	import tree_sitter_languages # Pre-built parsers for multiple languages
	from tokenizers import Tokenizer
	from tensorflow.keras.preprocessing.sequence import pad_sequences

	# Artefacts loaded once at import time and shared by the functions below.

	# BPE tokenizer applied to the space-joined syntax tokens.
	tokenizer = Tokenizer.from_file("syntax_bpe_tokenizer.json")

	# Keras model (a CNN, per the original author's note) used for prediction.
	model = tf.keras.models.load_model("crv3.keras")

	# Tree-sitter parser bound to the pre-built Java grammar.
	_java_grammar = tree_sitter_languages.get_language("java")
	parser = Parser()
	parser.set_language(_java_grammar)

	def syntax_aware_tokenize(code):
	    """Return the leaf-node texts of the Java parse tree, in source order.

	    The snippet is parsed with the module-level Tree-sitter ``parser`` and
	    walked depth-first, left to right. Only nodes without children
	    contribute a token.

	    The walk uses an explicit stack rather than recursion so that deeply
	    nested input (for example a very long chained expression) cannot
	    exhaust Python's recursion limit and raise ``RecursionError``.

	    Args:
	        code (str): Java source text.

	    Returns:
	        list[str]: Text of every leaf node, in pre-order.
	    """
	    tree = parser.parse(code.encode("utf-8"))
	    tokens = []
	    pending = [tree.root_node]

	    while pending:
	        node = pending.pop()
	        if node.child_count == 0:  # Leaf node: its text is one token
	            tokens.append(node.text.decode("utf-8"))
	            continue
	        # Push children reversed so the leftmost child is popped first,
	        # which keeps the tokens in the same order as a recursive walk.
	        pending.extend(reversed(node.children))

	    return tokens

	def replace_java_comments(code: str) -> str:
	    """Replace Java comments with fixed placeholders.

	    Every line comment becomes `` // `` and every block comment becomes
	    `` /**/ ``: the comment text is dropped while a marker stays where the
	    comment was. Leading and trailing whitespace of the whole result is
	    stripped; whitespace inside the code is left untouched.

	    Args:
	        code: Java source text.

	    Returns:
	        The source with comment bodies replaced by placeholders.
	    """
	    # Line comments: "." does not match a newline, so each match stops at
	    # the end of its own line.
	    # NOTE(review): this pass runs first and is not string-literal aware, so
	    # a "//" inside a string or inside a block comment is rewritten as well.
	    code = re.sub(r"//.*", " // ", code)

	    # Block comments: "*" must be escaped to match literally, [\s\S] lets
	    # the match cross newlines, and the lazy "*?" stops at the first "*/"
	    # so two separate comments are never merged. The placeholder is a
	    # complete, empty block comment so the result still parses.
	    code = re.sub(r"/\*[\s\S]*?\*/", " /**/ ", code)

	    return code.strip()

	def tokenize_java_code(code: str, max_length=100):
	    """Turn a Java snippet into a fixed-length batch of BPE token ids.

	    Pipeline: comment placeholders -> syntax (leaf-node) tokens -> BPE ids
	    -> padding to ``max_length``.

	    Args:
	        code (str): Java code snippet.
	        max_length (int): Length of the returned id sequence.

	    Returns:
	        np.ndarray: Array reshaped to a single row (one sample).
	    """
	    without_comments = replace_java_comments(code)
	    leaf_tokens = syntax_aware_tokenize(without_comments)

	    # The BPE tokenizer receives the syntax tokens joined by single spaces.
	    joined = " ".join(leaf_tokens)
	    token_ids = tokenizer.encode(joined).ids

	    # Zeros are appended after the ids ("post" padding).
	    # NOTE(review): no `truncating` argument is passed, so the library
	    # default decides which end of an over-long sequence is dropped;
	    # confirm this matches how the model was trained.
	    padded_batch = pad_sequences([token_ids], maxlen=max_length, padding="post")
	    first_row = padded_batch[0]

	    # The model is fed a batch containing exactly one sample.
	    return np.array(first_row).reshape(1, -1)

	def classify_code(input_text, input_file):
	    """Classify a Java snippet as "Readable" or "Unreadable".

	    Args:
	        input_text (str | None): Code pasted into the text box.
	        input_file (bytes | None): Raw content of an uploaded file. When
	            given, it takes precedence over ``input_text``.

	    Returns:
	        str: "Readable" or "Unreadable", or a short message when no usable
	        code was supplied (empty input, or an upload that is not UTF-8).
	    """
	    if input_file is not None:
	        try:
	            code = input_file.decode("utf-8")
	        except UnicodeDecodeError:
	            # An arbitrary upload may not be text at all; report it instead
	            # of letting the exception escape to the UI.
	            return "The uploaded file is not valid UTF-8 text."
	    else:
	        # Treat a missing text value the same as an empty one.
	        code = input_text or ""

	    if not code.strip():
	        return "Please provide a Java code snippet."

	    tokenized_code = tokenize_java_code(code)
	    # The model is given one sample; take the first value of its output row.
	    score = model.predict(tokenized_code)[0][0]

	    threshold = 0.52  # Scores strictly above this count as readable
	    return "Readable" if score > threshold else "Unreadable"

	# Two alternative inputs: pasted text, or an uploaded file passed as bytes.
	code_box = gr.Textbox(lines=10, placeholder="Paste Java code here...", label="Java Code Snippet")
	file_upload = gr.File(type="binary", label="Upload Java File (.java)")

	# Single text output showing the classification returned by classify_code.
	result_box = gr.Text(label="Readability Classification")

	demo = gr.Interface(
	    fn=classify_code,
	    inputs=[code_box, file_upload],
	    outputs=result_box,
	    title="Java Code Readability Classifier (AST + BPE)",
	    description="Upload a Java file or paste a Java code snippet to check if it's readable or unreadable.",
	    allow_flagging="never",
	)

	# Start the web app as soon as the module is executed.
	demo.launch()