# Last commit 554bf8d by Circhastic: "adjusted threshold"
import gradio as gr
import numpy as np
import tensorflow as tf
import re
from tree_sitter import Language, Parser
import tree_sitter_languages # Pre-built parsers for multiple languages
from tokenizers import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Everything below is loaded once at import time and shared by all requests.

# Tree-sitter parser configured with the pre-built Java grammar.
java_grammar = tree_sitter_languages.get_language("java")
parser = Parser()
parser.set_language(java_grammar)

# BPE tokenizer, read from its serialized JSON definition.
tokenizer = Tokenizer.from_file("syntax_bpe_tokenizer.json")

# Keras classifier (a CNN, per the original author's note).
model = tf.keras.models.load_model("crv3.keras")
def syntax_aware_tokenize(code):
    """Tokenize Java code into the text of its syntax-tree leaf nodes.

    The code is parsed with the module-level Tree-Sitter ``parser`` and the
    resulting tree is walked in pre-order (left to right), so tokens come
    back in source order.

    Args:
        code (str): Java source text.

    Returns:
        list[str]: Decoded text of every node that has no children.
    """
    tree = parser.parse(bytes(code, "utf8"))
    tokens = []

    # Walk with an explicit stack rather than recursion: deeply nested trees
    # (e.g. very long chained expressions) would otherwise exceed Python's
    # recursion limit and raise RecursionError.
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        if node.child_count == 0:  # Leaf node
            tokens.append(node.text.decode("utf-8"))
            continue
        # Push children right-to-left so the leftmost child is popped first,
        # which keeps the same order the recursive traversal produced.
        pending.extend(reversed(node.children))

    return tokens
# One left-to-right scan over comments AND literals. Literals are matched only
# so that comment markers appearing inside them are skipped, not rewritten.
_JAVA_COMMENT_OR_LITERAL = re.compile(
    r'(?P<text_block>"""[\s\S]*?""")'      # Java text block
    r'|(?P<string>"(?:\\.|[^"\\\n])*")'    # single-line string literal
    r"|(?P<char>'(?:\\.|[^'\\\n])*')"      # character literal
    r"|(?P<block>/\*[\s\S]*?\*/)"          # /* ... */ comment (incl. Javadoc)
    r"|(?P<line>//.*)"                     # // comment up to end of line
)


def _comment_placeholder(match):
    """Return the placeholder for a comment match, or a literal unchanged."""
    kind = match.lastgroup
    if kind == "block":
        return " /**/ "
    if kind == "line":
        return " // "
    return match.group(0)


def replace_java_comments(code: str) -> str:
    """Replace Java comments with fixed placeholders.

    Each ``//`` comment becomes ``" // "`` and each ``/* ... */`` comment
    becomes ``" /**/ "``. The result has leading and trailing whitespace
    stripped.

    Comments are located in a single pass that also recognizes string,
    character and text-block literals. This differs from running two
    independent substitutions in two ways:

    * ``//`` or ``/*`` inside a literal (e.g. ``"http://host"``) is left
      untouched instead of being treated as a comment.
    * ``//`` inside a block comment no longer swallows the closing ``*/``.

    For input without those constructs the output is unchanged.

    Args:
        code: Java source text.

    Returns:
        The source with comment bodies replaced by placeholders.
    """
    return _JAVA_COMMENT_OR_LITERAL.sub(_comment_placeholder, code).strip()
def tokenize_java_code(code: str, max_length=100):
    """
    Turn a Java snippet into a fixed-length batch of BPE token ids.

    Pipeline: comment placeholders -> syntax-tree leaf tokens -> tokens joined
    with single spaces -> BPE ids -> padding at the end ("post") up to
    ``max_length``. ``truncating`` is not passed to ``pad_sequences``, so
    over-long inputs are cut according to that function's default mode.

    Args:
        code (str): Java code snippet.
        max_length (int): Length of the returned id sequence.

    Returns:
        np.array: Array of shape (1, max_length) holding the token ids.
    """
    without_comments = replace_java_comments(code)
    leaf_tokens = syntax_aware_tokenize(without_comments)

    # The BPE tokenizer takes plain text, so the leaf tokens are re-joined.
    bpe_input = " ".join(leaf_tokens)
    token_ids = tokenizer.encode(bpe_input).ids

    padded_batch = pad_sequences([token_ids], maxlen=max_length, padding="post")
    first_row = padded_batch[0]

    # Re-add the leading batch axis for the model.
    return np.array(first_row).reshape(1, -1)
def classify_code(input_text, input_file):
    """Classify Java code as "Readable" or "Unreadable".

    An uploaded file takes precedence over the pasted text. Blank input is
    answered with a prompt instead of being sent to the model.

    Args:
        input_text (str | None): Code pasted into the text box.
        input_file (bytes | None): Raw content of the uploaded file, or None
            when nothing was uploaded.

    Returns:
        str: "Readable" or "Unreadable", or a short message when no usable
        code was supplied.
    """
    if input_file is not None:
        try:
            # "utf-8-sig" also accepts plain UTF-8 and drops a leading BOM,
            # which would otherwise reach the tokenizer as part of the code.
            code = input_file.decode("utf-8-sig")
        except UnicodeDecodeError:
            return "Could not read the uploaded file as UTF-8 text."
    else:
        # Guard against a missing text value so .strip() below cannot fail.
        code = input_text or ""

    if not code.strip():  # Ensure input is not empty
        return "Please provide a Java code snippet."

    # Tokenize and predict; the model output is indexed as [sample][unit].
    tokenized_code = tokenize_java_code(code)
    probability = model.predict(tokenized_code)[0][0]

    threshold = 0.49  # Scores strictly above this count as readable
    return "Readable" if probability > threshold else "Unreadable"
# Two alternative inputs: pasted source text or an uploaded file. The file is
# requested as raw bytes (type="binary").
snippet_box = gr.Textbox(
    lines=10,
    placeholder="Paste Java code here...",
    label="Java Code Snippet",
)
java_upload = gr.File(type="binary", label="Upload Java File (.java)")

# Single text output showing the classification result.
result_text = gr.Text(label="Readability Classification")

# Component order must match classify_code's (input_text, input_file) params.
demo = gr.Interface(
    fn=classify_code,
    inputs=[snippet_box, java_upload],
    outputs=result_text,
    title="Java Code Readability Classifier (AST + BPE)",
    description="Upload a Java file or paste a Java code snippet to check if it's readable or unreadable.",
    allow_flagging="never",
)
demo.launch()