# (removed non-source scraper artifacts: file-size header, git blame hashes, line-number gutter)
import re

import gradio as gr
import numpy as np
import tensorflow as tf
import tree_sitter_languages  # Pre-built parsers for multiple languages
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tokenizers import Tokenizer
from tree_sitter import Language, Parser
# Load trained artifacts once, at module import time.
tokenizer = Tokenizer.from_file("syntax_bpe_tokenizer.json")  # BPE tokenizer trained on syntax tokens
model = tf.keras.models.load_model("crv3.keras")  # CNN readability classifier

# Tree-Sitter parser configured for Java — the only language this app classifies.
# (The previous `PY_LANGUAGE = Language(tspython.language())` line was removed:
# `tspython` was never imported and the binding was never used, so the module
# crashed with a NameError on import.)
parser = Parser()
parser.set_language(tree_sitter_languages.get_language("java"))
def syntax_aware_tokenize(code):
    """Return the leaf-token texts of *code*'s Java AST, in source order.

    Uses the module-level Tree-Sitter ``parser`` (configured for Java) and
    performs a depth-first walk, collecting the text of every leaf node.
    """
    tree = parser.parse(bytes(code, "utf8"))
    collected = []

    def walk(node):
        # Leaf nodes carry the concrete token text.
        if node.child_count == 0:
            collected.append(node.text.decode("utf-8"))
        for child in node.children:
            walk(child)

    walk(tree.root_node)
    return collected
def replace_java_comments(code: str) -> str:
    """Collapse Java comments to fixed placeholders.

    Block comments are collapsed *first*: running the line-comment pattern
    first would let a ``//`` inside ``/* ... */`` (e.g. ``/* a // b */``)
    consume the closing ``*/`` and leave an unbalanced comment behind.

    Args:
        code: Java source text.

    Returns:
        The code with every ``/* ... */`` replaced by `` /**/ `` and every
        ``// ...`` tail replaced by `` // ``, stripped of leading/trailing
        whitespace (inner indentation is preserved).
    """
    code = re.sub(r"/\*[\s\S]*?\*/", " /**/ ", code)  # block comments first (non-greedy)
    code = re.sub(r"//.*", " // ", code)  # then single-line comments
    return code.strip()
def tokenize_java_code(code: str, max_length=100):
    """
    Turn a Java snippet into a model-ready, fixed-length token-id sequence.

    Pipeline: strip comments -> Tree-Sitter AST tokenization -> BPE encoding
    -> post-padding/truncation to ``max_length``.

    Args:
        code (str): Java code snippet.
        max_length (int): Maximum sequence length.

    Returns:
        np.array: Token-id array of shape (1, max_length).
    """
    without_comments = replace_java_comments(code)
    ast_tokens = syntax_aware_tokenize(without_comments)
    ids = tokenizer.encode(" ".join(ast_tokens)).ids  # BPE over space-joined AST tokens
    # pad_sequences returns shape (1, max_length); reshape keeps the batch axis explicit.
    padded = pad_sequences([ids], maxlen=max_length, padding="post")
    return np.array(padded).reshape(1, -1)
def classify_code(input_text, input_file):
    """Classify a Java snippet as "Readable" or "Unreadable".

    An uploaded file (raw bytes) takes precedence over the textbox input.
    Returns a prompt string when neither input contains any code.
    """
    code = input_file.decode("utf-8") if input_file is not None else input_text

    # Guard against empty/whitespace-only submissions.
    if not code.strip():
        return "Please provide a Java code snippet."

    features = tokenize_java_code(code)
    probability = model.predict(features)[0][0]

    # 0.52 decision threshold chosen for this model.
    return "Readable" if probability > 0.52 else "Unreadable"
# Build the Gradio UI and start serving. (A stray trailing "|" after
# .launch() was a syntax error and has been removed.)
demo = gr.Interface(
    fn=classify_code,
    inputs=[
        gr.Textbox(lines=10, placeholder="Paste Java code here...", label="Java Code Snippet"),
        gr.File(type="binary", label="Upload Java File (.java)"),
    ],
    outputs=gr.Text(label="Readability Classification"),
    title="Java Code Readability Classifier (AST + BPE)",
    description="Upload a Java file or paste a Java code snippet to check if it's readable or unreadable.",
    allow_flagging="never",
)
demo.launch()