|
import gradio as gr |
|
import numpy as np |
|
import tensorflow as tf |
|
import re |
|
from tree_sitter import Language, Parser |
|
import tree_sitter_languages |
|
from tokenizers import Tokenizer |
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
|
|
# Serialized BPE tokenizer and Keras model, both loaded once at import time
# from relative paths.
TOKENIZER_PATH = "syntax_bpe_tokenizer.json"
MODEL_PATH = "crv3.keras"

tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
model = tf.keras.models.load_model(MODEL_PATH)

# Tree-sitter parser bound to the Java grammar from tree_sitter_languages.
java_language = tree_sitter_languages.get_language("java")
parser = Parser()
parser.set_language(java_language)
|
|
|
def syntax_aware_tokenize(code):
    """Tokenize Java source into the leaf tokens of its Tree-Sitter parse tree.

    The source is parsed with the module-level ``parser`` and the text of every
    leaf node (a node with no children) is collected in document order.

    The tree is walked with an explicit stack instead of recursion so that
    deeply nested input cannot exhaust the interpreter's recursion limit.

    Args:
        code (str): Java source code.

    Returns:
        list[str]: Text of each leaf node, left to right.
    """
    tree = parser.parse(code.encode("utf8"))
    tokens = []
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        if node.child_count == 0:
            tokens.append(node.text.decode("utf-8"))
        else:
            # Push children reversed so the leftmost child is popped first,
            # which preserves source order in the output.
            pending.extend(reversed(node.children))
    return tokens
|
|
|
def replace_java_comments(code: str) -> str:
    """Collapse every Java comment in ``code`` to a fixed placeholder.

    Line comments become ``" // "`` and block comments become ``" /**/ "``;
    the result is stripped of leading and trailing whitespace.

    NOTE: matching is purely textual, so ``//`` or ``/* ... */`` sequences that
    appear inside string literals are replaced as well. Line comments are
    rewritten before block comments.
    """
    line_comment = re.compile(r"//.*")
    block_comment = re.compile(r"/\*[\s\S]*?\*/")

    without_line_comments = line_comment.sub(" // ", code)
    without_comments = block_comment.sub(" /**/ ", without_line_comments)
    return without_comments.strip()
|
|
|
def tokenize_java_code(code: str, max_length=100):
    """Turn a Java snippet into a fixed-length batch of BPE token ids.

    Comments are first replaced with placeholders, the result is split into
    AST leaf tokens, and those tokens are joined with single spaces and encoded
    by the module-level ``tokenizer``. The id sequence is then padded at the
    end up to ``max_length`` (over-long sequences are cut according to
    ``pad_sequences``' default truncation setting).

    Args:
        code (str): Java code snippet.
        max_length (int): Length of the returned sequence.

    Returns:
        np.ndarray: Array with a leading batch axis of size 1 holding the
        padded token ids.
    """
    token_text = " ".join(syntax_aware_tokenize(replace_java_comments(code)))
    token_ids = tokenizer.encode(token_text).ids

    batch = pad_sequences([token_ids], maxlen=max_length, padding="post")
    return np.array(batch[0]).reshape(1, -1)
|
|
|
def classify_code(input_text, input_file):
    """Classify a Java snippet as "Readable" or "Unreadable".

    An uploaded file takes precedence over pasted text. The chosen source is
    tokenized, passed to the module-level ``model``, and the first output
    value is compared against a fixed threshold.

    Args:
        input_text (str | None): Code pasted into the textbox.
        input_file (bytes | None): Raw contents of the uploaded file.

    Returns:
        str: "Readable" or "Unreadable", or a short message when no usable
        code was supplied.
    """
    if input_file is not None:
        try:
            # "utf-8-sig" also drops a leading byte-order mark so it neither
            # defeats the emptiness check below nor reaches the tokenizer.
            code = input_file.decode("utf-8-sig")
        except UnicodeDecodeError:
            return "Uploaded file is not valid UTF-8 text."
    else:
        # Guard against a missing textbox value instead of failing on .strip().
        code = input_text or ""

    if not code.strip():
        return "Please provide a Java code snippet."

    tokenized_code = tokenize_java_code(code)
    score = model.predict(tokenized_code)[0][0]

    # Scores strictly above the threshold are reported as readable.
    threshold = 0.52
    return "Readable" if score > threshold else "Unreadable"
|
|
|
# Input and output widgets for the classifier UI.
code_box = gr.Textbox(
    lines=10,
    placeholder="Paste Java code here...",
    label="Java Code Snippet",
)
file_box = gr.File(type="binary", label="Upload Java File (.java)")
result_box = gr.Text(label="Readability Classification")

# Wire the widgets to classify_code and start the app as soon as the module runs.
demo = gr.Interface(
    fn=classify_code,
    inputs=[code_box, file_box],
    outputs=result_box,
    title="Java Code Readability Classifier (AST + BPE)",
    description="Upload a Java file or paste a Java code snippet to check if it's readable or unreadable.",
    allow_flagging="never",
)
demo.launch()