Commit
·
a13a8c8
1
Parent(s):
dd4f105
updated with new model
Browse files
- app.py +47 -18
- cr_tokenizer.json +0 -0
- crv3.keras +2 -2
- requirements.txt +2 -0
app.py
CHANGED
@@ -1,23 +1,56 @@
|
|
1 |
import gradio as gr
|
2 |
import numpy as np
|
3 |
import tensorflow as tf
|
|
|
|
|
|
|
4 |
from tokenizers import Tokenizer
|
5 |
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
6 |
-
import re
|
7 |
|
8 |
-
#
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
def replace_java_comments(code: str) -> str:
|
13 |
"""Replaces Java comments with placeholders."""
|
14 |
-
code = re.sub(r"//.*", "
|
15 |
-
code = re.sub(r"/\*[\s\S]*?\*/", "
|
16 |
-
return code.strip() #
|
17 |
|
18 |
def tokenize_java_code(code: str, max_length=100):
|
19 |
-
"""
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
padded_sequence = pad_sequences([encoded], maxlen=max_length, padding="post")[0]
|
22 |
return np.array(padded_sequence).reshape(1, -1) # Ensure correct shape for model
|
23 |
|
@@ -32,18 +65,14 @@ def classify_code(input_text, input_file):
|
|
32 |
if not code.strip(): # Ensure input is not empty
|
33 |
return "Please provide a Java code snippet."
|
34 |
|
35 |
-
# Replace comments before tokenization
|
36 |
-
cleaned_code = replace_java_comments(code)
|
37 |
-
|
38 |
# Tokenize and predict
|
39 |
-
tokenized_code = tokenize_java_code(
|
40 |
prediction = model.predict(tokenized_code)[0][0]
|
41 |
|
42 |
-
threshold = 0.52
|
43 |
-
prediction = (prediction > threshold).astype(int) # Convert
|
44 |
|
45 |
-
|
46 |
-
return "Readable" if prediction > 0.5 else "Unreadable"
|
47 |
|
48 |
gr.Interface(
|
49 |
fn=classify_code,
|
@@ -52,7 +81,7 @@ gr.Interface(
|
|
52 |
gr.File(type="binary", label="Upload Java File (.java)")
|
53 |
],
|
54 |
outputs=gr.Text(label="Readability Classification"),
|
55 |
-
title="Java Code Readability Classifier",
|
56 |
description="Upload a Java file or paste a Java code snippet to check if it's readable or unreadable.",
|
57 |
allow_flagging="never"
|
58 |
).launch()
|
|
|
1 |
import gradio as gr
|
2 |
import numpy as np
|
3 |
import tensorflow as tf
|
4 |
+
import re
|
5 |
+
from tree_sitter import Language, Parser
|
6 |
+
import tree_sitter_languages # Pre-built parsers for multiple languages
|
7 |
from tokenizers import Tokenizer
|
8 |
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|
|
9 |
|
10 |
+
# Tokenizer and classifier are loaded once, at import time, from files
# resolved relative to the working directory.
# NOTE(review): confirm "syntax_bpe_tokenizer.json" is the tokenizer file that
# is actually shipped next to this script.
tokenizer = Tokenizer.from_file("syntax_bpe_tokenizer.json") # New BPE tokenizer
model = tf.keras.models.load_model("crv3.keras") # CNN model

# Single module-level Tree-sitter parser configured for Java; it is used by
# syntax_aware_tokenize().
# NOTE(review): Parser.set_language() availability depends on the installed
# tree_sitter version — confirm against the version that gets installed.
parser = Parser()
parser.set_language(tree_sitter_languages.get_language("java"))
|
15 |
+
|
16 |
+
def syntax_aware_tokenize(code):
    """Tokenize Java code into the text of its Tree-sitter leaf nodes.

    The code is parsed with the module-level ``parser`` and the resulting
    tree is walked depth-first, visiting children left to right. The text of
    every node that has no children is collected.

    Args:
        code (str): Java source code.

    Returns:
        list[str]: Leaf-node texts in traversal order.
    """
    tree = parser.parse(bytes(code, "utf8"))
    tokens = []

    # Walk with an explicit stack instead of recursion: deeply nested input
    # (e.g. very long chained expressions) would otherwise be able to exceed
    # Python's recursion limit and raise RecursionError.
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        if node.child_count == 0:  # Leaf node
            tokens.append(node.text.decode("utf-8"))
            continue
        # Push children reversed so they are popped in left-to-right order.
        pending.extend(reversed(node.children))

    return tokens
|
31 |
|
32 |
# One left-to-right scan that matches either a comment or a string/char
# literal. Literals are matched only so that comment markers inside them
# (e.g. "http://host") are skipped instead of being treated as comments.
_JAVA_COMMENT_OR_LITERAL = re.compile(
    r'(?P<line>//[^\n]*)'                 # single-line comment
    r'|(?P<block>/\*[\s\S]*?\*/)'         # block comment (non-greedy)
    r'|"""(?:\\[\s\S]|[^\\])*?"""'        # text block
    r'|"(?:\\.|[^"\\\n])*"'               # string literal
    r"|'(?:\\.|[^'\\\n])*'"               # char literal
)

# Placeholder text emitted for each kind of comment.
_COMMENT_PLACEHOLDERS = {"line": " // ", "block": " /**/ "}


def _comment_placeholder(match):
    """Return the placeholder for a comment match, or a literal unchanged."""
    return _COMMENT_PLACEHOLDERS.get(match.lastgroup, match.group(0))


def replace_java_comments(code: str) -> str:
    """Replace Java comments with fixed placeholders.

    Single-line comments become ``" // "`` and block comments become
    ``" /**/ "``. Comment markers that appear inside string, char, or
    text-block literals are left untouched, and a ``//`` inside a block
    comment no longer hides that comment's closing ``*/``.

    NOTE(review): if the model was trained on text produced by the earlier
    two-pass regex version, confirm this stricter handling is acceptable.

    Args:
        code (str): Java source code.

    Returns:
        str: The code with comments replaced, with leading and trailing
        whitespace of the whole string stripped.
    """
    return _JAVA_COMMENT_OR_LITERAL.sub(_comment_placeholder, code).strip()
|
37 |
|
38 |
def tokenize_java_code(code: str, max_length=100):
    """
    Convert a Java snippet into a padded row of BPE token ids.

    Comments are first replaced with placeholders, the result is split into
    syntax-tree leaf tokens, and the space-joined tokens are encoded with the
    module-level BPE tokenizer.

    Args:
        code (str): Java code snippet.
        max_length (int): Length of the padded sequence.

    Returns:
        np.array: Array of token ids with a single leading row
        (reshaped to ``(1, -1)``).
    """
    ast_tokens = syntax_aware_tokenize(replace_java_comments(code))
    token_ids = tokenizer.encode(" ".join(ast_tokens)).ids

    # Padding is appended after the tokens ("post").
    # NOTE(review): truncation is left at the library default — confirm which
    # end of an over-long sequence is dropped.
    batch = pad_sequences([token_ids], maxlen=max_length, padding="post")
    first_row = batch[0]
    return np.array(first_row).reshape(1, -1)  # Single-row batch for the model
|
56 |
|
|
|
65 |
if not code.strip(): # Ensure input is not empty
|
66 |
return "Please provide a Java code snippet."
|
67 |
|
|
|
|
|
|
|
68 |
# Tokenize and predict
|
69 |
+
tokenized_code = tokenize_java_code(code)
|
70 |
prediction = model.predict(tokenized_code)[0][0]
|
71 |
|
72 |
+
threshold = 0.52 # Adjust threshold for classification
|
73 |
+
prediction = (prediction > threshold).astype(int) # Convert probability to binary
|
74 |
|
75 |
+
return "Readable" if prediction == 1 else "Unreadable"
|
|
|
76 |
|
77 |
gr.Interface(
|
78 |
fn=classify_code,
|
|
|
81 |
gr.File(type="binary", label="Upload Java File (.java)")
|
82 |
],
|
83 |
outputs=gr.Text(label="Readability Classification"),
|
84 |
+
title="Java Code Readability Classifier (AST + BPE)",
|
85 |
description="Upload a Java file or paste a Java code snippet to check if it's readable or unreadable.",
|
86 |
allow_flagging="never"
|
87 |
).launch()
|
cr_tokenizer.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
crv3.keras
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b8ad1fb4d7eee878b3ce7282bdb8a5b428b2b940cdb615e662c649df4685f0e9
|
3 |
+
size 2357365
|
requirements.txt
CHANGED
@@ -1,3 +1,5 @@
|
|
1 |
gradio
|
2 |
tensorflow
|
3 |
tokenizers
|
|
|
|
|
|
1 |
gradio
|
2 |
tensorflow
|
3 |
tokenizers
|
4 |
+
tree_sitter
|
5 |
+
tree_sitter_languages
|