Circhastic committed
Commit a13a8c8 · 1 Parent(s): dd4f105

updated with new model

Files changed (4)
  1. app.py +47 -18
  2. cr_tokenizer.json +0 -0
  3. crv3.keras +2 -2
  4. requirements.txt +2 -0
app.py CHANGED
@@ -1,23 +1,56 @@
 import gradio as gr
 import numpy as np
 import tensorflow as tf
+import re
+from tree_sitter import Language, Parser
+import tree_sitter_languages # Pre-built parsers for multiple languages
 from tokenizers import Tokenizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
-import re
 
-# Load trained tokenizer and model
-tokenizer = Tokenizer.from_file("cr_tokenizer.json")
-model = tf.keras.models.load_model("crv3.keras")
+tokenizer = Tokenizer.from_file("syntax_bpe_tokenizer.json") # New BPE tokenizer
+model = tf.keras.models.load_model("crv3.keras") # CNN model
+
+parser = Parser()
+parser.set_language(tree_sitter_languages.get_language("java"))
+
+def syntax_aware_tokenize(code):
+    """Tokenizes Java code using Tree-Sitter (AST-based)."""
+    tree = parser.parse(bytes(code, "utf8"))
+    root_node = tree.root_node
+    tokens = []
+
+    def extract_tokens(node):
+        """Recursively extracts tokens from AST."""
+        if node.child_count == 0: # Leaf node
+            tokens.append(node.text.decode("utf-8"))
+        for child in node.children:
+            extract_tokens(child)
+
+    extract_tokens(root_node)
+    return tokens # Returns structured syntax tokens
 
 def replace_java_comments(code: str) -> str:
     """Replaces Java comments with placeholders."""
-    code = re.sub(r"//.*", " SINGLE_LINE_COMMENT ", code) # Replace single-line comments
-    code = re.sub(r"/\*[\s\S]*?\*/", " MULTI_LINE_COMMENT ", code) # Replace multi-line comments
-    return code.strip() # Keep indentation
+    code = re.sub(r"//.*", " // ", code) # Replace single-line comments
+    code = re.sub(r"/\*[\s\S]*?\*/", " /**/ ", code) # Replace multi-line comments
+    return code.strip() # Preserve indentation and code structure
 
 def tokenize_java_code(code: str, max_length=100):
-    """Tokenizes and pads Java code for model input."""
-    encoded = tokenizer.encode(code).ids
+    """
+    Tokenizes and pads Java code using AST tokenization + BPE.
+
+    Args:
+        code (str): Java code snippet.
+        max_length (int): Maximum sequence length.
+
+    Returns:
+        np.array: Tokenized and padded sequence.
+    """
+    cleaned_code = replace_java_comments(code) # Preprocess comments
+    syntax_tokens = syntax_aware_tokenize(cleaned_code) # Extract AST tokens
+    encoded = tokenizer.encode(" ".join(syntax_tokens)).ids # Apply BPE
+
+    # Pad the sequence
     padded_sequence = pad_sequences([encoded], maxlen=max_length, padding="post")[0]
     return np.array(padded_sequence).reshape(1, -1) # Ensure correct shape for model
 
@@ -32,18 +65,14 @@ def classify_code(input_text, input_file):
     if not code.strip(): # Ensure input is not empty
         return "Please provide a Java code snippet."
 
-    # Replace comments before tokenization
-    cleaned_code = replace_java_comments(code)
-
     # Tokenize and predict
-    tokenized_code = tokenize_java_code(cleaned_code)
+    tokenized_code = tokenize_java_code(code)
     prediction = model.predict(tokenized_code)[0][0]
 
-    threshold = 0.52 # Increase the threshold for "Readable"
-    prediction = (prediction > threshold).astype(int) # Convert probabilities to binary
+    threshold = 0.52 # Adjust threshold for classification
+    prediction = (prediction > threshold).astype(int) # Convert probability to binary
 
-    # Convert to readable/unreadable
-    return "Readable" if prediction > 0.5 else "Unreadable"
+    return "Readable" if prediction == 1 else "Unreadable"
 
 gr.Interface(
     fn=classify_code,
@@ -52,7 +81,7 @@ gr.Interface(
         gr.File(type="binary", label="Upload Java File (.java)")
     ],
     outputs=gr.Text(label="Readability Classification"),
-    title="Java Code Readability Classifier",
+    title="Java Code Readability Classifier (AST + BPE)",
     description="Upload a Java file or paste a Java code snippet to check if it's readable or unreadable.",
     allow_flagging="never"
 ).launch()
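
To sanity-check the new AST + BPE preprocessing path outside the Gradio app, here is a minimal sketch. It mirrors the parser setup in app.py; the sample snippet is hypothetical, and it assumes the prebuilt Java grammar from tree_sitter_languages matches the installed tree_sitter version and that syntax_bpe_tokenizer.json is present in the working directory.

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tokenizers import Tokenizer
from tree_sitter import Parser
import tree_sitter_languages

# Same parser setup as app.py (set_language works with the tree_sitter
# versions that tree_sitter_languages supports).
parser = Parser()
parser.set_language(tree_sitter_languages.get_language("java"))

def leaf_tokens(code: str) -> list:
    """Collect the leaf-node texts of the Tree-sitter parse, depth-first."""
    tree = parser.parse(bytes(code, "utf8"))
    tokens = []

    def walk(node):
        if node.child_count == 0:  # leaf = concrete syntax token
            tokens.append(node.text.decode("utf-8"))
        for child in node.children:
            walk(child)

    walk(tree.root_node)
    return tokens

sample = "class Foo { int add(int a, int b) { return a + b; } }"  # hypothetical snippet
tokens = leaf_tokens(sample)
print(tokens)  # e.g. ['class', 'Foo', '{', 'int', 'add', '(', 'int', 'a', ...]

# Apply the repo's BPE tokenizer to the space-joined syntax tokens, then pad
# to the fixed length the CNN expects.
tokenizer = Tokenizer.from_file("syntax_bpe_tokenizer.json")
ids = tokenizer.encode(" ".join(tokens)).ids
x = np.array(pad_sequences([ids], maxlen=100, padding="post")[0]).reshape(1, -1)
print(x.shape)  # (1, 100)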
cr_tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
crv3.keras CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:da8d9f9c0924cdf329c5a6ef6ac51827944788a238e521fb44b06b87b7d48a8d
-size 11959228
+oid sha256:b8ad1fb4d7eee878b3ce7282bdb8a5b428b2b940cdb615e662c649df4685f0e9
+size 2357365
requirements.txt CHANGED
@@ -1,3 +1,5 @@
 gradio
 tensorflow
 tokenizers
+tree_sitter
+tree_sitter_languages
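
After installing the updated requirements, a quick smoke test that the two new dependencies resolve and can load the bundled Java grammar (a sketch; assumes tree_sitter_languages exposes get_parser for "java"):

import tree_sitter_languages

# Load the prebuilt Java parser shipped with tree_sitter_languages.
java_parser = tree_sitter_languages.get_parser("java")
tree = java_parser.parse(b"class Ping { }")
print(tree.root_node.type)  # expected: "program" for a well-formed snippet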