Circhastic committed on
Commit
98ba9be
·
1 Parent(s): 9ac8247

updated app to replace comments

Browse files
Files changed (1) hide show
  1. app.py +12 -5
app.py CHANGED
@@ -3,19 +3,24 @@ import numpy as np
3
  import tensorflow as tf
4
  from tokenizers import Tokenizer
5
  from tensorflow.keras.preprocessing.sequence import pad_sequences
 
6
 
7
  # Load trained tokenizer and model
8
  tokenizer = Tokenizer.from_file("cr_tokenizer.json")
9
  model = tf.keras.models.load_model("crv3.keras")
10
 
11
- # Tokenization function
 
 
 
 
 
12
  def tokenize_java_code(code: str, max_length=100):
13
  """Tokenizes and pads Java code for model input."""
14
  encoded = tokenizer.encode(code).ids
15
  padded_sequence = pad_sequences([encoded], maxlen=max_length, padding="post")[0]
16
  return np.array(padded_sequence).reshape(1, -1) # Ensure correct shape for model
17
 
18
- # Prediction function
19
  def classify_code(input_text, input_file):
20
  """Classifies Java code readability based on user input."""
21
  # Load Java file if provided
@@ -27,21 +32,23 @@ def classify_code(input_text, input_file):
27
  if not code.strip(): # Ensure input is not empty
28
  return "Please provide a Java code snippet."
29
 
 
 
 
30
  # Tokenize and predict
31
- tokenized_code = tokenize_java_code(code)
32
  prediction = model.predict(tokenized_code)[0][0]
33
 
34
  # Convert to readable/unreadable
35
  return "Readable" if prediction > 0.5 else "Unreadable"
36
 
37
- # Create Gradio interface
38
  gr.Interface(
39
  fn=classify_code,
40
  inputs=[
41
  gr.Textbox(lines=10, placeholder="Paste Java code here...", label="Java Code Snippet"),
42
  gr.File(type="binary", label="Upload Java File (.java)")
43
  ],
44
- outputs=gr.Text(label="Readability Prediction"),
45
  title="Java Code Readability Classifier",
46
  description="Upload a Java file or paste a Java code snippet to check if it's readable or unreadable.",
47
  allow_flagging="never"
 
3
  import tensorflow as tf
4
  from tokenizers import Tokenizer
5
  from tensorflow.keras.preprocessing.sequence import pad_sequences
6
+ import re
7
 
8
  # Load trained tokenizer and model
9
  tokenizer = Tokenizer.from_file("cr_tokenizer.json")
10
  model = tf.keras.models.load_model("crv3.keras")
11
 
12
def replace_java_comments(code: str) -> str:
    """Replace Java comments in *code* with placeholder tokens.

    Each ``// ...`` comment becomes ``" SINGLE_LINE_COMMENT "`` and each
    ``/* ... */`` comment becomes ``" MULTI_LINE_COMMENT "``. The source is
    scanned in a single left-to-right pass so that:

    * a ``//`` inside a block comment does not swallow the closing ``*/``;
    * ``//`` or ``/*`` inside a string or character literal (for example
      ``"http://host"``) is left untouched.

    Args:
        code: Java source text.

    Returns:
        The text with comments replaced, with leading and trailing
        whitespace of the whole text removed (interior indentation is kept).
    """
    # Literals are listed first so they win at any position where they start;
    # they are matched only to be copied through unchanged.
    token_pattern = re.compile(
        r'"(?:\\.|[^"\\\n])*"'          # string literal
        r"|'(?:\\.|[^'\\\n])*'"         # character literal
        r"|(?P<line>//[^\n]*)"          # single-line comment (newline is kept)
        r"|(?P<block>/\*[\s\S]*?\*/)"   # multi-line comment
    )

    def _placeholder(match):
        if match.group("line") is not None:
            return " SINGLE_LINE_COMMENT "
        if match.group("block") is not None:
            return " MULTI_LINE_COMMENT "
        return match.group(0)  # string/char literal: keep verbatim

    # NOTE: strip() trims the ends of the whole text, so indentation on the
    # first line is dropped; indentation of later lines is preserved.
    return token_pattern.sub(_placeholder, code).strip()
17
+
18
  def tokenize_java_code(code: str, max_length=100):
19
  """Tokenizes and pads Java code for model input."""
20
  encoded = tokenizer.encode(code).ids
21
  padded_sequence = pad_sequences([encoded], maxlen=max_length, padding="post")[0]
22
  return np.array(padded_sequence).reshape(1, -1) # Ensure correct shape for model
23
 
 
24
  def classify_code(input_text, input_file):
25
  """Classifies Java code readability based on user input."""
26
  # Load Java file if provided
 
32
  if not code.strip(): # Ensure input is not empty
33
  return "Please provide a Java code snippet."
34
 
35
+ # Replace comments before tokenization
36
+ cleaned_code = replace_java_comments(code)
37
+
38
  # Tokenize and predict
39
+ tokenized_code = tokenize_java_code(cleaned_code)
40
  prediction = model.predict(tokenized_code)[0][0]
41
 
42
  # Convert to readable/unreadable
43
  return "Readable" if prediction > 0.5 else "Unreadable"
44
 
 
45
  gr.Interface(
46
  fn=classify_code,
47
  inputs=[
48
  gr.Textbox(lines=10, placeholder="Paste Java code here...", label="Java Code Snippet"),
49
  gr.File(type="binary", label="Upload Java File (.java)")
50
  ],
51
+ outputs=gr.Text(label="Readability Classification"),
52
  title="Java Code Readability Classifier",
53
  description="Upload a Java file or paste a Java code snippet to check if it's readable or unreadable.",
54
  allow_flagging="never"