Shuu12121 committed on
Commit 6897705 · verified · 1 Parent(s): 7f67b88

Update app.py

Files changed (1)
  1. app.py +87 -35
app.py CHANGED
@@ -1,52 +1,104 @@
+# app.py
 import torch
 from transformers import AutoTokenizer, EncoderDecoderModel
+import gradio as gr
 
+# Device setup (depends on the Spaces hardware configuration)
+# When using a GPU on Spaces, CUDA becomes available automatically
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")  # log for checking the device
+
 model_name = "Shuu12121/CodeEncoderDecodeerModel-Ghost"
+print(f"Loading model: {model_name}")  # log that model loading has started
 
-# Load the tokenizers
-encoder_tokenizer = AutoTokenizer.from_pretrained(f"{model_name}/encoder_tokenizer")
-decoder_tokenizer = AutoTokenizer.from_pretrained(f"{model_name}/decoder_tokenizer")
+# --- Load the tokenizers ---
+try:
+    encoder_tokenizer = AutoTokenizer.from_pretrained(f"{model_name}/encoder_tokenizer")
+    decoder_tokenizer = AutoTokenizer.from_pretrained(f"{model_name}/decoder_tokenizer")
+    print("Tokenizers loaded successfully.")
+except Exception as e:
+    print(f"Error loading tokenizers: {e}")
+    # If an error occurs, you could add handling such as showing it in the Gradio interface
+    raise  # re-raise the error here to stop startup
 
+# Set the decoder_tokenizer's pad_token
 if decoder_tokenizer.pad_token is None:
-    decoder_tokenizer.pad_token = decoder_tokenizer.eos_token
+    if decoder_tokenizer.eos_token is not None:
+        decoder_tokenizer.pad_token = decoder_tokenizer.eos_token
+        print("Set decoder pad_token to eos_token.")
+    else:
+        # Fallback when there is no eos_token either (e.g. add a '<pad>' token)
+        decoder_tokenizer.add_special_tokens({'pad_token': '<pad>'})
+        print("Added '<pad>' as pad_token.")
+        # The model's embeddings may need to be resized in this case
+        # model.resize_token_embeddings(len(decoder_tokenizer))  # if needed
 
-model = EncoderDecoderModel.from_pretrained(model_name).to(device)
-model.eval()
+# --- Load the model ---
+try:
+    model = EncoderDecoderModel.from_pretrained(model_name).to(device)
+    model.eval()  # set to evaluation mode
+    print("Model loaded successfully and moved to device.")
+except Exception as e:
+    print(f"Error loading model: {e}")
+    raise
 
+# --- Docstring generation function ---
 def generate_docstring(code: str) -> str:
-    inputs = encoder_tokenizer(
-        code,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=2048
-    ).to(device)
-
-    with torch.no_grad():
-        output_ids = model.generate(
-            input_ids=inputs.input_ids,
-            attention_mask=inputs.attention_mask,
-            max_length=256,
-            num_beams=5,
-            early_stopping=True,
-            decoder_start_token_id=model.config.decoder_start_token_id,
-            eos_token_id=model.config.eos_token_id,
-            pad_token_id=model.config.pad_token_id,
-            no_repeat_ngram_size=2
-        )
-
-    return decoder_tokenizer.decode(output_ids[0], skip_special_tokens=True)
-
-# Gradio UI
-import gradio as gr
+    print("Received code snippet for docstring generation.")  # log the function call
+    if not code:
+        return "Please provide a code snippet."
+
+    try:
+        # Prepare the encoder inputs
+        inputs = encoder_tokenizer(
+            code,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=2048  # match the model's maximum input length (adjust if necessary)
+        ).to(device)
+
+        print(f"Input tokens length: {inputs.input_ids.shape[1]}")
+
+        # Run generation
+        with torch.no_grad():
+            output_ids = model.generate(
+                input_ids=inputs.input_ids,
+                attention_mask=inputs.attention_mask,
+                max_length=256,  # maximum length of the generated docstring
+                num_beams=5,  # number of beams for beam search
+                early_stopping=True,  # whether to stop early
+                # decoder_start_token_id is usually set automatically from model.config, but can be specified explicitly:
+                # decoder_start_token_id=model.config.decoder_start_token_id,
+                eos_token_id=decoder_tokenizer.eos_token_id,  # EOS token ID
+                pad_token_id=decoder_tokenizer.pad_token_id,  # PAD token ID
+                no_repeat_ngram_size=2  # size of n-grams that must not repeat
+            )
+
+        print(f"Generated output tokens length: {output_ids.shape[1]}")
+
+        # Decode to text
+        generated_docstring = decoder_tokenizer.decode(output_ids[0], skip_special_tokens=True)
+        print("Docstring generated successfully.")
+        return generated_docstring
+
+    except Exception as e:
+        print(f"Error during generation: {e}")
+        # Notify the user of the error
+        return f"An error occurred during generation: {e}"
 
+# --- Gradio UI ---
 iface = gr.Interface(
     fn=generate_docstring,
-    inputs=gr.Textbox(label="Code Snippet", lines=10, placeholder="Paste your function here..."),
+    inputs=gr.Textbox(label="Code Snippet", lines=10, placeholder="Paste your Python function or code block here..."),
     outputs=gr.Textbox(label="Generated Docstring"),
-    title="Code-to-Docstring Generator",
-    description="This demo uses a custom encoder-decoder model to generate docstrings from code."
+    title="Code-to-Docstring Generator (Shuu12121/CodeEncoderDecodeerModel-Ghost)",
+    description="This demo uses the Shuu12121/CodeEncoderDecodeerModel-Ghost model to automatically generate Python docstrings from code snippets. Paste your code below and click 'Submit'."
 )
 
-iface.launch()
+# --- Launch the application ---
+# share=True is not needed when running on Hugging Face Spaces
+if __name__ == "__main__":
+    print("Launching Gradio interface...")
+    iface.launch()
+    print("Gradio interface launched.")