朱东升 committed on
Commit
74d43a2
·
1 Parent(s): 4f32597

requirements update7

Browse files
app.py CHANGED
@@ -5,10 +5,13 @@ import os
5
  import sys
6
  from pathlib import Path
7
 
8
- # 添加当前目录到模块搜索路径,确保可以导入src目录下的模块
9
  current_dir = os.path.dirname(os.path.abspath(__file__))
 
10
  if current_dir not in sys.path:
11
  sys.path.append(current_dir)
 
 
12
 
13
  def evaluate(input_data):
14
  """评估代码的主函数
@@ -39,7 +42,8 @@ def evaluate(input_data):
39
 
40
  # 评估所有完成的代码
41
  results = []
42
- for code in completions:
 
43
  result = evaluate_code(code, language)
44
  results.append(result)
45
 
@@ -68,12 +72,7 @@ def evaluate_code(code, language):
68
  try:
69
  # 动态导入对应语言的评估模块
70
  module_name = f"src.eval_{language.lower()}"
71
- try:
72
- module = importlib.import_module(module_name)
73
- except ImportError:
74
- # 尝试不带src前缀导入
75
- module_name = f"eval_{language.lower()}"
76
- module = importlib.import_module(module_name)
77
 
78
  # 创建临时文件存储代码
79
  temp_dir = Path("temp")
 
5
  import sys
6
  from pathlib import Path
7
 
8
+ # 添加当前目录和src目录到模块搜索路径
9
  current_dir = os.path.dirname(os.path.abspath(__file__))
10
+ src_dir = os.path.join(current_dir, "src")
11
  if current_dir not in sys.path:
12
  sys.path.append(current_dir)
13
+ if src_dir not in sys.path:
14
+ sys.path.append(src_dir)
15
 
16
  def evaluate(input_data):
17
  """评估代码的主函数
 
42
 
43
  # 评估所有完成的代码
44
  results = []
45
+ for comp in completions:
46
+ code = input_data.get('prompt') + comp + '\n' + input_data.get('tests')
47
  result = evaluate_code(code, language)
48
  results.append(result)
49
 
 
72
  try:
73
  # 动态导入对应语言的评估模块
74
  module_name = f"src.eval_{language.lower()}"
75
+ module = importlib.import_module(module_name)
 
 
 
 
 
76
 
77
  # 创建临时文件存储代码
78
  temp_dir = Path("temp")
src/generic_eval.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is a helper script for evaluating benchmarks that have been translated to
2
+ # different languages.
3
+ #
4
+ # To use this script, call eval_lang.py.
5
+ # The --directory argument is required, and tells the script where the benchmarks are located.
6
+ # The --files argument is optional, and takes a list of numbers corresponding to the files to be evaluated.
7
+ #
8
+ # The script will print the results on each benchmark, and also write to results/lang.csv.
9
+ # When the script completes, it will print a summary.
10
+ #
11
+ # Examples
12
+ #
13
+ # To run the entire benchmark suite:
14
+ # python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/
15
+ #
16
+ # To run benchmarks 1, 2, and 3:
17
+ # python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ --files 1 2 3
18
+
19
+ import argparse
20
+ from sys import exit as sysexit
21
+ from pathlib import Path
22
+ import sys
23
+
24
def list_files(directory, ext):
    """Return the benchmark files in ``directory``, indexed by benchmark number.

    Files are matched with the glob ``HumanEval_*{ext}`` and are assumed to be
    named ``HumanEval_X_*`` where ``X`` is a non-negative integer.

    Args:
        directory: A ``pathlib.Path`` to search (non-recursively).
        ext: File extension, including the leading dot (e.g. ``".php"``).

    Returns:
        A list whose index ``X`` holds the path of benchmark ``X``, or ``None``
        when that benchmark is missing. The list is empty when no file in the
        directory matches.

    Raises:
        ValueError: If a matching file name does not carry an integer in its
            second ``_``-separated field.
    """
    def key(path):
        return int(path.name.split("_")[1])

    files_sorted = sorted(directory.glob(f"HumanEval_*{ext}"), key=key)

    # Nothing matched: return an empty list instead of indexing files_sorted[-1],
    # so callers can detect the situation with a simple length check.
    if not files_sorted:
        return []

    # Assumption: there may be missing files, but no extra files. The result is
    # sized by the highest benchmark number so that index == benchmark number.
    files_array = [None] * (key(files_sorted[-1]) + 1)
    for path in files_sorted:
        files_array[key(path)] = path

    return files_array
42
+
43
def main(eval_script, language, extension):
    """Evaluate benchmark files from the command line and record the results.

    Parses ``--directory`` (required) and ``--files`` (optional list of
    benchmark numbers) from ``sys.argv``, runs ``eval_script`` on each selected
    file, prints one ``language,stem,status`` line per file and writes the same
    lines to ``../results/<language>.csv`` relative to this file. A summary
    line is printed at the end.

    Args:
        eval_script: Callable taking a file path and returning a mapping with
            a ``'status'`` key.
        language: Language name; used in each output line and, lower-cased,
            as the CSV file name.
        extension: File extension passed to ``list_files``.

    Benchmark numbers given via ``--files`` that are negative or beyond the
    highest existing benchmark are reported as missing rather than raising
    ``IndexError`` or silently selecting a file from the end of the list.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--directory", type=str, required=True, help="Directory to read benchmarks from"
    )
    parser.add_argument(
        "--files",
        type=int,
        nargs="*",
        default=[],
        help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
    )
    args = parser.parse_args()

    directory = Path(args.directory).resolve()

    files_sorted = list_files(directory, extension)

    # the directory you specified does not contain the right language
    if len(files_sorted) == 0:
        print(f'The specified directory does not contain files of type {extension}')
        sysexit(1)

    # An empty --files list means "evaluate everything".
    files_index = args.files if args.files else range(len(files_sorted))

    total = 0
    passed = 0
    syntax_error = 0

    results_file = Path(Path(__file__).parent, "..", "results", language.lower() + ".csv").resolve()
    # open(..., "w") does not create missing parent directories.
    results_file.parent.mkdir(parents=True, exist_ok=True)

    with open(results_file, "w") as f:
        for i in files_index:
            filepath = files_sorted[i] if 0 <= i < len(files_sorted) else None
            if filepath is None:
                print("File {} does not exist!".format(i))
                continue
            res = eval_script(filepath)
            output = f"{language},{filepath.stem},{res['status']}\n"
            f.write(output)
            print(output, end="")
            total += 1
            if res['status'] == "OK":
                passed += 1
            elif res['status'] == "SyntaxError":
                syntax_error += 1
    print(f"Total {total}, Syntax Error {syntax_error}, Passed {passed}")
95
+
96
+
97
+
98
def main_check_stubs(check_script, language, extension):
    """Run a stub check over benchmark files and exit non-zero on any failure.

    Parses ``--directory`` (required) and ``--files`` (optional list of
    benchmark numbers) from ``sys.argv``, runs ``check_script`` on each
    selected file, prints one ``language,stem,status`` line per file and writes
    the same lines to ``../check_results/<language>.csv`` relative to this
    file. After printing a summary, the process exits with status 1 unless
    every checked file reported ``"OK"``.

    Args:
        check_script: Callable taking a file path and returning a mapping with
            a ``'status'`` key.
        language: Language name; used in each output line and, lower-cased,
            as the CSV file name.
        extension: File extension passed to ``list_files``.

    Benchmark numbers given via ``--files`` that are negative or beyond the
    highest existing benchmark are reported as missing rather than raising
    ``IndexError`` or silently selecting a file from the end of the list.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--directory", type=str, required=True, help="Directory to read benchmarks from"
    )
    parser.add_argument(
        "--files",
        type=int,
        nargs="*",
        default=[],
        help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
    )
    args = parser.parse_args()

    directory = Path(args.directory).resolve()

    files_sorted = list_files(directory, extension)

    # the directory you specified does not contain the right language
    if len(files_sorted) == 0:
        print(f'The specified directory does not contain files of type {extension}')
        sysexit(1)

    # An empty --files list means "check everything".
    files_index = args.files if args.files else range(len(files_sorted))

    total = 0
    passed = 0

    results_file = Path(Path(__file__).parent, "..", "check_results", language.lower() + ".csv").resolve()
    # open(..., "w") does not create missing parent directories.
    results_file.parent.mkdir(parents=True, exist_ok=True)

    with open(results_file, "w") as f:
        for i in files_index:
            filepath = files_sorted[i] if 0 <= i < len(files_sorted) else None
            if filepath is None:
                print("File {} does not exist!".format(i))
                continue
            res = check_script(filepath)
            output = f"{language},{filepath.stem},{res['status']}\n"
            f.write(output)
            print(output, end="")
            total += 1
            if res['status'] == "OK":
                passed += 1
    print(f"Total {total}, Passed {passed}")

    if total != passed:
        sys.exit(1)
src/safe_subprocess/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ /__pycache__
2
+ /.pytest_cache
src/safe_subprocess/__init__.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import signal
3
+ import fcntl
4
+ import time
5
+ import subprocess
6
+ from typing import List
7
+
8
+ MAX_BYTES_PER_READ = 1024
9
+ SLEEP_BETWEEN_READS = 0.1
10
+
11
+
12
+ class Result:
13
+ timeout: int
14
+ exit_code: int
15
+ stdout: str
16
+ stderr: str
17
+
18
+ def __init__(self, timeout, exit_code, stdout, stderr):
19
+ self.timeout = timeout
20
+ self.exit_code = exit_code
21
+ self.stdout = stdout
22
+ self.stderr = stderr
23
+
24
+
25
+ def set_nonblocking(reader):
26
+ fd = reader.fileno()
27
+ fl = fcntl.fcntl(fd, fcntl.F_GETFL)
28
+ fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
29
+
30
+
31
+ def run(
32
+ args: List[str],
33
+ timeout_seconds: int = 15,
34
+ max_output_size: int = 2048,
35
+ env = None,
36
+ cwd: str | None = None
37
+ ) -> Result:
38
+ """
39
+ Runs the given program with arguments. After the timeout elapses, kills the process
40
+ and all other processes in the process group. Captures at most max_output_size bytes
41
+ of stdout and stderr each, and discards any output beyond that.
42
+ """
43
+ p = subprocess.Popen(
44
+ args,
45
+ env=env,
46
+ stdin=subprocess.DEVNULL,
47
+ stdout=subprocess.PIPE,
48
+ stderr=subprocess.PIPE,
49
+ start_new_session=True,
50
+ bufsize=MAX_BYTES_PER_READ,
51
+ cwd=cwd
52
+ )
53
+ set_nonblocking(p.stdout)
54
+ set_nonblocking(p.stderr)
55
+
56
+ process_group_id = os.getpgid(p.pid)
57
+
58
+ # We sleep for 0.1 seconds in each iteration.
59
+ max_iterations = timeout_seconds * 10
60
+ stdout_saved_bytes = []
61
+ stderr_saved_bytes = []
62
+ stdout_bytes_read = 0
63
+ stderr_bytes_read = 0
64
+
65
+ for _ in range(max_iterations):
66
+ this_stdout_read = p.stdout.read(MAX_BYTES_PER_READ)
67
+ this_stderr_read = p.stderr.read(MAX_BYTES_PER_READ)
68
+ # this_stdout_read and this_stderr_read may be None if stdout or stderr
69
+ # are closed. Without these checks, test_close_output fails.
70
+ if this_stdout_read is not None and stdout_bytes_read < max_output_size:
71
+ stdout_saved_bytes.append(this_stdout_read)
72
+ stdout_bytes_read += len(this_stdout_read)
73
+ if this_stderr_read is not None and stderr_bytes_read < max_output_size:
74
+ stderr_saved_bytes.append(this_stderr_read)
75
+ stderr_bytes_read += len(this_stderr_read)
76
+ exit_code = p.poll()
77
+ if exit_code is not None:
78
+ break
79
+ time.sleep(SLEEP_BETWEEN_READS)
80
+
81
+ try:
82
+ # Kills the process group. Without this line, test_fork_once fails.
83
+ os.killpg(process_group_id, signal.SIGKILL)
84
+ except ProcessLookupError:
85
+ pass
86
+
87
+ timeout = exit_code is None
88
+ exit_code = exit_code if exit_code is not None else -1
89
+ stdout = b"".join(stdout_saved_bytes).decode("utf-8", errors="ignore")
90
+ stderr = b"".join(stderr_saved_bytes).decode("utf-8", errors="ignore")
91
+ return Result(timeout=timeout, exit_code=exit_code, stdout=stdout, stderr=stderr)
src/safe_subprocess/evil_programs/block_on_inputs.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# Test fixture: reads a line from stdin over and over, without end.
while True:
    input()
src/safe_subprocess/evil_programs/close_outputs.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Test fixture: prints one line, closes both output streams, then spins forever.
import sys

print("This is the end")
sys.stdout.close()
sys.stderr.close()
while True:
    pass
src/safe_subprocess/evil_programs/fork_bomb.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Test fixture: forks in an endless loop; every forked process keeps forking too.
import os

while True:
    os.fork()
src/safe_subprocess/evil_programs/fork_once.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Test fixture: the parent falls through and exits right away, while the single
# forked child sleeps forever.
import os
import time

child_pid = os.fork()
if child_pid == 0:
    while True:
        time.sleep(60)
src/safe_subprocess/evil_programs/sleep_forever.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Test fixture: never exits and never produces output; it just sleeps in a loop.
import time

while True:
    time.sleep(60)
src/safe_subprocess/evil_programs/unbounded_output.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Test fixture: prints alternating True / False lines without end.
flag = True
while True:
    print(flag)
    flag = not flag
src/safe_subprocess/module_test.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from safe_subprocess import run
2
+ import time
3
+ from pathlib import Path
4
+
5
+ ROOT = Path(__file__).resolve().parent / "evil_programs"
6
+
7
+
8
def assert_no_running_evil():
    """Fail if pgrep still finds a process whose command line matches ROOT."""
    pgrep = run(["pgrep", "-f", ROOT], timeout_seconds=1, max_output_size=1024)
    # The check expects pgrep to report "nothing matched": status 1, no output.
    assert pgrep.exit_code == 1, f"There are still evil processes running: {pgrep.stdout}"
    assert len(pgrep.stderr) == 0
    assert len(pgrep.stdout) == 0
17
+
18
+
19
def test_fork_once():
    """run() reports a clean exit and leaves no forked child behind."""
    # The program exits cleanly and immediately. But, it forks a child that runs
    # forever.
    result = run(
        ["python3", ROOT / "fork_once.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == 0
    assert not result.timeout
    assert result.stderr == ""
    assert result.stdout == ""
    assert_no_running_evil()
32
+
33
+
34
def test_close_outputs():
    """run() times out a program that closed its outputs, keeping what it printed."""
    # The program prints to stdout, closes its output, and then runs forever.
    result = run(
        ["python3", ROOT / "close_outputs.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == -1
    assert result.timeout
    assert result.stderr == ""
    assert result.stdout == "This is the end\n"
    assert_no_running_evil()
46
+
47
+
48
def test_unbounded_output():
    """run() caps captured stdout at max_output_size for an endless printer."""
    result = run(
        ["python3", ROOT / "unbounded_output.py"],
        timeout_seconds=3,
        max_output_size=1024,
    )
    assert result.exit_code == -1
    assert result.timeout
    assert result.stderr == ""
    assert len(result.stdout) == 1024
    assert_no_running_evil()
59
+
60
+
61
def test_sleep_forever():
    """run() times out and kills a program that only sleeps."""
    result = run(
        ["python3", ROOT / "sleep_forever.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == -1
    assert result.timeout
    assert result.stderr == ""
    assert result.stdout == ""
    assert_no_running_evil()
72
+
73
+
74
def test_fork_bomb():
    """run() times out a fork bomb and leaves none of its processes running."""
    result = run(
        ["python3", ROOT / "fork_bomb.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == -1
    assert result.timeout
    assert result.stderr == ""
    assert result.stdout == ""
    # Unfortunately, this sleep seems to be necessary. My theories:
    # 1. os.killpg doesn't block until the whole process group is dead.
    # 2. pgrep can produce stale output
    time.sleep(2)
    assert_no_running_evil()
89
+
90
+
91
def test_block_on_inputs():
    """A program that reads stdin fails with an EOF error instead of hanging."""
    # We run the subprocess with /dev/null as input. So, any program that tries
    # to read input will error.
    result = run(
        ["python3", ROOT / "block_on_inputs.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == 1
    assert not result.timeout
    assert result.stdout == ""
    assert "EOF when reading a line" in result.stderr
    assert_no_running_evil()