Spaces:

dongsheng
/

docker_test

Sleeping

App Files Files Community

朱东升 committed on Mar 12

Commit

74d43a2

1 Parent(s): 4f32597

requirements update7

Browse files

Files changed (11) hide show

app.py +7 -8
src/generic_eval.py +149 -0
src/safe_subprocess/.gitignore +2 -0
src/safe_subprocess/__init__.py +91 -0
src/safe_subprocess/evil_programs/block_on_inputs.py +2 -0
src/safe_subprocess/evil_programs/close_outputs.py +7 -0
src/safe_subprocess/evil_programs/fork_bomb.py +4 -0
src/safe_subprocess/evil_programs/fork_once.py +6 -0
src/safe_subprocess/evil_programs/sleep_forever.py +4 -0
src/safe_subprocess/evil_programs/unbounded_output.py +4 -0
src/safe_subprocess/module_test.py +103 -0

app.py CHANGED Viewed

@@ -5,10 +5,13 @@ import os
 import sys
 from pathlib import Path
-# 添加当前目录到模块搜索路径，确保可以导入src目录下的模块
 current_dir = os.path.dirname(os.path.abspath(__file__))
 if current_dir not in sys.path:
     sys.path.append(current_dir)
 def evaluate(input_data):
     """评估代码的主函数
@@ -39,7 +42,8 @@ def evaluate(input_data):
             # 评估所有完成的代码
             results = []
-            for code in completions:
                 result = evaluate_code(code, language)
                 results.append(result)
@@ -68,12 +72,7 @@ def evaluate_code(code, language):
     try:
         # 动态导入对应语言的评估模块
         module_name = f"src.eval_{language.lower()}"
-        try:
-            module = importlib.import_module(module_name)
-        except ImportError:
-            # 尝试不带src前缀导入
-            module_name = f"eval_{language.lower()}"
-            module = importlib.import_module(module_name)
         # 创建临时文件存储代码
         temp_dir = Path("temp")

 import sys
 from pathlib import Path
+# 添加当前目录和src目录到模块搜索路径
 current_dir = os.path.dirname(os.path.abspath(__file__))
+src_dir = os.path.join(current_dir, "src")
 if current_dir not in sys.path:
     sys.path.append(current_dir)
+if src_dir not in sys.path:
+    sys.path.append(src_dir)
 def evaluate(input_data):
     """评估代码的主函数
             # 评估所有完成的代码
             results = []
+            for comp in completions:
+                code = input_data.get('prompt') + comp + '\n' + input_data.get('tests')
                 result = evaluate_code(code, language)
                 results.append(result)
     try:
         # 动态导入对应语言的评估模块
         module_name = f"src.eval_{language.lower()}"
+        module = importlib.import_module(module_name)
         # 创建临时文件存储代码
         temp_dir = Path("temp")

src/generic_eval.py ADDED Viewed

	@@ -0,0 +1,149 @@

+# This is a helper script for evaluating benchmarks that have been translated to
+# different languages.
+#
+# To use this script, call the per-language script eval_<lang>.py (for example, eval_php.py).
+# The --directory argument is required, and tells the script where the benchmarks are located.
+# The --files argument is optional, and takes a list of numbers corresponding to the files to be evaluated.
+#
+# The script will print the result for each benchmark, and also write it to results/<lang>.csv (for example, results/php.csv).
+# When the script completes, it will print a summary.
+#
+# Examples
+#
+# To run the entire benchmark suite:
+#   python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/
+#
+# To run benchmarks 1, 2, and 3:
+#   python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ --files 1 2 3
+import argparse
+from sys import exit as sysexit
+from pathlib import Path
+import sys
def list_files(directory, ext):
    """Return the benchmark files in ``directory`` indexed by benchmark number.

    Files are expected to be named ``HumanEval_<N>_*<ext>`` where ``<N>`` is
    an integer. The returned list has the file for benchmark ``N`` at index
    ``N`` and ``None`` at every index whose benchmark file is missing.

    Args:
        directory: A ``pathlib.Path`` (anything with a ``glob`` method) to search.
        ext: File extension, including the leading dot, e.g. ``".php"``.

    Returns:
        A list of paths / ``None`` placeholders, or an empty list when no file
        matches (previously this case raised ``IndexError``, which made the
        callers' "no files of this type" check unreachable).

    Raises:
        ValueError: If a matching file's second ``_``-separated field is not
            an integer.
    """
    def benchmark_number(path):
        # assumption: base filenames are in the format HumanEval_X_*
        return int(path.name.split("_")[1])

    by_number = {
        benchmark_number(path): path
        for path in directory.glob(f"HumanEval_*{ext}")
    }
    if not by_number:
        return []
    # There may be gaps in the numbering; dict.get fills them with None.
    return [by_number.get(number) for number in range(max(by_number) + 1)]
def _parse_benchmark_args():
    """Parse the ``--directory`` / ``--files`` options shared by both entry points."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--directory", type=str, required=True, help="Directory to read benchmarks from"
    )
    parser.add_argument(
        "--files",
        type=int,
        nargs="*",
        default=[],
        help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
    )
    return parser.parse_args()


def _run_on_benchmarks(script, language, extension, results_dir_name):
    """Apply ``script`` to the selected benchmark files and log one CSV row each.

    Command-line arguments are read via ``_parse_benchmark_args``. Every row
    ``<language>,<file stem>,<status>`` is both printed and written to
    ``<this file's parent>/../<results_dir_name>/<language lowercased>.csv``.

    Args:
        script: Callable taking a benchmark path and returning a mapping with
            a ``'status'`` entry.
        language: Language label used in the CSV rows and the CSV file name.
        extension: Benchmark file extension passed on to ``list_files``.
        results_dir_name: Name of the directory that receives the CSV file.

    Returns:
        The list of ``'status'`` values, one per benchmark that was run.
    """
    args = _parse_benchmark_args()
    directory = Path(args.directory).resolve()
    files_sorted = list_files(directory, extension)
    # the directory you specified does not contain the right language
    if len(files_sorted) == 0:
        print(f'The specified directory does not contain files of type {extension}')
        sysexit(1)
    files_index = args.files if args.files else range(len(files_sorted))
    results_file = Path(
        Path(__file__).parent, "..", results_dir_name, language.lower() + ".csv"
    ).resolve()
    # A fresh checkout may not have the results directory yet.
    results_file.parent.mkdir(parents=True, exist_ok=True)
    statuses = []
    with open(results_file, "w") as f:
        for i in files_index:
            # Numbers outside the list (too large, or negative) are reported as
            # missing instead of raising IndexError or wrapping around.
            filepath = files_sorted[i] if 0 <= i < len(files_sorted) else None
            if filepath is None:
                print("File {} does not exist!".format(i))
                continue
            res = script(filepath)
            output = f"{language},{filepath.stem},{res['status']}\n"
            f.write(output)
            print(output, end="")
            statuses.append(res['status'])
    return statuses


def main(eval_script, language, extension):
    """Evaluate benchmarks with ``eval_script`` and print a pass/syntax-error summary.

    Args:
        eval_script: Callable taking a benchmark path and returning a mapping
            whose ``'status'`` is e.g. ``"OK"`` or ``"SyntaxError"``.
        language: Language label; rows are written to ``results/<language>.csv``.
        extension: Benchmark file extension, e.g. ``".php"``.
    """
    statuses = _run_on_benchmarks(eval_script, language, extension, "results")
    total = len(statuses)
    passed = statuses.count("OK")
    syntax_error = statuses.count("SyntaxError")
    print (f"Total {total}, Syntax Error {syntax_error}, Passed {passed}")


def main_check_stubs(check_script, language, extension):
    """Check benchmark stubs with ``check_script``; exit with status 1 on any failure.

    Args:
        check_script: Callable taking a benchmark path and returning a mapping
            whose ``'status'`` is ``"OK"`` on success.
        language: Language label; rows are written to
            ``check_results/<language>.csv``.
        extension: Benchmark file extension, e.g. ``".php"``.
    """
    statuses = _run_on_benchmarks(check_script, language, extension, "check_results")
    total = len(statuses)
    passed = statuses.count("OK")
    print (f"Total {total}, Passed {passed}")
    if total != passed:
        sys.exit(1)

src/safe_subprocess/.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ /__pycache__
2	+ /.pytest_cache

src/safe_subprocess/__init__.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import os
+import signal
+import fcntl
+import time
+import subprocess
+from typing import List
+MAX_BYTES_PER_READ = 1024
+SLEEP_BETWEEN_READS = 0.1
class Result:
    """Outcome of running one program: timeout flag, exit code and captured output."""

    # True when the program had not exited by the time the runner gave up.
    # (Annotated as bool: the value stored here is a truth value, not a count.)
    timeout: bool
    # Exit status of the program.
    exit_code: int
    # Captured standard output, decoded to text.
    stdout: str
    # Captured standard error, decoded to text.
    stderr: str

    def __init__(self, timeout, exit_code, stdout, stderr):
        self.timeout = timeout
        self.exit_code = exit_code
        self.stdout = stdout
        self.stderr = stderr

    def __repr__(self):
        # Makes failing assertions and logs show the whole result at a glance.
        return (
            f"{type(self).__name__}(timeout={self.timeout!r}, "
            f"exit_code={self.exit_code!r}, stdout={self.stdout!r}, "
            f"stderr={self.stderr!r})"
        )
def set_nonblocking(reader):
    """Switch the file behind ``reader`` to non-blocking mode.

    Args:
        reader: Any object exposing ``fileno()``.
    """
    descriptor = reader.fileno()
    current_flags = fcntl.fcntl(descriptor, fcntl.F_GETFL)
    # Keep every flag already set and add O_NONBLOCK on top.
    fcntl.fcntl(descriptor, fcntl.F_SETFL, current_flags | os.O_NONBLOCK)
def _save_available(stream, saved, limit):
    """Read one chunk from a non-blocking stream, keeping only what fits under ``limit``.

    Returns True when a non-empty chunk was read (even if it was discarded
    because ``saved`` is already full), False when nothing was available.
    """
    # A non-blocking read yields None when no data is ready and b"" at EOF.
    chunk = stream.read(MAX_BYTES_PER_READ)
    if not chunk:
        return False
    room = limit - len(saved)
    if room > 0:
        saved.extend(chunk[:room])
    return True


def run(
    args: List[str],
    timeout_seconds: int = 15,
    max_output_size: int = 2048,
    env = None,
    cwd: str | None = None
) -> Result:
    """
    Runs the given program with arguments. After the timeout elapses, kills the process
    and all other processes in the process group. Captures at most max_output_size bytes
    of stdout and stderr each, and discards any output beyond that.

    Args:
        args: Program and arguments, passed to ``subprocess.Popen`` as-is.
        timeout_seconds: How long to poll before giving up. A value <= 0
            performs no polling and reports a timeout.
        max_output_size: Upper bound, in bytes, on the captured stdout and on
            the captured stderr (each).
        env: Environment for the child, or None to inherit.
        cwd: Working directory for the child, or None to inherit.

    Returns:
        A ``Result`` whose ``timeout`` is True and ``exit_code`` is -1 when
        the program did not exit in time; otherwise the real exit code.
    """
    p = subprocess.Popen(
        args,
        env=env,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        start_new_session=True,
        bufsize=MAX_BYTES_PER_READ,
        cwd=cwd
    )
    process_group_id = os.getpgid(p.pid)
    # One poll per SLEEP_BETWEEN_READS seconds until the timeout is used up.
    max_iterations = round(timeout_seconds / SLEEP_BETWEEN_READS)
    stdout_saved = bytearray()
    stderr_saved = bytearray()
    # Bound up front so that zero loop iterations still yields a timeout
    # result instead of an unbound-variable error.
    exit_code = None
    # The context manager closes both pipes on every path (no leaked fds).
    with p.stdout, p.stderr:
        set_nonblocking(p.stdout)
        set_nonblocking(p.stderr)
        try:
            for _ in range(max_iterations):
                # Always read, even once the cap is reached: the surplus is
                # discarded so the child never blocks on a full pipe.
                _save_available(p.stdout, stdout_saved, max_output_size)
                _save_available(p.stderr, stderr_saved, max_output_size)
                exit_code = p.poll()
                if exit_code is not None:
                    break
                time.sleep(SLEEP_BETWEEN_READS)
        finally:
            try:
                # Kills the process group. Without this line, test_fork_once fails.
                os.killpg(process_group_id, signal.SIGKILL)
            except ProcessLookupError:
                pass
            # Reap the child so it does not linger as a zombie.
            p.wait()
        # Collect whatever was written between the last read and the exit;
        # without this, a fast program loses everything past its first chunk.
        for stream, saved in ((p.stdout, stdout_saved), (p.stderr, stderr_saved)):
            while len(saved) < max_output_size and _save_available(
                stream, saved, max_output_size
            ):
                pass
    timeout = exit_code is None
    exit_code = exit_code if exit_code is not None else -1
    # errors="ignore" also covers a multi-byte character cut by the size cap.
    stdout = stdout_saved.decode("utf-8", errors="ignore")
    stderr = stderr_saved.decode("utf-8", errors="ignore")
    return Result(timeout=timeout, exit_code=exit_code, stdout=stdout, stderr=stderr)

src/safe_subprocess/evil_programs/block_on_inputs.py ADDED Viewed

	@@ -0,0 +1,2 @@


# Test fixture: tries to read standard input forever.
while True:
    input()

src/safe_subprocess/evil_programs/close_outputs.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import sys
+print("This is the end")  # the only text this fixture ever writes
+sys.stdout.close()  # close both standard streams...
+sys.stderr.close()
+while True:  # ...then keep running forever without exiting
+    pass

src/safe_subprocess/evil_programs/fork_bomb.py ADDED Viewed

	@@ -0,0 +1,4 @@

+import os
+while True:  # WARNING: fork bomb test fixture; every process, including each new child, loops and forks again
+    os.fork()

src/safe_subprocess/evil_programs/fork_once.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import os
+import time
+if os.fork() == 0:  # os.fork() returns 0 in the child; the parent skips this block and exits right away
+    while True:  # the child never exits on its own
+        time.sleep(60)

src/safe_subprocess/evil_programs/sleep_forever.py ADDED Viewed

	@@ -0,0 +1,4 @@

+import time
+while True:  # never exits; sleeps in 60-second steps and produces no output
+    time.sleep(60)

src/safe_subprocess/evil_programs/unbounded_output.py ADDED Viewed

	@@ -0,0 +1,4 @@

+b = True
+while True:  # prints forever, alternating "True" and "False" lines
+    print(b)
+    b = not b

src/safe_subprocess/module_test.py ADDED Viewed

	@@ -0,0 +1,103 @@

+from safe_subprocess import run
+import time
+from pathlib import Path
+ROOT = Path(__file__).resolve().parent / "evil_programs"
def assert_no_running_evil():
    """Assert that no process whose command line mentions ROOT is still running.

    Uses ``pgrep -f``, so a lingering fixture shows up through its script path.
    """
    result = run(
        # str() makes the argv entry an explicit string rather than a Path.
        ["pgrep", "-f", str(ROOT)], timeout_seconds=1, max_output_size=1024
    )
    assert (
        result.exit_code == 1
    ), f"There are still evil processes running: {result.stdout}"
    assert result.stderr == ""
    assert result.stdout == ""
def test_fork_once():
    """A parent that exits at once must not leave its forked child running."""
    # The program exits cleanly and immediately. But, it forks a child that runs
    # forever.
    result = run(
        ["python3", ROOT / "fork_once.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == 0
    assert not result.timeout
    assert result.stderr == ""
    assert result.stdout == ""
    assert_no_running_evil()
def test_close_outputs():
    """A program that closes its output streams and spins must time out cleanly."""
    # The program prints to stdout, closes its output, and then runs forever.
    result = run(
        ["python3", ROOT / "close_outputs.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == -1
    assert result.timeout
    assert result.stderr == ""
    assert result.stdout == "This is the end\n"
    assert_no_running_evil()
def test_unbounded_output():
    """Endless output must be cut off at exactly max_output_size characters."""
    result = run(
        ["python3", ROOT / "unbounded_output.py"],
        timeout_seconds=3,
        max_output_size=1024,
    )
    assert result.exit_code == -1
    assert result.timeout
    assert result.stderr == ""
    assert len(result.stdout) == 1024
    assert_no_running_evil()
def test_sleep_forever():
    """A silent program that never exits must be reported as a timeout."""
    result = run(
        ["python3", ROOT / "sleep_forever.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == -1
    assert result.timeout
    assert result.stderr == ""
    assert result.stdout == ""
    assert_no_running_evil()
def test_fork_bomb():
    """A fork bomb must time out and leave no processes behind."""
    result = run(
        ["python3", ROOT / "fork_bomb.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == -1
    assert result.timeout
    assert result.stderr == ""
    assert result.stdout == ""
    # Unfortunately, this sleep seems to be necessary. My theories:
    # 1. os.killpg doesn't block until the whole process group is dead.
    # 2. pgrep can produce stale output
    time.sleep(2)
    assert_no_running_evil()
def test_block_on_inputs():
    """A program that reads stdin must fail fast instead of hanging."""
    # We run the subprocess with /dev/null as input. So, any program that tries
    # to read input will error.
    result = run(
        ["python3", ROOT / "block_on_inputs.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == 1
    assert not result.timeout
    assert result.stdout == ""
    assert "EOF when reading a line" in result.stderr
    assert_no_running_evil()