Spaces:

dongsheng
/

docker_test

Sleeping

App Files Files Community

朱东升 committed on Mar 17

Commit

0087c59

1 Parent(s): b01f5f4

requirements update27

Browse files

Files changed (3) hide show

app.py +87 -68
src/containerized_eval.py +0 -1
test.py +0 -49

app.py CHANGED Viewed

@@ -1,13 +1,12 @@
-import gradio as gr
-import json
-import importlib
 import os
 import sys
-from pathlib import Path
 import concurrent.futures
 import multiprocessing
 from src.containerized_eval import eval_string_script
 # 添加当前目录和src目录到模块搜索路径
 current_dir = os.path.dirname(os.path.abspath(__file__))
 src_dir = os.path.join(current_dir, "src")
@@ -16,6 +15,53 @@ if current_dir not in sys.path:
 if src_dir not in sys.path:
     sys.path.append(src_dir)
 def evaluate(input_data):
     """评估代码的主函数
@@ -30,13 +76,8 @@ def evaluate(input_data):
             return {"status": "Exception", "error": "Input must be a list"}
         results = []
-        # 定义系统错误关键词，用于判断是否需要重试
-        system_error_keywords = [
-            "resource", "timeout", "busy", "congestion", "memory",
-            "connection", "system", "overload", "refused", "reset"
-        ]
         max_workers = multiprocessing.cpu_count()
         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
             future_to_item = {executor.submit(evaluate_single_case, item): item for item in input_data}
             for future in concurrent.futures.as_completed(future_to_item):
@@ -44,69 +85,43 @@ def evaluate(input_data):
                 try:
                     result = future.result()
-                    # 检查是否是系统错误，如果是，立即重试一次
-                    if isinstance(result, dict) and result.get("status") == "Exception":
-                        error_msg = str(result.get("error", "")).lower()
-                        # 如果错误信息包含系统错误关键词，则重试
-                        if any(keyword in error_msg for keyword in system_error_keywords):
-                            print(f"检测到系统错误: {error_msg}，正在重试...")
-                            # 立即重试
-                            retry_result = evaluate_single_case(item)
-                            if isinstance(retry_result, dict) and retry_result.get("status") != "Exception":
-                                # 重试成功，使用重试结果
-                                result = retry_result
-                                print(f"重试成功")
-                            else:
-                                print(f"重试失败")
                     # 检查结果列表
-                    if isinstance(result, list):
-                        for i, res in enumerate(result):
-                            if isinstance(res, dict) and res.get("status") == "Exception":
-                                error_msg = str(res.get("error", "")).lower()
-                                # 如果错误信息包含系统错误关键词，则重试
-                                if any(keyword in error_msg for keyword in system_error_keywords):
-                                    print(f"检测到列表中的系统错误: {error_msg}，正在重试...")
-                                    # 仅重试这个失败的情况
-                                    code = item.get('prompt') + item.get('processed_completions', [])[i] + '\n' + item.get('tests')
-                                    retry_result = evaluate_code(code, item.get('language'))
-                                    if isinstance(retry_result, dict) and retry_result.get("status") != "Exception":
-                                        # 重试成功，更新结果
-                                        result[i] = retry_result
-                                        print(f"重试成功")
-                                    else:
-                                        print(f"重试失败")
-                    # 如果是超时错误，也尝试重试一次
-                    if isinstance(result, dict) and result.get("status") == "Timeout":
-                        print(f"检测到超时错误，正在重试...")
-                        # 立即重试
-                        retry_result = evaluate_single_case(item)
-                        if isinstance(retry_result, dict) and retry_result.get("status") != "Timeout":
-                            # 重试成功，使用重试结果
-                            result = retry_result
-                            print(f"重试成功")
-                        else:
-                            print(f"重试失败")
                     item.update(result)
                     results.append(item)
                 except Exception as e:
-                    error_msg = str(e).lower()
-                    # 检查是否是系统错误
-                    if any(keyword in error_msg for keyword in system_error_keywords):
-                        print(f"执行过程中检测到系统错误: {error_msg}，正在重试...")
-                        try:
-                            # 立即重试
-                            retry_result = evaluate_single_case(item)
                             item.update(retry_result)
                             results.append(item)
-                            print(f"重试成功")
                             continue
-                        except Exception as retry_e:
-                            print(f"重试失败: {str(retry_e)}")
                     # 如果重试失败或不是系统错误，记录原始错误
                     item.update({"status": "Exception", "error": str(e)})
@@ -116,6 +131,7 @@ def evaluate(input_data):
     except Exception as e:
         return {"status": "Exception", "error": str(e)}
 def evaluate_single_case(input_data):
     """评估单个代码用例
@@ -125,10 +141,10 @@ def evaluate_single_case(input_data):
     Returns:
         dict: 包含评估结果的字典
     """
     try:
-        if not isinstance(input_data, dict):
-            return {"status": "Exception", "error": "Input item must be a dictionary"}
         language = input_data.get('language')
         completions = input_data.get('processed_completions', [])
@@ -148,6 +164,7 @@ def evaluate_single_case(input_data):
     except Exception as e:
         return {"status": "Exception", "error": str(e)}
 def evaluate_code(code, language):
     """评估特定语言的代码
@@ -166,6 +183,7 @@ def evaluate_code(code, language):
     except Exception as e:
         return {"status": "Exception", "error": str(e)}
 # 创建Gradio接口
 demo = gr.Interface(
     fn=evaluate,
@@ -175,5 +193,6 @@ demo = gr.Interface(
     description="支持多种编程语言的代码评估服务"
 )
 if __name__ == "__main__":
     demo.launch()

 import os
 import sys
 import concurrent.futures
 import multiprocessing
+import gradio as gr
 from src.containerized_eval import eval_string_script
 # 添加当前目录和src目录到模块搜索路径
 current_dir = os.path.dirname(os.path.abspath(__file__))
 src_dir = os.path.join(current_dir, "src")
 if src_dir not in sys.path:
     sys.path.append(src_dir)
+# 定义系统错误关键词，用于判断是否需要重试
+SYSTEM_ERROR_KEYWORDS = [
+    "resource", "timeout", "busy", "congestion", "memory",
+    "connection", "system", "overload", "refused", "reset"
+]
+def is_system_error(error_msg):
+    """检查错误信息是否包含系统错误关键词
+    Args:
+        error_msg (str): 错误信息
+    Returns:
+        bool: 是否是系统错误
+    """
+    error_msg = str(error_msg).lower()
+    return any(keyword in error_msg for keyword in SYSTEM_ERROR_KEYWORDS)
+def retry_with_logging(func, args, test_name="未知测试用例", error_context=""):
+    """带日志记录的重试函数
+    Args:
+        func: 要重试的函数
+        args: 函数参数
+        test_name (str): 测试用例名称
+        error_context (str): 错误上下文描述
+    Returns:
+        tuple: (结果, 是否成功)
+    """
+    try:
+        print(f"{error_context}，正在重试测试用例 '{test_name}'...")
+        result = func(*args)
+        success = True
+        if isinstance(result, dict) and result.get("status") == "Exception":
+            success = False
+        else:
+            print(f"测试用例 '{test_name}' 重试成功")
+        return result, success
+    except Exception as e:
+        print(f"测试用例 '{test_name}' 重试失败: {str(e)}")
+        return {"status": "Exception", "error": str(e)}, False
 def evaluate(input_data):
     """评估代码的主函数
             return {"status": "Exception", "error": "Input must be a list"}
         results = []
         max_workers = multiprocessing.cpu_count()
         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
             future_to_item = {executor.submit(evaluate_single_case, item): item for item in input_data}
             for future in concurrent.futures.as_completed(future_to_item):
                 try:
                     result = future.result()
                     # 检查结果列表
+                    if not isinstance(result, list):
+                        return {"status": "Exception", "error": "Input data must be a list"}
+                    # 处理结果中的系统错误
+                    for i, res in enumerate(result):
+                        if isinstance(res, dict) and res.get("status") == "Exception" and is_system_error(res.get("error", "")):
+                            test_name = item.get('name', '未知测试用例')
+                            error_context = f"检测到列表中的系统错误: {res.get('error', '')}"
+                            # 仅重试这个失败的情况
+                            code = item.get('prompt') + item.get('processed_completions', [])[i] + '\n' + item.get('tests')
+                            retry_result, success = retry_with_logging(
+                                evaluate_code,
+                                [code, item.get('language')],
+                                test_name,
+                                error_context
+                            )
+                            if success:
+                                result[i] = retry_result
                     item.update(result)
                     results.append(item)
                 except Exception as e:
+                    # 处理执行过程中的系统错误
+                    if is_system_error(e):
+                        test_name = item.get('name', '未知测试用例')
+                        error_context = f"执行过程中检测到系统错误: {str(e)}"
+                        retry_result, success = retry_with_logging(
+                            evaluate_single_case,
+                            [item],
+                            test_name,
+                            error_context
+                        )
+                        if success:
                             item.update(retry_result)
                             results.append(item)
                             continue
                     # 如果重试失败或不是系统错误，记录原始错误
                     item.update({"status": "Exception", "error": str(e)})
     except Exception as e:
         return {"status": "Exception", "error": str(e)}
 def evaluate_single_case(input_data):
     """评估单个代码用例
     Returns:
         dict: 包含评估结果的字典
     """
+    if not isinstance(input_data, dict):
+        return {"status": "Exception", "error": "Input item must be a dictionary"}
     try:
         language = input_data.get('language')
         completions = input_data.get('processed_completions', [])
     except Exception as e:
         return {"status": "Exception", "error": str(e)}
 def evaluate_code(code, language):
     """评估特定语言的代码
     except Exception as e:
         return {"status": "Exception", "error": str(e)}
 # 创建Gradio接口
 demo = gr.Interface(
     fn=evaluate,
     description="支持多种编程语言的代码评估服务"
 )
 if __name__ == "__main__":
     demo.launch()

src/containerized_eval.py CHANGED Viewed

@@ -75,7 +75,6 @@ def eval_string_script(language, program):
     with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f:
         f.write(program.encode("utf-8"))
         f.flush()
-        print(f'Path(f.name): {Path(f.name)}')
         result = eval_script(Path(f.name))
         # Only save the first 2K of output from the running program. Any further
         # output is very likely an exceptionally long stack trace or a long

     with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f:
         f.write(program.encode("utf-8"))
         f.flush()
         result = eval_script(Path(f.name))
         # Only save the first 2K of output from the running program. Any further
         # output is very likely an exceptionally long stack trace or a long

test.py DELETED Viewed

@@ -1,49 +0,0 @@
-import difflib
-def remove_prefix_by_last_line(a: str, b: str, threshold: float = 0.85) -> str:
-    """
-    基于A的最后一行定位B中的截断点，删除B中该行之前的所有内容
-    :param a: 前缀字符串A
-    :param b: 目标字符串B
-    :param threshold: 行相似度阈值
-    :return: 删除前缀后的B字符串
-    """
-    a_lines = a.splitlines()
-    b_lines = b.splitlines()
-    if not a_lines:
-        return b
-    last_a_line = a_lines[-1]
-    cut_index = -1
-    for i, b_line in enumerate(b_lines):
-        similarity = difflib.SequenceMatcher(
-            None, last_a_line, b_line
-        ).ratio()
-        if similarity >= threshold:
-            cut_index = i
-            break
-    if cut_index != -1:
-        return '\n'.join(b_lines[cut_index+1:])
-    else:
-        return b
-A = "#include<assert.h>\n#include<bits/stdc++.h>\n// Check if in given vector of numbers, are any two numbers closer to each other than\n// given threshold.\n// >>> has_close_elements((std::vector<float>({(float)1.0f, (float)2.0f, (float)3.0f})), (0.5f))\n// (false)\n// >>> has_close_elements((std::vector<float>({(float)1.0f, (float)2.8f, (float)3.0f, (float)4.0f, (float)5.0f, (float)2.0f})), (0.3f))\n// (true)\nbool has_close_elements(std::vector<float> numbers, float threshold) {\n"
-B = "#include <assert.h>\n#include <bits/stdc++.h>\n\n// Check if in given vector of numbers, are any two numbers closer to each other than\n// given threshold.\n// >>> has_close_elements((std::vector<float>({(float)1.0f, (float)2.0f, (float)3.0f})), (0.5f))\n// (false)\n// >>> has_close_elements((std::vector<float>({(float)1.0f, (float)2.8f, (float)3.0f, (float)4.0f, (float)5.0f, (float)2.0f})), (0.3f))\n// (true)\nbool has_close_elements(std::vector<float> numbers, float threshold) {\n    // Sort the vector in ascending order\n    std::sort(numbers.begin(), numbers.end());\n\n    // Iterate over the sorted vector\n    for (size_t i = 0; i < numbers.size() - 1; ++i) {\n        // Check if the difference between the current element and the next element is less than the threshold\n        if (numbers[i + 1] - numbers[i] < threshold) {\n            return true; // If a pair of elements is found that are closer than the threshold, return true\n        }\n    }\n\n    // If no pair of elements is found that are closer than the threshold, return false\n    return false;\n}\n\nint main() {\n    std::vector<float> numbers1 = {1.0f, 2.0f, 3.0f};\n    std::vector<float> numbers2 = {1.0f, 2.8f, 3.0f, 4.0f, 5.0f, 2.0f};\n\n    std::cout << std::boolalpha << has_close_elements(numbers1, 0.5f) << std::endl; // Output: false\n    std::cout << std::boolalpha << has_close_elements(numbers2, 0.3f) << std::endl; // Output: true\n\n    return 0;\n}\n"
-# print(remove_prefix_by_last_line(A, B))
-A = "#include<assert.h>\n#include<bits/stdc++.h>\n// Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n// separate those group into separate strings and return the vector of those.\n// Separate groups are balanced (each open brace is properly closed) and not nested within each other\n// Ignore any spaces in the input string.\n// >>> separate_paren_groups((\"( ) (( )) (( )( ))\"))\n// (std::vector<std::string>({(std::string)\"()\", (std::string)\"(())\", (std::string)\"(()())\"}))\nstd::vector<std::string> separate_paren_groups(std::string paren_string) {\n"
-B = "#include <assert.h>\n#include <bits/stdc++.h>\n\n// Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n// separate those group into separate strings and return the vector of those.\n// Separate groups are balanced (each open brace is properly closed) and not nested within each other\n// Ignore any spaces in the input string.\n// >>> separate_paren_groups(\"(( )) (( )( ))\")\n// (std::vector<std::string>({(std::string)\"()\", (std::string)\"(())\", (std::string)\"(()())\"}))\n\nstd::vector<std::string> separate_paren_groups(std::string paren_string) {\n    // Remove spaces from the input string\n    paren_string.erase(std::remove(paren_string.begin(), paren_string.end(), ' '), paren_string.end());\n\n    // Initialize an empty vector to store the groups\n    std::vector<std::string> groups;\n\n    // Initialize an empty string to build the current group\n    std::string current_group;\n\n    // Initialize a counter to keep track of the balance of parentheses\n    int balance = 0;\n\n    // Iterate over the characters in the input string\n    for (char c : paren_string) {\n        // If the character is an open parenthesis, increment the balance\n        if (c == '(') {\n            balance++;\n            current_group += c;\n        }\n        // If the character is a close parenthesis, decrement the balance\n        else if (c == ')') {\n            balance--;\n            current_group += c;\n            // If the balance is zero, it means we've found a complete group\n            if (balance == 0) {\n                groups.push_back(current_group);\n                current_group.clear();\n            }\n        }\n        // If the character is not a parenthesis, ignore it\n    }\n\n    // Return the vector of groups\n    return groups;\n}\n\nint main() {\n    std::string paren_string = \"(( )) (( )( ))\";\n    std::vector<std::string> groups = separate_paren_groups(paren_string);\n\n    // Print the 
groups\n    for (const auto& group : groups) {\n        std::cout << group << std::endl;\n    }\n\n    return 0;\n}\n"
-# print(remove_prefix_by_last_line(A, B))
-A = "#include<assert.h>\n#include<bits/stdc++.h>\n// The Fib4 number sequence is a sequence similar to the Fibbonacci sequnece that's defined as follows:\n// fib4(0) -> 0\n// fib4(1) -> 0\n// fib4(2) -> 2\n// fib4(3) -> 0\n// fib4(n) -> fib4(n-1) + fib4(n-2) + fib4(n-3) + fib4(n-4).\n// Please write a function to efficiently compute the n-th element of the fib4 number sequence.  Do not use recursion.\n// >>> fib4((5))\n// (4)\n// >>> fib4((6))\n// (8)\n// >>> fib4((7))\n// (14)\nlong fib4(long n) {\n"
-B = "#include <assert.h>\n#include <bits/stdc++.h>\n\n// The Fib4 number sequence is a sequence similar to the Fibonacci sequence that's defined as follows:\n// fib4(0) -> 0\n// fib4(1) -> 0\n// fib4(2) -> 2\n// fib4(3) -> 0\n// fib4(n) -> fib4(n-1) + fib4(n-2) + fib4(n-3) + fib4(n-4).\n// Please write a function to efficiently compute the n-th element of the fib4 number sequence.  Do not use recursion.\n\n// We can use dynamic programming to solve this problem efficiently. We will store the results of subproblems in an array to avoid redundant calculations.\n\nlong fib4(long n) {\n    // Base cases\n    if (n <= 1) {\n        return 0;\n    } else if (n == 2) {\n        return 2;\n    } else if (n == 3) {\n        return 0;\n    }\n\n    // Initialize an array to store the results of subproblems\n    long fib4[n + 1];\n\n    // Base cases\n    fib4[0] = 0;\n    fib4[1] = 0;\n    fib4[2] = 2;\n    fib4[3] = 0;\n\n    // Compute the n-th element of the fib4 number sequence\n    for (long i = 4; i <= n; i++) {\n        fib4[i] = fib4[i - 1] + fib4[i - 2] + fib4[i - 3] + fib4[i - 4];\n    }\n\n    // Return the n-th element of the fib4 number sequence\n    return fib4[n];\n}\n\nint main() {\n    // Test the function\n    std::cout << \"fib4(5) = \" << fib4(5) << std::endl;\n    std::cout << \"fib4(6) = \" << fib4(6) << std::endl;\n    std::cout << \"fib4(7) = \" << fib4(7) << std::endl;\n\n    return 0;\n}\n"
-# print(remove_prefix_by_last_line(A, B))
-A = "package check_if_last_char_is_a_letter_test\n\nimport (\n    \"testing\"\n    \"fmt\"\n)\n\n// Create a function that returns true if the last character\n// of a given string is an alphabetical character and is not\n// a part of a word, and false otherwise.\n// Note: \"word\" is a group of characters separated by space.\n// Examples:\n// >>> check_if_last_char_is_a_letter(\"apple pie\")\n// false\n// >>> check_if_last_char_is_a_letter(\"apple pi e\")\n// true\n// >>> check_if_last_char_is_a_letter(\"apple pi e \")\n// false\n// >>> check_if_last_char_is_a_letter(\"\")\n// false\nfunc check_if_last_char_is_a_letter(txt string) bool {\n"
-B = "package check_if_last_char_is_a_letter_test\n\nimport (\n    \"testing\"\n    \"fmt\"\n)\n\n// Create a function that returns true if the last character\n// of a given string is an alphabetical character and is not\n// a part of a word, and false otherwise.\n// Note: \"word\" is a group of characters separated by space.\n// Examples:\n// >>> check_if_last_char_is_a_letter(\"apple pie\")\n// false\n// >>> check_if_last_char_is_a_letter(\"apple pi e\")\n// true\n// >>> check_if_last_char_is_a_letter(\"apple pi e \")\n// false\n// >>> check_if_last_char_is_a_letter(\"\")\n// false\nfunc check_if_last_char_is_a_letter(txt string) bool {\n    // Remove trailing spaces from the string\n    txt = strings.TrimRight(txt, \" \")\n    \n    // Check if the string is empty\n    if len(txt) == 0 {\n        return false\n    }\n    \n    // Get the last character of the string\n    lastChar := txt[len(txt)-1]\n    \n    // Check if the last character is an alphabetical character\n    if !unicode.IsLetter(rune(lastChar)) {\n        return false\n    }\n    \n    // Check if the last character is not part of a word\n    if strings.Contains(txt[:len(txt)-1], \" \") {\n        return true\n    }\n    \n    return false\n}\n\nfunc TestCheckIfLastCharIsALetter(t *testing.T) {\n    tests := []struct {\n        input string\n        want  bool\n    }{\n        {\"apple pie\", false},\n        {\"apple pi e\", true},\n        {\"apple pi e \", false},\n        {\"\", false},\n        {\"hello world\", false},\n        {\"hello world \", false},\n        {\"hello world \", true},\n    }\n    \n    for _, tt := range tests {\n        got := check_if_last_char_is_a_letter(tt.input)\n        if got != tt.want {\n            t.Errorf(\"check_if_last_char_is_a_letter(%q) = %v, want %v\", tt.input, got, tt.want)\n        }\n    }\n}\n"
-print(remove_prefix_by_last_line(A, B))