朱东升 committed on
Commit
0087c59
·
1 Parent(s): b01f5f4

requirements update27

Browse files
Files changed (3) hide show
  1. app.py +87 -68
  2. src/containerized_eval.py +0 -1
  3. test.py +0 -49
app.py CHANGED
@@ -1,13 +1,12 @@
1
- import gradio as gr
2
- import json
3
- import importlib
4
  import os
5
  import sys
6
- from pathlib import Path
7
  import concurrent.futures
8
  import multiprocessing
 
 
9
  from src.containerized_eval import eval_string_script
10
 
 
11
  # 添加当前目录和src目录到模块搜索路径
12
  current_dir = os.path.dirname(os.path.abspath(__file__))
13
  src_dir = os.path.join(current_dir, "src")
@@ -16,6 +15,53 @@ if current_dir not in sys.path:
16
  if src_dir not in sys.path:
17
  sys.path.append(src_dir)
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  def evaluate(input_data):
20
  """评估代码的主函数
21
 
@@ -30,13 +76,8 @@ def evaluate(input_data):
30
  return {"status": "Exception", "error": "Input must be a list"}
31
 
32
  results = []
33
- # 定义系统错误关键词,用于判断是否需要重试
34
- system_error_keywords = [
35
- "resource", "timeout", "busy", "congestion", "memory",
36
- "connection", "system", "overload", "refused", "reset"
37
- ]
38
-
39
  max_workers = multiprocessing.cpu_count()
 
40
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
41
  future_to_item = {executor.submit(evaluate_single_case, item): item for item in input_data}
42
  for future in concurrent.futures.as_completed(future_to_item):
@@ -44,69 +85,43 @@ def evaluate(input_data):
44
  try:
45
  result = future.result()
46
 
47
- # 检查是否是系统错误,如果是,立即重试一次
48
- if isinstance(result, dict) and result.get("status") == "Exception":
49
- error_msg = str(result.get("error", "")).lower()
50
-
51
- # 如果错误信息包含系统错误关键词,则重试
52
- if any(keyword in error_msg for keyword in system_error_keywords):
53
- print(f"检测到系统错误: {error_msg},正在重试...")
54
- # 立即重试
55
- retry_result = evaluate_single_case(item)
56
- if isinstance(retry_result, dict) and retry_result.get("status") != "Exception":
57
- # 重试成功,使用重试结果
58
- result = retry_result
59
- print(f"重试成功")
60
- else:
61
- print(f"重试失败")
62
-
63
  # 检查结果列表
64
- if isinstance(result, list):
65
- for i, res in enumerate(result):
66
- if isinstance(res, dict) and res.get("status") == "Exception":
67
- error_msg = str(res.get("error", "")).lower()
68
-
69
- # 如果错误信息包含系统错误关键词,则重试
70
- if any(keyword in error_msg for keyword in system_error_keywords):
71
- print(f"检测到列表中的系统错误: {error_msg},正在重试...")
72
- # 仅重试这个失败的情况
73
- code = item.get('prompt') + item.get('processed_completions', [])[i] + '\n' + item.get('tests')
74
- retry_result = evaluate_code(code, item.get('language'))
75
- if isinstance(retry_result, dict) and retry_result.get("status") != "Exception":
76
- # 重试成功,更新结果
77
- result[i] = retry_result
78
- print(f"重试成功")
79
- else:
80
- print(f"重试失败")
81
-
82
- # 如果是超时错误,也尝试重试一次
83
- if isinstance(result, dict) and result.get("status") == "Timeout":
84
- print(f"检测到超时错误,正在重试...")
85
- # 立即重试
86
- retry_result = evaluate_single_case(item)
87
- if isinstance(retry_result, dict) and retry_result.get("status") != "Timeout":
88
- # 重试成功,使用重试结果
89
- result = retry_result
90
- print(f"重试成功")
91
- else:
92
- print(f"重试失败")
93
 
94
  item.update(result)
95
  results.append(item)
96
  except Exception as e:
97
- error_msg = str(e).lower()
98
- # 检查是否是系统错误
99
- if any(keyword in error_msg for keyword in system_error_keywords):
100
- print(f"执行过程中检测到系统错误: {error_msg},正在重试...")
101
- try:
102
- # 立即重试
103
- retry_result = evaluate_single_case(item)
 
 
 
 
104
  item.update(retry_result)
105
  results.append(item)
106
- print(f"重试成功")
107
  continue
108
- except Exception as retry_e:
109
- print(f"重试失败: {str(retry_e)}")
110
 
111
  # 如果重试失败或不是系统错误,记录原始错误
112
  item.update({"status": "Exception", "error": str(e)})
@@ -116,6 +131,7 @@ def evaluate(input_data):
116
  except Exception as e:
117
  return {"status": "Exception", "error": str(e)}
118
 
 
119
  def evaluate_single_case(input_data):
120
  """评估单个代码用例
121
 
@@ -125,10 +141,10 @@ def evaluate_single_case(input_data):
125
  Returns:
126
  dict: 包含评估结果的字典
127
  """
 
 
 
128
  try:
129
- if not isinstance(input_data, dict):
130
- return {"status": "Exception", "error": "Input item must be a dictionary"}
131
-
132
  language = input_data.get('language')
133
  completions = input_data.get('processed_completions', [])
134
 
@@ -148,6 +164,7 @@ def evaluate_single_case(input_data):
148
  except Exception as e:
149
  return {"status": "Exception", "error": str(e)}
150
 
 
151
  def evaluate_code(code, language):
152
  """评估特定语言的代码
153
 
@@ -166,6 +183,7 @@ def evaluate_code(code, language):
166
  except Exception as e:
167
  return {"status": "Exception", "error": str(e)}
168
 
 
169
  # 创建Gradio接口
170
  demo = gr.Interface(
171
  fn=evaluate,
@@ -175,5 +193,6 @@ demo = gr.Interface(
175
  description="支持多种编程语言的代码评估服务"
176
  )
177
 
 
178
  if __name__ == "__main__":
179
  demo.launch()
 
 
 
 
1
  import os
2
  import sys
 
3
  import concurrent.futures
4
  import multiprocessing
5
+
6
+ import gradio as gr
7
  from src.containerized_eval import eval_string_script
8
 
9
+
10
  # 添加当前目录和src目录到模块搜索路径
11
  current_dir = os.path.dirname(os.path.abspath(__file__))
12
  src_dir = os.path.join(current_dir, "src")
 
15
  if src_dir not in sys.path:
16
  sys.path.append(src_dir)
17
 
18
+
19
+ # 定义系统错误关键词,用于判断是否需要重试
20
+ SYSTEM_ERROR_KEYWORDS = [
21
+ "resource", "timeout", "busy", "congestion", "memory",
22
+ "connection", "system", "overload", "refused", "reset"
23
+ ]
24
+
25
+
26
+ def is_system_error(error_msg):
27
+ """检查错误信息是否包含系统错误关键词
28
+
29
+ Args:
30
+ error_msg (str): 错误信息
31
+
32
+ Returns:
33
+ bool: 是否是系统错误
34
+ """
35
+ error_msg = str(error_msg).lower()
36
+ return any(keyword in error_msg for keyword in SYSTEM_ERROR_KEYWORDS)
37
+
38
+
39
+ def retry_with_logging(func, args, test_name="未知测试用例", error_context=""):
40
+ """带日志记录的重试函数
41
+
42
+ Args:
43
+ func: 要重试的函数
44
+ args: 函数参数
45
+ test_name (str): 测试用例名称
46
+ error_context (str): 错误上下文描述
47
+
48
+ Returns:
49
+ tuple: (结果, 是否成功)
50
+ """
51
+ try:
52
+ print(f"{error_context},正在重试测试用例 '{test_name}'...")
53
+ result = func(*args)
54
+ success = True
55
+ if isinstance(result, dict) and result.get("status") == "Exception":
56
+ success = False
57
+ else:
58
+ print(f"测试用例 '{test_name}' 重试成功")
59
+ return result, success
60
+ except Exception as e:
61
+ print(f"测试用例 '{test_name}' 重试失败: {str(e)}")
62
+ return {"status": "Exception", "error": str(e)}, False
63
+
64
+
65
  def evaluate(input_data):
66
  """评估代码的主函数
67
 
 
76
  return {"status": "Exception", "error": "Input must be a list"}
77
 
78
  results = []
 
 
 
 
 
 
79
  max_workers = multiprocessing.cpu_count()
80
+
81
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
82
  future_to_item = {executor.submit(evaluate_single_case, item): item for item in input_data}
83
  for future in concurrent.futures.as_completed(future_to_item):
 
85
  try:
86
  result = future.result()
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  # 检查结果列表
89
+ if not isinstance(result, list):
90
+ return {"status": "Exception", "error": "Input data must be a list"}
91
+
92
+ # 处理结果中的系统错误
93
+ for i, res in enumerate(result):
94
+ if isinstance(res, dict) and res.get("status") == "Exception" and is_system_error(res.get("error", "")):
95
+ test_name = item.get('name', '未知测试用例')
96
+ error_context = f"检测到列表中的系统错误: {res.get('error', '')}"
97
+ # 仅重试这个失败的情况
98
+ code = item.get('prompt') + item.get('processed_completions', [])[i] + '\n' + item.get('tests')
99
+ retry_result, success = retry_with_logging(
100
+ evaluate_code,
101
+ [code, item.get('language')],
102
+ test_name,
103
+ error_context
104
+ )
105
+ if success:
106
+ result[i] = retry_result
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  item.update(result)
109
  results.append(item)
110
  except Exception as e:
111
+ # 处理执行过程中的系统错误
112
+ if is_system_error(e):
113
+ test_name = item.get('name', '未知测试用例')
114
+ error_context = f"执行过程中检测到系统错误: {str(e)}"
115
+ retry_result, success = retry_with_logging(
116
+ evaluate_single_case,
117
+ [item],
118
+ test_name,
119
+ error_context
120
+ )
121
+ if success:
122
  item.update(retry_result)
123
  results.append(item)
 
124
  continue
 
 
125
 
126
  # 如果重试失败或不是系统错误,记录原始错误
127
  item.update({"status": "Exception", "error": str(e)})
 
131
  except Exception as e:
132
  return {"status": "Exception", "error": str(e)}
133
 
134
+
135
  def evaluate_single_case(input_data):
136
  """评估单个代码用例
137
 
 
141
  Returns:
142
  dict: 包含评估结果的字典
143
  """
144
+ if not isinstance(input_data, dict):
145
+ return {"status": "Exception", "error": "Input item must be a dictionary"}
146
+
147
  try:
 
 
 
148
  language = input_data.get('language')
149
  completions = input_data.get('processed_completions', [])
150
 
 
164
  except Exception as e:
165
  return {"status": "Exception", "error": str(e)}
166
 
167
+
168
  def evaluate_code(code, language):
169
  """评估特定语言的代码
170
 
 
183
  except Exception as e:
184
  return {"status": "Exception", "error": str(e)}
185
 
186
+
187
  # 创建Gradio接口
188
  demo = gr.Interface(
189
  fn=evaluate,
 
193
  description="支持多种编程语言的代码评估服务"
194
  )
195
 
196
+
197
  if __name__ == "__main__":
198
  demo.launch()
src/containerized_eval.py CHANGED
@@ -75,7 +75,6 @@ def eval_string_script(language, program):
75
  with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f:
76
  f.write(program.encode("utf-8"))
77
  f.flush()
78
- print(f'Path(f.name): {Path(f.name)}')
79
  result = eval_script(Path(f.name))
80
  # Only save the first 2K of output from the running program. Any futher
81
  # output is very likely an exceptionally long stack trace or a long
 
75
  with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f:
76
  f.write(program.encode("utf-8"))
77
  f.flush()
 
78
  result = eval_script(Path(f.name))
79
  # Only save the first 2K of output from the running program. Any futher
80
  # output is very likely an exceptionally long stack trace or a long
test.py DELETED
@@ -1,49 +0,0 @@
1
- import difflib
2
-
3
- def remove_prefix_by_last_line(a: str, b: str, threshold: float = 0.85) -> str:
4
- """
5
- 基于A的最后一行定位B中的截断点,删除B中该行之前的所有内容
6
- :param a: 前缀字符串A
7
- :param b: 目标字符串B
8
- :param threshold: 行相似度阈值
9
- :return: 删除前缀后的B字符串
10
- """
11
- a_lines = a.splitlines()
12
- b_lines = b.splitlines()
13
-
14
- if not a_lines:
15
- return b
16
-
17
- last_a_line = a_lines[-1]
18
- cut_index = -1
19
-
20
- for i, b_line in enumerate(b_lines):
21
- similarity = difflib.SequenceMatcher(
22
- None, last_a_line, b_line
23
- ).ratio()
24
- if similarity >= threshold:
25
- cut_index = i
26
- break
27
-
28
- if cut_index != -1:
29
- return '\n'.join(b_lines[cut_index+1:])
30
- else:
31
- return b
32
-
33
-
34
- A = "#include<assert.h>\n#include<bits/stdc++.h>\n// Check if in given vector of numbers, are any two numbers closer to each other than\n// given threshold.\n// >>> has_close_elements((std::vector<float>({(float)1.0f, (float)2.0f, (float)3.0f})), (0.5f))\n// (false)\n// >>> has_close_elements((std::vector<float>({(float)1.0f, (float)2.8f, (float)3.0f, (float)4.0f, (float)5.0f, (float)2.0f})), (0.3f))\n// (true)\nbool has_close_elements(std::vector<float> numbers, float threshold) {\n"
35
- B = "#include <assert.h>\n#include <bits/stdc++.h>\n\n// Check if in given vector of numbers, are any two numbers closer to each other than\n// given threshold.\n// >>> has_close_elements((std::vector<float>({(float)1.0f, (float)2.0f, (float)3.0f})), (0.5f))\n// (false)\n// >>> has_close_elements((std::vector<float>({(float)1.0f, (float)2.8f, (float)3.0f, (float)4.0f, (float)5.0f, (float)2.0f})), (0.3f))\n// (true)\nbool has_close_elements(std::vector<float> numbers, float threshold) {\n // Sort the vector in ascending order\n std::sort(numbers.begin(), numbers.end());\n\n // Iterate over the sorted vector\n for (size_t i = 0; i < numbers.size() - 1; ++i) {\n // Check if the difference between the current element and the next element is less than the threshold\n if (numbers[i + 1] - numbers[i] < threshold) {\n return true; // If a pair of elements is found that are closer than the threshold, return true\n }\n }\n\n // If no pair of elements is found that are closer than the threshold, return false\n return false;\n}\n\nint main() {\n std::vector<float> numbers1 = {1.0f, 2.0f, 3.0f};\n std::vector<float> numbers2 = {1.0f, 2.8f, 3.0f, 4.0f, 5.0f, 2.0f};\n\n std::cout << std::boolalpha << has_close_elements(numbers1, 0.5f) << std::endl; // Output: false\n std::cout << std::boolalpha << has_close_elements(numbers2, 0.3f) << std::endl; // Output: true\n\n return 0;\n}\n"
36
- # print(remove_prefix_by_last_line(A, B))
37
-
38
- A = "#include<assert.h>\n#include<bits/stdc++.h>\n// Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n// separate those group into separate strings and return the vector of those.\n// Separate groups are balanced (each open brace is properly closed) and not nested within each other\n// Ignore any spaces in the input string.\n// >>> separate_paren_groups((\"( ) (( )) (( )( ))\"))\n// (std::vector<std::string>({(std::string)\"()\", (std::string)\"(())\", (std::string)\"(()())\"}))\nstd::vector<std::string> separate_paren_groups(std::string paren_string) {\n"
39
- B = "#include <assert.h>\n#include <bits/stdc++.h>\n\n// Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n// separate those group into separate strings and return the vector of those.\n// Separate groups are balanced (each open brace is properly closed) and not nested within each other\n// Ignore any spaces in the input string.\n// >>> separate_paren_groups(\"(( )) (( )( ))\")\n// (std::vector<std::string>({(std::string)\"()\", (std::string)\"(())\", (std::string)\"(()())\"}))\n\nstd::vector<std::string> separate_paren_groups(std::string paren_string) {\n // Remove spaces from the input string\n paren_string.erase(std::remove(paren_string.begin(), paren_string.end(), ' '), paren_string.end());\n\n // Initialize an empty vector to store the groups\n std::vector<std::string> groups;\n\n // Initialize an empty string to build the current group\n std::string current_group;\n\n // Initialize a counter to keep track of the balance of parentheses\n int balance = 0;\n\n // Iterate over the characters in the input string\n for (char c : paren_string) {\n // If the character is an open parenthesis, increment the balance\n if (c == '(') {\n balance++;\n current_group += c;\n }\n // If the character is a close parenthesis, decrement the balance\n else if (c == ')') {\n balance--;\n current_group += c;\n // If the balance is zero, it means we've found a complete group\n if (balance == 0) {\n groups.push_back(current_group);\n current_group.clear();\n }\n }\n // If the character is not a parenthesis, ignore it\n }\n\n // Return the vector of groups\n return groups;\n}\n\nint main() {\n std::string paren_string = \"(( )) (( )( ))\";\n std::vector<std::string> groups = separate_paren_groups(paren_string);\n\n // Print the groups\n for (const auto& group : groups) {\n std::cout << group << std::endl;\n }\n\n return 0;\n}\n"
40
- # print(remove_prefix_by_last_line(A, B))
41
-
42
-
43
- A = "#include<assert.h>\n#include<bits/stdc++.h>\n// The Fib4 number sequence is a sequence similar to the Fibbonacci sequnece that's defined as follows:\n// fib4(0) -> 0\n// fib4(1) -> 0\n// fib4(2) -> 2\n// fib4(3) -> 0\n// fib4(n) -> fib4(n-1) + fib4(n-2) + fib4(n-3) + fib4(n-4).\n// Please write a function to efficiently compute the n-th element of the fib4 number sequence. Do not use recursion.\n// >>> fib4((5))\n// (4)\n// >>> fib4((6))\n// (8)\n// >>> fib4((7))\n// (14)\nlong fib4(long n) {\n"
44
- B = "#include <assert.h>\n#include <bits/stdc++.h>\n\n// The Fib4 number sequence is a sequence similar to the Fibonacci sequence that's defined as follows:\n// fib4(0) -> 0\n// fib4(1) -> 0\n// fib4(2) -> 2\n// fib4(3) -> 0\n// fib4(n) -> fib4(n-1) + fib4(n-2) + fib4(n-3) + fib4(n-4).\n// Please write a function to efficiently compute the n-th element of the fib4 number sequence. Do not use recursion.\n\n// We can use dynamic programming to solve this problem efficiently. We will store the results of subproblems in an array to avoid redundant calculations.\n\nlong fib4(long n) {\n // Base cases\n if (n <= 1) {\n return 0;\n } else if (n == 2) {\n return 2;\n } else if (n == 3) {\n return 0;\n }\n\n // Initialize an array to store the results of subproblems\n long fib4[n + 1];\n\n // Base cases\n fib4[0] = 0;\n fib4[1] = 0;\n fib4[2] = 2;\n fib4[3] = 0;\n\n // Compute the n-th element of the fib4 number sequence\n for (long i = 4; i <= n; i++) {\n fib4[i] = fib4[i - 1] + fib4[i - 2] + fib4[i - 3] + fib4[i - 4];\n }\n\n // Return the n-th element of the fib4 number sequence\n return fib4[n];\n}\n\nint main() {\n // Test the function\n std::cout << \"fib4(5) = \" << fib4(5) << std::endl;\n std::cout << \"fib4(6) = \" << fib4(6) << std::endl;\n std::cout << \"fib4(7) = \" << fib4(7) << std::endl;\n\n return 0;\n}\n"
45
- # print(remove_prefix_by_last_line(A, B))
46
-
47
- A = "package check_if_last_char_is_a_letter_test\n\nimport (\n \"testing\"\n \"fmt\"\n)\n\n// Create a function that returns true if the last character\n// of a given string is an alphabetical character and is not\n// a part of a word, and false otherwise.\n// Note: \"word\" is a group of characters separated by space.\n// Examples:\n// >>> check_if_last_char_is_a_letter(\"apple pie\")\n// false\n// >>> check_if_last_char_is_a_letter(\"apple pi e\")\n// true\n// >>> check_if_last_char_is_a_letter(\"apple pi e \")\n// false\n// >>> check_if_last_char_is_a_letter(\"\")\n// false\nfunc check_if_last_char_is_a_letter(txt string) bool {\n"
48
- B = "package check_if_last_char_is_a_letter_test\n\nimport (\n \"testing\"\n \"fmt\"\n)\n\n// Create a function that returns true if the last character\n// of a given string is an alphabetical character and is not\n// a part of a word, and false otherwise.\n// Note: \"word\" is a group of characters separated by space.\n// Examples:\n// >>> check_if_last_char_is_a_letter(\"apple pie\")\n// false\n// >>> check_if_last_char_is_a_letter(\"apple pi e\")\n// true\n// >>> check_if_last_char_is_a_letter(\"apple pi e \")\n// false\n// >>> check_if_last_char_is_a_letter(\"\")\n// false\nfunc check_if_last_char_is_a_letter(txt string) bool {\n // Remove trailing spaces from the string\n txt = strings.TrimRight(txt, \" \")\n \n // Check if the string is empty\n if len(txt) == 0 {\n return false\n }\n \n // Get the last character of the string\n lastChar := txt[len(txt)-1]\n \n // Check if the last character is an alphabetical character\n if !unicode.IsLetter(rune(lastChar)) {\n return false\n }\n \n // Check if the last character is not part of a word\n if strings.Contains(txt[:len(txt)-1], \" \") {\n return true\n }\n \n return false\n}\n\nfunc TestCheckIfLastCharIsALetter(t *testing.T) {\n tests := []struct {\n input string\n want bool\n }{\n {\"apple pie\", false},\n {\"apple pi e\", true},\n {\"apple pi e \", false},\n {\"\", false},\n {\"hello world\", false},\n {\"hello world \", false},\n {\"hello world \", true},\n }\n \n for _, tt := range tests {\n got := check_if_last_char_is_a_letter(tt.input)\n if got != tt.want {\n t.Errorf(\"check_if_last_char_is_a_letter(%q) = %v, want %v\", tt.input, got, tt.want)\n }\n }\n}\n"
49
- print(remove_prefix_by_last_line(A, B))