diff --git "a/notebooks/04_Few-shot_Prompting_OpenAI.ipynb" "b/notebooks/04_Few-shot_Prompting_OpenAI.ipynb" new file mode 100644--- /dev/null +++ "b/notebooks/04_Few-shot_Prompting_OpenAI.ipynb" @@ -0,0 +1 @@ +{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"executionInfo":{"elapsed":476,"status":"ok","timestamp":1720679526275,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"uWKRSV6eZsCn"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"6d394937-6c99-4a7c-9d32-7600a280032f","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"G5pNu3zgZBrL","outputId":"160a554f-fb08-4aa0-bc00-0422fb7c1fac"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: d:\\code\\projects\\logical-reasoning\n"]}],"source":["import os\n","import sys\n","from pathlib import Path\n","\n","# check if workding_dir is in local variables\n","if \"workding_dir\" not in locals():\n"," workding_dir = str(Path.cwd().parent)\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":3,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO 
_","userId":"00977795705617022768"},"user_tz":-480},"id":"hPCC-6m7ZBrM","outputId":"c7aa2c96-5e99-440a-c148-201d79465ff9"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: d:\\code\\projects\\logical-reasoning\\.env\n"]},{"data":{"text/plain":["True"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{"byteLimit":2048000,"rowLimit":10000},"inputWidgets":{},"nuid":"f1597656-8042-4878-9d3b-9ebfb8dd86dc","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1720679529345,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"1M3IraVtZBrM","outputId":"29ab35f6-2970-4ade-d85d-3174acf8cda0"},"outputs":[{"name":"stdout","output_type":"stream","text":["gpt-4o datasets/mgtv results/openai_results.csv 16\n"]}],"source":["import os\n","\n","model_name = os.getenv(\"MODEL_NAME\")\n","data_path = os.getenv(\"LOGICAL_REASONING_DATA_PATH\")\n","results_path = os.getenv(\"LOGICAL_REASONING_RESULTS_PATH\")\n","max_new_tokens = int(os.getenv(\"MAX_NEW_TOKENS\", 2048))\n","\n","print(model_name,data_path, results_path, max_new_tokens)"]},{"cell_type":"code","execution_count":5,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1685,"status":"ok","timestamp":1720679531591,"user":{"displayName":"HUANG DONGHAO _","userId":"00977795705617022768"},"user_tz":-480},"id":"ZuS_FsLyZBrN","outputId":"2cba0105-c505-4395-afbd-2f2fee6581d0"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading 
d:\\code\\projects\\logical-reasoning\\llm_toolkit\\logical_reasoning_utils.py\n","CUDA is available, we have found 1 GPU(s)\n","NVIDIA GeForce RTX 4080 Laptop GPU\n","CUDA version: 12.1\n"]}],"source":["from llm_toolkit.llm_utils import *\n","from llm_toolkit.logical_reasoning_utils import *\n","\n","device = check_gpu()"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n"]}],"source":["datasets = load_logical_reasoning_dataset(data_path)"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["--------------------------------------------------\n","text: 甄加索是自杀吗\n","--------------------------------------------------\n","label: 不是\n","--------------------------------------------------\n","answer: nan\n","--------------------------------------------------\n","title: 海岸之谜\n","--------------------------------------------------\n","puzzle: 在远离城市喧嚣的海边小屋,一天清晨,邻居发现甄加索僵卧在沙滩上,已无生命迹象。现场没有发现任何打斗的迹象。请问甄加索的死因是什么?\n","--------------------------------------------------\n","truth: 甄加索是一位热爱自然的画家,他每年都会来到这个海边小屋寻找灵感。在他生命的最后几天,他一直在创作一幅描绘海洋生物的画作。在画即将完成的前一天晚上,他骑着自行车外出,打算在海边观赏夜景。然而,他在沙滩上意外发现了一只搁浅的海豚,为了救助这只海豚,他耗费了极大的体力,最终成功将其送回海中。筋疲力尽的甄加索在沙滩上睡着了,由于他患有严重的心脏病,却未告知旁人,在寒冷的海风中,他的心脏停止了跳动。因此,警方在现场只发现了车轮痕迹和未完成的画作,而没有发现任何他杀的迹象。\n","--------------------------------------------------\n","text: 背包主人是来湖边放生的吗\n","--------------------------------------------------\n","label: 不是\n","--------------------------------------------------\n","answer: nan\n","--------------------------------------------------\n","title: 湖畔迷影\n","--------------------------------------------------\n","puzzle: 
一个宁静的午后,湖边突然传来了惊叫声。人们跑过去发现,湖边石头上静静躺着一个打翻的背包和一张奇怪的纸条。纸条上写着:“它就在这里,但我无法带它回去。”人们搜寻了周围,但什么也没有发现,背包的主人也不知所踪。\n","--------------------------------------------------\n","truth: 原来,湖边是一个学校的生物研究小组经常进行野外考察的地方。背包的主人是一位对湖中生物充满好奇的学生。他在湖边发现了一只稀有的湖龟,但由于湖龟属于保护动物,他无法私自将其带回去。他在纸条上记录了自己的发现,并准备将这个消息告诉他的研究小组。然而,在他离开去寻找同组的伙伴时,不小心滑入湖中,因不会游泳而遭遇了不幸。而湖龟在他跌入水中时受到了惊吓,悄悄潜入了湖中深处。那张纸条和打翻的背包成为了这个谜团的唯一线索。 \n"]}],"source":["eval_dataset = datasets[\"test\"].select([0, 100]).to_pandas()\n","print_row_details(eval_dataset, range(len(eval_dataset)))"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["You are the host of a situational guessing game. The rules of the game are as follows:\n","\n","1. Participants will receive a riddle that describes a simple yet difficult to understand event.\n","2. The host knows the answer, which is the solution to the riddle.\n","3. Participants can ask any closed-ended questions to uncover the truth of the event.\n","4. For each question, the host will respond with one of the following five options based on the actual situation: Yes, No, Unimportant, Correct answer, or Incorrect questioning. The criteria for each response are as follows:\n"," - If the riddle and answer can provide an answer to the question, respond with: Yes or No\n"," - If the riddle and answer cannot directly or indirectly infer an answer to the question, respond with: Unimportant\n"," - If the participant's question is not a closed-ended question or is difficult to understand, respond with: Incorrect questioning\n"," - If the participant's question essentially reveals the truth of the answer, respond with: Correct answer\n","5. The response must not include any additional information, nor should any word be omitted from the options. 
For example, \"No\" cannot be abbreviated to \"N\".\n","\n","Please strictly follow these rules when answering the participant's questions.\n","\n","**Riddle:** {}\n","\n","**Answer:** {}\n","\n","**Participant's question:** {}\n","\n"]}],"source":["prompt = get_prompt_template(using_p1=False, chinese_prompt=False)\n","print(prompt)"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["你是一个情景猜谜游戏的主持人。游戏规则如下:\n","\n","1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\n","2. 主持人知道谜底,谜底是谜面的答案。\n","3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n","4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\n"," - 若谜面和谜底能找到问题的答案,回答:是或者不是\n"," - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\n"," - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\n"," - 若参与者提问基本还原了谜底真相,回答:回答正确\n","5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\n","\n","请严格按照这些规则回答参与者提出的问题。\n","\n","**谜面:** {}\n","\n","**谜底:** {}\n","\n","**参与者提出的问题:** {}\n","\n"]}],"source":["prompt = get_prompt_template(using_p1=False, chinese_prompt=True)\n","print(prompt)"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[chain:RunnableSequence] Entering Chain run with input:\n","\u001b[0m{}\n","\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[chain:RunnableSequence > prompt:ChatPromptTemplate] Entering Prompt run with input:\n","\u001b[0m{}\n","\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[chain:RunnableSequence > prompt:ChatPromptTemplate] [0ms] Exiting Prompt run with output:\n","\u001b[0m[outputs]\n","\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[chain:RunnableSequence > llm:ChatOpenAI] Entering LLM run with input:\n","\u001b[0m{\n"," \"prompts\": [\n"," \"System: You are an expert in logical reasoning.\\nHuman: 你是一个情景猜谜游戏的主持人。游戏规则如下:\\n\\n1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\\n2. 主持人知道谜底,谜底是谜面的答案。\\n3. 参与者可以询问任何封闭式问题来找寻事件的真相。\\n4. 
对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\\n - 若谜面和谜底能找到问题的答案,回答:是或者不是\\n - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\\n - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\\n - 若参与者提问基本还原了谜底真相,回答:回答正确\\n5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\\n\\n请严格按照这些规则回答参与者提出的问题。\\n\\n**谜面:** 在远离城市喧嚣的海边小屋,一天清晨,邻居发现甄加索僵卧在沙滩上,已无生命迹象。现场没有发现任何打斗的迹象。请问甄加索的死因是什么?\\n\\n**谜底:** 甄加索是一位热爱自然的画家,他每年都会来到这个海边小屋寻找灵感。在他生命的最后几天,他一直在创作一幅描绘海洋生物的画作。在画即将完成的前一天晚上,他骑着自行车外出,打算在海边观赏夜景。然而,他在沙滩上意外发现了一只搁浅的海豚,为了救助这只海豚,他耗费了极大的体力,最终成功将其送回海中。筋疲力尽的甄加索在沙滩上睡着了,由于他患有严重的心脏病,却未告知旁人,在寒冷的海风中,他的心脏停止了跳动。因此,警方在现场只发现了车轮痕迹和未完成的画作,而没有发现任何他杀的迹象。\\n\\n**参与者提出的问题:** 甄加索是自杀吗\"\n"," ]\n","}\n","\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[chain:RunnableSequence > llm:ChatOpenAI] [662ms] Exiting LLM run with output:\n","\u001b[0m{\n"," \"generations\": [\n"," [\n"," {\n"," \"text\": \"不是\",\n"," \"generation_info\": {\n"," \"finish_reason\": \"stop\",\n"," \"logprobs\": null\n"," },\n"," \"type\": \"ChatGeneration\",\n"," \"message\": {\n"," \"lc\": 1,\n"," \"type\": \"constructor\",\n"," \"id\": [\n"," \"langchain\",\n"," \"schema\",\n"," \"messages\",\n"," \"AIMessage\"\n"," ],\n"," \"kwargs\": {\n"," \"content\": \"不是\",\n"," \"response_metadata\": {\n"," \"token_usage\": {\n"," \"completion_tokens\": 1,\n"," \"prompt_tokens\": 557,\n"," \"total_tokens\": 558\n"," },\n"," \"model_name\": \"gpt-4o-mini-2024-07-18\",\n"," \"system_fingerprint\": \"fp_483d39d857\",\n"," \"finish_reason\": \"stop\",\n"," \"logprobs\": null\n"," },\n"," \"type\": \"ai\",\n"," \"id\": \"run-f82f7c1c-3d91-4c9d-966c-12efa0432aa5-0\",\n"," \"usage_metadata\": {\n"," \"input_tokens\": 557,\n"," \"output_tokens\": 1,\n"," \"total_tokens\": 558\n"," },\n"," \"tool_calls\": [],\n"," \"invalid_tool_calls\": []\n"," }\n"," }\n"," }\n"," ]\n"," ],\n"," \"llm_output\": {\n"," \"token_usage\": {\n"," \"completion_tokens\": 1,\n"," \"prompt_tokens\": 557,\n"," \"total_tokens\": 558\n"," },\n"," \"model_name\": \"gpt-4o-mini-2024-07-18\",\n"," 
\"system_fingerprint\": \"fp_483d39d857\"\n"," },\n"," \"run\": null\n","}\n","\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[chain:RunnableSequence] [667ms] Exiting Chain run with output:\n","\u001b[0m[outputs]\n"]},{"data":{"text/plain":["'不是'"]},"execution_count":10,"metadata":{},"output_type":"execute_result"}],"source":["from langchain_core.globals import set_debug\n","\n","set_debug(True)\n","\n","reasoning_with_openai(eval_dataset.iloc[0], prompt, max_tokens=max_new_tokens)"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[chain:RunnableSequence] Entering Chain run with input:\n","\u001b[0m{}\n","\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[chain:RunnableSequence > prompt:ChatPromptTemplate] Entering Prompt run with input:\n","\u001b[0m{}\n","\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[chain:RunnableSequence > prompt:ChatPromptTemplate] [0ms] Exiting Prompt run with output:\n","\u001b[0m[outputs]\n","\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[chain:RunnableSequence > llm:ChatOpenAI] Entering LLM run with input:\n","\u001b[0m{\n"," \"prompts\": [\n"," \"System: You are an expert in logical reasoning.\\nHuman: 你是一个情景猜谜游戏的主持人。游戏规则如下:\\n\\n1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\\n2. 主持人知道谜底,谜底是谜面的答案。\\n3. 参与者可以询问任何封闭式问题来找寻事件的真相。\\n4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\\n - 若谜面和谜底能找到问题的答案,回答:是或者不是\\n - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\\n - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\\n - 若参与者提问基本还原了谜底真相,回答:回答正确\\n5. 
回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\\n\\n请严格按照这些规则回答参与者提出的问题。\\n\\n**谜面:** 一个宁静的午后,湖边突然传来了惊叫声。人们跑过去发现,湖边石头上静静躺着一个打翻的背包和一张奇怪的纸条。纸条上写着:“它就在这里,但我无法带它回去。”人们搜寻了周围,但什么也没有发现,背包的主人也不知所踪。\\n\\n**谜底:** 原来,湖边是一个学校的生物研究小组经常进行野外考察的地方。背包的主人是一位对湖中生物充满好奇的学生。他在湖边发现了一只稀有的湖龟,但由于湖龟属于保护动物,他无法私自将其带回去。他在纸条上记录了自己的发现,并准备将这个消息告诉他的研究小组。然而,在他离开去寻找同组的伙伴时,不小心滑入湖中,因不会游泳而遭遇了不幸。而湖龟在他跌入水中时受到了惊吓,悄悄潜入了湖中深处。那张纸条和打翻的背包成为了这个谜团的唯一线索。 \\n\\n**参与者提出的问题:** 背包主人是来湖边放生的吗\"\n"," ]\n","}\n","\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[chain:RunnableSequence > llm:ChatOpenAI] [593ms] Exiting LLM run with output:\n","\u001b[0m{\n"," \"generations\": [\n"," [\n"," {\n"," \"text\": \"不是。\",\n"," \"generation_info\": {\n"," \"finish_reason\": \"stop\",\n"," \"logprobs\": null\n"," },\n"," \"type\": \"ChatGeneration\",\n"," \"message\": {\n"," \"lc\": 1,\n"," \"type\": \"constructor\",\n"," \"id\": [\n"," \"langchain\",\n"," \"schema\",\n"," \"messages\",\n"," \"AIMessage\"\n"," ],\n"," \"kwargs\": {\n"," \"content\": \"不是。\",\n"," \"response_metadata\": {\n"," \"token_usage\": {\n"," \"completion_tokens\": 2,\n"," \"prompt_tokens\": 556,\n"," \"total_tokens\": 558\n"," },\n"," \"model_name\": \"gpt-4o-2024-05-13\",\n"," \"system_fingerprint\": \"fp_25624ae3a5\",\n"," \"finish_reason\": \"stop\",\n"," \"logprobs\": null\n"," },\n"," \"type\": \"ai\",\n"," \"id\": \"run-89d84f56-e731-49e8-9c8c-6d49f0f5a0fb-0\",\n"," \"usage_metadata\": {\n"," \"input_tokens\": 556,\n"," \"output_tokens\": 2,\n"," \"total_tokens\": 558\n"," },\n"," \"tool_calls\": [],\n"," \"invalid_tool_calls\": []\n"," }\n"," }\n"," }\n"," ]\n"," ],\n"," \"llm_output\": {\n"," \"token_usage\": {\n"," \"completion_tokens\": 2,\n"," \"prompt_tokens\": 556,\n"," \"total_tokens\": 558\n"," },\n"," \"model_name\": \"gpt-4o-2024-05-13\",\n"," \"system_fingerprint\": \"fp_25624ae3a5\"\n"," },\n"," \"run\": null\n","}\n","\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[chain:RunnableSequence] [594ms] Exiting Chain run with 
output:\n","\u001b[0m[outputs]\n"]},{"data":{"text/plain":["'不是。'"]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["reasoning_with_openai(eval_dataset.iloc[-1], prompt, model=\"gpt-4o\", max_tokens=max_new_tokens)"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
textlabelanswertitlepuzzletruth
0甄加索是自杀吗不是NaN海岸之谜在远离城市喧嚣的海边小屋,一天清晨,邻居发现甄加索僵卧在沙滩上,已无生命迹象。现场没有发现任...甄加索是一位热爱自然的画家,他每年都会来到这个海边小屋寻找灵感。在他生命的最后几天,他一直在...
1背包主人是来湖边放生的吗不是NaN湖畔迷影一个宁静的午后,湖边突然传来了惊叫声。人们跑过去发现,湖边石头上静静躺着一个打翻的背包和一张...原来,湖边是一个学校的生物研究小组经常进行野外考察的地方。背包的主人是一位对湖中生物充满好奇...
\n","
"],"text/plain":[" text label answer title \\\n","0 甄加索是自杀吗 不是 NaN 海岸之谜 \n","1 背包主人是来湖边放生的吗 不是 NaN 湖畔迷影 \n","\n"," puzzle \\\n","0 在远离城市喧嚣的海边小屋,一天清晨,邻居发现甄加索僵卧在沙滩上,已无生命迹象。现场没有发现任... \n","1 一个宁静的午后,湖边突然传来了惊叫声。人们跑过去发现,湖边石头上静静躺着一个打翻的背包和一张... \n","\n"," truth \n","0 甄加索是一位热爱自然的画家,他每年都会来到这个海边小屋寻找灵感。在他生命的最后几天,他一直在... \n","1 原来,湖边是一个学校的生物研究小组经常进行野外考察的地方。背包的主人是一位对湖中生物充满好奇... "]},"execution_count":12,"metadata":{},"output_type":"execute_result"}],"source":["eval_dataset"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["100%|██████████| 2/2 [00:01<00:00, 1.09it/s]\n"]}],"source":["set_debug(False)\n","predictions = eval_openai(eval_dataset)"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"data":{"text/plain":["['不是', '不是']"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["predictions"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[{"data":{"text/plain":["{'accuracy': 1.0,\n"," 'precision': 1.0,\n"," 'recall': 1.0,\n"," 'f1': 1.0,\n"," 'ratio_valid_classifications': 1.0}"]},"execution_count":15,"metadata":{},"output_type":"execute_result"}],"source":["calc_metrics(eval_dataset[\"label\"], predictions)"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["^C\n","CPU times: total: 15.6 ms\n","Wall time: 22.7 s\n"]},{"name":"stdout","output_type":"stream","text":["loading env vars from: d:\\code\\projects\\logical-reasoning\\.env\n","Adding d:\\code\\projects\\logical-reasoning to sys.path\n","loading d:\\code\\projects\\logical-reasoning\\llm_toolkit\\logical_reasoning_utils.py\n","gpt-4o-mini datasets/mgtv results/openai_results.csv 16\n","Evaluating model: gpt-4o-mini\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," 
features: ['text', 'label', 'answer', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","--------------------------------------------------\n","text: 甄加索是自杀吗\n","--------------------------------------------------\n","label: 不是\n","--------------------------------------------------\n","answer: nan\n","--------------------------------------------------\n","title: 海岸之谜\n","--------------------------------------------------\n","puzzle: 在远离城市喧嚣的海边小屋,一天清晨,邻居发现甄加索僵卧在沙滩上,已无生命迹象。现场没有发现任何打斗的迹象。请问甄加索的死因是什么?\n","--------------------------------------------------\n","truth: 甄加索是一位热爱自然的画家,他每年都会来到这个海边小屋寻找灵感。在他生命的最后几天,他一直在创作一幅描绘海洋生物的画作。在画即将完成的前一天晚上,他骑着自行车外出,打算在海边观赏夜景。然而,他在沙滩上意外发现了一只搁浅的海豚,为了救助这只海豚,他耗费了极大的体力,最终成功将其送回海中。筋疲力尽的甄加索在沙滩上睡着了,由于他患有严重的心脏病,却未告知旁人,在寒冷的海风中,他的心脏停止了跳动。因此,警方在现场只发现了车轮痕迹和未完成的画作,而没有发现任何他杀的迹象。\n","*** Evaluating with num_shots: 0\n","'DataFrame' object has no attribute 'to_pandas'\n"]},{"name":"stderr","output_type":"stream","text":["\n"," 0%| | 0/3000 [00:00