{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import requests\n",
"hash = \"QmR8etyW3TPFadNtNrW54vfnFqmh8vBrMARWV76EmxCZyk\"\n",
"ipfs_address = \"https://gateway.autonolas.tech/ipfs/\"\n",
"\n",
"accuracy_link= ipfs_address + hash\n",
"response = requests.get(accuracy_link)\n",
"print(response)\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"headers = ['tool', 'tool_accuracy', 'total_requests', 'min', 'max']\n"
]
}
],
"source": [
"from io import StringIO\n",
"accuracy_store = {}\n",
"data = StringIO(response.text)\n",
"csv_reader = csv.reader(data, delimiter=',')\n",
"for row in csv_reader:\n",
" if row[0] == \"tool\":\n",
" print(f\"headers = {row}\")\n",
" continue\n",
" accuracy_store[row[0]] = [\n",
" row[2],\n",
" row[1],\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'claude-prediction-offline': ['481', '57.380457380457386'], 'claude-prediction-online': ['1055', '61.137440758293835'], 'prediction-offline': ['4465', '67.41321388577828'], 'prediction-offline-sme': ['61', '70.49180327868852'], 'prediction-online': ['9490', '66.00632244467862'], 'prediction-online-sme': ['14642', '65.67408823931157'], 'prediction-request-rag': ['2691', '63.58231140839836'], 'prediction-request-rag-claude': ['7428', '65.64351103931072'], 'prediction-request-reasoning': ['17372', '67.11374625834677'], 'prediction-request-reasoning-claude': ['2470', '66.72064777327935'], 'prediction-url-cot-claude': ['1596', '61.904761904761905']}\n"
]
}
],
"source": [
"print(accuracy_store)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"fpmms = pd.read_parquet('../data/fpmms.parquet')\n",
"tools = pd.read_parquet('../data/tools.parquet')\n",
"trades = pd.read_parquet('../data/all_trades_profitability.parquet')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"INC_TOOLS = [\n",
" \"prediction-online\",\n",
" \"prediction-offline\",\n",
" \"claude-prediction-online\",\n",
" \"claude-prediction-offline\",\n",
" \"prediction-offline-sme\",\n",
" \"prediction-online-sme\",\n",
" \"prediction-request-rag\",\n",
" \"prediction-request-reasoning\",\n",
" \"prediction-url-cot-claude\",\n",
" \"prediction-request-rag-claude\",\n",
" \"prediction-request-reasoning-claude\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" win | \n",
" tool | \n",
" tool_accuracy | \n",
" total_requests | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" claude-prediction-offline | \n",
" 66.308244 | \n",
" 279 | \n",
"
\n",
" \n",
" 1 | \n",
" claude-prediction-online | \n",
" 58.914027 | \n",
" 1105 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-offline | \n",
" 67.717915 | \n",
" 2283 | \n",
"
\n",
" \n",
" 3 | \n",
" prediction-offline-sme | \n",
" 55.555556 | \n",
" 18 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-online | \n",
" 65.459066 | \n",
" 5631 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-online-sme | \n",
" 67.417656 | \n",
" 8167 | \n",
"
\n",
" \n",
" 6 | \n",
" prediction-request-rag | \n",
" 64.217072 | \n",
" 1769 | \n",
"
\n",
" \n",
" 7 | \n",
" prediction-request-rag-claude | \n",
" 69.554566 | \n",
" 4490 | \n",
"
\n",
" \n",
" 8 | \n",
" prediction-request-reasoning | \n",
" 68.813594 | \n",
" 9828 | \n",
"
\n",
" \n",
" 9 | \n",
" prediction-request-reasoning-claude | \n",
" 68.910256 | \n",
" 2184 | \n",
"
\n",
" \n",
" 10 | \n",
" prediction-url-cot-claude | \n",
" 64.584980 | \n",
" 1265 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"win tool tool_accuracy total_requests\n",
"0 claude-prediction-offline 66.308244 279\n",
"1 claude-prediction-online 58.914027 1105\n",
"2 prediction-offline 67.717915 2283\n",
"3 prediction-offline-sme 55.555556 18\n",
"4 prediction-online 65.459066 5631\n",
"5 prediction-online-sme 67.417656 8167\n",
"6 prediction-request-rag 64.217072 1769\n",
"7 prediction-request-rag-claude 69.554566 4490\n",
"8 prediction-request-reasoning 68.813594 9828\n",
"9 prediction-request-reasoning-claude 68.910256 2184\n",
"10 prediction-url-cot-claude 64.584980 1265"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tools_inc = tools[tools['tool'].isin(INC_TOOLS)]\n",
"# filtering errors\n",
"tools_non_error = tools_inc[tools_inc['error'] != 1]\n",
"tools_non_error.loc[:, 'currentAnswer'] = tools_non_error['currentAnswer'].replace({'no': 'No', 'yes': 'Yes'})\n",
"tools_non_error = tools_non_error[tools_non_error['currentAnswer'].isin(['Yes', 'No'])]\n",
"tools_non_error = tools_non_error[tools_non_error['vote'].isin(['Yes', 'No'])]\n",
"tools_non_error['win'] = (tools_non_error['currentAnswer'] == tools_non_error['vote']).astype(int)\n",
"tools_non_error.columns = tools_non_error.columns.astype(str)\n",
"wins = tools_non_error.groupby(['tool', 'win']).size().unstack().fillna(0)\n",
"wins['tool_accuracy'] = (wins[1] / (wins[0] + wins[1])) * 100\n",
"wins.reset_index(inplace=True)\n",
"wins['total_requests'] = wins[0] + wins[1]\n",
"wins.columns = wins.columns.astype(str)\n",
"wins = wins[[\"tool\", \"tool_accuracy\", \"total_requests\"]]\n",
"wins"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" min | \n",
" max | \n",
"
\n",
" \n",
" tool | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" claude-prediction-offline | \n",
" 2024-04-23 13:09:30 | \n",
" 2024-06-10 00:31:30 | \n",
"
\n",
" \n",
" claude-prediction-online | \n",
" 2024-04-12 12:24:20 | \n",
" 2024-06-09 21:41:20 | \n",
"
\n",
" \n",
" prediction-offline | \n",
" 2024-04-12 12:20:10 | \n",
" 2024-06-08 23:45:00 | \n",
"
\n",
" \n",
" prediction-offline-sme | \n",
" 2024-04-16 07:58:45 | \n",
" 2024-04-29 20:45:15 | \n",
"
\n",
" \n",
" prediction-online | \n",
" 2024-04-16 05:52:40 | \n",
" 2024-06-09 21:47:20 | \n",
"
\n",
" \n",
" prediction-online-sme | \n",
" 2024-04-12 11:51:30 | \n",
" 2024-06-10 00:06:00 | \n",
"
\n",
" \n",
" prediction-request-rag | \n",
" 2024-04-12 11:39:40 | \n",
" 2024-06-09 21:17:45 | \n",
"
\n",
" \n",
" prediction-request-rag-claude | \n",
" 2024-04-12 11:14:30 | \n",
" 2024-06-07 11:42:30 | \n",
"
\n",
" \n",
" prediction-request-reasoning | \n",
" 2024-04-12 11:57:05 | \n",
" 2024-06-09 21:50:45 | \n",
"
\n",
" \n",
" prediction-request-reasoning-claude | \n",
" 2024-04-12 11:53:55 | \n",
" 2024-06-05 05:00:10 | \n",
"
\n",
" \n",
" prediction-url-cot-claude | \n",
" 2024-04-12 11:37:15 | \n",
" 2024-06-05 05:21:10 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" min max\n",
"tool \n",
"claude-prediction-offline 2024-04-23 13:09:30 2024-06-10 00:31:30\n",
"claude-prediction-online 2024-04-12 12:24:20 2024-06-09 21:41:20\n",
"prediction-offline 2024-04-12 12:20:10 2024-06-08 23:45:00\n",
"prediction-offline-sme 2024-04-16 07:58:45 2024-04-29 20:45:15\n",
"prediction-online 2024-04-16 05:52:40 2024-06-09 21:47:20\n",
"prediction-online-sme 2024-04-12 11:51:30 2024-06-10 00:06:00\n",
"prediction-request-rag 2024-04-12 11:39:40 2024-06-09 21:17:45\n",
"prediction-request-rag-claude 2024-04-12 11:14:30 2024-06-07 11:42:30\n",
"prediction-request-reasoning 2024-04-12 11:57:05 2024-06-09 21:50:45\n",
"prediction-request-reasoning-claude 2024-04-12 11:53:55 2024-06-05 05:00:10\n",
"prediction-url-cot-claude 2024-04-12 11:37:15 2024-06-05 05:21:10"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tools_inc = tools[tools['tool'].isin(INC_TOOLS)]\n",
"# filtering errors\n",
"tools_non_error = tools_inc[tools_inc['error'] != 1]\n",
"tools_non_error.loc[:, 'currentAnswer'] = tools_non_error['currentAnswer'].replace({'no': 'No', 'yes': 'Yes'})\n",
"tools_non_error = tools_non_error[tools_non_error['currentAnswer'].isin(['Yes', 'No'])]\n",
"tools_non_error = tools_non_error[tools_non_error['vote'].isin(['Yes', 'No'])]\n",
"tools_non_error['win'] = (tools_non_error['currentAnswer'] == tools_non_error['vote']).astype(int)\n",
"tools_non_error.columns = tools_non_error.columns.astype(str)\n",
"timeline = tools_non_error.groupby(['tool'])[\"request_time\"].agg([\"min\",\"max\"])\n",
"timeline"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tool | \n",
" tool_accuracy | \n",
" total_requests | \n",
" min | \n",
" max | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" claude-prediction-offline | \n",
" 66.308244 | \n",
" 279 | \n",
" 2024-04-23 13:09:30 | \n",
" 2024-06-10 00:31:30 | \n",
"
\n",
" \n",
" 1 | \n",
" claude-prediction-online | \n",
" 58.914027 | \n",
" 1105 | \n",
" 2024-04-12 12:24:20 | \n",
" 2024-06-09 21:41:20 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-offline | \n",
" 67.717915 | \n",
" 2283 | \n",
" 2024-04-12 12:20:10 | \n",
" 2024-06-08 23:45:00 | \n",
"
\n",
" \n",
" 3 | \n",
" prediction-offline-sme | \n",
" 55.555556 | \n",
" 18 | \n",
" 2024-04-16 07:58:45 | \n",
" 2024-04-29 20:45:15 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-online | \n",
" 65.459066 | \n",
" 5631 | \n",
" 2024-04-16 05:52:40 | \n",
" 2024-06-09 21:47:20 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-online-sme | \n",
" 67.417656 | \n",
" 8167 | \n",
" 2024-04-12 11:51:30 | \n",
" 2024-06-10 00:06:00 | \n",
"
\n",
" \n",
" 6 | \n",
" prediction-request-rag | \n",
" 64.217072 | \n",
" 1769 | \n",
" 2024-04-12 11:39:40 | \n",
" 2024-06-09 21:17:45 | \n",
"
\n",
" \n",
" 7 | \n",
" prediction-request-rag-claude | \n",
" 69.554566 | \n",
" 4490 | \n",
" 2024-04-12 11:14:30 | \n",
" 2024-06-07 11:42:30 | \n",
"
\n",
" \n",
" 8 | \n",
" prediction-request-reasoning | \n",
" 68.813594 | \n",
" 9828 | \n",
" 2024-04-12 11:57:05 | \n",
" 2024-06-09 21:50:45 | \n",
"
\n",
" \n",
" 9 | \n",
" prediction-request-reasoning-claude | \n",
" 68.910256 | \n",
" 2184 | \n",
" 2024-04-12 11:53:55 | \n",
" 2024-06-05 05:00:10 | \n",
"
\n",
" \n",
" 10 | \n",
" prediction-url-cot-claude | \n",
" 64.584980 | \n",
" 1265 | \n",
" 2024-04-12 11:37:15 | \n",
" 2024-06-05 05:21:10 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tool tool_accuracy total_requests \\\n",
"0 claude-prediction-offline 66.308244 279 \n",
"1 claude-prediction-online 58.914027 1105 \n",
"2 prediction-offline 67.717915 2283 \n",
"3 prediction-offline-sme 55.555556 18 \n",
"4 prediction-online 65.459066 5631 \n",
"5 prediction-online-sme 67.417656 8167 \n",
"6 prediction-request-rag 64.217072 1769 \n",
"7 prediction-request-rag-claude 69.554566 4490 \n",
"8 prediction-request-reasoning 68.813594 9828 \n",
"9 prediction-request-reasoning-claude 68.910256 2184 \n",
"10 prediction-url-cot-claude 64.584980 1265 \n",
"\n",
" min max \n",
"0 2024-04-23 13:09:30 2024-06-10 00:31:30 \n",
"1 2024-04-12 12:24:20 2024-06-09 21:41:20 \n",
"2 2024-04-12 12:20:10 2024-06-08 23:45:00 \n",
"3 2024-04-16 07:58:45 2024-04-29 20:45:15 \n",
"4 2024-04-16 05:52:40 2024-06-09 21:47:20 \n",
"5 2024-04-12 11:51:30 2024-06-10 00:06:00 \n",
"6 2024-04-12 11:39:40 2024-06-09 21:17:45 \n",
"7 2024-04-12 11:14:30 2024-06-07 11:42:30 \n",
"8 2024-04-12 11:57:05 2024-06-09 21:50:45 \n",
"9 2024-04-12 11:53:55 2024-06-05 05:00:10 \n",
"10 2024-04-12 11:37:15 2024-06-05 05:21:10 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"total = wins.merge(timeline,how=\"left\", on=\"tool\")\n",
"total"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"total.to_csv(\"accuracy_info.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def extract_question(text):\n",
" pattern = r'\"([^\"]+\\?)\"'\n",
" match = re.search(pattern, text)\n",
" if match:\n",
" return match.group(1)\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def get_current_answer(q):\n",
" return trades[trades['title'] == q]['current_answer'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# only select trades in May 2024\n",
"trades['creation_timestamp'] = pd.to_datetime(trades['creation_timestamp'])\n",
"trades = trades[trades['creation_timestamp'].dt.month == 5]\n",
"trades = trades[trades['creation_timestamp'].dt.year == 2024]\n",
"\n",
"# make a column for winning_vote\n",
"tools['winning_vote'] = (tools['vote'] == tools['currentAnswer'])\n",
"tools = tools[tools['tool']!= 'resolve-market-reasoning-gpt-4'].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"tools['prompt_request'] = tools['prompt_request'].apply(extract_question)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"trades_grouped = trades.groupby(['title', 'winning_trade']).size().unstack().fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"winning_trades_percentage = trades_grouped[True] / trades_grouped.sum(axis=1)\n",
"winning_trades_percentage = winning_trades_percentage.reset_index()\n",
"winning_trades_percentage.columns = ['title', 'winning_trade_percentage']\n",
"winning_trades_percentage['num_trades'] = list(trades_grouped.sum(axis=1).values)\n",
"winning_trades_percentage_bottom_50 = winning_trades_percentage.sort_values(by='winning_trade_percentage', ascending=False)[-50:].reset_index(drop=True)\n",
"winning_trades_percentage_top_50 = winning_trades_percentage.sort_values(by='winning_trade_percentage', ascending=False)[:50].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# winning_trades_percentage.sort_values(by='winning_trade_percentage', ascending=False).reset_index(drop=True).to_csv('winning_trades_percentage.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Will Kylian Mbappe leave Paris St-Germain at the end of the season by 16 May 2024?',\n",
" 'Will BlizzCon be reinstated on or by 1 May 2024 after its cancellation in 2024?',\n",
" 'Will Joe Biden approve more weapons for Ukraine by 4 May 2024?',\n",
" \"Will FiiO's new custom in-ear monitors become the top-selling wireless earbuds by 9 May 2024?\",\n",
" 'Will Mohamed Salah leave Liverpool on 7 May 2024?',\n",
" \"Will Ryan Gosling accept a 'dark' role in a film by 14 May 2024?\",\n",
" 'Will the Philadelphia 76ers win the NBA play-offs on 7 May 2024?',\n",
" 'Will the Panamanian presidential election result in a clear victor by 12 May 2024?',\n",
" 'Will the Museum of Old and New Art in Tasmania be allowed to keep its exhibit women-only by 14 May 2024?',\n",
" \"Will Diego Maradona's 'Stolen' Golden Ball be auctioned off on 14 May 2024?\",\n",
" 'Will the Mercedes G-Wagen release an electric version on 1 May 2024?',\n",
" 'Will the Israeli government lift the broadcast ban on Al Jazeera on or before 13 May 2024?',\n",
" 'Will Intel release its Core Ultra 200 Arrow Lake CPUs by 16 May 2024?',\n",
" 'Will the Atlanta City Council pay $3.8 million to settle a lawsuit by the family of a church deacon who died in a struggle with a city police officer by 13 May 2024?',\n",
" 'Will Voyager-1 continue to send readable data until 1 May 2024?',\n",
" 'Will the Amber Alert issued in New Mexico result in the discovery of the missing 10-month-old baby by 13 May 2024?',\n",
" \"Will Florida's ban on lab-grown meat be overturned by 12 May 2024?\",\n",
" \"Will the US government successfully distribute the $138.7 million payout to Larry Nassar's victims by 1 May 2024?\",\n",
" 'Will a new sport be officially added to the Olympics programme on 16 May 2024?',\n",
" \"Will Kristi Noem be announced as Donald Trump's vice presidential running mate by 6 May 2024?\",\n",
" 'Will the United Auto Workers union strike against Daimler Truck on or by 7 May 2024?',\n",
" 'Will the World Snooker Championship 2024 conclude with Judd Trump or Tom Ford as the winner by May 5, 2024?',\n",
" \"Will Maria Georgas be announced as the next 'Bachelorette' lead on 9 May 2024?\",\n",
" 'Will Apple release new iPads at their event on May 7, 2024?',\n",
" 'Will Joe Biden still be the President of the United States on 11 May 2024?',\n",
" \"Will the world's biggest 3D printer be used to make parts of houses by 2 May 2024?\",\n",
" \"Will Anthony Edwards be named NBA's MVP on 11 May 2024?\",\n",
" 'Will a winner be declared in the Eurovision 2024 grand final by 19 May 2024?',\n",
" \"Will a new mission be launched to explore the moon's 'hidden side' by 12 May 2024?\",\n",
" 'Will Mike Tyson win his bout against Jake Paul on 7 May 2024?',\n",
" 'Will the bird flu outbreak be declared a global pandemic by 12 May 2024?',\n",
" 'Will the new Apple Pencil Pro be revealed by 15 May 2024?',\n",
" \"Will the amateur angler who landed UK's 'biggest fish' in Essex catch another record-breaking fish by 7 May 2024?\",\n",
" \"Will Saul 'Canelo' Alvarez successfully defend his WBA, WBC, WBO, and IBF titles again by 13 May 2024?\",\n",
" \"Will Taylor Swift's 'The Tortured Poets Department' album reach number 1 on Billboard 200 on 3 May 2024?\",\n",
" 'Will Joe Biden attend the White House Correspondents Dinner on 5 May 2024?',\n",
" 'Will King Charles perform public duties on 5 May 2024, after his progress in cancer treatment?',\n",
" \"Will LinkedIn's new puzzle games Pinpoint, Queens, and Crossclimb be successful on their platform by 9 May 2024?\",\n",
" 'Will South Dakota Governor Kristi Noem resign over the puppy killing controversy by 15 May 2024?',\n",
" 'Will Apple announce the release of a new M4 chip by 13 May 2024?',\n",
" 'Will Eric Adams still be the mayor of New York City on 10 May 2024?',\n",
" \"Will the livestream video 'portals' connecting New York City and Dublin still be operational on 19 May 2024?\",\n",
" 'Will there be more pro-Palestinian protests on US university campuses on 6 May 2024?',\n",
" 'Will Google Pixel 8a be released at Google I/O 2024 on 14 May?',\n",
" 'Will Apple announce more than just a spec bump at the May 2024 iPad event?',\n",
" \"Will Apple's new Magic Keyboard for the iPad Pro M4 be released by 15 May 2024?\",\n",
" 'Will the UEFA Champions League final be between PSG and Borussia Dortmund on 13 May 2024?',\n",
" 'Will the FBI report an increase in scams targeting Americans older than 60 in 2024?',\n",
" 'Will Erik ten Hag remain as Manchester United manager on 17 May 2024?',\n",
" 'Will Jofra Archer be a part of the England squad for T20 World Cup in June 2024?']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"winning_trades_percentage_top_50['title'].tolist()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[\"Will 'Scavengers Reign' be renewed for a second season on Netflix by 19 May 2024?\",\n",
" 'Will Fiona Harvey officially file a lawsuit against Netflix and Richard Gadd by 17 May 2024?',\n",
" 'Will the final report on the Baltimore bridge collapse be released by 20 May 2024?',\n",
" 'Will the Autonomous Racing League successfully hold their second race by May 3, 2024?',\n",
" 'Will Trent Staggs win the Senatorial race to replace Sen. Mitt Romney (R-UT) on 5 May 2024?',\n",
" 'Will the Houston area experience flooding conditions on 11 May 2024?',\n",
" \"Will 'Wednesday' season 2 be released on Netflix by 1 May 2024?\",\n",
" 'Will Arsenal win against Bournemouth in the Premier League match on 12 May 2024?',\n",
" 'Will Qualcomm release its Snapdragon X Plus laptop chip by 1 May 2024?',\n",
" \"Will Feyenoord's Arne Slot become the new manager of Liverpool by 1 May 2024?\",\n",
" 'Will the FCC receive additional funding for replacing Huawei gear by 10 May 2024?',\n",
" 'Will there be any major cyber attack on an organization using AI before 2 May 2024?',\n",
" 'Will Sony complete the takeover of Paramount by 11 May 2024?',\n",
" \"Will 'Hell's Kitchen' win the Tony Awards for Best Musical on 7 May 2024?\",\n",
" 'Will Tesla announce reinstating any laid off supercharger workers by 11 May 2024?',\n",
" 'Will there be another tornado in Nebraska and Iowa on 6 May 2024?',\n",
" 'Will the DJI drones be officially banned in the United States by 4 May 2024?',\n",
" 'Will OpenAI debut a multimodal AI digital assistant by 19 May 2024?',\n",
" 'Will TikTok be purchased by a Wall Street or Tech billionaire by 2 May 2024?',\n",
" \"Will the 'Lost' Gustav Klimt painting be sold at the auction in Vienna on 3 May 2024?\",\n",
" \"Will the Federal Communications Commission levy fines against AT&T, Sprint, T-Mobile, and Verizon for illegally sharing customers' location data by 9 May 2024?\",\n",
" 'Will the Manchester City win the WSL title on 14 May 2024?',\n",
" 'Will Meta start making profit from generative AI by 3 May 2024?',\n",
" 'Will Apple launch an AI-powered iOS 18 on or by 1 May 2024?',\n",
" 'Will iOS 18 receive a major AI overhaul by 6 May 2024?',\n",
" 'Will Ippei Mizuhara be sentenced for bank fraud by 15 May 2024?',\n",
" 'Will Tesla lay off nearly 2,700 workers at its Austin, Texas factory by 1 May 2024?',\n",
" 'Will Manchester City win the Premier League title on 11 May 2024?',\n",
" 'Will there be another deadly pandemic by 8 May 2024?',\n",
" 'Will China successfully collect samples from the far side of the Moon on 10 May 2024?',\n",
" \"Will the American Airlines correct their system's error of mistaking 101-year-old passenger for a baby by 7 May 2024?\",\n",
" 'Will the Boeing Starliner capsule successfully complete its first astronaut-crewed flight to the International Space Station by 13 May 2024?',\n",
" \"Will the Technics' special-edition turntable in collaboration with Lamborghini be released by 17 May 2024?\",\n",
" 'Will the Florida Panthers win against the Boston Bruins in the Game 3 on 17 May 2024?',\n",
" 'Will Harvard Yard be free from Anti-Israel protests by 2 May 2024?',\n",
" \"Will Samsung's latest jibe have any impact on Apple's sales by 11 May 2024?\",\n",
" \"Will the Miss USA organization respond to the call for 'full transparency' from contestants by 16 May 2024?\",\n",
" 'Will Tom Daley win a medal at the Paris Olympics 2024 by 14 May 2024?',\n",
" \"Will Liverpool win any more trophies in Jurgen Klopp's final season?\",\n",
" 'Will Liverpool win any more trophies by 2 May 2024?',\n",
" 'Will Caitlin Clark score more than 20 points in her next NBA game by 10 May 2024?',\n",
" 'Will the statues of civil rights leader Daisy Bates and singer Johnny Cash replace the Arkansas statues at the U.S Capitol by 14 May 2024?',\n",
" \"Will the season 6 of Netflix's Cobra Kai be released in 3 parts by 12 May 2024?\",\n",
" \"Will the 'Don't Say Gay' education restrictions bill be implemented in Alabama on or before 1 May 2024?\",\n",
" \"Will the 'lost' Gustav Klimt painting be successfully auctioned by 3 May 2024?\",\n",
" 'Will the Kansas City Chiefs win their next game on or before May 15, 2024?',\n",
" 'Will Lando Norris win another F1 race by 15 May 2024?',\n",
" 'Will Pennsylvania be a red state by 6 May 2024?',\n",
" 'Will Tesla face significant financial troubles by 11 May 2024?',\n",
" 'Will the BattlerGC Pro be released for the GameCube on or by 3 May 2024?']"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"winning_trades_percentage_bottom_50['title'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"def losing_percentage(q):\n",
" print(f\"Losing percentage for: {q}\")\n",
" q_losing = tools[tools['prompt_request'].str.contains(q)].groupby(['tool', 'winning_vote']).size().unstack().fillna(0)\n",
" q_losing_perc = q_losing[False] / (q_losing[False] + q_losing[True])\n",
" q_losing_perc = q_losing_perc.reset_index()\n",
" q_losing_perc.columns = ['tool', 'losing_percentage']\n",
" q_losing_perc['num_calls'] = list(q_losing.sum(axis=1).values)\n",
" q_losing_perc = q_losing_perc.sort_values(by='losing_percentage', ascending=False)\n",
" return q_losing_perc"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Losing percentage for: Will 'Scavengers Reign' be renewed for a second season on Netflix by 19 May 2024?\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tool | \n",
" losing_percentage | \n",
" num_calls | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" prediction-offline | \n",
" 1.000000 | \n",
" 40.0 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-request-rag-claude | \n",
" 1.000000 | \n",
" 17.0 | \n",
"
\n",
" \n",
" 7 | \n",
" prediction-url-cot-claude | \n",
" 1.000000 | \n",
" 2.0 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-online-sme | \n",
" 0.656716 | \n",
" 67.0 | \n",
"
\n",
" \n",
" 6 | \n",
" prediction-request-reasoning-claude | \n",
" 0.571429 | \n",
" 7.0 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-request-reasoning | \n",
" 0.538462 | \n",
" 52.0 | \n",
"
\n",
" \n",
" 3 | \n",
" prediction-request-rag | \n",
" 0.250000 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1 | \n",
" prediction-online | \n",
" 0.185185 | \n",
" 27.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tool losing_percentage num_calls\n",
"0 prediction-offline 1.000000 40.0\n",
"4 prediction-request-rag-claude 1.000000 17.0\n",
"7 prediction-url-cot-claude 1.000000 2.0\n",
"2 prediction-online-sme 0.656716 67.0\n",
"6 prediction-request-reasoning-claude 0.571429 7.0\n",
"5 prediction-request-reasoning 0.538462 52.0\n",
"3 prediction-request-rag 0.250000 4.0\n",
"1 prediction-online 0.185185 27.0"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# have confirmed market resolution was correct\n",
"losing_percentage(winning_trades_percentage_bottom_50.loc[0, 'title'])"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Losing percentage for: Will 'Scavengers Reign' be renewed for a second season on Netflix by 19 May 2024?\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tool | \n",
" losing_percentage | \n",
" num_calls | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" prediction-offline | \n",
" 1.000000 | \n",
" 40.0 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-request-rag-claude | \n",
" 1.000000 | \n",
" 17.0 | \n",
"
\n",
" \n",
" 7 | \n",
" prediction-url-cot-claude | \n",
" 1.000000 | \n",
" 2.0 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-online-sme | \n",
" 0.656716 | \n",
" 67.0 | \n",
"
\n",
" \n",
" 6 | \n",
" prediction-request-reasoning-claude | \n",
" 0.571429 | \n",
" 7.0 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-request-reasoning | \n",
" 0.538462 | \n",
" 52.0 | \n",
"
\n",
" \n",
" 3 | \n",
" prediction-request-rag | \n",
" 0.250000 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1 | \n",
" prediction-online | \n",
" 0.185185 | \n",
" 27.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tool losing_percentage num_calls\n",
"0 prediction-offline 1.000000 40.0\n",
"4 prediction-request-rag-claude 1.000000 17.0\n",
"7 prediction-url-cot-claude 1.000000 2.0\n",
"2 prediction-online-sme 0.656716 67.0\n",
"6 prediction-request-reasoning-claude 0.571429 7.0\n",
"5 prediction-request-reasoning 0.538462 52.0\n",
"3 prediction-request-rag 0.250000 4.0\n",
"1 prediction-online 0.185185 27.0"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# have confirmed currentAnswer\n",
"losing_percentage(winning_trades_percentage_bottom_50.loc[0, 'title'])"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Losing percentage for: Will Fiona Harvey officially file a lawsuit against Netflix and Richard Gadd by 17 May 2024?\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tool | \n",
" losing_percentage | \n",
" num_calls | \n",
"
\n",
" \n",
" \n",
" \n",
" 7 | \n",
" prediction-url-cot-claude | \n",
" 1.000000 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-online-sme | \n",
" 0.977273 | \n",
" 44.0 | \n",
"
\n",
" \n",
" 1 | \n",
" prediction-online | \n",
" 0.975000 | \n",
" 40.0 | \n",
"
\n",
" \n",
" 0 | \n",
" prediction-offline | \n",
" 0.677419 | \n",
" 31.0 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-request-reasoning | \n",
" 0.534483 | \n",
" 58.0 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-request-rag-claude | \n",
" 0.223881 | \n",
" 67.0 | \n",
"
\n",
" \n",
" 6 | \n",
" prediction-request-reasoning-claude | \n",
" 0.200000 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 3 | \n",
" prediction-request-rag | \n",
" 0.000000 | \n",
" 8.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tool losing_percentage num_calls\n",
"7 prediction-url-cot-claude 1.000000 1.0\n",
"2 prediction-online-sme 0.977273 44.0\n",
"1 prediction-online 0.975000 40.0\n",
"0 prediction-offline 0.677419 31.0\n",
"5 prediction-request-reasoning 0.534483 58.0\n",
"4 prediction-request-rag-claude 0.223881 67.0\n",
"6 prediction-request-reasoning-claude 0.200000 5.0\n",
"3 prediction-request-rag 0.000000 8.0"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# have confirmed currentAnswer\n",
"losing_percentage(winning_trades_percentage_bottom_50.loc[1, 'title'])"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Losing percentage for: Will the final report on the Baltimore bridge collapse be released by 20 May 2024?\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tool | \n",
" losing_percentage | \n",
" num_calls | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" claude-prediction-offline | \n",
" 1.000000 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 1 | \n",
" claude-prediction-online | \n",
" 1.000000 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-offline | \n",
" 1.000000 | \n",
" 87.0 | \n",
"
\n",
" \n",
" 6 | \n",
" prediction-request-rag-claude | \n",
" 1.000000 | \n",
" 25.0 | \n",
"
\n",
" \n",
" 9 | \n",
" prediction-url-cot-claude | \n",
" 1.000000 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 3 | \n",
" prediction-online | \n",
" 0.951220 | \n",
" 41.0 | \n",
"
\n",
" \n",
" 8 | \n",
" prediction-request-reasoning-claude | \n",
" 0.833333 | \n",
" 6.0 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-request-rag | \n",
" 0.714286 | \n",
" 7.0 | \n",
"
\n",
" \n",
" 7 | \n",
" prediction-request-reasoning | \n",
" 0.437500 | \n",
" 48.0 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-online-sme | \n",
" 0.394366 | \n",
" 71.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tool losing_percentage num_calls\n",
"0 claude-prediction-offline 1.000000 5.0\n",
"1 claude-prediction-online 1.000000 1.0\n",
"2 prediction-offline 1.000000 87.0\n",
"6 prediction-request-rag-claude 1.000000 25.0\n",
"9 prediction-url-cot-claude 1.000000 1.0\n",
"3 prediction-online 0.951220 41.0\n",
"8 prediction-request-reasoning-claude 0.833333 6.0\n",
"5 prediction-request-rag 0.714286 7.0\n",
"7 prediction-request-reasoning 0.437500 48.0\n",
"4 prediction-online-sme 0.394366 71.0"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# have confirmed currentAnswer\n",
"losing_percentage(winning_trades_percentage_bottom_50.loc[2, 'title'])"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Losing percentage for: Will the Autonomous Racing League successfully hold their second race by May 3, 2024?\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tool | \n",
" losing_percentage | \n",
" num_calls | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" claude-prediction-offline | \n",
" 1.0 | \n",
" 2.0 | \n",
"
\n",
" \n",
" 1 | \n",
" prediction-offline | \n",
" 1.0 | \n",
" 23.0 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-online | \n",
" 1.0 | \n",
" 14.0 | \n",
"
\n",
" \n",
" 3 | \n",
" prediction-online-sme | \n",
" 1.0 | \n",
" 18.0 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-request-rag | \n",
" 1.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-request-rag-claude | \n",
" 1.0 | \n",
" 8.0 | \n",
"
\n",
" \n",
" 8 | \n",
" prediction-url-cot-claude | \n",
" 1.0 | \n",
" 6.0 | \n",
"
\n",
" \n",
" 6 | \n",
" prediction-request-reasoning | \n",
" 0.0 | \n",
" 18.0 | \n",
"
\n",
" \n",
" 7 | \n",
" prediction-request-reasoning-claude | \n",
" 0.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tool losing_percentage num_calls\n",
"0 claude-prediction-offline 1.0 2.0\n",
"1 prediction-offline 1.0 23.0\n",
"2 prediction-online 1.0 14.0\n",
"3 prediction-online-sme 1.0 18.0\n",
"4 prediction-request-rag 1.0 5.0\n",
"5 prediction-request-rag-claude 1.0 8.0\n",
"8 prediction-url-cot-claude 1.0 6.0\n",
"6 prediction-request-reasoning 0.0 18.0\n",
"7 prediction-request-reasoning-claude 0.0 3.0"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# have confirmed currentAnswer\n",
"losing_percentage(winning_trades_percentage_bottom_50.loc[3, 'title'])"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Losing percentage for: Will the Houston area experience flooding conditions on 11 May 2024?\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tool | \n",
" losing_percentage | \n",
" num_calls | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" claude-prediction-offline | \n",
" 1.000000 | \n",
" 2.0 | \n",
"
\n",
" \n",
" 1 | \n",
" claude-prediction-online | \n",
" 1.000000 | \n",
" 6.0 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-offline | \n",
" 1.000000 | \n",
" 58.0 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-online-sme | \n",
" 1.000000 | \n",
" 39.0 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-request-rag | \n",
" 1.000000 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 8 | \n",
" prediction-request-reasoning-claude | \n",
" 1.000000 | \n",
" 8.0 | \n",
"
\n",
" \n",
" 9 | \n",
" prediction-url-cot-claude | \n",
" 1.000000 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 6 | \n",
" prediction-request-rag-claude | \n",
" 0.754717 | \n",
" 53.0 | \n",
"
\n",
" \n",
" 7 | \n",
" prediction-request-reasoning | \n",
" 0.369048 | \n",
" 84.0 | \n",
"
\n",
" \n",
" 3 | \n",
" prediction-online | \n",
" 0.166667 | \n",
" 72.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tool losing_percentage num_calls\n",
"0 claude-prediction-offline 1.000000 2.0\n",
"1 claude-prediction-online 1.000000 6.0\n",
"2 prediction-offline 1.000000 58.0\n",
"4 prediction-online-sme 1.000000 39.0\n",
"5 prediction-request-rag 1.000000 4.0\n",
"8 prediction-request-reasoning-claude 1.000000 8.0\n",
"9 prediction-url-cot-claude 1.000000 5.0\n",
"6 prediction-request-rag-claude 0.754717 53.0\n",
"7 prediction-request-reasoning 0.369048 84.0\n",
"3 prediction-online 0.166667 72.0"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"losing_percentage(winning_trades_percentage_bottom_50.loc[5, 'title'])"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Losing percentage for: Will 'Wednesday' season 2 be released on Netflix by 1 May 2024?\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tool | \n",
" losing_percentage | \n",
" num_calls | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" prediction-online-sme | \n",
" 0.750000 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-request-reasoning-claude | \n",
" 0.750000 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-request-rag | \n",
" 0.666667 | \n",
" 6.0 | \n",
"
\n",
" \n",
" 3 | \n",
" prediction-request-rag-claude | \n",
" 0.500000 | \n",
" 2.0 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-request-reasoning | \n",
" 0.400000 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 0 | \n",
" claude-prediction-online | \n",
" 0.000000 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tool losing_percentage num_calls\n",
"1 prediction-online-sme 0.750000 4.0\n",
"5 prediction-request-reasoning-claude 0.750000 4.0\n",
"2 prediction-request-rag 0.666667 6.0\n",
"3 prediction-request-rag-claude 0.500000 2.0\n",
"4 prediction-request-reasoning 0.400000 5.0\n",
"0 claude-prediction-online 0.000000 1.0"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"losing_percentage(winning_trades_percentage_bottom_50.loc[6, 'title'])"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Losing percentage for: Will Arsenal win against Bournemouth in the Premier League match on 12 May 2024?\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tool | \n",
" losing_percentage | \n",
" num_calls | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" prediction-offline | \n",
" 1.000000 | \n",
" 11.0 | \n",
"
\n",
" \n",
" 1 | \n",
" prediction-online | \n",
" 1.000000 | \n",
" 17.0 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-online-sme | \n",
" 1.000000 | \n",
" 30.0 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-request-rag-claude | \n",
" 1.000000 | \n",
" 45.0 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-request-reasoning | \n",
" 0.874016 | \n",
" 127.0 | \n",
"
\n",
" \n",
" 3 | \n",
" prediction-request-rag | \n",
" 0.250000 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 6 | \n",
" prediction-request-reasoning-claude | \n",
" 0.000000 | \n",
" 2.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tool losing_percentage num_calls\n",
"0 prediction-offline 1.000000 11.0\n",
"1 prediction-online 1.000000 17.0\n",
"2 prediction-online-sme 1.000000 30.0\n",
"4 prediction-request-rag-claude 1.000000 45.0\n",
"5 prediction-request-reasoning 0.874016 127.0\n",
"3 prediction-request-rag 0.250000 4.0\n",
"6 prediction-request-reasoning-claude 0.000000 2.0"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"losing_percentage(winning_trades_percentage_bottom_50.loc[7, 'title'])"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Losing percentage for: Will Qualcomm release its Snapdragon X Plus laptop chip by 1 May 2024?\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tool | \n",
" losing_percentage | \n",
" num_calls | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" claude-prediction-offline | \n",
" 1.000000 | \n",
" 7.0 | \n",
"
\n",
" \n",
" 1 | \n",
" prediction-offline | \n",
" 1.000000 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 3 | \n",
" prediction-online-sme | \n",
" 1.000000 | \n",
" 19.0 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-request-rag-claude | \n",
" 1.000000 | \n",
" 15.0 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-request-rag | \n",
" 0.941176 | \n",
" 17.0 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-online | \n",
" 0.800000 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 7 | \n",
" prediction-request-reasoning-claude | \n",
" 0.666667 | \n",
" 15.0 | \n",
"
\n",
" \n",
" 6 | \n",
" prediction-request-reasoning | \n",
" 0.652174 | \n",
" 23.0 | \n",
"
\n",
" \n",
" 8 | \n",
" prediction-url-cot-claude | \n",
" 0.333333 | \n",
" 3.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tool losing_percentage num_calls\n",
"0 claude-prediction-offline 1.000000 7.0\n",
"1 prediction-offline 1.000000 1.0\n",
"3 prediction-online-sme 1.000000 19.0\n",
"5 prediction-request-rag-claude 1.000000 15.0\n",
"4 prediction-request-rag 0.941176 17.0\n",
"2 prediction-online 0.800000 5.0\n",
"7 prediction-request-reasoning-claude 0.666667 15.0\n",
"6 prediction-request-reasoning 0.652174 23.0\n",
"8 prediction-url-cot-claude 0.333333 3.0"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"losing_percentage(winning_trades_percentage_bottom_50.loc[8, 'title'])"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Losing percentage for: Will Feyenoord's Arne Slot become the new manager of Liverpool by 1 May 2024?\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tool | \n",
" losing_percentage | \n",
" num_calls | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" claude-prediction-offline | \n",
" 1.000000 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1 | \n",
" prediction-offline | \n",
" 1.000000 | \n",
" 2.0 | \n",
"
\n",
" \n",
" 8 | \n",
" prediction-url-cot-claude | \n",
" 1.000000 | \n",
" 2.0 | \n",
"
\n",
" \n",
" 6 | \n",
" prediction-request-reasoning | \n",
" 0.916667 | \n",
" 12.0 | \n",
"
\n",
" \n",
" 7 | \n",
" prediction-request-reasoning-claude | \n",
" 0.900000 | \n",
" 10.0 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-request-rag | \n",
" 0.714286 | \n",
" 14.0 | \n",
"
\n",
" \n",
" 3 | \n",
" prediction-online-sme | \n",
" 0.666667 | \n",
" 9.0 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-online | \n",
" 0.500000 | \n",
" 2.0 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-request-rag-claude | \n",
" 0.454545 | \n",
" 11.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tool losing_percentage num_calls\n",
"0 claude-prediction-offline 1.000000 4.0\n",
"1 prediction-offline 1.000000 2.0\n",
"8 prediction-url-cot-claude 1.000000 2.0\n",
"6 prediction-request-reasoning 0.916667 12.0\n",
"7 prediction-request-reasoning-claude 0.900000 10.0\n",
"4 prediction-request-rag 0.714286 14.0\n",
"3 prediction-online-sme 0.666667 9.0\n",
"2 prediction-online 0.500000 2.0\n",
"5 prediction-request-rag-claude 0.454545 11.0"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"losing_percentage(winning_trades_percentage_bottom_50.loc[9, 'title'])"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Losing percentage for: Will the FCC receive additional funding for replacing Huawei gear by 10 May 2024?\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tool | \n",
" losing_percentage | \n",
" num_calls | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" claude-prediction-offline | \n",
" 1.000000 | \n",
" 6.0 | \n",
"
\n",
" \n",
" 1 | \n",
" claude-prediction-online | \n",
" 1.000000 | \n",
" 3.0 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-offline | \n",
" 1.000000 | \n",
" 36.0 | \n",
"
\n",
" \n",
" 6 | \n",
" prediction-request-rag-claude | \n",
" 1.000000 | \n",
" 50.0 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-online-sme | \n",
" 0.986486 | \n",
" 74.0 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-request-rag | \n",
" 0.947368 | \n",
" 19.0 | \n",
"
\n",
" \n",
" 3 | \n",
" prediction-online | \n",
" 0.910714 | \n",
" 56.0 | \n",
"
\n",
" \n",
" 9 | \n",
" prediction-url-cot-claude | \n",
" 0.777778 | \n",
" 9.0 | \n",
"
\n",
" \n",
" 7 | \n",
" prediction-request-reasoning | \n",
" 0.465753 | \n",
" 73.0 | \n",
"
\n",
" \n",
" 8 | \n",
" prediction-request-reasoning-claude | \n",
" 0.071429 | \n",
" 14.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tool losing_percentage num_calls\n",
"0 claude-prediction-offline 1.000000 6.0\n",
"1 claude-prediction-online 1.000000 3.0\n",
"2 prediction-offline 1.000000 36.0\n",
"6 prediction-request-rag-claude 1.000000 50.0\n",
"4 prediction-online-sme 0.986486 74.0\n",
"5 prediction-request-rag 0.947368 19.0\n",
"3 prediction-online 0.910714 56.0\n",
"9 prediction-url-cot-claude 0.777778 9.0\n",
"7 prediction-request-reasoning 0.465753 73.0\n",
"8 prediction-request-reasoning-claude 0.071429 14.0"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"losing_percentage(winning_trades_percentage_bottom_50.loc[10, 'title'])"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"all_q = winning_trades_percentage_bottom_50['title'].unique().tolist()\n",
"q_losing = tools[tools['prompt_request'].isin(all_q)]\n",
"q_losing = q_losing.groupby(['tool'])['winning_vote'].value_counts().unstack().fillna(0)\n",
"q_losing_perc = q_losing[False] / (q_losing[False] + q_losing[True])\n",
"q_losing_perc = q_losing_perc.reset_index()\n",
"q_losing_perc.columns = ['tool', 'losing_percentage']\n",
"q_losing_perc['num_calls'] = list(q_losing.sum(axis=1).values)\n",
"q_losing_perc = q_losing_perc.sort_values(by='losing_percentage', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tool | \n",
" losing_percentage | \n",
" num_calls | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" prediction-offline-sme | \n",
" 1.000000 | \n",
" 2.0 | \n",
"
\n",
" \n",
" 7 | \n",
" prediction-request-rag-claude | \n",
" 0.913007 | \n",
" 1184.0 | \n",
"
\n",
" \n",
" 2 | \n",
" prediction-offline | \n",
" 0.893281 | \n",
" 1012.0 | \n",
"
\n",
" \n",
" 6 | \n",
" prediction-request-rag | \n",
" 0.889881 | \n",
" 336.0 | \n",
"
\n",
" \n",
" 5 | \n",
" prediction-online-sme | \n",
" 0.857143 | \n",
" 1722.0 | \n",
"
\n",
" \n",
" 4 | \n",
" prediction-online | \n",
" 0.853553 | \n",
" 1154.0 | \n",
"
\n",
" \n",
" 8 | \n",
" prediction-request-reasoning | \n",
" 0.847451 | \n",
" 2727.0 | \n",
"
\n",
" \n",
" 10 | \n",
" prediction-url-cot-claude | \n",
" 0.846154 | \n",
" 130.0 | \n",
"
\n",
" \n",
" 1 | \n",
" claude-prediction-online | \n",
" 0.735849 | \n",
" 53.0 | \n",
"
\n",
" \n",
" 9 | \n",
" prediction-request-reasoning-claude | \n",
" 0.659664 | \n",
" 238.0 | \n",
"
\n",
" \n",
" 0 | \n",
" claude-prediction-offline | \n",
" 0.591549 | \n",
" 142.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tool losing_percentage num_calls\n",
"3 prediction-offline-sme 1.000000 2.0\n",
"7 prediction-request-rag-claude 0.913007 1184.0\n",
"2 prediction-offline 0.893281 1012.0\n",
"6 prediction-request-rag 0.889881 336.0\n",
"5 prediction-online-sme 0.857143 1722.0\n",
"4 prediction-online 0.853553 1154.0\n",
"8 prediction-request-reasoning 0.847451 2727.0\n",
"10 prediction-url-cot-claude 0.846154 130.0\n",
"1 claude-prediction-online 0.735849 53.0\n",
"9 prediction-request-reasoning-claude 0.659664 238.0\n",
"0 claude-prediction-offline 0.591549 142.0"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q_losing_perc"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" confidence | \n",
" 0.00 | \n",
" 0.10 | \n",
" 0.20 | \n",
" 0.30 | \n",
" 0.40 | \n",
" 0.50 | \n",
" 0.55 | \n",
" 0.60 | \n",
" 0.65 | \n",
" 0.70 | \n",
" 0.75 | \n",
" 0.80 | \n",
" 0.85 | \n",
" 0.90 | \n",
" 0.95 | \n",
" 0.99 | \n",
" 1.00 | \n",
"
\n",
" \n",
" tool | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" claude-prediction-offline | \n",
" 0.0 | \n",
" 0.0 | \n",
" 5.0 | \n",
" 46.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 87.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" claude-prediction-online | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 10.0 | \n",
" 7.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 30.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" prediction-offline | \n",
" 0.0 | \n",
" 267.0 | \n",
" 2.0 | \n",
" 13.0 | \n",
" 302.0 | \n",
" 189.0 | \n",
" 0.0 | \n",
" 231.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" prediction-offline-sme | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" prediction-online | \n",
" 0.0 | \n",
" 22.0 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 43.0 | \n",
" 23.0 | \n",
" 8.0 | \n",
" 670.0 | \n",
" 99.0 | \n",
" 2.0 | \n",
" 76.0 | \n",
" 28.0 | \n",
" 55.0 | \n",
" 25.0 | \n",
" 11.0 | \n",
" 0.0 | \n",
" 20.0 | \n",
"
\n",
" \n",
" prediction-online-sme | \n",
" 1.0 | \n",
" 27.0 | \n",
" 10.0 | \n",
" 0.0 | \n",
" 71.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 679.0 | \n",
" 234.0 | \n",
" 39.0 | \n",
" 149.0 | \n",
" 76.0 | \n",
" 109.0 | \n",
" 80.0 | \n",
" 6.0 | \n",
" 0.0 | \n",
" 39.0 | \n",
"
\n",
" \n",
" prediction-request-rag | \n",
" 0.0 | \n",
" 3.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 25.0 | \n",
" 5.0 | \n",
" 48.0 | \n",
" 11.0 | \n",
" 36.0 | \n",
" 57.0 | \n",
" 16.0 | \n",
" 11.0 | \n",
" 1.0 | \n",
" 20.0 | \n",
"
\n",
" \n",
" prediction-request-rag-claude | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 32.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 175.0 | \n",
" 0.0 | \n",
" 513.0 | \n",
" 0.0 | \n",
" 209.0 | \n",
" 3.0 | \n",
" 40.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" prediction-request-reasoning | \n",
" 0.0 | \n",
" 3.0 | \n",
" 103.0 | \n",
" 1.0 | \n",
" 58.0 | \n",
" 97.0 | \n",
" 0.0 | \n",
" 315.0 | \n",
" 176.0 | \n",
" 441.0 | \n",
" 317.0 | \n",
" 339.0 | \n",
" 159.0 | \n",
" 44.0 | \n",
" 58.0 | \n",
" 0.0 | \n",
" 97.0 | \n",
"
\n",
" \n",
" prediction-request-reasoning-claude | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 27.0 | \n",
" 0.0 | \n",
" 38.0 | \n",
" 4.0 | \n",
" 76.0 | \n",
" 0.0 | \n",
" 8.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 2.0 | \n",
"
\n",
" \n",
" prediction-url-cot-claude | \n",
" 0.0 | \n",
" 2.0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 40.0 | \n",
" 0.0 | \n",
" 60.0 | \n",
" 0.0 | \n",
" 22.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"confidence 0.00 0.10 0.20 0.30 0.40 0.50 \\\n",
"tool \n",
"claude-prediction-offline 0.0 0.0 5.0 46.0 4.0 0.0 \n",
"claude-prediction-online 0.0 0.0 2.0 10.0 7.0 3.0 \n",
"prediction-offline 0.0 267.0 2.0 13.0 302.0 189.0 \n",
"prediction-offline-sme 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"prediction-online 0.0 22.0 4.0 5.0 43.0 23.0 \n",
"prediction-online-sme 1.0 27.0 10.0 0.0 71.0 2.0 \n",
"prediction-request-rag 0.0 3.0 2.0 0.0 4.0 4.0 \n",
"prediction-request-rag-claude 0.0 0.0 1.0 32.0 0.0 0.0 \n",
"prediction-request-reasoning 0.0 3.0 103.0 1.0 58.0 97.0 \n",
"prediction-request-reasoning-claude 0.0 0.0 0.0 3.0 4.0 0.0 \n",
"prediction-url-cot-claude 0.0 2.0 1.0 2.0 0.0 0.0 \n",
"\n",
"confidence 0.55 0.60 0.65 0.70 0.75 0.80 \\\n",
"tool \n",
"claude-prediction-offline 0.0 87.0 0.0 0.0 0.0 0.0 \n",
"claude-prediction-online 0.0 30.0 0.0 0.0 0.0 0.0 \n",
"prediction-offline 0.0 231.0 3.0 0.0 0.0 0.0 \n",
"prediction-offline-sme 0.0 0.0 0.0 0.0 2.0 0.0 \n",
"prediction-online 8.0 670.0 99.0 2.0 76.0 28.0 \n",
"prediction-online-sme 0.0 679.0 234.0 39.0 149.0 76.0 \n",
"prediction-request-rag 0.0 25.0 5.0 48.0 11.0 36.0 \n",
"prediction-request-rag-claude 0.0 175.0 0.0 513.0 0.0 209.0 \n",
"prediction-request-reasoning 0.0 315.0 176.0 441.0 317.0 339.0 \n",
"prediction-request-reasoning-claude 0.0 27.0 0.0 38.0 4.0 76.0 \n",
"prediction-url-cot-claude 0.0 40.0 0.0 60.0 0.0 22.0 \n",
"\n",
"confidence 0.85 0.90 0.95 0.99 1.00 \n",
"tool \n",
"claude-prediction-offline 0.0 0.0 0.0 0.0 0.0 \n",
"claude-prediction-online 0.0 1.0 0.0 0.0 0.0 \n",
"prediction-offline 1.0 2.0 0.0 0.0 1.0 \n",
"prediction-offline-sme 0.0 0.0 0.0 0.0 0.0 \n",
"prediction-online 55.0 25.0 11.0 0.0 20.0 \n",
"prediction-online-sme 109.0 80.0 6.0 0.0 39.0 \n",
"prediction-request-rag 57.0 16.0 11.0 1.0 20.0 \n",
"prediction-request-rag-claude 3.0 40.0 3.0 0.0 0.0 \n",
"prediction-request-reasoning 159.0 44.0 58.0 0.0 97.0 \n",
"prediction-request-reasoning-claude 0.0 8.0 1.0 0.0 2.0 \n",
"prediction-url-cot-claude 0.0 3.0 0.0 0.0 0.0 "
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_q = winning_trades_percentage_bottom_50['title'].unique().tolist()\n",
"q_losing = tools[tools['prompt_request'].isin(all_q)]\n",
"q_losing.groupby(['tool'])['confidence'].value_counts().unstack().fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "akash",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}