diff --git a/.gitattributes b/.gitattributes index fbf7a8bb894c19993bd738e3bf1a80c6d6675841..4771d364873fa1788527fc77c5b5c31c6b0f0323 100644 --- a/.gitattributes +++ b/.gitattributes @@ -115,3 +115,43 @@ results-cot/gpt-4v-CoT-Azure.pkl filter=lfs diff=lfs merge=lfs -text results-cot/gpt-4v-CoT-Azure.csv filter=lfs diff=lfs merge=lfs -text results-vision-CoT/gemini-pro-vision-CoT.csv filter=lfs diff=lfs merge=lfs -text results-cot/gpt-3.5-CoT.csv filter=lfs diff=lfs merge=lfs -text +results/claude-3-haiku-20240307.csv filter=lfs diff=lfs merge=lfs -text +results/claude-3-opus-20240229.csv filter=lfs diff=lfs merge=lfs -text +results-cot/gemma-7b-it.csv filter=lfs diff=lfs merge=lfs -text +results-cot/gpt-3.5-turbo-0125.csv filter=lfs diff=lfs merge=lfs -text +results/gpt-3.5-turbo-0125.csv filter=lfs diff=lfs merge=lfs -text +results-cot/Mixtral-8x7B-Instruct-v0.1.csv filter=lfs diff=lfs merge=lfs -text +results/gemma-7b-it.csv filter=lfs diff=lfs merge=lfs -text +results-cot/Qwen1.5-72B-Chat.csv filter=lfs diff=lfs merge=lfs -text +results/CodeLlama-70b-Instruct-hf.csv filter=lfs diff=lfs merge=lfs -text +results/Mixtral-8x7B-Instruct-v0.1.csv filter=lfs diff=lfs merge=lfs -text +results-cot/gemma-7b-it.pkl filter=lfs diff=lfs merge=lfs -text +results/claude-3-haiku-20240307.pkl filter=lfs diff=lfs merge=lfs -text +results/gemma-7b-it.pkl filter=lfs diff=lfs merge=lfs -text +results-cot/gpt-3.5-turbo-0125.pkl filter=lfs diff=lfs merge=lfs -text +results-cot/Mixtral-8x7B-Instruct-v0.1.pkl filter=lfs diff=lfs merge=lfs -text +results/Mixtral-8x7B-Instruct-v0.1.pkl filter=lfs diff=lfs merge=lfs -text +results/claude-3-opus-20240229.pkl filter=lfs diff=lfs merge=lfs -text +results-cot/Qwen1.5-72B-Chat.pkl filter=lfs diff=lfs merge=lfs -text +results/CodeLlama-70b-Instruct-hf.pkl filter=lfs diff=lfs merge=lfs -text +results/gpt-3.5-turbo-0125.pkl filter=lfs diff=lfs merge=lfs -text +results/claude-3-haiku-20240307.jpg filter=lfs diff=lfs merge=lfs -text 
+results/claude-3-opus-20240229.jpg filter=lfs diff=lfs merge=lfs -text +results/gpt-3.5-turbo-0125.jpg filter=lfs diff=lfs merge=lfs -text +results-cot/gpt-3.5-turbo-0125.jpg filter=lfs diff=lfs merge=lfs -text +results/Mixtral-8x7B-Instruct-v0.1.jpg filter=lfs diff=lfs merge=lfs -text +results-cot/Qwen1.5-72B-Chat.jpg filter=lfs diff=lfs merge=lfs -text +results/gemma-7b-it.jpg filter=lfs diff=lfs merge=lfs -text +results-cot/Mixtral-8x7B-Instruct-v0.1.jpg filter=lfs diff=lfs merge=lfs -text +results-cot/gemma-7b-it.jpg filter=lfs diff=lfs merge=lfs -text +results/CodeLlama-70b-Instruct-hf.jpg filter=lfs diff=lfs merge=lfs -text +results-cot/gemma-7b-it.png filter=lfs diff=lfs merge=lfs -text +results-cot/gpt-3.5-turbo-0125.png filter=lfs diff=lfs merge=lfs -text +results/gpt-3.5-turbo-0125.png filter=lfs diff=lfs merge=lfs -text +results/CodeLlama-70b-Instruct-hf.png filter=lfs diff=lfs merge=lfs -text +results/Mixtral-8x7B-Instruct-v0.1.png filter=lfs diff=lfs merge=lfs -text +results/claude-3-opus-20240229.png filter=lfs diff=lfs merge=lfs -text +results-cot/Mixtral-8x7B-Instruct-v0.1.png filter=lfs diff=lfs merge=lfs -text +results-cot/Qwen1.5-72B-Chat.png filter=lfs diff=lfs merge=lfs -text +results/claude-3-haiku-20240307.png filter=lfs diff=lfs merge=lfs -text +results/gemma-7b-it.png filter=lfs diff=lfs merge=lfs -text diff --git a/app.py b/app.py index 9adab9c009f25af9ae1031bd1890cf1ca42b8ee0..1f3b7da5e7157c36949cb83402f47f414d1f5609 100644 --- a/app.py +++ b/app.py @@ -10,7 +10,7 @@ vision_results = glob("results-vision/*.pkl") # Load CoT text benchmark results cot_text_results = glob("results-cot/*.pkl") # Load CoT vision benchmark results -cot_vision_results = glob("results-vision-CoT/*.pkl") +# cot_vision_results = glob("results-vision-CoT/*.pkl") # Function to load data, add model type and name def load_data(files, model_type): @@ -27,18 +27,22 @@ def load_data(files, model_type): data = load_data(csv_results, "Text Only") vision_data = 
load_data(vision_results, "Vision") cot_text_data = load_data(cot_text_results, "CoT Text Only") -cot_vision_data = load_data(cot_vision_results, "CoT Vision") +# cot_vision_data = load_data(cot_vision_results, "CoT Vision") # Combine all data into a single DataFrame all_data = pd.concat( - [data, vision_data, cot_text_data, cot_vision_data], ignore_index=True + [data, vision_data, cot_text_data], ignore_index=True ) all_model_names = all_data["Model Name"].unique() all_text_only_model_names = list( all_data[all_data["Model Type"] == "Text Only"]["Model Name"].unique() ) -print(all_text_only_model_names) +all_cot_text_only_models = list( + all_data[all_data["Model Type"] == "CoT Text Only"]["Model Name"].unique() +) + + ## Continue with the cold code -- # TODO: Update me to read from all_data for later @@ -50,7 +54,7 @@ vision_data = {file: pd.read_pickle(file) for file in vision_results} # Load the CoT text files into a dict cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results} # Load the CoT vision files into a dict -cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results} +# cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results} def calculate_accuracy(df): @@ -96,13 +100,13 @@ def process_data(data): text_data_for_df = process_data(data) vision_data_for_df = process_data(vision_data) cot_text_data_for_df = process_data(cot_text_data) -cot_vision_data_for_df = process_data(cot_vision_data) +# cot_vision_data_for_df = process_data(cot_vision_data) # Create DataFrames accuracy_df = pd.DataFrame(text_data_for_df, columns=column_names) vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names) cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names) -cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names) +# cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names) # Function to finalize DataFrame def 
finalize_df(df): @@ -117,7 +121,7 @@ def finalize_df(df): accuracy_df = finalize_df(accuracy_df) vision_accuracy_df = finalize_df(vision_accuracy_df) cot_text_accuracy_df = finalize_df(cot_text_accuracy_df) -cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df) +# cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df) def load_heatmap(evt: gr.SelectData): @@ -176,6 +180,43 @@ def calculate_order_by_first_substring(selected_models): return text_only_filtered, number_of_queries, number_of_fsms + +def calculate_order_by_first_substring_cot(selected_models): + + first_columns = all_data[all_data["substring_index"] == 1] + query_ids_df = first_columns[first_columns["Model Type"] == "CoT Text Only"] + query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)] + + query_ids_df = query_ids_df.groupby("query_id").filter( + lambda x: x["parsed_judge_response"].eq(1).all() + ) + + fsm_ids = query_ids_df.fsm_id.unique() + + text_only = all_data[all_data["Model Type"] == "CoT Text Only"] + text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)] + + query_ids = text_only_filtered.query_id.unique() + text_only_filtered = ( + text_only_filtered.groupby(["Model Name"])["parsed_judge_response"] + .mean() + .reset_index() + ) + + text_only_filtered["Accuracy"] = text_only_filtered["parsed_judge_response"] * 100 + text_only_filtered.drop("parsed_judge_response", axis=1, inplace=True) + + text_only_filtered["Accuracy"] = text_only_filtered["Accuracy"].apply( + lambda x: round(x, 2) + ) + text_only_filtered.sort_values("Accuracy", ascending=False, inplace=True) + + number_of_queries = len(query_ids) + number_of_fsms = len(fsm_ids) + + return text_only_filtered, number_of_queries, number_of_fsms + + with gr.Blocks() as demo: gr.Markdown("# FSM Benchmark Leaderboard") with gr.Tab("Text-only Benchmark"): @@ -196,8 +237,8 @@ with gr.Blocks() as demo: fn=load_vision_heatmap, outputs=[heatmap_image_vision] ) - with gr.Tab("CoT Text-only Benchmark"): - 
gr.Markdown("# CoT Text-only Leaderboard") + with gr.Tab("Text-only Benchmark (CoT)"): + gr.Markdown("# Text-only Leaderboard (CoT)") cot_leader_board_text = gr.Dataframe( cot_text_accuracy_df, headers=headers_with_icons ) @@ -207,16 +248,16 @@ with gr.Blocks() as demo: fn=load_cot_heatmap, outputs=[cot_heatmap_image_text] ) - with gr.Tab("CoT Vision Benchmark"): - gr.Markdown("# CoT Vision Benchmark Leaderboard") - cot_leader_board_vision = gr.Dataframe( - cot_vision_accuracy_df, headers=headers_with_icons - ) - gr.Markdown("## Heatmap") - cot_heatmap_image_vision = gr.Image(label="", show_label=False) - cot_leader_board_vision.select( - fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision] - ) + # with gr.Tab("Vision Benchmark (CoT)"): + # gr.Markdown("# Vision Benchmark Leaderboard (CoT)") + # cot_leader_board_vision = gr.Dataframe( + # cot_vision_accuracy_df, headers=headers_with_icons + # ) + # gr.Markdown("## Heatmap") + # cot_heatmap_image_vision = gr.Image(label="", show_label=False) + # cot_leader_board_vision.select( + # fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision] + # ) with gr.Tab("Constraint Text-only Results"): gr.Markdown("## Constraint Text-only Leaderboard by first substring") @@ -240,4 +281,26 @@ with gr.Blocks() as demo: queue=True, ) + + with gr.Tab("Constraint Text-only Results (CoT)"): + gr.Markdown("## Constraint Text-only Leaderboard by first substring (CoT)") + included_models_cot = gr.CheckboxGroup( + label="Models to include", + choices=all_cot_text_only_models, + value=all_cot_text_only_models, + interactive=True, + ) + with gr.Row(): + number_of_queries_cot = gr.Textbox(label="Number of included queries") + number_of_fsms_cot = gr.Textbox(label="Number of included FSMs") + + constrained_leader_board_text_cot = gr.Dataframe() + + included_models_cot.select( + fn=calculate_order_by_first_substring_cot, + inputs=[included_models_cot], + outputs=[constrained_leader_board_text_cot, number_of_queries_cot, 
number_of_fsms_cot], + queue=True, + ) + demo.launch() diff --git a/results-cot/Mixtral-8x7B-Instruct-v0.1.csv b/results-cot/Mixtral-8x7B-Instruct-v0.1.csv new file mode 100644 index 0000000000000000000000000000000000000000..be977117c18c4111c69538376dc0cef355ae12e3 --- /dev/null +++ b/results-cot/Mixtral-8x7B-Instruct-v0.1.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:093e919d90609c3be8d6818cf56ca018214da3a42b78aeaf85f92581b72c5ad4 +size 19494123 diff --git a/results-cot/Mixtral-8x7B-Instruct-v0.1.jpg b/results-cot/Mixtral-8x7B-Instruct-v0.1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e7272ca29856b38bc954b055f34688b373fac25a --- /dev/null +++ b/results-cot/Mixtral-8x7B-Instruct-v0.1.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c747a78a4b70330c97682209acda5e13d61d0ae3b9a372d4d01269163b7842f +size 1325857 diff --git a/results-cot/Mixtral-8x7B-Instruct-v0.1.pkl b/results-cot/Mixtral-8x7B-Instruct-v0.1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7b06a4c6eab4e1a7c5a577de535213618125121a --- /dev/null +++ b/results-cot/Mixtral-8x7B-Instruct-v0.1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:686692584c6ba027c454d699bbf585b95e5c99bfc426810ea74b327a975b9cf3 +size 19489822 diff --git a/results-cot/Mixtral-8x7B-Instruct-v0.1.png b/results-cot/Mixtral-8x7B-Instruct-v0.1.png new file mode 100644 index 0000000000000000000000000000000000000000..98e12193ed03f9a2a0681599f7aeeb5f30acd8fb --- /dev/null +++ b/results-cot/Mixtral-8x7B-Instruct-v0.1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01fafa25ac093e91e57f234b61c449e12a2f6610208d80ca7b1405b8831d0784 +size 1015852 diff --git a/results-cot/Qwen1.5-72B-Chat.csv b/results-cot/Qwen1.5-72B-Chat.csv new file mode 100644 index 0000000000000000000000000000000000000000..622753313053bafa231036e9de8d5802aad52b34 --- /dev/null +++ 
b/results-cot/Qwen1.5-72B-Chat.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32681449776facf1084405001e69ed7926b79c69f9717fb159e3eb064b333636 +size 15795431 diff --git a/results-cot/Qwen1.5-72B-Chat.jpg b/results-cot/Qwen1.5-72B-Chat.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6641c86256dda3df6db405e9e290363be15cb623 --- /dev/null +++ b/results-cot/Qwen1.5-72B-Chat.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:882652b7fd7ca1c03fbcd5c031f024933405b92b978514042feabe775c6e8789 +size 1314105 diff --git a/results-cot/Qwen1.5-72B-Chat.pkl b/results-cot/Qwen1.5-72B-Chat.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8ae84826b39afcd6390b4ebcf795ab0c1673e406 --- /dev/null +++ b/results-cot/Qwen1.5-72B-Chat.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c20383298d4b6482ca7c30bf91822e24099dc67b71a3be10271005e25208c40 +size 15778970 diff --git a/results-cot/Qwen1.5-72B-Chat.png b/results-cot/Qwen1.5-72B-Chat.png new file mode 100644 index 0000000000000000000000000000000000000000..c7f585620ae35171d2df8f9417b4a7eed58deb3d --- /dev/null +++ b/results-cot/Qwen1.5-72B-Chat.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e6b7014c29a2184e88d63f3f2b9c2373531174b24e59b76442cc90d9d4b93a7 +size 1014011 diff --git a/results-cot/gemma-7b-it.csv b/results-cot/gemma-7b-it.csv new file mode 100644 index 0000000000000000000000000000000000000000..0d06a59ba90adf63b6d6c0c0383d30cff9f04455 --- /dev/null +++ b/results-cot/gemma-7b-it.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8535fa3f2ef5a94b1b552859930e0476ca0f3c77ec4c277893a9ab9ef45d6c3 +size 16793758 diff --git a/results-cot/gemma-7b-it.jpg b/results-cot/gemma-7b-it.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d6e3586342946c5ba9ec74c251a68756505020d9 --- /dev/null +++ b/results-cot/gemma-7b-it.jpg @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:28be12e5ad08179e972700c578cc8089b946407e17effa2e25fb2d5129894918 +size 1339444 diff --git a/results-cot/gemma-7b-it.pkl b/results-cot/gemma-7b-it.pkl new file mode 100644 index 0000000000000000000000000000000000000000..804dbd36e2521da4f0f239bb16ca5c3cd8422f38 --- /dev/null +++ b/results-cot/gemma-7b-it.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c581027f8b78df5934117276cec3e53613f5ac953d045f71af4121b3ec2e1a4 +size 16822239 diff --git a/results-cot/gemma-7b-it.png b/results-cot/gemma-7b-it.png new file mode 100644 index 0000000000000000000000000000000000000000..0441c1b599e5ed7ab28a0b07028edbc0f5e8ac60 --- /dev/null +++ b/results-cot/gemma-7b-it.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d10e044726def8fdebc8bd89b6cda148c315fd8d808dd7f168d4c5dbf92c2f2 +size 1010299 diff --git a/results-cot/gpt-3.5-CoT.csv b/results-cot/gpt-3.5-turbo-0125.csv similarity index 100% rename from results-cot/gpt-3.5-CoT.csv rename to results-cot/gpt-3.5-turbo-0125.csv diff --git a/results-cot/gpt-3.5-CoT.jpg b/results-cot/gpt-3.5-turbo-0125.jpg similarity index 100% rename from results-cot/gpt-3.5-CoT.jpg rename to results-cot/gpt-3.5-turbo-0125.jpg diff --git a/results-cot/gpt-3.5-CoT.pkl b/results-cot/gpt-3.5-turbo-0125.pkl similarity index 100% rename from results-cot/gpt-3.5-CoT.pkl rename to results-cot/gpt-3.5-turbo-0125.pkl diff --git a/results-cot/gpt-3.5-CoT.png b/results-cot/gpt-3.5-turbo-0125.png similarity index 100% rename from results-cot/gpt-3.5-CoT.png rename to results-cot/gpt-3.5-turbo-0125.png diff --git a/results-cot/gpt-4v-CoT-Azure.csv b/results-cot/gpt-4v-CoT-Azure.csv deleted file mode 100644 index 9a8b0fea4bd4ae783ed5a681bdf91609e47355df..0000000000000000000000000000000000000000 --- a/results-cot/gpt-4v-CoT-Azure.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:04b4de1a7a4280354c89609d15282109ee60f8f58129960dc0edbb046b12a5c6 -size 6374181 diff --git a/results-cot/gpt-4v-CoT-Azure.jpg b/results-cot/gpt-4v-CoT-Azure.jpg deleted file mode 100644 index 2836530914cc2df9cb1f4a0979495bc212027255..0000000000000000000000000000000000000000 --- a/results-cot/gpt-4v-CoT-Azure.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6d63da74c747dc220638351069b927925aaa34e580e2c00e70dd29e0d2cefebb -size 1329490 diff --git a/results-cot/gpt-4v-CoT-Azure.pkl b/results-cot/gpt-4v-CoT-Azure.pkl deleted file mode 100644 index 29fd930ee291f4c8b6ccee0d9dc3cc5a59534971..0000000000000000000000000000000000000000 --- a/results-cot/gpt-4v-CoT-Azure.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:52ae5e417e011db84976acd51a024eae7ccea1e686b7f3f0e8158cd77be4f847 -size 6320889 diff --git a/results-cot/gpt-4v-CoT-Azure.png b/results-cot/gpt-4v-CoT-Azure.png deleted file mode 100644 index 73be9068480ff1b020384f2918d21cb385cf96aa..0000000000000000000000000000000000000000 --- a/results-cot/gpt-4v-CoT-Azure.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b8a96d76a726ab67813368f0a630576aee5cda6b5264c2edc65af93932fe4a32 -size 1013851 diff --git a/results-vision-CoT/gemini-pro-vision-CoT.csv b/results-vision-CoT/gemini-pro-vision-CoT.csv deleted file mode 100644 index e0f723d6850a94e4fe12ca4ba10b4337b856c234..0000000000000000000000000000000000000000 --- a/results-vision-CoT/gemini-pro-vision-CoT.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1ebebe1d6caee19a4f714bf13eaba72e7a0b5d15281c407cd4dc53a2820ad312 -size 6184119 diff --git a/results-vision-CoT/gemini-pro-vision-CoT.jpg b/results-vision-CoT/gemini-pro-vision-CoT.jpg deleted file mode 100644 index d4c0ff8c3ca0a6a089d64f6ccd104b0c7595b81a..0000000000000000000000000000000000000000 --- a/results-vision-CoT/gemini-pro-vision-CoT.jpg +++ 
/dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fed7a1736c7550edca80305d90c975e36da47331bc67f824c23b6bb5525289b4 -size 1333651 diff --git a/results-vision-CoT/gemini-pro-vision-CoT.pkl b/results-vision-CoT/gemini-pro-vision-CoT.pkl deleted file mode 100644 index 11d2cf5e319c9d9abf69866e84282ad7573ceb13..0000000000000000000000000000000000000000 --- a/results-vision-CoT/gemini-pro-vision-CoT.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:049d575dbad9da04496fea752e19f915bcec445b13f3010f9c67544012c936ff -size 6144275 diff --git a/results-vision-CoT/gemini-pro-vision-CoT.png b/results-vision-CoT/gemini-pro-vision-CoT.png deleted file mode 100644 index ccf38728dea2ac26242413c6cbc138b9699627ed..0000000000000000000000000000000000000000 --- a/results-vision-CoT/gemini-pro-vision-CoT.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:49ab8af8d2e3d2fb671b375a830808eb92a84e0faef35d2844f8eed62bd6acf5 -size 1013282 diff --git a/results-vision/gemini-pro-vision-CoT.csv b/results-vision/gemini-pro-vision-CoT.csv deleted file mode 100644 index e0f723d6850a94e4fe12ca4ba10b4337b856c234..0000000000000000000000000000000000000000 --- a/results-vision/gemini-pro-vision-CoT.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1ebebe1d6caee19a4f714bf13eaba72e7a0b5d15281c407cd4dc53a2820ad312 -size 6184119 diff --git a/results-vision/gemini-pro-vision-CoT.jpg b/results-vision/gemini-pro-vision-CoT.jpg deleted file mode 100644 index d4c0ff8c3ca0a6a089d64f6ccd104b0c7595b81a..0000000000000000000000000000000000000000 --- a/results-vision/gemini-pro-vision-CoT.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fed7a1736c7550edca80305d90c975e36da47331bc67f824c23b6bb5525289b4 -size 1333651 diff --git a/results-vision/gemini-pro-vision-CoT.pkl b/results-vision/gemini-pro-vision-CoT.pkl deleted file mode 100644 
index 11d2cf5e319c9d9abf69866e84282ad7573ceb13..0000000000000000000000000000000000000000 --- a/results-vision/gemini-pro-vision-CoT.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:049d575dbad9da04496fea752e19f915bcec445b13f3010f9c67544012c936ff -size 6144275 diff --git a/results-vision/gemini-pro-vision-CoT.png b/results-vision/gemini-pro-vision-CoT.png deleted file mode 100644 index ccf38728dea2ac26242413c6cbc138b9699627ed..0000000000000000000000000000000000000000 --- a/results-vision/gemini-pro-vision-CoT.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:49ab8af8d2e3d2fb671b375a830808eb92a84e0faef35d2844f8eed62bd6acf5 -size 1013282 diff --git a/results-vision/gpt-4v-CoT.csv b/results-vision/gpt-4v-CoT.csv deleted file mode 100644 index 9a8b0fea4bd4ae783ed5a681bdf91609e47355df..0000000000000000000000000000000000000000 --- a/results-vision/gpt-4v-CoT.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:04b4de1a7a4280354c89609d15282109ee60f8f58129960dc0edbb046b12a5c6 -size 6374181 diff --git a/results-vision/gpt-4v-CoT.jpg b/results-vision/gpt-4v-CoT.jpg deleted file mode 100644 index 2836530914cc2df9cb1f4a0979495bc212027255..0000000000000000000000000000000000000000 --- a/results-vision/gpt-4v-CoT.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6d63da74c747dc220638351069b927925aaa34e580e2c00e70dd29e0d2cefebb -size 1329490 diff --git a/results-vision/gpt-4v-CoT.pkl b/results-vision/gpt-4v-CoT.pkl deleted file mode 100644 index 29fd930ee291f4c8b6ccee0d9dc3cc5a59534971..0000000000000000000000000000000000000000 --- a/results-vision/gpt-4v-CoT.pkl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:52ae5e417e011db84976acd51a024eae7ccea1e686b7f3f0e8158cd77be4f847 -size 6320889 diff --git a/results-vision/gpt-4v-CoT.png b/results-vision/gpt-4v-CoT.png deleted file mode 100644 index 
73be9068480ff1b020384f2918d21cb385cf96aa..0000000000000000000000000000000000000000 --- a/results-vision/gpt-4v-CoT.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b8a96d76a726ab67813368f0a630576aee5cda6b5264c2edc65af93932fe4a32 -size 1013851 diff --git a/results/CodeLlama-70b-Instruct-hf.csv b/results/CodeLlama-70b-Instruct-hf.csv new file mode 100644 index 0000000000000000000000000000000000000000..210a8fb0395397f78578e149d36a8b1791bc62e5 --- /dev/null +++ b/results/CodeLlama-70b-Instruct-hf.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3726905a1656174f3c29edfced6f2eec63222f6be8965c0d970264901d8cfc75 +size 16476347 diff --git a/results/CodeLlama-70B.jpg b/results/CodeLlama-70b-Instruct-hf.jpg similarity index 100% rename from results/CodeLlama-70B.jpg rename to results/CodeLlama-70b-Instruct-hf.jpg diff --git a/results/CodeLlama-70B.pkl b/results/CodeLlama-70b-Instruct-hf.pkl similarity index 100% rename from results/CodeLlama-70B.pkl rename to results/CodeLlama-70b-Instruct-hf.pkl diff --git a/results/CodeLlama-70B.png b/results/CodeLlama-70b-Instruct-hf.png similarity index 100% rename from results/CodeLlama-70B.png rename to results/CodeLlama-70b-Instruct-hf.png diff --git a/results/Llama-2-70b-chat-hf.csv b/results/Llama-2-70b-chat-hf.csv new file mode 100644 index 0000000000000000000000000000000000000000..2b3ec21ec3beeba95bc754ac42b8c2d82f94b134 --- /dev/null +++ b/results/Llama-2-70b-chat-hf.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42a31de917b05ed5405474a348d072426474a8fb2ce7ff462dbb121e25f4b6ad +size 20760268 diff --git a/results/Mistral-7B-Instruct-v0.2.csv b/results/Mistral-7B-Instruct-v0.2.csv new file mode 100644 index 0000000000000000000000000000000000000000..4d0b979c5831639b55d9cde7ba065517b4a9a453 --- /dev/null +++ b/results/Mistral-7B-Instruct-v0.2.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:29ad4985661fc41e659a631fc74ba433cd08a571048f11436ccf87ff74f0db09 +size 27242025 diff --git a/results/Mixtral-8x7B-Instruct-v0.1.csv b/results/Mixtral-8x7B-Instruct-v0.1.csv new file mode 100644 index 0000000000000000000000000000000000000000..93b75d40f382ee7eea9fcae943041cdae92a90dd --- /dev/null +++ b/results/Mixtral-8x7B-Instruct-v0.1.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a93e2b963a5ac8129b5284f3fd7987964ef96fa0e64194de704a3549c611de1f +size 17978176 diff --git a/results/Mixtral-8x7B-Instruct-0.1.jpg b/results/Mixtral-8x7B-Instruct-v0.1.jpg similarity index 100% rename from results/Mixtral-8x7B-Instruct-0.1.jpg rename to results/Mixtral-8x7B-Instruct-v0.1.jpg diff --git a/results/Mixtral-8x7B-Instruct-0.1.pkl b/results/Mixtral-8x7B-Instruct-v0.1.pkl similarity index 100% rename from results/Mixtral-8x7B-Instruct-0.1.pkl rename to results/Mixtral-8x7B-Instruct-v0.1.pkl diff --git a/results/Mixtral-8x7B-Instruct-0.1.png b/results/Mixtral-8x7B-Instruct-v0.1.png similarity index 100% rename from results/Mixtral-8x7B-Instruct-0.1.png rename to results/Mixtral-8x7B-Instruct-v0.1.png diff --git a/results/Qwen1.5-72B-Chat.csv b/results/Qwen1.5-72B-Chat.csv new file mode 100644 index 0000000000000000000000000000000000000000..4080e2f019798be13744122344a14ed2e911ae27 --- /dev/null +++ b/results/Qwen1.5-72B-Chat.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ba395c0b55330f689827527831e57e50ae9d824b6635b2bb569713afcf26d4b +size 14219193 diff --git a/results/StripedHyena-Nous-7B.csv b/results/StripedHyena-Nous-7B.csv new file mode 100644 index 0000000000000000000000000000000000000000..2932e2894a2b6318a7b8a7349211f6206649738c --- /dev/null +++ b/results/StripedHyena-Nous-7B.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f662367ea0d33a368aaa7a72cfeed41d2f3dc05be6289a6fe485a028c7cb98d5 +size 29219512 diff --git a/results/Yi-34B-Chat.csv b/results/Yi-34B-Chat.csv new file mode 
100644 index 0000000000000000000000000000000000000000..55bda75d9cdd0e2383d293c564afedcaa9165551 --- /dev/null +++ b/results/Yi-34B-Chat.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7f09fb5f46ca144490bcb42ec89dd27f169680493501c211bf2bcfcd908da1c +size 20485423 diff --git a/results/claude-3-haiku-20240307.csv b/results/claude-3-haiku-20240307.csv new file mode 100644 index 0000000000000000000000000000000000000000..85a0bc280a47703e8b02b036e0b2154e4c4137af --- /dev/null +++ b/results/claude-3-haiku-20240307.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45623535997485afdee5b0312f2b5fdcc26cf531fbb56b6c3af6e126dfbe7b0f +size 19570166 diff --git a/results/Claude-3-Haiku.jpg b/results/claude-3-haiku-20240307.jpg similarity index 100% rename from results/Claude-3-Haiku.jpg rename to results/claude-3-haiku-20240307.jpg diff --git a/results/Claude-3-Haiku.pkl b/results/claude-3-haiku-20240307.pkl similarity index 100% rename from results/Claude-3-Haiku.pkl rename to results/claude-3-haiku-20240307.pkl diff --git a/results/Claude-3-Haiku.png b/results/claude-3-haiku-20240307.png similarity index 100% rename from results/Claude-3-Haiku.png rename to results/claude-3-haiku-20240307.png diff --git a/results/claude-3-opus-20240229.csv b/results/claude-3-opus-20240229.csv new file mode 100644 index 0000000000000000000000000000000000000000..791590d1fd63e822adb63932f2ae702572713ad8 --- /dev/null +++ b/results/claude-3-opus-20240229.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d902999bcee4798b81644b2ff0ea78280dd46bc310909154c1ef089adf82789 +size 20131397 diff --git a/results/Claude-3-Opus.jpg b/results/claude-3-opus-20240229.jpg similarity index 100% rename from results/Claude-3-Opus.jpg rename to results/claude-3-opus-20240229.jpg diff --git a/results/Claude-3-Opus.pkl b/results/claude-3-opus-20240229.pkl similarity index 100% rename from results/Claude-3-Opus.pkl rename to 
results/claude-3-opus-20240229.pkl diff --git a/results/Claude-3-Opus.png b/results/claude-3-opus-20240229.png similarity index 100% rename from results/Claude-3-Opus.png rename to results/claude-3-opus-20240229.png diff --git a/results/gemma-7b-it.csv b/results/gemma-7b-it.csv new file mode 100644 index 0000000000000000000000000000000000000000..353ab2348cce4a6f1904e336cd62a50b2003b833 --- /dev/null +++ b/results/gemma-7b-it.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bdc088d6c7eb18257ac35c1d2b2ee9f9849a69950016f6e9a0bf04be48a5ae2 +size 12624700 diff --git a/results/Gemma-7B.jpg b/results/gemma-7b-it.jpg similarity index 100% rename from results/Gemma-7B.jpg rename to results/gemma-7b-it.jpg diff --git a/results/Gemma-7B.pkl b/results/gemma-7b-it.pkl similarity index 100% rename from results/Gemma-7B.pkl rename to results/gemma-7b-it.pkl diff --git a/results/Gemma-7B.png b/results/gemma-7b-it.png similarity index 100% rename from results/Gemma-7B.png rename to results/gemma-7b-it.png diff --git a/results/gpt-3.5-turbo-0125.csv b/results/gpt-3.5-turbo-0125.csv new file mode 100644 index 0000000000000000000000000000000000000000..5f50409538b0260b33c3f8c9c6f343975331a1e5 --- /dev/null +++ b/results/gpt-3.5-turbo-0125.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f700f8e2914178a69513e96176c69e67acd51bb14ac12ab174d5e2df60f56179 +size 9472558 diff --git a/results/GPT-3.5-Turbo.jpg b/results/gpt-3.5-turbo-0125.jpg similarity index 100% rename from results/GPT-3.5-Turbo.jpg rename to results/gpt-3.5-turbo-0125.jpg diff --git a/results/GPT-3.5-Turbo.pkl b/results/gpt-3.5-turbo-0125.pkl similarity index 100% rename from results/GPT-3.5-Turbo.pkl rename to results/gpt-3.5-turbo-0125.pkl diff --git a/results/GPT-3.5-Turbo.png b/results/gpt-3.5-turbo-0125.png similarity index 100% rename from results/GPT-3.5-Turbo.png rename to results/gpt-3.5-turbo-0125.png diff --git a/results/gpt-4-0125-preview.csv 
b/results/gpt-4-0125-preview.csv new file mode 100644 index 0000000000000000000000000000000000000000..f63d7a302f9225c558d1239538fce78d6a8abd8e --- /dev/null +++ b/results/gpt-4-0125-preview.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d4cbbacdff8172888d8d5e8917680f524d7cd73dcbcc7aa8d0e54c0246a752c +size 18088521