TzurVaich committed on
Commit
e6036f2
·
1 Parent(s): e0f9aaf

Get GAIA validation set

Browse files
Files changed (5) hide show
  1. GAIA.py +96 -0
  2. GAIA_level1.json +267 -0
  3. download_gaia.py +44 -0
  4. poetry.lock +0 -0
  5. pyproject.toml +2 -1
GAIA.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """GAIA 2023 dataset."""
2
+
3
+
4
+ import json
5
+ import os
6
+
7
+ import datasets
8
+
9
+
10
# Dataset-card metadata handed to datasets.DatasetInfo in _info(); left blank here.
_CITATION = """ """

_DESCRIPTION = """ """

_HOMEPAGE = ""

_LICENSE = ""

# Builder configuration names, shaped "<year>_all" or "<year>_level<N>".
_NAMES = [
    "2023_all",
    "2023_level1",
    "2023_level2",
    "2023_level3",
]

# Levels selected by the "<year>_all" configuration, keyed by year.
YEAR_TO_LEVELS = {"2023": [1, 2, 3]}

# Delimiter between the year and the level part of a configuration name.
separator = "_"
28
+
29
+
30
class GAIA_dataset(datasets.GeneratorBasedBuilder):
    """Dataset builder for the GAIA 2023 benchmark.

    One configuration exists per entry of ``_NAMES``: ``2023_all`` selects
    every level listed in ``YEAR_TO_LEVELS`` and ``2023_level<N>`` selects the
    single level ``N``. Both the ``test`` and ``validation`` splits are built
    from ``<year>/<split>/metadata.jsonl`` plus the files it references.
    """

    VERSION = datasets.Version("0.0.1")

    # VERSION is threaded through zip() on purpose: only the outermost iterable
    # of a comprehension is evaluated in class scope, so referring to VERSION
    # from the element expression would raise NameError.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name=name, version=version, description=name)
        for name, version in zip(_NAMES, [VERSION] * len(_NAMES))
    ]

    def _info(self):
        """Return the ``DatasetInfo`` describing the columns of every example."""
        annotator_keys = [
            "Steps",
            "Number of steps",
            "How long did this take?",
            "Tools",
            "Number of tools",
        ]
        features = datasets.Features(
            {
                "task_id": datasets.Value("string"),
                "Question": datasets.Value("string"),
                "Level": datasets.Value("string"),
                "Final answer": datasets.Value("string"),  # "?" for test rows, per the original note
                "file_name": datasets.Value("string"),
                "file_path": datasets.Value("string"),  # filled in by _generate_examples
                "Annotator Metadata": {k: datasets.Value("string") for k in annotator_keys},
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download metadata and attachments and describe the two splits.

        The configuration name is split on ``separator`` into a year and a
        level part. For each split the metadata file is downloaded, then every
        attachment referenced by a row of a selected level is downloaded too.

        Returns:
            A list with one ``datasets.SplitGenerator`` per split, whose
            ``gen_kwargs`` carry ``root_file``, ``attached_files`` and
            ``levels`` for ``_generate_examples``.
        """
        year, level_name = self.config.name.split(separator)
        if level_name == "all":
            levels = YEAR_TO_LEVELS[year]
        else:
            # "level2" -> [2]
            levels = [int(level_name.split("level")[1])]

        output = []
        for split in ["test", "validation"]:
            root_file = dl_manager.download(os.path.join(year, split, "metadata.jsonl"))
            # The "" entry lets rows without an attachment resolve to an empty file_path.
            attached_files = {"": ""}
            with open(root_file, "r", encoding="utf-8") as f:
                for line in f:
                    cur_line = json.loads(line)
                    if cur_line["Level"] in levels and cur_line["file_name"] != "":
                        attached_file_name = cur_line["file_name"]
                        attached_files[attached_file_name] = dl_manager.download(
                            os.path.join(year, split, attached_file_name)
                        )

            output.append(
                datasets.SplitGenerator(
                    name=getattr(datasets.Split, split.upper()),
                    gen_kwargs={"root_file": root_file, "attached_files": attached_files, "levels": levels},
                )
            )
        return output

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, root_file: str, attached_files: dict, levels: list[int]):
        """Yield ``(key, example)`` pairs for the rows of the selected levels.

        Args:
            root_file: Local path of the downloaded ``metadata.jsonl``.
            attached_files: Maps each ``file_name`` to its downloaded path;
                the empty name maps to the empty string.
            levels: Levels to keep; rows of any other level are skipped.

        Yields:
            The zero-based line index of the row in ``root_file`` and the
            parsed row with an added ``file_path`` entry.
        """
        with open(root_file, "r", encoding="utf-8") as f:
            for key, line in enumerate(f):
                cur_line = json.loads(line)
                if cur_line["Level"] in levels:
                    cur_line["file_path"] = attached_files[cur_line["file_name"]]
                    yield key, cur_line
95
+
96
+
GAIA_level1.json ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "1": {
3
+ "Q": "If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.",
4
+ "A": "17",
5
+ "status": false
6
+ },
7
+ "2": {
8
+ "Q": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
9
+ "A": "3",
10
+ "status": false
11
+ },
12
+ "3": {
13
+ "Q": "Here's a fun riddle that I think you'll enjoy.\n\nYou have been selected to play the final round of the hit new game show \"Pick That Ping-Pong\". In this round, you will be competing for a large cash prize. Your job will be to pick one of several different numbered ping-pong balls, and then the game will commence. The host describes how the game works.\n\nA device consisting of a winding clear ramp and a series of pistons controls the outcome of the game. The ramp feeds balls onto a platform. The platform has room for three ping-pong balls at a time. The three balls on the platform are each aligned with one of three pistons. At each stage of the game, one of the three pistons will randomly fire, ejecting the ball it strikes. If the piston ejects the ball in the first position on the platform the balls in the second and third position on the platform each advance one space, and the next ball on the ramp advances to the third position. If the piston ejects the ball in the second position, the ball in the first position is released and rolls away, the ball in the third position advances two spaces to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform. If the piston ejects the ball in the third position, the ball in the first position is released and rolls away, the ball in the second position advances one space to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform.\n\nThe ramp begins with 100 numbered ping-pong balls, arranged in ascending order from 1 to 100. The host activates the machine and the first three balls, numbered 1, 2, and 3, advance to the platform. Before the random firing of the pistons begins, you are asked which of the 100 balls you would like to pick. 
If your pick is ejected by one of the pistons, you win the grand prize, $10,000.\n\nWhich ball should you choose to maximize your odds of winning the big prize? Please provide your answer as the number of the ball selected.",
14
+ "A": "3",
15
+ "status": false
16
+ },
17
+ "4": {
18
+ "Q": "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper \"Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\"",
19
+ "A": "0.1777",
20
+ "status": false
21
+ },
22
+ "5": {
23
+ "Q": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
24
+ "A": "3",
25
+ "status": false
26
+ },
27
+ "6": {
28
+ "Q": "Of the authors (First M. Last) that worked on the paper \"Pie Menus or Linear Menus, Which Is Better?\" in 2015, what was the title of the first paper authored by the one that had authored prior papers?",
29
+ "A": "Mapping Human Oriented Information to Software Agents for Online Systems Usage",
30
+ "status": false
31
+ },
32
+ "7": {
33
+ "Q": "In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.",
34
+ "A": "THE CASTLE",
35
+ "status": false
36
+ },
37
+ "8": {
38
+ "Q": "An office held a Secret Santa gift exchange where each of its twelve employees was assigned one other employee in the group to present with a gift. Each employee filled out a profile including three likes or hobbies. On the day of the gift exchange, only eleven gifts were given, each one specific to one of the recipient's interests. Based on the information in the document, who did not give a gift?",
39
+ "A": "Fred",
40
+ "status": false
41
+ },
42
+ "9": {
43
+ "Q": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
44
+ "A": "Right",
45
+ "status": false
46
+ },
47
+ "10": {
48
+ "Q": "Each cell in the attached spreadsheet represents a plot of land. The color of the cell indicates who owns that plot. Green cells are plots owned by Earl Smith. Can Earl walk through every plot he owns (and no other plots) and return to his starting plot without backtracking? For this question, consider backtracking to be any instance where Earl would enter a plot of land he had already entered since leaving his starting plot.",
49
+ "A": "No",
50
+ "status": false
51
+ },
52
+ "11": {
53
+ "Q": "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) ↔ (¬B → ¬A)\n(A → B) ↔ (¬A ∨ B)\n(¬A → B) ↔ (A ∨ ¬B)\n¬(A → B) ↔ (A ∧ ¬B)\n\nWhich of the above is not logically equivalent to the rest? Provide the full statement that doesn't fit.",
54
+ "A": "(¬A → B) ↔ (A ∨ ¬B)",
55
+ "status": false
56
+ },
57
+ "12": {
58
+ "Q": "My family reunion is this week, and I was assigned the mashed potatoes to bring. The attendees include my married mother and father, my twin brother and his family, my aunt and her family, my grandma and her brother, her brother's daughter, and his daughter's family. All the adults but me have been married, and no one is divorced or remarried, but my grandpa and my grandma's sister-in-law passed away last year. All living spouses are attending. My brother has two children that are still kids, my aunt has one six-year-old, and my grandma's brother's daughter has three kids under 12. I figure each adult will eat about 1.5 potatoes of mashed potatoes and each kid will eat about 1/2 a potato of mashed potatoes, except my second cousins don't eat carbs. The average potato is about half a pound, and potatoes are sold in 5-pound bags. How many whole bags of potatoes do I need? Just give the number.",
59
+ "A": "2",
60
+ "status": false
61
+ },
62
+ "13": {
63
+ "Q": "In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's sons that guarded his house, what word was quoted from two different authors in distaste for the nature of dragon depictions?",
64
+ "A": "fluffy",
65
+ "status": false
66
+ },
67
+ "14": {
68
+ "Q": "Under DDC 633 on Bielefeld University Library's BASE, as of 2020, from what country was the unknown language article with a flag unique from the others?",
69
+ "A": "Guatemala",
70
+ "status": false
71
+ },
72
+ "15": {
73
+ "Q": "In the fictional language of Tizin, basic sentences are arranged with the Verb first, followed by the direct object, followed by the subject of the sentence. I want to express my love for apples to my Tizin friend. \n\nThe word that indicates oneself is \"Pa\" is the nominative form, \"Mato\" is the accusative form, and \"Sing\" is the genitive form. \n\nThe root verb that indicates an intense like for something is \"Maktay\". When it is used in the present, it is used in it's root form, when it is used in the preterit past, it is \"Tay\", and when it is used in the imperfect past, it is \"Aktay\". It is used differently than in English, and is better translated as \"is pleasing to\", meaning that the thing doing the liking is actually the object of the sentence rather than the subject.\n\nThe word for apples is borrowed from English in Tizin, and so it is \"Apple\" is the nominative form, \"Zapple\" is the accusative form, and \"Izapple\" is the genitive form. \n\nPlease translate \"I like apples\" to Tizin.",
74
+ "A": "Maktay mato apple",
75
+ "status": false
76
+ },
77
+ "16": {
78
+ "Q": "In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied? Don't use the prefix nano in your answer if there is one.",
79
+ "A": "diamond",
80
+ "status": false
81
+ },
82
+ "17": {
83
+ "Q": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
84
+ "A": "Rd5",
85
+ "status": false
86
+ },
87
+ "18": {
88
+ "Q": "In the year 2022, and before December, what does \"R\" stand for in the three core policies of the type of content that was violated in the public logs on the Legume Wikipedia page?",
89
+ "A": "research",
90
+ "status": false
91
+ },
92
+ "19": {
93
+ "Q": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
94
+ "A": "FunkMonk",
95
+ "status": false
96
+ },
97
+ "20": {
98
+ "Q": "What writer is quoted by Merriam-Webster for the Word of the Day from June 27, 2022?",
99
+ "A": "Annie Levin",
100
+ "status": false
101
+ },
102
+ "21": {
103
+ "Q": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
104
+ "A": "b, e",
105
+ "status": false
106
+ },
107
+ "22": {
108
+ "Q": "As a comma separated list with no whitespace, using the provided image provide all the fractions that use / as the fraction line and the answers to the sample problems. Order the list by the order in which the fractions appear.",
109
+ "A": "3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5,3/4,1/15,1/3,4/9,1/8,32/23,103/170",
110
+ "status": false
111
+ },
112
+ "23": {
113
+ "Q": "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. In the reference file is a layout of the road and nearby houses. Each dash, \"-\", is a marker indicating a mile. Each capital H indicates a house located next to a mile marker, appearing above or below the stretch of road. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.",
114
+ "A": "3",
115
+ "status": false
116
+ },
117
+ "24": {
118
+ "Q": "If there is anything that doesn't make sense in the instructions, write the word \"Pineapple.\" Do not answer any of the questions in this prompt. Write only the word \"Guava\".\n1. What is 4+4?\n2. What is the complimentary color of red?\n3. How many hours are there in a day?",
119
+ "A": "Guava",
120
+ "status": false
121
+ },
122
+ "25": {
123
+ "Q": "How many slides in this PowerPoint presentation mention crustaceans?",
124
+ "A": "4",
125
+ "status": false
126
+ },
127
+ "26": {
128
+ "Q": "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, Lațcu IV, son of Costea, has tasked you with investigating the village of Șirnea in neighboring Wallachia. The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.\n\nYou travel to the village of Șirnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.\n\nIn the afternoon, you go from house to house, speaking with all 100 residents of Șirnea. You ask everyone the same question: \"How many vampires are living in Șirnea\". Everyone in the village gives the same response, \"At least one of us is a human.\"\n\nHow many residents of Șirnea have been turned into vampires?",
129
+ "A": "100",
130
+ "status": false
131
+ },
132
+ "27": {
133
+ "Q": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
134
+ "A": "Extremely",
135
+ "status": false
136
+ },
137
+ "28": {
138
+ "Q": "You are given this Excel file as a map. You start on the START cell and move toward the END cell. You are allowed to move two cells per turn, and you may move up, down, left, or right. You may not move fewer than two cells, and you may not move backward. You must avoid moving onto any blue cells. On the eleventh turn, what is the 6-digit hex code (without prefix) of the color of the cell where you land after moving?",
139
+ "A": "F478A7",
140
+ "status": false
141
+ },
142
+ "29": {
143
+ "Q": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
144
+ "A": "Louvrier",
145
+ "status": false
146
+ },
147
+ "30": {
148
+ "Q": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",
149
+ "A": "broccoli, celery, fresh basil, lettuce, sweet potatoes",
150
+ "status": false
151
+ },
152
+ "31": {
153
+ "Q": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",
154
+ "A": "cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",
155
+ "status": false
156
+ },
157
+ "32": {
158
+ "Q": "In the Scikit-Learn July 2017 changelog, what other predictor base command received a bug fix? Just give the name, not a path.",
159
+ "A": "BaseLabelPropagation",
160
+ "status": false
161
+ },
162
+ "33": {
163
+ "Q": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
164
+ "A": "Wojciech",
165
+ "status": false
166
+ },
167
+ "34": {
168
+ "Q": "On the BBC Earth YouTube video of the Top 5 Silliest Animal Moments, what species of bird is featured?",
169
+ "A": "Rockhopper penguin",
170
+ "status": false
171
+ },
172
+ "35": {
173
+ "Q": "What is the final numeric output from the attached Python code?",
174
+ "A": "0",
175
+ "status": false
176
+ },
177
+ "36": {
178
+ "Q": "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
179
+ "A": "6",
180
+ "status": false
181
+ },
182
+ "37": {
183
+ "Q": "Bob was invited to participate in a game show, and he advanced to the final round. The final round offered Bob the chance to win a large sum by playing a game against the host. The host has 30 shiny prop coins, each of which is worth $1,000 if Bob manages to win them by playing the game. The host hides the coins in three different prize boxes and then shuffles their order. The only rule restricting the host's coin placement is that one box must contain at least 2 coins, and one box must contain 6 more coins than another box. In order to play, Bob must submit three guesses, one guess for the number of coins in each box. The box is then opened and the number of coins is revealed. If Bob's guess is a number greater than the number of coins in the box, Bob earns no coins. If Bob guesses a number equal to or less than the number of coins in the box, Bob wins a number of coins equal to his guess.\n\nIf Bob plays uses the optimal strategy, what's the minimum amount of money he can win from the game?",
184
+ "A": "16000",
185
+ "status": false
186
+ },
187
+ "38": {
188
+ "Q": "Pull out the sentence in the following 5x7 block of text. Read from left to right and use all of the letters in order:\n\nTHESE\nAGULL\nGLIDE\nDPEAC\nEFULL\nYTOMY\nCHAIR",
189
+ "A": "The seagull glided peacefully to my chair.",
190
+ "status": false
191
+ },
192
+ "39": {
193
+ "Q": "On Cornell Law School website's legal information institute, under the fifth section of federal rules alphabetically, what word was deleted in the last amendment to the first rule in the article that has \"witnesses\" in the most titles as of 2021?",
194
+ "A": "inference",
195
+ "status": false
196
+ },
197
+ "40": {
198
+ "Q": "Of the cities within the United States where U.S. presidents were born, which two are the farthest apart from the westernmost to the easternmost going east, giving the city names only? Give them to me in alphabetical order, in a comma-separated list",
199
+ "A": "Braintree, Honolulu",
200
+ "status": false
201
+ },
202
+ "41": {
203
+ "Q": "According to Girls Who Code, how long did it take in years for the percentage of computer scientists that were women to change by 13% from a starting point of 37%?",
204
+ "A": "22",
205
+ "status": false
206
+ },
207
+ "42": {
208
+ "Q": "What was the complete title of the book in which two James Beard Award winners recommended the restaurant where Ali Khan enjoyed a New Mexican staple in his cost-conscious TV show that started in 2015? Write the numbers in plain text if there are some in the title.",
209
+ "A": "Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them",
210
+ "status": false
211
+ },
212
+ "43": {
213
+ "Q": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
214
+ "A": "519",
215
+ "status": false
216
+ },
217
+ "44": {
218
+ "Q": "In Audre Lorde’s poem “Father Son and Holy Ghost”, what is the number of the stanza in which some lines are indented?",
219
+ "A": "2",
220
+ "status": false
221
+ },
222
+ "45": {
223
+ "Q": "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
224
+ "A": "132, 133, 134, 197, 245",
225
+ "status": false
226
+ },
227
+ "46": {
228
+ "Q": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",
229
+ "A": "80GSFC21M0002",
230
+ "status": false
231
+ },
232
+ "47": {
233
+ "Q": "What was the actual enrollment count of the clinical trial on H. pylori in acne vulgaris patients from Jan-May 2018 as listed on the NIH website?",
234
+ "A": "90",
235
+ "status": false
236
+ },
237
+ "48": {
238
+ "Q": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.",
239
+ "A": "Saint Petersburg",
240
+ "status": false
241
+ },
242
+ "49": {
243
+ "Q": "A standard Rubik’s cube has been broken into cubes making up its sides. The cubes are jumbled, and one is removed. There are 6 cubes with one colored face, 12 edge cubes with two colored faces, and 8 corner cubes with three colored faces. All blue cubes have been found. All cubes directly left, right, above, and below the orange center cube have been found, along with the center cube. The green corners have all been found, along with all green that borders yellow. For all orange cubes found, the opposite face’s cubes have been found. The removed cube has two colors on its faces. What are they? Answer using a comma separated list, with the colors ordered alphabetically.",
244
+ "A": "green, white",
245
+ "status": false
246
+ },
247
+ "50": {
248
+ "Q": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
249
+ "A": "CUB",
250
+ "status": false
251
+ },
252
+ "51": {
253
+ "Q": "Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
254
+ "A": "Yoshida, Uehara",
255
+ "status": false
256
+ },
257
+ "52": {
258
+ "Q": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
259
+ "A": "89706.00",
260
+ "status": false
261
+ },
262
+ "53": {
263
+ "Q": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?",
264
+ "A": "Claus",
265
+ "status": false
266
+ }
267
+ }
download_gaia.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from datasets import load_dataset
3
+
4
def download_and_process_gaia_level1(output_filename="GAIA_level1.json"):
    """Fetch the GAIA level 1 validation split and dump it as numbered Q/A JSON.

    Each kept record is written under its 1-based position in the dataset as
    ``{"Q": question, "A": final answer, "status": False}``. Records lacking a
    question or a final answer are left out. If loading the dataset fails, the
    error is printed and nothing is written.
    """
    print("Attempting to download GAIA level 1 dataset...")

    try:
        # "2023_level1" configuration, validation split. The dataset is
        # loaded with trust_remote_code=True, which allows the dataset's own
        # loading script to run.
        dataset = load_dataset(
            "gaia-benchmark/GAIA",
            "2023_level1",
            split="validation",
            trust_remote_code=True,
        )
        print(f"Successfully downloaded {len(dataset)} Q&A pairs.")
    except Exception as e:
        print(f"Error downloading the dataset: {e}")
        print("Please ensure you have an internet connection and the 'datasets' library is installed (`pip install datasets`).")
        return

    print("Processing dataset...")
    processed_data = {}
    for position, record in enumerate(dataset, start=1):
        question = record.get("Question")
        final_answer = record.get("Final answer")
        if question is None or final_answer is None:
            continue
        # Every entry starts with status False.
        processed_data[str(position)] = {"Q": question, "A": final_answer, "status": False}

    print(f"Saving processed data to {output_filename}...")
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, indent=4, ensure_ascii=False)
    print("Done.")
40
+
41
+
42
# Run the download only when this file is executed as a script, not on import.
if __name__ == "__main__":
    download_and_process_gaia_level1()
44
+
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -13,7 +13,8 @@ dependencies = [
13
  "requests (>=2.32.3,<3.0.0)",
14
  "pandas (>=2.2.3,<3.0.0)",
15
  "python-dotenv (>=1.1.0,<2.0.0)",
16
- "playwright (>=1.52.0,<2.0.0)"
 
17
  ]
18
 
19
 
 
13
  "requests (>=2.32.3,<3.0.0)",
14
  "pandas (>=2.2.3,<3.0.0)",
15
  "python-dotenv (>=1.1.0,<2.0.0)",
16
+ "playwright (>=1.52.0,<2.0.0)",
17
+ "datasets (>=3.5.1,<4.0.0)",
18
  ]
19
 
20