Spaces:
Sleeping
Sleeping
code update with cached status file
Browse files- GAIA_level1_status.json +426 -0
- app.py +75 -19
GAIA_level1_status.json
ADDED
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"1": {
|
3 |
+
"Q": "If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.",
|
4 |
+
"A": "17",
|
5 |
+
"file_name": "",
|
6 |
+
"file_path": "",
|
7 |
+
"annotator_metadata": null,
|
8 |
+
"status": false
|
9 |
+
},
|
10 |
+
"2": {
|
11 |
+
"Q": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
|
12 |
+
"A": "3",
|
13 |
+
"file_name": "",
|
14 |
+
"file_path": "",
|
15 |
+
"annotator_metadata": null,
|
16 |
+
"status": false
|
17 |
+
},
|
18 |
+
"3": {
|
19 |
+
"Q": "Here's a fun riddle that I think you'll enjoy.\n\nYou have been selected to play the final round of the hit new game show \"Pick That Ping-Pong\". In this round, you will be competing for a large cash prize. Your job will be to pick one of several different numbered ping-pong balls, and then the game will commence. The host describes how the game works.\n\nA device consisting of a winding clear ramp and a series of pistons controls the outcome of the game. The ramp feeds balls onto a platform. The platform has room for three ping-pong balls at a time. The three balls on the platform are each aligned with one of three pistons. At each stage of the game, one of the three pistons will randomly fire, ejecting the ball it strikes. If the piston ejects the ball in the first position on the platform the balls in the second and third position on the platform each advance one space, and the next ball on the ramp advances to the third position. If the piston ejects the ball in the second position, the ball in the first position is released and rolls away, the ball in the third position advances two spaces to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform. If the piston ejects the ball in the third position, the ball in the first position is released and rolls away, the ball in the second position advances one space to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform.\n\nThe ramp begins with 100 numbered ping-pong balls, arranged in ascending order from 1 to 100. The host activates the machine and the first three balls, numbered 1, 2, and 3, advance to the platform. Before the random firing of the pistons begins, you are asked which of the 100 balls you would like to pick. 
If your pick is ejected by one of the pistons, you win the grand prize, $10,000.\n\nWhich ball should you choose to maximize your odds of winning the big prize? Please provide your answer as the number of the ball selected.",
|
20 |
+
"A": "3",
|
21 |
+
"file_name": "",
|
22 |
+
"file_path": "",
|
23 |
+
"annotator_metadata": null,
|
24 |
+
"status": false
|
25 |
+
},
|
26 |
+
"4": {
|
27 |
+
"Q": "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper \"Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\"",
|
28 |
+
"A": "0.1777",
|
29 |
+
"file_name": "",
|
30 |
+
"file_path": "",
|
31 |
+
"annotator_metadata": null,
|
32 |
+
"status": false
|
33 |
+
},
|
34 |
+
"5": {
|
35 |
+
"Q": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
|
36 |
+
"A": "3",
|
37 |
+
"file_name": "",
|
38 |
+
"file_path": "",
|
39 |
+
"annotator_metadata": null,
|
40 |
+
"status": false
|
41 |
+
},
|
42 |
+
"6": {
|
43 |
+
"Q": "Of the authors (First M. Last) that worked on the paper \"Pie Menus or Linear Menus, Which Is Better?\" in 2015, what was the title of the first paper authored by the one that had authored prior papers?",
|
44 |
+
"A": "Mapping Human Oriented Information to Software Agents for Online Systems Usage",
|
45 |
+
"file_name": "",
|
46 |
+
"file_path": "",
|
47 |
+
"annotator_metadata": null,
|
48 |
+
"status": false
|
49 |
+
},
|
50 |
+
"7": {
|
51 |
+
"Q": "In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.",
|
52 |
+
"A": "THE CASTLE",
|
53 |
+
"file_name": "",
|
54 |
+
"file_path": "",
|
55 |
+
"annotator_metadata": null,
|
56 |
+
"status": false
|
57 |
+
},
|
58 |
+
"8": {
|
59 |
+
"Q": "An office held a Secret Santa gift exchange where each of its twelve employees was assigned one other employee in the group to present with a gift. Each employee filled out a profile including three likes or hobbies. On the day of the gift exchange, only eleven gifts were given, each one specific to one of the recipient's interests. Based on the information in the document, who did not give a gift?",
|
60 |
+
"A": "Fred",
|
61 |
+
"file_name": "cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx",
|
62 |
+
"file_path": "C:/Users/tzurv/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx",
|
63 |
+
"annotator_metadata": null,
|
64 |
+
"status": false
|
65 |
+
},
|
66 |
+
"9": {
|
67 |
+
"Q": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
|
68 |
+
"A": "Right",
|
69 |
+
"file_name": "",
|
70 |
+
"file_path": "",
|
71 |
+
"annotator_metadata": null,
|
72 |
+
"status": false
|
73 |
+
},
|
74 |
+
"10": {
|
75 |
+
"Q": "Each cell in the attached spreadsheet represents a plot of land. The color of the cell indicates who owns that plot. Green cells are plots owned by Earl Smith. Can Earl walk through every plot he owns (and no other plots) and return to his starting plot without backtracking? For this question, consider backtracking to be any instance where Earl would enter a plot of land he had already entered since leaving his starting plot.",
|
76 |
+
"A": "No",
|
77 |
+
"file_name": "5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx",
|
78 |
+
"file_path": "C:/Users/tzurv/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx",
|
79 |
+
"annotator_metadata": null,
|
80 |
+
"status": false
|
81 |
+
},
|
82 |
+
"11": {
|
83 |
+
"Q": "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) ↔ (¬B → ¬A)\n(A → B) ↔ (¬A ∨ B)\n(¬A → B) ↔ (A ∨ ¬B)\n¬(A → B) ↔ (A ∧ ¬B)\n\nWhich of the above is not logically equivalent to the rest? Provide the full statement that doesn't fit.",
|
84 |
+
"A": "(¬A → B) ↔ (A ∨ ¬B)",
|
85 |
+
"file_name": "",
|
86 |
+
"file_path": "",
|
87 |
+
"annotator_metadata": null,
|
88 |
+
"status": false
|
89 |
+
},
|
90 |
+
"12": {
|
91 |
+
"Q": "My family reunion is this week, and I was assigned the mashed potatoes to bring. The attendees include my married mother and father, my twin brother and his family, my aunt and her family, my grandma and her brother, her brother's daughter, and his daughter's family. All the adults but me have been married, and no one is divorced or remarried, but my grandpa and my grandma's sister-in-law passed away last year. All living spouses are attending. My brother has two children that are still kids, my aunt has one six-year-old, and my grandma's brother's daughter has three kids under 12. I figure each adult will eat about 1.5 potatoes of mashed potatoes and each kid will eat about 1/2 a potato of mashed potatoes, except my second cousins don't eat carbs. The average potato is about half a pound, and potatoes are sold in 5-pound bags. How many whole bags of potatoes do I need? Just give the number.",
|
92 |
+
"A": "2",
|
93 |
+
"file_name": "",
|
94 |
+
"file_path": "",
|
95 |
+
"annotator_metadata": null,
|
96 |
+
"status": true
|
97 |
+
},
|
98 |
+
"13": {
|
99 |
+
"Q": "In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's sons that guarded his house, what word was quoted from two different authors in distaste for the nature of dragon depictions?",
|
100 |
+
"A": "fluffy",
|
101 |
+
"file_name": "",
|
102 |
+
"file_path": "",
|
103 |
+
"annotator_metadata": null,
|
104 |
+
"status": false
|
105 |
+
},
|
106 |
+
"14": {
|
107 |
+
"Q": "Under DDC 633 on Bielefeld University Library's BASE, as of 2020, from what country was the unknown language article with a flag unique from the others?",
|
108 |
+
"A": "Guatemala",
|
109 |
+
"file_name": "",
|
110 |
+
"file_path": "",
|
111 |
+
"annotator_metadata": null,
|
112 |
+
"status": false
|
113 |
+
},
|
114 |
+
"15": {
|
115 |
+
"Q": "In the fictional language of Tizin, basic sentences are arranged with the Verb first, followed by the direct object, followed by the subject of the sentence. I want to express my love for apples to my Tizin friend. \n\nThe word that indicates oneself is \"Pa\" is the nominative form, \"Mato\" is the accusative form, and \"Sing\" is the genitive form. \n\nThe root verb that indicates an intense like for something is \"Maktay\". When it is used in the present, it is used in it's root form, when it is used in the preterit past, it is \"Tay\", and when it is used in the imperfect past, it is \"Aktay\". It is used differently than in English, and is better translated as \"is pleasing to\", meaning that the thing doing the liking is actually the object of the sentence rather than the subject.\n\nThe word for apples is borrowed from English in Tizin, and so it is \"Apple\" is the nominative form, \"Zapple\" is the accusative form, and \"Izapple\" is the genitive form. \n\nPlease translate \"I like apples\" to Tizin.",
|
116 |
+
"A": "Maktay mato apple",
|
117 |
+
"file_name": "",
|
118 |
+
"file_path": "",
|
119 |
+
"annotator_metadata": null,
|
120 |
+
"status": false
|
121 |
+
},
|
122 |
+
"16": {
|
123 |
+
"Q": "In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied? Don't use the prefix nano in your answer if there is one.",
|
124 |
+
"A": "diamond",
|
125 |
+
"file_name": "",
|
126 |
+
"file_path": "",
|
127 |
+
"annotator_metadata": null,
|
128 |
+
"status": false
|
129 |
+
},
|
130 |
+
"17": {
|
131 |
+
"Q": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
|
132 |
+
"A": "Rd5",
|
133 |
+
"file_name": "cca530fc-4052-43b2-b130-b30968d8aa44.png",
|
134 |
+
"file_path": "C:/Users/tzurv/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png",
|
135 |
+
"annotator_metadata": null,
|
136 |
+
"status": false
|
137 |
+
},
|
138 |
+
"18": {
|
139 |
+
"Q": "In the year 2022, and before December, what does \"R\" stand for in the three core policies of the type of content that was violated in the public logs on the Legume Wikipedia page?",
|
140 |
+
"A": "research",
|
141 |
+
"file_name": "",
|
142 |
+
"file_path": "",
|
143 |
+
"annotator_metadata": null,
|
144 |
+
"status": false
|
145 |
+
},
|
146 |
+
"19": {
|
147 |
+
"Q": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
|
148 |
+
"A": "FunkMonk",
|
149 |
+
"file_name": "",
|
150 |
+
"file_path": "",
|
151 |
+
"annotator_metadata": null,
|
152 |
+
"status": false
|
153 |
+
},
|
154 |
+
"20": {
|
155 |
+
"Q": "What writer is quoted by Merriam-Webster for the Word of the Day from June 27, 2022?",
|
156 |
+
"A": "Annie Levin",
|
157 |
+
"file_name": "",
|
158 |
+
"file_path": "",
|
159 |
+
"annotator_metadata": null,
|
160 |
+
"status": false
|
161 |
+
},
|
162 |
+
"21": {
|
163 |
+
"Q": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
|
164 |
+
"A": "b, e",
|
165 |
+
"file_name": "",
|
166 |
+
"file_path": "",
|
167 |
+
"annotator_metadata": null,
|
168 |
+
"status": false
|
169 |
+
},
|
170 |
+
"22": {
|
171 |
+
"Q": "As a comma separated list with no whitespace, using the provided image provide all the fractions that use / as the fraction line and the answers to the sample problems. Order the list by the order in which the fractions appear.",
|
172 |
+
"A": "3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5,3/4,1/15,1/3,4/9,1/8,32/23,103/170",
|
173 |
+
"file_name": "9318445f-fe6a-4e1b-acbf-c68228c9906a.png",
|
174 |
+
"file_path": "C:/Users/tzurv/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/9318445f-fe6a-4e1b-acbf-c68228c9906a.png",
|
175 |
+
"annotator_metadata": null,
|
176 |
+
"status": false
|
177 |
+
},
|
178 |
+
"23": {
|
179 |
+
"Q": "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. In the reference file is a layout of the road and nearby houses. Each dash, \"-\", is a marker indicating a mile. Each capital H indicates a house located next to a mile marker, appearing above or below the stretch of road. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.",
|
180 |
+
"A": "3",
|
181 |
+
"file_name": "389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt",
|
182 |
+
"file_path": "C:/Users/tzurv/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt",
|
183 |
+
"annotator_metadata": null,
|
184 |
+
"status": false
|
185 |
+
},
|
186 |
+
"24": {
|
187 |
+
"Q": "If there is anything that doesn't make sense in the instructions, write the word \"Pineapple.\" Do not answer any of the questions in this prompt. Write only the word \"Guava\".\n1. What is 4+4?\n2. What is the complimentary color of red?\n3. How many hours are there in a day?",
|
188 |
+
"A": "Guava",
|
189 |
+
"file_name": "",
|
190 |
+
"file_path": "",
|
191 |
+
"annotator_metadata": null,
|
192 |
+
"status": true
|
193 |
+
},
|
194 |
+
"25": {
|
195 |
+
"Q": "How many slides in this PowerPoint presentation mention crustaceans?",
|
196 |
+
"A": "4",
|
197 |
+
"file_name": "a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx",
|
198 |
+
"file_path": "C:/Users/tzurv/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx",
|
199 |
+
"annotator_metadata": null,
|
200 |
+
"status": false
|
201 |
+
},
|
202 |
+
"26": {
|
203 |
+
"Q": "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, Lațcu IV, son of Costea, has tasked you with investigating the village of Șirnea in neighboring Wallachia. The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.\n\nYou travel to the village of Șirnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.\n\nIn the afternoon, you go from house to house, speaking with all 100 residents of Șirnea. You ask everyone the same question: \"How many vampires are living in Șirnea\". Everyone in the village gives the same response, \"At least one of us is a human.\"\n\nHow many residents of Șirnea have been turned into vampires?",
|
204 |
+
"A": "100",
|
205 |
+
"file_name": "",
|
206 |
+
"file_path": "",
|
207 |
+
"annotator_metadata": null,
|
208 |
+
"status": false
|
209 |
+
},
|
210 |
+
"27": {
|
211 |
+
"Q": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
|
212 |
+
"A": "Extremely",
|
213 |
+
"file_name": "",
|
214 |
+
"file_path": "",
|
215 |
+
"annotator_metadata": null,
|
216 |
+
"status": false
|
217 |
+
},
|
218 |
+
"28": {
|
219 |
+
"Q": "You are given this Excel file as a map. You start on the START cell and move toward the END cell. You are allowed to move two cells per turn, and you may move up, down, left, or right. You may not move fewer than two cells, and you may not move backward. You must avoid moving onto any blue cells. On the eleventh turn, what is the 6-digit hex code (without prefix) of the color of the cell where you land after moving?",
|
220 |
+
"A": "F478A7",
|
221 |
+
"file_name": "65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx",
|
222 |
+
"file_path": "C:/Users/tzurv/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx",
|
223 |
+
"annotator_metadata": null,
|
224 |
+
"status": false
|
225 |
+
},
|
226 |
+
"29": {
|
227 |
+
"Q": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
|
228 |
+
"A": "Louvrier",
|
229 |
+
"file_name": "",
|
230 |
+
"file_path": "",
|
231 |
+
"annotator_metadata": null,
|
232 |
+
"status": false
|
233 |
+
},
|
234 |
+
"30": {
|
235 |
+
"Q": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",
|
236 |
+
"A": "broccoli, celery, fresh basil, lettuce, sweet potatoes",
|
237 |
+
"file_name": "",
|
238 |
+
"file_path": "",
|
239 |
+
"annotator_metadata": null,
|
240 |
+
"status": false
|
241 |
+
},
|
242 |
+
"31": {
|
243 |
+
"Q": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",
|
244 |
+
"A": "cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",
|
245 |
+
"file_name": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3",
|
246 |
+
"file_path": "C:/Users/tzurv/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3",
|
247 |
+
"annotator_metadata": null,
|
248 |
+
"status": false
|
249 |
+
},
|
250 |
+
"32": {
|
251 |
+
"Q": "In the Scikit-Learn July 2017 changelog, what other predictor base command received a bug fix? Just give the name, not a path.",
|
252 |
+
"A": "BaseLabelPropagation",
|
253 |
+
"file_name": "",
|
254 |
+
"file_path": "",
|
255 |
+
"annotator_metadata": null,
|
256 |
+
"status": false
|
257 |
+
},
|
258 |
+
"33": {
|
259 |
+
"Q": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
|
260 |
+
"A": "Wojciech",
|
261 |
+
"file_name": "",
|
262 |
+
"file_path": "",
|
263 |
+
"annotator_metadata": null,
|
264 |
+
"status": false
|
265 |
+
},
|
266 |
+
"34": {
|
267 |
+
"Q": "On the BBC Earth YouTube video of the Top 5 Silliest Animal Moments, what species of bird is featured?",
|
268 |
+
"A": "Rockhopper penguin",
|
269 |
+
"file_name": "",
|
270 |
+
"file_path": "",
|
271 |
+
"annotator_metadata": null,
|
272 |
+
"status": false
|
273 |
+
},
|
274 |
+
"35": {
|
275 |
+
"Q": "What is the final numeric output from the attached Python code?",
|
276 |
+
"A": "0",
|
277 |
+
"file_name": "f918266a-b3e0-4914-865d-4faa564f1aef.py",
|
278 |
+
"file_path": "C:/Users/tzurv/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/f918266a-b3e0-4914-865d-4faa564f1aef.py",
|
279 |
+
"annotator_metadata": null,
|
280 |
+
"status": false
|
281 |
+
},
|
282 |
+
"36": {
|
283 |
+
"Q": "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
|
284 |
+
"A": "6",
|
285 |
+
"file_name": "",
|
286 |
+
"file_path": "",
|
287 |
+
"annotator_metadata": null,
|
288 |
+
"status": true
|
289 |
+
},
|
290 |
+
"37": {
|
291 |
+
"Q": "Bob was invited to participate in a game show, and he advanced to the final round. The final round offered Bob the chance to win a large sum by playing a game against the host. The host has 30 shiny prop coins, each of which is worth $1,000 if Bob manages to win them by playing the game. The host hides the coins in three different prize boxes and then shuffles their order. The only rule restricting the host's coin placement is that one box must contain at least 2 coins, and one box must contain 6 more coins than another box. In order to play, Bob must submit three guesses, one guess for the number of coins in each box. The box is then opened and the number of coins is revealed. If Bob's guess is a number greater than the number of coins in the box, Bob earns no coins. If Bob guesses a number equal to or less than the number of coins in the box, Bob wins a number of coins equal to his guess.\n\nIf Bob plays uses the optimal strategy, what's the minimum amount of money he can win from the game?",
|
292 |
+
"A": "16000",
|
293 |
+
"file_name": "",
|
294 |
+
"file_path": "",
|
295 |
+
"annotator_metadata": null,
|
296 |
+
"status": false
|
297 |
+
},
|
298 |
+
"38": {
|
299 |
+
"Q": "Pull out the sentence in the following 5x7 block of text. Read from left to right and use all of the letters in order:\n\nTHESE\nAGULL\nGLIDE\nDPEAC\nEFULL\nYTOMY\nCHAIR",
|
300 |
+
"A": "The seagull glided peacefully to my chair.",
|
301 |
+
"file_name": "",
|
302 |
+
"file_path": "",
|
303 |
+
"annotator_metadata": null,
|
304 |
+
"status": false
|
305 |
+
},
|
306 |
+
"39": {
|
307 |
+
"Q": "On Cornell Law School website's legal information institute, under the fifth section of federal rules alphabetically, what word was deleted in the last amendment to the first rule in the article that has \"witnesses\" in the most titles as of 2021?",
|
308 |
+
"A": "inference",
|
309 |
+
"file_name": "",
|
310 |
+
"file_path": "",
|
311 |
+
"annotator_metadata": null,
|
312 |
+
"status": false
|
313 |
+
},
|
314 |
+
"40": {
|
315 |
+
"Q": "Of the cities within the United States where U.S. presidents were born, which two are the farthest apart from the westernmost to the easternmost going east, giving the city names only? Give them to me in alphabetical order, in a comma-separated list",
|
316 |
+
"A": "Braintree, Honolulu",
|
317 |
+
"file_name": "",
|
318 |
+
"file_path": "",
|
319 |
+
"annotator_metadata": null,
|
320 |
+
"status": false
|
321 |
+
},
|
322 |
+
"41": {
|
323 |
+
"Q": "According to Girls Who Code, how long did it take in years for the percentage of computer scientists that were women to change by 13% from a starting point of 37%?",
|
324 |
+
"A": "22",
|
325 |
+
"file_name": "",
|
326 |
+
"file_path": "",
|
327 |
+
"annotator_metadata": null,
|
328 |
+
"status": false
|
329 |
+
},
|
330 |
+
"42": {
|
331 |
+
"Q": "What was the complete title of the book in which two James Beard Award winners recommended the restaurant where Ali Khan enjoyed a New Mexican staple in his cost-conscious TV show that started in 2015? Write the numbers in plain text if there are some in the title.",
|
332 |
+
"A": "Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them",
|
333 |
+
"file_name": "",
|
334 |
+
"file_path": "",
|
335 |
+
"annotator_metadata": null,
|
336 |
+
"status": false
|
337 |
+
},
|
338 |
+
"43": {
|
339 |
+
"Q": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
|
340 |
+
"A": "519",
|
341 |
+
"file_name": "",
|
342 |
+
"file_path": "",
|
343 |
+
"annotator_metadata": null,
|
344 |
+
"status": false
|
345 |
+
},
|
346 |
+
"44": {
|
347 |
+
"Q": "In Audre Lorde’s poem “Father Son and Holy Ghost”, what is the number of the stanza in which some lines are indented?",
|
348 |
+
"A": "2",
|
349 |
+
"file_name": "",
|
350 |
+
"file_path": "",
|
351 |
+
"annotator_metadata": null,
|
352 |
+
"status": false
|
353 |
+
},
|
354 |
+
"45": {
|
355 |
+
"Q": "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
|
356 |
+
"A": "132, 133, 134, 197, 245",
|
357 |
+
"file_name": "1f975693-876d-457b-a649-393859e79bf3.mp3",
|
358 |
+
"file_path": "C:/Users/tzurv/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/1f975693-876d-457b-a649-393859e79bf3.mp3",
|
359 |
+
"annotator_metadata": null,
|
360 |
+
"status": false
|
361 |
+
},
|
362 |
+
"46": {
|
363 |
+
"Q": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",
|
364 |
+
"A": "80GSFC21M0002",
|
365 |
+
"file_name": "",
|
366 |
+
"file_path": "",
|
367 |
+
"annotator_metadata": null,
|
368 |
+
"status": true
|
369 |
+
},
|
370 |
+
"47": {
|
371 |
+
"Q": "What was the actual enrollment count of the clinical trial on H. pylori in acne vulgaris patients from Jan-May 2018 as listed on the NIH website?",
|
372 |
+
"A": "90",
|
373 |
+
"file_name": "",
|
374 |
+
"file_path": "",
|
375 |
+
"annotator_metadata": null,
|
376 |
+
"status": true
|
377 |
+
},
|
378 |
+
"48": {
|
379 |
+
"Q": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.",
|
380 |
+
"A": "Saint Petersburg",
|
381 |
+
"file_name": "",
|
382 |
+
"file_path": "",
|
383 |
+
"annotator_metadata": null,
|
384 |
+
"status": false
|
385 |
+
},
|
386 |
+
"49": {
|
387 |
+
"Q": "A standard Rubik’s cube has been broken into cubes making up its sides. The cubes are jumbled, and one is removed. There are 6 cubes with one colored face, 12 edge cubes with two colored faces, and 8 corner cubes with three colored faces. All blue cubes have been found. All cubes directly left, right, above, and below the orange center cube have been found, along with the center cube. The green corners have all been found, along with all green that borders yellow. For all orange cubes found, the opposite face’s cubes have been found. The removed cube has two colors on its faces. What are they? Answer using a comma separated list, with the colors ordered alphabetically.",
|
388 |
+
"A": "green, white",
|
389 |
+
"file_name": "",
|
390 |
+
"file_path": "",
|
391 |
+
"annotator_metadata": null,
|
392 |
+
"status": false
|
393 |
+
},
|
394 |
+
"50": {
|
395 |
+
"Q": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
|
396 |
+
"A": "CUB",
|
397 |
+
"file_name": "",
|
398 |
+
"file_path": "",
|
399 |
+
"annotator_metadata": null,
|
400 |
+
"status": false
|
401 |
+
},
|
402 |
+
"51": {
|
403 |
+
"Q": "Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
|
404 |
+
"A": "Yoshida, Uehara",
|
405 |
+
"file_name": "",
|
406 |
+
"file_path": "",
|
407 |
+
"annotator_metadata": null,
|
408 |
+
"status": false
|
409 |
+
},
|
410 |
+
"52": {
|
411 |
+
"Q": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
|
412 |
+
"A": "89706.00",
|
413 |
+
"file_name": "7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx",
|
414 |
+
"file_path": "C:/Users/tzurv/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx",
|
415 |
+
"annotator_metadata": null,
|
416 |
+
"status": false
|
417 |
+
},
|
418 |
+
"53": {
|
419 |
+
"Q": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?",
|
420 |
+
"A": "Claus",
|
421 |
+
"file_name": "",
|
422 |
+
"file_path": "",
|
423 |
+
"annotator_metadata": null,
|
424 |
+
"status": true
|
425 |
+
}
|
426 |
+
}
|
app.py
CHANGED
@@ -863,10 +863,11 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
863 |
and displays the results.
|
864 |
"""
|
865 |
# --- Determine HF Space Runtime URL and Repo URL ---
|
866 |
-
|
|
|
867 |
|
868 |
if profile:
|
869 |
-
username= f"{profile.username}"
|
870 |
print(f"User logged in: {username}")
|
871 |
else:
|
872 |
print("User not logged in.")
|
@@ -893,16 +894,24 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
893 |
response.raise_for_status()
|
894 |
questions_data = response.json()
|
895 |
if not questions_data:
|
896 |
-
|
897 |
-
|
898 |
print(f"Fetched {len(questions_data)} questions.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
899 |
except requests.exceptions.RequestException as e:
|
900 |
print(f"Error fetching questions: {e}")
|
901 |
return f"Error fetching questions: {e}", None
|
902 |
except requests.exceptions.JSONDecodeError as e:
|
903 |
-
|
904 |
-
|
905 |
-
|
906 |
except Exception as e:
|
907 |
print(f"An unexpected error occurred fetching questions: {e}")
|
908 |
return f"An unexpected error occurred fetching questions: {e}", None
|
@@ -910,33 +919,80 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
910 |
# 3. Run your Agent
|
911 |
results_log = []
|
912 |
answers_payload = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
913 |
print(f"Running agent on {len(questions_data)} questions...")
|
914 |
-
for item in questions_data:
|
|
|
|
|
915 |
task_id = item.get("task_id")
|
916 |
question_text = item.get("question")
|
917 |
file_name = item.get("file_name", None)
|
918 |
if file_name:
|
919 |
-
file_name = os.path.join(
|
|
|
|
|
|
|
920 |
|
921 |
-
continue
|
922 |
if not task_id or question_text is None:
|
923 |
print(f"Skipping item with missing task_id or question: {item}")
|
924 |
continue
|
925 |
|
926 |
-
|
927 |
-
|
928 |
-
|
929 |
-
|
930 |
-
|
931 |
-
|
932 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
933 |
|
934 |
if not answers_payload:
|
935 |
print("Agent did not produce any answers to submit.")
|
936 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
937 |
|
938 |
-
# 4. Prepare Submission
|
939 |
-
submission_data = {"username": username.strip(
|
|
|
940 |
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
941 |
print(status_update)
|
942 |
|
|
|
863 |
and displays the results.
|
864 |
"""
|
865 |
# --- Determine HF Space Runtime URL and Repo URL ---
|
866 |
+
# Get the SPACE_ID for sending link to the code
|
867 |
+
space_id = os.getenv("SPACE_ID")
|
868 |
|
869 |
if profile:
|
870 |
+
username = f"{profile.username}"
|
871 |
print(f"User logged in: {username}")
|
872 |
else:
|
873 |
print("User not logged in.")
|
|
|
894 |
response.raise_for_status()
|
895 |
questions_data = response.json()
|
896 |
if not questions_data:
|
897 |
+
print("Fetched questions list is empty.")
|
898 |
+
return "Fetched questions list is empty or invalid format.", None
|
899 |
print(f"Fetched {len(questions_data)} questions.")
|
900 |
+
|
901 |
+
# Save questions data to a JSON file
|
902 |
+
current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M")
|
903 |
+
questions_filename = f"questions_data_{current_time}.json"
|
904 |
+
with open(questions_filename, 'w') as f:
|
905 |
+
json.dump(questions_data, f, indent=4)
|
906 |
+
print(f"Saved questions data to {questions_filename}")
|
907 |
+
|
908 |
except requests.exceptions.RequestException as e:
|
909 |
print(f"Error fetching questions: {e}")
|
910 |
return f"Error fetching questions: {e}", None
|
911 |
except requests.exceptions.JSONDecodeError as e:
|
912 |
+
print(f"Error decoding JSON response from questions endpoint: {e}")
|
913 |
+
print(f"Response text: {response.text[:500]}")
|
914 |
+
return f"Error decoding server response for questions: {e}", None
|
915 |
except Exception as e:
|
916 |
print(f"An unexpected error occurred fetching questions: {e}")
|
917 |
return f"An unexpected error occurred fetching questions: {e}", None
|
|
|
919 |
# 3. Run your Agent
|
920 |
results_log = []
|
921 |
answers_payload = []
|
922 |
+
|
923 |
+
# Load previous results if available
|
924 |
+
previous_results = {}
|
925 |
+
try:
|
926 |
+
if os.path.exists('results_log_status.json'):
|
927 |
+
with open('results_log_status.json', 'r') as f:
|
928 |
+
previous_results_list = json.load(f)
|
929 |
+
# Convert to dictionary with task_id as key for easier lookup
|
930 |
+
for item in previous_results_list:
|
931 |
+
previous_results[item.get("Task ID")] = item
|
932 |
+
print(f"Loaded {len(previous_results)} previous results")
|
933 |
+
except Exception as e:
|
934 |
+
print(f"Error loading previous results: {e}")
|
935 |
+
# Continue without previous results if there's an error
|
936 |
+
|
937 |
print(f"Running agent on {len(questions_data)} questions...")
|
938 |
+
for idx, item in enumerate(questions_data):
|
939 |
+
#if idx == 6:
|
940 |
+
# break
|
941 |
task_id = item.get("task_id")
|
942 |
question_text = item.get("question")
|
943 |
file_name = item.get("file_name", None)
|
944 |
if file_name:
|
945 |
+
file_name = os.path.join(
|
946 |
+
GAIA_LEVEL1_VALIDATION_FILES_PATH, file_name)
|
947 |
+
|
948 |
+
# Skip the "continue" statement that was in the original code
|
949 |
|
|
|
950 |
if not task_id or question_text is None:
|
951 |
print(f"Skipping item with missing task_id or question: {item}")
|
952 |
continue
|
953 |
|
954 |
+
# Check if we already have an answer for this task
|
955 |
+
if task_id in previous_results:
|
956 |
+
submitted_answer = previous_results[task_id].get(
|
957 |
+
"Submitted Answer")
|
958 |
+
print(f"Using cached result for task {task_id}")
|
959 |
+
answers_payload.append(
|
960 |
+
{"task_id": task_id, "submitted_answer": submitted_answer})
|
961 |
+
results_log.append({
|
962 |
+
"Task ID": task_id,
|
963 |
+
"Question": question_text,
|
964 |
+
"Submitted Answer": submitted_answer,
|
965 |
+
"recycled": "true"
|
966 |
+
})
|
967 |
+
else:
|
968 |
+
try:
|
969 |
+
submitted_answer = agent(question_text, file_name)
|
970 |
+
answers_payload.append(
|
971 |
+
{"task_id": task_id, "submitted_answer": submitted_answer})
|
972 |
+
results_log.append({
|
973 |
+
"Task ID": task_id,
|
974 |
+
"Question": question_text,
|
975 |
+
"Submitted Answer": submitted_answer
|
976 |
+
})
|
977 |
+
except Exception as e:
|
978 |
+
print(f"Error running agent on task {task_id}: {e}")
|
979 |
+
results_log.append({
|
980 |
+
"Task ID": task_id,
|
981 |
+
"Question": question_text,
|
982 |
+
"Submitted Answer": f"AGENT ERROR: {e}"
|
983 |
+
})
|
984 |
+
|
985 |
+
# Save progress after each question
|
986 |
+
with open('results_log_progress.json', 'w') as f:
|
987 |
+
json.dump(results_log, f, indent=4)
|
988 |
|
989 |
if not answers_payload:
|
990 |
print("Agent did not produce any answers to submit.")
|
991 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
992 |
|
993 |
+
# 4. Prepare Submission
|
994 |
+
submission_data = {"username": username.strip(
|
995 |
+
), "agent_code": agent_code, "answers": answers_payload}
|
996 |
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
997 |
print(status_update)
|
998 |
|