Spaces:
Sleeping
Sleeping
Upload 10 files
Browse files- src/.DS_Store +0 -0
- src/FisrtModule/sub-embedder-reranker-ensemble-v3 (1).ipynb +1 -0
- src/SecondModule/2ndModule.md +0 -0
- src/SecondModule/__pycache__/module2.cpython-311.pyc +0 -0
- src/SecondModule/__pycache__/module2.cpython-39.pyc +0 -0
- src/SecondModule/__pycache__/module2_ori.cpython-39.pyc +0 -0
- src/SecondModule/misconception_mapping.csv +0 -0
- src/SecondModule/module2.py +196 -0
- src/ThirdModule/module3.py +121 -0
- src/config.py +1 -0
src/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
src/FisrtModule/sub-embedder-reranker-ensemble-v3 (1).ipynb
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":82695,"databundleVersionId":9738540,"sourceType":"competition"},{"sourceId":202482692,"sourceType":"kernelVersion"},{"sourceId":210169931,"sourceType":"kernelVersion"},{"sourceId":212909050,"sourceType":"kernelVersion"},{"sourceId":181555,"sourceType":"modelInstanceVersion","modelInstanceId":154735,"modelId":177197},{"sourceId":185957,"sourceType":"modelInstanceVersion","modelInstanceId":158544,"modelId":180911},{"sourceId":189058,"sourceType":"modelInstanceVersion","modelInstanceId":161187,"modelId":183575},{"sourceId":192491,"sourceType":"modelInstanceVersion","modelInstanceId":164133,"modelId":186481},{"sourceId":193719,"sourceType":"modelInstanceVersion","modelInstanceId":165209,"modelId":187529},{"sourceId":174909,"sourceType":"modelInstanceVersion","isSourceIdPinned":true,"modelInstanceId":148911,"modelId":171421}],"dockerImageVersionId":30776,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"%%capture\n!pip install --no-index --find-links=/kaggle/input/reranker-scripts/packages -U transformers bitsandbytes accelerate peft","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:49.145040Z","iopub.execute_input":"2024-12-20T09:36:49.145387Z","iopub.status.idle":"2024-12-20T09:36:50.814837Z","shell.execute_reply.started":"2024-12-20T09:36:49.145354Z","shell.execute_reply":"2024-12-20T09:36:50.813733Z"}},"outputs":[],"execution_count":1},{"cell_type":"code","source":"import re\nimport pandas as pd\nimport numpy as np\n\ncomp_dir = '/kaggle/input/eedi-mining-misconceptions-in-mathematics'\n\ntrain = pd.read_csv(f'{comp_dir}/train.csv')\ntest = pd.read_csv(f'{comp_dir}/test.csv')\nmisconceptions = pd.read_csv(f'{comp_dir}/misconception_mapping.csv')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:50.816741Z","iopub.execute_input":"2024-12-20T09:36:50.817036Z","iopub.status.idle":"2024-12-20T09:36:51.189886Z","shell.execute_reply.started":"2024-12-20T09:36:50.817006Z","shell.execute_reply":"2024-12-20T09:36:51.189168Z"}},"outputs":[],"execution_count":2},{"cell_type":"code","source":"zeros = [5,\n 7,\n 10,\n 12,\n 13,\n 17,\n 18,\n 24,\n 25,\n 30,\n 40,\n 41,\n 43,\n 46,\n 50,\n 53,\n 56,\n 59,\n 62,\n 63,\n 65,\n 67,\n 69,\n 72,\n 73,\n 75,\n 79,\n 80,\n 87,\n 90,\n 92,\n 93,\n 94,\n 96,\n 104,\n 112,\n 115,\n 116,\n 121,\n 122,\n 124,\n 125,\n 128,\n 129,\n 135,\n 139,\n 140,\n 144,\n 145,\n 147,\n 148,\n 149,\n 150,\n 155,\n 156,\n 157,\n 160,\n 163,\n 165,\n 168,\n 170,\n 173,\n 174,\n 175,\n 177,\n 178,\n 180,\n 184,\n 188,\n 192,\n 193,\n 198,\n 199,\n 200,\n 203,\n 204,\n 206,\n 208,\n 209,\n 214,\n 215,\n 216,\n 222,\n 225,\n 231,\n 235,\n 237,\n 238,\n 240,\n 241,\n 243,\n 246,\n 250,\n 254,\n 257,\n 258,\n 259,\n 266,\n 269,\n 274,\n 275,\n 276,\n 277,\n 286,\n 288,\n 291,\n 298,\n 299,\n 301,\n 304,\n 310,\n 311,\n 313,\n 314,\n 316,\n 319,\n 320,\n 321,\n 323,\n 324,\n 325,\n 327,\n 330,\n 333,\n 335,\n 336,\n 341,\n 342,\n 343,\n 348,\n 351,\n 354,\n 356,\n 358,\n 359,\n 360,\n 361,\n 365,\n 367,\n 368,\n 369,\n 371,\n 385,\n 386,\n 387,\n 390,\n 394,\n 395,\n 399,\n 403,\n 406,\n 407,\n 410,\n 411,\n 412,\n 413,\n 416,\n 419,\n 420,\n 425,\n 428,\n 429,\n 430,\n 431,\n 432,\n 434,\n 435,\n 437,\n 440,\n 442,\n 444,\n 448,\n 453,\n 456,\n 457,\n 459,\n 462,\n 465,\n 472,\n 473,\n 475,\n 476,\n 477,\n 479,\n 480,\n 482,\n 484,\n 485,\n 487,\n 489,\n 490,\n 494,\n 497,\n 500,\n 502,\n 504,\n 505,\n 506,\n 513,\n 514,\n 516,\n 517,\n 518,\n 522,\n 523,\n 529,\n 530,\n 534,\n 535,\n 536,\n 538,\n 541,\n 543,\n 546,\n 548,\n 552,\n 555,\n 559,\n 560,\n 561,\n 562,\n 569,\n 571,\n 574,\n 575,\n 579,\n 580,\n 582,\n 586,\n 592,\n 593,\n 595,\n 596,\n 597,\n 598,\n 605,\n 607,\n 610,\n 612,\n 613,\n 615,\n 622,\n 627,\n 632,\n 634,\n 636,\n 640,\n 645,\n 647,\n 660,\n 662,\n 665,\n 667,\n 675,\n 676,\n 677,\n 678,\n 679,\n 681,\n 682,\n 683,\n 689,\n 692,\n 693,\n 696,\n 697,\n 698,\n 700,\n 701,\n 703,\n 705,\n 707,\n 714,\n 716,\n 720,\n 721,\n 722,\n 726,\n 728,\n 731,\n 738,\n 740,\n 741,\n 744,\n 748,\n 749,\n 750,\n 752,\n 753,\n 755,\n 761,\n 763,\n 768,\n 769,\n 770,\n 771,\n 774,\n 775,\n 776,\n 777,\n 778,\n 781,\n 786,\n 787,\n 788,\n 798,\n 799,\n 802,\n 803,\n 805,\n 810,\n 817,\n 818,\n 819,\n 822,\n 824,\n 825,\n 826,\n 827,\n 830,\n 831,\n 837,\n 841,\n 844,\n 846,\n 849,\n 850,\n 853,\n 854,\n 855,\n 856,\n 857,\n 859,\n 861,\n 862,\n 865,\n 869,\n 873,\n 881,\n 883,\n 884,\n 885,\n 886,\n 887,\n 889,\n 892,\n 896,\n 897,\n 899,\n 903,\n 908,\n 913,\n 916,\n 917,\n 918,\n 919,\n 920,\n 921,\n 928,\n 929,\n 934,\n 938,\n 939,\n 943,\n 945,\n 949,\n 953,\n 956,\n 957,\n 958,\n 963,\n 966,\n 967,\n 970,\n 973,\n 979,\n 985,\n 992,\n 993,\n 995,\n 998,\n 999,\n 1000,\n 1001,\n 1002,\n 1003,\n 1005,\n 1006,\n 1010,\n 1011,\n 1014,\n 1015,\n 1019,\n 1020,\n 1021,\n 1023,\n 1027,\n 1034,\n 1037,\n 1038,\n 1039,\n 1042,\n 1045,\n 1052,\n 1053,\n 1056,\n 1059,\n 1060,\n 1061,\n 1062,\n 1064,\n 1065,\n 1067,\n 1076,\n 1077,\n 1080,\n 1081,\n 1084,\n 1085,\n 1086,\n 1087,\n 1088,\n 1090,\n 1092,\n 1094,\n 1095,\n 1096,\n 1097,\n 1101,\n 1105,\n 1106,\n 1109,\n 1111,\n 1112,\n 1117,\n 1121,\n 1122,\n 1125,\n 1127,\n 1129,\n 1131,\n 1136,\n 1137,\n 1141,\n 1143,\n 1147,\n 1149,\n 1154,\n 1159,\n 1160,\n 1161,\n 1162,\n 1170,\n 1171,\n 1174,\n 1178,\n 1182,\n 1185,\n 1189,\n 1191,\n 1194,\n 1195,\n 1197,\n 1199,\n 1200,\n 1201,\n 1211,\n 1217,\n 1219,\n 1220,\n 1221,\n 1227,\n 1228,\n 1232,\n 1236,\n 1238,\n 1239,\n 1241,\n 1242,\n 1243,\n 1245,\n 1246,\n 1247,\n 1249,\n 1253,\n 1254,\n 1267,\n 1273,\n 1275,\n 1276,\n 1279,\n 1281,\n 1284,\n 1285,\n 1286,\n 1289,\n 1296,\n 1297,\n 1298,\n 1299,\n 1300,\n 1301,\n 1305,\n 1309,\n 1314,\n 1315,\n 1317,\n 1323,\n 1325,\n 1328,\n 1330,\n 1331,\n 1335,\n 1337,\n 1339,\n 1341,\n 1342,\n 1343,\n 1345,\n 1346,\n 1347,\n 1351,\n 1352,\n 1353,\n 1355,\n 1359,\n 1366,\n 1368,\n 1369,\n 1372,\n 1375,\n 1377,\n 1378,\n 1381,\n 1382,\n 1385,\n 1390,\n 1391,\n 1395,\n 1397,\n 1401,\n 1404,\n 1405,\n 1407,\n 1409,\n 1412,\n 1413,\n 1414,\n 1423,\n 1434,\n 1438,\n 1439,\n 1441,\n 1446,\n 1448,\n 1451,\n 1454,\n 1462,\n 1463,\n 1465,\n 1466,\n 1472,\n 1474,\n 1475,\n 1476,\n 1477,\n 1478,\n 1482,\n 1485,\n 1486,\n 1489,\n 1493,\n 1496,\n 1497,\n 1498,\n 1501,\n 1502,\n 1503,\n 1504,\n 1506,\n 1512,\n 1515,\n 1518,\n 1521,\n 1531,\n 1538,\n 1544,\n 1545,\n 1546,\n 1548,\n 1551,\n 1555,\n 1562,\n 1563,\n 1564,\n 1565,\n 1567,\n 1569,\n 1570,\n 1573,\n 1574,\n 1575,\n 1576,\n 1578,\n 1579,\n 1580,\n 1581,\n 1583,\n 1584,\n 1587,\n 1589,\n 1592,\n 1594,\n 1603,\n 1607,\n 1612,\n 1615,\n 1617,\n 1618,\n 1620,\n 1625,\n 1627,\n 1629,\n 1634,\n 1638,\n 1643,\n 1647,\n 1649,\n 1650,\n 1653,\n 1654,\n 1662,\n 1665,\n 1679,\n 1681,\n 1683,\n 1684,\n 1685,\n 1688,\n 1689,\n 1692,\n 1698,\n 1699,\n 1700,\n 1709,\n 1711,\n 1712,\n 1713,\n 1719,\n 1722,\n 1723,\n 1727,\n 1728,\n 1729,\n 1732,\n 1736,\n 1739,\n 1742,\n 1745,\n 1747,\n 1750,\n 1753,\n 1754,\n 1758,\n 1762,\n 1765,\n 1768,\n 1770,\n 1772,\n 1774,\n 1777,\n 1778,\n 1780,\n 1785,\n 1789,\n 1797,\n 1798,\n 1799,\n 1801,\n 1802,\n 1804,\n 1808,\n 1812,\n 1816,\n 1817,\n 1821,\n 1822,\n 1826,\n 1827,\n 1829,\n 1830,\n 1833,\n 1838,\n 1843,\n 1844,\n 1848,\n 1853,\n 1855,\n 1856,\n 1857,\n 1859,\n 1865,\n 1867,\n 1873,\n 1874,\n 1878,\n 1879,\n 1888,\n 1890,\n 1891,\n 1892,\n 1899,\n 1901,\n 1903,\n 1904,\n 1905,\n 1910,\n 1913,\n 1919,\n 1922,\n 1926,\n 1928,\n 1930,\n 1931,\n 1933,\n 1934,\n 1935,\n 1936,\n 1938,\n 1941,\n 1943,\n 1947,\n 1949,\n 1951,\n 1952,\n 1955,\n 1961,\n 1962,\n 1964,\n 1967,\n 1969,\n 1974,\n 1975,\n 1977,\n 1979,\n 1983,\n 1991,\n 1998,\n 2000,\n 2001,\n 2004,\n 2005,\n 2008,\n 2010,\n 2013,\n 2014,\n 2015,\n 2018,\n 2019,\n 2025,\n 2028,\n 2029,\n 2033,\n 2036,\n 2040,\n 2041,\n 2052,\n 2057,\n 2058,\n 2059,\n 2060,\n 2061,\n 2062,\n 2063,\n 2064,\n 2065,\n 2067,\n 2070,\n 2074,\n 2075,\n 2076,\n 2079,\n 2086,\n 2088,\n 2089,\n 2096,\n 2097,\n 2099,\n 2106,\n 2108,\n 2115,\n 2116,\n 2118,\n 2120,\n 2129,\n 2136,\n 2141,\n 2144,\n 2146,\n 2151,\n 2153,\n 2158,\n 2164,\n 2165,\n 2167,\n 2168,\n 2169,\n 2170,\n 2171,\n 2172,\n 2174,\n 2176,\n 2177,\n 2182,\n 2183,\n 2184,\n 2186,\n 2188,\n 2194,\n 2195,\n 2196,\n 2200,\n 2201,\n 2202,\n 2205,\n 2207,\n 2211,\n 2213,\n 2216,\n 2219,\n 2222,\n 2223,\n 2224,\n 2227,\n 2229,\n 2232,\n 2233,\n 2235,\n 2236,\n 2242,\n 2243,\n 2244,\n 2246,\n 2247,\n 2249,\n 2253,\n 2254,\n 2255,\n 2257,\n 2258,\n 2259,\n 2267,\n 2268,\n 2272,\n 2274,\n 2276,\n 2277,\n 2278,\n 2281,\n 2283,\n 2287,\n 2290,\n 2293,\n 2294,\n 2295,\n 2298,\n 2299,\n 2300,\n 2304,\n 2310,\n 2313,\n 2315,\n 2322,\n 2323,\n 2324,\n 2325,\n 2328,\n 2335,\n 2337,\n 2338,\n 2339,\n 2342,\n 2347,\n 2348,\n 2358,\n 2360,\n 2366,\n 2367,\n 2369,\n 2370,\n 2372,\n 2379,\n 2381,\n 2382,\n 2383,\n 2385,\n 2387,\n 2390,\n 2391,\n 2393,\n 2394,\n 2396,\n 2400,\n 2403,\n 2404,\n 2405,\n 2406,\n 2407,\n 2409,\n 2410,\n 2411,\n 2418,\n 2419,\n 2420,\n 2421,\n 2422,\n 2429,\n 2430,\n 2431,\n 2434,\n 2439,\n 2440,\n 2441,\n 2444,\n 2448,\n 2449,\n 2451,\n 2452,\n 2455,\n 2459,\n 2460,\n 2462,\n 2463,\n 2467,\n 2468,\n 2471,\n 2477,\n 2478,\n 2482,\n 2483,\n 2486,\n 2490,\n 2495,\n 2496,\n 2497,\n 2498,\n 2502,\n 2503,\n 2506,\n 2509,\n 2513,\n 2518,\n 2521,\n 2522,\n 2523,\n 2526,\n 2527,\n 2528,\n 2529,\n 2533,\n 2534,\n 2536,\n 2538,\n 2540,\n 2541,\n 2545,\n 2548,\n 2552,\n 2553,\n 2556,\n 2557,\n 2559,\n 2560,\n 2562,\n 2564,\n 2567,\n 2568,\n 2570,\n 2571,\n 2574,\n 2575,\n 2578,\n 2580,\n 2582,\n 2584]","metadata":{"trusted":true,"scrolled":true,"jupyter":{"source_hidden":true},"_kg_hide-input":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.191003Z","iopub.execute_input":"2024-12-20T09:36:51.191268Z","iopub.status.idle":"2024-12-20T09:36:51.219924Z","shell.execute_reply.started":"2024-12-20T09:36:51.191243Z","shell.execute_reply":"2024-12-20T09:36:51.219040Z"}},"outputs":[],"execution_count":3},{"cell_type":"markdown","source":"# preprocessing","metadata":{}},{"cell_type":"code","source":"answer_cols = [\"AnswerAText\", \"AnswerBText\", \"AnswerCText\", \"AnswerDText\"]\nkeep_cols = [\"QuestionId\", \"CorrectAnswer\", \"ConstructName\", \"SubjectName\", \"QuestionText\" ]\n\ndef wide_to_long(df: pd.DataFrame) -> pd.DataFrame:\n answers_df = pd.melt(\n id_vars=keep_cols,\n frame=df[keep_cols + answer_cols],\n var_name='Answer', value_name='AnswerText'\n ).sort_values([\"QuestionId\", \"Answer\"]).reset_index(drop=True) \n return answers_df\n\ndef preprocess_text(x):\n x = re.sub(r\"http\\w+\", '', x) # Delete URL\n x = re.sub(r\"\\.+\", \".\", x) # Replace consecutive periods with a single period\n x = re.sub(r\"\\,+\", \",\", x) # Replace consecutive commas with a single comma\n x = re.sub(r\"\\\\\\\\\", r\"\\\\\", x) # Normalize multiple backslashes to double backslashes\n x = re.sub(r\"[ ]{2,}\", \" \", x) # Replace multiple spaces with a single space\n x = x.strip() # Remove empty characters at the beginning and end\n return x\n\ntask_description = 'Given a math question and a misconcepte incorrect answer, please retrieve the most accurate reason for the misconception.'\ndef get_text(row):\n text = f'''Instruct: {task_description}\n\nQuery:\n###Construct###:{row['ConstructName']}\n###Subject###:{row['SubjectName']}\n###Question###:{row['QuestionText']}\n###Correct Answer###:{row['CorrectAnswerText']}\n###Incorrect Answer###:{row['AnswerText']}\n###Misconception###:\n'''\n return preprocess_text(text)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.221573Z","iopub.execute_input":"2024-12-20T09:36:51.221859Z","iopub.status.idle":"2024-12-20T09:36:51.234513Z","shell.execute_reply.started":"2024-12-20T09:36:51.221834Z","shell.execute_reply":"2024-12-20T09:36:51.233784Z"}},"outputs":[],"execution_count":4},{"cell_type":"code","source":"df = wide_to_long(test)\ndf['AnswerId'] = df.Answer.str.replace('Answer', '').str.replace('Text', '')\n\nca_map_df = df[df['CorrectAnswer']==df['AnswerId']][['QuestionId', 'AnswerText']].reset_index(drop=True)\nca_map_df.columns = ['QuestionId', 'CorrectAnswerText']\ndf = pd.merge(df, ca_map_df, on='QuestionId', how='left')\n\ndf = df[df['CorrectAnswer']!=df['AnswerId']].reset_index(drop=True)\ndf['text'] = df.apply(get_text, axis=1)\ndf['QuestionId_Answer'] = df['QuestionId'].astype(str) + '_' + df['AnswerId'].astype(str)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.235313Z","iopub.execute_input":"2024-12-20T09:36:51.235540Z","iopub.status.idle":"2024-12-20T09:36:51.259820Z","shell.execute_reply.started":"2024-12-20T09:36:51.235517Z","shell.execute_reply":"2024-12-20T09:36:51.258966Z"}},"outputs":[],"execution_count":5},{"cell_type":"code","source":"print(df['text'][0])","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.260830Z","iopub.execute_input":"2024-12-20T09:36:51.261065Z","iopub.status.idle":"2024-12-20T09:36:51.265899Z","shell.execute_reply.started":"2024-12-20T09:36:51.261042Z","shell.execute_reply":"2024-12-20T09:36:51.265019Z"}},"outputs":[{"name":"stdout","text":"Instruct: Given a math question and a misconcepte incorrect answer, please retrieve the most accurate reason for the misconception.\n\nQuery:\n###Construct###:Use the order of operations to carry out calculations involving powers\n###Subject###:BIDMAS\n###Question###:\\[\n3 \\times 2+4-5\n\\]\nWhere do the brackets need to go to make the answer equal \\( 13 \\) ?\n###Correct Answer###:\\( 3 \\times(2+4)-5 \\)\n###Incorrect Answer###:\\( 3 \\times 2+(4-5) \\)\n###Misconception###:\n","output_type":"stream"}],"execution_count":6},{"cell_type":"code","source":"len(df)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.267142Z","iopub.execute_input":"2024-12-20T09:36:51.267464Z","iopub.status.idle":"2024-12-20T09:36:51.277187Z","shell.execute_reply.started":"2024-12-20T09:36:51.267428Z","shell.execute_reply":"2024-12-20T09:36:51.276171Z"}},"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"9"},"metadata":{}}],"execution_count":7},{"cell_type":"code","source":"df.head(3)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.278299Z","iopub.execute_input":"2024-12-20T09:36:51.278606Z","iopub.status.idle":"2024-12-20T09:36:51.295275Z","shell.execute_reply.started":"2024-12-20T09:36:51.278568Z","shell.execute_reply":"2024-12-20T09:36:51.294395Z"}},"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":" QuestionId CorrectAnswer \\\n0 1869 A \n1 1869 A \n2 1869 A \n\n ConstructName SubjectName \\\n0 Use the order of operations to carry out calcu... BIDMAS \n1 Use the order of operations to carry out calcu... BIDMAS \n2 Use the order of operations to carry out calcu... BIDMAS \n\n QuestionText Answer \\\n0 \\[\\n3 \\times 2+4-5\\n\\]\\nWhere do the brackets ... AnswerBText \n1 \\[\\n3 \\times 2+4-5\\n\\]\\nWhere do the brackets ... AnswerCText \n2 \\[\\n3 \\times 2+4-5\\n\\]\\nWhere do the brackets ... AnswerDText \n\n AnswerText AnswerId CorrectAnswerText \\\n0 \\( 3 \\times 2+(4-5) \\) B \\( 3 \\times(2+4)-5 \\) \n1 \\( 3 \\times(2+4-5) \\) C \\( 3 \\times(2+4)-5 \\) \n2 Does not need brackets D \\( 3 \\times(2+4)-5 \\) \n\n text QuestionId_Answer \n0 Instruct: Given a math question and a misconce... 1869_B \n1 Instruct: Given a math question and a misconce... 1869_C \n2 Instruct: Given a math question and a misconce... 1869_D ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>QuestionId</th>\n <th>CorrectAnswer</th>\n <th>ConstructName</th>\n <th>SubjectName</th>\n <th>QuestionText</th>\n <th>Answer</th>\n <th>AnswerText</th>\n <th>AnswerId</th>\n <th>CorrectAnswerText</th>\n <th>text</th>\n <th>QuestionId_Answer</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1869</td>\n <td>A</td>\n <td>Use the order of operations to carry out calcu...</td>\n <td>BIDMAS</td>\n <td>\\[\\n3 \\times 2+4-5\\n\\]\\nWhere do the brackets ...</td>\n <td>AnswerBText</td>\n <td>\\( 3 \\times 2+(4-5) \\)</td>\n <td>B</td>\n <td>\\( 3 \\times(2+4)-5 \\)</td>\n <td>Instruct: Given a math question and a misconce...</td>\n <td>1869_B</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1869</td>\n <td>A</td>\n <td>Use the order of operations to carry out calcu...</td>\n <td>BIDMAS</td>\n <td>\\[\\n3 \\times 2+4-5\\n\\]\\nWhere do the brackets ...</td>\n <td>AnswerCText</td>\n <td>\\( 3 \\times(2+4-5) \\)</td>\n <td>C</td>\n <td>\\( 3 \\times(2+4)-5 \\)</td>\n <td>Instruct: Given a math question and a misconce...</td>\n <td>1869_C</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1869</td>\n <td>A</td>\n <td>Use the order of operations to carry out calcu...</td>\n <td>BIDMAS</td>\n <td>\\[\\n3 \\times 2+4-5\\n\\]\\nWhere do the brackets ...</td>\n <td>AnswerDText</td>\n <td>Does not need brackets</td>\n <td>D</td>\n <td>\\( 3 \\times(2+4)-5 \\)</td>\n <td>Instruct: Given a math question and a misconce...</td>\n <td>1869_D</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}],"execution_count":8},{"cell_type":"code","source":"df.to_parquet(\"df_preprocessed.parquet\", index=False)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.296206Z","iopub.execute_input":"2024-12-20T09:36:51.296412Z","iopub.status.idle":"2024-12-20T09:36:51.320210Z","shell.execute_reply.started":"2024-12-20T09:36:51.296391Z","shell.execute_reply":"2024-12-20T09:36:51.319663Z"}},"outputs":[],"execution_count":9},{"cell_type":"markdown","source":"# Embedder","metadata":{}},{"cell_type":"code","source":"%%writefile run_embedder.py\n\nimport argparse\nfrom tqdm import tqdm, trange\nimport numpy as np\nimport pandas as pd\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport torch\nimport torch.nn.functional as F\nfrom torch import Tensor\nfrom transformers import AutoTokenizer, Qwen2Model, BitsAndBytesConfig, set_seed\nfrom peft import LoraConfig, get_peft_model\n\n# ใใฉใกใผใฟ\nMODEL_PATH = \"/kaggle/input/qwen2.5-32b/transformers/default/1\"\nMAX_LENGTH = 512\nBATCH_SIZE = 8\nDEVICE = 'auto'\n\nset_seed(42)\n\ndef last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:\n \"\"\"\n Extract the embedding of the last valid token based on attention mask.\n \"\"\"\n if attention_mask[:, -1].all(): # Check if right padding is used\n return last_hidden_states[:, -1]\n sequence_lengths = attention_mask.sum(dim=1) - 1\n return last_hidden_states[torch.arange(last_hidden_states.size(0)), sequence_lengths]\n\[email protected]_grad()\[email protected]('cuda')\ndef inference(df, model, tokenizer, query_text='query_text'):\n \"\"\"\n Perform inference to extract embeddings for a given dataframe of sentences.\n \"\"\"\n sentences = df[query_text].tolist()\n all_embeddings = [None] * len(sentences)\n\n # Sort sentences by length (descending)\n length_sorted_idx = np.argsort([-len(sen) for sen in sentences])\n sentences_sorted = [sentences[idx] for idx in length_sorted_idx]\n\n for start_idx in trange(0, len(sentences), BATCH_SIZE, desc=\"Batches\"):\n sentences_batch = sentences_sorted[start_idx: start_idx + BATCH_SIZE]\n features = tokenizer(sentences_batch, max_length=MAX_LENGTH, padding=True, truncation=True, return_tensors=\"pt\")\n features = {key: value.to(model.device) for key, value in features.items() if isinstance(value, Tensor)}\n\n outputs = model.model(**features)\n embeddings = last_token_pool(outputs.last_hidden_state, features['attention_mask'])\n embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu().numpy()\n\n for i, emb in enumerate(embeddings):\n original_idx = length_sorted_idx[start_idx + i]\n all_embeddings[original_idx] = emb\n\n return np.vstack(all_embeddings)\n\nif __name__ == \"__main__\":\n # Parse arguments\n parser = argparse.ArgumentParser(description=\"Run embedding inference.\")\n parser.add_argument(\"--df_path\", type=str, required=True)\n parser.add_argument(\"--lora_path\", type=str, required=True)\n parser.add_argument(\"--output_path\", type=str, required=True)\n args = parser.parse_args()\n\n # Load data\n df_path = args.df_path\n lora_path = args.lora_path\n output_path = args.output_path\n\n df = pd.read_parquet(df_path)\n misconceptions = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv')\n\n # Load tokenizer and model\n tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)\n bnb_config = BitsAndBytesConfig(\n load_in_4bit=True,\n bnb_4bit_use_double_quant=True,\n bnb_4bit_quant_type=\"nf4\",\n bnb_4bit_compute_dtype=torch.bfloat16\n )\n model = Qwen2Model.from_pretrained(\n MODEL_PATH,\n quantization_config=bnb_config,\n device_map=DEVICE,\n )\n config = LoraConfig(\n r=64,\n lora_alpha=128,\n target_modules=[\n \"q_proj\",\n \"k_proj\",\n \"v_proj\",\n \"o_proj\",\n \"gate_proj\",\n \"up_proj\",\n \"down_proj\",\n ],\n bias=\"none\",\n lora_dropout=0.05,\n task_type=\"CAUSAL_LM\",\n )\n model = get_peft_model(model, config)\n d = torch.load(lora_path, map_location=model.device, weights_only=True)\n model.load_state_dict(d, strict=False)\n\n # Perform inference\n query_embeddings = inference(df, model, tokenizer, query_text='text')\n passage_embeddings = inference(misconceptions, model, tokenizer, query_text='MisconceptionName')\n\n # Compute similarity and save results\n similarity = cosine_similarity(query_embeddings, passage_embeddings)\n np.save(output_path, similarity)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.322886Z","iopub.execute_input":"2024-12-20T09:36:51.323104Z","iopub.status.idle":"2024-12-20T09:36:51.329503Z","shell.execute_reply.started":"2024-12-20T09:36:51.323083Z","shell.execute_reply":"2024-12-20T09:36:51.328756Z"}},"outputs":[{"name":"stdout","text":"Overwriting run_embedder.py\n","output_type":"stream"}],"execution_count":10},{"cell_type":"code","source":"!pip install peft\n!pip install bitsandbytes\n!pip install accelerate","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:39:20.237450Z","iopub.execute_input":"2024-12-20T09:39:20.237851Z","iopub.status.idle":"2024-12-20T09:39:44.566957Z","shell.execute_reply.started":"2024-12-20T09:39:20.237810Z","shell.execute_reply":"2024-12-20T09:39:44.566020Z"}},"outputs":[{"name":"stdout","text":"Requirement already satisfied: peft in /opt/conda/lib/python3.10/site-packages (0.14.0)\nRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from peft) (1.26.4)\nRequirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from peft) (21.3)\nRequirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from peft) (5.9.3)\nRequirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from peft) (6.0.2)\nRequirement already satisfied: torch>=1.13.0 in /opt/conda/lib/python3.10/site-packages (from peft) (2.4.0)\nRequirement already satisfied: transformers in /opt/conda/lib/python3.10/site-packages (from peft) (4.44.2)\nRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from peft) (4.66.4)\nRequirement already satisfied: accelerate>=0.21.0 in /opt/conda/lib/python3.10/site-packages (from peft) (0.34.2)\nRequirement already satisfied: safetensors in /opt/conda/lib/python3.10/site-packages (from peft) (0.4.5)\nRequirement already satisfied: huggingface-hub>=0.25.0 in /opt/conda/lib/python3.10/site-packages (from peft) (0.25.0)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.25.0->peft) (3.15.1)\nRequirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.25.0->peft) (2024.6.1)\nRequirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.25.0->peft) (2.32.3)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.25.0->peft) (4.12.2)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from packaging>=20.0->peft) (3.1.2)\nRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft) (1.13.3)\nRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft) (3.3)\nRequirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft) (3.1.4)\nRequirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers->peft) (2024.5.15)\nRequirement already satisfied: tokenizers<0.20,>=0.19 in /opt/conda/lib/python3.10/site-packages (from transformers->peft) (0.19.1)\nRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.13.0->peft) (2.1.5)\nRequirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.25.0->peft) (3.3.2)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.25.0->peft) (3.7)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.25.0->peft) (1.26.18)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.25.0->peft) (2024.8.30)\nRequirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.13.0->peft) (1.3.0)\nRequirement already satisfied: bitsandbytes in /opt/conda/lib/python3.10/site-packages (0.45.0)\nRequirement already satisfied: torch in /opt/conda/lib/python3.10/site-packages (from bitsandbytes) (2.4.0)\nRequirement already satisfied: numpy in /opt/conda/lib/python3.10/site-packages (from bitsandbytes) (1.26.4)\nRequirement already satisfied: typing_extensions>=4.8.0 in /opt/conda/lib/python3.10/site-packages (from bitsandbytes) (4.12.2)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from torch->bitsandbytes) (3.15.1)\nRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch->bitsandbytes) (1.13.3)\nRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch->bitsandbytes) (3.3)\nRequirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch->bitsandbytes) (3.1.4)\nRequirement already satisfied: fsspec in /opt/conda/lib/python3.10/site-packages (from torch->bitsandbytes) (2024.6.1)\nRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch->bitsandbytes) (2.1.5)\nRequirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/conda/lib/python3.10/site-packages (from sympy->torch->bitsandbytes) (1.3.0)\nRequirement already satisfied: accelerate in /opt/conda/lib/python3.10/site-packages (0.34.2)\nRequirement already satisfied: numpy<3.0.0,>=1.17 in /opt/conda/lib/python3.10/site-packages (from accelerate) (1.26.4)\nRequirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from accelerate) (21.3)\nRequirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from accelerate) (5.9.3)\nRequirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from accelerate) (6.0.2)\nRequirement already satisfied: torch>=1.10.0 in /opt/conda/lib/python3.10/site-packages (from accelerate) (2.4.0)\nRequirement already satisfied: huggingface-hub>=0.21.0 in /opt/conda/lib/python3.10/site-packages (from accelerate) (0.25.0)\nRequirement already satisfied: safetensors>=0.4.3 in /opt/conda/lib/python3.10/site-packages (from accelerate) (0.4.5)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.21.0->accelerate) (3.15.1)\nRequirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.21.0->accelerate) (2024.6.1)\nRequirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.21.0->accelerate) (2.32.3)\nRequirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.21.0->accelerate) (4.66.4)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.21.0->accelerate) (4.12.2)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from packaging>=20.0->accelerate) (3.1.2)\nRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate) (1.13.3)\nRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate) (3.3)\nRequirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate) (3.1.4)\nRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.5)\nRequirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.3.2)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.7)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (1.26.18)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2024.8.30)\nRequirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n","output_type":"stream"}],"execution_count":21},{"cell_type":"code","source":"!python run_embedder.py \\\n --df_path df_preprocessed.parquet \\\n --lora_path /kaggle/input/2211-lora-14b/transformers/default/1 \\\n --output_path similarity1.npy","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T10:07:39.583084Z","iopub.execute_input":"2024-12-20T10:07:39.583805Z","iopub.status.idle":"2024-12-20T10:13:11.378130Z","shell.execute_reply.started":"2024-12-20T10:07:39.583767Z","shell.execute_reply":"2024-12-20T10:13:11.376855Z"}},"outputs":[{"name":"stdout","text":"Loading checkpoint shards: 100%|โโโโโโโโโโโโโโโโ| 17/17 [04:54<00:00, 17.35s/it]\nTraceback (most recent call last):\n File \"/kaggle/working/run_embedder.py\", line 104, in <module>\n d = torch.load(lora_path, map_location=model.device, weights_only=True)\n File \"/opt/conda/lib/python3.10/site-packages/torch/serialization.py\", line 1065, in load\n with _open_file_like(f, 'rb') as opened_file:\n File \"/opt/conda/lib/python3.10/site-packages/torch/serialization.py\", line 468, in _open_file_like\n return _open_file(name_or_buffer, mode)\n File \"/opt/conda/lib/python3.10/site-packages/torch/serialization.py\", line 449, in __init__\n super().__init__(open(name, mode))\nIsADirectoryError: [Errno 21] Is a directory: '/kaggle/input/2211-lora-14b/transformers/default/1'\n","output_type":"stream"}],"execution_count":45},{"cell_type":"code","source":"!python run_embedder.py \\\n --df_path df_preprocessed.parquet \\\n --lora_path /kaggle/input/embedder-lora-v6/transformers/default/1/adapter.bin \\\n --output_path similarity2.npy","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:44:20.174501Z","iopub.execute_input":"2024-12-20T09:44:20.174819Z","iopub.status.idle":"2024-12-20T09:49:00.609829Z","shell.execute_reply.started":"2024-12-20T09:44:20.174789Z","shell.execute_reply":"2024-12-20T09:49:00.608586Z"}},"outputs":[{"name":"stdout","text":"Loading checkpoint shards: 100%|โโโโโโโโโโโโโโโโ| 17/17 [04:04<00:00, 14.41s/it]\nTraceback (most recent call last):\n File \"/kaggle/working/run_embedder.py\", line 104, in <module>\n d = torch.load(lora_path, map_location=model.device, weights_only=True)\n File \"/opt/conda/lib/python3.10/site-packages/torch/serialization.py\", line 1065, in load\n with _open_file_like(f, 'rb') as opened_file:\n File \"/opt/conda/lib/python3.10/site-packages/torch/serialization.py\", line 468, in _open_file_like\n return _open_file(name_or_buffer, mode)\n File \"/opt/conda/lib/python3.10/site-packages/torch/serialization.py\", line 449, in __init__\n super().__init__(open(name, mode))\nFileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/embedder-lora-v6/transformers/default/1/adapter.bin'\n","output_type":"stream"}],"execution_count":23},{"cell_type":"code","source":"sim1 = np.load('/kaggle/input/sub-embedder-reranker-ensemble-v3/similarity1.npy')\nsim2 = np.load('/kaggle/input/sub-embedder-reranker-ensemble-v3/similarity2.npy')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:00.480906Z","iopub.execute_input":"2024-12-20T09:53:00.481486Z","iopub.status.idle":"2024-12-20T09:53:00.493772Z","shell.execute_reply.started":"2024-12-20T09:53:00.481446Z","shell.execute_reply":"2024-12-20T09:53:00.492868Z"}},"outputs":[],"execution_count":28},{"cell_type":"code","source":"for i in range(sim1.shape[0]):\n sim1[i, zeros] *= 1.5\n sim2[i, zeros] *= 1.5","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:01.884017Z","iopub.execute_input":"2024-12-20T09:53:01.884376Z","iopub.status.idle":"2024-12-20T09:53:01.891969Z","shell.execute_reply.started":"2024-12-20T09:53:01.884344Z","shell.execute_reply":"2024-12-20T09:53:01.891197Z"}},"outputs":[],"execution_count":29},{"cell_type":"code","source":"top_n_indices1 = np.argsort(sim1, axis=1)[:, ::-1][:, :25]\ntop_n_indices2 = np.argsort(sim2, axis=1)[:, ::-1][:, :25]","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:02.201726Z","iopub.execute_input":"2024-12-20T09:53:02.202248Z","iopub.status.idle":"2024-12-20T09:53:02.210604Z","shell.execute_reply.started":"2024-12-20T09:53:02.202219Z","shell.execute_reply":"2024-12-20T09:53:02.209640Z"}},"outputs":[],"execution_count":30},{"cell_type":"code","source":"df = pd.read_parquet('df_preprocessed.parquet')\n\ndf['top_indices1'] = top_n_indices1.tolist()\ndf['top_indices2'] = top_n_indices2.tolist()\ndf.to_parquet('df_embedded.parquet', index=False)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:02.572997Z","iopub.execute_input":"2024-12-20T09:53:02.573307Z","iopub.status.idle":"2024-12-20T09:53:02.639880Z","shell.execute_reply.started":"2024-12-20T09:53:02.573277Z","shell.execute_reply":"2024-12-20T09:53:02.639199Z"}},"outputs":[],"execution_count":31},{"cell_type":"markdown","source":"# Reranker 1","metadata":{}},{"cell_type":"code","source":"import re\n\ndef get_candidates(c_indices):\n candidates = []\n\n mis_names = misconceptions[\"MisconceptionName\"].values\n for ix in c_indices:\n c_names = []\n for i, name in enumerate(mis_names[ix]):\n # A, B, C...\n c_names.append(f\"{chr(65 + i)}. {name}\")\n\n candidates.append(\"\\n\".join(c_names))\n\n return candidates\n\ndef preprocess_text(x):\n x = re.sub(r\"http\\w+\", '', x) # Delete URL\n x = re.sub(r\"\\.+\", \".\", x) # Replace consecutive periods with a single period\n x = re.sub(r\"\\,+\", \",\", x) # Replace consecutive commas with a single comma\n x = re.sub(r\"\\\\\\\\\", r\"\\\\\", x) # Normalize multiple backslashes to double backslashes\n x = re.sub(r\"[ ]{2,}\", \" \", x) # Replace multiple spaces with a single space\n x = x.strip() # Remove empty characters at the beginning and end\n return x\n\nPROMPT = \"\"\"Here is a question about {ConstructName}({SubjectName}).\nQuestion: {Question}\nCorrect Answer: {CorrectAnswer}\nIncorrect Answer: {IncorrectAnswer}\n\nYou are a Mathematics teacher. Your task is to reason and identify the misconception behind the Incorrect Answer with the Question.\nChoose the most appropriate letter corresponding to the misconception from the options below:\n\n{Retrival}\n\"\"\"\n\ndef preprocess_row(row):\n conversations = [\n {\n \"role\": \"user\",\n \"content\": preprocess_text(\n PROMPT.format(\n ConstructName=row[\"ConstructName\"],\n SubjectName=row[\"SubjectName\"],\n Question=row[\"QuestionText\"],\n CorrectAnswer=row[\"CorrectAnswerText\"],\n IncorrectAnswer=row[\"AnswerText\"],\n Retrival=row[\"retrieval\"]\n )\n )\n }\n ]\n return conversations","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:05.068502Z","iopub.execute_input":"2024-12-20T09:53:05.068841Z","iopub.status.idle":"2024-12-20T09:53:05.077336Z","shell.execute_reply.started":"2024-12-20T09:53:05.068813Z","shell.execute_reply":"2024-12-20T09:53:05.076413Z"}},"outputs":[],"execution_count":32},{"cell_type":"code","source":"def process_dataframe(top_indices_column, output_file, top_k):\n df = pd.read_parquet('df_embedded.parquet')\n top_indices = np.array(df[top_indices_column].tolist())[:, :top_k]\n df['retrieval'] = get_candidates(top_indices)\n df['conversations'] = df.apply(preprocess_row, axis=1)\n df['TOP_K'] = top_k\n df['id'] = range(len(df))\n df.to_parquet(output_file, index=False)\n print(df['conversations'][0][0]['content'])\n print('-'*50)\n\n# ๅฎๆฐใฎ่จญๅฎ\nTOP_K = 25\n\n# ใใกใคใซๅฆ็ใฎๅผใณๅบใ\nprocess_dataframe('top_indices1', 'df_test1.parquet', TOP_K)\nprocess_dataframe('top_indices2', 'df_test2.parquet', TOP_K)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:31.032546Z","iopub.execute_input":"2024-12-20T09:53:31.032910Z","iopub.status.idle":"2024-12-20T09:53:31.061895Z","shell.execute_reply.started":"2024-12-20T09:53:31.032879Z","shell.execute_reply":"2024-12-20T09:53:31.061014Z"}},"outputs":[{"name":"stdout","text":"Here is a question about Use the order of operations to carry out calculations involving powers(BIDMAS).\nQuestion: \\[\n3 \\times 2+4-5\n\\]\nWhere do the brackets need to go to make the answer equal \\( 13 \\) ?\nCorrect Answer: \\( 3 \\times(2+4)-5 \\)\nIncorrect Answer: \\( 3 \\times 2+(4-5) \\)\n\nYou are a Mathematics teacher. Your task is to reason and identify the misconception behind the Incorrect Answer with the Question.\nChoose the most appropriate letter corresponding to the misconception from the options below:\n\nA. Inserts brackets but not changed order of operation\nB. May have made a calculation error using the order of operations\nC. Carries out operations from left to right regardless of priority order, unless brackets are used\nD. Confuses the order of operations, believes subtraction comes before multiplication \nE. Has removed brackets but not performed the operation\nF. Has not realised that the answer may be changed by the insertion of brackets\nG. Thinks the subtraction sign means multiply\nH. Does not perform calculations in the numerator of a fraction before division by the denominator\nI. Applies BIDMAS in strict order (does not realize addition and subtraction, and multiplication and division, are of equal priority)\nJ. Uses addition instead of the associative property of multiplication\nK. Carries out operations from left to right regardless of priority order\nL. Thinks a divide and a negative sign next to each other makes a plus\nM. Believes order of operations does not affect the answer to a calculation\nN. Done a different calculation to the one given\nO. Does not include brackets when required\nP. When there's a negative sign in the question, thinks the answer must be negative\nQ. Thinks a divide and a negative sign next to each other makes a minus\nR. Thinks multiplication and addition are the same\nS. Carries out operations from right to left regardless of priority order\nT. Answers order of operations questions with brackets as if the brackets are not there\nU. Doesn't recognise commutativity of addition with negative numbers\nV. Confuses the order of operations, believes addition comes before multiplication \nW. Believes a subtraction cannot be partitioned into separate subtractions\nX. Assumes the negative sign in a power has no meaning\nY. When factorising into double brackets, finds the correct values for the non variable terms but swops the plus and minus sign\n--------------------------------------------------\nHere is a question about Use the order of operations to carry out calculations involving powers(BIDMAS).\nQuestion: \\[\n3 \\times 2+4-5\n\\]\nWhere do the brackets need to go to make the answer equal \\( 13 \\) ?\nCorrect Answer: \\( 3 \\times(2+4)-5 \\)\nIncorrect Answer: \\( 3 \\times 2+(4-5) \\)\n\nYou are a Mathematics teacher. Your task is to reason and identify the misconception behind the Incorrect Answer with the Question.\nChoose the most appropriate letter corresponding to the misconception from the options below:\n\nA. Inserts brackets but not changed order of operation\nB. May have made a calculation error using the order of operations\nC. Carries out operations from left to right regardless of priority order, unless brackets are used\nD. Uses addition instead of the associative property of multiplication\nE. Confuses the order of operations, believes subtraction comes before multiplication \nF. Believes that adding a positive to a negative makes your answer more negative\nG. Has removed brackets but not performed the operation\nH. Doesn't recognise commutativity of addition with negative numbers\nI. When there's a negative sign in the question, thinks the answer must be negative\nJ. Has not realised that the answer may be changed by the insertion of brackets\nK. Done a different calculation to the one given\nL. When a subtraction of one positive number from another, results in a negative answer, they believe the answer is the sum of those 2 numbers with a negative sign put in front.\nM. Does not know the distributive property\nN. When two digits multiply to 10 or more during a multiplication problem, does not add one to the preceding digit\nO. Thinks multiplication and addition are the same\nP. Believes a subtraction cannot be partitioned into separate subtractions\nQ. When factorising into double brackets, finds the correct values for the non variable terms but swops the plus and minus sign\nR. Does not understand that adding on to a multiple can give you the next multiple\nS. Confuses the direction of vectors when adding or subtracting\nT. When adding negatives believes that they can just add their absolute values and add a negative sign to the answer\nU. Thinks the multiplication sign means to add\nV. Does not understand the question\nW. Believes that if one number in a product decreases, the answer must increase. \nX. Thinks the inverse of multiplication is addition\nY. Tries to add or subtract unlike terms\n--------------------------------------------------\n","output_type":"stream"}],"execution_count":35},{"cell_type":"code","source":"%%writefile run_reranker1.py\n\nimport argparse\nimport time\nfrom tqdm import tqdm\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nimport random\nimport numpy as np\nimport pandas as pd\nimport torch\nfrom transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed\nfrom transformers.data.data_collator import pad_without_fast_tokenizer_warning\nfrom peft import PeftModel\n\n# ๅบๅฎใใฉใกใผใฟ\nMODEL_DIR = '/kaggle/input/qwen2-5-32b-instruct-bnb-4bit'\nMAX_LENGTH = 1536\nBATCH_SIZE = 1\n\nset_seed(42)\n\ndef tokenize(tokenizer, conversations, max_length=MAX_LENGTH):\n texts = []\n for messages in conversations:\n text = tokenizer.apply_chat_template(\n messages,\n tokenize=False,\n add_generation_prompt=True\n )\n texts.append(text)\n\n tokenized = tokenizer(texts, add_special_tokens=False, max_length=max_length, truncation=True)\n input_ids = tokenized.input_ids\n attention_mask = tokenized.attention_mask\n return input_ids, attention_mask\n\[email protected]_grad()\[email protected]('cuda')\ndef inference(df, model, label_idx, batch_size=BATCH_SIZE, max_length=MAX_LENGTH):\n scores = []\n\n # ใใใใใจใซๅฆ็ใ่กใใซใผใ\n for start_idx in tqdm(range(0, len(df), batch_size)):\n end_idx = min(start_idx + batch_size, len(df))\n tmp = df.iloc[start_idx:end_idx]\n input_ids = tmp[\"input_ids\"].to_list()\n attention_mask = tmp[\"attention_mask\"].to_list()\n \n # ๅ
ฅๅใฎใใใฃใณใฐๅฆ็\n inputs = pad_without_fast_tokenizer_warning(\n tokenizer,\n {\"input_ids\": input_ids, \"attention_mask\": attention_mask},\n padding=\"longest\",\n pad_to_multiple_of=None,\n return_tensors=\"pt\",\n )\n \n # ใขใใซๆจ่ซ\n outputs = model(**inputs.to(model.device))\n proba = outputs.logits[:, -1, label_idx].softmax(-1).cpu()\n scores.extend(proba.tolist())\n \n df['score'] = scores\n return df\n\nif __name__ == \"__main__\":\n # ใณใใณใใฉใคใณๅผๆฐใๅฎ็พฉ\n parser = argparse.ArgumentParser(description=\"Run reranker inference.\")\n parser.add_argument(\"--lora_dir1\", type=str, required=True)\n parser.add_argument(\"--input_path\", type=str, required=True)\n parser.add_argument(\"--output_path\", type=str, required=True)\n args = parser.parse_args()\n\n # ่จญๅฎ\n lora_dir1 = args.lora_dir1\n input_path = args.input_path\n output_path = args.output_path\n\n # ใใผใฏใใคใถใผใจใขใใซใฎ่ชญใฟ่พผใฟ\n tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)\n model = AutoModelForCausalLM.from_pretrained(\n MODEL_DIR,\n device_map='auto',\n use_cache=True,\n )\n\n # LoRAใขใใใฟใผใฎใญใผใ\n model = PeftModel.from_pretrained(model, lora_dir1, adapter_name=\"lora1\")\n\n # ใใผใฟ่ชญใฟ่พผใฟ\n df_test = pd.read_parquet(input_path)\n TOP_K = df_test['TOP_K'][0]\n label_idx = [tokenizer(f'{chr(65 + i)}', add_special_tokens=False)['input_ids'][-1] for i in range(TOP_K)]\n print(tokenizer.decode(label_idx))\n\n # ใใผใฟใฎใใผใฏใณๅ\n data = pd.DataFrame()\n data[\"id\"] = df_test[\"id\"]\n data[\"input_ids\"], data[\"attention_mask\"] = tokenize(tokenizer, df_test[\"conversations\"])\n data[\"length\"] = data[\"input_ids\"].apply(len)\n data = data.sort_values(\"length\", ascending=False).reset_index(drop=True)\n\n # ๆจ่ซใจไฟๅญ\n score_df = inference(data, model, label_idx).sort_values('id').reset_index(drop=True)\n score_df.to_parquet(output_path, index=False)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:32.583830Z","iopub.execute_input":"2024-12-20T09:53:32.584170Z","iopub.status.idle":"2024-12-20T09:53:32.591128Z","shell.execute_reply.started":"2024-12-20T09:53:32.584142Z","shell.execute_reply":"2024-12-20T09:53:32.590270Z"}},"outputs":[{"name":"stdout","text":"Overwriting run_reranker1.py\n","output_type":"stream"}],"execution_count":36},{"cell_type":"code","source":"!python run_reranker1.py \\\n --lora_dir1 /kaggle/input/reranker1-lora-v33/transformers/default/1 \\\n --input_path df_test1.parquet \\\n --output_path score_df1.parquet","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:33.050970Z","iopub.execute_input":"2024-12-20T09:53:33.051808Z","iopub.status.idle":"2024-12-20T09:53:42.227652Z","shell.execute_reply.started":"2024-12-20T09:53:33.051772Z","shell.execute_reply":"2024-12-20T09:53:42.226528Z"}},"outputs":[{"name":"stdout","text":"Traceback (most recent call last):\n File \"/opt/conda/lib/python3.10/site-packages/transformers/utils/hub.py\", line 402, in cached_file\n resolved_file = hf_hub_download(\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py\", line 101, in inner_f\n return f(*args, **kwargs)\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py\", line 106, in _inner_fn\n validate_repo_id(arg_value)\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py\", line 154, in validate_repo_id\n raise HFValidationError(\nhuggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/kaggle/input/qwen2-5-32b-instruct-bnb-4bit'. Use `repo_type` argument if needed.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/kaggle/working/run_reranker1.py\", line 80, in <module>\n tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py\", line 834, in from_pretrained\n tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py\", line 666, in get_tokenizer_config\n resolved_config_file = cached_file(\n File \"/opt/conda/lib/python3.10/site-packages/transformers/utils/hub.py\", line 466, in cached_file\n raise EnvironmentError(\nOSError: Incorrect path_or_model_id: '/kaggle/input/qwen2-5-32b-instruct-bnb-4bit'. Please provide either the path to a local folder or the repo_id of a model on the Hub.\n","output_type":"stream"}],"execution_count":37},{"cell_type":"code","source":"!python run_reranker1.py \\\n --lora_dir1 /kaggle/input/reranker1-lora-v65/transformers/default/1 \\\n --input_path df_test2.parquet \\\n --output_path score_df2.parquet","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:42.229720Z","iopub.execute_input":"2024-12-20T09:53:42.230036Z","iopub.status.idle":"2024-12-20T09:53:50.356546Z","shell.execute_reply.started":"2024-12-20T09:53:42.230006Z","shell.execute_reply":"2024-12-20T09:53:50.355692Z"}},"outputs":[{"name":"stdout","text":"Traceback (most recent call last):\n File \"/opt/conda/lib/python3.10/site-packages/transformers/utils/hub.py\", line 402, in cached_file\n resolved_file = hf_hub_download(\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py\", line 101, in inner_f\n return f(*args, **kwargs)\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py\", line 106, in _inner_fn\n validate_repo_id(arg_value)\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py\", line 154, in validate_repo_id\n raise HFValidationError(\nhuggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/kaggle/input/qwen2-5-32b-instruct-bnb-4bit'. Use `repo_type` argument if needed.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/kaggle/working/run_reranker1.py\", line 80, in <module>\n tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py\", line 834, in from_pretrained\n tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py\", line 666, in get_tokenizer_config\n resolved_config_file = cached_file(\n File \"/opt/conda/lib/python3.10/site-packages/transformers/utils/hub.py\", line 466, in cached_file\n raise EnvironmentError(\nOSError: Incorrect path_or_model_id: '/kaggle/input/qwen2-5-32b-instruct-bnb-4bit'. Please provide either the path to a local folder or the repo_id of a model on the Hub.\n","output_type":"stream"}],"execution_count":38},{"cell_type":"markdown","source":"# Submit","metadata":{}},{"cell_type":"code","source":"df_embedded = pd.read_parquet('df_embedded.parquet')\nscore_df1 = pd.read_parquet(\"/kaggle/input/sub-embedder-reranker-ensemble-v3/score_df1.parquet\")\nscore_df2 = pd.read_parquet(\"/kaggle/input/sub-embedder-reranker-ensemble-v3/score_df2.parquet\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:55:05.933245Z","iopub.execute_input":"2024-12-20T09:55:05.934133Z","iopub.status.idle":"2024-12-20T09:55:05.954696Z","shell.execute_reply.started":"2024-12-20T09:55:05.934084Z","shell.execute_reply":"2024-12-20T09:55:05.954066Z"}},"outputs":[],"execution_count":40},{"cell_type":"code","source":"# top_indices1 = np.array(df_embedded['top_indices1'].tolist())\n# top_indices2 = np.array(df_embedded['top_indices2'].tolist())\n\n# scores1 = np.array(score_df1['score'].tolist())\n# scores2 = np.array(score_df2['score'].tolist())\n\n# # ้
ๅใ็ตๅ\n# indices_combined = np.concatenate([top_indices1, top_indices2], axis=1) # 2ๆฌกๅ
ใงๅๆนๅใซ็ตๅ\n# scores_combined = np.concatenate([scores1, scores2], axis=1) # ๅใใๅๆนๅใซ็ตๅ\n\n# # ๅ่กใงในใณใข้้ ใซใฝใผใ\n# sorted_indices = np.argsort(-scores_combined, axis=1) # ในใณใขใฎ้้ ใฝใผใใคใณใใใฏใน\n# sorted_top_indices = np.take_along_axis(indices_combined, sorted_indices, axis=1)\n\n# # ้่คใๅ้คใใฆๆ็ต็ตๆใๅๅพ\n# final_indices = []\n# for row in sorted_top_indices:\n# unique_row = np.unique(row, return_index=True)[1] # ้่คใ้คๅค\n# final_indices.append(row[np.sort(unique_row)][:25]) # ๅ
ใฎ้ ๅบใ็ถญๆ\n\n# final_indices = np.array(final_indices).tolist()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:55:09.262137Z","iopub.execute_input":"2024-12-20T09:55:09.262992Z","iopub.status.idle":"2024-12-20T09:55:09.267440Z","shell.execute_reply.started":"2024-12-20T09:55:09.262954Z","shell.execute_reply":"2024-12-20T09:55:09.266515Z"}},"outputs":[],"execution_count":41},{"cell_type":"code","source":"top_indices1 = np.array(df_embedded['top_indices1'].tolist())\ntop_indices2 = np.array(df_embedded['top_indices2'].tolist())\n\nscores1 = np.array(score_df1['score'].tolist())\nscores2 = np.array(score_df2['score'].tolist())\n\n# ใใใใใฎ่กใใจใซๅฆ็\nfinal_indices = []\nfor i in range(len(top_indices1)):\n # ็พ่กใฎ่กใฎindexใจในใณใข\n row_indices1 = top_indices1[i]\n row_scores1 = scores1[i]\n row_indices2 = top_indices2[i]\n row_scores2 = scores2[i]\n\n # ใคใณใใใฏในใจในใณใขใใพใจใใใใใฎ่พๆธ (index -> score)\n combined_dict = {}\n\n # indices1, scores1ใ่ฟฝๅ \n for idx, s in zip(row_indices1, row_scores1):\n combined_dict[idx] = s\n\n # indices2, scores2ใ่ฟฝๅ ใๆขใซๅญๅจใใindexใชใๅนณๅๅ\n for idx, s in zip(row_indices2, row_scores2):\n if idx in combined_dict:\n combined_dict[idx] = (combined_dict[idx] + s) / 2.0\n else:\n combined_dict[idx] = s\n\n # `zeros` ใซๅซใพใใใคใณใใใฏในใฎในใณใขใ 1.3 ๅ\n for idx in zeros:\n if idx in combined_dict:\n combined_dict[idx] *= 1.3\n\n # ่พๆธใใ(ใคใณใใใฏใน, ในใณใข)ใฟใใซใฎใชในใใไฝๆใใในใณใข้้ ใซใฝใผใ\n combined_list = sorted(combined_dict.items(), key=lambda x: x[1], reverse=True)\n\n top_index = combined_list[0][0] # ๊ฐ์ฅ ๋์ ์ ์์ ์ธ๋ฑ์ค ํ๋๋ง ์ถ์ถ\n final_indices.append([top_index])\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:55:09.682176Z","iopub.execute_input":"2024-12-20T09:55:09.682912Z","iopub.status.idle":"2024-12-20T09:55:09.691523Z","shell.execute_reply.started":"2024-12-20T09:55:09.682883Z","shell.execute_reply":"2024-12-20T09:55:09.690819Z"}},"outputs":[],"execution_count":42},{"cell_type":"code","source":"for idx in final_indices:\n misconception_name = misconceptions.iloc[idx]['MisconceptionName']\n print(f\"Misconception Index: {idx}, Misconception: {misconception_name}\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:55:10.219923Z","iopub.execute_input":"2024-12-20T09:55:10.220223Z","iopub.status.idle":"2024-12-20T09:55:10.231268Z","shell.execute_reply.started":"2024-12-20T09:55:10.220184Z","shell.execute_reply":"2024-12-20T09:55:10.230332Z"}},"outputs":[{"name":"stdout","text":"Misconception Index: [1005], Misconception: 1005 Carries out operations from left to right rega...\nName: MisconceptionName, dtype: object\nMisconception Index: [1507], Misconception: 1507 Carries out operations from left to right rega...\nName: MisconceptionName, dtype: object\nMisconception Index: [1507], Misconception: 1507 Carries out operations from left to right rega...\nName: MisconceptionName, dtype: object\nMisconception Index: [1755], Misconception: 1755 Incorrectly factorises a quadratic\nName: MisconceptionName, dtype: object\nMisconception Index: [2398], Misconception: 2398 Thinks you can divide terms by different facto...\nName: MisconceptionName, dtype: object\nMisconception Index: [1755], Misconception: 1755 Incorrectly factorises a quadratic\nName: MisconceptionName, dtype: object\nMisconception Index: [1287], Misconception: 1287 Believes if you changed all values by the same...\nName: MisconceptionName, dtype: object\nMisconception Index: [1287], Misconception: 1287 Believes if you changed all values by the same...\nName: MisconceptionName, dtype: object\nMisconception Index: [1073], Misconception: 1073 Believes if you add the same value to all numb...\nName: MisconceptionName, dtype: object\n","output_type":"stream"}],"execution_count":43},{"cell_type":"code","source":"from transformers import AutoTokenizer, AutoModelForCausalLM\nfrom peft import PeftModel\n\nMODEL_DIR = '/kaggle/input/qwen2.5-32b/transformers/default/1'\nLORA_DIR = '/kaggle/input/2211-lora-14b/transformers/default/1'\n\n# ๋ชจ๋ธ๊ณผ ํ ํฌ๋์ด์ ๋ก๋\ntokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)\nmodel = AutoModelForCausalLM.from_pretrained(MODEL_DIR, device_map='auto')\nmodel = PeftModel.from_pretrained(model, LORA_DIR, adapter_name=\"lora1\")\n\n# ๋ชจ๋ธ ์ ์ฅ\noutput_path = './exported_model'\nmodel.save_pretrained(output_path)\ntokenizer.save_pretrained(output_path)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T10:15:51.442501Z","iopub.execute_input":"2024-12-20T10:15:51.443391Z"}},"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/accelerate/utils/modeling.py:1462: UserWarning: Current model requires 7113553152 bytes of buffer for offloaded layers, which seems does not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using offload_buffers=True.\n warnings.warn(\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Loading checkpoint shards: 0%| | 0/17 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a93e599f995247b9a6835657738b6434"}},"metadata":{}}],"execution_count":null},{"cell_type":"code","source":"import torch\nfrom transformers import AutoTokenizer, AutoModelForCausalLM\n\n# ๋ชจ๋ธ๊ณผ ํ ํฌ๋์ด์ ๋ก๋\nmodel_path = './exported_model'\ntokenizer = AutoTokenizer.from_pretrained(model_path)\nmodel = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto')\n\ndef predict_misconception(question, correct_answer, incorrect_answer, construct_name, subject_name):\n prompt = f\"\"\"Here is a question about {construct_name} ({subject_name}).\nQuestion: {question}\nCorrect Answer: {correct_answer}\nIncorrect Answer: {incorrect_answer}\n\nYou are a Mathematics teacher. Your task is to reason and identify the misconception behind the Incorrect Answer with the Question.\nChoose the most appropriate letter corresponding to the misconception from the options below:\n\"\"\"\n inputs = tokenizer(prompt, return_tensors='pt').to(model.device)\n outputs = model.generate(**inputs, max_new_tokens=50)\n result = tokenizer.decode(outputs[0], skip_special_tokens=True)\n return result\n\n# ์์ ํ
์คํธ\nquestion = \"3 ร 2 + 4 โ 5\\nWhere do the brackets need to go to make the answer equal 13?\"\ncorrect_answer = \"3 ร (2 + 4) โ 5\"\nincorrect_answer = \"3 ร 2 + (4 โ 5)\"\nconstruct_name = \"Use the order of operations to carry out calculations involving powers\"\nsubject_name = \"BIDMAS\"\n\nprediction = predict_misconception(question, correct_answer, incorrect_answer, construct_name, subject_name)\nprint(prediction)\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null}]}
|
src/SecondModule/2ndModule.md
ADDED
File without changes
|
src/SecondModule/__pycache__/module2.cpython-311.pyc
ADDED
Binary file (9.34 kB). View file
|
|
src/SecondModule/__pycache__/module2.cpython-39.pyc
ADDED
Binary file (7.28 kB). View file
|
|
src/SecondModule/__pycache__/module2_ori.cpython-39.pyc
ADDED
Binary file (6.84 kB). View file
|
|
src/SecondModule/misconception_mapping.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
src/SecondModule/module2.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import requests
|
3 |
+
from typing import Tuple, Optional
|
4 |
+
from dataclasses import dataclass
|
5 |
+
import logging
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
import os
|
8 |
+
|
9 |
+
# Set up logging
|
10 |
+
logging.basicConfig(level=logging.INFO)
|
11 |
+
logger = logging.getLogger(__name__)
|
12 |
+
|
13 |
+
# .env ํ์ผ ๋ก๋
|
14 |
+
load_dotenv()
|
15 |
+
|
16 |
+
# Hugging Face API ์ ๋ณด
|
17 |
+
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
|
18 |
+
API_KEY = os.getenv("HUGGINGFACE_API_KEY")
|
19 |
+
|
20 |
+
base_path = os.path.dirname(os.path.abspath(__file__))
|
21 |
+
misconception_csv_path = os.path.join(base_path, 'misconception_mapping.csv')
|
22 |
+
|
23 |
+
if not API_KEY:
|
24 |
+
raise ValueError("API_KEY๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค. .env ํ์ผ์ ํ์ธํ์ธ์.")
|
25 |
+
|
26 |
+
#์ ์ฌ ๋ฌธ์ ์์ฑ๊ธฐ ํด๋์ค
|
27 |
+
|
28 |
+
@dataclass
|
29 |
+
class GeneratedQuestion:
|
30 |
+
question: str
|
31 |
+
choices: dict
|
32 |
+
correct_answer: str
|
33 |
+
explanation: str
|
34 |
+
|
35 |
+
class SimilarQuestionGenerator:
|
36 |
+
def __init__(self, misconception_csv_path: str = 'misconception_mapping.csv'):
|
37 |
+
"""
|
38 |
+
Initialize the generator by loading the misconception mapping and the language model.
|
39 |
+
"""
|
40 |
+
self._load_data(misconception_csv_path)
|
41 |
+
|
42 |
+
def _load_data(self, misconception_csv_path: str):
|
43 |
+
logger.info("Loading misconception mapping...")
|
44 |
+
self.misconception_df = pd.read_csv(misconception_csv_path)
|
45 |
+
|
46 |
+
def get_misconception_text(self, misconception_id: float) -> Optional[str]:
|
47 |
+
# MisconceptionId๋ฅผ ๋ฐ์ ํด๋น ID์ ๋งค์นญ๋๋ ์ค๊ฐ๋
์ค๋ช
ํ
์คํธ๋ฅผ ๋ฐํํฉ๋๋ค
|
48 |
+
"""Retrieve the misconception text based on the misconception ID."""
|
49 |
+
if pd.isna(misconception_id): # NaN ์ฒดํฌ
|
50 |
+
logger.warning("Received NaN for misconception_id.")
|
51 |
+
return "No misconception provided."
|
52 |
+
|
53 |
+
try:
|
54 |
+
row = self.misconception_df[self.misconception_df['MisconceptionId'] == int(misconception_id)]
|
55 |
+
if not row.empty:
|
56 |
+
return row.iloc[0]['MisconceptionName']
|
57 |
+
except ValueError as e:
|
58 |
+
logger.error(f"Error processing misconception_id: {e}")
|
59 |
+
|
60 |
+
logger.warning(f"No misconception found for ID: {misconception_id}")
|
61 |
+
return "Misconception not found."
|
62 |
+
|
63 |
+
def generate_prompt(self, construct_name: str, subject_name: str, question_text: str, correct_answer_text: str, wrong_answer_text: str, misconception_text: str) -> str:
|
64 |
+
"""Create a prompt for the language model."""
|
65 |
+
#๋ฌธ์ ์์ฑ์ ์ํ ํ๋กฌํํธ ํ
์คํธ๋ฅผ ์์ฑ
|
66 |
+
logger.info("Generating prompt...")
|
67 |
+
misconception_clause = (f"that targets the following misconception: \"{misconception_text}\"." if misconception_text != "There is no misconception" else "")
|
68 |
+
prompt = f"""
|
69 |
+
<|begin_of_text|>
|
70 |
+
<|start_header_id|>system<|end_header_id|>
|
71 |
+
You are an educational assistant designed to generate multiple-choice questions {misconception_clause}
|
72 |
+
<|eot_id|>
|
73 |
+
<|start_header_id|>user<|end_header_id|>
|
74 |
+
You need to create a similar multiple-choice question based on the following details:
|
75 |
+
|
76 |
+
Construct Name: {construct_name}
|
77 |
+
Subject Name: {subject_name}
|
78 |
+
Question Text: {question_text}
|
79 |
+
Correct Answer: {correct_answer_text}
|
80 |
+
Wrong Answer: {wrong_answer_text}
|
81 |
+
|
82 |
+
Please follow this output format:
|
83 |
+
---
|
84 |
+
Question: <Your Question Text>
|
85 |
+
A) <Choice A>
|
86 |
+
B) <Choice B>
|
87 |
+
C) <Choice C>
|
88 |
+
D) <Choice D>
|
89 |
+
Correct Answer: <Correct Choice (e.g., A)>
|
90 |
+
Explanation: <Brief explanation for the correct answer>
|
91 |
+
---
|
92 |
+
Ensure that the question is conceptually similar but not identical to the original. Ensure clarity and educational value.
|
93 |
+
<|eot_id|>
|
94 |
+
<|start_header_id|>assistant<|end_header_id|>
|
95 |
+
""".strip()
|
96 |
+
logger.debug(f"Generated prompt: {prompt}")
|
97 |
+
return prompt
|
98 |
+
|
99 |
+
def call_model_api(self, prompt: str) -> str:
|
100 |
+
"""Hugging Face API ํธ์ถ"""
|
101 |
+
logger.info("Calling Hugging Face API...")
|
102 |
+
headers = {"Authorization": f"Bearer {API_KEY}"}
|
103 |
+
|
104 |
+
try:
|
105 |
+
response = requests.post(API_URL, headers=headers, json={"inputs": prompt})
|
106 |
+
response.raise_for_status()
|
107 |
+
|
108 |
+
response_data = response.json()
|
109 |
+
logger.debug(f"Raw API response: {response_data}")
|
110 |
+
|
111 |
+
# API ์๋ต์ด ๋ฆฌ์คํธ์ธ ๊ฒฝ์ฐ ์ฒ๋ฆฌ
|
112 |
+
if isinstance(response_data, list):
|
113 |
+
if response_data and isinstance(response_data[0], dict):
|
114 |
+
generated_text = response_data[0].get('generated_text', '')
|
115 |
+
else:
|
116 |
+
generated_text = response_data[0] if response_data else ''
|
117 |
+
# API ์๋ต์ด ๋์
๋๋ฆฌ์ธ ๊ฒฝ์ฐ ์ฒ๋ฆฌ
|
118 |
+
elif isinstance(response_data, dict):
|
119 |
+
generated_text = response_data.get('generated_text', '')
|
120 |
+
else:
|
121 |
+
generated_text = str(response_data)
|
122 |
+
|
123 |
+
logger.info(f"Generated text: {generated_text}")
|
124 |
+
return generated_text
|
125 |
+
|
126 |
+
except requests.exceptions.RequestException as e:
|
127 |
+
logger.error(f"API request failed: {e}")
|
128 |
+
raise
|
129 |
+
except Exception as e:
|
130 |
+
logger.error(f"Unexpected error in call_model_api: {e}")
|
131 |
+
raise
|
132 |
+
def parse_model_output(self, output: str) -> GeneratedQuestion:
|
133 |
+
if not isinstance(output, str):
|
134 |
+
logger.error(f"Invalid output format: {type(output)}. Expected string.")
|
135 |
+
raise ValueError("Model output is not a string.")
|
136 |
+
|
137 |
+
logger.info(f"Parsing output: {output}")
|
138 |
+
output_lines = output.strip().splitlines()
|
139 |
+
logger.debug(f"Split output into lines: {output_lines}")
|
140 |
+
|
141 |
+
question, choices, correct_answer, explanation = "", {}, "", ""
|
142 |
+
|
143 |
+
for line in output_lines:
|
144 |
+
if line.lower().startswith("question:"):
|
145 |
+
question = line.split(":", 1)[1].strip()
|
146 |
+
elif line.startswith("A)"):
|
147 |
+
choices["A"] = line[2:].strip()
|
148 |
+
elif line.startswith("B)"):
|
149 |
+
choices["B"] = line[2:].strip()
|
150 |
+
elif line.startswith("C)"):
|
151 |
+
choices["C"] = line[2:].strip()
|
152 |
+
elif line.startswith("D)"):
|
153 |
+
choices["D"] = line[2:].strip()
|
154 |
+
elif line.lower().startswith("correct answer:"):
|
155 |
+
correct_answer = line.split(":", 1)[1].strip()
|
156 |
+
elif line.lower().startswith("explanation:"):
|
157 |
+
explanation = line.split(":", 1)[1].strip()
|
158 |
+
|
159 |
+
if not question or len(choices) < 4 or not correct_answer or not explanation:
|
160 |
+
logger.warning("Incomplete generated question.")
|
161 |
+
return GeneratedQuestion(question, choices, correct_answer, explanation)
|
162 |
+
|
163 |
+
def generate_similar_question_with_text(self, construct_name: str, subject_name: str, question_text: str, correct_answer_text: str, wrong_answer_text: str, misconception_id: float) -> Tuple[Optional[GeneratedQuestion], Optional[str]]:
|
164 |
+
logger.info("generate_similar_question_with_text initiated")
|
165 |
+
|
166 |
+
# ์์ธ ์ฒ๋ฆฌ ์ถ๊ฐ
|
167 |
+
try:
|
168 |
+
misconception_text = self.get_misconception_text(misconception_id)
|
169 |
+
logger.info(f"Misconception text retrieved: {misconception_text}")
|
170 |
+
except Exception as e:
|
171 |
+
logger.error(f"Error retrieving misconception text: {e}")
|
172 |
+
return None, None
|
173 |
+
|
174 |
+
if not misconception_text:
|
175 |
+
logger.info("Skipping question generation due to lack of misconception.")
|
176 |
+
return None, None
|
177 |
+
|
178 |
+
prompt = self.generate_prompt(construct_name, subject_name, question_text, correct_answer_text, wrong_answer_text, misconception_text)
|
179 |
+
logger.info(f"Generated prompt: {prompt}")
|
180 |
+
|
181 |
+
generated_text = None # ๊ธฐ๋ณธ๊ฐ์ผ๋ก ์ด๊ธฐํ
|
182 |
+
try:
|
183 |
+
logger.info("Calling call_model_api...")
|
184 |
+
generated_text = self.call_model_api(prompt)
|
185 |
+
logger.info(f"Generated text from API: {generated_text}")
|
186 |
+
|
187 |
+
# ํ์ฑ
|
188 |
+
generated_question = self.parse_model_output(generated_text)
|
189 |
+
logger.info(f"Generated question object: {generated_question}")
|
190 |
+
return generated_question, generated_text
|
191 |
+
|
192 |
+
except Exception as e:
|
193 |
+
logger.error(f"Failed to generate question: {e}")
|
194 |
+
logger.debug(f"API output for debugging: {generated_text}")
|
195 |
+
return None, generated_text
|
196 |
+
|
src/ThirdModule/module3.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# module3.py
|
2 |
+
import torch
|
3 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
4 |
+
from typing import Tuple
|
5 |
+
import logging
|
6 |
+
from config import Llama3_8b_PATH
|
7 |
+
import re
|
8 |
+
|
9 |
+
logger = logging.getLogger(__name__)
|
10 |
+
logging.basicConfig(level=logging.INFO)
|
11 |
+
|
12 |
+
class SelfConsistencyChecker:
|
13 |
+
def __init__(self, model_name: str = 'meta-llama/Meta-Llama-3-8B-Instruct'):
|
14 |
+
self._load_model(model_name)
|
15 |
+
|
16 |
+
def _load_model(self, model_name: str):
|
17 |
+
"""Load the language model for self-consistency checking."""
|
18 |
+
logger.info(f"Loading model '{model_name}' from '{Llama3_8b_PATH}' for self-consistency check...")
|
19 |
+
self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=Llama3_8b_PATH, trust_remote_code=True)
|
20 |
+
self.model = AutoModelForCausalLM.from_pretrained(
|
21 |
+
model_name,
|
22 |
+
cache_dir=Llama3_8b_PATH,
|
23 |
+
torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
|
24 |
+
trust_remote_code=True,
|
25 |
+
device_map="auto"
|
26 |
+
)
|
27 |
+
self.model.eval()
|
28 |
+
if torch.cuda.is_available():
|
29 |
+
self.model.to('cuda')
|
30 |
+
logger.info("Model loaded on GPU for self-consistency.")
|
31 |
+
else:
|
32 |
+
logger.info("Model loaded on CPU for self-consistency.")
|
33 |
+
|
34 |
+
def _create_prompt(self, question: str, choices: dict) -> str:
|
35 |
+
"""
|
36 |
+
Create a prompt following the Llama 3 prompt template.
|
37 |
+
"""
|
38 |
+
prompt = f"""
|
39 |
+
<|begin_of_text|>
|
40 |
+
<|start_header_id|>system<|end_header_id|>
|
41 |
+
You are an expert reasoning assistant. Your task is to determine the single most accurate answer (A, B, C, or D) for a multiple-choice question based on the given options.
|
42 |
+
|
43 |
+
Rules:
|
44 |
+
1. Carefully read the question and all options.
|
45 |
+
2. Use logical reasoning to select the best answer.
|
46 |
+
3. Output your answer strictly in the following format: "Answer: [A/B/C/D]"
|
47 |
+
4. Do not provide any explanation or extra information.
|
48 |
+
|
49 |
+
<|eot_id|>
|
50 |
+
<|start_header_id|>user<|end_header_id|>
|
51 |
+
Question: {question}
|
52 |
+
|
53 |
+
Choices:
|
54 |
+
A) {choices['A']}
|
55 |
+
B) {choices['B']}
|
56 |
+
C) {choices['C']}
|
57 |
+
D) {choices['D']}
|
58 |
+
|
59 |
+
Please select the correct answer.
|
60 |
+
<|eot_id|>
|
61 |
+
<|start_header_id|>assistant<|end_header_id|>
|
62 |
+
"""
|
63 |
+
return prompt.strip()
|
64 |
+
|
65 |
+
def _extract_answer(self, text: str) -> str:
|
66 |
+
"""
|
67 |
+
Extract the answer (A, B, C, or D) from the generated text.
|
68 |
+
"""
|
69 |
+
match = re.search(r"Answer:\s*([ABCD])", text, re.IGNORECASE)
|
70 |
+
if match:
|
71 |
+
answer = match.group(1).upper()
|
72 |
+
logger.info(f"Extracted answer: {answer} from text: {text}")
|
73 |
+
return answer
|
74 |
+
logger.warning(f"Failed to extract answer from text: {text}")
|
75 |
+
return ""
|
76 |
+
|
77 |
+
def check_answer(self, question: str, choices: dict, num_inferences: int = 10) -> Tuple[str, str]:
|
78 |
+
"""
|
79 |
+
Perform self-consistency check:
|
80 |
+
- Run inference num_inferences times.
|
81 |
+
- Extract answer each time.
|
82 |
+
- Majority vote the final answer.
|
83 |
+
"""
|
84 |
+
|
85 |
+
prompt = self._create_prompt(question, choices) # ์์ ๋ ํ๋กฌํํธ ์์ฑ
|
86 |
+
answer_counts = {"A": 0, "B": 0, "C": 0, "D": 0}
|
87 |
+
|
88 |
+
inputs = self.tokenizer(prompt, return_tensors='pt')
|
89 |
+
if torch.cuda.is_available():
|
90 |
+
inputs = {k: v.to('cuda') for k, v in inputs.items()}
|
91 |
+
|
92 |
+
for _ in range(num_inferences):
|
93 |
+
with torch.no_grad():
|
94 |
+
outputs = self.model.generate(
|
95 |
+
**inputs,
|
96 |
+
max_new_tokens=50,
|
97 |
+
num_return_sequences=1,
|
98 |
+
temperature=0.7,
|
99 |
+
top_p=0.9,
|
100 |
+
do_sample=True,
|
101 |
+
eos_token_id=self.tokenizer.eos_token_id
|
102 |
+
)
|
103 |
+
generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
104 |
+
predicted_answer = self._extract_answer(generated_text)
|
105 |
+
|
106 |
+
logger.info(f"Generated text: {generated_text}") # ๋ชจ๋ธ์ด ์์ฑํ ํ
์คํธ ํ์ธ
|
107 |
+
logger.info(f"Predicted answer: {predicted_answer}") # ์ถ์ถ๋ ์ ๋ต ํ์ธ
|
108 |
+
|
109 |
+
if predicted_answer in answer_counts:
|
110 |
+
answer_counts[predicted_answer] += 1
|
111 |
+
else:
|
112 |
+
logger.warning(f"Invalid answer extracted: {predicted_answer}")
|
113 |
+
|
114 |
+
# Majority vote
|
115 |
+
final_answer = max(answer_counts, key=answer_counts.get)
|
116 |
+
explanation = f"Answer counts: {answer_counts}. Majority answer: {final_answer}"
|
117 |
+
|
118 |
+
logger.info(f"Answer counts: {answer_counts}")
|
119 |
+
logger.info(f"Final Answer: {final_answer}")
|
120 |
+
|
121 |
+
return final_answer, explanation
|
src/config.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Llama3_8b_PATH = "Your-Model-Path"
|