{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Yc7rdoHKfTGU", "outputId": "d5eb9bfe-7e64-4f44-88f7-113291790cec" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/67.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.5/2.5 MB\u001b[0m \u001b[31m36.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m611.1/611.1 kB\u001b[0m \u001b[31m25.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m56.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.4/44.4 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m284.2/284.2 kB\u001b[0m \u001b[31m17.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.2/95.2 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m57.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.9/50.9 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.6/101.6 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.0/16.0 MB\u001b[0m \u001b[31m61.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.9/55.9 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.0/92.0 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m98.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m80.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m46.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m87.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.5/62.5 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m459.8/459.8 kB\u001b[0m \u001b[31m34.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.0/72.0 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.0/4.0 MB\u001b[0m \u001b[31m94.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.8/454.8 kB\u001b[0m \u001b[31m32.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Building wheel for pypika (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n" ] } ], "source": [ "%pip install --upgrade --quiet langchain_community langchain langchain-core langchain_huggingface langchain-chroma" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "id": "B99BYYo6bHKj" }, "outputs": [], "source": [ "%pip install --upgrade --quiet langchain_google_genai langgraph" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "0h2yNOuOcy-E" }, "outputs": [], "source": [ "# Load the question/answer records from metadata.jsonl\n", "import json\n", "\n", "# One JSON object per line. The encoding is pinned to UTF-8 because the records\n", "# contain non-ASCII text (curly quotes, accented letters) and the platform\n", "# default encoding is not guaranteed to be UTF-8. Blank lines are dropped\n", "# because an empty string is not valid JSON and would make json.loads raise.\n", "with open('/content/metadata.jsonl', 'r', encoding='utf-8') as jsonl_file:\n", "    json_list = [line for line in jsonl_file if line.strip()]\n", "\n", "# Each element of json_QA is the object parsed from one line of the file.\n", "json_QA = [json.loads(json_str) for json_str in json_list]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2gBdvVhsdpCN", "outputId": "16e3f0a5-a7b7-4e2a-d87e-23d83c117a3c" }, "outputs": [ { "data": { "text/plain": [ "{'task_id': 'c61d22de-5f6c-4958-a7f6-5e9707bd3466',\n", " 'Question': 'A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?',\n", " 'Level': 2,\n", " 'Final answer': 'egalitarian',\n", " 'file_name': '',\n", " 'Annotator Metadata': {'Steps': '1. Go to arxiv.org and navigate to the Advanced Search page.\\n2. Enter \"AI regulation\" in the search box and select \"All fields\" from the dropdown.\\n3. Enter 2022-06-01 and 2022-07-01 into the date inputs, select \"Submission date (original)\", and submit the search.\\n4. 
Go through the search results to find the article that has a figure with three axes and labels on each end of the axes, titled \"Fairness in Agreement With European Values: An Interdisciplinary Perspective on AI Regulation\".\\n5. Note the six words used as labels: deontological, egalitarian, localized, standardized, utilitarian, and consequential.\\n6. Go back to arxiv.org\\n7. Find \"Physics and Society\" and go to the page for the \"Physics and Society\" category.\\n8. Note that the tag for this category is \"physics.soc-ph\".\\n9. Go to the Advanced Search page.\\n10. Enter \"physics.soc-ph\" in the search box and select \"All fields\" from the dropdown.\\n11. Enter 2016-08-11 and 2016-08-12 into the date inputs, select \"Submission date (original)\", and submit the search.\\n12. Search for instances of the six words in the results to find the paper titled \"Phase transition from egalitarian to hierarchical societies driven by competition between cognitive and social constraints\", indicating that \"egalitarian\" is the correct answer.',\n", " 'Number of steps': '12',\n", " 'How long did this take?': '8 minutes',\n", " 'Tools': '1. Web browser\\n2. Image recognition tools (to identify and parse a figure with three axes)',\n", " 'Number of tools': '2'}}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "json_QA[0]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wmFYIC2ec_FU", "outputId": "bd7f0444-02f3-4521-dc14-32faaf8abaec" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "==================================================\n", "Task ID: 872bfbb1-9ccf-49f6-8c5f-aa22818ccd66\n", "Question: Which of the fruits shown in the 2008 painting \"Embroidery from Uzbekistan\" were served as part of the October 1949 breakfast menu for the ocean liner that was later used as a floating prop for the film \"The Last Voyage\"? 
Give the items as a comma-separated list, ordering them in clockwise order based on their arrangement in the painting starting from the 12 o'clock position. Use the plural form of each fruit.\n", "Level: 3\n", "Final Answer: pears, bananas\n", "Annotator Metadata: \n", " ├── Steps: \n", " │ ├── 1. Use search engine to search for \"2008 painting Embroidery from Uzbekistan\".\n", " │ ├── 2. Open the top result, a link to the painting's page on the Dayton Art Institute website, and verify that the painting has the specified title and year.\n", " │ ├── 3. Identify the fruits in the painting as watermelon, pear, lemon, and banana, which can be verified by either watching the video on the page or reading its linked transcript.\n", " │ ├── 4. Use search engine to search for \"ocean liner floating prop The Last Voyage\".\n", " │ ├── 5. Note from the results that this ocean liner was the SS Île de France.\n", " │ ├── 6. Use search engine to search for \"October 1949 breakfast menu SS Île de France\".\n", " │ ├── 7. Go to the result that shows the vintage SS Île de France breakfast menu for October 1949.\n", " │ ├── 8. Search the menu for each of the four fruits from the painting, finding \"Pear\" and \"Bananas\" but no matches for \"lemon\" or \"watermelon\".\n", " │ ├── 9. Check the positions of the fruits in the painting to find that the pears come before the bananas in clockwise order starting from the 12 o'clock position.\n", " │ ├── 10. Format the final answer as specified using the correct ordering: pears, bananas\n", " ├── Number of steps: 10\n", " ├── How long did this take?: 6\n", " ├── Tools:\n", " │ ├── 1. Web browser\n", " │ ├── 2. Search engine\n", " │ ├── 3. 
Image recognition and processing tools\n", " └── Number of tools: 3\n", "==================================================\n", "Task ID: 1dcc160f-c187-48c2-b68e-319bd4354f3d\n", "Question: According to Openreview.net, at the NeurIPS 2022 Conference, how many papers by an author named Yuri were accepted with a \"certain\" recommendation?\n", "Level: 2\n", "Final Answer: 3\n", "Annotator Metadata: \n", " ├── Steps: \n", " │ ├── 1. Went to openreview.net.\n", " │ ├── 2. Scroll down and clicked the \"All venues\" link.\n", " │ ├── 3. Clicked \"NeurIPS\".\n", " │ ├── 4. Opened the \"2022\" toggle menu.\n", " │ ├── 5. Clicked \"NeurIPS 2022 Conference\".\n", " │ ├── 6. Opened the top paper.\n", " │ ├── 7. Clicked \"Go to NeurIPS 2022 Conference homepage\".\n", " │ ├── 8. Searched \"Yuri\" in the search box.\n", " │ ├── 9. Opened each of the four papers and checked the Recommendation field.\n", " │ ├── 10. Counted the \"Certain\" recommendations.\n", " ├── Number of steps: 8\n", " ├── How long did this take?: 10 minutes\n", " ├── Tools:\n", " │ ├── 1. Web browser\n", " │ ├── 2. Search engine\n", " └── Number of tools: 2\n", "==================================================\n", "Task ID: 73c1b9fe-ee1d-4cf4-96ca-35c08f97b054\n", "Question: According to the USGS, in what year was the American Alligator first found west of Texas (not including Texas)?\n", "Level: 2\n", "Final Answer: 1954\n", "Annotator Metadata: \n", " ├── Steps: \n", " │ ├── 1. Search the web for “American Alligator USGS”.\n", " │ ├── 2. Click result for the USGS Species Profile.\n", " │ ├── 3. Click “Animated Map”.\n", " │ ├── 4. Click the “Skip years with no recorded sightings” button.\n", " │ ├── 5. Zoom out on the map to better view the whole U.S.\n", " │ ├── 6. Move the slider back to the beginning, then advance it until I see a red dot pop up west of Texas.\n", " │ ├── 7. 
Note the year that the dot appears, 1954.\n", " ├── Number of steps: 7\n", " ├── How long did this take?: 5 minutes\n", " ├── Tools:\n", " │ ├── 1. Search engine\n", " │ ├── 2. Web browser\n", " │ ├── 3. Image recognition\n", " └── Number of tools: 3\n", "==================================================\n" ] } ], "source": [ "import random\n", "\n", "# Number of tasks to preview and the seed used for the draw. A fixed seed makes\n", "# the preview reproducible on re-run; set SAMPLE_SEED to None for a fresh draw.\n", "SAMPLE_SIZE = 3\n", "SAMPLE_SEED = 42\n", "\n", "# A private Random instance leaves the global random state untouched. The size\n", "# is capped because random.sample raises ValueError when asked for more items\n", "# than the population holds.\n", "random_samples = random.Random(SAMPLE_SEED).sample(json_QA, min(SAMPLE_SIZE, len(json_QA)))\n", "for sample in random_samples:\n", "    annotator = sample['Annotator Metadata']\n", "    print(\"=\" * 50)\n", "    print(f\"Task ID: {sample['task_id']}\")\n", "    print(f\"Question: {sample['Question']}\")\n", "    print(f\"Level: {sample['Level']}\")\n", "    print(f\"Final Answer: {sample['Final answer']}\")\n", "    print(\"Annotator Metadata: \")\n", "    print(\" ├── Steps: \")\n", "    # Steps and Tools are single strings with one entry per line.\n", "    for step in annotator['Steps'].split('\\n'):\n", "        print(f\" │ ├── {step}\")\n", "    print(f\" ├── Number of steps: {annotator['Number of steps']}\")\n", "    print(f\" ├── How long did this take?: {annotator['How long did this take?']}\")\n", "    print(\" ├── Tools:\")\n", "    for tool in annotator['Tools'].split('\\n'):\n", "        print(f\" │ ├── {tool}\")\n", "    print(f\" └── Number of tools: {annotator['Number of tools']}\")\n", "print(\"=\" * 50)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MVLsbZq2hDxM", "outputId": "5728459c-1121-44f7-8105-2dcb6f3bc537" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "HUGGING_FACE_API:··········\n" ] } ], "source": [ "import getpass\n", "import os\n", "\n", "# Prompt only when the token is not already in the environment, so re-running\n", "# this cell does not ask again.\n", "if \"HUGGING_FACE_API\" not in os.environ:\n", "    os.environ[\"HUGGING_FACE_API\"] = getpass.getpass(\"HUGGING_FACE_API:\")\n", "\n", "# huggingface_hub looks for the token under HF_TOKEN, not HUGGING_FACE_API, so\n", "# the value is mirrored there. An existing HF_TOKEN is never overridden and an\n", "# empty answer at the prompt is not exported.\n", "if os.environ[\"HUGGING_FACE_API\"]:\n", "    os.environ.setdefault(\"HF_TOKEN\", os.environ[\"HUGGING_FACE_API\"])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 528, "referenced_widgets": [ "08b5069e3fc047fa93cc8b9d25639047", 
"0009811108634c6da85b269f43073d1e", "ec374aa7d68f44249b21f4cab2f9a782", "18282f3935414a339b71797135edae33", "7155aa1ff3744d58a87b5282c8b314d0", "133b62e3ae7140e9acf9a6cebe2416c2", "0b49c5da692a4b1aa75216b0c49e1467", "158b372bb30d4d74a40230981c802a44", "97562a5865f74bd58dacc3d1b9ece1cf", "27788ea8732042e595f95fefac2ea0b2", "135533907bc6445884d23aefbc116345", "128c912e9b854111aa9ea61d714102bc", "a16fd07d940d46ee8c82d03ca7bba4f2", "a38a71761f29432687a3044749fcd466", "a1cad0ba7eab46918e7b51a655691382", "8cc7ecdbdf99458da0469f41c4c6dbaa", "78d0f050829148aab8a6d77d965a19e6", "c00f291df5814bfeb7b1a4b92d864def", "deafcff263294058a400d83bd4b37632", "d4e6e5aac4df4f55b2847f125baca737", "848f90f41a7f406baf3f4428d976fc6a", "d18b1a7c561248e8b7c444b6b49e2113", "74deb69ad55a4e988b3539299af407ba", "c9161ea6e77c40a2aaf8621351a01824", "6eab7ca64ed8484ea9ac20e2b7f1224a", "d718ad7aba124599a02bae760dda053d", "2446d9a6f12f4c5e8de161b9c51c37fc", "dbe14cc9095c46198336df799a540acd", "2eda49778017494f83c351612558e67f", "fa2dba76995d40929b75afc7b63b0023", "a71e84ad5b9c4946ab4e54c57f2006da", "6428f789c2bb40b383521384bba34ffb", "22be5141ab714326b21e669305aa6bfc", "b6683bace3a3499e895409fc3ede24a6", "8bf60d1594cc4ae1a06333ec76e5372f", "9c701abcaf2840aa87786c4afefabbfc", "7db6400d6dae45a891d353bb3daa9aac", "253c2e29806843b9a92f467e559ec2db", "6b3883c132b44bb78f05e4f11990336c", "687b7c15ec40457c975746bfaa337468", "552515bf225643169ef8df12a6a0bf7d", "5224a38a0d4f483790fce8f9b8d9ce78", "645276ddc82b4e5ebbc8f6e313132d95", "b282f7e4f6634ba28250745e12012897", "d4e6803e240b42caacb2bdcec471f2f0", "7f4c5c2e820643b887362b0d7367a27b", "e79cf0d925a84f30a6a018284ac01726", "cfad90d4a7e24936be2fc2dab7b027bb", "dcb2a5e3923a44309c4b681a69af3e9a", "7ab735f5355b4958b0b31d6f30694c61", "939de2a5a48745478248fc5dcd25b487", "26f1d41e52fb49eba7a9985e40353ae9", "879b9377a05c492bb57d0102a21f6f4b", "63a96b7ca197423da875f70525e44321", "26286aa987164fafb768f4db07c75763", "80ebb2d58de04f1ea4ba2a6e22a99cdb", 
"5a3bfc463d954b8daf48c77121269c16", "bff19759dd6c4ea3ac530202ceb36923", "29a56aa3f16c4a66bde11b680cf6f1d2", "847aa17730f448a4a3ba36d748c1b1f5", "f25a06c3d7a6437281a6923d6f77f2ec", "0e886fc7596a41f1a4b7f6143998120d", "70de4e10ab9b48858b9379153cbee5bc", "37735be6505e4258b7601df3e92a228c", "4042a2cf7d43490a8aedcea4ab63354a", "77d7d3c8754d4cfda1c0800e279954d7", "56cf15aa0bc3429c896bfbef84a78d3c", "0982232b9d464620904cb07dfabc7b55", "a1398a5eb1814de19bb636c7406b6693", "85bbc3d9100d4726ae827601bfede5b1", "0b9a3e0aeaf544c6a5b30de605b71ba1", "0f07409356b2448b9edef82a98f8fe63", "73fe3bb96ca643029cbc6b786a12e4e1", "c6cb240a81504f7781708ab1308bd8f1", "0f7b93c842754c25949b350f42e99cb3", "175090d3b82f4b75bc6a3e295478b309", "0271460f35f143d18fb027480ef133e3", "1009f9eb518543a380960e580c3dc43d", "e51885126ecd4a4da1b8b58f28bc8036", "a825d9bf5cab4ed09a8c032715a6ffa4", "7b6a94978fbc4f819c735b32c004897f", "48a827a977e24a46bd09baab6c1df7c0", "d07c461503034995a22fcca370e8e804", "3f9c87cce52f4395a11a66dd0200f93e", "d63760fc807c4f888ab37ff552988875", "2e3b24fe5feb4f32a15e54ca1de418b4", "88906c27e02f47e295adf10f07e297d4", "51142ebe7755473abe3624e2d12d8a80", "aa65837796a84e099b126a98ca502bc7", "6681e409b6dd4bfe9253fc4e862c066a", "7f19dd3ed3a34974abf8a796f8cccfe9", "0cbc57fdd4df47fb8a798f36bc87b352", "4a33f7aa485246ffb842d173e08c0193", "0f2f2297bc964e79b4e0a5d17879c27a", "1f521026bb9f499bb294aa18fcb7c3c7", "f0d6e800afcb4fd98c74658c85927846", "eae2e94962094e649370b8f66864244e", "26f681457c5747beaf38b511a3828fb4", "489ff77a5575466a9bd96046ed2700d1", "fb51f0ae287a4d658acffb3fc5dfa239", "33a62956376c4986a1019d6e6e679c47", "e0ae80610a1040fc9e584ac30ea8895e", "c0630796b1204f0ba3e42d0640732220", "e957d8da6e73428492fe18987127f2a3", "02148c6f8e26409380257baf83bc5891", "6c3ce7b336b34740b643f7ee6bd33b17", "bcbed9a4ed0846788c75c6962044f548", "48781881a88d4b1c8335802e65a799bd", "fb52a8504fe1465db029750d4f792cb1", "0b91f512322949b28f8687b02c484d1f", "8eb48bf59c5d47e5a0b94f7536bb7606", 
"271317d4a76b4bc5a192305769b6f17a", "2e9d9803bfd0422399a2f74e0e67af2e", "3fe6d717b64b408d95f54b19dfc34c6f", "2fb6a511cb3f433db486e29b0fd2c0a3", "9e194b0bf39247db8d4dde6a93e59572", "2332f1ff604c4e729c7bf9d9bae1a0d4", "0673a4e393d04339800efc7bc1ed4e0d", "b1b5c297d32f445d8601d1d5e65aa290", "c572df40b17b450a8e6007146de7e509", "ac97ef384a084dbaa6a540d077a71e65" ] }, "id": "hseoRWZxdBRO", "outputId": "7e6835f6-15bd-4f5e-9f62-e91691ce3669" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or datasets.\n", " warnings.warn(\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "08b5069e3fc047fa93cc8b9d25639047", "version_major": 2, "version_minor": 0 }, "text/plain": [ "modules.json: 0%| | 0.00/349 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "128c912e9b854111aa9ea61d714102bc", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config_sentence_transformers.json: 0%| | 0.00/116 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "74deb69ad55a4e988b3539299af407ba", "version_major": 2, "version_minor": 0 }, "text/plain": [ "README.md: 0%| | 0.00/10.4k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b6683bace3a3499e895409fc3ede24a6", "version_major": 2, 
"version_minor": 0 }, "text/plain": [ "sentence_bert_config.json: 0%| | 0.00/53.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d4e6803e240b42caacb2bdcec471f2f0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0%| | 0.00/571 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n", "WARNING:huggingface_hub.file_download:Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "80ebb2d58de04f1ea4ba2a6e22a99cdb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "model.safetensors: 0%| | 0.00/438M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "56cf15aa0bc3429c896bfbef84a78d3c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0%| | 0.00/363 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1009f9eb518543a380960e580c3dc43d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "vocab.txt: 0%| | 0.00/232k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "aa65837796a84e099b126a98ca502bc7", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer.json: 0%| | 0.00/466k [00:00, ?B/s]" ] 
}, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fb51f0ae287a4d658acffb3fc5dfa239", "version_major": 2, "version_minor": 0 }, "text/plain": [ "special_tokens_map.json: 0%| | 0.00/239 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8eb48bf59c5d47e5a0b94f7536bb7606", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0%| | 0.00/190 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import os\n", "from dotenv import load_dotenv\n", "from langchain_huggingface import HuggingFaceEmbeddings\n", "from langchain_chroma import Chroma\n", "\n", "\n", "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\") # dim=768\n", "\n", "vector_store = Chroma(\n", " collection_name=\"example_collection\",\n", " embedding_function=embeddings,\n", " persist_directory=\"./chroma_langchain_db\", # Where to save data locally, remove if not necessary\n", ")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "1qHqQA-7f8sj" }, "outputs": [], "source": [ "# Document is imported from langchain_core, where the class is defined; the\n", "# legacy langchain.schema path is not shipped by newer LangChain releases.\n", "from langchain_core.documents import Document\n", "\n", "# One Document per task: the question and its final answer form the text that\n", "# gets embedded, and the task_id is kept in the metadata as the source.\n", "docs = []\n", "for sample in json_QA:\n", "    content = f\"Question: {sample['Question']}\\n\\nFinal answer: {sample['Final answer']}\"\n", "    doc = Document(\n", "        page_content=content,\n", "        metadata={\"source\": sample[\"task_id\"]}\n", "    )\n", "    docs.append(doc)\n", "\n", "# Upload the documents to the vector store. The store persists to disk, so the\n", "# task_id is passed as the document id: re-running this cell then updates the\n", "# existing entries instead of inserting duplicates.\n", "# NOTE(review): assumes task_id values are unique - confirm against the data.\n", "try:\n", "    vector_store.add_documents(docs, ids=[doc.metadata[\"source\"] for doc in docs])\n", "except Exception as exception:\n", "    print(\"Error inserting data into Chroma vector store:\", exception)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "v4ALfgdOjfLx" }, "outputs": [], "source": [ "# ALTERNATIVE : Save the documents (a list of dict) into a csv file, and manually upload it to chroma\n", "import pandas as pd\n", "\n", "# Passing Document objects straight to pd.DataFrame yields unnamed columns of\n", "# (field, value) tuples, so each document is flattened into a plain dict first.\n", "df = pd.DataFrame(\n", "    [\n", "        {\"page_content\": doc.page_content, \"source\": doc.metadata[\"source\"]}\n", "        for doc in docs\n", "    ]\n", ")\n", "df.to_csv('chroma_docs.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 293 }, "id": "-MV_Ux69kdk-", "outputId": "8845afc9-ed8c-489f-ac6f-60ac212ebbbf" }, "outputs": [ { "data": { "application/vnd.google.colaboratory.intrinsic+json": { "summary": "{\n \"name\": \"df\",\n \"rows\": 165,\n \"fields\": [\n {\n \"column\": 0,\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n [\n \"id\",\n null\n ]\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 1,\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 2,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 165,\n \"samples\": [\n [\n \"page_content\",\n \"Question: As of August 2023, who is the only winner of the US version of Survivor to be born in the month of May?\\n\\nFinal answer: Michele Fitzgerald\"\n ]\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 3,\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n [\n \"type\",\n \"Document\"\n ]\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", "type": "dataframe", "variable_name": "df" }, "text/html": [ "\n", "
\n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "
---|---|---|---|---|
0 | \n", "(id, None) | \n", "(metadata, {'source': 'c61d22de-5f6c-4958-a7f6... | \n", "(page_content, Question: A paper about AI regu... | \n", "(type, Document) | \n", "
1 | \n", "(id, None) | \n", "(metadata, {'source': '17b5a6a3-bc87-42e8-b0fb... | \n", "(page_content, Question: I’m researching speci... | \n", "(type, Document) | \n", "
2 | \n", "(id, None) | \n", "(metadata, {'source': '04a04a9b-226c-43fd-b319... | \n", "(page_content, Question: If we assume all arti... | \n", "(type, Document) | \n", "
3 | \n", "(id, None) | \n", "(metadata, {'source': '14569e28-c88c-43e4-8c32... | \n", "(page_content, Question: In Unlambda, what exa... | \n", "(type, Document) | \n", "
4 | \n", "(id, None) | \n", "(metadata, {'source': 'e1fc63a2-da7a-432f-be78... | \n", "(page_content, Question: If Eliud Kipchoge cou... | \n", "(type, Document) | \n", "