Spaces:
Sleeping
Sleeping
File size: 5,016 Bytes
5fdb69e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
{
"cells": [
{
"cell_type": "markdown",
"id": "4e60bd8a-a4da-4db9-86a8-ac8c03f3e367",
"metadata": {},
"source": [
"# The Price is Right\n",
"\n",
"Today we build a more complex solution for estimating prices of goods.\n",
"\n",
"1. Day 2.0 notebook: create a RAG database with our 400,000 training data points\n",
"2. Day 2.1 notebook: visualize in 2D\n",
"3. Day 2.2 notebook: visualize in 3D\n",
"4. Day 2.3 notebook: build and test a RAG pipeline with GPT-4o-mini\n",
"5. Day 2.4 notebook: (a) bring back our Random Forest pricer (b) create an Ensemble pricer that allows contributions from all the pricers\n",
"\n",
"Phew! That's a lot to get through in one day!\n",
"\n",
"## PLEASE NOTE:\n",
"\n",
"We already have a very powerful product estimator with our proprietary, fine-tuned LLM. Most people would be very satisfied with that! The main reason we're adding these extra steps is to deepen your expertise with RAG and with Agentic workflows."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "993a2a24-1a58-42be-8034-6d116fb8d786",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import re\n",
"import math\n",
"import json\n",
"from tqdm import tqdm\n",
"import random\n",
"from dotenv import load_dotenv\n",
"from huggingface_hub import login\n",
"import numpy as np\n",
"import pickle\n",
"from sentence_transformers import SentenceTransformer\n",
"from datasets import load_dataset\n",
"import chromadb\n",
"from items import Item\n",
"from sklearn.manifold import TSNE\n",
"import plotly.graph_objects as go"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1cc1fe53-612f-4228-aa02-8758f4c2098f",
"metadata": {},
"outputs": [],
"source": [
"# Cap on how many vectors are read from Chroma and passed to t-SNE further down.\n",
"# Turn this up at your own risk! 10_000 is the safe choice; the 20_000 set here\n",
"# costs more time and memory.\n",
"\n",
"MAXIMUM_DATAPOINTS = 20_000"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4aab95e-d719-4476-b6e7-e248120df25a",
"metadata": {},
"outputs": [],
"source": [
"# Directory holding the persistent Chroma vector store\n",
"DB = \"products_vectorstore\"\n",
"\n",
"# Chroma client backed by that on-disk directory\n",
"client = chromadb.PersistentClient(path=DB)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f95dafd-ab80-464e-ba8a-dec7a2424780",
"metadata": {},
"outputs": [],
"source": [
"# Open the 'products' collection; get_or_create means it is created if it does not exist yet\n",
"collection = client.get_or_create_collection(name='products')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "525fc313-8a16-4ac0-8c42-6a6d1ba1c9b8",
"metadata": {},
"outputs": [],
"source": [
"# Product categories and the marker color for each one.\n",
"# The two lists are index-aligned: COLORS[i] is the color used for CATEGORIES[i].\n",
"CATEGORIES = [\n",
"    'Appliances',\n",
"    'Automotive',\n",
"    'Cell_Phones_and_Accessories',\n",
"    'Electronics',\n",
"    'Musical_Instruments',\n",
"    'Office_Products',\n",
"    'Tools_and_Home_Improvement',\n",
"    'Toys_and_Games',\n",
"]\n",
"COLORS = [\n",
"    'red',\n",
"    'blue',\n",
"    'brown',\n",
"    'orange',\n",
"    'yellow',\n",
"    'green',\n",
"    'purple',\n",
"    'cyan',\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4cf1c9a-1ced-48d4-974c-3c850905034e",
"metadata": {},
"outputs": [],
"source": [
"# Prework: read up to MAXIMUM_DATAPOINTS records (vectors, texts, metadata) from Chroma\n",
"result = collection.get(include=['embeddings', 'documents', 'metadatas'], limit=MAXIMUM_DATAPOINTS)\n",
"\n",
"# get_or_create_collection hands back an empty collection when the vector store was never\n",
"# populated, so stop here with a clear message instead of an obscure failure in t-SNE later.\n",
"if len(result['ids']) == 0:\n",
"    raise ValueError(f'Chroma collection {collection.name!r} returned no records; populate the vector store before running this notebook')\n",
"\n",
"vectors = np.array(result['embeddings'])\n",
"documents = result['documents']\n",
"categories = [metadata['category'] for metadata in result['metadatas']]\n",
"\n",
"# Map each category to its color once, rather than scanning CATEGORIES for every point.\n",
"# A category that is missing from CATEGORIES raises KeyError.\n",
"color_by_category = dict(zip(CATEGORIES, COLORS))\n",
"colors = [color_by_category[category] for category in categories]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c54df150-c8d8-4bc3-8877-6759691eeb42",
"metadata": {},
"outputs": [],
"source": [
"# Let's try 3D! Reduce the embeddings to three t-SNE components.\n",
"\n",
"tsne = TSNE(\n",
"    n_components=3,\n",
"    random_state=42,  # fixed seed\n",
"    n_jobs=-1,\n",
")\n",
"reduced_vectors = tsne.fit_transform(vectors)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e8fb2a63-24c5-4dce-9e63-aa208272f82d",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Plot the t-SNE coordinates as a single 3D marker trace, colored by product category\n",
"points_trace = go.Scatter3d(\n",
"    x=reduced_vectors[:, 0],\n",
"    y=reduced_vectors[:, 1],\n",
"    z=reduced_vectors[:, 2],\n",
"    mode='markers',\n",
"    marker={'size': 3, 'color': colors, 'opacity': 0.7},\n",
")\n",
"fig = go.Figure(data=[points_trace])\n",
"\n",
"# Figure title, axis labels, canvas size and margins\n",
"layout_options = {\n",
"    'title': '3D Chroma Vector Store Visualization',\n",
"    'scene': {'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},\n",
"    'width': 1200,\n",
"    'height': 800,\n",
"    'margin': {'r': 20, 'b': 10, 'l': 10, 't': 40},\n",
"}\n",
"fig.update_layout(**layout_options)\n",
"\n",
"fig.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|