Segizu committed on
Commit
14e3122
·
1 Parent(s): c177ec8

metadata v12

Files changed (1)
  1. app.py +15 -27
app.py CHANGED
@@ -22,12 +22,6 @@ EMBEDDINGS_FILE = EMBEDDINGS_DIR / "embeddings.pkl"
 
 # ✅ Load the dataset from metadata.csv (with absolute URLs)
 dataset = load_dataset("csv", data_files="metadata.csv")
-dataset = dataset["train"].cast_column("image", HfImage(decode=True))
-
-# Debug dataset loading
-print("Dataset columns:", dataset.column_names)
-print("Dataset features:", dataset.features)
-print("First item:", dataset[0])
 
 # 🔄 Preprocess image for Facenet
 def preprocess_image(img: Image.Image) -> np.ndarray:
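For context on this hunk: dropping the `cast_column("image", HfImage(decode=True))` call means `load_dataset("csv", data_files="metadata.csv")` now returns a `DatasetDict` whose `train` split keeps `image` as a plain string column (the absolute URLs from `metadata.csv`), so later code has to fetch and decode each URL itself. A minimal sketch of what the loaded object looks like, assuming `metadata.csv` has an `image` column of absolute URLs as the comment states:

```python
from datasets import load_dataset

# Load the CSV as-is: the result is a DatasetDict with a single "train" split,
# and "image" stays a string column holding the URLs from metadata.csv.
dataset = load_dataset("csv", data_files="metadata.csv")

print(dataset)                       # DatasetDict({'train': Dataset(...)})
print(dataset["train"].features)     # e.g. {'image': Value(dtype='string', id=None)}
print(dataset["train"][0]["image"])  # an absolute http(s) URL, not a PIL image

# The removed cast_column line would instead have registered the column as an
# Image feature, so that rows decode to PIL images when accessed.
```

This also explains why the later hunks index `dataset["train"]` explicitly: without selecting the split, `dataset` is the `DatasetDict`, not the table itself.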
@@ -48,36 +42,30 @@ def build_database():
 
     # Debug: Print dataset structure
     print("Dataset structure:", dataset.features)
-    print("First item structure:", dataset[0])
+    print("First item structure:", dataset["train"][0])
     print("Dataset type:", type(dataset))
-    print("Dataset item type:", type(dataset[0]))
+    print("Dataset item type:", type(dataset["train"][0]))
 
-    for i in range(0, len(dataset), batch_size):
-        batch = dataset[i:i + batch_size]
-        print(f"📦 Processing batch {i // batch_size + 1}/{(len(dataset) + batch_size - 1) // batch_size}")
+    for i in range(0, len(dataset["train"]), batch_size):
+        batch = dataset["train"][i:i + batch_size]
+        print(f"📦 Processing batch {i // batch_size + 1}/{(len(dataset['train']) + batch_size - 1) // batch_size}")
 
         for j, item in enumerate(batch):
             try:
                 print(f"Debug - Processing item {i+j}")
                 print(f"Debug - Item type: {type(item)}")
-                print(f"Debug - Item keys: {item.keys() if isinstance(item, dict) else 'Not a dict'}")
-
-                # Skip if item is not in the expected format
-                if not isinstance(item, dict):
-                    print(f"⚠️ Skipping item {i+j} - Not a dictionary")
-                    continue
+                print(f"Debug - Item content: {item}")
 
-                if "image" not in item:
-                    print(f"⚠️ Skipping item {i+j} - No image key")
+                # Get the image URL
+                image_url = item["image"]
+                if not isinstance(image_url, str) or not image_url.startswith("http"):
+                    print(f"⚠️ Skipping item {i+j} - Invalid URL format")
                     continue
 
-                img = item["image"]
-                print(f"Debug - Image type: {type(img)}")
-
-                # Skip if image is not a PIL Image
-                if not isinstance(img, Image.Image):
-                    print(f"⚠️ Skipping item {i+j} - Not a PIL Image")
-                    continue
+                # Download and process the image
+                response = requests.get(image_url, timeout=10)
+                response.raise_for_status()
+                img = Image.open(BytesIO(response.content))
 
                 # Ensure image is in RGB mode
                 img = img.convert("RGB")
@@ -89,7 +77,7 @@ def build_database():
                 )[0]["embedding"]
 
                 database.append((f"image_{i+j}", img, embedding))
-                print(f"✅ Processed image {i+j+1}/{len(dataset)}")
+                print(f"✅ Processed image {i+j+1}/{len(dataset['train'])}")
 
                 del img_processed
                 gc.collect()
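The new per-item logic downloads each image from its URL instead of expecting a pre-decoded PIL image; it relies on `requests` and `io.BytesIO`, which must already be imported elsewhere in app.py (the imports sit outside the hunks shown). A minimal standalone sketch of that download-and-convert step, using a hypothetical `fetch_image` helper:

```python
import requests
from io import BytesIO
from PIL import Image


def fetch_image(image_url: str, timeout: int = 10) -> Image.Image:
    """Download one image from an absolute URL and return it as an RGB PIL image.

    Mirrors the per-item logic added in this commit: validate the URL string,
    fetch it over HTTP, decode the body with PIL, and normalize to RGB.
    """
    if not isinstance(image_url, str) or not image_url.startswith("http"):
        raise ValueError(f"Invalid URL format: {image_url!r}")
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()  # surface HTTP errors instead of decoding an error page
    return Image.open(BytesIO(response.content)).convert("RGB")
```

In the committed loop these steps run inline inside the existing try block, so a failed download for one item can be caught by whatever except clause closes that block (not shown in this diff).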
 
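One `datasets` behavior worth keeping in mind when reading the new batching loop: slicing a `Dataset`, as `dataset["train"][i:i + batch_size]` does, returns a dict of column lists rather than a list of row dicts, so iterating the slice directly yields column names. A minimal sketch, using a hypothetical in-memory split, of pulling per-row URLs out of such a slice:

```python
from datasets import Dataset

# Hypothetical stand-in for the "train" split: an "image" column of URL strings.
train = Dataset.from_dict({
    "image": [
        "https://example.com/a.jpg",
        "https://example.com/b.jpg",
        "https://example.com/c.jpg",
    ]
})

batch_size = 2
for i in range(0, len(train), batch_size):
    batch = train[i:i + batch_size]                 # a dict of columns: {"image": [...]}
    for j, image_url in enumerate(batch["image"]):  # iterate rows via the column list
        print(f"item {i + j}: {image_url}")
```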