Files changed (3)
  1. app.py +47 -176
  2. metadata.csv +0 -0
  3. metadata.py +0 -23
app.py CHANGED
@@ -2,148 +2,41 @@ import numpy as np
 from PIL import Image
 import gradio as gr
 from deepface import DeepFace
-from datasets import load_dataset
+from datasets import load_dataset, DownloadConfig
 import os
-import pickle
-from io import BytesIO
-from huggingface_hub import upload_file, hf_hub_download, list_repo_files
-from pathlib import Path
-import gc
-import requests
-import time
-import shutil
+os.system("rm -rf ~/.cache/huggingface/hub/datasets--Segizu--dataset_faces")
 
-# 📁 Parameters
-DATASET_ID = "Segizu/facial-recognition"
-EMBEDDINGS_SUBFOLDER = "embeddings"
-LOCAL_EMB_DIR = Path("embeddings")
-LOCAL_EMB_DIR.mkdir(exist_ok=True)
-HF_TOKEN = os.getenv("HF_TOKEN")
-headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
+# Load the Hugging Face dataset, forcing a clean download
+download_config = DownloadConfig(force_download=True)
+dataset = load_dataset("Segizu/dataset_faces", download_config=download_config)
+if "train" in dataset:
+    dataset = dataset["train"]
 
-# 💾 Storage control settings
-MAX_TEMP_STORAGE_GB = 40
-UPLOAD_EVERY = 50
-embeddings_to_upload = []
-
-def get_folder_size(path):
-    total = 0
-    for dirpath, _, filenames in os.walk(path):
-        for f in filenames:
-            fp = os.path.join(dirpath, f)
-            total += os.path.getsize(fp)
-    return total / (1024 ** 3)  # in GB
-
-def flush_embeddings():
-    global embeddings_to_upload
-    print("🚀 Uploading batch of embeddings to Hugging Face...")
-
-    for emb_file in embeddings_to_upload:
-        try:
-            filename = emb_file.name
-            upload_file(
-                path_or_fileobj=str(emb_file),
-                path_in_repo=f"{EMBEDDINGS_SUBFOLDER}/{filename}",
-                repo_id=DATASET_ID,
-                repo_type="dataset",
-                token=HF_TOKEN
-            )
-            os.remove(emb_file)
-            print(f"✅ Uploaded and deleted: {filename}")
-            time.sleep(1.2)  # avoid 429s
-        except Exception as e:
-            print(f"❌ Error uploading {filename}: {e}")
-            continue
-
-    embeddings_to_upload = []
-
-# ✅ Load the CSV from the dataset
-dataset = load_dataset(
-    "csv",
-    data_files="metadata.csv",
-    split="train",
-    column_names=["image"],
-    header=0
-)
-
-print("✅ Post-load validation")
-print(dataset[0])
-print("Columns:", dataset.column_names)
-
-# 🔄 Preprocessing
-def preprocess_image(img: Image.Image) -> np.ndarray:
+# 🔄 Preprocess the image for Facenet
+def preprocess_image(img):
     img_rgb = img.convert("RGB")
     img_resized = img_rgb.resize((160, 160), Image.Resampling.LANCZOS)
     return np.array(img_resized)
 
-# 📦 Generate and upload embeddings
+# 📦 Build the embeddings database
 def build_database():
-    print("🔄 Generating embeddings...")
-    batch_size = 10
-
-    for i in range(0, len(dataset), batch_size):
-        batch = dataset[i:i + batch_size]
-        print(f"📦 Batch {i // batch_size + 1}/{(len(dataset) + batch_size - 1) // batch_size}")
-
-        for j in range(len(batch["image"])):
-            item = {"image": batch["image"][j]}
-            image_url = item["image"]
-
-            if not isinstance(image_url, str) or not image_url.startswith("http") or image_url.strip().lower() == "image":
-                print(f"⚠️ Skipping {i + j} - invalid URL: {image_url}")
-                continue
-
-            name = f"image_{i + j}"
-            filename = LOCAL_EMB_DIR / f"{name}.pkl"
-
-            # Check whether it already exists on HF
-            try:
-                hf_hub_download(
-                    repo_id=DATASET_ID,
-                    repo_type="dataset",
-                    filename=f"{EMBEDDINGS_SUBFOLDER}/{name}.pkl",
-                    token=HF_TOKEN
-                )
-                print(f"⏩ Already exists remotely: {name}.pkl")
-                continue
-            except:
-                pass
-
-            try:
-                response = requests.get(image_url, headers=headers, timeout=10)
-                response.raise_for_status()
-                img = Image.open(BytesIO(response.content)).convert("RGB")
-
-                img_processed = preprocess_image(img)
-                embedding = DeepFace.represent(
-                    img_path=img_processed,
-                    model_name="Facenet",
-                    enforce_detection=False
-                )[0]["embedding"]
-
-                # Save locally for now
-                with open(filename, "wb") as f:
-                    pickle.dump({"name": name, "img": img, "embedding": embedding}, f)
-
-                embeddings_to_upload.append(filename)
-
-                # If limits are exceeded, upload the batch
-                if get_folder_size(LOCAL_EMB_DIR) >= MAX_TEMP_STORAGE_GB or len(embeddings_to_upload) >= UPLOAD_EVERY:
-                    flush_embeddings()
-
-                del img_processed
-                gc.collect()
-
-            except Exception as e:
-                print(f"❌ Error on {name}: {e}")
-                continue
-
-    # Upload whatever is left
-    if embeddings_to_upload:
-        flush_embeddings()
+    database = []
+    for i, item in enumerate(dataset):
+        try:
+            img = item["image"]
+            img_processed = preprocess_image(img)
+            embedding = DeepFace.represent(
+                img_path=img_processed,
+                model_name="Facenet",
+                enforce_detection=False
+            )[0]["embedding"]
+            database.append((f"image_{i}", img, embedding))
+        except Exception as e:
+            print(f"Could not process image {i}: {e}")
+    return database
 
-# 🔍 Search for similar faces in remote files
-def find_similar_faces(uploaded_image: Image.Image):
+# 🔍 Find similar faces
+def find_similar_faces(uploaded_image):
     try:
         img_processed = preprocess_image(uploaded_image)
         query_embedding = DeepFace.represent(
@@ -151,62 +44,40 @@ def find_similar_faces(uploaded_image: Image.Image):
             model_name="Facenet",
             enforce_detection=False
         )[0]["embedding"]
-        del img_processed
-        gc.collect()
-    except Exception as e:
-        return [], f"⚠ Error processing image: {str(e)}"
+    except:
+        return [], "⚠ No valid face was detected in the image."
 
     similarities = []
-
-    try:
-        embedding_files = [
-            f for f in list_repo_files(DATASET_ID, repo_type="dataset", token=HF_TOKEN)
-            if f.startswith(f"{EMBEDDINGS_SUBFOLDER}/") and f.endswith(".pkl")
-        ]
-    except Exception as e:
-        return [], f"⚠ Error fetching files: {str(e)}"
-
-    for file_path in embedding_files:
-        try:
-            file_bytes = requests.get(
-                f"https://huggingface.co/datasets/{DATASET_ID}/resolve/main/{file_path}",
-                headers=headers,
-                timeout=10
-            ).content
-            record = pickle.loads(file_bytes)
-
-            name = record["name"]
-            img = record["img"]
-            emb = record["embedding"]
-
-            dist = np.linalg.norm(np.array(query_embedding) - np.array(emb))
-            sim_score = 1 / (1 + dist)
-            similarities.append((sim_score, name, np.array(img)))
+    for name, db_img, embedding in database:
+        dist = np.linalg.norm(np.array(query_embedding) - np.array(embedding))
+        sim_score = 1 / (1 + dist)
+        similarities.append((sim_score, name, db_img))
 
-        except Exception as e:
-            print(f"⚠ Error with {file_path}: {e}")
-            continue
+    similarities.sort(reverse=True)
+    top_matches = similarities[:]
 
-    similarities.sort(reverse=True)
-    top = similarities[:5]
-    gallery = [(img, f"{name} - Similarity: {sim:.2f}") for sim, name, img in top]
-    summary = "\n".join([f"{name} - Similarity: {sim:.2f}" for sim, name, _ in top])
-    return gallery, summary
+    gallery_items = []
+    text_summary = ""
+    for sim, name, img in top_matches:
+        caption = f"{name} - Similarity: {sim:.2f}"
+        gallery_items.append((img, caption))
+        text_summary += caption + "\n"
+
+    return gallery_items, text_summary
 
-# 🚀 Initialize
-print("🚀 Starting app...")
-build_database()
+# ⚙️ Initialize the database
+database = build_database()
 
 # 🎛️ Gradio interface
 demo = gr.Interface(
     fn=find_similar_faces,
     inputs=gr.Image(label="📤 Upload an image", type="pil"),
     outputs=[
-        gr.Gallery(label="📸 Similar faces"),
-        gr.Textbox(label="🧠 Details", lines=6)
+        gr.Gallery(label="📸 Most similar faces"),
+        gr.Textbox(label="🧠 Similarity", lines=6)
     ],
-    title="🔍 Facial recognition with DeepFace",
-    description="Upload an image and find matches in the private Hugging Face dataset using Facenet embeddings."
+    title="🔍 Face Finder with DeepFace",
+    description="Upload an image and it will be compared against the faces in the dataset hosted on Hugging Face (`Segizu/dataset_faces`)."
 )
 
 demo.launch()
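Note on the new startup path: it clears the cached copy with a shell `rm -rf` and then re-downloads via `DownloadConfig(force_download=True)`. A minimal sketch of an equivalent that stays inside the `datasets` API, assuming the same repo id; `download_mode="force_redownload"` is a standard `load_dataset` argument, not something this commit uses:

from datasets import load_dataset

# Refresh the cached dataset without shelling out to rm -rf:
# "force_redownload" re-fetches and rebuilds the cached copy.
dataset = load_dataset("Segizu/dataset_faces", download_mode="force_redownload")
if "train" in dataset:
    dataset = dataset["train"]

Either way the app re-downloads every image, and `build_database()` then recomputes every Facenet embedding, so startup cost grows linearly with the dataset size.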
 
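The ranking step added in `find_similar_faces` is worth reading in isolation. Below is a self-contained sketch of the same metric, with illustrative names (`rank_matches`, `query`) that are not part of the commit: the Euclidean distance between two Facenet embeddings is mapped to a similarity in (0, 1], so identical embeddings score 1.0 and the score decays toward 0 as the distance grows.

import numpy as np

def rank_matches(query, database):
    # database holds (name, image, embedding) tuples, as built by build_database()
    scored = []
    for name, img, emb in database:
        dist = np.linalg.norm(np.asarray(query) - np.asarray(emb))  # L2 distance
        scored.append((1 / (1 + dist), name, img))                  # sim = 1 / (1 + dist)
    # Sort on the score alone; sorting whole tuples would fall back to
    # comparing names (and, on a full tie, images) to break ties.
    scored.sort(key=lambda t: t[0], reverse=True)
    return scored

Also note that `top_matches = similarities[:]` copies the entire list, so the new gallery shows every match; the removed version truncated to `similarities[:5]`.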
metadata.csv DELETED
The diff for this file is too large to render.
 
metadata.py DELETED
@@ -1,23 +0,0 @@
-from huggingface_hub import HfApi
-import csv
-import os
-
-HF_TOKEN = os.getenv("HF_TOKEN") or ""
-repo_id = "Segizu/facial-recognition"
-
-api = HfApi()
-files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
-
-# Generate full URLs
-base_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/"
-image_urls = [base_url + f for f in files if f.lower().endswith(".jpg")]
-
-# Write the new metadata.csv
-with open("metadata.csv", "w", newline="") as f:
-    writer = csv.writer(f)
-    writer.writerow(["image"])
-    for url in image_urls:
-        writer.writerow([url])
-
-print(f"✅ metadata.csv regenerated with absolute URLs ({len(image_urls)} images)")
-
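metadata.csv is deleted without a rendered diff, but its shape follows from the code above: a single `image` header row followed by absolute `resolve/main` URLs, which the removed app.py consumed through the `datasets` CSV builder. A hedged sketch for sanity-checking such a file before pointing an app at it; the checks are illustrative, not part of this repository:

import csv

# Expect one "image" column whose rows are absolute URLs into the dataset repo.
with open("metadata.csv", newline="") as f:
    rows = list(csv.DictReader(f))

bad = [r["image"] for r in rows if not r["image"].startswith("https://")]
print(f"{len(rows)} rows, {len(bad)} entries that are not absolute URLs")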