Segizu committed on
Commit 51fa2e3 · 1 Parent(s): b6a67be

temp parallel storage

Files changed (1): app.py (+56 -18)
app.py CHANGED
@@ -6,17 +6,57 @@ from datasets import load_dataset
 import os
 import pickle
 from io import BytesIO
-from huggingface_hub import upload_file, hf_hub_download
+from huggingface_hub import upload_file, hf_hub_download, list_repo_files
 from pathlib import Path
 import gc
 import requests
+import time
+import shutil
 
 # 📁 Parameters
 DATASET_ID = "Segizu/facial-recognition"
 EMBEDDINGS_SUBFOLDER = "embeddings"
+LOCAL_EMB_DIR = Path("embeddings")
+LOCAL_EMB_DIR.mkdir(exist_ok=True)
 HF_TOKEN = os.getenv("HF_TOKEN")
 headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
 
+# 💾 Temporary-storage controls
+MAX_TEMP_STORAGE_GB = 40
+UPLOAD_EVERY = 50
+embeddings_to_upload = []
+
+def get_folder_size(path):
+    total = 0
+    for dirpath, _, filenames in os.walk(path):
+        for f in filenames:
+            fp = os.path.join(dirpath, f)
+            total += os.path.getsize(fp)
+    return total / (1024 ** 3)  # in GB
+
+def flush_embeddings():
+    global embeddings_to_upload
+    print("🚀 Uploading embeddings batch to Hugging Face...")
+
+    for emb_file in embeddings_to_upload:
+        try:
+            filename = emb_file.name
+            upload_file(
+                path_or_fileobj=str(emb_file),
+                path_in_repo=f"{EMBEDDINGS_SUBFOLDER}/{filename}",
+                repo_id=DATASET_ID,
+                repo_type="dataset",
+                token=HF_TOKEN
+            )
+            os.remove(emb_file)
+            print(f"✅ Uploaded and removed: {filename}")
+            time.sleep(1.2)  # back off to avoid 429s
+        except Exception as e:
+            print(f"❌ Error uploading {filename}: {e}")
+            continue
+
+    embeddings_to_upload = []
+
 # ✅ Load the CSV from the dataset
 dataset = load_dataset(
     "csv",
@@ -54,17 +94,17 @@ def build_database():
                 continue
 
             name = f"image_{i + j}"
-            filename = f"{name}.pkl"
+            filename = LOCAL_EMB_DIR / f"{name}.pkl"
 
-            # Check whether it was already uploaded
+            # Check whether it already exists on HF
             try:
                 hf_hub_download(
                     repo_id=DATASET_ID,
                     repo_type="dataset",
-                    filename=f"{EMBEDDINGS_SUBFOLDER}/{filename}",
+                    filename=f"{EMBEDDINGS_SUBFOLDER}/{name}.pkl",
                     token=HF_TOKEN
                 )
-                print(f"⏩ Already exists remotely: {filename}")
+                print(f"⏩ Already exists remotely: {name}.pkl")
                 continue
             except:
                 pass
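
Note: the existence probe above calls hf_hub_download only to learn whether the file is already on the Hub, which also pulls the pickle into the local cache. A lighter sketch, assuming a recent huggingface_hub that provides HfApi.file_exists (this helper is not used by the commit):

    from huggingface_hub import HfApi

    api = HfApi(token=HF_TOKEN)  # HF_TOKEN as defined at the top of app.py

    def already_uploaded(name: str) -> bool:
        # Metadata-only check; nothing is downloaded or cached
        return api.file_exists(
            repo_id=DATASET_ID,
            filename=f"{EMBEDDINGS_SUBFOLDER}/{name}.pkl",
            repo_type="dataset",
        )

The bare except around the probe would also be safer as except EntryNotFoundError, so a transient network failure is not mistaken for "not yet uploaded".
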
@@ -81,19 +121,15 @@ def build_database():
                     enforce_detection=False
                 )[0]["embedding"]
 
-                # Save temporarily and upload
+                # Save temporarily
                 with open(filename, "wb") as f:
                     pickle.dump({"name": name, "img": img, "embedding": embedding}, f)
 
-                upload_file(
-                    path_or_fileobj=filename,
-                    path_in_repo=f"{EMBEDDINGS_SUBFOLDER}/{filename}",
-                    repo_id=DATASET_ID,
-                    repo_type="dataset",
-                    token=HF_TOKEN
-                )
-                os.remove(filename)
-                print(f"✅ Uploaded: {filename}")
+                embeddings_to_upload.append(filename)
+
+                # If the limits are exceeded, upload the batch
+                if get_folder_size(LOCAL_EMB_DIR) >= MAX_TEMP_STORAGE_GB or len(embeddings_to_upload) >= UPLOAD_EVERY:
+                    flush_embeddings()
 
                 del img_processed
                 gc.collect()
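
Note: flush_embeddings pushes one commit per .pkl, which is why it sleeps 1.2 s between uploads to stay under the rate limit. A coarser-grained sketch using upload_folder, which puts every pending file into a single commit (an alternative, not what this commit does):

    from huggingface_hub import upload_folder

    def flush_embeddings_as_one_commit():
        # One repo commit for the whole staging directory, then clear it
        upload_folder(
            repo_id=DATASET_ID,
            repo_type="dataset",
            folder_path=str(LOCAL_EMB_DIR),
            path_in_repo=EMBEDDINGS_SUBFOLDER,
            token=HF_TOKEN,
            allow_patterns=["*.pkl"],
        )
        for p in LOCAL_EMB_DIR.glob("*.pkl"):
            p.unlink()

One commit per batch keeps the repo history short and sidesteps 429s entirely, at the cost of retrying the whole batch if the upload fails midway.
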
@@ -102,6 +138,10 @@ def build_database():
                 print(f"❌ Error on {name}: {e}")
                 continue
 
+    # Upload whatever is left over
+    if embeddings_to_upload:
+        flush_embeddings()
+
 # 🔍 Search for similarities from the remote files
 def find_similar_faces(uploaded_image: Image.Image):
     try:
@@ -119,14 +159,12 @@ def find_similar_faces(uploaded_image: Image.Image):
     similarities = []
 
     try:
-        # Get the list of remote files
-        from huggingface_hub import list_repo_files
         embedding_files = [
             f for f in list_repo_files(DATASET_ID, repo_type="dataset", token=HF_TOKEN)
             if f.startswith(f"{EMBEDDINGS_SUBFOLDER}/") and f.endswith(".pkl")
         ]
     except Exception as e:
-        return [], f"⚠ Error getting files from the dataset: {str(e)}"
+        return [], f"⚠ Error getting files: {str(e)}"
 
     for file_path in embedding_files:
         try:
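
Note: on the read side, find_similar_faces lists the remote .pkl files and fetches them one by one. A minimal sketch of fetching and unpickling a single listed embedding (the record keys follow the pickle.dump call in build_database):

    import pickle
    from huggingface_hub import hf_hub_download, list_repo_files

    files = [
        f for f in list_repo_files(DATASET_ID, repo_type="dataset", token=HF_TOKEN)
        if f.startswith(f"{EMBEDDINGS_SUBFOLDER}/") and f.endswith(".pkl")
    ]
    if files:
        local = hf_hub_download(DATASET_ID, files[0], repo_type="dataset", token=HF_TOKEN)
        with open(local, "rb") as fh:
            record = pickle.load(fh)  # {"name": ..., "img": ..., "embedding": ...}
        print(record["name"], len(record["embedding"]))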