"""Build a FAISS index over the PDFs in a folder using Jina Spanish embeddings."""
import os
import pickle

import faiss
import fitz  # PyMuPDF
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
def cargar_pdfs(ruta="."):
    """Extract the plain text of every PDF found in `ruta`."""
    textos = []
    for archivo in os.listdir(ruta):
        if archivo.endswith(".pdf"):
            ruta_pdf = os.path.join(ruta, archivo)
            print(f"Processing: {archivo}")
            doc = fitz.open(ruta_pdf)
            texto = ""
            for pagina in doc:
                texto += pagina.get_text()
            # Collapse newlines and runs of whitespace into single spaces.
            texto = " ".join(texto.split())
            doc.close()
            if texto:
                textos.append(texto)
    return textos
def chunk_texto(texto, longitud=800):
    """Split `texto` into fixed-size chunks of `longitud` characters."""
    return [texto[i:i + longitud] for i in range(0, len(texto), longitud)]
def generar_embedding(textos, tokenizer, model, batch_size=32):
    """Encode `textos` in batches and return mean-pooled embeddings as a NumPy array."""
    model.eval()
    all_embeddings = []
    for i in range(0, len(textos), batch_size):
        batch = textos[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        last_hidden = outputs.last_hidden_state
        # Mean pooling over tokens, ignoring padding positions via the attention mask.
        mask = inputs["attention_mask"].unsqueeze(-1).expand(last_hidden.size()).float()
        summed = torch.sum(last_hidden * mask, 1)
        counted = torch.clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed / counted
        all_embeddings.append(mean_pooled.numpy())
    return np.vstack(all_embeddings)
def crear_index_y_guardar(ruta=".", modelo_id="jinaai/jina-embeddings-v2-base-es", archivo_salida="index.pkl"):
    """Index every PDF in `ruta` and persist the FAISS index plus its chunks to disk."""
    print("Loading PDFs...")
    textos = cargar_pdfs(ruta)
    print("Splitting into chunks...")
    chunks = []
    for texto in textos:
        chunks.extend(chunk_texto(texto))
    if not chunks:
        raise ValueError("No chunks were generated. Check your PDFs.")
    print(f"Total chunks generated: {len(chunks)}")
    print("Generating embeddings...")
    tokenizer = AutoTokenizer.from_pretrained(modelo_id)
    # The Jina v2 embedding models ship custom modeling code, so trust_remote_code is needed.
    model = AutoModel.from_pretrained(modelo_id, trust_remote_code=True)
    embeddings = generar_embedding(chunks, tokenizer, model, batch_size=32)
    print(f"Embedding dimension: {embeddings.shape[1]}")
    print("Creating FAISS index...")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    print(f"Saving index to: {archivo_salida}")
    # FAISS indexes are not directly picklable; serialize them to a byte array first.
    with open(archivo_salida, "wb") as f:
        pickle.dump((faiss.serialize_index(index), chunks), f)
    print("Indexing completed.")
    return index, chunks
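

# --- Optional usage sketch (not part of the original script). `buscar` is a
# hypothetical helper added for illustration: it assumes the (serialized index,
# chunks) tuple written by crear_index_y_guardar above and reuses the same
# tokenizer/model pair to embed the query.
def buscar(consulta, tokenizer, model, archivo_indice="index.pkl", k=3):
    """Return the `k` chunks closest to `consulta` using the persisted index."""
    with open(archivo_indice, "rb") as f:
        index_bytes, chunks = pickle.load(f)
    index = faiss.deserialize_index(index_bytes)
    # Embed the query with the same mean-pooling routine used for the chunks.
    consulta_emb = generar_embedding([consulta], tokenizer, model)
    distancias, indices = index.search(consulta_emb, k)
    return [chunks[i] for i in indices[0]]
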
if __name__ == "__main__":
    crear_index_y_guardar()