File size: 3,190 Bytes
75f4cb1 a635093 5d4742e 75f4cb1 5d4742e 108508b 75f4cb1 108508b 5d4742e a635093 5d4742e 75f4cb1 a635093 5d4742e ce3afb2 5d4742e ce3afb2 5d4742e ce3afb2 5d4742e 75f4cb1 a635093 75f4cb1 a635093 75f4cb1 108508b 75f4cb1 108508b 5d4742e 75f4cb1 108508b 75f4cb1 108508b 75f4cb1 108508b 75f4cb1 5d4742e 75f4cb1 ce3afb2 75f4cb1 ce3afb2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import os
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
import requests
from rerankers import Reranker
from langchain_huggingface import HuggingFacePipeline
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
# Configurar USER_AGENT
os.environ["USER_AGENT"] = "MyHFSpace/1.0 (HuggingFace Space for Nutrition Q&A)"
# Paso 1: Descargar el documento PDF
URL = "https://gruposdetrabajo.sefh.es/gefp/images/stories/documentos/4-ATENCION-FARMACEUTICA/Nutricion/Manual_basico_N_clinica_y_Dietetica_Valencia_2012.pdf"
response = requests.get(URL)
with open("Manual_de_nutrici贸n_clinica.pdf", "wb") as f:
f.write(response.content)
# Paso 2: Inicializar el modelo usando Hugging Face Pipeline
# Usamos gpt2 en lugar de distilbert-base-uncased
llm = HuggingFacePipeline.from_model_id(
model_id="gpt2", # Cambiado a gpt2, que es compatible con AutoModelForCausalLM
task="text-generation",
pipeline_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95, "max_length": 512}
)
chain = llm | StrOutputParser()
# Cargar y procesar el PDF
loader = PyPDFLoader("Manual_de_nutrici贸n_clinica.pdf")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)
# Crear embeddings y la base de datos vectorial
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")
# Inicializar el reranker
ranker = Reranker("answerdotai/answerai-colbert-small-v1", model_type='colbert')
# Paso 3: Definir la funci贸n RAG con reranking
def format_docs(docs):
return "\n\n".join(doc[0].page_content for doc in docs)
def test_rag_reranking(query, ranker):
docs = vectordb.similarity_search_with_score(query)
prompt = hub.pull("rlm/rag-prompt")
rag_chain = prompt | llm | StrOutputParser()
context = []
for doc, score in docs:
if score < 7:
doc_details = doc.to_json()['kwargs']
context.append(doc_details['page_content'])
if len(context) > 0:
ranking = ranker.rank(query=query, docs=context)
useful_context = ranking[0].text
generation = rag_chain.invoke({"context": useful_context, "question": query})
return generation
else:
return "No tengo informaci贸n para responder a esta pregunta"
# Paso 4: Crear una interfaz con Gradio
def answer_query(query):
return test_rag_reranking(query, ranker)
interface = gr.Interface(
fn=answer_query,
inputs=gr.Textbox(label="Ingresa tu pregunta sobre nutrici贸n:"),
outputs=gr.Textbox(label="Respuesta:"),
title="Respuesta a Preguntas sobre Nutrici贸n",
description="Haz preguntas sobre nutrici贸n basadas en el Manual B谩sico de Nutrici贸n Cl铆nica y Diet茅tica."
)
# Lanzar la interfaz
interface.launch() |