File size: 3,190 Bytes
75f4cb1
 
 
 
a635093
5d4742e
75f4cb1
 
5d4742e
108508b
75f4cb1
108508b
5d4742e
 
a635093
5d4742e
75f4cb1
 
 
 
a635093
5d4742e
ce3afb2
5d4742e
ce3afb2
5d4742e
ce3afb2
5d4742e
75f4cb1
a635093
75f4cb1
 
 
 
 
a635093
75f4cb1
 
 
 
108508b
75f4cb1
 
108508b
5d4742e
75f4cb1
 
108508b
75f4cb1
 
 
 
108508b
 
 
75f4cb1
 
 
 
 
 
 
108508b
75f4cb1
 
5d4742e
75f4cb1
 
 
 
 
 
 
 
ce3afb2
75f4cb1
 
 
ce3afb2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
import requests
from rerankers import Reranker
from langchain_huggingface import HuggingFacePipeline
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

# Configure USER_AGENT so outgoing HTTP requests identify this Space.
os.environ["USER_AGENT"] = "MyHFSpace/1.0 (HuggingFace Space for Nutrition Q&A)"

# Step 1: Download the source PDF (Spanish clinical nutrition manual).
URL = "https://gruposdetrabajo.sefh.es/gefp/images/stories/documentos/4-ATENCION-FARMACEUTICA/Nutricion/Manual_basico_N_clinica_y_Dietetica_Valencia_2012.pdf"
response = requests.get(URL)
# NOTE(review): the HTTP status is not checked; a failed download would write an
# error page to disk — consider response.raise_for_status() before saving.
with open("Manual_de_nutrici贸n_clinica.pdf", "wb") as f:
    f.write(response.content)

# Step 2: Initialize the LLM through a Hugging Face pipeline.
# gpt2 is used instead of distilbert-base-uncased.
llm = HuggingFacePipeline.from_model_id(
    model_id="gpt2",  # gpt2 is compatible with AutoModelForCausalLM (text generation)
    task="text-generation",
    pipeline_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95, "max_length": 512}
)
# Bare LLM chain; note the RAG path below builds its own prompt|llm|parser chain.
chain = llm | StrOutputParser()

# Load the downloaded PDF and split it into ~1000-char chunks with 20-char overlap.
loader = PyPDFLoader("Manual_de_nutrici贸n_clinica.pdf")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

# Build multilingual sentence embeddings and a persistent Chroma vector store.
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")

# Initialize the ColBERT-style reranker used to pick the best retrieved chunk.
ranker = Reranker("answerdotai/answerai-colbert-small-v1", model_type='colbert')

# Step 3: Define the RAG helpers with reranking.
def format_docs(docs):
    """Join the page contents of (document, score) pairs with blank lines."""
    parts = []
    for pair in docs:
        parts.append(pair[0].page_content)
    return "\n\n".join(parts)

def test_rag_reranking(query, ranker):
    """Answer *query* via retrieval + reranking over the global vector store.

    Retrieves scored chunks from ``vectordb``, keeps those whose distance score
    is below 7, reranks the survivors, and feeds only the top-ranked chunk to
    the LLM chain.  Returns the generated answer, or a fixed Spanish fallback
    message when no chunk passes the score filter.
    """
    docs = vectordb.similarity_search_with_score(query)
    # NOTE(review): hub.pull hits the network on every call — consider hoisting
    # the prompt (and the chain) to module level if latency matters.
    prompt = hub.pull("rlm/rag-prompt")
    rag_chain = prompt | llm | StrOutputParser()
    # similarity_search_with_score yields (document, distance) pairs; keep only
    # sufficiently close chunks.  doc.page_content replaces the original
    # doc.to_json()['kwargs']['page_content'] round-trip — same value, direct.
    context = [doc.page_content for doc, score in docs if score < 7]
    if context:
        ranking = ranker.rank(query=query, docs=context)
        useful_context = ranking[0].text  # best-ranked chunk only
        return rag_chain.invoke({"context": useful_context, "question": query})
    return "No tengo informaci贸n para responder a esta pregunta"

# Step 4: Build the Gradio interface.
def answer_query(query):
    """Gradio callback: delegate the user's question to the RAG pipeline."""
    response = test_rag_reranking(query, ranker)
    return response

# Wire the callback into a simple one-textbox-in / one-textbox-out UI.
interface = gr.Interface(
    fn=answer_query,
    inputs=gr.Textbox(label="Ingresa tu pregunta sobre nutrici贸n:"),
    outputs=gr.Textbox(label="Respuesta:"),
    title="Respuesta a Preguntas sobre Nutrici贸n",
    description="Haz preguntas sobre nutrici贸n basadas en el Manual B谩sico de Nutrici贸n Cl铆nica y Diet茅tica."
)

# Launch the web interface (blocks until the server stops).
interface.launch()