import streamlit as st
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

# Point the Hugging Face cache at a writable folder.
# NOTE(review): huggingface_hub is already imported above; confirm the library
# still honours this value, since it may read the variable at import time.
os.environ.update(HF_HOME="/tmp/hf_cache")

# Download the GGUF model and load it with llama.cpp
@st.cache_resource
def cargar_modelo():
    """Download the GGUF weights from the Hugging Face Hub and load them.

    Decorated with ``st.cache_resource`` so that Streamlit reuses the loaded
    model across script reruns instead of downloading/loading it again.

    Returns:
        Llama: the model loaded from the downloaded file, configured with a
        2048-token context window and 4 threads.
    """
    # huggingface_hub resolves its cache location when it is imported, so an
    # HF_HOME value assigned after the import is not picked up by
    # hf_hub_download. Pass the cache directory explicitly, following the
    # library's standard ``$HF_HOME/hub`` layout.
    cache_dir = os.path.join(os.environ.get("HF_HOME", "/tmp/hf_cache"), "hub")
    model_path = hf_hub_download(
        repo_id="unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF",
        filename="Llama-4-Scout-17B-16E-Instruct-UD-IQ1_S.gguf",
        cache_dir=cache_dir,
    )
    return Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=4,
    )

# Load the model (cached by Streamlit) before building the UI
llm = cargar_modelo()

# App title
st.title("🦙 Llama-4 Scout con llama.cpp (CPU)")

# User input
prompt = st.text_area("Introduce tu prompt aquí:")

# Generate a reply when the button is pressed
if st.button("Generar Respuesta"):
    if not prompt.strip():
        # An empty or whitespace-only message gives the model nothing to
        # answer; ask for input instead of running a pointless generation.
        st.warning("Introduce un prompt antes de generar una respuesta.")
    else:
        # Keep the spinner visible only while the model is generating.
        with st.spinner("Generando respuesta..."):
            output = llm.create_chat_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=200,
            )
        # The reply text is read from the first choice of the completion.
        respuesta = output["choices"][0]["message"]["content"]
        st.markdown(f"### Respuesta:\n{respuesta}")