# Hugging Face Space: brandonmai/duhoc-rewoo-agent (status when captured: Sleeping)
# File size: 11,538 Bytes
import streamlit as st
import os
import importlib
import sys
from langchain_openai import OpenAI
import re
import json

from algos.PWS import *
from utils.util import *
from nodes.Worker import *
from prompts import fewshots

# Load API keys
# with open(os.path.join('./keys/', 'openai.key'), 'r') as f:
#     os.environ["OPENAI_API_KEY"] = f.read().strip()
# with open(os.path.join('./keys/', 'serpapi.key'), 'r') as f:
#     os.environ["SERPAPI_API_KEY"] = f.read().strip()


def reload_modules():
    """Re-execute the project modules that are already loaded in this process.

    Every name below is looked up in ``sys.modules`` and handed to
    ``importlib.reload`` in the listed order. A module that has not been
    imported yet makes the lookup raise ``KeyError``.

    Returns:
        A short success message for display.
    """
    reload_order = (
        'nodes.Worker',
        'algos.PWS',
        'utils.util',
        'prompts.fewshots',
        'prompts.solver',
    )
    for module_name in reload_order:
        loaded_module = sys.modules[module_name]
        importlib.reload(loaded_module)
    return "✅ Modules reloaded successfully!"


# Substrings (matched against the lower-cased query) that route a question to
# the study-abroad few-shot prompt.
_STUDY_KEYWORDS = (
    "study", "student", "university", "college", "school", "abroad", "học", "trường", "du học", "học bổng",
    "gpa", "ielts", "tcf", "delf", "scholarship",
)

# Study-abroad queries need at least one of these web-search tools enabled.
_SEARCH_TOOLS = ("Google", "Duckduckgo")


def _segment_after_query(log, query):
    """Return the part of *log* that follows the first occurrence of *query*.

    The segment ends at the next occurrence of *query* (if any) and has
    surrounding newlines stripped. When *query* is empty or does not occur
    in *log*, the whole log is returned instead of raising.
    """
    if not query:
        # str.split("") raises ValueError; there is nothing to cut on.
        return log.strip('\n')
    pieces = log.split(query)
    if len(pieces) < 2:
        return log.strip('\n')
    return pieces[1].strip('\n')


def process(tools, model, input_text):
    """Run the planner/solver pipeline on one question.

    Questions containing any study-abroad keyword (substring match, case
    insensitive) use the study-abroad few-shot prompt and require the
    ``"LLM"`` tool plus at least one web-search tool; everything else uses
    the TriviaQA few-shot prompt with whatever tools were selected.

    Args:
        tools: Names of the tools made available to the planner.
        model: Model name used for both the planner and the solver.
        input_text: The user's question.

    Returns:
        A ``(plan, solve, output)`` tuple: the planner log and solver log,
        each trimmed to the text following the question, and the final
        answer taken from the response's ``"output"`` entry.

    Raises:
        AssertionError: If a study-abroad question is asked without the
            ``"LLM"`` tool or without ``"Google"``/``"Duckduckgo"``.
    """
    lowered = input_text.lower()
    if any(keyword in lowered for keyword in _STUDY_KEYWORDS):
        # The previous check, `"Google" or "Duckduckgo" in tools`, always
        # evaluated to the truthy string "Google", so a missing search tool
        # was never detected. Raised explicitly so it also works under -O.
        has_search_tool = any(name in tools for name in _SEARCH_TOOLS)
        if "LLM" not in tools or not has_search_tool:
            raise AssertionError(
                "Study-abroad queries need the 'LLM' tool plus 'Google' or "
                f"'Duckduckgo'; got {list(tools)!r}"
            )
        fewshot = fewshots.STUDY_ABROAD_PWS
    else:
        fewshot = fewshots.TRIVIAQA_PWS

    method = PWS_Base(planner_model=model, solver_model=model,
                      fewshot=fewshot, available_tools=tools)
    response = method.run(input_text)

    # Both logs are cut at the question; a log that does not contain the
    # question is shown in full rather than crashing.
    plan = _segment_after_query(response["planner_log"], input_text)
    solve = _segment_after_query(response["solver_log"], input_text)

    return plan, solve, response["output"]


# Section labels the evaluation prompt asks the model to emit.
_EVALUATION_HEADERS = ("REASONING:", "SUMMARY:", "VERDICT:")


def _collect_evidence(solve):
    """Join the text after each "Evidence:" marker found in the blank-line-separated blocks of *solve*."""
    snippets = []
    for block in solve.split("\n\n"):
        _, marker, tail = block.partition("Evidence:")
        tail = tail.strip()
        if marker and tail:
            snippets.append(tail)
    return "\n\n".join(snippets)


def _parse_evaluation(result_text):
    """Map each header present in *result_text* to the stripped text that follows it.

    Headers are located independently, so a reply that omits one label
    still yields the sections it does contain.
    """
    found = sorted(
        (result_text.find(header), header)
        for header in _EVALUATION_HEADERS
        if header in result_text
    )
    sections = {}
    for index, (start, header) in enumerate(found):
        stop = found[index + 1][0] if index + 1 < len(found) else len(result_text)
        sections[header] = result_text[start + len(header):stop].strip()
    return sections


def _categorize_verdict(verdict):
    """Reduce a free-text verdict to "supported", "partially_supported" or "unsupported"."""
    normalized = verdict.upper()
    if "PARTIALLY" in normalized:
        return "partially_supported"
    # "SUPPORTED" is a substring of both negative forms, so test those first.
    if "UNSUPPORTED" in normalized or "NOT SUPPORTED" in normalized:
        return "unsupported"
    if "SUPPORTED" in normalized:
        return "supported"
    return "unsupported"


def evaluate(response, plan, solve):
    """
    Evaluate whether the response is based on evidence or contains hallucinations.

    Args:
        response: The assistant's full response
        plan: The planning process (accepted for interface compatibility; not used)
        solve: The solving process with evidence

    Returns:
        Dictionary with the keys "reasoning", "summary", "verdict" and
        "verdict_category". The category is one of "supported",
        "partially_supported", "unsupported", or "error" when the model
        could not be created or called. A reply without a VERDICT section
        is reported as "UNSUPPORTED".
    """
    # Fall back to the entire solver text when no evidence section is found.
    evidence = _collect_evidence(solve) or solve

    # Create prompt for evaluation
    prompt = f"""
    Evaluate whether the following response is factually supported by the provided evidence.

    Response to evaluate:
    {response}

    Evidence:
    {evidence}

    Provide your evaluation in this format:
    REASONING: Detailed analysis comparing the response against the evidence
    SUMMARY: Brief summary of the evaluation
    VERDICT: [SUPPORTED/PARTIALLY SUPPORTED/UNSUPPORTED] - Choose one verdict
    """

    try:
        # The client is built inside the error boundary so that a failure to
        # construct it is reported the same way as a failed request instead
        # of escaping to the caller.
        llm = OpenAI(temperature=0)
        result_text = llm.invoke(prompt).strip()
    except Exception as e:
        return {
            "reasoning": f"Error during evaluation: {str(e)}",
            "summary": "Could not complete evaluation",
            "verdict": "EVALUATION FAILED",
            "verdict_category": "error"
        }

    sections = _parse_evaluation(result_text)
    verdict = sections.get("VERDICT:", "UNSUPPORTED")

    return {
        "reasoning": sections.get("REASONING:", ""),
        "summary": sections.get("SUMMARY:", ""),
        "verdict": verdict,
        "verdict_category": _categorize_verdict(verdict)
    }


# Main app: page configuration, heading, and per-session chat storage
_INTRO_MARKDOWN = """
Demonstrating our recent work -- ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models.
Note that this demo is only a conceptual impression of our work, we use a zero-shot set up and not optimizing the run time.
"""

st.set_page_config(layout="wide", page_title="ReWOO Demo")
st.title("ReWOO Demo 🤗")
st.markdown(_INTRO_MARKDOWN)

# The chat transcript lives in session state; create it once per session.
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Sidebar: tool/model configuration, module refresh, and canned examples
with st.sidebar:
    st.header("Configuration")

    # Tools offered to the planner for free-form questions
    tool_choices = ['Wikipedia', 'Google', 'LLM', 'WolframAlpha', 'Calculator', 'Duckduckgo']
    tools = st.multiselect("Select Tools", options=tool_choices, default=['Duckduckgo', 'LLM'])

    # Model used for free-form questions; the second entry is preselected
    model_choices = ["text-davinci-003", "gpt-3.5-turbo"]
    model = st.selectbox("Select Model", options=model_choices, index=1)

    # Re-import the project modules on demand
    refresh_clicked = st.button("🔄 Refresh Modules")
    if refresh_clicked:
        st.success(reload_modules())

    st.header("Examples")

    # Each entry: (button label, tools passed to process, question text).
    # Examples always run with "gpt-3.5-turbo", whatever model is selected above.
    canned_examples = (
        (
            "Example 1: American Callan Pinckney's system",
            ["Wikipedia", "LLM"],
            "American Callan Pinckney's eponymously named system became a best-selling (1980s-2000s) book/video franchise in what genre?",
        ),
        (
            "Example 2: ReWOO paper",
            ["Google", "LLM"],
            "What is the recent paper ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models about?",
        ),
        (
            "Example 3: Car acceleration",
            ["Calculator", "WolframAlpha"],
            "the car can accelerate from 0 to 27.8 m/s in a time of 3.85 seconds. Determine the acceleration of this car in m/s/s.",
        ),
    )
    for button_label, example_tools, example_text in canned_examples:
        if not st.button(button_label):
            continue
        st.session_state.messages.append({"role": "user", "content": example_text})
        with st.spinner('Processing...'):
            plan, solve, output = process(example_tools, "gpt-3.5-turbo", example_text)
            st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})

# Display chat history
def _render_evaluation(results):
    """Show a stored evaluation (verdict, summary, reasoning) in an expander.

    Args:
        results: Dictionary with the keys "verdict", "verdict_category",
            "summary" and "reasoning".
    """
    import html  # local import keeps this block self-contained

    # Any category other than the two listed here is shown in red.
    verdict_colors = {"supported": "green", "partially_supported": "orange"}
    verdict_color = verdict_colors.get(results["verdict_category"], "red")

    with st.expander("Evaluation Results"):
        # The verdict is free text produced by the model and is rendered with
        # unsafe_allow_html=True, so it is escaped to keep stray markup from
        # being interpreted as HTML.
        safe_verdict = html.escape(results["verdict"])
        st.markdown(f"<h4 style='color: {verdict_color};'>{safe_verdict}</h4>",
                    unsafe_allow_html=True)
        st.subheader("Summary")
        st.write(results["summary"])
        st.subheader("Detailed Analysis")
        st.write(results["reasoning"])


for i, message in enumerate(st.session_state.messages):
    if message["role"] == "user":
        st.chat_message("user").write(message["content"])
        continue

    with st.chat_message("assistant"):
        st.write(message["content"])

        with st.expander("Show reasoning process"):
            st.subheader("Planner")
            st.text(message["plan"])
            st.subheader("Solver")
            st.text(message["solve"])

        if "evaluation_results" in message:
            _render_evaluation(message["evaluation_results"])
        # One button per message (below the reasoning expander); the key
        # includes the message index so every button is unique.
        elif st.button("🔍 Evaluate", key=f"eval_btn_{i}", type="secondary"):
            with st.spinner("Evaluating response..."):
                results = evaluate(message["content"], message["plan"], message["solve"])
                st.session_state.messages[i]["evaluation_results"] = results
            # Rerun so the stored results are drawn in place of the button.
            st.rerun()

# User input
if prompt := st.chat_input("Ask something..."):
    st.session_state.messages.append({"role": "user", "content": prompt})
    st.chat_message("user").write(prompt)

    with st.chat_message("assistant"):
        with st.spinner('Researching...'):
            plan, solve, output = process(tools, model, prompt)

    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})

    # Rerun so the answer is drawn by the chat-history loop, which attaches a
    # per-message "Evaluate" button. A button created inside this branch can
    # never fire: clicking it starts a new script run in which chat_input
    # returns nothing, so this branch (and the button's handler) is skipped.
    st.rerun()

# Clear chat button: empty the transcript and redraw the page
clear_requested = st.sidebar.button("Clear Chat")
if clear_requested:
    st.session_state["messages"] = []
    st.rerun()