import streamlit as st
import os
import importlib
import sys
from langchain_openai import OpenAI
import re
import json
from algos.PWS import *
from utils.util import *
from nodes.Worker import *
from prompts import fewshots
# Load API keys
# with open(os.path.join('./keys/', 'openai.key'), 'r') as f:
# os.environ["OPENAI_API_KEY"] = f.read().strip()
# with open(os.path.join('./keys/', 'serpapi.key'), 'r') as f:
# os.environ["SERPAPI_API_KEY"] = f.read().strip()
def reload_modules():
"""Reload all relevant modules"""
importlib.reload(sys.modules['nodes.Worker'])
importlib.reload(sys.modules['algos.PWS'])
importlib.reload(sys.modules['utils.util'])
importlib.reload(sys.modules['prompts.fewshots'])
importlib.reload(sys.modules['prompts.solver'])
return "✅ Modules reloaded successfully!"
def process(tools, model, input_text):
# Use study abroad fewshot for study-related questions
if any(word in input_text.lower() for word in
["study", "student", "university", "college", "school", "abroad", "học", "trường", "du học", "học bổng",
"gpa", "ielts", "tcf", "delf", "scholarship"]):
        # Ensure an LLM and at least one search engine (Google or Duckduckgo) are selected for study-abroad queries
        print(tools)
        assert ("LLM" in tools) and ("Google" in tools or "Duckduckgo" in tools)
method = PWS_Base(planner_model=model, solver_model=model,
fewshot=fewshots.STUDY_ABROAD_PWS, available_tools=tools)
else:
method = PWS_Base(planner_model=model, solver_model=model,
fewshot=fewshots.TRIVIAQA_PWS, available_tools=tools)
response = method.run(input_text)
# Extract planner log
plan = response["planner_log"].split(input_text)[1].strip('\n')
# Extract full solver log without truncating at "Now begin to solve the task"
solve = response["solver_log"].split(input_text)[1].strip('\n')
# Get the complete output
output = response["output"]
return plan, solve, output
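# Illustrative use (assumes valid OpenAI / search API keys are configured):
#   plan, solve, output = process(["Google", "LLM"], "gpt-3.5-turbo", "What is ReWOO about?")
# `plan` and `solve` are the planner/solver logs shown in the UI expanders; `output` is the final answer.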
def evaluate(response, plan, solve):
"""
Evaluate whether the response is based on evidence or contains hallucinations.
Args:
response: The assistant's full response
plan: The planning process
solve: The solving process with evidence
Returns:
Dictionary with reasoning, summary and evaluation status
"""
# Initialize OpenAI client
llm = OpenAI(temperature=0)
# Extract only evidence paragraphs from solve
evidence_blocks = []
for block in solve.split("\n\n"):
if "Evidence:" in block:
evidence_part = block.split("Evidence:", 1)[1].strip()
if evidence_part:
evidence_blocks.append(evidence_part)
# Combine evidence sources
evidence = "\n\n".join(evidence_blocks)
if not evidence:
evidence = solve # Fallback to using entire solve text if no evidence found
# Create prompt for evaluation
prompt = f"""
Evaluate whether the following response is factually supported by the provided evidence.
Response to evaluate:
{response}
Evidence:
{evidence}
Provide your evaluation in this format:
REASONING: Detailed analysis comparing the response against the evidence
SUMMARY: Brief summary of the evaluation
VERDICT: [SUPPORTED/PARTIALLY SUPPORTED/UNSUPPORTED] - Choose one verdict
"""
try:
result_text = llm.invoke(prompt).strip()
# Parse the structured output
reasoning = ""
summary = ""
verdict = "UNSUPPORTED"
if "REASONING:" in result_text:
parts = result_text.split("REASONING:", 1)
remainder = parts[1]
if "SUMMARY:" in remainder:
reasoning, remainder = remainder.split("SUMMARY:", 1)
if "VERDICT:" in remainder:
summary, verdict = remainder.split("VERDICT:", 1)
reasoning = reasoning.strip()
summary = summary.strip()
verdict = verdict.strip()
# Determine verdict category
verdict_category = "unsupported"
if "SUPPORTED" in verdict and not "PARTIALLY" in verdict and not "UNSUPPORTED" in verdict:
verdict_category = "supported"
elif "PARTIALLY" in verdict:
verdict_category = "partially_supported"
return {
"reasoning": reasoning,
"summary": summary,
"verdict": verdict,
"verdict_category": verdict_category
}
except Exception as e:
return {
"reasoning": f"Error during evaluation: {str(e)}",
"summary": "Could not complete evaluation",
"verdict": "EVALUATION FAILED",
"verdict_category": "error"
}
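# Illustrative use:
#   results = evaluate(output, plan, solve)
#   results["verdict_category"] is one of "supported", "partially_supported", "unsupported", or "error".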
# Main app
st.set_page_config(page_title="ReWOO Demo", layout="wide")
st.title("ReWOO Demo 🤗")
st.markdown("""
Demonstrating our recent work -- ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models.
Note that this demo is only a conceptual impression of our work; we use a zero-shot setup and do not optimize for run time.
""")
# Initialize session state
if 'messages' not in st.session_state:
st.session_state.messages = []
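# Message schema: user turns store {"role", "content"}; assistant turns additionally store
# "plan", "solve", and (once the Evaluate button is used) "evaluation_results".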
# Sidebar
with st.sidebar:
st.header("Configuration")
# Tools selection
tools = st.multiselect(
"Select Tools",
options=['Wikipedia', 'Google', 'LLM', 'WolframAlpha', 'Calculator', 'Duckduckgo'],
default=['Duckduckgo', 'LLM']
)
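    # These names are assumed to correspond to workers defined in nodes.Worker;
    # the selection is passed straight through to PWS_Base as available_tools.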
# Model selection
model = st.selectbox(
"Select Model",
options=["text-davinci-003", "gpt-3.5-turbo"],
index=1
)
# Refresh modules button
if st.button("🔄 Refresh Modules"):
status = reload_modules()
st.success(status)
# Examples section
st.header("Examples")
if st.button("Example 1: American Callan Pinckney's system"):
example_text = "American Callan Pinckney's eponymously named system became a best-selling (1980s-2000s) book/video franchise in what genre?"
st.session_state.messages.append({"role": "user", "content": example_text})
with st.spinner('Processing...'):
plan, solve, output = process(["Wikipedia", "LLM"], "gpt-3.5-turbo", example_text)
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})
if st.button("Example 2: ReWOO paper"):
example_text = "What is the recent paper ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models about?"
st.session_state.messages.append({"role": "user", "content": example_text})
with st.spinner('Processing...'):
plan, solve, output = process(["Google", "LLM"], "gpt-3.5-turbo", example_text)
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})
if st.button("Example 3: Car acceleration"):
example_text = "the car can accelerate from 0 to 27.8 m/s in a time of 3.85 seconds. Determine the acceleration of this car in m/s/s."
st.session_state.messages.append({"role": "user", "content": example_text})
with st.spinner('Processing...'):
plan, solve, output = process(["Calculator", "WolframAlpha"], "gpt-3.5-turbo", example_text)
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})
# Display chat history
for i, message in enumerate(st.session_state.messages):
if message["role"] == "user":
st.chat_message("user").write(message["content"])
else:
with st.chat_message("assistant"):
st.write(message["content"])
with st.expander("Show reasoning process"):
st.subheader("Planner")
st.text(message["plan"])
st.subheader("Solver")
st.text(message["solve"])
# Add evaluate button in the expander
if "evaluation_results" not in message:
if st.button("🔍 Evaluate", key=f"eval_btn_{i}", type="secondary"):
with st.spinner("Evaluating response..."):
results = evaluate(message["content"], message["plan"], message["solve"])
st.session_state.messages[i]["evaluation_results"] = results
st.rerun()
else:
# Show evaluation in an expander
with st.expander("Evaluation Results"):
results = message["evaluation_results"]
# Display verdict with color
verdict_color = "red"
if results["verdict_category"] == "supported":
verdict_color = "green"
elif results["verdict_category"] == "partially_supported":
verdict_color = "orange"
st.markdown(f"<h4 style='color: {verdict_color};'>{results['verdict']}</h4>",
unsafe_allow_html=True)
st.subheader("Summary")
st.write(results["summary"])
st.subheader("Detailed Analysis")
st.write(results["reasoning"])
# User input
if prompt := st.chat_input("Ask something..."):
st.session_state.messages.append({"role": "user", "content": prompt})
st.chat_message("user").write(prompt)
with st.chat_message("assistant"):
with st.spinner('Researching...'):
plan, solve, output = process(tools, model, prompt)
st.write(output)
with st.expander("Show research process"):
st.subheader("Planner")
st.text(plan)
st.subheader("Solver")
st.text(solve)
# Add evaluate button in expander for current response
if st.button("🔍 Evaluate", key="eval_current", type="secondary"):
with st.spinner("Evaluating response..."):
results = evaluate(output, plan, solve)
# Show evaluation in an expander
with st.expander("Evaluation Results"):
# Display verdict with color
verdict_color = "red"
if results["verdict_category"] == "supported":
verdict_color = "green"
elif results["verdict_category"] == "partially_supported":
verdict_color = "orange"
st.markdown(f"<h4 style='color: {verdict_color};'>{results['verdict']}</h4>",
unsafe_allow_html=True)
st.subheader("Summary")
st.write(results["summary"])
st.subheader("Detailed Analysis")
st.write(results["reasoning"])
# Store evaluation results
for i in range(len(st.session_state.messages)):
if st.session_state.messages[i]["role"] == "assistant" and \
st.session_state.messages[i]["content"] == output:
st.session_state.messages[i]["evaluation_results"] = results
break
# Add assistant response to chat history
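    # The new message has no "evaluation_results" yet, so the chat-history loop above
    # will offer its own Evaluate button for it on subsequent reruns.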
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})
# Clear chat button
if st.sidebar.button("Clear Chat"):
st.session_state.messages = []
    st.rerun()