Spaces:
Running
Running
File size: 7,689 Bytes
699a68e d91c001 699a68e 3076d04 699a68e 3076d04 699a68e 3076d04 699a68e d91c001 699a68e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
import streamlit as st
from rag_graph import rag_graph, vectorstore
from langchain_core.messages import HumanMessage
import pandas as pd
import os
from dotenv import load_dotenv
from langchain_experimental.text_splitter import SemanticChunker
import chardet
from PyPDF2 import PdfReader
import io
import re
import logging
from langchain_openai import OpenAIEmbeddings
# --- Logging: mirror records into an in-memory buffer so the UI can show them ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# In-memory sink; the chat view reads this buffer to display the metrics log.
log_stream = io.StringIO()
_buffer_handler = logging.StreamHandler(log_stream)
_buffer_handler.setFormatter(
    logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(_buffer_handler)

# Pull API keys from .env before any OpenAI client is constructed.
load_dotenv()

# --- Streamlit page setup ---
st.set_page_config(
    page_title="RAG Application with RAGAS Metrics",
    page_icon="🤖",
    layout="wide",
)

# Conversation history must survive Streamlit reruns, hence session state.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Shared embeddings client, used by the semantic chunker in process_text().
embeddings = OpenAIEmbeddings()

st.title("🤖 RAG Application with RAGAS Metrics")
def clean_text(text):
    """Normalize raw document text.

    Collapses every run of whitespace into a single space, drops
    non-printable characters, and strips leading/trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    printable_only = ''.join(filter(str.isprintable, collapsed))
    return printable_only.strip()
def split_into_sentences(text):
    """Split text into sentences on ., ! or ? followed by whitespace.

    Empty or whitespace-only fragments are discarded; each sentence is
    returned stripped of surrounding whitespace.
    """
    sentences = []
    for fragment in re.split(r'(?<=[.!?])\s+', text):
        fragment = fragment.strip()
        if fragment:
            sentences.append(fragment)
    return sentences
def _combine_chunks(chunks, max_chunk_size=1000):
    """Greedily pack consecutive chunks into newline-joined paragraphs.

    Sizes count chunk text only (not the joining newlines); a single chunk
    larger than max_chunk_size becomes a paragraph of its own.

    Args:
        chunks: sequence of text chunks, in order.
        max_chunk_size: soft cap, in characters, per combined paragraph.

    Returns:
        List of paragraph strings.
    """
    paragraphs = []
    current = []
    current_size = 0
    for chunk in chunks:
        chunk_size = len(chunk)
        if current_size + chunk_size <= max_chunk_size:
            current.append(chunk)
            current_size += chunk_size
        else:
            # Flush the paragraph in progress (if any) and start a new one.
            if current:
                paragraphs.append("\n".join(current))
            current = [chunk]
            current_size = chunk_size
    if current:
        paragraphs.append("\n".join(current))
    return paragraphs


def process_text(text):
    """Clean raw document text and split it into chunk paragraphs.

    Pipeline: normalize text -> split into sentences -> group sentences
    semantically (embedding similarity) -> greedily merge semantic chunks
    into paragraphs of at most ~1000 characters.

    Returns:
        List of paragraph strings ready for the vectorstore.
    """
    text = clean_text(text)
    sentences = split_into_sentences(text)
    # Semantic chunking groups adjacent sentences by embedding similarity.
    text_splitter = SemanticChunker(
        embeddings=embeddings,
        breakpoint_threshold_type="percentile",
        breakpoint_threshold_amount=25  # Lower threshold for more semantic grouping
    )
    sentence_chunks = text_splitter.split_text("\n".join(sentences))
    # Then combine the semantic chunks into size-bounded paragraphs.
    return _combine_chunks(sentence_chunks)
def extract_text_from_pdf(pdf_file):
    """Extract the concatenated page text from a PDF file-like object.

    Args:
        pdf_file: binary file-like object containing the PDF.

    Returns:
        Cleaned (whitespace-normalized, printable-only) extracted text.

    Raises:
        ValueError: if the PDF cannot be parsed, or no text could be
            extracted from any page.
    """
    try:
        pdf_reader = PdfReader(pdf_file)
        # extract_text() may return None/"" for image-only pages; skip those.
        pages = [page.extract_text() or "" for page in pdf_reader.pages]
    except Exception as e:
        # Chain the parser error so the root cause stays visible in tracebacks.
        raise ValueError(f"Error extracting text from PDF: {str(e)}") from e
    text = "".join(page_text + "\n" for page_text in pages if page_text)
    # Raised outside the try: previously this ValueError was caught by the
    # except above and double-wrapped into the "Error extracting..." message.
    if not text.strip():
        raise ValueError("No text could be extracted from the PDF")
    return clean_text(text)
# Sidebar: document upload and ingestion into the vectorstore.
with st.sidebar:
    st.header("Document Management")
    uploaded_file = st.file_uploader("Upload a document (max 10MB)", type=["txt", "pdf"])
    if uploaded_file:
        try:
            # Reject oversized uploads up front (10MB = 10 * 1024 * 1024 bytes).
            if uploaded_file.size > 10 * 1024 * 1024:
                st.error("File size exceeds 10MB limit. Please upload a smaller file.")
            else:
                logger.info(f"Processing uploaded file: {uploaded_file.name}")
                # Process the document based on file type.
                if uploaded_file.type == "application/pdf":
                    text = extract_text_from_pdf(uploaded_file)
                else:
                    # Text files: sniff the encoding. chardet can return
                    # {'encoding': None} for ambiguous bytes, and
                    # bytes.decode(None) raises TypeError — fall back to UTF-8.
                    raw_data = uploaded_file.getvalue()
                    result = chardet.detect(raw_data)
                    encoding = result['encoding'] or 'utf-8'
                    text = raw_data.decode(encoding)
                if not text.strip():
                    raise ValueError("No text content found in the document")
                # Split into semantic, size-bounded chunks for retrieval.
                chunks = process_text(text)
                if not chunks:
                    raise ValueError("No valid text chunks could be created from the document")
                # Index the chunks for retrieval.
                logger.info(f"Adding {len(chunks)} chunks to vectorstore")
                vectorstore.add_texts(chunks)
                st.success("Document processed and added to the knowledge base!")
                st.info(f"Processed {len(chunks)} chunks of text")
        except Exception as e:
            # Surface the failure both in logs and in the sidebar UI.
            logger.error(f"Error processing document: {str(e)}")
            st.error(f"Error processing document: {str(e)}")
# Main chat interface
st.header("Chat with the RAG System")
# Replay stored conversation history (session state survives Streamlit reruns).
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
# Chat input
default_question = "What is quantum computing?"
question = st.text_input("Ask a question", value=default_question)
# NOTE(review): this condition fires on EVERY Streamlit rerun once the
# question differs from the default — not only when Submit is clicked — so
# any widget interaction re-runs the RAG pipeline and appends duplicate
# history entries. Consider gating on the button alone, or tracking the
# last submitted question in session state. TODO confirm intended UX.
if st.button("Submit") or question != default_question:
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": question})
    # Display user message
    with st.chat_message("user"):
        st.markdown(question)
    # Prepare the state for the RAG graph; "next" directs the graph to begin
    # at its retrieval step.
    state = {
        "messages": [HumanMessage(content=question)],
        "context": "",  # Initialize empty context
        "response": "",  # Initialize empty response
        "next": "retrieve"  # Start with retrieval
    }
    # Run the RAG graph
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            try:
                logger.info("Starting RAG process")
                result = rag_graph.invoke(state)
                logger.info("RAG process completed")
                # Display the response and metrics
                st.markdown(result["response"])
                # Display the raw metrics dictionary
                if "metrics" in result and result["metrics"]:
                    st.markdown("---")  # Add a separator
                    st.subheader("RAGAS Metrics")
                    st.write("Raw metrics dictionary:")
                    st.json(result["metrics"])
                    # Show only the tail of the captured log buffer, after the
                    # last "RAGAS metrics calculated:" marker — presumably
                    # emitted by rag_graph; verify against rag_graph's logging.
                    metrics_log = log_stream.getvalue()
                    if "RAGAS metrics calculated" in metrics_log:
                        st.markdown("---")
                        st.subheader("Metrics Calculation Log")
                        st.code(metrics_log.split("RAGAS metrics calculated:")[-1].strip())
                else:
                    st.warning("No metrics available for this response")
                    st.write("Debug - Full result dictionary:")
                    st.json(result)
                # Add assistant response to chat history
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": result["response"]
                })
            except Exception as e:
                # Surface the failure both in logs and inline in the chat.
                logger.error(f"Error in RAG process: {str(e)}")
                st.error(f"Error generating response: {str(e)}")