File size: 7,689 Bytes
699a68e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d91c001
 
 
 
 
 
699a68e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3076d04
699a68e
 
3076d04
 
 
699a68e
3076d04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
699a68e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d91c001
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
699a68e
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import streamlit as st
from rag_graph import rag_graph, vectorstore
from langchain_core.messages import HumanMessage
import pandas as pd
import os
from dotenv import load_dotenv
from langchain_experimental.text_splitter import SemanticChunker
import chardet
from PyPDF2 import PdfReader
import io
import re
import logging
from langchain_openai import OpenAIEmbeddings

# Configure logging: INFO level for the module-level logger used throughout
# this script (upload processing, RAG invocation, error reporting).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create a string buffer to capture logs.
# The chat UI later reads this buffer (log_stream.getvalue()) to surface the
# "RAGAS metrics calculated" portion of the log to the user.
log_stream = io.StringIO()
handler = logging.StreamHandler(log_stream)
handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)

# Load environment variables from a .env file (presumably the OpenAI API key
# consumed by OpenAIEmbeddings below — TODO confirm against deployment docs).
load_dotenv()

# Set page config.
# NOTE(review): Streamlit documents that set_page_config should be the first
# Streamlit command executed — keep it ahead of any st.* UI calls.
st.set_page_config(
    page_title="RAG Application with RAGAS Metrics",
    page_icon="🤖",
    layout="wide"
)

# Initialize session state.
# `messages` holds the chat history as {"role": ..., "content": ...} dicts,
# persisted across Streamlit reruns.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Initialize embeddings (shared by the SemanticChunker in process_text).
embeddings = OpenAIEmbeddings()

# Title
st.title("🤖 RAG Application with RAGAS Metrics")

def clean_text(text):
    """Normalize raw document text.

    Collapses every run of whitespace into a single space, drops
    non-printable characters, and trims surrounding whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    printable_only = ''.join(ch for ch in collapsed if ch.isprintable())
    return printable_only.strip()

def split_into_sentences(text):
    """Split *text* into sentences at whitespace following '.', '!' or '?'.

    Empty/whitespace-only fragments are discarded; each sentence is stripped.
    """
    stripped = (piece.strip() for piece in re.split(r'(?<=[.!?])\s+', text))
    return [piece for piece in stripped if piece]

def process_text(text):
    """Clean *text* and break it into semantically grouped paragraphs.

    Pipeline: normalize the text, split it into sentences, chunk the
    sentences with SemanticChunker, then greedily pack consecutive chunks
    into paragraphs of at most 1000 characters each.

    Returns a list of paragraph strings (chunks joined by newlines).
    """
    cleaned = clean_text(text)
    sentences = split_into_sentences(cleaned)

    # Semantic chunker over the module-level embeddings; a lower percentile
    # threshold yields more semantic grouping.
    chunker = SemanticChunker(
        embeddings=embeddings,
        breakpoint_threshold_type="percentile",
        breakpoint_threshold_amount=25
    )
    semantic_chunks = chunker.split_text("\n".join(sentences))

    # Greedily pack chunks into paragraphs capped at max_size characters.
    max_size = 1000
    paragraphs = []
    pending = []
    pending_len = 0
    for piece in semantic_chunks:
        piece_len = len(piece)
        if pending_len + piece_len > max_size:
            # Flush the current paragraph and start a new one with this piece.
            if pending:
                paragraphs.append("\n".join(pending))
            pending = [piece]
            pending_len = piece_len
        else:
            pending.append(piece)
            pending_len += piece_len

    if pending:
        paragraphs.append("\n".join(pending))

    return paragraphs

def extract_text_from_pdf(pdf_file):
    """Extract and clean all page text from a PDF file-like object.

    Args:
        pdf_file: A binary file-like object (or path) accepted by PdfReader.

    Returns:
        The cleaned, concatenated text of every page that yielded text.

    Raises:
        ValueError: If the PDF cannot be parsed, or if parsing succeeds but
            no text could be extracted.
    """
    try:
        pdf_reader = PdfReader(pdf_file)
        text = ""
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:  # Only add if text was extracted
                text += page_text + "\n"
    except Exception as e:
        # Chain the original cause so the traceback stays useful.
        raise ValueError(f"Error extracting text from PDF: {str(e)}") from e

    # BUGFIX: this check used to live inside the try block, so its own
    # ValueError was caught and re-wrapped as "Error extracting text from
    # PDF: No text could be extracted...". Raising it outside the try keeps
    # the message intact.
    if not text.strip():
        raise ValueError("No text could be extracted from the PDF")

    return clean_text(text)

# Sidebar for document upload: accepts a .txt or .pdf file (max 10MB),
# extracts its text, splits it into semantic chunks, and indexes the chunks
# in the shared vectorstore.
with st.sidebar:
    st.header("Document Management")
    uploaded_file = st.file_uploader("Upload a document (max 10MB)", type=["txt", "pdf"])
    if uploaded_file:
        try:
            # Check file size (10MB = 10 * 1024 * 1024 bytes)
            if uploaded_file.size > 10 * 1024 * 1024:
                st.error("File size exceeds 10MB limit. Please upload a smaller file.")
            else:
                logger.info(f"Processing uploaded file: {uploaded_file.name}")
                # Process the document based on file type
                if uploaded_file.type == "application/pdf":
                    text = extract_text_from_pdf(uploaded_file)
                else:
                    # For text files, detect the encoding with chardet.
                    # BUGFIX: chardet.detect() returns {'encoding': None} for
                    # undetectable content, and decode(None) raises a confusing
                    # TypeError — fall back to UTF-8 in that case.
                    raw_data = uploaded_file.getvalue()
                    detection = chardet.detect(raw_data)
                    encoding = detection['encoding'] or 'utf-8'
                    text = raw_data.decode(encoding)

                if not text.strip():
                    raise ValueError("No text content found in the document")

                # Process text into semantic chunks
                chunks = process_text(text)

                if not chunks:
                    raise ValueError("No valid text chunks could be created from the document")

                # Add to vectorstore
                logger.info(f"Adding {len(chunks)} chunks to vectorstore")
                vectorstore.add_texts(chunks)

                st.success("Document processed and added to the knowledge base!")
                st.info(f"Processed {len(chunks)} chunks of text")
        except Exception as e:
            # Surface any processing failure to the user instead of crashing
            # the Streamlit script run.
            logger.error(f"Error processing document: {str(e)}")
            st.error(f"Error processing document: {str(e)}")

# Main chat interface: renders the chat history, accepts a question, runs it
# through the RAG graph, and displays the response plus RAGAS metrics.
st.header("Chat with the RAG System")

# Display chat messages accumulated in session state across reruns.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Chat input
default_question = "What is quantum computing?"
question = st.text_input("Ask a question", value=default_question)
# NOTE(review): because Streamlit re-executes the whole script on every
# interaction, `question != default_question` is True on EVERY rerun once the
# user edits the text, so this branch re-invokes the RAG graph and appends
# duplicate entries to the history without the button being pressed. Consider
# st.chat_input or gating on the button alone — TODO confirm intended UX.
if st.button("Submit") or question != default_question:
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": question})
    
    # Display user message
    with st.chat_message("user"):
        st.markdown(question)
    
    # Prepare the state for the RAG graph. The graph is expected to fill in
    # "context", "response", and (optionally) "metrics".
    state = {
        "messages": [HumanMessage(content=question)],
        "context": "",  # Initialize empty context
        "response": "",  # Initialize empty response
        "next": "retrieve"  # Start with retrieval
    }
    
    # Run the RAG graph
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            try:
                logger.info("Starting RAG process")
                result = rag_graph.invoke(state)
                logger.info("RAG process completed")
                
                # Display the response and metrics
                st.markdown(result["response"])
                
                # Display the raw metrics dictionary, if the graph produced one.
                if "metrics" in result and result["metrics"]:
                    st.markdown("---")  # Add a separator
                    st.subheader("RAGAS Metrics")
                    st.write("Raw metrics dictionary:")
                    st.json(result["metrics"])
                    
                    # Display the metrics calculation log captured by the
                    # in-memory log handler configured at the top of the file.
                    metrics_log = log_stream.getvalue()
                    if "RAGAS metrics calculated" in metrics_log:
                        st.markdown("---")
                        st.subheader("Metrics Calculation Log")
                        # Show only the text after the last occurrence of the marker.
                        st.code(metrics_log.split("RAGAS metrics calculated:")[-1].strip())
                else:
                    st.warning("No metrics available for this response")
                    st.write("Debug - Full result dictionary:")
                    st.json(result)
                
                # Add assistant response to chat history
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": result["response"]
                })
            except Exception as e:
                # Report the failure in the UI rather than aborting the run.
                logger.error(f"Error in RAG process: {str(e)}")
                st.error(f"Error generating response: {str(e)}")