Spaces:
Running
sixth commit
- .env_template +6 -1
- .streamlit/config.toml +17 -0
- Dockerfile +25 -3
- README.md +10 -2
- Screenshots.md +11 -13
- app/__init__.py +0 -0
- app/data/sqlite.db +0 -0
- app/database.py +219 -67
- app/home.py +68 -0
- app/main.py +0 -430
- app/pages/__init__.py +0 -0
- app/pages/chat_interface.py +361 -0
- app/pages/data_ingestion.py +145 -0
- app/pages/evaluation.py +134 -0
- app/pages/ground_truth.py +100 -0
- app/rag.py +23 -8
- app/utils.py +62 -0
- data/ground-truth-retrieval.csv +10 -0
- data/sqlite.db +0 -0
- docker-compose.yaml +37 -4
- grafana/dashboards/rag_evaluation.json +6 -6
- grafana/provisioning/datasources/sqlite.yaml +2 -1
- image-1.png +0 -0
- image-10.png +0 -0
- image-11.png +0 -0
- image-2.png +0 -0
- image-3.png +0 -0
- image-4.png +0 -0
- image-5.png +0 -0
- image-6.png +0 -0
- image-7.png +0 -0
- image-8.png +0 -0
- image-9.png +0 -0
- image.png +0 -0
- images/image-1.png +0 -0
- images/image-2.png +0 -0
- images/image-3.png +0 -0
- images/image-4.png +0 -0
- images/image-5.png +0 -0
- images/image-6.png +0 -0
- images/image.png +0 -0
.env_template
CHANGED
@@ -1 +1,6 @@
-YOUTUBE_API_KEY='YOUR YOUTUBE_API_KEY'
+YOUTUBE_API_KEY='YOUR YOUTUBE_API_KEY'
+HF_TOKEN='YOUR Hugging Face API KEY'
+OLLAMA_MODEL='Your model'
+OLLAMA_HOST='Your Host Name'
+OLLAMA_TIMEOUT=240
+OLLAMA_MAX_RETRIES=3

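For reference, a minimal sketch of how the app might consume these variables. This assumes the common `python-dotenv` package is used to load the `.env` file; the variable names simply mirror the template keys above.

```python
import os
from dotenv import load_dotenv  # assumes python-dotenv is installed

load_dotenv()  # read key=value pairs from .env into the environment

OLLAMA_MODEL = os.getenv("OLLAMA_MODEL")
OLLAMA_HOST = os.getenv("OLLAMA_HOST")
OLLAMA_TIMEOUT = int(os.getenv("OLLAMA_TIMEOUT", "240"))
OLLAMA_MAX_RETRIES = int(os.getenv("OLLAMA_MAX_RETRIES", "3"))
```
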
.streamlit/config.toml
ADDED
@@ -0,0 +1,17 @@
[browser]
gatherUsageStats = false

[theme]
primaryColor = "#FF4B4B"
backgroundColor = "#FFFFFF"
secondaryBackgroundColor = "#F0F2F6"
textColor = "#262730"

[server]
runOnSave = true
port = 8501
address = "0.0.0.0"

[ui]
hideTopBar = false
hideSidebarNav = false

Dockerfile
CHANGED
@@ -17,15 +17,37 @@ COPY requirements.txt .
 # Install any needed packages specified in requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
-#
+# Create necessary directories
+RUN mkdir -p app/pages config data grafana logs /root/.streamlit
+
+# Set Python path and Streamlit configs
+ENV PYTHONPATH=/app \
+    STREAMLIT_BROWSER_GATHER_USAGE_STATS=false \
+    STREAMLIT_THEME_PRIMARY_COLOR="#FF4B4B" \
+    STREAMLIT_SERVER_PORT=8501 \
+    STREAMLIT_SERVER_ADDRESS=0.0.0.0
+
+# Create empty __init__.py files
+RUN touch app/__init__.py app/pages/__init__.py
+
+# Copy the application code and other files into the container
 COPY app/ ./app/
 COPY config/ ./config/
 COPY data/ ./data/
 COPY grafana/ ./grafana/
 COPY .env ./
+COPY .streamlit/config.toml /root/.streamlit/config.toml
 
 # Make port 8501 available to the world outside this container
 EXPOSE 8501
 
-#
-
+# Create a healthcheck script
+RUN echo '#!/bin/bash\ncurl -f http://localhost:8501/_stcore/health' > /healthcheck.sh && \
+    chmod +x /healthcheck.sh
+
+# Add healthcheck
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD ["/healthcheck.sh"]
+
+# Run Streamlit
+CMD ["streamlit", "run", "app/home.py", "--server.port=8501", "--server.address=0.0.0.0"]

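The healthcheck script curls Streamlit's built-in `/_stcore/health` endpoint. A sketch of the equivalent check in Python, for probing the container from outside (it assumes the `requests` package and that the app is reachable on localhost:8501):

```python
import requests  # assumes the requests package is available

def streamlit_healthy(base_url: str = "http://localhost:8501") -> bool:
    """Return True if Streamlit's health endpoint responds with a 2xx status."""
    try:
        return requests.get(f"{base_url}/_stcore/health", timeout=5).ok
    except requests.RequestException:
        return False

if __name__ == "__main__":
    print("healthy" if streamlit_healthy() else "unhealthy")
```
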
README.md
CHANGED
@@ -61,8 +61,12 @@ The YouTube Assistant project is organized as follows:
 ```
 youtube-rag-app/
 ├── app/
-│   ├──
-│   ├──
+│   ├── home.py
+│   ├── pages/
+│   ├────── chat_interface.py
+│   ├────── data_ingestion.py
+│   ├────── evaluation.py
+│   ├────── ground_truth.py
 │   ├── transcript_extractor.py
 │   ├── data_processor.py
 │   ├── elasticsearch_handler.py
@@ -70,6 +74,7 @@ youtube-rag-app/
 │   ├── rag.py
 │   ├── query_rewriter.py
 │   └── evaluation.py
+│   └── utils.py
 ├── data/
 │   └── sqlite.db
 ├── config/
@@ -129,3 +134,6 @@ I used the LLM as a Judge metric to evaluate the quality of our RAG Flow on my l
 * PARTLY_RELEVANT - 0 (0%)
 * NON RELEVANT - 0 (0%)
 
+### Monitoring
+
+I used Grafana to monitor the metrics, user feedback, evaluation results, and search performance.

Screenshots.md
CHANGED
@@ -1,27 +1,25 @@
 ### Docker deployment
 
-
-
-
-
 
+### Home
+
+
 
+### Ingestion
+
+
 
 ### RAG
 
+
 
 ### Ground Truth Generation
 
-
-
+
 
 ### RAG Evaluation
 
-
+
+
+### Monitoring
 
+

app/__init__.py
ADDED
File without changes

app/data/sqlite.db
ADDED
Binary file (127 kB)

app/database.py
CHANGED
@@ -1,17 +1,43 @@
 import sqlite3
 import os
+import logging
+from datetime import datetime
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 class DatabaseHandler:
     def __init__(self, db_path='data/sqlite.db'):
         self.db_path = db_path
         self.conn = None
+        os.makedirs(os.path.dirname(db_path), exist_ok=True)
         self.create_tables()
         self.update_schema()
+        self.migrate_database()
 
     def create_tables(self):
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
-
+
+            # First, drop the existing user_feedback table if it exists
+            cursor.execute('DROP TABLE IF EXISTS user_feedback')
+
+            # Recreate the user_feedback table with the correct schema
+            cursor.execute('''
+                CREATE TABLE IF NOT EXISTS user_feedback (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    video_id TEXT,
+                    query TEXT,
+                    response TEXT,
+                    feedback INTEGER CHECK (feedback IN (-1, 1)),
+                    timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                    chat_id INTEGER,
+                    FOREIGN KEY (video_id) REFERENCES videos (youtube_id),
+                    FOREIGN KEY (chat_id) REFERENCES chat_history (id)
+                )
+            ''')
+
+            # Videos table
             cursor.execute('''
                 CREATE TABLE IF NOT EXISTS videos (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -27,16 +53,35 @@ class DatabaseHandler:
                     transcript_content TEXT
                 )
             ''')
+
+            # Chat History table
+            cursor.execute('''
+                CREATE TABLE IF NOT EXISTS chat_history (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    video_id TEXT,
+                    user_message TEXT,
+                    assistant_message TEXT,
+                    timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                    FOREIGN KEY (video_id) REFERENCES videos (youtube_id)
+                )
+            ''')
+
+            # User Feedback table
             cursor.execute('''
                 CREATE TABLE IF NOT EXISTS user_feedback (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
-                    video_id
+                    video_id TEXT,
+                    chat_id INTEGER,
                     query TEXT,
-
+                    response TEXT,
+                    feedback INTEGER CHECK (feedback IN (-1, 1)),
                     timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                    FOREIGN KEY (video_id) REFERENCES videos (
+                    FOREIGN KEY (video_id) REFERENCES videos (youtube_id),
+                    FOREIGN KEY (chat_id) REFERENCES chat_history (id)
                 )
             ''')
+
+            # Embedding Models table
             cursor.execute('''
                 CREATE TABLE IF NOT EXISTS embedding_models (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -44,6 +89,8 @@ class DatabaseHandler:
                     description TEXT
                 )
             ''')
+
+            # Elasticsearch Indices table
             cursor.execute('''
                 CREATE TABLE IF NOT EXISTS elasticsearch_indices (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -55,27 +102,31 @@ class DatabaseHandler:
                 )
             ''')
 
-            #
+            # Ground Truth table
             cursor.execute('''
                 CREATE TABLE IF NOT EXISTS ground_truth (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
                     video_id TEXT,
                     question TEXT,
                     generation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                    UNIQUE(video_id, question)
+                    UNIQUE(video_id, question),
+                    FOREIGN KEY (video_id) REFERENCES videos (youtube_id)
                 )
             ''')
 
+            # Search Performance table
             cursor.execute('''
                 CREATE TABLE IF NOT EXISTS search_performance (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
                     video_id TEXT,
                     hit_rate REAL,
                     mrr REAL,
-                    evaluation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                    evaluation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                    FOREIGN KEY (video_id) REFERENCES videos (youtube_id)
                 )
             ''')
 
+            # Search Parameters table
             cursor.execute('''
                 CREATE TABLE IF NOT EXISTS search_parameters (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -83,10 +134,12 @@ class DatabaseHandler:
                     parameter_name TEXT,
                     parameter_value REAL,
                     score REAL,
-                    evaluation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                    evaluation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                    FOREIGN KEY (video_id) REFERENCES videos (youtube_id)
                 )
             ''')
 
+            # RAG Evaluations table
             cursor.execute('''
                 CREATE TABLE IF NOT EXISTS rag_evaluations (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -95,14 +148,18 @@ class DatabaseHandler:
                     answer TEXT,
                     relevance TEXT,
                     explanation TEXT,
-                    evaluation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                    evaluation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                    FOREIGN KEY (video_id) REFERENCES videos (youtube_id)
                 )
             ''')
+
             conn.commit()
 
     def update_schema(self):
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
+
+            # Check and update videos table
             cursor.execute("PRAGMA table_info(videos)")
             columns = [column[1] for column in cursor.fetchall()]
 
@@ -121,36 +178,122 @@ class DatabaseHandler:
 
             conn.commit()
 
+    # Video Management Methods
     def add_video(self, video_data):
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-            '''
-                video_data['title'],
-                video_data['author'],
-                video_data['upload_date'],
-                video_data['view_count'],
-                video_data['like_count'],
-                video_data['comment_count'],
-                video_data['video_duration'],
-                video_data['transcript_content']
-            ))
-            conn.commit()
-            return cursor.lastrowid
-
-
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-                INSERT INTO
-                VALUES (?, ?, ?)
-            ''', (video_id,
-            conn.commit()
+        try:
+            with sqlite3.connect(self.db_path) as conn:
+                cursor = conn.cursor()
+                cursor.execute('''
+                    INSERT OR REPLACE INTO videos
+                    (youtube_id, title, channel_name, upload_date, view_count, like_count,
+                    comment_count, video_duration, transcript_content)
+                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+                ''', (
+                    video_data['video_id'],
+                    video_data['title'],
+                    video_data['author'],
+                    video_data['upload_date'],
+                    video_data['view_count'],
+                    video_data['like_count'],
+                    video_data['comment_count'],
+                    video_data['video_duration'],
+                    video_data['transcript_content']
+                ))
+                conn.commit()
+                return cursor.lastrowid
+        except Exception as e:
+            logger.error(f"Error adding video: {str(e)}")
+            raise
+
+    def get_video_by_youtube_id(self, youtube_id):
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute('SELECT * FROM videos WHERE youtube_id = ?', (youtube_id,))
+            return cursor.fetchone()
+
+    def get_all_videos(self):
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute('''
+                SELECT youtube_id, title, channel_name, upload_date
+                FROM videos
+                ORDER BY upload_date DESC
+            ''')
+            return cursor.fetchall()
+
+    # Chat and Feedback Methods
+    def add_chat_message(self, video_id, user_message, assistant_message):
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute('''
+                INSERT INTO chat_history (video_id, user_message, assistant_message)
+                VALUES (?, ?, ?)
+            ''', (video_id, user_message, assistant_message))
+            conn.commit()
+            return cursor.lastrowid
+
+    def get_chat_history(self, video_id):
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            cursor.execute('''
+                SELECT id, user_message, assistant_message, timestamp
+                FROM chat_history
+                WHERE video_id = ?
+                ORDER BY timestamp ASC
+            ''', (video_id,))
+            return cursor.fetchall()
+
+    def add_user_feedback(self, video_id, chat_id, query, response, feedback):
+        try:
+            with sqlite3.connect(self.db_path) as conn:
+                cursor = conn.cursor()
+
+                # First verify the video exists
+                cursor.execute('SELECT id FROM videos WHERE youtube_id = ?', (video_id,))
+                if not cursor.fetchone():
+                    logger.error(f"Video {video_id} not found in database")
+                    raise ValueError(f"Video {video_id} not found")
+
+                # Then verify the chat message exists if chat_id is provided
+                if chat_id:
+                    cursor.execute('SELECT id FROM chat_history WHERE id = ?', (chat_id,))
+                    if not cursor.fetchone():
+                        logger.error(f"Chat message {chat_id} not found in database")
+                        raise ValueError(f"Chat message {chat_id} not found")
+
+                # Insert the feedback
+                cursor.execute('''
+                    INSERT INTO user_feedback
+                    (video_id, chat_id, query, response, feedback)
+                    VALUES (?, ?, ?, ?, ?)
+                ''', (video_id, chat_id, query, response, feedback))
+                conn.commit()
+                logger.info(f"Added feedback for video {video_id}, chat {chat_id}")
+                return cursor.lastrowid
+        except sqlite3.Error as e:
+            logger.error(f"Database error: {str(e)}")
+            raise
+        except Exception as e:
+            logger.error(f"Error adding feedback: {str(e)}")
+            raise
+
+    def get_user_feedback_stats(self, video_id):
+        try:
+            with sqlite3.connect(self.db_path) as conn:
+                cursor = conn.cursor()
+                cursor.execute('''
+                    SELECT
+                        COUNT(CASE WHEN feedback = 1 THEN 1 END) as positive_feedback,
+                        COUNT(CASE WHEN feedback = -1 THEN 1 END) as negative_feedback
+                    FROM user_feedback
+                    WHERE video_id = ?
+                ''', (video_id,))
+                return cursor.fetchone() or (0, 0)  # Return (0, 0) if no feedback exists
+        except sqlite3.Error as e:
+            logger.error(f"Database error getting feedback stats: {str(e)}")
+            return (0, 0)
+
+    # Embedding and Index Methods
     def add_embedding_model(self, model_name, description):
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
@@ -170,12 +313,6 @@ class DatabaseHandler:
             ''', (video_id, index_name, embedding_model_id))
             conn.commit()
 
-    def get_video_by_youtube_id(self, youtube_id):
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute('SELECT * FROM videos WHERE youtube_id = ?', (youtube_id,))
-            return cursor.fetchone()
-
     def get_elasticsearch_index(self, video_id, embedding_model):
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
@@ -188,16 +325,6 @@ class DatabaseHandler:
             ''', (video_id, embedding_model))
             result = cursor.fetchone()
             return result[0] if result else None
-
-    def get_all_videos(self):
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-                SELECT youtube_id, title, channel_name, upload_date
-                FROM videos
-                ORDER BY upload_date DESC
-            ''')
-            return cursor.fetchall()
 
     def get_elasticsearch_index_by_youtube_id(self, youtube_id):
         with sqlite3.connect(self.db_path) as conn:
@@ -210,29 +337,8 @@ class DatabaseHandler:
             ''', (youtube_id,))
             result = cursor.fetchone()
             return result[0] if result else None
-
-    def get_transcript_content(self, youtube_id):
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-                SELECT transcript_content
-                FROM videos
-                WHERE youtube_id = ?
-            ''', (youtube_id,))
-            result = cursor.fetchone()
-            return result[0] if result else None
 
-    #
-    # def add_transcript_content(self, youtube_id, transcript_content):
-    #     with sqlite3.connect(self.db_path) as conn:
-    #         cursor = conn.cursor()
-    #         cursor.execute('''
-    #             UPDATE videos
-    #             SET transcript_content = ?
-    #             WHERE youtube_id = ?
-    #         ''', (transcript_content, youtube_id))
-    #         conn.commit()
-
+    # Ground Truth Methods
     def add_ground_truth_questions(self, video_id, questions):
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
@@ -281,6 +387,7 @@ class DatabaseHandler:
             ''')
             return cursor.fetchall()
 
+    # Evaluation Methods
    def save_search_performance(self, video_id, hit_rate, mrr):
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
@@ -347,4 +454,49 @@ class DatabaseHandler:
                 SELECT * FROM search_performance
                 ORDER BY evaluation_date DESC
             ''')
             return cursor.fetchall()
+
+    def migrate_database(self):
+        try:
+            with sqlite3.connect(self.db_path) as conn:
+                cursor = conn.cursor()
+
+                # Check if chat_id column exists in user_feedback
+                cursor.execute("PRAGMA table_info(user_feedback)")
+                columns = [column[1] for column in cursor.fetchall()]
+
+                if 'chat_id' not in columns:
+                    logger.info("Migrating user_feedback table")
+
+                    # Create temporary table with new schema
+                    cursor.execute('''
+                        CREATE TABLE user_feedback_new (
+                            id INTEGER PRIMARY KEY AUTOINCREMENT,
+                            video_id TEXT,
+                            query TEXT,
+                            response TEXT,
+                            feedback INTEGER CHECK (feedback IN (-1, 1)),
+                            timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                            chat_id INTEGER,
+                            FOREIGN KEY (video_id) REFERENCES videos (youtube_id),
+                            FOREIGN KEY (chat_id) REFERENCES chat_history (id)
+                        )
+                    ''')
+
+                    # Copy existing data
+                    cursor.execute('''
+                        INSERT INTO user_feedback_new (video_id, query, response, feedback, timestamp)
+                        SELECT video_id, query, response, feedback, timestamp
+                        FROM user_feedback
+                    ''')
+
+                    # Drop old table and rename new one
+                    cursor.execute('DROP TABLE user_feedback')
+                    cursor.execute('ALTER TABLE user_feedback_new RENAME TO user_feedback')
+
+                    logger.info("Migration completed successfully")
+
+                conn.commit()
+        except Exception as e:
+            logger.error(f"Error during migration: {str(e)}")
+            raise

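Taken together, the new chat/feedback methods form a small API. A usage sketch (the video ID and messages are hypothetical; it assumes the video was already ingested, since `add_user_feedback` raises `ValueError` for unknown videos):

```python
from database import DatabaseHandler

db = DatabaseHandler(db_path="data/sqlite.db")

video_id = "abc123xyz"  # hypothetical YouTube ID; must already exist in the videos table

# Store one chat turn; the returned row id ties later feedback to this message.
chat_id = db.add_chat_message(video_id, "What is the video about?", "It explains RAG.")

# Record a thumbs-up (+1); a thumbs-down would be -1, per the CHECK constraint.
db.add_user_feedback(video_id=video_id, chat_id=chat_id,
                     query="What is the video about?",
                     response="It explains RAG.", feedback=1)

positive, negative = db.get_user_feedback_stats(video_id)
print(f"feedback: {positive} up / {negative} down")
```
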
app/home.py
ADDED
@@ -0,0 +1,68 @@
import streamlit as st

st.set_page_config(
    page_title="Home",
    page_icon="🏠",
    layout="wide"
)

from transcript_extractor import test_api_key, initialize_youtube_api
import logging
import os
import sys

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('app.log'),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

def main():
    st.title("YouTube Transcript RAG System 🎥")
    st.write("Welcome to the YouTube Transcript RAG System!")

    # Check API key
    if not test_api_key():
        st.error("YouTube API key is invalid or not set. Please check your configuration.")
        new_api_key = st.text_input("Enter your YouTube API key:")
        if new_api_key:
            os.environ['YOUTUBE_API_KEY'] = new_api_key
            if test_api_key():
                st.success("API key validated successfully!")
                st.experimental_rerun()
            else:
                st.error("Invalid API key. Please try again.")
        return

    st.success("System is ready! Please use the sidebar to navigate between different functions.")

    # Display system overview
    st.header("System Overview")
    st.write("""
    This system provides the following functionality:

    1. **Data Ingestion** 📥
       - Process YouTube videos and transcripts
       - Support for single videos or entire channels

    2. **Chat Interface** 💬
       - Interactive chat with processed videos
       - Multiple query rewriting methods
       - Various search strategies

    3. **Ground Truth Generation** 📝
       - Generate and manage ground truth questions
       - Export ground truth data

    4. **RAG Evaluation** 📊
       - Evaluate system performance
       - View detailed metrics and analytics
    """)

if __name__ == "__main__":
    main()

app/main.py
DELETED
@@ -1,430 +0,0 @@
import streamlit as st
import pandas as pd
from transcript_extractor import get_transcript, get_youtube_client, extract_video_id, get_channel_videos, test_api_key, initialize_youtube_api
from data_processor import DataProcessor
from database import DatabaseHandler
from rag import RAGSystem
from query_rewriter import QueryRewriter
from evaluation import EvaluationSystem
from generate_ground_truth import generate_ground_truth, generate_ground_truth_for_all_videos, get_ground_truth_display_data, get_evaluation_display_data
from sentence_transformers import SentenceTransformer
import os
import sys
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@st.cache_resource
def init_components():
    try:
        db_handler = DatabaseHandler()
        data_processor = DataProcessor()
        rag_system = RAGSystem(data_processor)
        query_rewriter = QueryRewriter()
        evaluation_system = EvaluationSystem(data_processor, db_handler)
        logger.info("Components initialized successfully")
        return db_handler, data_processor, rag_system, query_rewriter, evaluation_system
    except Exception as e:
        logger.error(f"Error initializing components: {str(e)}")
        st.error(f"Error initializing components: {str(e)}")
        st.error("Please check your configuration and ensure all services are running.")
        return None, None, None, None, None


def check_api_key():
    if test_api_key():
        st.success("YouTube API key is valid and working.")
    else:
        st.error("YouTube API key is invalid or not set. Please check your .env file.")
        new_api_key = st.text_input("Enter your YouTube API key:")
        if new_api_key:
            os.environ['YOUTUBE_API_KEY'] = new_api_key
            with open('.env', 'a') as f:
                f.write(f"\nYOUTUBE_API_KEY={new_api_key}")
            st.success("API key saved. Reinitializing YouTube client...")
            get_youtube_client.cache_clear()  # Clear the cache to force reinitialization
            if test_api_key():
                st.success("YouTube client reinitialized successfully.")
            else:
                st.error("Failed to reinitialize YouTube client. Please check your API key.")
            st.experimental_rerun()

# LLM-as-a-judge prompt template
prompt_template = """
You are an expert evaluator for a Youtube transcript assistant.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in the following JSON format:

{{
  "Relevance": "NON_RELEVANT",
  "Explanation": "Your explanation here"
}}

OR

{{
  "Relevance": "PARTLY_RELEVANT",
  "Explanation": "Your explanation here"
}}

OR

{{
  "Relevance": "RELEVANT",
  "Explanation": "Your explanation here"
}}

Ensure your response is a valid JSON object with these exact keys and one of the three exact values for "Relevance".
Do not include any text outside of this JSON object.
"""

def process_single_video(db_handler, data_processor, video_id, embedding_model):
    existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
    if existing_index:
        logger.info(f"Video {video_id} has already been processed with {embedding_model}. Using existing index: {existing_index}")
        return existing_index

    transcript_data = get_transcript(video_id)
    if transcript_data is None:
        logger.error(f"Failed to retrieve transcript for video {video_id}")
        st.error(f"Failed to retrieve transcript for video {video_id}. Please check if the video ID is correct and the video has captions available.")
        return None

    # Process the transcript
    processed_data = data_processor.process_transcript(video_id, transcript_data)
    if processed_data is None:
        logger.error(f"Failed to process transcript for video {video_id}")
        return None

    # Prepare video data for database insertion
    video_data = {
        'video_id': video_id,
        'title': transcript_data['metadata'].get('title', 'Unknown Title'),
        'author': transcript_data['metadata'].get('author', 'Unknown Author'),
        'upload_date': transcript_data['metadata'].get('upload_date', 'Unknown Date'),
        'view_count': int(transcript_data['metadata'].get('view_count', 0)),
        'like_count': int(transcript_data['metadata'].get('like_count', 0)),
        'comment_count': int(transcript_data['metadata'].get('comment_count', 0)),
        'video_duration': transcript_data['metadata'].get('duration', 'Unknown Duration'),
        'transcript_content': processed_data['content']  # Add this line to include the transcript content
    }

    try:
        db_handler.add_video(video_data)
    except Exception as e:
        logger.error(f"Error adding video to database: {str(e)}")
        st.error(f"Error adding video {video_id} to database: {str(e)}")
        return None

    index_name = f"video_{video_id}_{embedding_model}".lower()
    try:
        index_name = data_processor.build_index(index_name)
        logger.info(f"Successfully built index: {index_name}")
    except Exception as e:
        logger.error(f"Error building index: {str(e)}")
        st.error(f"Error building index for video {video_id}: {str(e)}")
        return None

    embedding_model_id = db_handler.add_embedding_model(embedding_model, "Description of the model")

    video_db_record = db_handler.get_video_by_youtube_id(video_id)
    if video_db_record is None:
        logger.error(f"Failed to retrieve video record from database for video {video_id}")
        st.error(f"Failed to retrieve video record from database for video {video_id}")
        return None
    video_db_id = video_db_record[0]

    db_handler.add_elasticsearch_index(video_db_id, index_name, embedding_model_id)

    logger.info(f"Processed and indexed transcript for video {video_id}")
    st.success(f"Successfully processed and indexed transcript for video {video_id}")
    return index_name

def process_multiple_videos(db_handler, data_processor, video_ids, embedding_model):
    indices = []
    for video_id in video_ids:
        index = process_single_video(db_handler, data_processor, video_id, embedding_model)
        if index:
            indices.append(index)
    logger.info(f"Processed and indexed transcripts for {len(indices)} videos")
    st.success(f"Processed and indexed transcripts for {len(indices)} videos")
    return indices

def ensure_video_processed(db_handler, data_processor, video_id, embedding_model):
    index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
    if not index_name:
        st.warning(f"Video {video_id} has not been processed yet. Processing now...")
        index_name = process_single_video(db_handler, data_processor, video_id, embedding_model)
        if not index_name:
            st.error(f"Failed to process video {video_id}. Please check the logs for more information.")
            return False
    return True

def main():
    st.title("YouTube Transcript RAG System")

    check_api_key()

    components = init_components()
    if components:
        db_handler, data_processor, rag_system, query_rewriter, evaluation_system = components
    else:
        st.stop()

    tab1, tab2, tab3 = st.tabs(["RAG System", "Ground Truth Generation", "Evaluation"])

    with tab1:
        st.header("RAG System")

        embedding_model = st.selectbox("Select embedding model:", ["multi-qa-MiniLM-L6-cos-v1", "all-mpnet-base-v2"])

        st.subheader("Select a Video")
        videos = db_handler.get_all_videos()
        if not videos:
            st.warning("No videos available. Please process some videos first.")
        else:
            video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])

            channels = sorted(video_df['channel_name'].unique())
            selected_channel = st.selectbox("Filter by Channel", ["All"] + channels)

            if selected_channel != "All":
                video_df = video_df[video_df['channel_name'] == selected_channel]

            st.dataframe(video_df)
            selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(), format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0])

            index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id)

            if index_name:
                st.success(f"Using index: {index_name}")
            else:
                st.warning("No index found for the selected video and embedding model. The index will be built when you search.")

        st.subheader("Process New Video")
        input_type = st.radio("Select input type:", ["Video URL", "Channel URL", "YouTube ID"])
        input_value = st.text_input("Enter the URL or ID:")

        if st.button("Process"):
            with st.spinner("Processing..."):
                data_processor.set_embedding_model(embedding_model)
                if input_type == "Video URL":
                    video_id = extract_video_id(input_value)
                    if video_id:
                        index_name = process_single_video(db_handler, data_processor, video_id, embedding_model)
                        if index_name is None:
                            st.error(f"Failed to process video {video_id}")
                        else:
                            st.success(f"Successfully processed video {video_id}")
                    else:
                        st.error("Failed to extract video ID from the URL")
                elif input_type == "Channel URL":
                    channel_videos = get_channel_videos(input_value)
                    if channel_videos:
                        index_names = process_multiple_videos(db_handler, data_processor, [video['video_id'] for video in channel_videos], embedding_model)
                        if not index_names:
                            st.error("Failed to process any videos from the channel")
                        else:
                            st.success(f"Successfully processed {len(index_names)} videos from the channel")
                    else:
                        st.error("Failed to retrieve videos from the channel")
                else:
                    index_name = process_single_video(db_handler, data_processor, input_value, embedding_model)
                    if index_name is None:
                        st.error(f"Failed to process video {input_value}")
                    else:
                        st.success(f"Successfully processed video {input_value}")

        st.subheader("Query the RAG System")
        query = st.text_input("Enter your query:")
        rewrite_method = st.radio("Query rewriting method:", ["None", "Chain of Thought", "ReAct"])
        search_method = st.radio("Search method:", ["Hybrid", "Text-only", "Embedding-only"])

        if st.button("Search"):
            if not selected_video_id:
                st.error("Please select a video before searching.")
            else:
                with st.spinner("Searching..."):
                    rewritten_query = query
                    rewrite_prompt = ""
                    if rewrite_method == "Chain of Thought":
                        rewritten_query, rewrite_prompt = query_rewriter.rewrite_cot(query)
                    elif rewrite_method == "ReAct":
                        rewritten_query, rewrite_prompt = query_rewriter.rewrite_react(query)

                    st.subheader("Query Processing")
                    st.write("Original query:", query)
                    if rewrite_method != "None":
                        st.write("Rewritten query:", rewritten_query)
                        st.text_area("Query rewriting prompt:", rewrite_prompt, height=100)
                        if rewritten_query == query:
                            st.warning("Query rewriting failed. Using original query.")

                    search_method_map = {"Hybrid": "hybrid", "Text-only": "text", "Embedding-only": "embedding"}
                    try:
                        if not index_name:
                            st.info("Building index for the selected video...")
                            index_name = process_single_video(db_handler, data_processor, selected_video_id, embedding_model)
                            if not index_name:
                                st.error("Failed to build index for the selected video.")
                                return

                        response, final_prompt = rag_system.query(rewritten_query, search_method=search_method_map[search_method], index_name=index_name)

                        st.subheader("RAG System Prompt")
                        if final_prompt:
                            st.text_area("Prompt sent to LLM:", final_prompt, height=300)
                        else:
                            st.warning("No prompt was generated. This might indicate an issue with the RAG system.")

                        st.subheader("Response")
                        if response:
                            st.write(response)
                        else:
                            st.error("No response generated. Please try again or check the system logs for errors.")
                    except ValueError as e:
                        logger.error(f"Error during search: {str(e)}")
                        st.error(f"Error during search: {str(e)}")
                    except Exception as e:
                        logger.error(f"An unexpected error occurred: {str(e)}")
                        st.error(f"An unexpected error occurred: {str(e)}")

    with tab2:
        st.header("Ground Truth Generation")

        videos = db_handler.get_all_videos()
        if not videos:
            st.warning("No videos available. Please process some videos first.")
        else:
            video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])

            # Add channel filter
            channels = sorted(video_df['channel_name'].unique())
            selected_channel = st.selectbox("Filter by Channel", ["All"] + channels, key="gt_channel_select")

            if selected_channel != "All":
                video_df = video_df[video_df['channel_name'] == selected_channel]
                # Display existing ground truth for selected channel
                gt_data = get_ground_truth_display_data(db_handler, channel_name=selected_channel)
                if not gt_data.empty:
                    st.subheader("Existing Ground Truth Questions for Channel")
                    st.dataframe(gt_data)

                    # Add download button for channel ground truth
                    csv = gt_data.to_csv(index=False)
                    st.download_button(
                        label="Download Channel Ground Truth CSV",
                        data=csv,
                        file_name=f"ground_truth_{selected_channel}.csv",
                        mime="text/csv",
                    )

            st.dataframe(video_df)
            selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(),
                                             format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0],
                                             key="gt_video_select")

            # Display existing ground truth for selected video
            gt_data = get_ground_truth_display_data(db_handler, video_id=selected_video_id)
            if not gt_data.empty:
                st.subheader("Existing Ground Truth Questions")
                st.dataframe(gt_data)

                # Add download button for video ground truth
                csv = gt_data.to_csv(index=False)
                st.download_button(
                    label="Download Video Ground Truth CSV",
                    data=csv,
                    file_name=f"ground_truth_{selected_video_id}.csv",
                    mime="text/csv",
                )

    with tab3:
        st.header("RAG Evaluation")

        try:
            ground_truth_df = pd.read_csv('data/ground-truth-retrieval.csv')
            ground_truth_available = True

            # Display existing evaluations
            existing_evaluations = get_evaluation_display_data()
            if not existing_evaluations.empty:
                st.subheader("Existing Evaluation Results")
                st.dataframe(existing_evaluations)

                # Add download button for evaluation results
                csv = existing_evaluations.to_csv(index=False)
                st.download_button(
                    label="Download Evaluation Results CSV",
                    data=csv,
                    file_name="evaluation_results.csv",
                    mime="text/csv",
                )

        except FileNotFoundError:
            ground_truth_available = False

        if ground_truth_available:
            if st.button("Run Full Evaluation"):
                with st.spinner("Running full evaluation..."):
                    evaluation_results = evaluation_system.run_full_evaluation(rag_system, 'data/ground-truth-retrieval.csv', prompt_template)

                    st.subheader("RAG Evaluations")
                    rag_eval_df = pd.DataFrame(evaluation_results["rag_evaluations"])
                    st.dataframe(rag_eval_df)

                    st.subheader("Search Performance")
                    search_perf_df = pd.DataFrame([evaluation_results["search_performance"]])
                    st.dataframe(search_perf_df)

                    st.subheader("Optimized Search Parameters")
                    params_df = pd.DataFrame([{
                        'parameter': k,
                        'value': v,
                        'score': evaluation_results['best_score']
                    } for k, v in evaluation_results['best_params'].items()])
                    st.dataframe(params_df)

                    # Save to database
                    for video_id in rag_eval_df['video_id'].unique():
                        db_handler.save_search_performance(
                            video_id,
                            evaluation_results["search_performance"]['hit_rate'],
                            evaluation_results["search_performance"]['mrr']
                        )
                        db_handler.save_search_parameters(
                            video_id,
                            evaluation_results['best_params'],
                            evaluation_results['best_score']
                        )

                    st.success("Evaluation complete. Results saved to database and CSV.")
        else:
            st.warning("No ground truth data available. Please generate ground truth data first.")
            st.button("Run Evaluation", disabled=True)

        if not ground_truth_available:
            st.subheader("Generate Ground Truth")
            st.write("You need to generate ground truth data before running the evaluation.")
            if st.button("Go to Ground Truth Generation"):
                st.session_state.active_tab = "Ground Truth Generation"
                st.experimental_rerun()

if __name__ == "__main__":
    if not initialize_youtube_api():
        logger.error("Failed to initialize YouTube API. Exiting.")
        sys.exit(1)
    main()

app/pages/__init__.py
ADDED
File without changes

app/pages/chat_interface.py
ADDED
@@ -0,0 +1,361 @@
import streamlit as st

# Must be the first Streamlit command
st.set_page_config(
    page_title="02_Chat_Interface",  # Use this format for ordering
    page_icon="💬",
    layout="wide"
)

# Rest of the imports
import pandas as pd
import logging
import sqlite3
from datetime import datetime
import sys
import os

# Add the parent directory to Python path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Use absolute imports
from database import DatabaseHandler
from data_processor import DataProcessor
from rag import RAGSystem
from query_rewriter import QueryRewriter
from utils import process_single_video

# Set up logging
logger = logging.getLogger(__name__)

@st.cache_resource
def init_components():
    """Initialize system components"""
    try:
        db_handler = DatabaseHandler()
        data_processor = DataProcessor()
        rag_system = RAGSystem(data_processor)
        query_rewriter = QueryRewriter()
        return db_handler, data_processor, rag_system, query_rewriter
    except Exception as e:
        logger.error(f"Error initializing components: {str(e)}")
        st.error(f"Error initializing components: {str(e)}")
        return None, None, None, None

def init_session_state():
    """Initialize session state variables"""
    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []
    if 'current_video_id' not in st.session_state:
        st.session_state.current_video_id = None
    if 'feedback_given' not in st.session_state:
        st.session_state.feedback_given = set()

def create_chat_interface(db_handler, rag_system, video_id, index_name, rewrite_method, search_method):
    """Create the chat interface with feedback functionality"""
    # Load chat history if video changed
    if st.session_state.current_video_id != video_id:
        st.session_state.chat_history = []
        db_history = db_handler.get_chat_history(video_id)
        for chat_id, user_msg, asst_msg, timestamp in db_history:
            st.session_state.chat_history.append({
                'id': chat_id,
                'user': user_msg,
                'assistant': asst_msg,
                'timestamp': timestamp
            })
        st.session_state.current_video_id = video_id

    # Display chat history
    for message in st.session_state.chat_history:
        with st.chat_message("user"):
            st.markdown(message['user'])

        with st.chat_message("assistant"):
            st.markdown(message['assistant'])

            message_key = f"{message['id']}"
            if message_key not in st.session_state.feedback_given:
                col1, col2 = st.columns(2)
                with col1:
                    if st.button("👍", key=f"like_{message_key}"):
                        db_handler.add_user_feedback(
                            video_id=video_id,
                            chat_id=message['id'],
                            query=message['user'],
                            response=message['assistant'],
                            feedback=1
                        )
                        st.session_state.feedback_given.add(message_key)
                        st.success("Thank you for your positive feedback!")
                        st.rerun()

                with col2:
                    if st.button("👎", key=f"dislike_{message_key}"):
                        db_handler.add_user_feedback(
                            video_id=video_id,
                            chat_id=message['id'],
                            query=message['user'],
                            response=message['assistant'],
                            feedback=-1
                        )
                        st.session_state.feedback_given.add(message_key)
                        st.success("Thank you for your feedback. We'll work to improve.")
                        st.rerun()

    # Chat input
    if prompt := st.chat_input("Ask a question about the video..."):
        with st.chat_message("user"):
            st.markdown(prompt)

        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                try:
                    # Apply query rewriting if selected
                    rewritten_query = prompt
                    if rewrite_method == "Chain of Thought":
                        rewritten_query, _ = rag_system.rewrite_cot(prompt)
                        st.caption("Rewritten query: " + rewritten_query)
                    elif rewrite_method == "ReAct":
                        rewritten_query, _ = rag_system.rewrite_react(prompt)
                        st.caption("Rewritten query: " + rewritten_query)

                    # Get response using selected search method
                    search_method_map = {
                        "Hybrid": "hybrid",
                        "Text-only": "text",
                        "Embedding-only": "embedding"
                    }

                    response, _ = rag_system.query(
                        rewritten_query,
                        search_method=search_method_map[search_method],
                        index_name=index_name
                    )

                    st.markdown(response)

                    # Save to database and session state
                    chat_id = db_handler.add_chat_message(video_id, prompt, response)
                    st.session_state.chat_history.append({
                        'id': chat_id,
                        'user': prompt,
                        'assistant': response,
                        'timestamp': datetime.now()
                    })

                    # Add feedback buttons for new message
                    message_key = f"{chat_id}"
                    col1, col2 = st.columns(2)
                    with col1:
                        if st.button("👍", key=f"like_{message_key}"):
                            db_handler.add_user_feedback(
                                video_id=video_id,
                                chat_id=chat_id,
                                query=prompt,
                                response=response,
                                feedback=1
                            )
                            st.session_state.feedback_given.add(message_key)
                            st.success("Thank you for your positive feedback!")
                            st.rerun()
                    with col2:
                        if st.button("👎", key=f"dislike_{message_key}"):
                            db_handler.add_user_feedback(
                                video_id=video_id,
                                chat_id=chat_id,
                                query=prompt,
                                response=response,
                                feedback=-1
                            )
                            st.session_state.feedback_given.add(message_key)
                            st.success("Thank you for your feedback. We'll work to improve.")
                            st.rerun()

                except Exception as e:
                    st.error(f"Error generating response: {str(e)}")
                    logger.error(f"Error in chat interface: {str(e)}")

def get_system_status(db_handler, selected_video_id=None):
    """Get system status information"""
    try:
        with sqlite3.connect(db_handler.db_path) as conn:
            cursor = conn.cursor()

            # Get total videos
            cursor.execute("SELECT COUNT(*) FROM videos")
            total_videos = cursor.fetchone()[0]

            # Get total indices
            cursor.execute("SELECT COUNT(DISTINCT index_name) FROM elasticsearch_indices")
            total_indices = cursor.fetchone()[0]

            # Get available embedding models
            cursor.execute("SELECT model_name FROM embedding_models")
            models = [row[0] for row in cursor.fetchall()]

            if selected_video_id:
                # Get video details
                cursor.execute("""
                    SELECT v.id, v.title, v.channel_name, v.processed_date,
                           ei.index_name, em.model_name
                    FROM videos v
                    LEFT JOIN elasticsearch_indices ei ON v.id = ei.video_id
                    LEFT JOIN embedding_models em ON ei.embedding_model_id = em.id
                    WHERE v.youtube_id = ?
                """, (selected_video_id,))
                video_details = cursor.fetchall()
            else:
                video_details = None

            return {
                "total_videos": total_videos,
                "total_indices": total_indices,
                "models": models,
                "video_details": video_details
            }
    except Exception as e:
        logger.error(f"Error getting system status: {str(e)}")
        return None

def display_system_status(status, selected_video_id=None):
    """Display system status in the sidebar"""
    if not status:
        st.sidebar.error("Unable to fetch system status")
        return

    st.sidebar.header("System Status")

    # Display general stats
    col1, col2 = st.sidebar.columns(2)
    with col1:
        st.metric("Total Videos", status["total_videos"])
    with col2:
        st.metric("Total Indices", status["total_indices"])

    st.sidebar.markdown("**Available Models:**")
    for model in status["models"]:
        st.sidebar.markdown(f"- {model}")

    # Display selected video details
    if selected_video_id and status["video_details"]:
        st.sidebar.markdown("---")
        st.sidebar.markdown("**Selected Video Details:**")
        for details in status["video_details"]:
            video_id, title, channel, processed_date, index_name, model = details
            st.sidebar.markdown(f"""
            - **Title:** {title}
            - **Channel:** {channel}
|
249 |
+
- **Processed:** {processed_date}
|
250 |
+
- **Index:** {index_name or 'Not indexed'}
|
251 |
+
- **Model:** {model or 'N/A'}
|
252 |
+
""")
|
253 |
+
|
254 |
+
def main():
|
255 |
+
st.title("Chat Interface 💬")
|
256 |
+
|
257 |
+
# Initialize components
|
258 |
+
components = init_components()
|
259 |
+
if not components:
|
260 |
+
st.error("Failed to initialize components. Please check the logs.")
|
261 |
+
return
|
262 |
+
|
263 |
+
db_handler, data_processor, rag_system, query_rewriter = components
|
264 |
+
|
265 |
+
# Initialize session state
|
266 |
+
init_session_state()
|
267 |
+
|
268 |
+
# Get system status
|
269 |
+
system_status = get_system_status(db_handler)
|
270 |
+
|
271 |
+
# Video selection
|
272 |
+
st.sidebar.header("Video Selection")
|
273 |
+
|
274 |
+
# Get available videos with indices
|
275 |
+
with sqlite3.connect(db_handler.db_path) as conn:
|
276 |
+
query = """
|
277 |
+
SELECT DISTINCT v.youtube_id, v.title, v.channel_name, v.upload_date,
|
278 |
+
GROUP_CONCAT(ei.index_name) as indices
|
279 |
+
FROM videos v
|
280 |
+
LEFT JOIN elasticsearch_indices ei ON v.id = ei.video_id
|
281 |
+
GROUP BY v.youtube_id
|
282 |
+
ORDER BY v.upload_date DESC
|
283 |
+
"""
|
284 |
+
df = pd.read_sql_query(query, conn)
|
285 |
+
|
286 |
+
if df.empty:
|
287 |
+
st.info("No videos available. Please process some videos in the Data Ingestion page first.")
|
288 |
+
display_system_status(system_status)
|
289 |
+
return
|
290 |
+
|
291 |
+
# Display available videos
|
292 |
+
st.sidebar.markdown(f"**Available Videos:** {len(df)}")
|
293 |
+
|
294 |
+
# Channel filter
|
295 |
+
channels = sorted(df['channel_name'].unique())
|
296 |
+
selected_channel = st.sidebar.selectbox(
|
297 |
+
"Filter by Channel",
|
298 |
+
["All"] + channels,
|
299 |
+
key="channel_filter"
|
300 |
+
)
|
301 |
+
|
302 |
+
filtered_df = df if selected_channel == "All" else df[df['channel_name'] == selected_channel]
|
303 |
+
|
304 |
+
# Video selection
|
305 |
+
selected_video_id = st.sidebar.selectbox(
|
306 |
+
"Select a Video",
|
307 |
+
filtered_df['youtube_id'].tolist(),
|
308 |
+
format_func=lambda x: filtered_df[filtered_df['youtube_id'] == x]['title'].iloc[0],
|
309 |
+
key="video_select"
|
310 |
+
)
|
311 |
+
|
312 |
+
if selected_video_id:
|
313 |
+
# Update system status with selected video
|
314 |
+
system_status = get_system_status(db_handler, selected_video_id)
|
315 |
+
display_system_status(system_status, selected_video_id)
|
316 |
+
|
317 |
+
# Get the index for the selected video
|
318 |
+
index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id)
|
319 |
+
|
320 |
+
if not index_name:
|
321 |
+
st.warning("This video hasn't been indexed yet. You can process it in the Data Ingestion page.")
|
322 |
+
if st.button("Process Now"):
|
323 |
+
with st.spinner("Processing video..."):
|
324 |
+
try:
|
325 |
+
embedding_model = data_processor.embedding_model.__class__.__name__
|
326 |
+
index_name = process_single_video(db_handler, data_processor, selected_video_id, embedding_model)
|
327 |
+
if index_name:
|
328 |
+
st.success("Video processed successfully!")
|
329 |
+
st.rerun()
|
330 |
+
except Exception as e:
|
331 |
+
st.error(f"Error processing video: {str(e)}")
|
332 |
+
logger.error(f"Error processing video: {str(e)}")
|
333 |
+
else:
|
334 |
+
# Chat settings
|
335 |
+
st.sidebar.header("Chat Settings")
|
336 |
+
rewrite_method = st.sidebar.radio(
|
337 |
+
"Query Rewriting Method",
|
338 |
+
["None", "Chain of Thought", "ReAct"],
|
339 |
+
key="rewrite_method"
|
340 |
+
)
|
341 |
+
search_method = st.sidebar.radio(
|
342 |
+
"Search Method",
|
343 |
+
["Hybrid", "Text-only", "Embedding-only"],
|
344 |
+
key="search_method"
|
345 |
+
)
|
346 |
+
|
347 |
+
# Create chat interface
|
348 |
+
create_chat_interface(
|
349 |
+
db_handler,
|
350 |
+
rag_system,
|
351 |
+
selected_video_id,
|
352 |
+
index_name,
|
353 |
+
rewrite_method,
|
354 |
+
search_method
|
355 |
+
)
|
356 |
+
|
357 |
+
# Display system status
|
358 |
+
display_system_status(system_status, selected_video_id)
|
359 |
+
|
360 |
+
if __name__ == "__main__":
|
361 |
+
main()
|
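The feedback flow above relies on two Streamlit behaviors: every `st.button` needs a unique `key`, and any state that must outlive the rerun triggered by a click has to live in `st.session_state`. A minimal sketch of that pattern in isolation (the `feedback_given` set mirrors the app; `render_feedback` is an illustrative helper, not part of this commit):

```python
import streamlit as st

# Session state survives Streamlit's top-to-bottom reruns, so a set of
# message keys is enough to remember which messages already got feedback.
if "feedback_given" not in st.session_state:
    st.session_state.feedback_given = set()

def render_feedback(message_key: str) -> None:
    """Illustrative helper (not part of the app): one 👍/👎 pair per message."""
    if message_key in st.session_state.feedback_given:
        return  # already rated; render nothing
    col1, col2 = st.columns(2)
    # Unique keys avoid Streamlit's DuplicateWidgetID error when the same
    # two buttons are drawn for many messages in a single run.
    if col1.button("👍", key=f"like_{message_key}"):
        st.session_state.feedback_given.add(message_key)
        st.rerun()  # rerun immediately so the buttons disappear
    if col2.button("👎", key=f"dislike_{message_key}"):
        st.session_state.feedback_given.add(message_key)
        st.rerun()

render_feedback("demo-1")
```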
app/pages/data_ingestion.py
ADDED
@@ -0,0 +1,145 @@
+import streamlit as st
+
+# Must be the first Streamlit command
+st.set_page_config(
+    page_title="01_Data_Ingestion",  # Use this format for ordering
+    page_icon="📥",
+    layout="wide"
+)
+
+import pandas as pd
+from transcript_extractor import get_transcript, extract_video_id, get_channel_videos
+from database import DatabaseHandler
+from data_processor import DataProcessor
+from utils import process_single_video
+import logging
+
+logger = logging.getLogger(__name__)
+
+@st.cache_resource
+def init_components():
+    return DatabaseHandler(), DataProcessor()
+
+def process_multiple_videos(db_handler, data_processor, video_ids, embedding_model):
+    progress_bar = st.progress(0)
+    processed = 0
+    total = len(video_ids)
+
+    for video_id in video_ids:
+        if process_single_video(db_handler, data_processor, video_id, embedding_model):
+            processed += 1
+        progress_bar.progress(processed / total)
+
+    st.success(f"Processed {processed} out of {total} videos")
+
+def main():
+    st.title("Data Ingestion 📥")
+
+    db_handler, data_processor = init_components()
+
+    # Model selection
+    embedding_model = st.selectbox(
+        "Select embedding model:",
+        ["multi-qa-MiniLM-L6-cos-v1", "all-mpnet-base-v2"]
+    )
+
+    # Display existing videos
+    st.header("Processed Videos")
+    videos = db_handler.get_all_videos()
+    if videos:
+        video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
+        channels = sorted(video_df['channel_name'].unique())
+
+        selected_channel = st.selectbox("Filter by Channel", ["All"] + channels)
+        if selected_channel != "All":
+            video_df = video_df[video_df['channel_name'] == selected_channel]
+
+        st.dataframe(video_df)
+    else:
+        st.info("No videos processed yet. Use the form below to add videos.")
+
+    # Process new videos
+    st.header("Process New Video")
+    with st.form("process_video_form"):
+        input_type = st.radio("Select input type:", ["Video URL", "Channel URL", "YouTube ID"])
+        input_value = st.text_input("Enter the URL or ID:")
+        submit_button = st.form_submit_button("Process")
+
+    if submit_button:
+        data_processor.set_embedding_model(embedding_model)
+
+        with st.spinner("Processing..."):
+            if input_type == "Video URL":
+                video_id = extract_video_id(input_value)
+                if video_id:
+                    process_single_video(db_handler, data_processor, video_id, embedding_model)
+
+            elif input_type == "Channel URL":
+                channel_videos = get_channel_videos(input_value)
+                if channel_videos:
+                    video_ids = [video['video_id'] for video in channel_videos]
+                    process_multiple_videos(db_handler, data_processor, video_ids, embedding_model)
+                else:
+                    st.error("Failed to retrieve videos from the channel")
+
+            else:  # YouTube ID
+                process_single_video(db_handler, data_processor, input_value, embedding_model)
+
+def process_single_video(db_handler, data_processor, video_id, embedding_model):
+    try:
+        existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
+        if existing_index:
+            st.info(f"Video {video_id} already processed. Using existing index.")
+            return existing_index
+
+        transcript_data = get_transcript(video_id)
+        if not transcript_data:
+            st.error("Failed to retrieve transcript.")
+            return None
+
+        # Process transcript and create indices
+        processed_data = data_processor.process_transcript(video_id, transcript_data)
+        if not processed_data:
+            st.error("Failed to process transcript.")
+            return None
+
+        # Save to database and create index
+        video_data = {
+            'video_id': video_id,
+            'title': transcript_data['metadata'].get('title', 'Unknown'),
+            'author': transcript_data['metadata'].get('author', 'Unknown'),
+            'upload_date': transcript_data['metadata'].get('upload_date', ''),
+            'view_count': transcript_data['metadata'].get('view_count', 0),
+            'like_count': transcript_data['metadata'].get('like_count', 0),
+            'comment_count': transcript_data['metadata'].get('comment_count', 0),
+            'video_duration': transcript_data['metadata'].get('duration', ''),
+            'transcript_content': processed_data['content']
+        }
+
+        db_handler.add_video(video_data)
+
+        index_name = f"video_{video_id}_{embedding_model}".lower()
+        index_name = data_processor.build_index(index_name)
+
+        if index_name:
+            st.success(f"Successfully processed video: {video_data['title']}")
+            return index_name
+    except Exception as e:
+        st.error(f"Error processing video: {str(e)}")
+        logger.error(f"Error processing video {video_id}: {str(e)}")
+        return None
+
+def process_multiple_videos(db_handler, data_processor, video_ids, embedding_model):
+    progress_bar = st.progress(0)
+    processed = 0
+    total = len(video_ids)
+
+    for video_id in video_ids:
+        if process_single_video(db_handler, data_processor, video_id, embedding_model):
+            processed += 1
+        progress_bar.progress(processed / total)
+
+    st.success(f"Processed {processed} out of {total} videos")
+
+if __name__ == "__main__":
+    main()
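The form above accepts a full video URL, a channel URL, or a bare YouTube ID, delegating URL parsing to `extract_video_id` from `transcript_extractor`. That module is not part of this diff; a plausible sketch of what such a parser might do, assuming the common `watch?v=` and `youtu.be` URL shapes:

```python
import re
from typing import Optional
from urllib.parse import urlparse, parse_qs

def extract_video_id(url_or_id: str) -> Optional[str]:
    """Hypothetical parser: return an 11-character YouTube video ID, or None."""
    candidate = url_or_id.strip()
    # A bare ID is exactly 11 URL-safe characters.
    if re.fullmatch(r"[A-Za-z0-9_-]{11}", candidate):
        return candidate
    parsed = urlparse(candidate)
    if parsed.netloc.endswith("youtu.be"):  # short links: https://youtu.be/<id>
        return parsed.path.lstrip("/")[:11] or None
    if "youtube.com" in parsed.netloc:  # watch pages: https://www.youtube.com/watch?v=<id>
        ids = parse_qs(parsed.query).get("v")
        return ids[0][:11] if ids else None
    return None
```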
app/pages/evaluation.py
ADDED
@@ -0,0 +1,134 @@
+import streamlit as st
+
+st.set_page_config(
+    page_title="04_Evaluation",  # Use this format for ordering
+    page_icon="📊",
+    layout="wide"
+)
+
+import pandas as pd
+from database import DatabaseHandler
+from data_processor import DataProcessor
+from rag import RAGSystem
+from evaluation import EvaluationSystem
+from generate_ground_truth import get_evaluation_display_data
+import logging
+
+logger = logging.getLogger(__name__)
+
+# Define evaluation prompt template
+EVALUATION_PROMPT_TEMPLATE = """
+You are an expert evaluator for a Youtube transcript assistant.
+Your task is to analyze the relevance of the generated answer to the given question.
+Based on the relevance of the generated answer, you will classify it
+as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".
+
+Here is the data for evaluation:
+
+Question: {question}
+Generated Answer: {answer_llm}
+
+Please analyze the content and context of the generated answer in relation to the question
+and provide your evaluation in the following JSON format:
+
+{{
+  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
+  "Explanation": "Your explanation for the relevance classification"
+}}
+
+Requirements:
+1. Relevance must be one of the three exact values
+2. Provide clear reasoning in the explanation
+3. Consider accuracy and completeness of the answer
+4. Return valid JSON only
+""".strip()
+
+@st.cache_resource
+def init_components():
+    db_handler = DatabaseHandler()
+    data_processor = DataProcessor()
+    rag_system = RAGSystem(data_processor)
+    evaluation_system = EvaluationSystem(data_processor, db_handler)
+    return db_handler, data_processor, rag_system, evaluation_system
+
+def main():
+    st.title("RAG Evaluation 📊")
+
+    db_handler, data_processor, rag_system, evaluation_system = init_components()
+
+    try:
+        # Check for ground truth data
+        ground_truth_df = pd.read_csv('data/ground-truth-retrieval.csv')
+        ground_truth_available = True
+
+        # Display existing evaluations
+        existing_evaluations = get_evaluation_display_data()
+        if not existing_evaluations.empty:
+            st.subheader("Existing Evaluation Results")
+            st.dataframe(existing_evaluations)
+
+            # Download button for evaluation results
+            csv = existing_evaluations.to_csv(index=False)
+            st.download_button(
+                label="Download Evaluation Results",
+                data=csv,
+                file_name="evaluation_results.csv",
+                mime="text/csv",
+            )
+
+        # Run evaluation
+        if ground_truth_available:
+            if st.button("Run Full Evaluation"):
+                with st.spinner("Running evaluation..."):
+                    try:
+                        evaluation_results = evaluation_system.run_full_evaluation(
+                            rag_system,
+                            'data/ground-truth-retrieval.csv',
+                            EVALUATION_PROMPT_TEMPLATE
+                        )
+
+                        if evaluation_results:
+                            # Display RAG evaluations
+                            st.subheader("RAG Evaluations")
+                            rag_eval_df = pd.DataFrame(evaluation_results["rag_evaluations"])
+                            st.dataframe(rag_eval_df)
+
+                            # Display search performance
+                            st.subheader("Search Performance")
+                            search_perf_df = pd.DataFrame([evaluation_results["search_performance"]])
+                            st.dataframe(search_perf_df)
+
+                            # Display optimized parameters
+                            st.subheader("Optimized Search Parameters")
+                            params_df = pd.DataFrame([{
+                                'parameter': k,
+                                'value': v,
+                                'score': evaluation_results['best_score']
+                            } for k, v in evaluation_results['best_params'].items()])
+                            st.dataframe(params_df)
+
+                            # Save results
+                            for video_id in rag_eval_df['video_id'].unique():
+                                db_handler.save_search_performance(
+                                    video_id,
+                                    evaluation_results["search_performance"]['hit_rate'],
+                                    evaluation_results["search_performance"]['mrr']
+                                )
+                                db_handler.save_search_parameters(
+                                    video_id,
+                                    evaluation_results['best_params'],
+                                    evaluation_results['best_score']
+                                )
+
+                            st.success("Evaluation complete. Results saved to database and CSV.")
+                    except Exception as e:
+                        st.error(f"Error during evaluation: {str(e)}")
+                        logger.error(f"Error in evaluation: {str(e)}")
+
+    except FileNotFoundError:
+        st.warning("No ground truth data available. Please generate ground truth data in the Ground Truth Generation page first.")
+        if st.button("Go to Ground Truth Generation"):
+            st.switch_page("pages/3_Ground_Truth.py")
+
+if __name__ == "__main__":
+    main()
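The results saved above include `hit_rate` and `mrr` from `evaluation_results["search_performance"]`. These are standard retrieval metrics; a short sketch of how they are typically computed over ground-truth queries (the function names are illustrative, not the `EvaluationSystem` API):

```python
from typing import List

def hit_rate(relevance: List[List[bool]]) -> float:
    """Fraction of queries for which a relevant document shows up at all."""
    return sum(any(row) for row in relevance) / len(relevance)

def mrr(relevance: List[List[bool]]) -> float:
    """Mean reciprocal rank: average of 1/rank of the first relevant hit."""
    total = 0.0
    for row in relevance:
        for rank, hit in enumerate(row, start=1):
            if hit:
                total += 1.0 / rank
                break
    return total / len(relevance)

# Three queries, top-3 results each; True marks the ground-truth document.
rel = [[True, False, False], [False, False, True], [False, False, False]]
print(hit_rate(rel))  # 2/3 ≈ 0.667
print(mrr(rel))       # (1 + 1/3 + 0) / 3 ≈ 0.444
```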
app/pages/ground_truth.py
ADDED
@@ -0,0 +1,100 @@
+import streamlit as st
+
+st.set_page_config(
+    page_title="03_Ground_Truth",  # Use this format for ordering
+    page_icon="📝",
+    layout="wide"
+)
+
+import pandas as pd
+from database import DatabaseHandler
+from data_processor import DataProcessor
+from generate_ground_truth import generate_ground_truth, get_ground_truth_display_data
+import logging
+
+logger = logging.getLogger(__name__)
+
+@st.cache_resource
+def init_components():
+    return DatabaseHandler(), DataProcessor()
+
+def main():
+    st.title("Ground Truth Generation 📝")
+
+    db_handler, data_processor = init_components()
+
+    # Get all videos
+    videos = db_handler.get_all_videos()
+    if not videos:
+        st.warning("No videos available. Please process some videos in the Data Ingestion page first.")
+        return
+
+    video_df = pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])
+
+    # Channel filter
+    channels = sorted(video_df['channel_name'].unique())
+    selected_channel = st.selectbox("Filter by Channel", ["All"] + channels)
+
+    if selected_channel != "All":
+        video_df = video_df[video_df['channel_name'] == selected_channel]
+        # Display existing ground truth for channel
+        gt_data = get_ground_truth_display_data(db_handler, channel_name=selected_channel)
+        if not gt_data.empty:
+            st.subheader("Existing Ground Truth Questions for Channel")
+            st.dataframe(gt_data)
+
+            # Download button for channel ground truth
+            csv = gt_data.to_csv(index=False)
+            st.download_button(
+                label="Download Channel Ground Truth CSV",
+                data=csv,
+                file_name=f"ground_truth_{selected_channel}.csv",
+                mime="text/csv",
+            )
+
+    st.subheader("Available Videos")
+    st.dataframe(video_df)
+
+    # Video selection
+    selected_video_id = st.selectbox(
+        "Select a Video",
+        video_df['youtube_id'].tolist(),
+        format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0]
+    )
+
+    if selected_video_id:
+        # Generate ground truth
+        if st.button("Generate Ground Truth Questions"):
+            with st.spinner("Generating questions..."):
+                try:
+                    questions_df = generate_ground_truth(
+                        db_handler,
+                        data_processor,
+                        selected_video_id
+                    )
+                    if questions_df is not None and not questions_df.empty:
+                        st.success("Successfully generated ground truth questions")
+                        st.dataframe(questions_df)
+                    else:
+                        st.error("Failed to generate ground truth questions")
+                except Exception as e:
+                    st.error(f"Error generating ground truth: {str(e)}")
+                    logger.error(f"Error in ground truth generation: {str(e)}")
+
+        # Display existing ground truth
+        gt_data = get_ground_truth_display_data(db_handler, video_id=selected_video_id)
+        if not gt_data.empty:
+            st.subheader("Existing Ground Truth Questions")
+            st.dataframe(gt_data)
+
+            # Download button for video ground truth
+            csv = gt_data.to_csv(index=False)
+            st.download_button(
+                label="Download Ground Truth CSV",
+                data=csv,
+                file_name=f"ground_truth_{selected_video_id}.csv",
+                mime="text/csv",
+            )
+
+if __name__ == "__main__":
+    main()
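The questions generated here feed the evaluation page through `data/ground-truth-retrieval.csv`, whose rows appear further down in this commit as `video_id,question` pairs. A quick sanity check of that file, assuming the two-column layout and a header row naming the columns:

```python
import pandas as pd

# Assumes the video_id,question layout visible in the CSV diff below,
# plus a header row naming those columns.
df = pd.read_csv("data/ground-truth-retrieval.csv")
print(df.shape)
print(df.iloc[:, 0].value_counts())  # questions generated per video
print(df.head(3))
```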
app/rag.py
CHANGED
@@ -8,6 +8,25 @@ load_dotenv()
 
 logger = logging.getLogger(__name__)
 
+# Define the RAG prompt template
+RAG_PROMPT_TEMPLATE = """
+You are an AI assistant analyzing YouTube video transcripts. Your task is to answer questions based on the provided transcript context.
+
+Context from transcript:
+{context}
+
+User Question: {question}
+
+Please provide a clear, concise answer based only on the information given in the context. If the context doesn't contain enough information to fully answer the question, acknowledge this in your response.
+
+Guidelines:
+1. Use only information from the provided context
+2. Be specific and direct in your answer
+3. If context is insufficient, say so
+4. Maintain accuracy and avoid speculation
+5. Use natural, conversational language
+""".strip()
+
 class RAGSystem:
     def __init__(self, data_processor):
         self.data_processor = data_processor
@@ -52,14 +71,10 @@ class RAGSystem:
 
     def get_prompt(self, user_query, relevant_docs):
         context = "\n".join([doc['content'] for doc in relevant_docs])
-
-
-
-
-        Question: {user_query}
-
-        Answer:"""
-        return prompt
+        return RAG_PROMPT_TEMPLATE.format(
+            context=context,
+            question=user_query
+        )
 
     def query(self, user_query, search_method='hybrid', index_name=None):
         try:
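Hoisting the prompt into a module-level `RAG_PROMPT_TEMPLATE` means it can be exercised without a live index. A minimal check that the `{context}` and `{question}` placeholders render as intended (the template here is shortened to its skeleton):

```python
# Skeleton of RAG_PROMPT_TEMPLATE from app/rag.py, same placeholders.
RAG_PROMPT_TEMPLATE = "Context from transcript:\n{context}\n\nUser Question: {question}"

relevant_docs = [{"content": "The talk covers prompt injection."},
                 {"content": "It then surveys defenses."}]
context = "\n".join(doc["content"] for doc in relevant_docs)
print(RAG_PROMPT_TEMPLATE.format(context=context, question="What attacks are covered?"))
```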
app/utils.py
ADDED
@@ -0,0 +1,62 @@
+import streamlit as st
+from transcript_extractor import get_transcript
+import logging
+
+logger = logging.getLogger(__name__)
+
+def process_single_video(db_handler, data_processor, video_id, embedding_model):
+    """Process a single video for indexing"""
+    try:
+        # Check for existing index
+        existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
+        if existing_index:
+            logger.info(f"Video {video_id} already processed. Using existing index.")
+            return existing_index
+
+        # Get transcript data
+        transcript_data = get_transcript(video_id)
+        if not transcript_data:
+            logger.error(f"Failed to retrieve transcript for video {video_id}")
+            return None
+
+        # Process transcript
+        processed_data = data_processor.process_transcript(video_id, transcript_data)
+        if not processed_data:
+            logger.error(f"Failed to process transcript for video {video_id}")
+            return None
+
+        # Prepare video data
+        video_data = {
+            'video_id': video_id,
+            'title': transcript_data['metadata'].get('title', 'Unknown Title'),
+            'author': transcript_data['metadata'].get('author', 'Unknown Author'),
+            'upload_date': transcript_data['metadata'].get('upload_date', 'Unknown Date'),
+            'view_count': int(transcript_data['metadata'].get('view_count', 0)),
+            'like_count': int(transcript_data['metadata'].get('like_count', 0)),
+            'comment_count': int(transcript_data['metadata'].get('comment_count', 0)),
+            'video_duration': transcript_data['metadata'].get('duration', 'Unknown Duration'),
+            'transcript_content': processed_data['content']
+        }
+
+        # Save to database
+        db_handler.add_video(video_data)
+
+        # Build index
+        index_name = f"video_{video_id}_{embedding_model}".lower()
+        index_name = data_processor.build_index(index_name)
+
+        if index_name:
+            # Save index information
+            embedding_model_id = db_handler.add_embedding_model(embedding_model, "Description of the model")
+            video_record = db_handler.get_video_by_youtube_id(video_id)
+            if video_record:
+                db_handler.add_elasticsearch_index(video_record[0], index_name, embedding_model_id)
+            logger.info(f"Successfully processed video: {video_data['title']}")
+            return index_name
+
+        logger.error(f"Failed to process video {video_id}")
+        return None
+
+    except Exception as e:
+        logger.error(f"Error processing video {video_id}: {str(e)}")
+        return None
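One detail in `process_single_video` worth calling out: Elasticsearch rejects index names containing uppercase characters, which is why the name is lowercased. Since YouTube IDs are case-sensitive, the normalization is lossy in principle, though collisions are unlikely in practice:

```python
video_id = "zjkBMFhNj_g"  # YouTube IDs are case-sensitive
embedding_model = "multi-qa-MiniLM-L6-cos-v1"
index_name = f"video_{video_id}_{embedding_model}".lower()
print(index_name)  # video_zjkbmfhnj_g_multi-qa-minilm-l6-cos-v1
```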
data/ground-truth-retrieval.csv
CHANGED
@@ -27,3 +27,13 @@ zjkBMFhNj_g,What are some examples of attacks on large language models (LLMs) th
 zjkBMFhNj_g,How do prompt injection and shieldbreak attack work in the context of LLM security?
 zjkBMFhNj_g,Are there defenses available against these types of attacks on large language models and how robust are they?
 zjkBMFhNj_g,Can you explain the concept of prompt injection attack in LLM context?
+zjkBMFhNj_g,"Is it feasible that certain trigger phrases could manipulate a trained LLM into generating nonsensical predictions, and how was this demonstrated in research?"
+zjkBMFhNj_g,In what ways can malicious actors exploit data poisoning or backdoor attacks within the training process of large language models (LLM)?
+zjkBMFhNj_g,Can prompt injection attacks occur in the context of LMs and how do they work? Provide an example involving a malicious keyword.
+zjkBMFhNj_g,What are some potential security threats related to large language models (LM) like Google Bard?
+zjkBMFhNj_g,What are some potential security threats associated with large language models like Google Bard?
+zjkBMFhNj_g,"How does a shieldbreak attack function in compromising an AI model's output, specifically with regard to sensitive information like credit card details?"
+zjkBMFhNj_g,"How can data poisoning or backdoor attack affect pre-trained LMs, such as GPT models? Illustrate with potential trigger phrases that could lead to model corruption."
+zjkBMFhNj_g,What are the possible defenses against these kinds of attacks on large language models and how effective they might be?
+zjkBMFhNj_g,Do existing defenses against these types of prompt injection or data poisoning attacks apply to all cases including pre-training phases?
+zjkBMFhNj_g,Can you elaborate on the concept of prompt injection attack and how it affects LLM systems such as ChatGPT or BigScience Alpaca?
data/sqlite.db
CHANGED
Binary files a/data/sqlite.db and b/data/sqlite.db differ
docker-compose.yaml
CHANGED
@@ -1,5 +1,3 @@
-version: '3.8'
-
 services:
   app:
     build: .
@@ -15,12 +13,22 @@ services:
       - OLLAMA_HOST=http://ollama:11434
       - OLLAMA_TIMEOUT=${OLLAMA_TIMEOUT:-120}
       - OLLAMA_MAX_RETRIES=${OLLAMA_MAX_RETRIES:-3}
+      - PYTHONPATH=/app
+      - STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
+      - STREAMLIT_THEME_PRIMARY_COLOR="#FF4B4B"
     env_file:
       - .env
     volumes:
+      - ./app:/app/app
       - ./data:/app/data
       - ./config:/app/config
-      - ./
+      - ./logs:/app/logs
+      - ./.streamlit:/root/.streamlit:ro
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
 
   elasticsearch:
     image: docker.elastic.co/elasticsearch/elasticsearch:8.9.0
@@ -28,6 +36,7 @@ services:
     environment:
       - discovery.type=single-node
      - xpack.security.enabled=false
+      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
     ports:
       - "9200:9200"
       - "9300:9300"
@@ -37,6 +46,11 @@ services:
         memory: 2G
     volumes:
       - esdata:/usr/share/elasticsearch/data
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9200"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
 
   grafana:
     image: grafana/grafana:latest
@@ -54,8 +68,14 @@ services:
       - ./grafana/dashboards:/etc/grafana/dashboards
       - grafana-storage:/var/lib/grafana
       - ./data:/app/data:ro
+      - ./logs:/var/log/grafana
     depends_on:
       - elasticsearch
+    healthcheck:
+      test: ["CMD-SHELL", "wget -q --spider http://localhost:3000/api/health || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
 
   ollama:
     image: ollama/ollama:latest
@@ -63,9 +83,22 @@
       - "11434:11434"
     volumes:
       - ollama_data:/root/.ollama
+    deploy:
+      resources:
+        limits:
+          memory: 6G
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:11434/api/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
 
 volumes:
   esdata:
     driver: local
   grafana-storage:
-  ollama_data:
+  ollama_data:
+
+networks:
+  default:
+    driver: bridge
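All four healthchecks added above poll plain HTTP endpoints, so they can be verified from the host with the standard library before relying on Compose. A small probe sketch; note that the Ollama `/api/health` path is taken from the compose file itself and may need adjusting for a given Ollama release:

```python
from urllib.request import urlopen
from urllib.error import URLError

# The same endpoints the compose healthchecks poll every 30 seconds.
ENDPOINTS = {
    "app (Streamlit)": "http://localhost:8501/_stcore/health",
    "elasticsearch": "http://localhost:9200",
    "grafana": "http://localhost:3000/api/health",
    "ollama": "http://localhost:11434/api/health",
}

for name, url in ENDPOINTS.items():
    try:
        with urlopen(url, timeout=10) as resp:
            print(f"{name}: HTTP {resp.status}")
    except (URLError, OSError) as exc:
        print(f"{name}: unreachable ({exc})")
```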
grafana/dashboards/rag_evaluation.json
CHANGED
@@ -67,7 +67,7 @@
       "targets": [
         {
           "queryType": "table",
-          "sql": "SELECT re.video_id, v.title, re.question, re.relevance
+          "sql": "SELECT re.video_id, v.title, re.question, re.relevance FROM rag_evaluations re JOIN videos v ON re.video_id = v.youtube_id LIMIT 10",
           "format": "table"
         }
       ]
@@ -152,11 +152,11 @@
   "templating": {
     "list": [
       {
-
-
-
-
-
+        "name": "video_id",
+        "type": "query",
+        "datasource": "SQLite",
+        "query": "SELECT title AS __text, youtube_id AS __value FROM videos ORDER BY title",
+        "value": "All"
       }
     ]
   },
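The repaired `sql` above joins `rag_evaluations` to `videos` on the YouTube ID. Because Grafana reads the same `data/sqlite.db` that the app writes (mounted read-only in the compose file), the query can be sanity-checked directly with `sqlite3`, assuming those tables exist as the dashboard implies:

```python
import sqlite3

with sqlite3.connect("data/sqlite.db") as conn:
    rows = conn.execute(
        "SELECT re.video_id, v.title, re.question, re.relevance "
        "FROM rag_evaluations re "
        "JOIN videos v ON re.video_id = v.youtube_id "
        "LIMIT 10"
    ).fetchall()

for row in rows:
    print(row)
```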
grafana/provisioning/datasources/sqlite.yaml
CHANGED
@@ -19,4 +19,5 @@ datasources:
       - name: foreign_keys
         value: "ON"
       - name: busy_timeout
-        value: 5000
+        value: 5000
+    userAgent: "Grafana-SQLite/1.0"
image-1.png
DELETED
Binary file (145 kB)

image-10.png
DELETED
Binary file (114 kB)

image-11.png
DELETED
Binary file (44.3 kB)

image-2.png
DELETED
Binary file (89.5 kB)

image-3.png
DELETED
Binary file (79.2 kB)

image-4.png
DELETED
Binary file (32.8 kB)

image-5.png
DELETED
Binary file (197 kB)

image-6.png
DELETED
Binary file (74.7 kB)

image-7.png
DELETED
Binary file (34.3 kB)

image-8.png
DELETED
Binary file (71.6 kB)

image-9.png
DELETED
Binary file (95.1 kB)

image.png
DELETED
Binary file (219 kB)
images/image-1.png
ADDED

images/image-2.png
ADDED

images/image-3.png
ADDED

images/image-4.png
ADDED

images/image-5.png
ADDED

images/image-6.png
ADDED

images/image.png
ADDED