sergey21000 committed
Commit 1abf289 · verified · 1 Parent(s): f19ab0f

Upload 4 files

Files changed (4):
  1. app.py +26 -5
  2. config.py +18 -4
  3. requirements.txt +13 -11
  4. utils.py +13 -8
app.py CHANGED
@@ -1,10 +1,15 @@
 from typing import List, Tuple, Optional
 
+# imported first so that this error does not occur: exception: access violation reading 0x0000000000000000
+# https://github.com/abetlen/llama-cpp-python/issues/1581
+from llama_cpp import Llama
+
 import gradio as gr
 from langchain_core.vectorstores import VectorStore
 
 from config import (
     LLM_MODEL_REPOS,
+    START_LLM_MODEL_FILE,
     EMBED_MODEL_REPOS,
     SUBTITLES_LANGUAGES,
     GENERATE_KWARGS,
@@ -95,14 +100,31 @@ def get_generate_args(do_sample: bool) -> List[gr.component]:
 
 # ================ LOADING AND INITIALIZING MODELS ========================
 
-start_llm_model, start_support_system_role, load_log = load_llm_model(LLM_MODEL_REPOS[0], 'gemma-2-2b-it-Q8_0.gguf')
-start_embed_model, load_log = load_embed_model(EMBED_MODEL_REPOS[0])
+start_llm_model, start_support_system_role, load_log = load_llm_model(
+    model_repo=LLM_MODEL_REPOS[0],
+    model_file=START_LLM_MODEL_FILE,
+)
+
+if start_llm_model['llm_model'] is None:
+    raise Exception(f'LLM model not initialized, status message: {load_log}')
 
 
+start_embed_model, load_log = load_embed_model(
+    model_repo=EMBED_MODEL_REPOS[0],
+)
+
+if start_embed_model['embed_model'] is None:
+    raise Exception(f'Embed model not initialized, status message: {load_log}')
+
 
 # ================== APPLICATION WEB INTERFACE ============================
 
-css = '''.gradio-container {width: 60% !important}'''
+css = '''
+.gradio-container {
+    width: 70% !important;
+    margin: 0 auto !important;
+}
+'''
 
 with gr.Blocks(css=css) as interface:
 
@@ -127,7 +149,6 @@ with gr.Blocks(css=css) as interface:
     chatbot = gr.Chatbot(
         type='messages', # new in gradio 5+
         show_copy_button=True,
-        bubble_full_width=False,
         height=480,
     )
     user_message = gr.Textbox(label='User')
@@ -197,7 +218,7 @@ with gr.Blocks(css=css) as interface:
        fn=user_message_to_chatbot,
        inputs=[user_message, chatbot],
        outputs=[user_message, chatbot],
-        queue=False,
+        # queue=False,
    ).then(
        fn=update_user_message_with_context,
        inputs=[chatbot, rag_mode, db, k, score_threshold, context_template],
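
Note on the app.py changes above: importing Llama before gradio and torch is the workaround for the llama-cpp-python access-violation crash (issue #1581, linked in the diff), and the new None checks make the app fail fast at startup instead of erroring on the first chat request. A minimal sketch of the resulting startup sequence, assuming load_llm_model returns a state dict keyed by 'llm_model' as in this repo's utils.py:

# Sketch of the new startup sequence (names from this repo's config.py and utils.py).
from llama_cpp import Llama  # imported first: workaround for llama-cpp-python issue #1581

from config import LLM_MODEL_REPOS, START_LLM_MODEL_FILE
from utils import load_llm_model

start_llm_model, start_support_system_role, load_log = load_llm_model(
    model_repo=LLM_MODEL_REPOS[0],
    model_file=START_LLM_MODEL_FILE,
)
if start_llm_model['llm_model'] is None:
    # fail fast at startup rather than on the first request
    raise Exception(f'LLM model not initialized, status message: {load_log}')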
config.py CHANGED
@@ -54,6 +54,13 @@ GENERATE_KWARGS = dict(
     repeat_penalty=1.0,
 )
 
+# llama-cpp-python model params
+LLAMA_MODEL_KWARGS = dict(
+    n_gpu_layers=-1,
+    verbose=False,
+    n_ctx=4096, # context size
+)
+
 # paths to LLM and embeddings models
 LLM_MODELS_PATH = Path('models')
 EMBED_MODELS_PATH = Path('embed_models')
@@ -62,10 +69,12 @@ EMBED_MODELS_PATH.mkdir(exist_ok=True)
 
 # available when running the LLM application models in GGUF format
 LLM_MODEL_REPOS = [
-    # https://huggingface.co/bartowski/gemma-2-2b-it-GGUF
-    'bartowski/gemma-2-2b-it-GGUF',
+    # https://huggingface.co/bartowski/google_gemma-3-4b-it-GGUF
+    'bartowski/google_gemma-3-4b-it-GGUF',
     # https://huggingface.co/bartowski/Qwen2.5-3B-Instruct-GGUF
     'bartowski/Qwen2.5-3B-Instruct-GGUF',
+    # https://huggingface.co/bartowski/gemma-2-2b-it-GGUF
+    'bartowski/gemma-2-2b-it-GGUF',
     # https://huggingface.co/bartowski/Qwen2.5-1.5B-Instruct-GGUF
     'bartowski/Qwen2.5-1.5B-Instruct-GGUF',
     # https://huggingface.co/bartowski/openchat-3.6-8b-20240522-GGUF
@@ -76,10 +85,17 @@ LLM_MODEL_REPOS = [
     'bartowski/Llama-3.2-3B-Instruct-GGUF',
 ]
 
+# GGUF filename for LLM_MODEL_REPOS[0]
+START_LLM_MODEL_FILE = 'google_gemma-3-4b-it-Q4_K_M.gguf'
+
 # Embedding models available at application startup
 EMBED_MODEL_REPOS = [
+    # https://huggingface.co/intfloat/multilingual-e5-small # 471 MB
+    'intfloat/multilingual-e5-small',
     # https://huggingface.co/sergeyzh/rubert-tiny-turbo # 117 MB
     'sergeyzh/rubert-tiny-turbo',
+    # https://huggingface.co/sergeyzh/BERTA # 513 MB
+    'sergeyzh/BERTA',
     # https://huggingface.co/cointegrated/rubert-tiny2 # 118 MB
     'cointegrated/rubert-tiny2',
     # https://huggingface.co/cointegrated/LaBSE-en-ru # 516 MB
@@ -90,8 +106,6 @@ EMBED_MODEL_REPOS = [
     'intfloat/multilingual-e5-large',
     # https://huggingface.co/intfloat/multilingual-e5-base # 1.11 GB
     'intfloat/multilingual-e5-base',
-    # https://huggingface.co/intfloat/multilingual-e5-small # 471 MB
-    'intfloat/multilingual-e5-small',
     # https://huggingface.co/intfloat/multilingual-e5-large-instruct # 1.12 GB
     'intfloat/multilingual-e5-large-instruct',
     # https://huggingface.co/sentence-transformers/all-mpnet-base-v2 # 438 MB
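
The new LLAMA_MODEL_KWARGS dict centralizes the Llama constructor arguments that utils.py previously hard-coded inline, and raises the context size to 4096 tokens. A minimal sketch of how the dict is consumed (the model path here is illustrative):

# How LLAMA_MODEL_KWARGS is unpacked into the Llama constructor in utils.py.
from llama_cpp import Llama

from config import LLAMA_MODEL_KWARGS

model_path = 'models/google_gemma-3-4b-it-Q4_K_M.gguf'  # illustrative local path
llm_model = Llama(model_path=str(model_path), **LLAMA_MODEL_KWARGS)
print(llm_model.n_ctx())  # 4096, from the n_ctx entry above

With the CPU-only wheel pinned in requirements.txt, n_gpu_layers=-1 (offload all layers) is effectively a no-op; it only takes effect under a GPU build of llama-cpp-python.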
requirements.txt CHANGED
@@ -1,11 +1,13 @@
---extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.4.1
-llama_cpp_python==0.2.90
-langchain==0.3.3
-langchain-community==0.3.1
-langchain-huggingface==0.1.0
-pdfminer.six==20240706
-youtube-transcript-api==0.6.2
-psutil==6.0.0
-faiss-cpu==1.9.0
-beautifulsoup4==4.12.3
+--extra-index-url https://download.pytorch.org/whl/cpu
+--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+torch==2.6.0
+https://github.com/sergey21000/llama-cpp-python-wheels/releases/download/llama-cpp-python-0.3.8-wheels/llama_cpp_python-0.3.8-cp310-cp310-linux_x86_64_cpu.whl
+gradio==5.25.2
+langchain==0.3.23
+langchain-community==0.3.21
+langchain-huggingface==0.1.2
+pdfminer.six==20250416
+youtube-transcript-api==1.0.3
+psutil==7.0.0
+faiss-cpu==1.10.0
+beautifulsoup4==4.13.4
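
The llama_cpp_python requirement is now a direct wheel URL built for CPython 3.10 on x86-64 Linux (CPU), so this file implicitly assumes that environment; the added abetlen extra-index-url serves prebuilt CPU wheels of llama-cpp-python more generally. A hypothetical guard (not part of the repo) that makes the assumption explicit:

# Hypothetical environment check: the pinned llama_cpp_python wheel above
# targets CPython 3.10 on x86-64 Linux, CPU-only.
import platform
import sys

assert sys.version_info[:2] == (3, 10), 'pinned llama_cpp_python wheel is cp310-only'
assert (platform.system(), platform.machine()) == ('Linux', 'x86_64'), \
    'pinned llama_cpp_python wheel targets x86-64 Linux'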
utils.py CHANGED
@@ -4,6 +4,8 @@ from shutil import rmtree
 from typing import List, Tuple, Dict, Union, Optional, Any, Iterable
 from tqdm import tqdm
 
+from llama_cpp import Llama
+
 import psutil
 import requests
 from requests.exceptions import MissingSchema
@@ -11,7 +13,6 @@ from requests.exceptions import MissingSchema
 import torch
 import gradio as gr
 
-from llama_cpp import Llama
 from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
 from huggingface_hub import hf_hub_download, list_repo_tree, list_repo_files, repo_info, repo_exists, snapshot_download
 
@@ -28,6 +29,7 @@ from config import (
     LLM_MODELS_PATH,
     EMBED_MODELS_PATH,
     GENERATE_KWARGS,
+    LLAMA_MODEL_KWARGS,
     LOADER_CLASSES,
 )
 
@@ -132,12 +134,14 @@ def load_llm_model(model_repo: str, model_file: str) -> Tuple[LLM_MODEL_DICT, st
         load_log += f'Model {model_file} loaded\n'
     except Exception as ex:
         model_path = ''
-        load_log += f'Error loading model, error code:\n{ex}\n'
+        load_log += f'Error downloading model, error code:\n{ex}\n'
 
     if model_path:
         progress(0.7, desc='Step 2/2: Initialize the model')
         try:
-            llm_model = Llama(model_path=str(model_path), n_gpu_layers=-1, verbose=False)
+            print('----------')
+            print(str(model_path))
+            llm_model = Llama(model_path=str(model_path), **LLAMA_MODEL_KWARGS)
             support_system_role = 'System role not supported' not in llm_model.metadata['tokenizer.chat_template']
             load_log += f'Model {model_file} initialized, max context size is {llm_model.n_ctx()} tokens\n'
         except Exception as ex:
@@ -399,7 +403,8 @@ def load_documents_and_create_db(
 
 # adding a user message to the chat bot window
 def user_message_to_chatbot(user_message: str, chatbot: CHAT_HISTORY) -> Tuple[str, CHAT_HISTORY]:
-    chatbot.append({'role': 'user', 'metadata': {'title': None}, 'content': user_message})
+    # chatbot.append({'role': 'user', 'metadata': {'title': None}, 'content': user_message})
+    chatbot.append({'role': 'user', 'content': user_message})
     return '', chatbot
 
 
@@ -412,7 +417,7 @@ def update_user_message_with_context(
     score_threshold: float,
     context_template: str,
 ) -> Tuple[str, CHAT_HISTORY]:
-
+
     user_message = chatbot[-1]['content']
     user_message_with_context = ''
 
@@ -482,19 +487,19 @@ def get_llm_response(
 
     messages = []
     if support_system_role and system_prompt:
-        messages.append({'role': 'system', 'metadata': {'title': None}, 'content': system_prompt})
+        messages.append({'role': 'system', 'content': system_prompt})
 
     if history_len != 0:
         messages.extend(chatbot[:-1][-(history_len*2):])
 
-    messages.append({'role': 'user', 'metadata': {'title': None}, 'content': user_message})
+    messages.append({'role': 'user', 'content': user_message})
     stream_response = llm_model.create_chat_completion(
         messages=messages,
         stream=True,
         **gen_kwargs,
     )
     try:
-        chatbot.append({'role': 'assistant', 'metadata': {'title': None}, 'content': ''})
+        chatbot.append({'role': 'assistant', 'content': ''})
         for chunk in stream_response:
             token = chunk['choices'][0]['delta'].get('content')
             if token is not None:
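
Dropping the 'metadata': {'title': None} key works because gradio's Chatbot with type='messages' accepts plain OpenAI-style role/content dicts, and the same dicts are valid input for llama-cpp-python's create_chat_completion, so no conversion layer is needed between the UI history and the model call. A minimal sketch of the shared format:

# One message format now serves both the gradio Chatbot history
# (type='messages') and llm_model.create_chat_completion(messages=...).
messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Hi!'},
    {'role': 'assistant', 'content': ''},  # placeholder filled while streaming tokens
]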