Upload 4 files
app.py
CHANGED
@@ -1,10 +1,15 @@
 from typing import List, Tuple, Optional
 
+# this is so that there is no error: exception: access violation reading 0x0000000000000000
+# https://github.com/abetlen/llama-cpp-python/issues/1581
+from llama_cpp import Llama
+
 import gradio as gr
 from langchain_core.vectorstores import VectorStore
 
 from config import (
     LLM_MODEL_REPOS,
+    START_LLM_MODEL_FILE,
     EMBED_MODEL_REPOS,
     SUBTITLES_LANGUAGES,
     GENERATE_KWARGS,
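The hunk above moves the llama_cpp import ahead of every other heavyweight import. A minimal sketch of the pattern, hedged per the linked issue (which attributes the access-violation crash to the order in which native libraries get loaded):

    # llama_cpp first, so llama.cpp's shared library is loaded before
    # torch/gradio pull in their own native runtimes
    # (see https://github.com/abetlen/llama-cpp-python/issues/1581)
    from llama_cpp import Llama

    import torch
    import gradio as gr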
@@ -95,14 +100,31 @@ def get_generate_args(do_sample: bool) -> List[gr.component]:
 
 # ================ LOADING AND INITIALIZING MODELS ========================
 
-start_llm_model, start_support_system_role, load_log = load_llm_model(
-    …
+start_llm_model, start_support_system_role, load_log = load_llm_model(
+    model_repo=LLM_MODEL_REPOS[0],
+    model_file=START_LLM_MODEL_FILE,
+)
+
+if start_llm_model['llm_model'] is None:
+    raise Exception(f'LLM model not initialized, status message: {load_log}')
 
 
+start_embed_model, load_log = load_embed_model(
+    model_repo=EMBED_MODEL_REPOS[0],
+)
+
+if start_embed_model['embed_model'] is None:
+    raise Exception(f'Embed model not initialized, status message: {load_log}')
+
 
 # ================== APPLICATION WEB INTERFACE ============================
 
-css = '''
+css = '''
+.gradio-container {
+    width: 70% !important;
+    margin: 0 auto !important;
+}
+'''
 
 with gr.Blocks(css=css) as interface:
 
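The two new guard blocks make the Space fail fast: if load_llm_model or load_embed_model returns a dict whose 'llm_model' / 'embed_model' slot is still None, startup aborts and surfaces the accumulated load_log in the exception, instead of bringing up a UI with no model behind it.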
@@ -127,7 +149,6 @@ with gr.Blocks(css=css) as interface:
     chatbot = gr.Chatbot(
         type='messages', # new in gradio 5+
         show_copy_button=True,
-        bubble_full_width=False,
         height=480,
     )
     user_message = gr.Textbox(label='User')
@@ -197,7 +218,7 @@ with gr.Blocks(css=css) as interface:
     fn=user_message_to_chatbot,
     inputs=[user_message, chatbot],
    outputs=[user_message, chatbot],
-    queue=False,
+    # queue=False,
 ).then(
     fn=update_user_message_with_context,
     inputs=[chatbot, rag_mode, db, k, score_threshold, context_template],
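Both UI tweaks track the Gradio 5 pin in requirements.txt below: bubble_full_width is deprecated on gr.Chatbot in Gradio 5, so the argument is dropped, and queue=False is commented out rather than deleted, so the user-message step now runs through the default queue like the rest of the .then() chain.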
config.py
CHANGED
@@ -54,6 +54,13 @@ GENERATE_KWARGS = dict(
     repeat_penalty=1.0,
 )
 
+# llama-cpp-python model params
+LLAMA_MODEL_KWARGS = dict(
+    n_gpu_layers=-1,
+    verbose=False,
+    n_ctx=4096, # context size
+)
+
 # paths to LLM and embeddings models
 LLM_MODELS_PATH = Path('models')
 EMBED_MODELS_PATH = Path('embed_models')
@@ -62,10 +69,12 @@ EMBED_MODELS_PATH.mkdir(exist_ok=True)
 
 # available when running the LLM application models in GGUF format
 LLM_MODEL_REPOS = [
-    # https://huggingface.co/bartowski/…
-    'bartowski/…
+    # https://huggingface.co/bartowski/google_gemma-3-4b-it-GGUF
+    'bartowski/google_gemma-3-4b-it-GGUF',
     # https://huggingface.co/bartowski/Qwen2.5-3B-Instruct-GGUF
     'bartowski/Qwen2.5-3B-Instruct-GGUF',
+    # https://huggingface.co/bartowski/gemma-2-2b-it-GGUF
+    'bartowski/gemma-2-2b-it-GGUF',
     # https://huggingface.co/bartowski/Qwen2.5-1.5B-Instruct-GGUF
     'bartowski/Qwen2.5-1.5B-Instruct-GGUF',
     # https://huggingface.co/bartowski/openchat-3.6-8b-20240522-GGUF
@@ -76,10 +85,17 @@ LLM_MODEL_REPOS = [
     'bartowski/Llama-3.2-3B-Instruct-GGUF',
 ]
 
+# GGUF filename to LLM_MODEL_REPOS[0]
+START_LLM_MODEL_FILE = 'google_gemma-3-4b-it-Q4_K_M.gguf'
+
 # Embedding models available at application startup
 EMBED_MODEL_REPOS = [
+    # https://huggingface.co/intfloat/multilingual-e5-small # 471 MB
+    'intfloat/multilingual-e5-small',
     # https://huggingface.co/sergeyzh/rubert-tiny-turbo # 117 MB
     'sergeyzh/rubert-tiny-turbo',
+    # https://huggingface.co/sergeyzh/BERTA # 513 MB
+    'sergeyzh/BERTA',
     # https://huggingface.co/cointegrated/rubert-tiny2 # 118 MB
     'cointegrated/rubert-tiny2',
     # https://huggingface.co/cointegrated/LaBSE-en-ru # 516 MB
@@ -90,8 +106,6 @@ EMBED_MODEL_REPOS = [
     'intfloat/multilingual-e5-large',
     # https://huggingface.co/intfloat/multilingual-e5-base # 1.11 GB
     'intfloat/multilingual-e5-base',
-    # https://huggingface.co/intfloat/multilingual-e5-small # 471 MB
-    'intfloat/multilingual-e5-small',
     # https://huggingface.co/intfloat/multilingual-e5-large-instruct # 1.12 GB
     'intfloat/multilingual-e5-large-instruct',
     # https://huggingface.co/sentence-transformers/all-mpnet-base-v2 # 438 MB
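The new LLAMA_MODEL_KWARGS dict centralizes the llama-cpp-python constructor arguments, and utils.py below unpacks it directly into Llama(...). A minimal sketch of how the config pieces combine, assuming the GGUF file is already present under LLM_MODELS_PATH (in the app it is fetched with hf_hub_download):

    from llama_cpp import Llama
    from config import LLAMA_MODEL_KWARGS, LLM_MODELS_PATH, START_LLM_MODEL_FILE

    model_path = LLM_MODELS_PATH / START_LLM_MODEL_FILE

    # n_gpu_layers=-1 would offload all layers on a GPU build; with the CPU
    # wheel pinned in requirements.txt everything simply runs on the CPU
    llm_model = Llama(model_path=str(model_path), **LLAMA_MODEL_KWARGS)
    print(llm_model.n_ctx())  # 4096, from n_ctx in LLAMA_MODEL_KWARGS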
requirements.txt
CHANGED
@@ -1,11 +1,13 @@
---extra-index-url https://download.pytorch.org/whl/cpu
-…
-langchain
-…
+--extra-index-url https://download.pytorch.org/whl/cpu
+--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+torch==2.6.0
+https://github.com/sergey21000/llama-cpp-python-wheels/releases/download/llama-cpp-python-0.3.8-wheels/llama_cpp_python-0.3.8-cp310-cp310-linux_x86_64_cpu.whl
+gradio==5.25.2
+langchain==0.3.23
+langchain-community==0.3.21
+langchain-huggingface==0.1.2
+pdfminer.six==20250416
+youtube-transcript-api==1.0.3
+psutil==7.0.0
+faiss-cpu==1.10.0
+beautifulsoup4==4.13.4
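The requirements are now fully pinned. The first extra index resolves the CPU-only torch build, the second is the llama-cpp-python CPU wheel index, and the direct URL pins an exact prebuilt llama_cpp_python 0.3.8 wheel for CPython 3.10 on x86_64 Linux (the URL only resolves on that platform, which presumably matches the Space's runtime). Installation is unchanged:

    pip install -r requirements.txt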
utils.py
CHANGED
@@ -4,6 +4,8 @@ from shutil import rmtree
 from typing import List, Tuple, Dict, Union, Optional, Any, Iterable
 from tqdm import tqdm
 
+from llama_cpp import Llama
+
 import psutil
 import requests
 from requests.exceptions import MissingSchema
@@ -11,7 +13,6 @@ from requests.exceptions import MissingSchema
 import torch
 import gradio as gr
 
-from llama_cpp import Llama
 from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
 from huggingface_hub import hf_hub_download, list_repo_tree, list_repo_files, repo_info, repo_exists, snapshot_download
 
@@ -28,6 +29,7 @@ from config import (
     LLM_MODELS_PATH,
     EMBED_MODELS_PATH,
     GENERATE_KWARGS,
+    LLAMA_MODEL_KWARGS,
     LOADER_CLASSES,
 )
 
@@ -132,12 +134,14 @@ def load_llm_model(model_repo: str, model_file: str) -> Tuple[LLM_MODEL_DICT, str]:
         load_log += f'Model {model_file} loaded\n'
     except Exception as ex:
         model_path = ''
-        load_log += f'Error …
+        load_log += f'Error downloading model, error code:\n{ex}\n'
 
     if model_path:
         progress(0.7, desc='Step 2/2: Initialize the model')
         try:
-            …
+            print('----------')
+            print(str(model_path))
+            llm_model = Llama(model_path=str(model_path), **LLAMA_MODEL_KWARGS)
             support_system_role = 'System role not supported' not in llm_model.metadata['tokenizer.chat_template']
             load_log += f'Model {model_file} initialized, max context size is {llm_model.n_ctx()} tokens\n'
         except Exception as ex:
@@ -399,7 +403,8 @@ def load_documents_and_create_db(
 
 # adding a user message to the chat bot window
 def user_message_to_chatbot(user_message: str, chatbot: CHAT_HISTORY) -> Tuple[str, CHAT_HISTORY]:
-    chatbot.append({'role': 'user', 'metadata': {'title': None}, 'content': user_message})
+    # chatbot.append({'role': 'user', 'metadata': {'title': None}, 'content': user_message})
+    chatbot.append({'role': 'user', 'content': user_message})
     return '', chatbot
 
 
@@ -412,7 +417,7 @@ def update_user_message_with_context(
     score_threshold: float,
     context_template: str,
 ) -> Tuple[str, CHAT_HISTORY]:
-
+
     user_message = chatbot[-1]['content']
     user_message_with_context = ''
 
@@ -482,19 +487,19 @@ def get_llm_response(
 
     messages = []
     if support_system_role and system_prompt:
-        messages.append({'role': 'system', '…
+        messages.append({'role': 'system', 'content': system_prompt})
 
     if history_len != 0:
         messages.extend(chatbot[:-1][-(history_len*2):])
 
-    messages.append({'role': 'user', '…
+    messages.append({'role': 'user', 'content': user_message})
     stream_response = llm_model.create_chat_completion(
         messages=messages,
         stream=True,
         **gen_kwargs,
     )
     try:
-        chatbot.append({'role': 'assistant', '…
+        chatbot.append({'role': 'assistant', 'content': ''})
         for chunk in stream_response:
             token = chunk['choices'][0]['delta'].get('content')
             if token is not None:
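Two things change in the chat plumbing: message dicts are trimmed to role/content (the explicit 'metadata': {'title': None} key is no longer needed for plain messages in Gradio 5's 'messages' format; the old call is kept as a comment), and the assistant reply is accumulated from a streamed completion. A self-contained sketch of the same streaming pattern, with a hypothetical local model path:

    from llama_cpp import Llama

    # hypothetical path; the app downloads the GGUF file via hf_hub_download
    llm_model = Llama(model_path='models/google_gemma-3-4b-it-Q4_K_M.gguf',
                      n_ctx=4096, verbose=False)

    messages = [{'role': 'user', 'content': 'Hello!'}]

    # stream=True yields OpenAI-style chunks; a delta may omit 'content'
    # (e.g. the first chunk carries only the role), hence the None check
    answer = ''
    for chunk in llm_model.create_chat_completion(messages=messages, stream=True):
        token = chunk['choices'][0]['delta'].get('content')
        if token is not None:
            answer += token
    print(answer)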