sergey21000 committed
Commit 1abf289 · verified · 1 Parent(s): f19ab0f

Upload 4 files

Files changed (4):
  1. app.py +26 -5
  2. config.py +18 -4
  3. requirements.txt +13 -11
  4. utils.py +13 -8
app.py CHANGED
@@ -1,10 +1,15 @@
 from typing import List, Tuple, Optional
 
+# imported first so that this error does not occur: exception: access violation reading 0x0000000000000000
+# https://github.com/abetlen/llama-cpp-python/issues/1581
+from llama_cpp import Llama
+
 import gradio as gr
 from langchain_core.vectorstores import VectorStore
 
 from config import (
     LLM_MODEL_REPOS,
+    START_LLM_MODEL_FILE,
     EMBED_MODEL_REPOS,
     SUBTITLES_LANGUAGES,
     GENERATE_KWARGS,
@@ -95,14 +100,31 @@ def get_generate_args(do_sample: bool) -> List[gr.component]:
 
 # ================ LOADING AND INITIALIZING MODELS ========================
 
-start_llm_model, start_support_system_role, load_log = load_llm_model(LLM_MODEL_REPOS[0], 'gemma-2-2b-it-Q8_0.gguf')
-start_embed_model, load_log = load_embed_model(EMBED_MODEL_REPOS[0])
+start_llm_model, start_support_system_role, load_log = load_llm_model(
+    model_repo=LLM_MODEL_REPOS[0],
+    model_file=START_LLM_MODEL_FILE,
+)
+
+if start_llm_model['llm_model'] is None:
+    raise Exception(f'LLM model not initialized, status message: {load_log}')
 
 
+start_embed_model, load_log = load_embed_model(
+    model_repo=EMBED_MODEL_REPOS[0],
+)
+
+if start_embed_model['embed_model'] is None:
+    raise Exception(f'Embed model not initialized, status message: {load_log}')
+
 
 # ================== APPLICATION WEB INTERFACE ============================
 
-css = '''.gradio-container {width: 60% !important}'''
+css = '''
+.gradio-container {
+    width: 70% !important;
+    margin: 0 auto !important;
+}
+'''
 
 with gr.Blocks(css=css) as interface:
 
@@ -127,7 +149,6 @@ with gr.Blocks(css=css) as interface:
     chatbot = gr.Chatbot(
         type='messages', # new in gradio 5+
         show_copy_button=True,
-        bubble_full_width=False,
         height=480,
     )
     user_message = gr.Textbox(label='User')
@@ -197,7 +218,7 @@ with gr.Blocks(css=css) as interface:
        fn=user_message_to_chatbot,
        inputs=[user_message, chatbot],
        outputs=[user_message, chatbot],
-        queue=False,
+        # queue=False,
    ).then(
        fn=update_user_message_with_context,
        inputs=[chatbot, rag_mode, db, k, score_threshold, context_template],
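
Note on the app.py changes above: importing Llama before gradio and torch is the workaround for the llama-cpp-python access-violation crash (issue #1581, linked in the diff), and the new None checks make the app fail fast at startup instead of erroring on the first chat request. A minimal sketch of the resulting startup sequence, assuming load_llm_model returns a state dict keyed by 'llm_model' as in this repo's utils.py:

# Sketch of the new startup sequence (names from this repo's config.py and utils.py).
from llama_cpp import Llama  # imported first: workaround for llama-cpp-python issue #1581

from config import LLM_MODEL_REPOS, START_LLM_MODEL_FILE
from utils import load_llm_model

start_llm_model, start_support_system_role, load_log = load_llm_model(
    model_repo=LLM_MODEL_REPOS[0],
    model_file=START_LLM_MODEL_FILE,
)
if start_llm_model['llm_model'] is None:
    # fail fast at startup rather than on the first request
    raise Exception(f'LLM model not initialized, status message: {load_log}')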
config.py CHANGED
@@ -54,6 +54,13 @@ GENERATE_KWARGS = dict(
     repeat_penalty=1.0,
 )
 
+# llama-cpp-python model params
+LLAMA_MODEL_KWARGS = dict(
+    n_gpu_layers=-1,
+    verbose=False,
+    n_ctx=4096, # context size
+)
+
 # paths to LLM and embeddings models
 LLM_MODELS_PATH = Path('models')
 EMBED_MODELS_PATH = Path('embed_models')
@@ -62,10 +69,12 @@ EMBED_MODELS_PATH.mkdir(exist_ok=True)
 
 # available when running the LLM application models in GGUF format
 LLM_MODEL_REPOS = [
-    # https://huggingface.co/bartowski/gemma-2-2b-it-GGUF
-    'bartowski/gemma-2-2b-it-GGUF',
+    # https://huggingface.co/bartowski/google_gemma-3-4b-it-GGUF
+    'bartowski/google_gemma-3-4b-it-GGUF',
     # https://huggingface.co/bartowski/Qwen2.5-3B-Instruct-GGUF
     'bartowski/Qwen2.5-3B-Instruct-GGUF',
+    # https://huggingface.co/bartowski/gemma-2-2b-it-GGUF
+    'bartowski/gemma-2-2b-it-GGUF',
     # https://huggingface.co/bartowski/Qwen2.5-1.5B-Instruct-GGUF
     'bartowski/Qwen2.5-1.5B-Instruct-GGUF',
     # https://huggingface.co/bartowski/openchat-3.6-8b-20240522-GGUF
@@ -76,10 +85,17 @@ LLM_MODEL_REPOS = [
     'bartowski/Llama-3.2-3B-Instruct-GGUF',
 ]
 
+# GGUF filename for LLM_MODEL_REPOS[0]
+START_LLM_MODEL_FILE = 'google_gemma-3-4b-it-Q4_K_M.gguf'
+
 # Embedding models available at application startup
 EMBED_MODEL_REPOS = [
+    # https://huggingface.co/intfloat/multilingual-e5-small # 471 MB
+    'intfloat/multilingual-e5-small',
     # https://huggingface.co/sergeyzh/rubert-tiny-turbo # 117 MB
     'sergeyzh/rubert-tiny-turbo',
+    # https://huggingface.co/sergeyzh/BERTA # 513 MB
+    'sergeyzh/BERTA',
     # https://huggingface.co/cointegrated/rubert-tiny2 # 118 MB
     'cointegrated/rubert-tiny2',
     # https://huggingface.co/cointegrated/LaBSE-en-ru # 516 MB
@@ -90,8 +106,6 @@ EMBED_MODEL_REPOS = [
     'intfloat/multilingual-e5-large',
     # https://huggingface.co/intfloat/multilingual-e5-base # 1.11 GB
     'intfloat/multilingual-e5-base',
-    # https://huggingface.co/intfloat/multilingual-e5-small # 471 MB
-    'intfloat/multilingual-e5-small',
     # https://huggingface.co/intfloat/multilingual-e5-large-instruct # 1.12 GB
     'intfloat/multilingual-e5-large-instruct',
     # https://huggingface.co/sentence-transformers/all-mpnet-base-v2 # 438 MB
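
The new LLAMA_MODEL_KWARGS dict centralizes the Llama constructor arguments that utils.py previously hard-coded inline, and raises the context size to 4096 tokens. A minimal sketch of how the dict is consumed (the model path here is illustrative):

# How LLAMA_MODEL_KWARGS is unpacked into the Llama constructor in utils.py.
from llama_cpp import Llama

from config import LLAMA_MODEL_KWARGS

model_path = 'models/google_gemma-3-4b-it-Q4_K_M.gguf'  # illustrative local path
llm_model = Llama(model_path=str(model_path), **LLAMA_MODEL_KWARGS)
print(llm_model.n_ctx())  # 4096, from the n_ctx entry above

With the CPU-only wheel pinned in requirements.txt, n_gpu_layers=-1 (offload all layers) is effectively a no-op; it only takes effect under a GPU build of llama-cpp-python.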
requirements.txt CHANGED
@@ -1,11 +1,13 @@
---extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.4.1
-llama_cpp_python==0.2.90
-langchain==0.3.3
-langchain-community==0.3.1
-langchain-huggingface==0.1.0
-pdfminer.six==20240706
-youtube-transcript-api==0.6.2
-psutil==6.0.0
-faiss-cpu==1.9.0
-beautifulsoup4==4.12.3
+--extra-index-url https://download.pytorch.org/whl/cpu
+--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+torch==2.6.0
+https://github.com/sergey21000/llama-cpp-python-wheels/releases/download/llama-cpp-python-0.3.8-wheels/llama_cpp_python-0.3.8-cp310-cp310-linux_x86_64_cpu.whl
+gradio==5.25.2
+langchain==0.3.23
+langchain-community==0.3.21
+langchain-huggingface==0.1.2
+pdfminer.six==20250416
+youtube-transcript-api==1.0.3
+psutil==7.0.0
+faiss-cpu==1.10.0
+beautifulsoup4==4.13.4
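
The llama_cpp_python requirement is now a direct wheel URL built for CPython 3.10 on x86-64 Linux (CPU), so this file implicitly assumes that environment; the added abetlen extra-index-url serves prebuilt CPU wheels of llama-cpp-python more generally. A hypothetical guard (not part of the repo) that makes the assumption explicit:

# Hypothetical environment check: the pinned llama_cpp_python wheel above
# targets CPython 3.10 on x86-64 Linux, CPU-only.
import platform
import sys

assert sys.version_info[:2] == (3, 10), 'pinned llama_cpp_python wheel is cp310-only'
assert (platform.system(), platform.machine()) == ('Linux', 'x86_64'), \
    'pinned llama_cpp_python wheel targets x86-64 Linux'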
utils.py CHANGED
@@ -4,6 +4,8 @@ from shutil import rmtree
 from typing import List, Tuple, Dict, Union, Optional, Any, Iterable
 from tqdm import tqdm
 
+from llama_cpp import Llama
+
 import psutil
 import requests
 from requests.exceptions import MissingSchema
@@ -11,7 +13,6 @@ from requests.exceptions import MissingSchema
 import torch
 import gradio as gr
 
-from llama_cpp import Llama
 from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
 from huggingface_hub import hf_hub_download, list_repo_tree, list_repo_files, repo_info, repo_exists, snapshot_download
 
@@ -28,6 +29,7 @@ from config import (
     LLM_MODELS_PATH,
     EMBED_MODELS_PATH,
     GENERATE_KWARGS,
+    LLAMA_MODEL_KWARGS,
     LOADER_CLASSES,
 )
 
@@ -132,12 +134,14 @@ def load_llm_model(model_repo: str, model_file: str) -> Tuple[LLM_MODEL_DICT, st
         load_log += f'Model {model_file} loaded\n'
     except Exception as ex:
         model_path = ''
-        load_log += f'Error loading model, error code:\n{ex}\n'
+        load_log += f'Error downloading model, error code:\n{ex}\n'
 
     if model_path:
         progress(0.7, desc='Step 2/2: Initialize the model')
         try:
-            llm_model = Llama(model_path=str(model_path), n_gpu_layers=-1, verbose=False)
+            print('----------')
+            print(str(model_path))
+            llm_model = Llama(model_path=str(model_path), **LLAMA_MODEL_KWARGS)
             support_system_role = 'System role not supported' not in llm_model.metadata['tokenizer.chat_template']
             load_log += f'Model {model_file} initialized, max context size is {llm_model.n_ctx()} tokens\n'
         except Exception as ex:
@@ -399,7 +403,8 @@ def load_documents_and_create_db(
 
 # adding a user message to the chat bot window
 def user_message_to_chatbot(user_message: str, chatbot: CHAT_HISTORY) -> Tuple[str, CHAT_HISTORY]:
-    chatbot.append({'role': 'user', 'metadata': {'title': None}, 'content': user_message})
+    # chatbot.append({'role': 'user', 'metadata': {'title': None}, 'content': user_message})
+    chatbot.append({'role': 'user', 'content': user_message})
     return '', chatbot
 
 
@@ -412,7 +417,7 @@ def update_user_message_with_context(
     score_threshold: float,
     context_template: str,
 ) -> Tuple[str, CHAT_HISTORY]:
-
+
     user_message = chatbot[-1]['content']
     user_message_with_context = ''
 
@@ -482,19 +487,19 @@ def get_llm_response(
 
     messages = []
     if support_system_role and system_prompt:
-        messages.append({'role': 'system', 'metadata': {'title': None}, 'content': system_prompt})
+        messages.append({'role': 'system', 'content': system_prompt})
 
     if history_len != 0:
         messages.extend(chatbot[:-1][-(history_len*2):])
 
-    messages.append({'role': 'user', 'metadata': {'title': None}, 'content': user_message})
+    messages.append({'role': 'user', 'content': user_message})
     stream_response = llm_model.create_chat_completion(
         messages=messages,
         stream=True,
         **gen_kwargs,
     )
     try:
-        chatbot.append({'role': 'assistant', 'metadata': {'title': None}, 'content': ''})
+        chatbot.append({'role': 'assistant', 'content': ''})
         for chunk in stream_response:
             token = chunk['choices'][0]['delta'].get('content')
             if token is not None:
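
Dropping the 'metadata': {'title': None} key works because gradio's Chatbot with type='messages' accepts plain OpenAI-style role/content dicts, and the same dicts are valid input for llama-cpp-python's create_chat_completion, so no conversion layer is needed between the UI history and the model call. A minimal sketch of the shared format:

# One message format now serves both the gradio Chatbot history
# (type='messages') and llm_model.create_chat_completion(messages=...).
messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Hi!'},
    {'role': 'assistant', 'content': ''},  # placeholder filled while streaming tokens
]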