IST199655 committed on
Commit
59e0922
·
1 Parent(s): 192eae5
Files changed (1)
  1. app.py +70 -70
app.py CHANGED
@@ -4,71 +4,34 @@ from huggingface_hub import InferenceClient
"""
Copied from inference in colab notebook
"""
-import torch
-
-# Monkey-patch to avoid CUDA initialization issues
-torch.cuda.get_device_capability = lambda *args, **kwargs: (0, 0)
-
-from unsloth.chat_templates import get_chat_template
-from unsloth import FastLanguageModel
-
-# IMPORTING MODEL AND TOKENIZER ————————
-
-max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
-dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
-load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
-
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name = "llama_lora_model_1",
-    max_seq_length = max_seq_length,
-    dtype = dtype,
-    load_in_4bit = load_in_4bit,
-)
-
-tokenizer = get_chat_template(
-    tokenizer,
-    chat_template = "llama-3.1",
-)
-FastLanguageModel.for_inference(model) # Enable native 2x faster inference
-
-# RUNNING INFERENCE ————————————————————————
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    inputs = tokenizer.apply_chat_template(
-        messages,
-        tokenize = True,
-        add_generation_prompt = True, # Must add for generation
-        return_tensors = "pt",
-    )
-
-    outputs = model.generate(input_ids = inputs, max_new_tokens = max_tokens, use_cache = True,
-                             temperature = 1.5, min_p = 0.1)
-    response = tokenizer.batch_decode(outputs)
-
-    yield response
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-# client = InferenceClient("llama_lora_model_1")
-
+# import torch
+
+# # Monkey-patch to avoid CUDA initialization issues
+# torch.cuda.get_device_capability = lambda *args, **kwargs: (0, 0)
+
+# from unsloth.chat_templates import get_chat_template
+# from unsloth import FastLanguageModel
+
+# # IMPORTING MODEL AND TOKENIZER ————————
+
+# max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
+# dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+# load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
+
+# model, tokenizer = FastLanguageModel.from_pretrained(
+# model_name = "llama_lora_model_1",
+# max_seq_length = max_seq_length,
+# dtype = dtype,
+# load_in_4bit = load_in_4bit,
+# )
+
+# tokenizer = get_chat_template(
+# tokenizer,
+# chat_template = "llama-3.1",
+# )
+# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
+
+# # RUNNING INFERENCE ————————————————————————

# def respond(
#     message,
@@ -88,19 +51,56 @@ For more information on `huggingface_hub` Inference API support, please check th

# messages.append({"role": "user", "content": message})

-# response = ""
-
-# for message in client.chat_completion(
+# inputs = tokenizer.apply_chat_template(
# messages,
-# max_tokens=max_tokens,
-# stream=True,
-# temperature=temperature,
-# top_p=top_p,
-# ):
-# token = message.choices[0].delta.content
-
-# response += token
-# yield response
+# tokenize = True,
+# add_generation_prompt = True, # Must add for generation
+# return_tensors = "pt",
+# )
+
+# outputs = model.generate(input_ids = inputs, max_new_tokens = max_tokens, use_cache = True,
+# temperature = 1.5, min_p = 0.1)
+# response = tokenizer.batch_decode(outputs)
+
+# yield response
+
+"""
+For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
+"""
+client = InferenceClient("llama_lora_model_1")
+
+
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+):
+    messages = [{"role": "system", "content": system_message}]
+
+    for val in history:
+        if val[0]:
+            messages.append({"role": "user", "content": val[0]})
+        if val[1]:
+            messages.append({"role": "assistant", "content": val[1]})
+
+    messages.append({"role": "user", "content": message})
+
+    response = ""
+
+    for message in client.chat_completion(
+        messages,
+        max_tokens=max_tokens,
+        stream=True,
+        temperature=temperature,
+        top_p=top_p,
+    ):
+        token = message.choices[0].delta.content
+
+        response += token
+        yield response


"""