lukecq committed
Commit 531980d · verified · 1 parent: 73933cb

Update app.py

Files changed (1): app.py (+52 -23)
app.py CHANGED
@@ -8,20 +8,55 @@ import os, json
 from sys import argv
 from vllm import LLM, SamplingParams
 
+# def load_model_processor(model_path):
+#     processor = AutoProcessor.from_pretrained(model_path)
+#     llm = LLM(
+#         model=model_path, trust_remote_code=True, gpu_memory_utilization=0.8,
+#         enforce_eager=True, device="cuda",
+#         limit_mm_per_prompt={"audio": 5},
+#     )
+#     return llm, processor
+
 def load_model_processor(model_path):
     processor = AutoProcessor.from_pretrained(model_path)
-    llm = LLM(
-        model=model_path, trust_remote_code=True, gpu_memory_utilization=0.8,
-        enforce_eager=True, device="cuda",
-        limit_mm_per_prompt={"audio": 5},
-    )
-    return llm, processor
+    model = Qwen2AudioForConditionalGeneration.from_pretrained(model_path, device_map="auto")
+    model_name = model_path.split("/")[-1]
+    return model, processor, model_name
 
 model_path1 = "Qwen/Qwen2-Audio-7B-Instruct" #argv[1]
 model1, processor1 = load_model_processor(model_path1)
 
-def response_to_audio_conv(conversation, model=None, processor=None, temperature=0.1, repetition_penalty=1.1, top_p=0.9,
-                           max_new_tokens=2048):
+# def response_to_audio_conv(conversation, model=None, processor=None, temperature=0.1, repetition_penalty=1.1, top_p=0.9,
+#                            max_new_tokens=2048):
+#     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+#     audios = []
+#     for message in conversation:
+#         if isinstance(message["content"], list):
+#             for ele in message["content"]:
+#                 if ele["type"] == "audio":
+#                     if ele['audio_url'] != None:
+#                         audios.append(librosa.load(
+#                             ele['audio_url'],
+#                             sr=processor.feature_extractor.sampling_rate)[0]
+#                         )
+
+#     sampling_params = SamplingParams(
+#         temperature=temperature, max_tokens=max_new_tokens, repetition_penalty=repetition_penalty, top_p=top_p, top_k=20,
+#         stop_token_ids=[],
+#     )
+
+#     input = {
+#         'prompt': text,
+#         'multi_modal_data': {
+#             'audio': [(audio, 16000) for audio in audios]
+#         }
+#     }
+
+#     output = model.generate([input], sampling_params=sampling_params)[0]
+#     response = output.outputs[0].text
+#     return response
+
+def response_to_audio_conv(conversation, model=None, processor=None, temperature=0.1, repetition_penalty=1.1, top_p=0.9, max_new_tokens=2048):
     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
     audios = []
     for message in conversation:
@@ -33,21 +68,15 @@ def response_to_audio_conv(conversation, model=None, processor=None, temperature
                             ele['audio_url'],
                             sr=processor.feature_extractor.sampling_rate)[0]
                         )
-
-    sampling_params = SamplingParams(
-        temperature=temperature, max_tokens=max_new_tokens, repetition_penalty=repetition_penalty, top_p=top_p, top_k=20,
-        stop_token_ids=[],
-    )
-
-    input = {
-        'prompt': text,
-        'multi_modal_data': {
-            'audio': [(audio, 16000) for audio in audios]
-        }
-    }
-
-    output = model.generate([input], sampling_params=sampling_params)[0]
-    response = output.outputs[0].text
+    if audios != []:
+        inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True, sampling_rate=16000)
+    else:
+        inputs = processor(text=text, return_tensors="pt", padding=True)
+    inputs.input_ids = inputs.input_ids.to("cuda")
+    inputs = {k: v.to("cuda") for k, v in inputs.items() if v is not None}
+    generate_ids = model.generate(**inputs, max_new_tokens=2048, temperature=0.3, do_sample=True)
+    generate_ids = generate_ids[:, inputs["input_ids"].size(1):]
+    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
     return response
 
 def print_like_dislike(x: gr.LikeData):
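
For context, a minimal usage sketch of the updated Transformers code path. The "sample.wav" path and the call below are illustrative assumptions, not part of this commit, and it presumes app.py imports AutoProcessor and Qwen2AudioForConditionalGeneration from transformers, which this hunk does not show. Note the new load_model_processor returns three values (model, processor, model_name), while the unchanged call site at the top of app.py still unpacks two.

# Usage sketch (hypothetical), exercising the functions defined above.
model, processor, model_name = load_model_processor("Qwen/Qwen2-Audio-7B-Instruct")

conversation = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "sample.wav"},  # placeholder local file or URL
        {"type": "text", "text": "What is said in this clip?"},
    ]},
]

# Applies the chat template, loads each audio with librosa at the processor's
# sampling rate, runs Transformers generate(), and decodes only the new tokens.
print(response_to_audio_conv(conversation, model=model, processor=processor))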