Spaces:

piyushgrover
/

MultiModelGPT

Paused

App Files Files Community

piyushgrover committed on Jan 28, 2024

Commit

f414499

1 Parent(s): 7396aab

added app files

Browse files

Files changed (3) hide show

app.py +220 -0
config.py +2 -144
requirement.txt +12 -0

app.py ADDED Viewed

	@@ -0,0 +1,220 @@

+import gradio as gr
+import os
+import time
+from PIL import Image
+import torch
+import whisperx
+from transformers import CLIPVisionModel, CLIPImageProcessor, AutoModelForCausalLM, AutoTokenizer
+from models.vision_projector_model import VisionProjector
+from config import VisionProjectorConfig, app_config as cfg
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+clip_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+clip_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
+vision_projector = VisionProjector(VisionProjectorConfig())
+ckpt = torch.load(cfg['vision_projector_file'], map_location=torch.device(device))
+vision_projector.load_state_dict(ckpt['model_state_dict'])
+phi_base_model = AutoModelForCausalLM.from_pretrained(
+    'microsoft/phi-2',
+    low_cpu_mem_usage=True,
+    return_dict=True,
+    torch_dtype=torch.float32,
+    trust_remote_code=True
+    # device_map=device_map,
+)
+from peft import PeftModel
+phi_new_model = "models/phi_adapter"
+phi_model = PeftModel.from_pretrained(phi_base_model, phi_new_model)
+phi_model = phi_model.merge_and_unload()
+audi_model = whisperx.load_model("large-v2", device, compute_type='float16')
+tokenizer = AutoTokenizer.from_pretrained('microsoft/phi-2', trust_remote_code=True)
+tokenizer.pad_token = tokenizer.unk_token
+### app functions ##
+context_added = False
+context = None
+context_type = ''
+query = ''
+def print_like_dislike(x: gr.LikeData):
+    print(x.index, x.value, x.liked)
+def add_text(history, text):
+    global context, context_type, context_added, query
+    context_added = False
+    if not context_type and '</context>' not in text:
+        history += text
+        history += "**Please add context (upload image/audio or enter text followed by </context>"
+    elif not context_type:
+        context_type = 'text'
+        context_added = True
+        text = text.replace('</context>', ' ')
+        context = text
+    else:
+        if '</context>' in text:
+            context_type = 'text'
+            context_added = True
+            text = text.replace('</context>', ' ')
+            context = text
+        elif context_type in ['text', 'image']:
+            query = 'Human### ' + text + '\n' + 'AI### '
+    history = history + [(text, None)]
+    return history, gr.Textbox(value="", interactive=False)
+def add_file(history, file):
+    global context_added, context, context_type
+    context_added = False
+    context_type = ''
+    context = None
+    history = history + [((file.name,), None)]
+    history += [("Building context...", None)]
+    image = Image.open(file)
+    inputs = clip_processor(images=image, return_tensors="pt")
+    x = clip_model(**inputs, output_hidden_states=True)
+    image_features = x.hidden_states[-2]
+    context = vision_projector(image_features)
+    context_type = 'image'
+    context_added = True
+    return history
+def audio_file(history, audio_file):
+    global context, context_type, context_added, query
+    if audio_file:
+        history = history + [((audio_file,), None)]
+        context_added = False
+        audio = whisperx.load_audio(audio_file)
+        result = audi_model.transcribe(audio, batch_size=1)
+        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+        result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+        text = result["segments"][0]["text"]
+        resp = "🗣" + "_" + text.strip() + "_"
+        history += [(resp, None)]
+        context_type = 'text'
+        context_added = True
+        context = text
+    return history
+def bot(history):
+    global context, context_added, query, context_type
+    if context_added:
+        response = "**Please proceed with your queries**"
+        context_added = False
+        query = ''
+    else:
+        if context_type == 'image':
+            query_ids = tokenizer.encode(query)
+            query_ids = torch.tensor(query_ids, dtype=torch.int32).unsqueeze(0)
+            query_embeds = phi_model.get_input_embeddings()(query_ids)
+            inputs_embeds = torch.cat([context, query_embeds], dim=1)
+            out = phi_model.generate(inputs_embeds=inputs_embeds, min_new_tokens=10, max_new_tokens=50,
+                                     bos_token_id=tokenizer.bos_token_id)
+            response = tokenizer.decode(out[0], skip_special_tokens=True)
+        elif context_type in ['text', 'audio']:
+            input_text = context + query
+            input_tokens = tokenizer.encode(input_text)
+            input_ids = torch.tensor(input_tokens, dtype=torch.int32).unsqueeze(0)
+            inputs_embeds = phi_model.get_input_embeddings()(input_ids)
+            out = phi_model.generate(inputs_embeds=inputs_embeds, min_new_tokens=10, max_new_tokens=50,
+                                     bos_token_id=tokenizer.bos_token_id)
+            response = tokenizer.decode(out[0], skip_special_tokens=True)
+        else:
+            response = "**Please provide a valid context**"
+    if len(history[-1]) > 1:
+        history[-1][1] = ""
+        for character in response:
+            history[-1][1] += character
+            time.sleep(0.05)
+            yield history
+def clear_fn():
+    global context_added, context_type, context, query
+    context_added = False
+    context_type = ''
+    context = None
+    query = ''
+    return {
+        chatbot: None
+    }
+with gr.Blocks() as app:  # Build the UI and wire its events to the handlers above.
+    gr.Markdown(
+        """
+        # ContextGPT - A Multimodel chatbot
+        ### Upload image or audio to add a context. And then ask questions.
+        ### You can also enter text followed by \</context\> to set the context in text format.
+        """
+    )
+    chatbot = gr.Chatbot(  # Chat window; starts with an empty history.
+        [],
+        elem_id="chatbot",
+        bubble_full_width=False
+    )
+    with gr.Row():  # Context inputs: audio recorder/uploader and image upload button.
+        aud = gr.Audio(sources=['microphone', 'upload'], type='filepath', max_length=100, show_download_button=True,
+                       show_share_button=True)
+        btn = gr.UploadButton("📷", file_types=["image"])
+    with gr.Row():  # Text entry for queries and for text context.
+        txt = gr.Textbox(
+            scale=4,
+            show_label=False,
+            placeholder="Press enter to send ",
+            container=False,
+        )
+    with gr.Row():
+        clear = gr.Button("Clear")
+    txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(  # add_text first, then stream bot's reply
+        bot, chatbot, chatbot, api_name="bot_response"
+    )
+    txt_msg.then(lambda: gr.Textbox(interactive=True), None, [txt], queue=False)  # re-enable the textbox that add_text disabled
+    file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False).then(  # image upload: add_file, then bot
+        bot, chatbot, chatbot
+    )
+    chatbot.like(print_like_dislike, None, None)  # print like/dislike events
+    clear.click(clear_fn, None, chatbot, queue=False)  # Clear button runs clear_fn
+    aud.stop_recording(audio_file, [chatbot, aud], [chatbot], queue=False).then(  # finished recording: audio_file, then bot
+        bot, chatbot, chatbot, api_name="bot_response"
+    )
+    aud.upload(audio_file, [chatbot, aud], [chatbot], queue=False).then(  # uploaded audio: audio_file, then bot
+        bot, chatbot, chatbot, api_name="bot_response"
+    )
+app.queue()  # enable the request queue before launching
+app.launch()

config.py CHANGED Viewed

@@ -20,154 +20,12 @@ class VisionProjectorConfig(PretrainedConfig):
         self.kwargs = kwargs
-class CustomPhiConfig(PretrainedConfig):
-    model_type = "phi-msft"
-    attribute_map = {
-        "max_position_embeddings": "n_positions",
-        "hidden_size": "n_embd",
-        "num_attention_heads": "n_head",
-        "num_hidden_layers": "n_layer",
-    }
-    def __init__(
-            self,
-            vocab_size: int = 51200,
-            n_positions: int = 2048,
-            n_embd: int = 2560,
-            n_layer: int = 32,
-            n_inner: Optional[int] = None,
-            n_head: int = 32,
-            n_head_kv: Optional[int] = None,
-            rotary_dim: Optional[int] = 32,
-            activation_function: Optional[str] = "gelu_new",
-            flash_attn: bool = False,
-            flash_rotary: bool = False,
-            fused_dense: bool = False,
-            attn_pdrop: float = 0.0,
-            embd_pdrop: float = 0.0,
-            resid_pdrop: float = 0.1,
-            layer_norm_epsilon: float = 1e-05,
-            initializer_range: float = 0.02,
-            tie_word_embeddings: bool = False,
-            pad_vocab_size_multiple: int = 64,
-            **kwargs
-    ) -> None:
-        self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
-        self.n_positions = n_positions
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_inner = n_inner
-        self.n_head = n_head
-        self.n_head_kv = n_head_kv
-        self.rotary_dim = min(rotary_dim, n_embd // n_head)
-        self.activation_function = activation_function
-        self.flash_attn = flash_attn
-        self.flash_rotary = flash_rotary
-        self.fused_dense = fused_dense
-        self.attn_pdrop = attn_pdrop
-        self.embd_pdrop = embd_pdrop
-        self.resid_pdrop = resid_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
-class CLIPVisionToPhiConfig(PretrainedConfig):
-    def __init__(self,
-                 vision_projector_config: VisionProjectorConfig,
-                 phi_config: CustomPhiConfig,
-                 **kwargs
-                 ):
-        #super().__init__(**kwargs)
-        self.vision_projector_config = vision_projector_config
-        self.phi_config = phi_config
-        self.tokenizer = kwargs.get('tokenizer')
-        self.freeze_phi_model = True
-'''
-phi_config_obj = CustomPhiConfig(
-    **{
-      "_name_or_path": "microsoft/phi-2",
-      "architectures": [
-        "PhiForCausalLM"
-      ],
-      "auto_map": {
-        "AutoConfig": "configuration_phi.PhiConfig",
-        "AutoModelForCausalLM": "modeling_phi.PhiForCausalLM"
-      },
-      "img_processor": None,
-      "model_type": "phi-msft",
-      "torch_dtype": "float16",
-      "transformers_version": "4.35.2"
-    }
-)
-'''
-from peft import LoraConfig
-bnb_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.float16
-)
-peft_config = LoraConfig(
-    lora_alpha=16,
-    lora_dropout=0.1,
-    r=64,
-    bias="none",
-    task_type="CAUSAL_LM",
-    target_modules=[
-        "q_proj",
-        "k_proj",
-        "v_proj",
-        "dense",
-        "fc1",
-        "fc2"
-    ]
-)
-class MultiInstructModelConfig(PretrainedConfig):
-    def __init__(self,
-                 vision_projector_config: Optional[VisionProjectorConfig] = None,
-                 **kwargs
-                 ):
-        self.vision_projector_config = vision_projector_config
-        self.quantization_config = bnb_config
-        self.peft_config = peft_config
-        self.tokenizer = kwargs.get('tokenizer')
-        self.freeze_vision_projector = True
-extra = dict(
-    num_epochs=1,
-    resume=False,
-    data_dir='../data',
-    checkpoint_dir='../checkpoints',
-    max_seqlen=80,
-    batch_size=2,
-    live_image_processing=True,
-    vision_projector_file='/Users/piyushgrover/Downloads/old_vt_proj/vp_ckpt_0.pth',
-    validation_phase=False
-)
-qlora_config = dict(
-    num_steps=1000,
     max_seqlen=512,
     max_caption_len=100,
-    batch_size=8,
-    micro_batch_size=2,
     data_dir='../data',
     output_dir="./results",
     vision_model=True,
     vision_projector_file='models/vision_projector/vp_ckpt_0.pth',
-    resume=False
 )

         self.kwargs = kwargs
+app_config = dict(
     max_seqlen=512,
     max_caption_len=100,
     data_dir='../data',
     output_dir="./results",
     vision_model=True,
     vision_projector_file='models/vision_projector/vp_ckpt_0.pth',
+    phi_adapter_dir='models/phi_adapter'
 )

requirement.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+torch
+numpy
+trl
+transformers
+accelerate
+git+https://github.com/huggingface/peft.git
+datasets
+bitsandbytes
+einops
+wandb
+git+https://github.com/m-bain/whisperx.git
+scipy