Spaces: Running on Zero
bugfix: not using pipeline for response generation
app.py CHANGED
@@ -189,28 +189,27 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
 
     # Load the pipeline.
     pipe = load_pipeline(model_name)
-    # … (previous manual generation setup; the removed lines were not preserved in this view)
-    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
+    # Set up a streamer tied to the pipeline’s tokenizer.
+    streamer = TextIteratorStreamer(
+        pipe.tokenizer,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
+
+    # Kick off generation via the pipeline itself.
+    thread = threading.Thread(
+        target=pipe,
+        args=(prompt_text,),
+        kwargs={
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "top_k": top_k,
+            "top_p": top_p,
+            "repetition_penalty": repeat_penalty,
+            "streamer": streamer,
+            "return_full_text": False,
+        }
+    )
     thread.start()
 
     # Collect tokens from the streamer as they are generated.
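For reference, below is a minimal, self-contained sketch of the streaming pattern the new code follows. The model name, prompt, and sampling values are placeholders rather than the Space's actual settings, and do_sample=True is an assumption added so that the temperature/top-k/top-p values take effect; in app.py the pipeline, prompt_text, and generation parameters come from the surrounding chat_response code not shown in this hunk.

import threading

from transformers import pipeline, TextIteratorStreamer

# Build a text-generation pipeline (stands in for load_pipeline(model_name)).
# Placeholder model; the real Space loads its own checkpoint.
pipe = pipeline("text-generation", model="Qwen/Qwen2.5-0.5B-Instruct")

# The streamer decodes tokens as they are produced, skipping the prompt text
# and any special tokens.
streamer = TextIteratorStreamer(
    pipe.tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
)

prompt_text = "Explain why the sky is blue in one sentence."

# Run the pipeline in a background thread so the main thread is free to read
# from the streamer while generation is still in progress.
thread = threading.Thread(
    target=pipe,
    args=(prompt_text,),
    kwargs={
        "max_new_tokens": 128,
        "do_sample": True,          # assumption: enable sampling so the settings below apply
        "temperature": 0.7,
        "top_k": 50,
        "top_p": 0.95,
        "repetition_penalty": 1.1,
        "streamer": streamer,
        "return_full_text": False,  # yield only the newly generated text
    },
)
thread.start()

# Collect tokens from the streamer as they are generated and accumulate them
# into the partial response (this mirrors the loop that follows the hunk).
partial = ""
for new_text in streamer:
    partial += new_text
    print(new_text, end="", flush=True)

thread.join()

Targeting the pipeline instead of model.generate lets the pipeline handle tokenizing the prompt and forwarding the generation kwargs, so the tokenizer used for encoding is the same one the streamer uses for decoding.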