Makhinur committed
Commit f496ef0 · verified · 1 Parent(s): a0a8457

Update main.py

Files changed (1)
  1. main.py +21 -23
main.py CHANGED
@@ -37,18 +37,17 @@ from deep_translator.exceptions import InvalidSourceOrTargetLanguage
 app = FastAPI()
 
 # --- Llama.cpp Language Model Setup (Local CPU Inference) ---
-# Repository on Hugging Face Hub containing the Qwen1.5 1.8B GGUF file
-# Using the OFFICIAL Qwen repository:
-LLM_MODEL_REPO = "Qwen/Qwen1.5-1.8B-Chat-GGUF" # Updated to official repo
+# Repository on Hugging Face Hub containing the Qwen1.5 0.5B GGUF file
+# Using the OFFICIAL Qwen 0.5B repository:
+LLM_MODEL_REPO = "Qwen/Qwen1.5-0.5B-Chat-GGUF" # Updated to official 0.5B repo
 
 # Specify the filename for a Q4_K_M quantized version (good balance of speed/quality on CPU)
-# Based on DIRECT VERIFICATION of files in the OFFICIAL Qwen repo:
-# Changed DOT before Q4_K_M back to a HYPHEN to match the official repo's filename.
-LLM_MODEL_FILE = "qwen1_5-1.8b-chat-Q4_K_M.gguf" # Correct filename for the OFFICIAL repo
+# Based on DIRECT VERIFICATION of the files in the 0.5B repo:
+LLM_MODEL_FILE = "qwen1_5-0_5b-chat-q4_k_m.gguf" # Exact filename in the 0.5B repo
 
 # Original model name for the tokenizer (needed by transformers)
-# This remains the same as it points to the base model repository for the tokenizer files.
-ORIGINAL_MODEL_NAME = "Qwen/Qwen1.5-1.8B-Chat"
+# This points to the base model repository for the tokenizer files.
+ORIGINAL_MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat" # Updated to the 0.5B Chat model
 
 tokenizer = None # Using transformers tokenizer for chat templating
 llm_model = None # This will hold the llama_cpp.Llama instance
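For context, here is a minimal sketch of how the tokenizer constant above is typically used; the diff does not show the loading code itself, so the AutoTokenizer call and the example messages are illustrative assumptions (the file later relies on llama.cpp's built-in chat templating anyway).

from transformers import AutoTokenizer

# Sketch only: load the chat tokenizer for the 0.5B base model (ORIGINAL_MODEL_NAME above).
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat")

# Its chat template can turn a message list into a prompt string if ever needed directly.
messages = [{"role": "user", "content": "Write a short story about a lighthouse."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)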
@@ -86,7 +85,6 @@ def load_language_model():
 
 # --- Download GGUF model file (using huggingface_hub) ---
 print(f"Downloading GGUF model file: {LLM_MODEL_FILE} from {LLM_MODEL_REPO}...")
-# hf_hub_download downloads the file to the Hugging Face cache directory
 model_path = hf_hub_download(
 repo_id=LLM_MODEL_REPO,
 filename=LLM_MODEL_FILE,
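As a standalone sketch of the download step in this hunk: hf_hub_download fetches the file into the local Hugging Face cache and returns its path (the repo and filename literals simply repeat the constants defined earlier).

from huggingface_hub import hf_hub_download

# Sketch: download the quantized GGUF file and get its cached local path.
model_path = hf_hub_download(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",    # LLM_MODEL_REPO
    filename="qwen1_5-0_5b-chat-q4_k_m.gguf", # LLM_MODEL_FILE
)
print(model_path)  # a path inside the Hugging Face cache directory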
@@ -98,12 +96,12 @@ def load_language_model():
 print(f"Loading GGUF model into llama_cpp...")
 # Instantiate the Llama model from the downloaded GGUF file
 # n_gpu_layers=0: Crucial for forcing CPU-only inference
-# n_ctx: Context window size (tokens model can consider), match model's spec if possible (Qwen1.5 1.8B has 32768 context, but 4096 or 8192 is often sufficient and uses less RAM)
+# n_ctx: Context window size (tokens the model can consider). Qwen1.5 0.5B supports up to 32768 tokens of context, but 4096 or 8192 is usually sufficient and uses far less RAM.
 # n_threads: Number of CPU threads to use. Set to your vCPU count (2) for better performance.
 llm_model = Llama(
 model_path=model_path,
 n_gpu_layers=0, # Explicitly use CPU
-n_ctx=4096, # Context window size (adjust if needed)
+n_ctx=4096, # Context window size (4096 is a common safe value)
 n_threads=2 # Use 2 CPU threads
 )
 print("Llama.cpp model loaded successfully.")
@@ -141,7 +139,7 @@ def initialize_caption_client():
 # Load models and initialize clients when the app starts
 @app.on_event("startup")
 async def startup_event():
-# Load the language model (Qwen1.5 1.8B GGUF via llama.cpp)
+# Load the language model (Qwen1.5 0.5B GGUF via llama.cpp)
 load_language_model()
 # Initialize the client for the external captioning Space
 initialize_caption_client()
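For reference, the startup pattern used here in compact form: the hook runs once when the app boots and fills the module-level globals before the first request (load_language_model and initialize_caption_client are defined elsewhere in main.py).

from fastapi import FastAPI

app = FastAPI()

# Sketch: run the heavy loading exactly once at application startup.
@app.on_event("startup")
async def startup_event():
    load_language_model()        # sets the global tokenizer / llm_model
    initialize_caption_client()  # connects to the external captioning Space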
@@ -208,11 +206,11 @@ def generate_image_caption(image_file: UploadFile):
 print(f"Error removing temporary file {temp_file_path}: {e}") # Log cleanup errors
 
 
-# --- Language Model Story Generation Function (Qwen1.5 1.8B via llama.cpp) ---
+# --- Language Model Story Generation Function (Qwen1.5 0.5B via llama.cpp) ---
 # Renamed function to reflect the model being used
-def generate_story_qwen(prompt_text: str, max_new_tokens: int = 300, temperature: float = 0.7, top_p: float = 0.9, top_k: int = 50) -> str:
+def generate_story_qwen_0_5b(prompt_text: str, max_new_tokens: int = 300, temperature: float = 0.7, top_p: float = 0.9, top_k: int = 50) -> str:
 """
-Generates text using the loaded Qwen1.5 1.8B model via llama.cpp.
+Generates text using the loaded Qwen1.5 0.5B model via llama.cpp.
 Uses the tokenizer to apply the chat template and calls llama.cpp's chat completion.
 """
 # Check if the language model was loaded successfully at startup
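The renamed function builds an OpenAI-style message list before handing it to llama.cpp; a sketch of that structure is below (the system message text is an illustrative assumption, not copied from the file).

# Sketch: chat messages for create_chat_completion; only the structure matters here.
prompt_text = "Write an attractive story of around 300 words about ..."
messages = [
    {"role": "system", "content": "You are a helpful storytelling assistant."},  # assumed wording
    {"role": "user", "content": prompt_text},
]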
@@ -230,7 +228,7 @@ def generate_story_qwen(prompt_text: str, max_new_tokens: int = 300, temperature
 ]
 
 try:
-print("Calling llama.cpp create_chat_completion for Qwen...")
+print("Calling llama.cpp create_chat_completion for Qwen 0.5B...")
 # Call the create_chat_completion method from llama_cpp.Llama instance
 # This method handles the chat templating internally for models like Qwen.
 # max_tokens is the max number of tokens to generate
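A sketch of the create_chat_completion call described by these comments; llama-cpp-python applies the model's chat template internally and returns an OpenAI-style response dict (llm_model and messages come from the surrounding code).

# Sketch: non-streaming chat completion with the sampling parameters used above.
response = llm_model.create_chat_completion(
    messages=messages,
    max_tokens=300,   # cap on newly generated tokens
    temperature=0.7,
    top_p=0.9,
    stream=False,     # return the full response at once
)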
@@ -244,7 +242,7 @@ def generate_story_qwen(prompt_text: str, max_new_tokens: int = 300, temperature
 # top_k=top_k,
 stream=False # We want the full response at once
 )
-print("Llama.cpp completion received for Qwen.")
+print("Llama.cpp completion received for Qwen 0.5B.")
 
 # Parse the response to get the generated text content
 # The response structure is typically like OpenAI's chat API response
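The next hunk only tweaks the log messages around this parsing; the pattern itself, compressed into a sketch (the guard condition is paraphrased from context, and in the real file the try block also wraps the model call):

# Sketch: pull the generated text out of the OpenAI-style response, defensively.
try:
    if response and response.get("choices"):
        story = response["choices"][0].get("message", {}).get("content", "")
    else:
        print("Warning: unexpected llama.cpp response structure.")
        story = ""
except Exception as e:
    # Surface any llama.cpp failure as a RuntimeError for the endpoint to handle.
    raise RuntimeError(f"Llama.cpp inference failed: {type(e).__name__}: {e}")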
@@ -252,12 +250,12 @@ def generate_story_qwen(prompt_text: str, max_new_tokens: int = 300, temperature
 story = response['choices'][0].get('message', {}).get('content', '')
 else:
 # Handle cases where the response is empty or has an unexpected structure
-print("Warning: Llama.cpp Qwen response structure unexpected or content missing.")
+print("Warning: Llama.cpp Qwen 0.5B response structure unexpected or content missing.")
 story = "" # Return an empty string if content is not found
 
 except Exception as e:
 # Catch any exception that occurs during the llama.cpp inference process
-print(f"Llama.cpp Qwen inference failed: {e}") # Log the error server-side
+print(f"Llama.cpp Qwen 0.5B inference failed: {e}") # Log the error server-side
 # Re-raise as a RuntimeError to indicate failure to the endpoint
 raise RuntimeError(f"Llama.cpp inference failed: {type(e).__name__}: {e}")
 
@@ -291,10 +289,10 @@ async def generate_story_endpoint(image_file: UploadFile = File(...), language:
 # This prompt instructs the model on what to write and incorporates the caption.
 prompt_text = f"Write an attractive story of around 300 words about {story_theme}. Incorporate the following details from an image description into the story: {caption}\n\nStory:"
 
-# Step 3: Generate the story using the local language model (Qwen via llama.cpp)
+# Step 3: Generate the story using the local language model (Qwen 0.5B via llama.cpp)
 try:
-# Call the Qwen story generation function
-story = generate_story_qwen( # <--- Use the new function name
+# Call the Qwen 0.5B story generation function
+story = generate_story_qwen_0_5b( # <--- Use the updated function name
 prompt_text,
 max_new_tokens=300, # Request ~300 new tokens
 temperature=0.7, # Sampling parameters
@@ -304,7 +302,7 @@ async def generate_story_endpoint(image_file: UploadFile = File(...), language:
 story = story.strip() # Basic cleanup of generated story text
 
 except RuntimeError as e:
-# Catch specific RuntimeError raised by generate_story_qwen if LLM loading or inference fails
+# Catch specific RuntimeError raised by generate_story_qwen_0_5b if LLM loading or inference fails
 print(f"Language model generation error: {e}") # Log the error server-side
 # Return a 503 Service Unavailable error if the LLM is not available or failed
 raise HTTPException(status_code=503, detail=f"Story generation failed (LLM): {e}")
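Finally, the endpoint-level error handling updated in these last two hunks, as a compact sketch: a RuntimeError from the generation helper becomes a 503 so clients can tell LLM unavailability apart from other failures (prompt_text and the helper come from the surrounding endpoint code).

from fastapi import HTTPException

# Sketch: map LLM failures to 503 Service Unavailable inside the endpoint.
try:
    story = generate_story_qwen_0_5b(
        prompt_text,
        max_new_tokens=300,
        temperature=0.7,
        top_p=0.9,
    )
    story = story.strip()
except RuntimeError as e:
    print(f"Language model generation error: {e}")
    raise HTTPException(status_code=503, detail=f"Story generation failed (LLM): {e}")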
 