from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr
import multiprocessing
import time
import os

# Model paths - download models if not already cached
def get_model_path(repo_id, filename):
    print(f"Obtaining {filename}...")
    return hf_hub_download(repo_id=repo_id, filename=filename)

# Get models
base_model_path = get_model_path(
    "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
    "articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
    "johnpaulbin/articulate-V1-Q8_0-GGUF",
    "articulate-V1-q8_0.gguf"
)

# CPU optimization settings
cpu_count = multiprocessing.cpu_count()
physical_cores = max(1, cpu_count // 2)  # Estimate physical cores
optimal_threads = max(4, physical_cores - 1)  # Leave one core free for the system
batch_size = int(os.environ.get("BATCH_SIZE", "512"))  # Configurable batch size

print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")

# Initialize model with optimized parameters
start_time = time.time()
llm = Llama(
    model_path=base_model_path,
    lora_path=adapter_path,
    n_ctx=512,                  # Context length
    n_threads=optimal_threads,  # Optimized thread count
    n_batch=batch_size,         # Process more tokens in parallel
    use_mmap=True,              # More efficient memory usage
    n_gpu_layers=0,             # CPU only
    seed=42,                    # Consistent results
    verbose=False               # Reduce logging overhead
)
print(f"Model loaded in {time.time() - start_time:.2f} seconds")

# Translation cache
translation_cache = {}
MAX_CACHE_SIZE = 100  # Limit cache size


def translate(direction, text):
    # Skip empty inputs
    if not text or not text.strip():
        return ""

    # Check cache first for faster response
    cache_key = f"{direction}:{text}"
    if cache_key in translation_cache:
        return translation_cache[cache_key]

    # Start timing for performance tracking
    start_time = time.time()

    # Map language directions
    lang_map = {
        "English to Spanish": ("ENGLISH", "SPANISH"),
        "Spanish to English": ("SPANISH", "ENGLISH"),
        "Korean to English": ("KOREAN", "ENGLISH"),
        "English to Korean": ("ENGLISH", "KOREAN")
    }

    if direction not in lang_map:
        return "Invalid direction"

    source_lang, target_lang = lang_map[direction]

    # Efficient prompt format
    prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"

    # Estimate appropriate token length based on input
    input_tokens = len(text.split())
    max_tokens = min(200, max(50, int(input_tokens * 1.5)))

    # Generate translation with optimized settings
    response = llm.create_completion(
        prompt,
        max_tokens=max_tokens,
        temperature=0.0,     # Deterministic for faster inference
        top_k=1,             # Only consider the most likely token
        top_p=1.0,           # No sampling
        repeat_penalty=1.0,  # No repeat penalty processing
        stream=False         # Get the complete response at once (faster)
    )

    translation = response['choices'][0]['text'].strip()

    # Cache result
    if len(translation_cache) >= MAX_CACHE_SIZE:
        # Remove oldest entry (first key)
        translation_cache.pop(next(iter(translation_cache)))
    translation_cache[cache_key] = translation

    # Log performance
    inference_time = time.time() - start_time
    tokens_per_second = (input_tokens + len(translation.split())) / inference_time
    print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")

    return translation


# Create Gradio interface with minimal overhead
with gr.Blocks(title="Fast Translation App") as iface:
    gr.Markdown("## Translation App")

    with gr.Row():
        direction = gr.Dropdown(
            choices=["English to Spanish", "Spanish to English",
                     "Korean to English", "English to Korean"],
            label="Translation Direction",
            value="English to Spanish"
        )

    with gr.Row():
        input_text = gr.Textbox(lines=5, label="Input Text")
        output_text = gr.Textbox(lines=5, label="Translation")

    # Add translate button
    translate_btn = gr.Button("Translate")
    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)

    # Add examples for convenience
    gr.Examples(
        examples=[
            ["English to Spanish", "Hello, how are you today?"],
            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
            ["English to Korean", "The weather is nice today."],
            ["Korean to English", "오늘 날씨가 좋습니다."]
        ],
        fn=translate,  # fn is required so the cached examples can be pre-computed
        inputs=[direction, input_text],
        outputs=output_text,
        cache_examples=True  # Pre-compute examples
    )

# Launch with optimized settings
iface.launch(debug=False, show_error=True)