johnpaulbin committed · verified
Commit 6509da1 · 1 Parent(s): 680d5eb

Update app.py

Files changed (1):
  app.py  +64 -476

app.py CHANGED
@@ -1,482 +1,70 @@
- import os
- import time
- import threading
- import queue
- import multiprocessing
- from pathlib import Path
- import torch
- import gradio as gr
  from huggingface_hub import hf_hub_download
- import numpy as np
-
- # Set up environment variables for CPU optimization
- os.environ["OMP_NUM_THREADS"] = str(max(1, multiprocessing.cpu_count() - 1))  # Optimal OpenMP threads
- os.environ["MKL_NUM_THREADS"] = str(max(1, multiprocessing.cpu_count() - 1))  # Optimal MKL threads
- os.environ["LLAMA_AVX"] = "1"
- os.environ["LLAMA_AVX2"] = "1"
- os.environ["LLAMA_F16"] = "1"
-
- # Cache directories
- CACHE_DIR = Path.home() / ".cache" / "fast_translate"
- MODEL_CACHE = CACHE_DIR / "models"
- QUANTIZED_CACHE = CACHE_DIR / "quantized"
- os.makedirs(MODEL_CACHE, exist_ok=True)
- os.makedirs(QUANTIZED_CACHE, exist_ok=True)
-
- # Check if we're running on CPU
- has_gpu = torch.cuda.is_available()
- gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
- print(f"GPU available: {has_gpu} - {gpu_name}")
-
- # Configure CPU settings
- cpu_count = multiprocessing.cpu_count()
- optimal_threads = max(4, cpu_count - 1)  # Leave one core free
- print(f"Using {optimal_threads} of {cpu_count} CPU cores")
-
- # Download model files
- def get_model_path(repo_id):
-     print(f"Obtaining {repo_id}...")
-     # Download to our custom cache location
-     return hf_hub_download(repo_id=repo_id, cache_dir=MODEL_CACHE)
-
- # Function to quantize model to int4 or int8
- def quantize_model(input_model_path, output_model_path, quantization_type="q4_0"):
-     """Quantize model to lower precision for faster inference on CPU"""
-     try:
-         from llama_cpp import llama_model_quantize
-
-         # Check if quantized model already exists
-         if os.path.exists(output_model_path):
-             print(f"Using existing quantized model: {output_model_path}")
-             return output_model_path
-
-         print(f"Quantizing model to {quantization_type}...")
-         start_time = time.time()
-
-         # Quantize using llama-cpp-python built-in quantization
-         llama_model_quantize(
-             input_model_path,
-             output_model_path,
-             quantization_type
-         )
-
-         print(f"Quantization completed in {time.time() - start_time:.2f}s")
-         return output_model_path
-     except Exception as e:
-         print(f"Quantization failed: {e}, using original model")
-         return input_model_path
-
- # Download models
- base_model_path = get_model_path(
-     "johnpaulbin/articulate-11-expspanish-base-merged"
- )
- adapter_path = get_model_path(
-     "johnpaulbin/articulate-V1"
- )
-
- # Quantize models (creates int4 versions for faster CPU inference)
- quantized_base_path = str(QUANTIZED_CACHE / "articulate-base-q4_0.gguf")
- quantized_adapter_path = str(QUANTIZED_CACHE / "articulate-adapter-q4_0.gguf")
- base_model_path = quantize_model(base_model_path, quantized_base_path, "q4_0")
- adapter_path = quantize_model(adapter_path, quantized_adapter_path, "q4_0")
-
- # Import after setting environment variables
  from llama_cpp import Llama

- # Translation cache
- translation_cache = {}
- MAX_CACHE_SIZE = 1000
-
- # Model worker with batching support
- class ModelWorker:
-     def __init__(self):
-         self.model = None
-         self.request_queue = queue.Queue()
-         self.response_queue = queue.Queue()
-         self.batch_queue = []
-         self.batch_event = threading.Event()
-         self.batch_size = 4  # Process up to 4 requests at once
-         self.batch_timeout = 0.1  # Wait 100ms max to collect batch
-         self.worker_thread = threading.Thread(target=self._worker_loop, daemon=True)
-         self.batch_thread = threading.Thread(target=self._batch_loop, daemon=True)
-         self.worker_thread.start()
-         self.batch_thread.start()
-
-     def _batch_loop(self):
-         """Collect requests into batches for more efficient processing"""
-         while True:
-             try:
-                 # Get a request
-                 request = self.request_queue.get()
-                 if request is None:
-                     break
-
-                 # Add to batch
-                 self.batch_queue.append(request)
-
-                 # Try to collect more requests for the batch
-                 batch_start = time.time()
-                 while (len(self.batch_queue) < self.batch_size and
-                        time.time() - batch_start < self.batch_timeout):
-                     try:
-                         req = self.request_queue.get_nowait()
-                         if req is None:
-                             break
-                         self.batch_queue.append(req)
-                     except queue.Empty:
-                         time.sleep(0.01)
-
-                 # Signal worker to process the batch
-                 current_batch = self.batch_queue.copy()
-                 self.batch_queue = []
-                 for req in current_batch:
-                     self._process_request(req)
-
-             except Exception as e:
-                 print(f"Error in batch thread: {e}")
-
-     def _worker_loop(self):
-         """Initialize model and process requests"""
-         try:
-             # Initialize model with optimized settings
-             print("Initializing model in background thread...")
-             start_time = time.time()
-
-             # Create model context with very optimized settings for CPU
-             self.model = Llama(
-                 model_path=base_model_path,
-                 lora_path=adapter_path,
-                 n_ctx=256,  # Smaller context for speed
-                 n_threads=optimal_threads,  # Use all but one CPU core
-                 n_batch=512,  # Smaller batch for CPU
-                 use_mmap=True,  # Memory mapping (more efficient)
-                 n_gpu_layers=0,  # Force CPU only
-                 seed=42,  # Consistent results
-                 rope_freq_base=10000,  # Default RoPE settings
-                 rope_freq_scale=1.0,
-                 verbose=False  # Reduce overhead
-             )
-
-             print(f"Model loaded in {time.time() - start_time:.2f} seconds")
-
-             # Pre-warm the model with common phrases by running a simple inference
-             print("Pre-warming model...")
-             self.model.create_completion("[ENGLISH]hello[SPANISH]", max_tokens=8)
-             print("Model ready for translation")
-
-         except Exception as e:
-             print(f"Failed to initialize model: {e}")
-
-     def _process_request(self, request):
-         """Process a single translation request"""
-         try:
-             direction, text, callback_id = request
-             result = self._process_translation(direction, text)
-             self.response_queue.put((callback_id, result))
-         except Exception as e:
-             print(f"Error processing request: {e}")
-             self.response_queue.put((callback_id, f"Error: {str(e)}"))
-
-     def _process_translation(self, direction, text):
-         """Translate text with optimized settings"""
-         if not text or not text.strip():
-             return ""
-
-         # Check cache first for faster response
-         cache_key = f"{direction}:{text}"
-         if cache_key in translation_cache:
-             print("Cache hit!")
-             return translation_cache[cache_key]
-
-         # Start timing for performance tracking
-         start_time = time.time()
-
-         # Map language directions
-         lang_map = {
-             "English to Spanish": ("ENGLISH", "SPANISH"),
-             "Spanish to English": ("SPANISH", "ENGLISH"),
-             "Korean to English": ("KOREAN", "ENGLISH"),
-             "English to Korean": ("ENGLISH", "KOREAN")
-         }
-
-         if direction not in lang_map:
-             return "Invalid direction"
-
-         source_lang, target_lang = lang_map[direction]
-
-         # Efficient prompt format
-         prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
-
-         # Estimate appropriate token length based on input
-         input_tokens = min(100, max(10, len(text.split())))
-         max_tokens = min(100, max(25, int(input_tokens * 1.3)))
-
-         # Generate translation with aggressively optimized settings for speed
-         response = self.model.create_completion(
-             prompt,
-             max_tokens=max_tokens,
-             temperature=0.0,  # Deterministic
-             top_k=1,  # Most likely token
-             top_p=1.0,  # No sampling
-             repeat_penalty=1.0,  # No penalty
-             stream=False  # Get complete response
-         )
-
-         translation = response['choices'][0]['text'].strip()
-
-         # Cache result
-         if len(translation_cache) >= MAX_CACHE_SIZE:
-             # Remove oldest entry (first key)
-             translation_cache.pop(next(iter(translation_cache)))
-         translation_cache[cache_key] = translation
-
-         # Log performance
-         inference_time = time.time() - start_time
-         tokens_per_second = (input_tokens + len(translation.split())) / inference_time
-         print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
-
-         return translation
-
-     def request_translation(self, direction, text, callback_id):
-         """Queue a translation request"""
-         self.request_queue.put((direction, text, callback_id))
-
- # Model preloading thread that preloads and pre-computes common translations
- def preload_common_phrases(worker):
-     # Dictionary of common phrases that will benefit from caching
-     common_phrases = {
-         "English to Spanish": [
-             "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
-             "I don't understand", "Please", "Sorry", "Yes", "No", "Where is",
-             "How much does it cost?", "What time is it?", "I don't speak Spanish",
-             "Where is the bathroom?", "I need help", "Can you help me?"
-         ],
-         "Spanish to English": [
-             "Hola", "Gracias", "Buenos días", "¿Cómo estás?", "¿Cómo te llamas?",
-             "No entiendo", "Por favor", "Lo siento", "Sí", "No", "Dónde está",
-             "¿Cuánto cuesta?", "¿Qué hora es?", "No hablo español", "¿Dónde está el baño?",
-             "Necesito ayuda", "¿Puedes ayudarme?"
-         ],
-         "English to Korean": [
-             "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
-             "I don't understand", "Please", "Sorry", "Yes", "No", "Where is",
-             "How much is this?", "What time is it?", "I don't speak Korean"
-         ],
-         "Korean to English": [
-             "안녕하세요", "감사합니다", "좋은 아침입니다", "어떻게 지내세요?", "이름이 뭐예요?",
-             "이해가 안 돼요", "제발", "죄송합니다", "네", "아니요", "어디에 있어요",
-             "이거 얼마예요?", "지금 몇 시예요?", "한국어를 못해요"
-         ]
-     }
-
-     preload_requests = []
-     for direction, phrases in common_phrases.items():
-         for phrase in phrases:
-             preload_requests.append((direction, phrase, f"preload_{len(preload_requests)}"))
-
-     # Process preloading in a separate thread
-     def preloader():
-         print(f"Preloading {len(preload_requests)} common phrases in background...")
-         for request in preload_requests:
-             worker.request_translation(*request)
-             # Small sleep to avoid overwhelming the queue
-             time.sleep(0.1)
-         print("Preloading complete")
-
-     thread = threading.Thread(target=preloader, daemon=True)
-     thread.start()
-     return thread
-
- # Create worker instance
- worker = ModelWorker()
-
- # Start preloading common phrases in background
- preload_thread = preload_common_phrases(worker)
-
- # Counter for request IDs
- next_request_id = 0
-
- # Implementation of a faster sentence splitter for batching
- def split_sentences(text, max_length=50):
-     """Split text into manageable chunks for faster translation"""
-     if len(text) <= max_length:
-         return [text]
-
-     # Split on natural boundaries
-     delimiters = ['. ', '! ', '? ', '.\n', '!\n', '?\n', '\n\n']
-     chunks = []
-     current_chunk = ""
-
-     lines = text.split('\n')
-     for line in lines:
-         if not line.strip():
-             if current_chunk:
-                 chunks.append(current_chunk)
-                 current_chunk = ""
-             continue
-
-         words = line.split(' ')
-         for word in words:
-             test_chunk = f"{current_chunk} {word}".strip()
-             if len(test_chunk) > max_length:
-                 chunks.append(current_chunk)
-                 current_chunk = word
-             else:
-                 current_chunk = test_chunk
-
-             # Check for natural breaks
-             for delimiter in delimiters:
-                 if delimiter in current_chunk[-len(delimiter):]:
-                     chunks.append(current_chunk)
-                     current_chunk = ""
-                     break
-
-     if current_chunk:
-         chunks.append(current_chunk)
-
-     return chunks
-
- # Gradio interface functions
- def translate(direction, text, progress=gr.Progress()):
-     """Fast translation with batching and caching"""
-     global next_request_id
-
-     # Skip empty inputs
-     if not text or not text.strip():
-         return ""
-
-     # Check exact cache hit
-     cache_key = f"{direction}:{text}"
-     if cache_key in translation_cache:
-         return translation_cache[cache_key]
-
-     # For longer texts, split into sentences for faster processing
-     if len(text) > 50:
-         progress(0.1, desc="Processing text...")
-         chunks = split_sentences(text)
-         if len(chunks) > 1:
-             results = []
-             for i, chunk in enumerate(chunks):
-                 # Check if this chunk is in cache
-                 chunk_key = f"{direction}:{chunk}"
-                 if chunk_key in translation_cache:
-                     results.append(translation_cache[chunk_key])
-                     continue
-
-                 # Request translation for this chunk
-                 chunk_id = next_request_id
-                 next_request_id += 1
-                 worker.request_translation(direction, chunk, chunk_id)
-
-                 # Wait for response
-                 chunk_start = time.time()
-                 while time.time() - chunk_start < 10:  # 10 second timeout per chunk
-                     progress((i + 0.5) / len(chunks), desc=f"Translating part {i+1}/{len(chunks)}")
-
-                     try:
-                         while not worker.response_queue.empty():
-                             resp_id, result = worker.response_queue.get_nowait()
-                             if resp_id == chunk_id:
-                                 results.append(result)
-                                 chunk_found = True
-                                 break
-                     except queue.Empty:
-                         pass
-
-                     time.sleep(0.05)
-
-                 if len(results) != i + 1:
-                     results.append(f"[Translation failed for part {i+1}]")
-
-             combined = " ".join(results)
-             translation_cache[cache_key] = combined
-             progress(1.0)
-             return combined
-
-     # For single sentences
-     request_id = next_request_id
-     next_request_id += 1
-
-     # Queue the request
-     worker.request_translation(direction, text, request_id)
-
-     # Wait for the response
-     progress(0.2, desc="Translating...")
-     start_time = time.time()
-     max_wait = 20  # Maximum wait time in seconds
-
-     while time.time() - start_time < max_wait:
-         progress(0.2 + 0.8 * ((time.time() - start_time) / max_wait), desc="Translating...")
-
-         # Check for our response
-         try:
-             while not worker.response_queue.empty():
-                 resp_id, result = worker.response_queue.get_nowait()
-                 if resp_id == request_id:
-                     progress(1.0)
-                     return result
-         except queue.Empty:
-             pass
-
-         # Small sleep to prevent CPU hogging
-         time.sleep(0.05)
-
-     progress(1.0)
-     return "Translation timed out. Please try again with a shorter text."

- # Create Gradio interface
- with gr.Blocks(title="Ultra-Fast Translation App (CPU Optimized)") as iface:
-     gr.Markdown(f"""
-     ## Ultra-Fast Translation App (CPU Optimized)
-     Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU optimized with int4 quantization'}
-     """)
-
-     with gr.Row():
-         direction = gr.Dropdown(
-             choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
-             label="Translation Direction",
-             value="English to Spanish"
-         )
-
-     with gr.Row():
-         input_text = gr.Textbox(lines=5, label="Input Text", placeholder="Enter text to translate...")
-         output_text = gr.Textbox(lines=5, label="Translation")
-
-     # Add translate button
-     translate_btn = gr.Button("Translate")
-     translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
-
-     # Optimization options
-     with gr.Accordion("Performance Tips", open=True):
-         gr.Markdown("""
-         ### Speed Optimization Tips
-         The model has been quantized to int4 for faster CPU execution
-         ✅ Common phrases are pre-cached for instant results
-         ✅ Long text is automatically split into smaller chunks
-         ✅ First translation will be slower as the model warms up
-         ✅ Short sentences (< 50 chars) translate much faster
-         """)
-
-     # Add examples with preloaded common phrases
-     gr.Examples(
-         examples=[
-             ["English to Spanish", "Hello, how are you today?"],
-             ["Spanish to English", "Hola, ¿cómo estás hoy?"],
-             ["English to Korean", "The weather is nice today."],
-             ["Korean to English", "안녕하세요, 만나서 반갑습니다."]
-         ],
-         inputs=[direction, input_text],
-         fn=translate,
-         outputs=output_text
      )

- # Launch with optimized settings
- if __name__ == "__main__":
-     iface.launch(
-         debug=False,
-         show_error=True,
-         share=False,
-         quiet=True,
-         server_name="0.0.0.0",
-         server_port=7860
-     )
  from huggingface_hub import hf_hub_download
  from llama_cpp import Llama
+ import gradio as gr

+ # Download the base model
+ base_model_repo = "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF"
+ base_model_file = "articulate-11-expspanish-base-merged-q8_0.gguf"
+ base_model_path = hf_hub_download(repo_id=base_model_repo, filename=base_model_file)
+
+ # Download the LoRA adapter
+ adapter_repo = "johnpaulbin/articulate-V1-Q8_0-GGUF"
+ adapter_file = "articulate-V1-q8_0.gguf"
+ adapter_path = hf_hub_download(repo_id=adapter_repo, filename=adapter_file)
+
+ # Initialize the Llama model with base model and adapter
+ llm = Llama(
+     model_path=base_model_path,
+     lora_path=adapter_path,
+     n_ctx=512,  # Context length, set manually since adapter lacks it
+     n_threads=2,  # Adjust based on your system
+     n_gpu_layers=0  # Set to >0 if GPU acceleration is desired and supported
+ )

+ # Define the translation function
+ def translate(direction, text):
+     # Determine source and target languages based on direction
+     if direction == "English to Spanish":
+         source_lang = "ENGLISH"
+         target_lang = "SPANISH"
+     elif direction == "Spanish to English":
+         source_lang = "SPANISH"
+         target_lang = "ENGLISH"
+     elif direction == "Korean to English":
+         source_lang = "KOREAN"
+         target_lang = "ENGLISH"
+     elif direction == "English to Korean":
+         source_lang = "ENGLISH"
+         target_lang = "KOREAN"
+     else:
+         return "Invalid direction"
+
+     # Construct the prompt for raw completion
+     prompt = f"[{source_lang}]{text}[{target_lang}]"
+
+     # Generate completion with deterministic settings (greedy decoding)
+     response = llm.create_completion(
+         prompt,
+         max_tokens=200,  # Limit output length
+         temperature=0,  # Greedy decoding
+         top_k=1  # Select the most probable token
      )
+
+     # Extract and return the generated text
+     return response['choices'][0]['text'].strip()
+
+ # Define the Gradio interface
+ direction_options = ["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"]
+ iface = gr.Interface(
+     fn=translate,
+     inputs=[
+         gr.Dropdown(choices=direction_options, label="Translation Direction"),
+         gr.Textbox(lines=5, label="Input Text")
+     ],
+     outputs=gr.Textbox(lines=5, label="Translation"),
+     title="Translation App",
+     description="Translate text between English and Spanish using the Articulate V1 model."
+ )

+ # Launch the app
+ iface.launch(debug=True)
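
For reference, the prompt format the simplified app relies on can be exercised outside Gradio with a short standalone script. This is a minimal sketch, not part of the commit: it assumes llama-cpp-python and huggingface_hub are installed and that the same GGUF files referenced above download successfully; the example sentence is illustrative only.

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Fetch the same base model and LoRA adapter that app.py downloads
base_model_path = hf_hub_download(
    repo_id="johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
    filename="articulate-11-expspanish-base-merged-q8_0.gguf",
)
adapter_path = hf_hub_download(
    repo_id="johnpaulbin/articulate-V1-Q8_0-GGUF",
    filename="articulate-V1-q8_0.gguf",
)

# Same settings as the committed app: CPU-only, 512-token context
llm = Llama(model_path=base_model_path, lora_path=adapter_path,
            n_ctx=512, n_threads=2, n_gpu_layers=0)

# The model completes "[SOURCE]text[TARGET]" prompts; temperature=0 with top_k=1 keeps output deterministic
response = llm.create_completion(
    "[ENGLISH]Good morning, how are you?[SPANISH]",
    max_tokens=200, temperature=0, top_k=1,
)
print(response["choices"][0]["text"].strip())

The committed app itself wraps this call in the translate() function and serves it through gr.Interface via iface.launch(debug=True).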