Nitin00043 committed
Commit 191e2cd · verified · 1 Parent(s): 6fe9380

Update app.py

Files changed (1)
  1. app.py +51 -35
app.py CHANGED
@@ -2,14 +2,27 @@ from transformers import pipeline, AutoModelForSequenceClassification, AutoToken
 import gradio as gr
 import torch
 from concurrent.futures import ThreadPoolExecutor
+from threading import Lock
 
-# Load models with quantization (8-bit) for faster inference
+# Global cache settings and lock for thread-safety
+CACHE_SIZE = 100
+prediction_cache = {}
+cache_lock = Lock()
+
+# Function to load models with 8-bit quantization
 def load_quantized_model(model_name):
-    model = AutoModelForSequenceClassification.from_pretrained(model_name, load_in_8bit=True)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    return pipeline("text-classification", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
+    try:
+        model = AutoModelForSequenceClassification.from_pretrained(model_name, load_in_8bit=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        device = 0 if torch.cuda.is_available() else -1
+        pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)
+        print(f"Loaded model: {model_name}")
+        return pipe
+    except Exception as e:
+        print(f"Error loading model '{model_name}': {e}")
+        raise e
 
-# Load models in parallel during startup
+# Load both models concurrently at startup
 with ThreadPoolExecutor() as executor:
     sentiment_future = executor.submit(load_quantized_model, "cardiffnlp/twitter-roberta-base-sentiment")
     emotion_future = executor.submit(load_quantized_model, "bhadresh-savani/bert-base-uncased-emotion")
@@ -17,43 +30,46 @@ with ThreadPoolExecutor() as executor:
 sentiment_pipeline = sentiment_future.result()
 emotion_pipeline = emotion_future.result()
 
-# Cache recent predictions to avoid recomputation
-CACHE_SIZE = 100
-prediction_cache = {}
-
 def analyze_text(text):
-    # Check cache first
-    if text in prediction_cache:
-        return prediction_cache[text]
+    # Check cache first (using lock for thread-safety)
+    with cache_lock:
+        if text in prediction_cache:
+            return prediction_cache[text]
 
-    # Parallel model execution
-    with ThreadPoolExecutor() as executor:
-        sentiment_future = executor.submit(sentiment_pipeline, text)
-        emotion_future = executor.submit(emotion_pipeline, text)
+    try:
+        # Execute both model inferences in parallel
+        with ThreadPoolExecutor() as executor:
+            sentiment_future = executor.submit(sentiment_pipeline, text)
+            emotion_future = executor.submit(emotion_pipeline, text)
+
+        sentiment_result = sentiment_future.result()[0]
+        emotion_result = emotion_future.result()[0]
 
-    sentiment_result = sentiment_future.result()[0]
-    emotion_result = emotion_future.result()[0]
-
-    # Format response
-    result = {
-        "Sentiment": {sentiment_result['label']: round(sentiment_result['score'], 4)},
-        "Emotion": {emotion_result['label']: round(emotion_result['score'], 4)}
-    }
+        # Prepare a clear, rounded output
+        result = {
+            "Sentiment": {sentiment_result['label']: round(sentiment_result['score'], 4)},
+            "Emotion": {emotion_result['label']: round(emotion_result['score'], 4)}
+        }
+    except Exception as e:
+        result = {"error": str(e)}
 
-    # Update cache
-    if len(prediction_cache) >= CACHE_SIZE:
-        prediction_cache.pop(next(iter(prediction_cache)))
-    prediction_cache[text] = result
+    # Update cache with lock protection
+    with cache_lock:
+        if len(prediction_cache) >= CACHE_SIZE:
+            prediction_cache.pop(next(iter(prediction_cache)))
+        prediction_cache[text] = result
 
     return result
 
-# Optimized Gradio interface with batch processing
-demo = gr.Interface(
+# Gradio interface: using gr.JSON to display structured output
+
+
+demo = gr.Interface(
     fn=analyze_text,
     inputs=gr.Textbox(placeholder="Enter your text here...", label="Input Text"),
-    outputs=gr.Label(label="Analysis Results"),
+    outputs=gr.JSON(label="Analysis Results"),
     title="🚀 Fast Sentiment & Emotion Analysis",
-    description="Optimized version using quantized models and parallel processing",
+    description="An optimized application using 8-bit quantized models and parallel processing for fast inference.",
     examples=[
         ["I'm thrilled to start this new adventure!"],
         ["This situation is making me really frustrated."],
@@ -63,8 +79,8 @@ demo = gr.Interface(
     allow_flagging="never"
 )
 
-# Warm up models with sample input
-analyze_text("Warming up models...")
+# Warm up the models with a sample input to reduce first-call latency
+_ = analyze_text("Warming up models...")
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
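
For reference, a minimal sketch of how the updated analyze_text is expected to behave when called directly. This snippet is not part of the commit; the exact label strings and scores depend on each model's configuration and are shown only as illustrative placeholders.

# Hypothetical interactive check after both pipelines have loaded.
# A repeated call with the same text should be served from prediction_cache
# without re-running either pipeline.
result = analyze_text("I'm thrilled to start this new adventure!")
print(result)
# Expected shape (values illustrative):
# {"Sentiment": {"<sentiment_label>": 0.98}, "Emotion": {"<emotion_label>": 0.99}}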