from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import gradio as gr
import torch
from concurrent.futures import ThreadPoolExecutor
from threading import Lock

# Global cache settings and lock for thread-safety
CACHE_SIZE = 100
prediction_cache = {}
cache_lock = Lock()

# Load a model with 8-bit quantization, falling back to full precision if that fails
def load_quantized_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    try:
        # 8-bit loading requires bitsandbytes and a CUDA GPU; accelerate places the
        # quantized weights itself, so no `device` argument is passed to the pipeline.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, load_in_8bit=True)
        pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
    except Exception as e:
        print(f"8-bit loading failed for '{model_name}' ({e}); falling back to full precision.")
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        device = 0 if torch.cuda.is_available() else -1
        pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)
    print(f"Loaded model: {model_name}")
    return pipe

# Load both models concurrently at startup
with ThreadPoolExecutor() as executor:
    sentiment_future = executor.submit(load_quantized_model, "cardiffnlp/twitter-roberta-base-sentiment")
    emotion_future = executor.submit(load_quantized_model, "bhadresh-savani/bert-base-uncased-emotion")

sentiment_pipeline = sentiment_future.result()
emotion_pipeline = emotion_future.result()
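
# Both pipelines are shared by Gradio's worker threads, which is why access to the
# prediction cache above is guarded by cache_lock.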

def analyze_text(text):
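    """Classify the sentiment and emotion of `text`, caching results for repeated inputs."""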
    # Check cache first (using lock for thread-safety)
    with cache_lock:
        if text in prediction_cache:
            return prediction_cache[text]
    
    try:
        # Execute both model inferences in parallel
        with ThreadPoolExecutor() as executor:
            sentiment_future = executor.submit(sentiment_pipeline, text)
            emotion_future = executor.submit(emotion_pipeline, text)
            
            sentiment_result = sentiment_future.result()[0]
            emotion_result = emotion_future.result()[0]
        
        # Prepare a clear, rounded output
        result = {
            "Sentiment": {sentiment_result['label']: round(sentiment_result['score'], 4)},
            "Emotion": {emotion_result['label']: round(emotion_result['score'], 4)}
        }
    except Exception as e:
        result = {"error": str(e)}
    
    # Update cache with lock protection; failed results are not cached so that a
    # transient error does not stick to an input until it is evicted
    if "error" not in result:
        with cache_lock:
            if len(prediction_cache) >= CACHE_SIZE:
                # FIFO eviction: dicts preserve insertion order, so this drops the oldest entry
                prediction_cache.pop(next(iter(prediction_cache)))
            prediction_cache[text] = result
    
    return result
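
# Example of calling the analysis function directly (illustrative output shape only;
# the actual labels and scores depend on the models):
#   analyze_text("I'm thrilled to start this new adventure!")
#   -> {"Sentiment": {"LABEL_2": 0.98}, "Emotion": {"joy": 0.99}}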

# Gradio interface: using gr.JSON to display structured output
demo = gr.Interface(
    fn=analyze_text,
    inputs=gr.Textbox(placeholder="Enter your text here...", label="Input Text"),
    outputs=gr.JSON(label="Analysis Results"),
    title="🚀 Fast Sentiment & Emotion Analysis",
    description="An optimized application using 8-bit quantized models and parallel processing for fast inference.",
    examples=[
        ["I'm thrilled to start this new adventure!"],
        ["This situation is making me really frustrated."],
        ["I feel so heartbroken and lost."]
    ],
    theme="soft",
    allow_flagging="never"
)

# Warm up the models with a sample input to reduce first-call latency
_ = analyze_text("Warming up models...")

if __name__ == "__main__":
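    # Gradio serves the app locally by default (typically http://127.0.0.1:7860);
    # pass share=True to launch() for a temporary public link.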
    demo.launch()