import re import pandas as pd import spacy from langdetect import detect_langs from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.decomposition import LatentDirichletAllocation from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS from spacy.lang.fr.stop_words import STOP_WORDS as FRENCH_STOP_WORDS from sklearn.cluster import KMeans from sklearn.manifold import TSNE import numpy as np import torch from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig import streamlit as st from datetime import datetime # Lighter model MODEL ="cardiffnlp/twitter-xlm-roberta-base-sentiment" # Cache model loading with fallback for quantization @st.cache_resource def load_model(): device = "cuda" if torch.cuda.is_available() else "cpu" print(f"Using device: {device}") tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True) model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device) # Attempt quantization with fallback try: # Set quantization engine explicitly (fbgemm for x86, qnnpack for ARM) torch.backends.quantized.engine = 'fbgemm' if torch.cuda.is_available() else 'qnnpack' model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8) print("Model quantized successfully.") except RuntimeError as e: print(f"Quantization failed: {e}. Using non-quantized model.") config = AutoConfig.from_pretrained(MODEL) return tokenizer, model, config, device tokenizer, model, config, device = load_model() nlp_fr = spacy.load("fr_core_news_sm") nlp_en = spacy.load("en_core_web_sm") custom_stop_words = list(ENGLISH_STOP_WORDS.union(FRENCH_STOP_WORDS)) def preprocess(text): if text is None: return "" if not isinstance(text, str): try: text = str(text) except: return "" new_text = [] for t in text.split(" "): t = '@user' if t.startswith('@') and len(t) > 1 else t t = 'http' if t.startswith('http') else t new_text.append(t) return " ".join(new_text) def clean_message(text): if not isinstance(text, str): return "" text = text.lower() text = text.replace("", "").replace("this message was deleted", "").replace("null", "") text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE) text = re.sub(r"[^a-zA-ZÀ-ÿ0-9\s]", "", text) return text.strip() def lemmatize_text(text, lang): if lang == 'fr': doc = nlp_fr(text) else: doc = nlp_en(text) return " ".join([token.lemma_ for token in doc if not token.is_punct]) def preprocess(data): pattern = r"^(?P\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P