# Source: Hugging Face Spaces - thealper2/aspect-sentiment-pipeline (status: Sleeping)
# File size: 7,893 Bytes

from fastapi.middleware.cors import CORSMiddleware
import json
import torch
import nltk
nltk.download("punkt")
from transformers import ConvBertTokenizerFast, ConvBertForTokenClassification, BertTokenizer, BertForTokenClassification, BertForSequenceClassification, Pipeline
from nltk import sent_tokenize
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel, Field
import re
import emoji

# Stop words read from 'stop-words.tr.txt', one entry per line, each stripped of
# surrounding whitespace. The context manager closes the file handle as soon as
# the list is built instead of leaving it open until garbage collection.
with open('stop-words.tr.txt', 'r', encoding="UTF8") as _stop_words_file:
    stop_words = [x.strip() for x in _stop_words_file.read().split('\n')]

def preprocess_text(text):
    """Clean a raw text string before it is handed to the models.

    Steps, in order: remove URLs, hashtag words, e-mail addresses, HTML-like
    tags and digits; replace newlines and dots with spaces; drop every
    character that is not a word character, whitespace, ``+``, ``-`` or ``_``;
    remove emoji; remove words found in the module-level ``stop_words``;
    detach a question suffix at the end of the text; remove the standalone
    words "de"/"da"; and finally drop single-character tokens.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string, with the remaining tokens joined by single spaces.
    """
    # A single pattern removes every "http..." / "https://..." run; links that
    # start with "www" and carry no scheme are removed separately.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    # Drop hashtag words entirely (split() never yields empty strings).
    text = ' '.join(word for word in text.split() if not word.startswith("#"))
    # E-mail addresses. The local-part character class needs its opening "[",
    # otherwise the pattern is read as literal text and never matches.
    text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = text.replace("\n", " ")
    # A dot, together with any run of "x" characters directly after it, becomes a space.
    text = re.sub(r"\.x*", " ", text)
    text = re.sub(r'[^\w\s\+\-_]', '', text)
    text = emoji.replace_emoji(text)
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    # Insert a space before a question suffix.
    # NOTE(review): "$" anchors to the end of the whole text, so only a suffix
    # on the final word is detached - confirm whether per-word handling was intended.
    text = re.sub(r'(mısın|misin|musun|müsün)$', r' \1', text)
    text = re.sub(r'\b(de|da)\b\s*', '', text)
    text = ' '.join([t for t in text.split() if len(t) > 1])
    return text

class AspectSentimentPipeline(Pipeline):
    """Two-stage pipeline: extract aspect terms, then classify each aspect's sentiment.

    The input text is split into sentences; every sentence is cleaned with
    ``preprocess_text``, run through the token-classification model to find
    aspect terms, and each (aspect, sentence) pair is then scored by the
    sequence-classification model.
    """

    # Labels emitted by the token-classification head, by class index.
    _TAG_BY_ID = {0: 'O', 1: 'B-A', 2: 'I-A'}

    # Tokens added by the tokenizer that never belong to an aspect.
    _SPECIAL_TOKENS = ("[PAD]", "[CLS]", "[SEP]")

    # Maps the sentiment model's config labels to the Turkish labels returned
    # to callers ("olumsuz" = negative, "nötr" = neutral, "olumlu" = positive).
    _SENTIMENT_LABELS = {
        "LABEL_0": "olumsuz",
        "LABEL_1": "nötr",
        "LABEL_2": "olumlu"
    }

    def __init__(self, aspect_extraction_model, aspect_extraction_tokenizer, aspect_sentiment_model, aspect_sentiment_tokenizer, device):
        """Store both model/tokenizer pairs and the target device.

        The base ``Pipeline`` is initialised with the aspect-extraction model
        and tokenizer; the sentiment pair is kept only on this instance.

        Args:
            aspect_extraction_model: Token-classification model used to tag aspects.
            aspect_extraction_tokenizer: Tokenizer paired with the extraction model.
            aspect_sentiment_model: Sequence-classification model used for sentiment.
            aspect_sentiment_tokenizer: Tokenizer paired with the sentiment model.
            device: Device that models and inputs are moved to at inference time.
        """
        super().__init__(aspect_extraction_model, aspect_extraction_tokenizer)
        self.aspect_extraction_model = aspect_extraction_model
        self.aspect_extraction_tokenizer = aspect_extraction_tokenizer
        self.aspect_sentiment_model = aspect_sentiment_model
        self.aspect_sentiment_tokenizer = aspect_sentiment_tokenizer
        self.device = device

    def _sanitize_parameters(self, **kwargs):
        """Return empty preprocess/forward/postprocess kwargs; no call-time options are used."""
        return {}, {}, {}

    def preprocess(self, inputs):
        """Split the raw input text into a list of sentences."""
        return sent_tokenize(inputs)

    def _forward(self, sentences):
        """Extract aspects from every sentence and classify each aspect's sentiment.

        Args:
            sentences: List of sentence strings produced by ``preprocess``.

        Returns:
            A dict with ``"entity_list"`` (all aspects, in order of appearance)
            and ``"results"`` (one ``{"aspect", "sentiment"}`` dict per aspect).
        """
        main_results = []
        main_aspects = []
        for sentence in sentences:
            sentence = preprocess_text(sentence)
            if not sentence:
                # Nothing survived cleaning, so there is nothing to tag.
                continue
            aspects = self.extract_aspects(sentence, self.aspect_extraction_model, self.aspect_extraction_tokenizer, self.device)
            for aspect in aspects:
                main_aspects.append(aspect)
                sentiment = self.predict_sentiment(sentence, aspect)
                main_results.append({"aspect": aspect, "sentiment": sentiment})

        return {"entity_list": main_aspects, "results": main_results}

    def postprocess(self, model_outputs):
        """Return the forward output unchanged."""
        return model_outputs

    def predict_sentiment(self, sentence, aspect):
        """Classify the sentiment expressed towards ``aspect`` within ``sentence``.

        The aspect and the sentence are encoded together as a text pair.

        Args:
            sentence: The (already cleaned) sentence containing the aspect.
            aspect: The aspect term to score.

        Returns:
            One of ``"olumsuz"``, ``"nötr"`` or ``"olumlu"``.

        Raises:
            KeyError: If the model's ``id2label`` entry for the predicted class
                is not one of ``LABEL_0``/``LABEL_1``/``LABEL_2``.
        """
        inputs = self.aspect_sentiment_tokenizer(aspect, sentence, return_tensors="pt").to(self.device)
        self.aspect_sentiment_model.to(self.device)
        self.aspect_sentiment_model.eval()

        with torch.no_grad():
            outputs = self.aspect_sentiment_model(**inputs)
            logits = outputs.logits

        sentiment = torch.argmax(logits, dim=-1).item()
        sentiment_label = self.aspect_sentiment_model.config.id2label[sentiment]
        return self._SENTIMENT_LABELS[sentiment_label]

    def align_word_predictions(self, tokens, predictions):
        """Merge "##" continuation pieces back into whole words.

        Each merged word keeps the prediction of its first piece; predictions
        attached to continuation pieces are discarded.

        Args:
            tokens: Token strings, special tokens already removed.
            predictions: One label per entry in ``tokens``.

        Returns:
            A ``(aligned_tokens, aligned_predictions)`` tuple of equal-length lists.
        """
        aligned_tokens = []
        aligned_predictions = []
        for token, prediction in zip(tokens, predictions):
            is_continuation = token.startswith("##")
            if is_continuation and aligned_tokens:
                aligned_tokens[-1] = aligned_tokens[-1] + token[2:]
            else:
                # A continuation piece with nothing before it has no word to
                # attach to; start a new word from it instead of indexing an
                # empty list.
                aligned_tokens.append(token[2:] if is_continuation else token)
                aligned_predictions.append(prediction)
        return aligned_tokens, aligned_predictions

    @staticmethod
    def _collect_aspects(tokens, predictions):
        """Group word-level tags into aspect strings."""
        aspects = []
        current_aspect = []
        for token, prediction in zip(tokens, predictions):
            if prediction == "B-A":
                current_aspect.append(token)
                if len(current_aspect) > 1:
                    # A "B-A" arriving while an aspect is already open is
                    # appended to it and closes it; when that token is a
                    # single character the parts are joined without spaces.
                    separator = "" if len(token) == 1 else " "
                    aspects.append(separator.join(current_aspect))
                    current_aspect = []
            elif prediction == "I-A":
                current_aspect.append(token)
            elif prediction == "O":
                if current_aspect:
                    aspects.append(" ".join(current_aspect))
                    current_aspect = []
        # An aspect still open after the last token must be emitted too;
        # otherwise aspects that end the sentence are silently lost.
        if current_aspect:
            aspects.append(" ".join(current_aspect))
        return aspects

    def extract_aspects(self, review, aspect_extraction_model, aspect_extraction_tokenizer, device):
        """Tag ``review`` with the extraction model and return its aspect terms.

        The text is tokenized to a fixed length of 64 (padded or truncated),
        every token is labelled ``O``/``B-A``/``I-A``, special tokens are
        dropped, word pieces are merged back into words, and tagged words are
        grouped into aspect strings.

        Args:
            review: Text to analyse.
            aspect_extraction_model: Unused; the instance's own model is used.
                Kept so existing callers keep working.
            aspect_extraction_tokenizer: Unused; the instance's own tokenizer
                is used. Kept so existing callers keep working.
            device: Device to run the model on.

        Returns:
            List of aspect strings in order of appearance.
        """
        inputs = self.aspect_extraction_tokenizer(review, return_offsets_mapping=True, padding='max_length', truncation=True, max_length=64, return_tensors="pt").to(device)
        self.aspect_extraction_model.to(device)
        self.aspect_extraction_model.eval()
        ids = inputs["input_ids"].to(device)
        mask = inputs["attention_mask"].to(device)

        with torch.no_grad():
            outputs = self.aspect_extraction_model(ids, attention_mask=mask)
            logits = outputs[0]

        # Flatten to (positions, num_labels) and take the best label per position.
        active_logits = logits.view(-1, self.aspect_extraction_model.num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)

        tokens = self.aspect_extraction_tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
        token_predictions = [self._TAG_BY_ID[i] for i in flattened_predictions.tolist()]

        # Drop padding and sentence markers, keeping tokens and labels in step.
        kept_pairs = [
            (token, pred)
            for token, pred in zip(tokens, token_predictions)
            if token not in self._SPECIAL_TOKENS
        ]
        filtered_tokens = [token for token, _ in kept_pairs]
        filtered_predictions = [pred for _, pred in kept_pairs]

        aligned_tokens, aligned_predictions = self.align_word_predictions(filtered_tokens, filtered_predictions)

        return self._collect_aspects(aligned_tokens, aligned_predictions)

# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Aspect extraction: ConvBERT token classifier and its fast tokenizer.
aspect_extraction_tokenizer = ConvBertTokenizerFast.from_pretrained("thealper2/aspect-extraction-tokenizer")
aspect_extraction_model = ConvBertForTokenClassification.from_pretrained("thealper2/aspect-extraction-model")

# Aspect sentiment: BERT sequence classifier and its tokenizer.
aspect_sentiment_tokenizer = BertTokenizer.from_pretrained("thealper2/aspect-sentiment-tokenizer")
aspect_sentiment_model = BertForSequenceClassification.from_pretrained("thealper2/aspect-sentiment-model")

# Single pipeline instance, built once at import time and used by the
# /predict/ handler below.
pipeline = AspectSentimentPipeline(
    device=device,
    aspect_extraction_model=aspect_extraction_model,
    aspect_extraction_tokenizer=aspect_extraction_tokenizer,
    aspect_sentiment_model=aspect_sentiment_model,
    aspect_sentiment_tokenizer=aspect_sentiment_tokenizer,
)

# Application object that the route decorators below register on.
app = FastAPI()

class Item(BaseModel):
    # Request body for the /predict/ endpoint.
    # `text` is required (the Ellipsis default) and carries the raw text to
    # analyse; `example` supplies a sample value for the field.
    text: str = Field(
        ...,
        example="""Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır @Twitch @Kick_Turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz.  Başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim ? @Turkcell """,
    )

@app.get("/", tags=["Home"])
def api_home():
    # Root endpoint: always answers with the same fixed greeting payload.
    greeting = {"detail": "Welcome to FastAPI!"}
    return greeting

@app.post("/predict/", response_model=dict)
async def predict(item: Item):
    # Feed the submitted text through the module-level pipeline and return its
    # output unchanged as the response body.
    # NOTE(review): the pipeline call is synchronous and is not awaited, so it
    # runs inline in this async handler - confirm that is acceptable under load.
    return pipeline(item.text)


if __name__ == "__main__":
    # When executed as a script, serve the app with uvicorn on all
    # interfaces at port 7860.
    uvicorn.run(app, port=7860, host="0.0.0.0")