import re

import emoji
import nltk
import torch
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from nltk import sent_tokenize
from pydantic import BaseModel, Field
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    ConvBertForTokenClassification,
    ConvBertTokenizerFast,
    Pipeline,
)

# sent_tokenize needs the "punkt" sentence tokenizer data.
nltk.download("punkt")

# Turkish stop-word list, one word per line.
stop_words = [x.strip() for x in open("stop-words.tr.txt", "r", encoding="UTF8").read().split("\n")]
def preprocess_text(text):
    # Strip URLs (http/https and bare www links).
    text = re.sub(r"http\S+|www\S+", "", text)
    # Drop hashtag words entirely.
    text = " ".join(word for word in text.split() if not word.startswith("#"))
    # Strip e-mail addresses.
    text = re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "", text)
    # Strip HTML tags, digits and newlines.
    text = re.sub(r"<[^>]+>", "", text)
    text = re.sub(r"[0-9]+", "", text)
    text = re.sub(r"\n", " ", text)
    # Replace periods (and any "x" characters that follow them) with spaces.
    text = re.sub(r"\.x*", " ", text)
    # Remove punctuation, keeping word characters, whitespace, "+", "-" and "_".
    text = re.sub(r"[^\w\s\+\-_]", "", text)
    # Remove emojis.
    text = emoji.replace_emoji(text)
    # Remove Turkish stop words.
    text = " ".join(word for word in text.split() if word.lower() not in stop_words)
    # Detach the Turkish question suffixes ("mısın/misin/musun/müsün") from the word they follow.
    text = " ".join(re.sub(r"(mısın|misin|musun|müsün)$", r" \1", word) for word in text.split())
    # Drop the standalone clitics "de"/"da" and any single-character tokens.
    text = re.sub(r"\b(de|da)\b\s*", "", text)
    text = " ".join(t for t in text.split() if len(t) > 1)
    return text
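# Illustrative behavior (the exact output depends on the stop-word list):
#   preprocess_text("Hat çekmiyor!!! https://t.co/abc #şikayet 2023")
#   would come out roughly as "Hat çekmiyor" once the URL, hashtag,
#   digits and punctuation have been stripped.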
class AspectSentimentPipeline(Pipeline):
    """Two-stage pipeline: extract aspect terms from a review, then classify
    the sentiment expressed towards each aspect."""

    def __init__(self, aspect_extraction_model, aspect_extraction_tokenizer,
                 aspect_sentiment_model, aspect_sentiment_tokenizer, device):
        super().__init__(aspect_extraction_model, aspect_extraction_tokenizer)
self.aspect_extraction_model = aspect_extraction_model
self.aspect_extraction_tokenizer = aspect_extraction_tokenizer
self.aspect_sentiment_model = aspect_sentiment_model
self.aspect_sentiment_tokenizer = aspect_sentiment_tokenizer
self.device = device
    def _sanitize_parameters(self, **kwargs):
        # No runtime parameters are forwarded to preprocess/_forward/postprocess.
        return {}, {}, {}

    def preprocess(self, inputs):
        # Split the input text into sentences; each sentence is processed on its own.
        return sent_tokenize(inputs)
    def _forward(self, sentences):
        # For every sentence: clean it, extract aspect terms, then classify
        # the sentiment expressed towards each extracted aspect.
        main_results = []
        main_aspects = []
        for sentence in sentences:
            sentence = preprocess_text(sentence)
            aspects = self.extract_aspects(sentence)
            for aspect in aspects:
                main_aspects.append(aspect)
                sentiment = self.predict_sentiment(sentence, aspect)
                main_results.append({"aspect": aspect, "sentiment": sentiment})
        return {"entity_list": main_aspects, "results": main_results}
def postprocess(self, model_outputs):
return model_outputs
    def predict_sentiment(self, sentence, aspect):
        # Sentence-pair classification: the aspect term is paired with its sentence.
        inputs = self.aspect_sentiment_tokenizer(aspect, sentence, return_tensors="pt").to(self.device)
        self.aspect_sentiment_model.to(self.device)
        self.aspect_sentiment_model.eval()
        with torch.no_grad():
            outputs = self.aspect_sentiment_model(**inputs)
        logits = outputs.logits
        sentiment = torch.argmax(logits, dim=-1).item()
        sentiment_label = self.aspect_sentiment_model.config.id2label[sentiment]
        # Map the generic config labels to Turkish sentiment classes:
        # olumsuz = negative, nötr = neutral, olumlu = positive.
        sentiment_id_to_label = {
            "LABEL_0": "olumsuz",
            "LABEL_1": "nötr",
            "LABEL_2": "olumlu",
        }
        return sentiment_id_to_label[sentiment_label]
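    # Illustrative call with hypothetical values; the actual label depends
    # on the fine-tuned model:
    #   predict_sentiment("internet hızı çok yavaş", "internet hızı")
    #   might return "olumsuz" (negative).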
    def align_word_predictions(self, tokens, predictions):
        # Merge WordPiece continuations ("##...") back into whole words,
        # keeping the prediction of each word's first sub-token.
        aligned_tokens = []
        aligned_predictions = []
        for token, prediction in zip(tokens, predictions):
            if not token.startswith("##"):
                aligned_tokens.append(token)
                aligned_predictions.append(prediction)
            else:
                aligned_tokens[-1] = aligned_tokens[-1] + token[2:]
        return aligned_tokens, aligned_predictions
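    # Example of the alignment above (WordPiece "##" continuations):
    #   tokens      = ["müşteri", "hizmet", "##leri"]
    #   predictions = ["B-A",     "I-A",    "I-A"]
    #   -> (["müşteri", "hizmetleri"], ["B-A", "I-A"])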
    def extract_aspects(self, review):
        inputs = self.aspect_extraction_tokenizer(review, return_offsets_mapping=True, padding="max_length",
                                                  truncation=True, max_length=64, return_tensors="pt").to(self.device)
        self.aspect_extraction_model.to(self.device)
        self.aspect_extraction_model.eval()
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        with torch.no_grad():
            outputs = self.aspect_extraction_model(ids, attention_mask=mask)
        logits = outputs[0]
        # Token-level argmax over the BIO label set.
        active_logits = logits.view(-1, self.aspect_extraction_model.num_labels)
        flattened_predictions = torch.argmax(active_logits, dim=1)
        tokens = self.aspect_extraction_tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
        ids_to_labels = {0: "O", 1: "B-A", 2: "I-A"}
        token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
        # Drop special tokens before aligning sub-tokens back to words.
        special_tokens = {"[PAD]", "[CLS]", "[SEP]"}
        filtered_tokens = [token for token in tokens if token not in special_tokens]
        filtered_predictions = [pred for token, pred in zip(tokens, token_predictions) if token not in special_tokens]
        aligned_tokens, aligned_predictions = self.align_word_predictions(filtered_tokens, filtered_predictions)
        # Decode BIO labels into aspect phrases.
        aspects = []
        current_aspect = []
        for token, prediction in zip(aligned_tokens, aligned_predictions):
            if prediction == "B-A":
                # A new aspect begins: flush any open span first.
                if current_aspect:
                    aspects.append(" ".join(current_aspect))
                current_aspect = [token]
            elif prediction == "I-A":
                current_aspect.append(token)
            else:  # "O"
                if current_aspect:
                    aspects.append(" ".join(current_aspect))
                    current_aspect = []
        # Flush an aspect that extends to the end of the sentence.
        if current_aspect:
            aspects.append(" ".join(current_aspect))
        return aspects
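    # Example of the BIO decoding above:
    #   aligned tokens/labels: internet/B-A  hızı/I-A  yavaş/O
    #   -> aspects == ["internet hızı"]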
# Select GPU when available; all models and inputs are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Aspect extraction: ConvBERT token classifier producing BIO tags.
aspect_extraction_model = ConvBertForTokenClassification.from_pretrained("thealper2/aspect-extraction-model")
aspect_extraction_tokenizer = ConvBertTokenizerFast.from_pretrained("thealper2/aspect-extraction-tokenizer")
# Aspect sentiment: BERT sequence classifier over (aspect, sentence) pairs.
aspect_sentiment_model = BertForSequenceClassification.from_pretrained("thealper2/aspect-sentiment-model")
aspect_sentiment_tokenizer = BertTokenizer.from_pretrained("thealper2/aspect-sentiment-tokenizer")
pipeline = AspectSentimentPipeline(
aspect_extraction_model=aspect_extraction_model,
aspect_extraction_tokenizer=aspect_extraction_tokenizer,
aspect_sentiment_model=aspect_sentiment_model,
aspect_sentiment_tokenizer=aspect_sentiment_tokenizer,
device=device
)
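# Illustrative direct use of the pipeline, bypassing the HTTP layer
# (the returned labels depend entirely on the fine-tuned models):
#   print(pipeline("İnternet hızı çok yavaş ama müşteri hizmetleri iyiydi."))
#   # -> {"entity_list": [...], "results": [{"aspect": ..., "sentiment": ...}]}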
app = FastAPI()
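# Wire up the CORS middleware imported above (assumed intent, since the
# import is otherwise unused); permissive settings shown as a sketch.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)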
class Item(BaseModel):
text: str = Field(..., example="""Fiber 100mb SuperOnline kullanıcısıyım yaklaşık 2 haftadır @Twitch @Kick_Turkey gibi canlı yayın platformlarında 360p yayın izlerken donmalar yaşıyoruz. Başka hiç bir operatörler bu sorunu yaşamazken ben parasını verip alamadığım hizmeti neden ödeyeyim ? @Turkcell """)
@app.get("/", tags=["Home"])
def api_home():
return {"detail": "Welcome to FastAPI!"}
@app.post("/predict/", response_model=dict)
async def predict(item: Item):
result = pipeline(item.text)
return result
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
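# Example request once the server is running (illustrative):
#   curl -X POST http://localhost:7860/predict/ \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Hat çekmiyor, internet hızı çok yavaş."}'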