Ozgur Unlu committed
Commit · 4ce18b0 · 1 Parent(s): 16b95b0
fixes
Changed files: news_checker.py (+19 -8)
news_checker.py
CHANGED
@@ -3,10 +3,10 @@ from newsapi import NewsApiClient
 from dotenv import load_dotenv
 import pandas as pd
 from datetime import datetime, timedelta
-from transformers import pipeline
-
-from sklearn.metrics.pairwise import cosine_similarity
+from transformers import pipeline, AutoTokenizer, AutoModel
+import torch
 import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
 
 load_dotenv()
 
@@ -23,15 +23,26 @@ class NewsChecker:
             # Initialize sentiment analyzer
             self.sentiment_analyzer = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')
             # Initialize semantic similarity model
-            self.
+            self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
+            self.model = AutoModel.from_pretrained('distilbert-base-uncased')
             print("Models initialized successfully")
         except Exception as e:
             print(f"Error initializing clients: {str(e)}")
 
     def get_embedding(self, text):
-        """Get embedding for a text using
+        """Get embedding for a text using DistilBERT"""
         try:
-
+            # Tokenize and encode the text
+            inputs = self.tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
+
+            # Get model outputs
+            with torch.no_grad():
+                outputs = self.model(**inputs)
+
+            # Use the mean of the last hidden state as the sentence embedding
+            embeddings = outputs.last_hidden_state.mean(dim=1)
+
+            return embeddings
         except Exception as e:
             print(f"Error getting embedding: {str(e)}")
             return None
@@ -40,8 +51,8 @@ class NewsChecker:
         """Calculate cosine similarity between two embeddings"""
         try:
             # Convert tensors to numpy arrays and reshape
-            emb1 = text1_embedding.
-            emb2 = text2_embedding.
+            emb1 = text1_embedding.numpy().reshape(1, -1)
+            emb2 = text2_embedding.numpy().reshape(1, -1)
 
             # Calculate cosine similarity
             similarity = cosine_similarity(emb1, emb2)[0][0]
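
For context, a minimal standalone sketch of the approach the new code takes: mean-pool DistilBERT's last hidden state into a sentence embedding, then compare two embeddings with scikit-learn's cosine_similarity. The helper names embed_text and semantic_similarity below are illustrative, not taken from the repository (which wraps the same steps in NewsChecker methods), and the details are assumed from the diff above.

# Illustrative sketch of the embedding + similarity flow added in this commit.
# embed_text / semantic_similarity are hypothetical names for this example.
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained('distilbert-base-uncased')

def embed_text(text):
    # Tokenize, run DistilBERT without gradients, mean-pool the last hidden state
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)  # shape: (1, 768)

def semantic_similarity(text1, text2):
    # Cosine similarity between the two pooled embeddings, as in the diff
    emb1 = embed_text(text1).numpy().reshape(1, -1)
    emb2 = embed_text(text2).numpy().reshape(1, -1)
    return cosine_similarity(emb1, emb2)[0][0]

print(semantic_similarity("stocks rally after earnings", "shares climb on strong results"))

Note that plain mean pooling over last_hidden_state also averages padding positions when several texts are tokenized in one batch; for single-text calls as in the diff this does not matter, and an attention-mask-weighted mean is a common refinement.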
|