Ozgur Unlu committed
Commit 4ce18b0 · Parent(s): 16b95b0
Files changed (1):
  1. news_checker.py +19 -8
news_checker.py CHANGED
@@ -3,10 +3,10 @@ from newsapi import NewsApiClient
 from dotenv import load_dotenv
 import pandas as pd
 from datetime import datetime, timedelta
-from transformers import pipeline
-from sentence_transformers import SentenceTransformer
-from sklearn.metrics.pairwise import cosine_similarity
+from transformers import pipeline, AutoTokenizer, AutoModel
+import torch
 import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
 
 load_dotenv()
 
@@ -23,15 +23,26 @@ class NewsChecker:
             # Initialize sentiment analyzer
             self.sentiment_analyzer = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')
             # Initialize semantic similarity model
-            self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
+            self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
+            self.model = AutoModel.from_pretrained('distilbert-base-uncased')
             print("Models initialized successfully")
         except Exception as e:
             print(f"Error initializing clients: {str(e)}")
 
     def get_embedding(self, text):
-        """Get embedding for a text using sentence transformer"""
+        """Get embedding for a text using DistilBERT"""
         try:
-            return self.semantic_model.encode(text, convert_to_tensor=True)
+            # Tokenize and encode the text
+            inputs = self.tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
+
+            # Get model outputs
+            with torch.no_grad():
+                outputs = self.model(**inputs)
+
+            # Use the mean of the last hidden state as the sentence embedding
+            embeddings = outputs.last_hidden_state.mean(dim=1)
+
+            return embeddings
         except Exception as e:
             print(f"Error getting embedding: {str(e)}")
             return None
@@ -40,8 +51,8 @@ class NewsChecker:
         """Calculate cosine similarity between two embeddings"""
         try:
             # Convert tensors to numpy arrays and reshape
-            emb1 = text1_embedding.cpu().numpy().reshape(1, -1)
-            emb2 = text2_embedding.cpu().numpy().reshape(1, -1)
+            emb1 = text1_embedding.numpy().reshape(1, -1)
+            emb2 = text2_embedding.numpy().reshape(1, -1)
 
             # Calculate cosine similarity
             similarity = cosine_similarity(emb1, emb2)[0][0]
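For context, the commit swaps the SentenceTransformer dependency for plain DistilBERT with mean pooling. Below is a minimal standalone sketch of that new embedding path, not part of the commit itself: the embed() helper is hypothetical and mirrors the logic get_embedding now uses, assuming only transformers and torch are installed.

    import torch
    from transformers import AutoTokenizer, AutoModel

    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    model = AutoModel.from_pretrained('distilbert-base-uncased')

    def embed(text):
        # Tokenize to a batch of one, truncating at DistilBERT's 512-token limit
        inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
        # Inference only; no gradient tracking needed
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean-pool the last hidden state over the token dimension -> shape (1, 768)
        return outputs.last_hidden_state.mean(dim=1)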
 
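And a hedged usage example, continuing from the embed() sketch above, showing how the similarity half of the change fits together; the example headlines are illustrative only.

    from sklearn.metrics.pairwise import cosine_similarity

    emb1 = embed("Central bank raises interest rates")
    emb2 = embed("Interest rate hike announced by central bank")

    # Reshape to (1, 768) row vectors, as calculate_similarity does
    similarity = cosine_similarity(emb1.numpy().reshape(1, -1),
                                   emb2.numpy().reshape(1, -1))[0][0]
    print(f"Similarity: {similarity:.3f}")

One design note: tensor.numpy() only works on CPU tensors, so dropping .cpu() in this commit assumes CPU inference; the call would raise if the model were moved to a GPU.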