Inara132000 committed on
Commit
a7e3938
·
verified ·
1 Parent(s): 13bc62d

Update deliverable2.py

Browse files
Files changed (1) hide show
  1. deliverable2.py +75 -73
deliverable2.py CHANGED
@@ -2,93 +2,95 @@ import requests
2
  from bs4 import BeautifulSoup
3
  from sentence_transformers import SentenceTransformer, util
4
  from transformers import pipeline
5
- import os
6
 
7
- class URLValidator:
8
- """
9
- A production-ready URL validation class that evaluates the credibility of a webpage
10
- using multiple factors: domain trust, content relevance, fact-checking, bias detection, and citations.
11
- """
12
-
13
- def __init__(self, serpapi_key):
14
- # SerpAPI Key
15
- self.serpapi_key = serpapi_key
16
 
17
- # Load models once to avoid redundant API calls
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  self.similarity_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
19
  self.fake_news_classifier = pipeline("text-classification", model="mrm8488/bert-tiny-finetuned-fake-news-detection")
20
  self.sentiment_analyzer = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")
21
 
22
  def fetch_page_content(self, url: str) -> str:
23
- """ Fetches and extracts text content from the given URL. """
24
  try:
25
  response = requests.get(url, timeout=10)
26
  response.raise_for_status()
27
  soup = BeautifulSoup(response.text, "html.parser")
28
- return " ".join([p.text for p in soup.find_all("p")]) # Extract paragraph text
29
- except requests.RequestException as e:
30
- print(f"Error fetching content from {url}: {e}")
31
- return "" # Fail gracefully by returning an empty string
32
-
33
- def get_domain_trust(self, url: str, content: str) -> int:
34
- """ Computes the domain trust score based on available data sources. """
35
- trust_scores = []
36
-
37
- if content:
38
- try:
39
- trust_scores.append(self.get_domain_trust_huggingface(content))
40
- except Exception as e:
41
- print(f"Error in domain trust computation: {e}")
42
- pass
43
-
44
- return int(sum(trust_scores) / len(trust_scores)) if trust_scores else 50
45
-
46
- def get_domain_trust_huggingface(self, content: str) -> int:
47
- """ Uses a Hugging Face fake news detection model to assess credibility. """
48
- if not content:
49
- return 50
50
- try:
51
- result = self.fake_news_classifier(content)[0]
52
- if result['label'] == 'FAKE':
53
- return 20 # Fake content detected
54
- elif result['label'] == 'REAL':
55
- return 80 # Real content detected
56
- else:
57
- return 50 # Neutral if unsure
58
- except Exception as e:
59
- print(f"Error in fake news detection: {e}")
60
- return 50 # Return neutral if an error occurs
61
 
62
- def get_content_relevance(self, query: str, content: str) -> float:
63
- """ Measures content relevance to a query using Sentence Transformers. """
64
  if not content:
65
- return 0.0
66
- query_embedding = self.similarity_model.encode(query, convert_to_tensor=True)
67
- content_embedding = self.similarity_model.encode(content, convert_to_tensor=True)
68
- similarity = util.pytorch_cos_sim(query_embedding, content_embedding)
69
- return float(similarity)
70
-
71
- def evaluate_url(self, url: str, query: str) -> dict:
72
- """ Combines various methods to evaluate the overall credibility of a URL. """
73
- content = self.fetch_page_content(url)
74
- if not content:
75
- return {"URL": url, "Validity": "Invalid", "Trust": 50, "Relevance": 0.0}
76
-
77
- trust = self.get_domain_trust(url, content)
78
- relevance = self.get_content_relevance(query, content)
79
 
80
- # Decide if the URL is credible based on trust and relevance thresholds
81
- validity = "Valid" if trust > 60 and relevance > 0.5 else "Invalid"
82
-
83
- return {"URL": url, "Validity": validity, "Trust": trust, "Relevance": relevance}
 
84
 
85
- # Example usage
86
- serpapi_key = os.getenv("SERPAPI_API_KEY") # Set your API key
87
- url_validator = URLValidator(serpapi_key)
 
88
 
89
- # Query to evaluate URL relevance
90
- query = "How blockchain works"
91
- url = "https://www.ibm.com/topics/what-is-blockchain"
92
 
93
- evaluation = url_validator.evaluate_url(url, query)
94
- print(evaluation)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from bs4 import BeautifulSoup
3
  from sentence_transformers import SentenceTransformer, util
4
  from transformers import pipeline
5
+ import random
6
 
 
 
 
 
 
 
 
 
 
7
 
8
class URLValidator:
    """Rate how well a web page matches a user query.

    The class loads a sentence-embedding model and two Hugging Face
    text-classification pipelines in ``__init__`` and exposes helpers to
    fetch page text, score query/content similarity and score sentiment.

    NOTE: ``rate_url_validity`` (and therefore ``validate_url``) currently
    returns *random* placeholder scores; it does not call the model-backed
    helpers defined on this class.
    """

    # Score per sentiment label. The cardiffnlp/twitter-roberta-base-sentiment
    # pipeline reports LABEL_0 / LABEL_1 / LABEL_2 (negative / neutral /
    # positive); the human-readable spellings are accepted as well. Any label
    # not listed here is scored as negative (see ``detect_bias``).
    _SENTIMENT_SCORES = {
        "POSITIVE": 100,
        "LABEL_2": 100,
        "NEUTRAL": 50,
        "LABEL_1": 50,
    }
    _NEGATIVE_SENTIMENT_SCORE = 30

    def __init__(self):
        """Load the similarity, fake-news and sentiment models once."""
        self.similarity_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        self.fake_news_classifier = pipeline("text-classification", model="mrm8488/bert-tiny-finetuned-fake-news-detection")
        self.sentiment_analyzer = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")

    def rate_url_validity(self, user_query: str, url: str) -> dict:
        """Simulate rating the validity of a URL.

        Both arguments are accepted for interface compatibility but are not
        used: the scores are drawn with ``random.randint``.

        Returns:
            ``{"raw_score": {...}}`` with integer "Content Relevance" and
            "Bias Score" values in 0..100 and a "Final Validity Score" that
            is their floor-divided mean.
        """
        content_relevance = random.randint(0, 100)
        bias_score = random.randint(0, 100)
        final_validity_score = (content_relevance + bias_score) // 2

        return {
            "raw_score": {
                "Content Relevance": content_relevance,
                "Bias Score": bias_score,
                "Final Validity Score": final_validity_score,
            }
        }

    def fetch_page_content(self, url: str) -> str:
        """Return the space-joined text of every ``<p>`` element at *url*.

        The page is requested with a 10 second timeout. Any
        ``requests.RequestException`` (including a non-2xx status raised by
        ``raise_for_status``) results in an empty string.
        """
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            return " ".join(p.text for p in soup.find_all("p"))
        except requests.RequestException:
            return ""

    def compute_similarity_score(self, user_query: str, content: str) -> int:
        """Return cosine similarity of query and content embeddings times 100.

        Empty *content* scores 0. The result is truncated with ``int``.
        """
        if not content:
            return 0
        query_embedding = self.similarity_model.encode(user_query)
        content_embedding = self.similarity_model.encode(content)
        similarity = util.pytorch_cos_sim(query_embedding, content_embedding).item()
        return int(similarity * 100)

    def detect_bias(self, content: str) -> int:
        """Map the sentiment of *content* to a score.

        Only the first 512 characters are classified. Positive sentiment
        scores 100, neutral 50 and anything else 30; empty content scores 50.
        Labels are compared case-insensitively and the model's raw
        ``LABEL_n`` names are recognised, so positive/neutral results are no
        longer all reported as 30.
        """
        if not content:
            return 50
        sentiment_result = self.sentiment_analyzer(content[:512])[0]
        label = str(sentiment_result["label"]).upper()
        return self._SENTIMENT_SCORES.get(label, self._NEGATIVE_SENTIMENT_SCORE)

    def validate_url(self, user_query, url_to_check):
        """Rate a URL and format the scores as ``"<n> / 100"`` strings.

        Prints the raw rating, then returns a dict with "Content Relevance
        Score", "Bias Score" and "Final Validity Score". If the rating
        contains a "Validation Error" key, or any exception is raised, a
        dict ``{"Error": <message>}`` is returned instead.
        """
        try:
            result = self.rate_url_validity(user_query, url_to_check)
            print("Validation Result:", result)

            if "Validation Error" in result:
                return {"Error": result["Validation Error"]}

            raw_score = result["raw_score"]
            return {
                "Content Relevance Score": f"{raw_score['Content Relevance']} / 100",
                "Bias Score": f"{raw_score['Bias Score']} / 100",
                "Final Validity Score": f"{raw_score['Final Validity Score']} / 100",
            }
        except Exception as e:
            # Boundary for callers: report the failure instead of raising.
            return {"Error": str(e)}
63
+
64
# (query, URL) pairs used to exercise the validator.
queries_urls = [
    ("How blockchain works", "https://www.ibm.com/topics/what-is-blockchain"),
    ("Climate change effects", "https://www.nationalgeographic.com/environment/article/climate-change-overview"),
    ("COVID-19 vaccine effectiveness", "https://www.cdc.gov/coronavirus/2019-ncov/vaccines/effectiveness.html"),
    ("Latest AI advancements", "https://www.technologyreview.com/topic/artificial-intelligence"),
    ("Stock market trends", "https://www.bloomberg.com/markets"),
    ("Healthy diet tips", "https://www.healthline.com/nutrition/healthy-eating-tips"),
    ("Space exploration missions", "https://www.nasa.gov/missions"),
    ("Electric vehicle benefits", "https://www.tesla.com/benefits"),
    ("History of the internet", "https://www.history.com/topics/inventions/history-of-the-internet"),
    ("Nutritional benefits of a vegan diet", "https://www.hsph.harvard.edu/nutritionsource/healthy-weight/diet-reviews/vegan-diet/"),
    ("Mental health awareness", "https://www.who.int/news-room/fact-sheets/detail/mental-health-strengthening-our-response"),
]

validator = URLValidator()

# Rate every pair first, then print the ratings one per line.
results = [validator.rate_url_validity(*pair) for pair in queries_urls]

for result in results:
    print(result)

# One record per pair; both ratings are random integers in 1..5,
# drawn in the order "Function Rating" then "Custom Rating".
formatted_output = [
    {
        "Query": query_text,
        "URL": page_url,
        "Function Rating": random.randint(1, 5),
        "Custom Rating": random.randint(1, 5),
    }
    for query_text, page_url in queries_urls
]

# Bare expression: no effect when run as a script (a notebook cell would
# display the value).
formatted_output