Spaces:
Sleeping
Sleeping
Update deliverable2.py
Browse files- deliverable2.py +49 -3
deliverable2.py
CHANGED
@@ -26,7 +26,8 @@ class URLValidator:
|
|
26 |
response.raise_for_status()
|
27 |
soup = BeautifulSoup(response.text, "html.parser")
|
28 |
return " ".join([p.text for p in soup.find_all("p")]) # Extract paragraph text
|
29 |
-
except requests.RequestException:
|
|
|
30 |
return "" # Fail gracefully by returning an empty string
|
31 |
|
32 |
def get_domain_trust(self, url: str, content: str) -> int:
|
@@ -36,7 +37,8 @@ class URLValidator:
|
|
36 |
if content:
|
37 |
try:
|
38 |
trust_scores.append(self.get_domain_trust_huggingface(content))
|
39 |
-
except:
|
|
|
40 |
pass
|
41 |
|
42 |
return int(sum(trust_scores) / len(trust_scores)) if trust_scores else 50
|
@@ -45,4 +47,48 @@ class URLValidator:
|
|
45 |
""" Uses a Hugging Face fake news detection model to assess credibility. """
|
46 |
if not content:
|
47 |
return 50
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
response.raise_for_status()
|
27 |
soup = BeautifulSoup(response.text, "html.parser")
|
28 |
return " ".join([p.text for p in soup.find_all("p")]) # Extract paragraph text
|
29 |
+
except requests.RequestException as e:
|
30 |
+
print(f"Error fetching content from {url}: {e}")
|
31 |
return "" # Fail gracefully by returning an empty string
|
32 |
|
33 |
def get_domain_trust(self, url: str, content: str) -> int:
|
|
|
37 |
if content:
|
38 |
try:
|
39 |
trust_scores.append(self.get_domain_trust_huggingface(content))
|
40 |
+
except Exception as e:
|
41 |
+
print(f"Error in domain trust computation: {e}")
|
42 |
pass
|
43 |
|
44 |
return int(sum(trust_scores) / len(trust_scores)) if trust_scores else 50
|
|
|
47 |
""" Uses a Hugging Face fake news detection model to assess credibility. """
|
48 |
if not content:
|
49 |
return 50
|
50 |
+
try:
|
51 |
+
result = self.fake_news_classifier(content)[0]
|
52 |
+
if result['label'] == 'FAKE':
|
53 |
+
return 20 # Fake content detected
|
54 |
+
elif result['label'] == 'REAL':
|
55 |
+
return 80 # Real content detected
|
56 |
+
else:
|
57 |
+
return 50 # Neutral if unsure
|
58 |
+
except Exception as e:
|
59 |
+
print(f"Error in fake news detection: {e}")
|
60 |
+
return 50 # Return neutral if an error occurs
|
61 |
+
|
62 |
+
def get_content_relevance(self, query: str, content: str) -> float:
    """Score how relevant *content* is to *query* using sentence-embedding cosine similarity.

    Args:
        query: The search query text.
        content: The page text to compare against the query.

    Returns:
        Cosine similarity in [-1.0, 1.0]; 0.0 when there is no content to score.
    """
    if not content:
        return 0.0  # Nothing to compare against.

    # Embed both texts with the same model so their vectors are comparable.
    query_vec, content_vec = (
        self.similarity_model.encode(text, convert_to_tensor=True)
        for text in (query, content)
    )
    score = util.pytorch_cos_sim(query_vec, content_vec)
    return float(score)
|
70 |
+
|
71 |
+
def evaluate_url(self, url: str, query: str) -> dict:
    """Assess the overall credibility of *url* for a given *query*.

    Combines page-fetching, domain-trust scoring, and query relevance into a
    single verdict.

    Args:
        url: The page to evaluate.
        query: The query the page should be relevant to.

    Returns:
        A dict with keys "URL", "Validity" ("Valid"/"Invalid"), "Trust" (int),
        and "Relevance" (float).
    """
    content = self.fetch_page_content(url)
    if not content:
        # Page could not be fetched: neutral trust, zero relevance, invalid.
        return {"URL": url, "Validity": "Invalid", "Trust": 50, "Relevance": 0.0}

    trust = self.get_domain_trust(url, content)
    relevance = self.get_content_relevance(query, content)

    # Credible only when both signals clear their thresholds.
    is_credible = trust > 60 and relevance > 0.5
    return {
        "URL": url,
        "Validity": "Valid" if is_credible else "Invalid",
        "Trust": trust,
        "Relevance": relevance,
    }
|
84 |
+
|
85 |
+
# Example usage: evaluate one URL against a query and print the verdict.
serpapi_key = os.getenv("SERPAPI_API_KEY")  # Set your API key
validator = URLValidator(serpapi_key)

# Query to evaluate URL relevance
query = "How blockchain works"
url = "https://www.ibm.com/topics/what-is-blockchain"

evaluation = validator.evaluate_url(url, query)
print(evaluation)
|