Update deliverable2.py
deliverable2.py  CHANGED  (+9 -76)
@@ -2,12 +2,11 @@ import requests
 from bs4 import BeautifulSoup
 from sentence_transformers import SentenceTransformer, util
 from transformers import pipeline
-import pandas as pd
 
 class URLValidator:
     """
-
-    using
+    URL Validator class that evaluates the credibility of a webpage
+    using domain trust, content relevance, fact-checking, bias detection, and citations.
     """
 
     def __init__(self):
@@ -17,9 +16,9 @@ class URLValidator:
         self.sentiment_analyzer = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")
 
     def fetch_page_content(self, url: str) -> str:
-        """ Fetches and extracts text content from the given URL
+        """ Fetches and extracts text content from the given URL. """
         try:
-            headers = {"User-Agent": "Mozilla/5.0"}
+            headers = {"User-Agent": "Mozilla/5.0"}  # Helps bypass some bot protections
             response = requests.get(url, timeout=10, headers=headers)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, "html.parser")
@@ -34,9 +33,9 @@ class URLValidator:
             return f"Error: Unable to fetch URL ({str(e)})."
 
     def get_domain_trust(self, url: str, content: str) -> int:
-        """
+        """ Simulated function to assess domain trust. """
         if "Error" in content:
-            return 0
+            return 0
         return len(url) % 5 + 1  # Mock trust rating (1-5)
 
     def compute_similarity_score(self, user_query: str, content: str) -> int:
@@ -81,23 +80,12 @@ class URLValidator:
         return " ".join(reasons) if reasons else "This source is highly credible and relevant."
 
     def rate_url_validity(self, user_query: str, url: str):
-        """ Main function to evaluate the validity of a webpage. """
+        """ Main function to evaluate the validity of a webpage. """
         content = self.fetch_page_content(url)
 
+        # Handle errors
         if "Error" in content:
-            return {
-                "raw_score": {
-                    "Domain Trust": 0,
-                    "Content Relevance": 0,
-                    "Fact-Check Score": 0,
-                    "Bias Score": 0,
-                    "Final Validity Score": 0
-                },
-                "stars": {
-                    "icon": "❌"
-                },
-                "explanation": content
-            }
+            return {"Validation Error": content}
 
         domain_trust = self.get_domain_trust(url, content)
         similarity_score = self.compute_similarity_score(user_query, content)
@@ -127,58 +115,3 @@ class URLValidator:
             },
             "explanation": explanation
         }
-
-
-# ✅ **Updated 15 Queries and 15 Different URLs**
-sample_queries = [
-    "How does artificial intelligence impact the job market?",
-    "What are the risks of genetically modified organisms (GMOs)?",
-    "What are the environmental effects of plastic pollution?",
-    "How does 5G technology affect human health?",
-    "What are the latest treatments for Alzheimer's disease?",
-    "Is red meat consumption linked to heart disease?",
-    "How does cryptocurrency mining impact the environment?",
-    "What are the benefits of electric cars?",
-    "How does sleep deprivation affect cognitive function?",
-    "What are the effects of social media on teenage mental health?",
-    "What are the ethical concerns of facial recognition technology?",
-    "How does air pollution contribute to lung diseases?",
-    "What are the potential dangers of artificial general intelligence?",
-    "How does meditation impact brain function?",
-    "What are the psychological effects of video game addiction?"
-]
-
-sample_urls = [
-    "https://www.forbes.com/sites/forbestechcouncil/2023/10/15/impact-of-ai-on-the-job-market/",
-    "https://www.fda.gov/food/food-labeling-nutrition/consumers-guide-gmo-foods",
-    "https://www.nationalgeographic.com/environment/article/plastic-pollution",
-    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7453195/",
-    "https://www.alz.org/alzheimers-dementia/treatments",
-    "https://www.heart.org/en/news/2021/02/10/how-red-meat-affects-heart-health",
-    "https://www.scientificamerican.com/article/how-bitcoin-mining-impacts-the-environment/",
-    "https://www.tesla.com/blog/environmental-benefits-electric-cars",
-    "https://www.sleepfoundation.org/sleep-deprivation",
-    "https://www.psychologytoday.com/us/basics/teenagers-and-social-media",
-    "https://www.brookings.edu/research/facial-recognition-technology-ethical-concerns/",
-    "https://www.who.int/news-room/fact-sheets/detail/ambient-(outdoor)-air-quality-and-health",
-    "https://futureoflife.org/background/benefits-risks-of-artificial-intelligence/",
-    "https://www.mindful.org/meditation/mindfulness-getting-started/",
-    "https://www.apa.org/news/press/releases/stress/2020/video-games"
-]
-
-# **Run Validator & Save CSV**
-validator = URLValidator()
-results = []
-for query, url in zip(sample_queries, sample_urls):
-    result = validator.rate_url_validity(query, url)
-    results.append({
-        "user_query": query,
-        "url_to_check": url,
-        "func_rating": round(result["raw_score"]["Final Validity Score"] / 20),
-        "custom_rating": round(result["raw_score"]["Final Validity Score"] / 20) + 1
-    })
-
-df = pd.DataFrame(results)
-df.to_csv("url_validation_results.csv", index=False)
-
-print("✅ CSV file 'url_validation_results.csv' has been created successfully!")
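With the bundled driver gone, callers now invoke URLValidator directly. Below is a minimal usage sketch, not part of this commit: the import path is assumed from the filename, the success-path keys (raw_score, stars, explanation, Final Validity Score) are assumed from the unchanged return block above, and the error branch handles the {"Validation Error": ...} shape introduced by this change.

# Hypothetical usage sketch (not part of this commit); import path assumed from the filename.
from deliverable2 import URLValidator

validator = URLValidator()
result = validator.rate_url_validity(
    "What are the benefits of electric cars?",
    "https://www.tesla.com/blog/environmental-benefits-electric-cars",
)

if "Validation Error" in result:
    # New error shape from this commit: {"Validation Error": "Error: Unable to fetch URL (...)."}
    print(f"Could not rate URL: {result['Validation Error']}")
else:
    # Success shape assumed from the unchanged return block above.
    print(result["raw_score"]["Final Validity Score"], result["explanation"])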
|
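The commit also removes the in-module driver: the pandas import, the 15 sample queries and URLs, and the CSV export. If that workflow is still needed, a standalone driver adapted from the removed code might look roughly like the sketch below; because the new error return has no raw_score key, failed fetches are skipped rather than indexed.

# Hypothetical standalone driver, adapted from the code removed in this commit.
import pandas as pd

from deliverable2 import URLValidator  # import path assumed from the filename

sample_queries = [
    "How does artificial intelligence impact the job market?",
    "What are the benefits of electric cars?",
    # ... remaining queries from the removed driver
]
sample_urls = [
    "https://www.forbes.com/sites/forbestechcouncil/2023/10/15/impact-of-ai-on-the-job-market/",
    "https://www.tesla.com/blog/environmental-benefits-electric-cars",
    # ... remaining URLs from the removed driver
]

validator = URLValidator()
results = []
for query, url in zip(sample_queries, sample_urls):
    result = validator.rate_url_validity(query, url)
    if "Validation Error" in result:
        # New error shape: no "raw_score" key, so skip the row instead of indexing into it.
        continue
    func_rating = round(result["raw_score"]["Final Validity Score"] / 20)
    results.append({
        "user_query": query,
        "url_to_check": url,
        "func_rating": func_rating,
        "custom_rating": func_rating + 1,
    })

pd.DataFrame(results).to_csv("url_validation_results.csv", index=False)
print("CSV file 'url_validation_results.csv' has been created.")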