# Spaces: Sleeping
# NOTE(review): the two lines above/below ("Spaces: Sleeping") are stray Hugging Face
# Spaces page-status text captured during extraction, not code; kept here as comments.
import os

import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
class URLValidator:
    """
    A production-ready URL validation class that evaluates the credibility of a webpage
    using multiple factors: domain trust, content relevance, fact-checking, bias detection, and citations.
    """

    def __init__(self, serpapi_key):
        """Store the SerpAPI key and load the NLP models once at construction.

        Args:
            serpapi_key: API key for SerpAPI, kept for later search-based checks.
        """
        # SerpAPI Key
        self.serpapi_key = serpapi_key
        # Load models once to avoid redundant API calls / re-instantiation per request.
        self.similarity_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        self.fake_news_classifier = pipeline("text-classification", model="mrm8488/bert-tiny-finetuned-fake-news-detection")
        self.sentiment_analyzer = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")
def fetch_page_content(self, url: str) -> str: | |
""" Fetches and extracts text content from the given URL. """ | |
try: | |
response = requests.get(url, timeout=10) | |
response.raise_for_status() | |
soup = BeautifulSoup(response.text, "html.parser") | |
return " ".join([p.text for p in soup.find_all("p")]) # Extract paragraph text | |
except requests.RequestException: | |
return "" # Fail gracefully by returning an empty string | |
def get_domain_trust(self, url: str, content: str) -> int: | |
""" Computes the domain trust score based on available data sources. """ | |
trust_scores = [] | |
if content: | |
try: | |
trust_scores.append(self.get_domain_trust_huggingface(content)) | |
except: | |
pass | |
return int(sum(trust_scores) / len(trust_scores)) if trust_scores else 50 | |
def get_domain_trust_huggingface(self, content: str) -> int: | |
""" Uses a Hugging Face fake news detection model to assess credibility. """ | |
if not content: | |
return 50 | |
result = s | |