|
import os |
|
import streamlit as st |
|
|
|
import requests |
|
from bs4 import BeautifulSoup |
|
from sentence_transformers import SentenceTransformer, util |
|
from transformers import pipeline |
|
|
|
class URLValidator: |
|
""" |
|
A production-ready URL validation class that evaluates the credibility of a webpage |
|
using multiple factors: domain trust, content relevance, fact-checking, bias detection, and citations. |
|
""" |
|
|
|
    def __init__(self):
        """Initialize the validator: read the SerpAPI key and load the NLP models used for scoring."""
        # SerpAPI key for search-backed checks; None when the env var is unset.
        self.serpapi_key = os.getenv("SERPAPI_API_KEY")

        # Sentence-embedding model — presumably used for content-relevance similarity
        # scoring elsewhere in the class; confirm against the scoring methods.
        self.similarity_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        # Fake-news text classifier; feeds get_domain_trust_huggingface().
        self.fake_news_classifier = pipeline("text-classification", model="mrm8488/bert-tiny-finetuned-fake-news-detection")
        # Sentiment classifier — presumably backs the bias-detection factor; verify usage.
        self.sentiment_analyzer = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")
|
|
|
def fetch_page_content(self, url: str) -> str: |
|
""" Fetches and extracts text content from the given URL. """ |
|
try: |
|
response = requests.get(url, timeout=10) |
|
response.raise_for_status() |
|
soup = BeautifulSoup(response.text, "html.parser") |
|
return " ".join([p.text for p in soup.find_all("p")]) |
|
except requests.RequestException: |
|
return "" |
|
|
|
def get_domain_trust(self, url: str, content: str) -> int: |
|
""" Computes the domain trust score based on available data sources. """ |
|
trust_scores = [] |
|
|
|
|
|
if content: |
|
try: |
|
trust_scores.append(self.get_domain_trust_huggingface(content)) |
|
except: |
|
pass |
|
|
|
|
|
return int(sum(trust_scores) / len(trust_scores)) if trust_scores else 50 |
|
|
|
def get_domain_trust_huggingface(self, content: str) -> int: |
|
""" Uses a Hugging Face fake news detection model to assess credibility. """ |
|
if not content: |
|
return 50 |