"""Flask service that labels news text as "Real" or "Fake".

The app serves a static front-end from ``build/`` and exposes
``POST /api/analyze``, which runs the submitted text through a TF-IDF +
Logistic Regression model, a TF-IDF + Random Forest model, a fine-tuned
DistilBERT classifier, or all three (with a majority vote).
"""
import functools
import os
import pickle
import re

import nltk
import numpy  # not referenced directly in this file; kept from the original
import torch
import torch.nn as nn
from flask import Flask, request, jsonify, send_from_directory
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import DistilBertTokenizer, DistilBertModel

# Download NLTK stuff
nltk.data.path.append('/usr/local/share/nltk_data')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

app = Flask(__name__, static_folder='build', static_url_path='')

# Values accepted for the "model" field of /api/analyze.  A tuple (not a set)
# so that an unhashable JSON value such as a list is simply "not in" it
# instead of raising TypeError.
VALID_MODEL_OPTIONS = ("lr", "rf", "distilbert", "all")

# Patterns used by clean_text, compiled once at import time.
_URL_RE = re.compile(r'http\S+|www\S+|https\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')


# Define DistilBERT model class
class DistilBERTClassifier(nn.Module):
    """DistilBERT encoder followed by dropout and a 2-way linear head.

    The attribute names (``distilbert``, ``dropout``, ``classifier``) are the
    keys of the saved state dict loaded in ``load_models``; do not rename them.
    """

    def __init__(self, dropout_rate=0.2):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        """Return the raw (pre-softmax) 2-class logits for a batch.

        The hidden state at sequence position 0 of the encoder's last layer
        is used as the pooled representation of each input.
        """
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


@functools.lru_cache(maxsize=1)
def _text_resources():
    """Build the English stop-word set and the lemmatizer once, on first use.

    Kept lazy (rather than module-level) so a missing NLTK corpus surfaces on
    the first request, exactly as it did when these were built per call.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


# Clean text function
def clean_text(text):
    """Normalise raw text into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, strip URLs, strip ``<...>`` tags, strip
    punctuation, strip digit runs, tokenize with NLTK, drop English stop
    words, lemmatize each remaining token.

    Args:
        text: The input string.

    Returns:
        The cleaned text; may be an empty string if nothing survives.
    """
    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)

    stop_words, lemmatizer = _text_resources()
    tokens = nltk.word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )


# Load models
def load_models():
    """Load every model artifact from the ``models/`` directory.

    Returns:
        A tuple ``(tfidf_vectorizer, lr_model, rf_model, distilbert_model,
        tokenizer, device)``.  The DistilBERT model is moved to ``device``
        (CUDA when available, otherwise CPU) and put in eval mode.
    """
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # SECURITY NOTE: pickle.load executes arbitrary code contained in the
    # file.  These artifacts must come from a trusted build of this project;
    # never point these paths at user-supplied files.

    # TF-IDF vectorizer
    with open('models/tfidf_vectorizer.pkl', 'rb') as f:
        tfidf_vectorizer = pickle.load(f)

    # Logistic Regression
    with open('models/lr_model.pkl', 'rb') as f:
        lr_model = pickle.load(f)

    # Random Forest
    with open('models/rf_model.pkl', 'rb') as f:
        rf_model = pickle.load(f)

    # Load DistilBERT
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    distilbert_model = DistilBERTClassifier()
    distilbert_model.load_state_dict(
        torch.load('models/distilbert_model.pt', map_location=device)
    )
    distilbert_model.to(device)
    distilbert_model.eval()

    return tfidf_vectorizer, lr_model, rf_model, distilbert_model, tokenizer, device


# Load models at startup
tfidf_vectorizer, lr_model, rf_model, distilbert_model, tokenizer, device = load_models()


@app.route('/')
def serve():
    """Serve the front-end entry page from the static folder."""
    return send_from_directory(app.static_folder, 'index.html')


def _classical_result(model, features):
    """Run a fitted classifier exposing predict/predict_proba on one sample.

    The response mapping treats class index 0 as Fake and index 1 as Real.
    NOTE(review): assumes the pickled models were trained with that label
    encoding -- confirm against the training code.
    """
    pred = model.predict(features)[0]
    prob = model.predict_proba(features)[0]
    return {
        "prediction": "Real" if pred == 1 else "Fake",
        "fake_prob": float(prob[0]),
        "real_prob": float(prob[1]),
    }


def _distilbert_result(cleaned_text):
    """Classify already-cleaned text with the module-level DistilBERT model."""
    encoding = tokenizer(
        cleaned_text,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )

    with torch.no_grad():
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        outputs = distilbert_model(input_ids=input_ids, attention_mask=attention_mask)
        print("Raw model output:", outputs.cpu().numpy())
        probs = torch.softmax(outputs, dim=1).cpu().numpy()[0]

    print("After softmax:", probs)
    print(f"Text: {cleaned_text[:50]}...")
    # Labels here now match the response below (index 0 = Fake, 1 = Real);
    # the original message had them swapped.
    print(f"Probabilities: Fake={probs[0]:.4f}, Real={probs[1]:.4f}")

    distilbert_pred = 1 if probs[1] > probs[0] else 0
    return {
        "prediction": "Real" if distilbert_pred == 1 else "Fake",
        "fake_prob": float(probs[0]),
        "real_prob": float(probs[1]),
    }


def _overall_result(results):
    """Majority vote over the per-model results; a tie resolves to "Real"."""
    real_votes = sum(1 for result in results.values() if result["prediction"] == "Real")
    fake_votes = len(results) - real_votes
    return {
        "prediction": "Real" if real_votes >= fake_votes else "Fake",
        "real_votes": real_votes,
        "fake_votes": fake_votes,
        "total_models": len(results),
    }


@app.route('/api/analyze', methods=['POST'])
def analyze():
    """Classify the posted news text with the requested model(s).

    Expects a JSON object with:
        text:  non-empty string to classify.
        model: one of "lr", "rf", "distilbert", "all".

    Returns:
        200 with ``{"results": {...}}`` keyed by model display name (plus
        "Overall" when ``model == "all"``), or 400 with ``{"error": ...}``
        when the body is not a JSON object, a field is missing, the text is
        empty / not a string, or the model option is unknown.
    """
    # silent=True: an unparsable body or wrong content type yields None and
    # falls through to the JSON 400 below, regardless of Flask version.
    data = request.get_json(silent=True)

    if not isinstance(data, dict) or 'text' not in data or 'model' not in data:
        return jsonify({'error': 'Missing required fields'}), 400

    news_text = data['text']
    model_option = data['model']

    if not news_text:
        return jsonify({'error': 'Text cannot be empty'}), 400
    if not isinstance(news_text, str):
        # clean_text calls str methods; reject other JSON types up front.
        return jsonify({'error': 'Text must be a string'}), 400
    if not news_text.strip():
        return jsonify({'error': 'Text cannot be empty'}), 400

    if model_option not in VALID_MODEL_OPTIONS:
        allowed = ', '.join(VALID_MODEL_OPTIONS)
        return jsonify({'error': f'Invalid model option; expected one of: {allowed}'}), 400

    # Clean text
    cleaned_text = clean_text(news_text)

    results = {}

    # Logistic Regression and Random Forest share one TF-IDF transform.
    if model_option in ("lr", "rf", "all"):
        text_tfidf = tfidf_vectorizer.transform([cleaned_text])

        if model_option in ("lr", "all"):
            results["Logistic Regression"] = _classical_result(lr_model, text_tfidf)

        if model_option in ("rf", "all"):
            results["Random Forest"] = _classical_result(rf_model, text_tfidf)

    # Using DistilBERT
    if model_option in ("distilbert", "all"):
        results["DistilBERT"] = _distilbert_result(cleaned_text)

    # Calculate overall results for "all models" option
    if model_option == "all":
        # Computed before insertion so "Overall" is not counted as a model.
        results["Overall"] = _overall_result(results)

    return jsonify({'results': results})


if __name__ == '__main__':
    port = int(os.environ.get('PORT', 7860))
    app.run(host='0.0.0.0', port=port)