Ozgur Unlu
committed on
Commit
·
d52122b
1
Parent(s):
3256b66
first run
Browse files- app.py +197 -0
- news_checker.py +73 -0
- pdf_generator.py +44 -0
- requirements.txt +9 -0
app.py
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
from transformers import (
|
4 |
+
AutoTokenizer,
|
5 |
+
AutoModelForSequenceClassification,
|
6 |
+
pipeline
|
7 |
+
)
|
8 |
+
import os
|
9 |
+
from pdf_generator import ReportGenerator
|
10 |
+
from news_checker import NewsChecker
|
11 |
+
from dotenv import load_dotenv
|
12 |
+
|
13 |
+
load_dotenv()
|
14 |
+
|
15 |
+
# Initialize models and tokenizers
def load_models():
    """Load (and cache) the Hugging Face models used by the content checks.

    Returns:
        dict: {'hate_speech': (model, tokenizer), 'bias': (model, tokenizer)}

    The result is memoized on the function object so repeated calls from
    analyze_content() do not re-download / re-instantiate the models on
    every button click.
    """
    cached = getattr(load_models, "_models", None)
    if cached is not None:
        return cached

    # Hate speech detection model (logit index 1 = "hate" class)
    hate_tokenizer = AutoTokenizer.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")
    hate_model = AutoModelForSequenceClassification.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")

    # Bias detection (using same model with different labels)
    # NOTE(review): distilbert-base-uncased is a base checkpoint with an
    # untrained classification head, and the 'bias' entry is never used by
    # analyze_content — confirm intent before relying on it.
    bias_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    bias_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

    load_models._models = {
        'hate_speech': (hate_model, hate_tokenizer),
        'bias': (bias_model, bias_tokenizer)
    }
    return load_models._models
|
29 |
+
|
30 |
+
# Module-level NewsChecker singleton, constructed once at import time and
# shared by analyze_content() for the current-events context check.
news_checker = NewsChecker()
|
32 |
+
|
33 |
+
def check_text_length(text):
    """Validate that the marketing copy fits the 1000-character limit.

    Returns:
        dict: {'status': 'pass' | 'fail', 'message': str}
    """
    within_limit = len(text) <= 1000
    return {
        'status': 'pass' if within_limit else 'fail',
        'message': ('Text length is within limits' if within_limit
                    else 'Text exceeds 1000 character limit')
    }
|
43 |
+
|
44 |
+
def check_hate_speech(text, model, tokenizer):
    """Score `text` with a sequence-classification model and bucket the result.

    Args:
        text: input string (tokenized with truncation to 512 tokens).
        model: sequence-classification model whose logit index 1 is the
            "hate" class (matches facebook/roberta-hate-speech-dynabench-r4-target).
        tokenizer: the matching tokenizer.

    Returns:
        dict: {'status': 'fail' | 'warning' | 'pass' | 'error', 'message': str}
    """
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        # Inference only — skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Probability of the "hate" class; the original computed this
        # .item() twice, once per branch.
        hate_score = predictions[0][1].item()

        # Hand-tuned thresholds ("adjusted threshold" in the original).
        if hate_score > 0.3:
            return {
                'status': 'fail',
                'message': 'Potential hate speech detected'
            }
        elif hate_score > 0.1:
            return {
                'status': 'warning',
                'message': 'Some concerning language detected - please review'
            }
        return {
            'status': 'pass',
            'message': 'No hate speech detected'
        }
    except Exception as e:
        return {
            'status': 'error',
            'message': f'Error in hate speech detection: {str(e)}'
        }
|
70 |
+
|
71 |
+
def check_grammar(text):
    """Run a text2text grammar-correction model and diff against the input.

    Returns:
        dict: 'warning' with suggested corrections when the corrected text
        differs case-insensitively from the input, 'pass' otherwise,
        'error' if model loading or generation fails.
    """
    try:
        # Build the pipeline once and reuse it: creating it inside every
        # call re-loads the model for each analysis.
        # NOTE(review): "gramformer/gramformer" may not be a valid model id
        # on the Hugging Face hub — confirm; if it isn't, this check will
        # always return 'error'.
        nlp = getattr(check_grammar, "_nlp", None)
        if nlp is None:
            nlp = pipeline("text2text-generation", model="gramformer/gramformer",
                           device=0 if torch.cuda.is_available() else -1)
            check_grammar._nlp = nlp

        corrected = nlp(text, max_length=1000)[0]['generated_text']

        if corrected.lower() != text.lower():
            return {
                'status': 'warning',
                'message': f'Suggested corrections:\n{corrected}'
            }
        return {
            'status': 'pass',
            'message': 'No grammar issues detected'
        }
    except Exception as e:
        return {
            'status': 'error',
            'message': f'Error in grammar check: {str(e)}'
        }
|
90 |
+
|
91 |
+
def analyze_content(text):
    """Run every validation check on `text`, mirroring results into a PDF.

    Returns:
        tuple: (results, report_path) where `results` maps check names to
        result dicts ({'status', 'message'}) and `report_path` is the
        filename of the generated PDF report.
    """
    report_gen = ReportGenerator()
    report_gen.add_header()
    report_gen.add_input_text(text)

    # Load models
    models = load_models()

    results = {}

    def _record(label, result):
        # Store the outcome and write the same entry into the PDF report.
        results[label] = result
        report_gen.add_check_result(label, result['status'], result['message'])

    # 1. Length check — a failure here short-circuits the remaining checks.
    length_result = check_text_length(text)
    _record('Length Check', length_result)

    if length_result['status'] != 'fail':
        # 2. Hate speech, 3. grammar, 4. current-events context.
        hate_model, hate_tokenizer = models['hate_speech']
        _record('Hate Speech Check', check_hate_speech(text, hate_model, hate_tokenizer))
        _record('Grammar Check', check_grammar(text))
        _record('Current Events Context', news_checker.check_content_against_news(text))

    # Generate and save report
    return results, report_gen.save_report()
|
131 |
+
|
132 |
+
def format_results(results):
    """Render the results dict as human-readable text with status emojis.

    Args:
        results: mapping of check name -> {'status', 'message'} dicts.

    Returns:
        str: one "<check>: <symbol>" line per check, followed by a
        "Details: ..." paragraph whenever the message is non-empty.
    """
    symbols = {
        'pass': '✅',
        'fail': '❌',
        'warning': '⚠️',
        'error': '⚠️'
    }

    pieces = []
    for name, outcome in results.items():
        pieces.append(f"{name}: {symbols.get(outcome['status'], '❓')}\n")
        if outcome['message']:
            pieces.append(f"Details: {outcome['message']}\n\n")
    return "".join(pieces)
|
148 |
+
|
149 |
+
# Gradio Interface
def create_interface():
    """Build and return the Gradio Blocks UI for the content validator."""

    def run_analysis(text):
        # Run the (expensive) analysis exactly ONCE. The original wired a
        # lambda that called analyze_content(text) twice — once per output —
        # doubling model inference and writing two PDF reports per click.
        results, report_path = analyze_content(text)
        return format_results(results), report_path

    with gr.Blocks(title="Marketing Content Validator") as interface:
        gr.Markdown("# Marketing Content Validator")
        gr.Markdown("Paste your marketing content below to check for potential issues.")

        with gr.Row():
            with gr.Column():
                input_text = gr.TextArea(
                    label="Marketing Content",
                    placeholder="Enter your marketing content here (max 1000 characters)...",
                    lines=10
                )
                analyze_btn = gr.Button("Analyze Content")

            with gr.Column():
                output_text = gr.TextArea(
                    label="Analysis Results",
                    lines=10,
                    interactive=False
                )
                report_output = gr.File(label="Download Report")

        analyze_btn.click(
            fn=run_analysis,
            inputs=input_text,
            outputs=[output_text, report_output]
        )

        gr.Markdown("""
        ### Notes:
        - Maximum text length: 1000 characters
        - Analysis may take up to 2 minutes
        - Results include checks for:
            - Text length
            - Hate speech and bias
            - Grammar
            - Current events context
        """)

    return interface
|
193 |
+
|
194 |
+
# Launch the application
if __name__ == "__main__":
    # Build the UI and start the Gradio server with default settings.
    create_interface().launch()
|
news_checker.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from newsapi import NewsApiClient
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
import pandas as pd
|
5 |
+
from datetime import datetime, timedelta
|
6 |
+
|
7 |
+
load_dotenv()
|
8 |
+
|
9 |
+
class NewsChecker:
    """Fetch recent headlines from NewsAPI and flag marketing copy that may
    clash with current news."""

    def __init__(self):
        # NEWS_API_KEY comes from the environment (.env via load_dotenv()).
        self.api_key = os.getenv('NEWS_API_KEY')
        if not self.api_key:
            # Non-fatal: get_recent_news will fail and the check degrades
            # to a 'warning' result instead of crashing the app at import.
            print("Warning: NEWS_API_KEY is not set; news checks will be skipped.")
        self.newsapi = NewsApiClient(api_key=self.api_key)

    def get_recent_news(self):
        """Return a DataFrame of {'title', 'description'} for English
        articles from the last 7 days, or an empty DataFrame on any failure."""
        try:
            # Get news from the last 7 days
            week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
            # NOTE(review): NewsAPI's /everything endpoint rejects an empty
            # q parameter — confirm; if so this call always raises and every
            # check returns the 'unable to check' warning below.
            response = self.newsapi.get_everything(
                q='',
                from_param=week_ago,
                language='en',
                sort_by='relevancy',
                page_size=100
            )

            if response['status'] == 'ok':
                # Keep only articles with body text; use .get so articles
                # missing keys (or with a None title) don't raise.
                news_data = [
                    {
                        'title': article.get('title') or '',
                        'description': article.get('description')
                    }
                    for article in response['articles'] if article.get('description')
                ]
                return pd.DataFrame(news_data)
            return pd.DataFrame()

        except Exception as e:
            # Deliberately broad: any API/parse failure degrades to "no data".
            print(f"Error fetching news: {str(e)}")
            return pd.DataFrame()

    def check_content_against_news(self, marketing_text):
        """Compare `marketing_text` against recent headlines.

        Returns:
            dict: 'warning' when news could not be fetched or a headline
            shares >= 3 (lowercased, whitespace-split) words with the text;
            'pass' otherwise.
        """
        news_df = self.get_recent_news()
        if news_df.empty:
            return {
                'status': 'warning',
                'message': 'Unable to check against current news context. Proceed with caution.'
            }

        # Simple keyword matching for demo purposes
        # In a production environment, you'd want to use more sophisticated NLP techniques
        marketing_words = set(marketing_text.lower().split())
        potential_conflicts = []

        for _, row in news_df.iterrows():
            # str() guards against a None/NaN title slipping through.
            title_words = set(str(row['title']).lower().split())

            # Check for significant word overlap
            if len(marketing_words.intersection(title_words)) >= 3:
                potential_conflicts.append(row['title'])

        if potential_conflicts:
            return {
                'status': 'warning',
                'message': 'Potential conflicts found with current news:\n- ' + '\n- '.join(potential_conflicts)
            }

        return {
            'status': 'pass',
            'message': 'No significant conflicts with current news found.'
        }
|
pdf_generator.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fpdf import FPDF
|
2 |
+
from datetime import datetime
|
3 |
+
|
4 |
+
class ReportGenerator:
    """Builds the PDF validation report (single page, Arial core font)."""

    def __init__(self):
        self.pdf = FPDF()
        self.pdf.add_page()
        self.pdf.set_font("Arial", size=12)

    @staticmethod
    def _latin1_safe(text):
        # The Arial core font only supports Latin-1; degrade anything else
        # (emoji, smart quotes from live headlines, user input) to '?'
        # instead of letting pdf.output() raise.
        return text.encode('latin-1', 'replace').decode('latin-1')

    def add_header(self):
        """Write the report title and generation timestamp."""
        self.pdf.set_font("Arial", "B", 16)
        self.pdf.cell(200, 10, txt="Marketing Content Validation Report", ln=True, align='C')
        self.pdf.set_font("Arial", size=10)
        self.pdf.cell(200, 10, txt=f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=True, align='R')
        self.pdf.ln(10)

    def add_input_text(self, text):
        """Write the analyzed marketing copy into the report."""
        self.pdf.set_font("Arial", "B", 12)
        self.pdf.cell(200, 10, txt="Input Marketing Content:", ln=True)
        self.pdf.set_font("Arial", size=12)
        self.pdf.multi_cell(0, 10, txt=self._latin1_safe(text))
        self.pdf.ln(10)

    def add_check_result(self, check_name, status, details=None):
        """Append one check's outcome (and optional details) to the report.

        Status markers are plain ASCII: the original "✓"/"✗" symbols are
        outside Latin-1 and made pdf.output() raise with core fonts.
        """
        status_symbols = {
            "pass": "[PASS]",
            "fail": "[FAIL]",
            "warning": "[WARN]",
            "error": "[ERROR]"  # previously missing -> fell back to "?"
        }

        self.pdf.set_font("Arial", "B", 12)
        status_symbol = status_symbols.get(status.lower(), "[?]")
        self.pdf.cell(0, 10, txt=f"{check_name}: {status_symbol}", ln=True)

        if details:
            self.pdf.set_font("Arial", size=10)
            self.pdf.multi_cell(0, 10, txt=self._latin1_safe(details))
        self.pdf.ln(5)

    def save_report(self):
        """Write the PDF to the working directory and return its filename."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"marketing_report_{timestamp}.pdf"
        self.pdf.output(filename)
        return filename
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio==4.19.2
|
2 |
+
transformers==4.36.2
|
3 |
+
torch==2.2.0
|
4 |
+
newsapi-python==0.2.7
|
5 |
+
fpdf2==2.7.8
|
6 |
+
pandas==2.1.4
|
7 |
+
numpy==1.24.3
|
8 |
+
requests==2.31.0
|
9 |
+
python-dotenv==1.0.0
|