Ozgur Unlu
committed on
Commit
·
c1e6e68
1
Parent(s):
8bf0ed1
more fixes
Browse files
app.py
CHANGED
@@ -9,9 +9,20 @@ from pdf_generator import ReportGenerator
|
|
9 |
from news_checker import NewsChecker
|
10 |
from dotenv import load_dotenv
|
11 |
from spellchecker import SpellChecker
|
|
|
12 |
|
13 |
load_dotenv()
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
# Initialize models and tokenizers
|
16 |
def load_models():
|
17 |
# Hate speech and bias detection model
|
@@ -70,12 +81,30 @@ def check_hate_speech_and_bias(text, model, tokenizer):
|
|
70 |
|
71 |
def check_spelling(text, spell_checker):
|
72 |
try:
|
73 |
-
# Split text into words
|
74 |
-
words = text.
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
-
# Find misspelled words
|
78 |
-
misspelled =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
if misspelled:
|
81 |
corrections = []
|
|
|
9 |
from news_checker import NewsChecker
|
10 |
from dotenv import load_dotenv
|
11 |
from spellchecker import SpellChecker
|
12 |
+
import re
|
13 |
|
14 |
load_dotenv()
|
15 |
|
16 |
+
CONTRACTIONS = {
|
17 |
+
"ain't", "aren't", "can't", "couldn't", "didn't", "doesn't", "don't", "hadn't",
|
18 |
+
"hasn't", "haven't", "he'd", "he'll", "he's", "i'd", "i'll", "i'm", "i've",
|
19 |
+
"isn't", "let's", "mightn't", "mustn't", "shan't", "she'd", "she'll", "she's",
|
20 |
+
"shouldn't", "that's", "there's", "they'd", "they'll", "they're", "they've",
|
21 |
+
"we'd", "we're", "we've", "weren't", "what'll", "what're", "what's", "what've",
|
22 |
+
"where's", "who'd", "who'll", "who're", "who's", "who've", "won't", "wouldn't",
|
23 |
+
"you'd", "you'll", "you're", "you've"
|
24 |
+
}
|
25 |
+
|
26 |
# Initialize models and tokenizers
|
27 |
def load_models():
|
28 |
# Hate speech and bias detection model
|
|
|
81 |
|
82 |
def check_spelling(text, spell_checker):
|
83 |
try:
|
84 |
+
# Split text into words
|
85 |
+
words = text.split()
|
86 |
+
|
87 |
+
# Process words while preserving contractions and special cases
|
88 |
+
clean_words = []
|
89 |
+
for word in words:
|
90 |
+
# Remove surrounding punctuation but keep internal apostrophes
|
91 |
+
cleaned = re.sub(r'^[^\w\']+|[^\w\']+$', '', word)
|
92 |
+
if cleaned:
|
93 |
+
clean_words.append(cleaned)
|
94 |
|
95 |
+
# Find misspelled words, excluding contractions and special cases
|
96 |
+
misspelled = set()
|
97 |
+
for word in clean_words:
|
98 |
+
if (word.lower() not in CONTRACTIONS and # Skip known contractions
|
99 |
+
not word.isdigit() and # Skip numbers
|
100 |
+
not any(char.isdigit() for char in word) and # Skip words with numbers
|
101 |
+
not word.startswith('@') and # Skip mentions
|
102 |
+
not word.startswith('#') and # Skip hashtags
|
103 |
+
not word.startswith('http') and # Skip URLs
|
104 |
+
not word.isupper() and # Skip acronyms
|
105 |
+
len(word) > 1 and # Skip single letters
|
106 |
+
word.lower() not in spell_checker.word_frequency): # Check if word is in dictionary
|
107 |
+
misspelled.add(word)
|
108 |
|
109 |
if misspelled:
|
110 |
corrections = []
|