Ozgur Unlu committed on
Commit
c1e6e68
·
1 Parent(s): 8bf0ed1

more fixes

Browse files
Files changed (1) hide show
  1. app.py +34 -5
app.py CHANGED
@@ -9,9 +9,20 @@ from pdf_generator import ReportGenerator
9
  from news_checker import NewsChecker
10
  from dotenv import load_dotenv
11
  from spellchecker import SpellChecker
 
12
 
13
  load_dotenv()
14
 
 
 
 
 
 
 
 
 
 
 
15
  # Initialize models and tokenizers
16
  def load_models():
17
  # Hate speech and bias detection model
@@ -70,12 +81,30 @@ def check_hate_speech_and_bias(text, model, tokenizer):
70
 
71
  def check_spelling(text, spell_checker):
72
  try:
73
- # Split text into words and clean them
74
- words = text.replace('\n', ' ').split()
75
- words = [word.strip('.,!?()[]{}":;') for word in words]
 
 
 
 
 
 
 
76
 
77
- # Find misspelled words
78
- misspelled = spell_checker.unknown(words)
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  if misspelled:
81
  corrections = []
 
9
  from news_checker import NewsChecker
10
  from dotenv import load_dotenv
11
  from spellchecker import SpellChecker
12
+ import re
13
 
14
  load_dotenv()
15
 
16
+ CONTRACTIONS = {
17
+ "ain't", "aren't", "can't", "couldn't", "didn't", "doesn't", "don't", "hadn't",
18
+ "hasn't", "haven't", "he'd", "he'll", "he's", "i'd", "i'll", "i'm", "i've",
19
+ "isn't", "let's", "mightn't", "mustn't", "shan't", "she'd", "she'll", "she's",
20
+ "shouldn't", "that's", "there's", "they'd", "they'll", "they're", "they've",
21
+ "we'd", "we're", "we've", "weren't", "what'll", "what're", "what's", "what've",
22
+ "where's", "who'd", "who'll", "who're", "who's", "who've", "won't", "wouldn't",
23
+ "you'd", "you'll", "you're", "you've"
24
+ }
25
+
26
  # Initialize models and tokenizers
27
  def load_models():
28
  # Hate speech and bias detection model
 
81
 
82
  def check_spelling(text, spell_checker):
83
  try:
84
+ # Split text into words
85
+ words = text.split()
86
+
87
+ # Process words while preserving contractions and special cases
88
+ clean_words = []
89
+ for word in words:
90
+ # Remove surrounding punctuation but keep internal apostrophes
91
+ cleaned = re.sub(r'^[^\w\']+|[^\w\']+$', '', word)
92
+ if cleaned:
93
+ clean_words.append(cleaned)
94
 
95
+ # Find misspelled words, excluding contractions and special cases
96
+ misspelled = set()
97
+ for word in clean_words:
98
+ if (word.lower() not in CONTRACTIONS and # Skip known contractions
99
+ not word.isdigit() and # Skip numbers
100
+ not any(char.isdigit() for char in word) and # Skip words with numbers
101
+ not word.startswith('@') and # Skip mentions
102
+ not word.startswith('#') and # Skip hashtags
103
+ not word.startswith('http') and # Skip URLs
104
+ not word.isupper() and # Skip acronyms
105
+ len(word) > 1 and # Skip single letters
106
+ word.lower() not in spell_checker.word_frequency): # Check if word is in dictionary
107
+ misspelled.add(word)
108
 
109
  if misspelled:
110
  corrections = []