Spaces:

ozgurunlu
/

m-check

Sleeping

App Files Community

Ozgur Unlu committed on Nov 6, 2024

Commit

c1e6e68

1 Parent(s): 8bf0ed1

more fixes

Browse files

Files changed (1) hide show

app.py +34 -5

app.py CHANGED Viewed

@@ -9,9 +9,20 @@ from pdf_generator import ReportGenerator
 from news_checker import NewsChecker
 from dotenv import load_dotenv
 from spellchecker import SpellChecker
 load_dotenv()
 # Initialize models and tokenizers
 def load_models():
     # Hate speech and bias detection model
@@ -70,12 +81,30 @@ def check_hate_speech_and_bias(text, model, tokenizer):
 def check_spelling(text, spell_checker):
     try:
-        # Split text into words and clean them
-        words = text.replace('\n', ' ').split()
-        words = [word.strip('.,!?()[]{}":;') for word in words]
-        # Find misspelled words
-        misspelled = spell_checker.unknown(words)
         if misspelled:
             corrections = []

 from news_checker import NewsChecker
 from dotenv import load_dotenv
 from spellchecker import SpellChecker
+import re
 load_dotenv()
+CONTRACTIONS = {
+    "ain't", "aren't", "can't", "couldn't", "didn't", "doesn't", "don't", "hadn't",
+    "hasn't", "haven't", "he'd", "he'll", "he's", "i'd", "i'll", "i'm", "i've",
+    "isn't", "let's", "mightn't", "mustn't", "shan't", "she'd", "she'll", "she's",
+    "shouldn't", "that's", "there's", "they'd", "they'll", "they're", "they've",
+    "we'd", "we're", "we've", "weren't", "what'll", "what're", "what's", "what've",
+    "where's", "who'd", "who'll", "who're", "who's", "who've", "won't", "wouldn't",
+    "you'd", "you'll", "you're", "you've"
+}
 # Initialize models and tokenizers
 def load_models():
     # Hate speech and bias detection model
 def check_spelling(text, spell_checker):
     try:
+        # Split text into words
+        words = text.split()
+        # Process words while preserving contractions and special cases
+        clean_words = []
+        for word in words:
+            # Remove surrounding punctuation but keep internal apostrophes
+            cleaned = re.sub(r'^[^\w\']+|[^\w\']+$', '', word)
+            if cleaned:
+                clean_words.append(cleaned)
+        # Find misspelled words, excluding contractions and special cases
+        misspelled = set()
+        for word in clean_words:
+            if (word.lower() not in CONTRACTIONS and  # Skip known contractions
+                not word.isdigit() and               # Skip numbers
+                not any(char.isdigit() for char in word) and  # Skip words with numbers
+                not word.startswith('@') and         # Skip mentions
+                not word.startswith('#') and         # Skip hashtags
+                not word.startswith('http') and      # Skip URLs
+                not word.isupper() and              # Skip acronyms
+                len(word) > 1 and                   # Skip single letters
+                word.lower() not in spell_checker.word_frequency):  # Check if word is in dictionary
+                misspelled.add(word)
         if misspelled:
             corrections = []