Spaces:

TeacherPuffy
/

CreateBookPackage

Sleeping

App Files Files Community

TeacherPuffy committed on Jan 21

Commit

fd63293

verified ·

1 Parent(s): f108df6

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -6

app.py CHANGED Viewed

@@ -23,22 +23,31 @@ def clean_text(text):
     return ' '.join(cleaned_sentences)
 def process_text(text):
     """Insert a newline after each sentence-ending word.

     The text is split on whitespace.  A word that ends with a period gets a
     newline after it, unless the word is listed in TITLES (the original
     comment gives Mr. and Mrs. as examples).  A word that ends with a
     period followed by a closing quote ends with the quote character, not
     the period, so it never triggers a line break.

     Args:
         text: The text to process.

     Returns:
         The words re-joined with single spaces, with a newline (and no
         following space) after each sentence-ending word.  A trailing
         newline is kept when the last word ends a sentence.
     """
     pieces = []
     for word in text.split():
         if word not in TITLES and word.endswith('.'):
             # The newline is the separator here; adding a space as well
             # would make the next line start with a stray blank.
             pieces.append(word + '\n')
         else:
             pieces.append(word + ' ')
     # Every piece ends with exactly one separator; drop the final space so
     # the result does not end with dangling whitespace after a plain word.
     return ''.join(pieces).rstrip(' ')
 def combine_dataset_texts(dataset_name, split, text_column):
     try:
@@ -56,6 +65,7 @@ def combine_dataset_texts(dataset_name, split, text_column):
         cleaned_text = clean_text(combined_text)
         # Process the text: insert newlines after periods, except for titles and ."
         processed_text = process_text(cleaned_text)
         # Create a temporary file

     return ' '.join(cleaned_sentences)
 def process_text(text):
     """Put each sentence on its own line and number the chapter markers.

     The text is split on whitespace and rebuilt word by word:

     * A word listed in TITLES (the original comment gives Mr. and Mrs. as
       examples) is kept on the current line even though it ends with a
       period.
     * Any other word ending with a period is followed by a newline.  A word
       that ends with a period followed by a closing quote ends with the
       quote character, not the period, so it never triggers a line break.
     * The three-word marker '### Simplified Version' is replaced with
       'Chapter N', where N starts at 1 and increases by one per marker.

     Args:
         text: The text to process.

     Returns:
         The rebuilt text with leading and trailing whitespace removed.
     """
     words = text.split()
     pieces = []
     chapter_counter = 1
     pending_skips = 0  # marker words already consumed by a 'Chapter N' heading
     for i, word in enumerate(words):
         if pending_skips:
             # Skip the rest of the marker outright; emitting an empty word
             # here would leave extra spaces after the chapter heading.
             pending_skips -= 1
             continue
         if word in TITLES:
             pieces.append(word + " ")
         elif word.endswith('.'):
             pieces.append(word + "\n")
         elif word == "###" and words[i + 1:i + 3] == ["Simplified", "Version"]:
             # The slice comparison is False when fewer than two words follow,
             # so no explicit bounds check is needed.
             pieces.append(f"Chapter {chapter_counter} ")
             chapter_counter += 1
             pending_skips = 2
         else:
             pieces.append(word + " ")
     # Remove trailing spaces and newlines
     return "".join(pieces).strip()
 def combine_dataset_texts(dataset_name, split, text_column):
     try:
         cleaned_text = clean_text(combined_text)
         # Process the text: insert newlines after periods, except for titles and ."
+        # Also replace '### Simplified Version' with 'Chapter N'
         processed_text = process_text(cleaned_text)
         # Create a temporary file