TeacherPuffy committed on
Commit
fd63293
·
verified ·
1 Parent(s): f108df6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -6
app.py CHANGED
@@ -23,22 +23,31 @@ def clean_text(text):
23
  return ' '.join(cleaned_sentences)
24
 
25
  def process_text(text):
26
- """Insert a newline after periods, except for titles and ." """
 
27
  # Split text into words
28
  words = text.split()
29
- processed_words = []
 
30
 
31
  for i, word in enumerate(words):
32
  # Check if the word is a title (e.g., Mr., Mrs.)
33
  if word in TITLES:
34
- processed_words.append(word)
35
  # Check if the word ends with a period and is not followed by a quote
36
  elif word.endswith('.') and not word.endswith('."'):
37
- processed_words.append(word + '\n')
 
 
 
 
 
 
38
  else:
39
- processed_words.append(word)
40
 
41
- return ' '.join(processed_words)
 
42
 
43
  def combine_dataset_texts(dataset_name, split, text_column):
44
  try:
@@ -56,6 +65,7 @@ def combine_dataset_texts(dataset_name, split, text_column):
56
  cleaned_text = clean_text(combined_text)
57
 
58
  # Process the text: insert newlines after periods, except for titles and ."
 
59
  processed_text = process_text(cleaned_text)
60
 
61
  # Create a temporary file
 
23
  return ' '.join(cleaned_sentences)
24
 
25
  def process_text(text):
26
+ """Insert a newline after periods, except for titles and ."
27
+ Also replace '### Simplified Version' with 'Chapter N' where N increments."""
28
  # Split text into words
29
  words = text.split()
30
+ processed_text = ""
31
+ chapter_counter = 1 # Initialize chapter counter
32
 
33
  for i, word in enumerate(words):
34
  # Check if the word is a title (e.g., Mr., Mrs.)
35
  if word in TITLES:
36
+ processed_text += word + " "
37
  # Check if the word ends with a period and is not followed by a quote
38
  elif word.endswith('.') and not word.endswith('."'):
39
+ processed_text += word + "\n"
40
+ # Replace '### Simplified Version' with 'Chapter N'
41
+ elif word == "###" and i + 2 < len(words) and words[i + 1] == "Simplified" and words[i + 2] == "Version":
42
+ processed_text += f"Chapter {chapter_counter} "
43
+ chapter_counter += 1 # Increment chapter counter
44
+ words[i + 1] = "" # Skip the next two words
45
+ words[i + 2] = ""
46
  else:
47
+ processed_text += word + " "
48
 
49
+ # Remove trailing spaces and newlines
50
+ return processed_text.strip()
51
 
52
  def combine_dataset_texts(dataset_name, split, text_column):
53
  try:
 
65
  cleaned_text = clean_text(combined_text)
66
 
67
  # Process the text: insert newlines after periods, except for titles and ."
68
+ # Also replace '### Simplified Version' with 'Chapter N'
69
  processed_text = process_text(cleaned_text)
70
 
71
  # Create a temporary file