Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -23,22 +23,31 @@ def clean_text(text):
|
|
23 |
return ' '.join(cleaned_sentences)
|
24 |
|
25 |
def process_text(text):
|
26 |
-
"""Insert a newline after periods, except for titles and ."
|
|
|
27 |
# Split text into words
|
28 |
words = text.split()
|
29 |
-
|
|
|
30 |
|
31 |
for i, word in enumerate(words):
|
32 |
# Check if the word is a title (e.g., Mr., Mrs.)
|
33 |
if word in TITLES:
|
34 |
-
|
35 |
# Check if the word ends with a period and is not followed by a quote
|
36 |
elif word.endswith('.') and not word.endswith('."'):
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
else:
|
39 |
-
|
40 |
|
41 |
-
|
|
|
42 |
|
43 |
def combine_dataset_texts(dataset_name, split, text_column):
|
44 |
try:
|
@@ -56,6 +65,7 @@ def combine_dataset_texts(dataset_name, split, text_column):
|
|
56 |
cleaned_text = clean_text(combined_text)
|
57 |
|
58 |
# Process the text: insert newlines after periods, except for titles and ."
|
|
|
59 |
processed_text = process_text(cleaned_text)
|
60 |
|
61 |
# Create a temporary file
|
|
|
23 |
return ' '.join(cleaned_sentences)
|
24 |
|
25 |
def process_text(text):
    """Break text into one sentence per line and number chapter headings.

    The text is split on whitespace and rebuilt word by word:

    * A word found in the module-level ``TITLES`` collection (e.g. "Mr.",
      "Mrs.") is followed by a space, so a title never ends a line.
    * Any other word ending in a period is followed by a newline.  A word
      ending in a period plus a closing double quote does not end in a
      period, so it is followed by a space like an ordinary word.
    * The three-word marker ``### Simplified Version`` is replaced by
      ``Chapter N``, where N starts at 1 and increments per occurrence.
    * Every other word is followed by a space.

    Args:
        text: The input text to process.

    Returns:
        The rebuilt text with leading/trailing whitespace stripped.
    """
    words = text.split()
    pieces = []
    chapter_counter = 1
    # Number of upcoming words already consumed by a chapter marker.
    pending_skips = 0

    for i, word in enumerate(words):
        if pending_skips:
            # "Simplified" / "Version" belong to a marker that was already
            # replaced; emitting anything here would leave stray spaces.
            pending_skips -= 1
            continue

        if word in TITLES:
            pieces.append(word + " ")
        elif word.endswith('.'):
            pieces.append(word + "\n")
        elif word == "###" and words[i + 1:i + 3] == ["Simplified", "Version"]:
            pieces.append(f"Chapter {chapter_counter} ")
            chapter_counter += 1
            pending_skips = 2
        else:
            pieces.append(word + " ")

    # Remove trailing spaces and newlines
    return "".join(pieces).strip()
|
51 |
|
52 |
def combine_dataset_texts(dataset_name, split, text_column):
|
53 |
try:
|
|
|
65 |
cleaned_text = clean_text(combined_text)
|
66 |
|
67 |
# Process the text: insert newlines after periods, except for titles and ."
|
68 |
+
# Also replace '### Simplified Version' with 'Chapter N'
|
69 |
processed_text = process_text(cleaned_text)
|
70 |
|
71 |
# Create a temporary file
|