Spaces:
Sleeping
Sleeping
File size: 2,590 Bytes
1153ecb 4e3915c 1153ecb 4e3915c 1153ecb 4e3915c 1153ecb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import gradio as gr
from datasets import load_dataset
import tempfile
import re
from langdetect import detect
def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Args:
        text: The string to classify.

    Returns:
        bool: True if ``detect(text)`` returns ``'en'``. False for empty or
        whitespace-only input, for non-string input, and whenever
        detection raises.
    """
    # Nothing to classify: answer directly instead of invoking the detector.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort check: any detection failure counts as "not English".
        # Unlike a bare ``except:``, this still lets KeyboardInterrupt and
        # SystemExit propagate.
        return False
def clean_text(text):
    """Strip ``**`` markers and keep only the sentences judged English.

    Args:
        text: Raw text to clean.

    Returns:
        str: The sentences for which ``is_english`` is true, joined by
        single spaces, with every ``**`` removed.
    """
    # Drop the literal ** markers first.
    without_markers = re.sub(r'\*\*', '', text)
    # A sentence boundary is whitespace that follows '.', '!' or '?'.
    candidates = re.split(r'(?<=[.!?])\s+', without_markers)
    # Keep only the pieces the language check accepts.
    return ' '.join(filter(is_english, candidates))
def combine_dataset_texts(dataset_name, split, text_column):
    """Merge one text column of a Hub dataset into a single text file.

    Loads ``dataset_name`` at ``split`` with ``load_dataset``, joins every
    value of ``text_column`` with single spaces, runs ``clean_text`` on the
    result, inserts a newline after each period that is not followed by a
    double quote, and writes the text to a temporary ``.txt`` file.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Split name passed to ``load_dataset``.
        text_column: Name of the column whose values are combined.

    Returns:
        str: Path of the temporary file. It is created with
        ``delete=False``, so it is not removed automatically.

    Raises:
        gr.Error: If ``text_column`` is not a column of the dataset, or if
            any other step fails (the original exception is chained).
    """
    try:
        # Load the dataset from Hugging Face Hub
        dataset = load_dataset(dataset_name, split=split)

        # Verify the text column exists
        if text_column not in dataset.column_names:
            raise gr.Error(f"Column '{text_column}' not found in dataset")

        # Combine all texts into a single string without separating
        # datapoints. Rows whose value is None are skipped so that
        # str.join does not fail on them.
        combined_text = " ".join(
            example[text_column]
            for example in dataset
            if example[text_column] is not None
        )

        # Clean the text: remove non-English and **
        cleaned_text = clean_text(combined_text)

        # Insert a newline after each period (.) except for ."
        processed_text = re.sub(r'\.(?!")', '.\n', cleaned_text)

        # Write with an explicit encoding: the platform default may be
        # unable to represent every character in the dataset text.
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".txt", encoding="utf-8"
        ) as f:
            f.write(processed_text)
        return f.name
    except gr.Error:
        # Already a user-facing error (e.g. the missing-column message);
        # re-raise as is instead of wrapping it in a second prefix.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing dataset: {str(e)}") from e
# Gradio UI: a title, three text inputs, a trigger button and a file output.
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been flattened -- confirm the row grouping.
with gr.Blocks() as demo:
    gr.Markdown("## Hugging Face Dataset Text Combiner")
    gr.Markdown("Combine all text files from a Hugging Face dataset into a single file")

    # Parameters handed to combine_dataset_texts, in positional order.
    with gr.Row():
        dataset_input = gr.Textbox(label="Dataset Name", placeholder="username/dataset-name")
        split_input = gr.Textbox(label="Split", value="train")
        column_input = gr.Textbox(label="Text Column", value="text")

    submit_btn = gr.Button("Combine Texts")

    with gr.Row():
        output_file = gr.File(label="Combined Text File")
        # Hidden textbox; no event handler in this block writes to it.
        error_out = gr.Textbox(label="Error Output", visible=False)

    # Clicking the button runs the combiner and offers the resulting file.
    submit_btn.click(
        fn=combine_dataset_texts,
        inputs=[dataset_input, split_input, column_input],
        outputs=output_file,
        api_name="combine_texts",
    )
# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()