Spaces:
Sleeping
Sleeping
import gradio as gr | |
from datasets import load_dataset | |
import tempfile | |
import re | |
from langdetect import detect | |
def is_english(text):
    """Return True if the language detector classifies ``text`` as English.

    Detection is best-effort: blank or non-string input, and any input that
    makes ``detect`` raise, is reported as not English instead of raising.

    Args:
        text: The candidate text to classify.

    Returns:
        True when ``detect(text)`` returns ``'en'``, otherwise False.
    """
    # Nothing to classify: skip the detector for blank or non-string input.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Keep the best-effort contract, but unlike a bare ``except`` this
        # lets KeyboardInterrupt and SystemExit propagate.
        return False
def clean_text(text):
    """Strip ``**`` markers from ``text`` and keep only its English sentences.

    The text is split into sentences at whitespace that follows ``.``, ``!``
    or ``?``; each sentence is kept only if ``is_english`` accepts it, and the
    kept sentences are re-joined with single spaces.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text as a single string.
    """
    # Drop every literal "**" before splitting.
    without_markers = re.sub(r'\*\*', '', text)
    # The lookbehind keeps the terminating punctuation attached to its sentence.
    pieces = re.split(r'(?<=[.!?])\s+', without_markers)
    return ' '.join(filter(is_english, pieces))
def combine_dataset_texts(dataset_name, split, text_column):
    """Combine one text column of a dataset into a single temporary .txt file.

    Loads the dataset with ``load_dataset``, joins the values of
    ``text_column`` with single spaces, passes the result through
    ``clean_text``, inserts a newline after every period that is not directly
    followed by a double quote, and writes the outcome to a temporary file.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Split name passed to ``load_dataset``.
        text_column: Name of the column holding the text to combine.

    Returns:
        Path of the temporary ``.txt`` file. The file is created with
        ``delete=False``, so it is not removed automatically.

    Raises:
        gr.Error: If ``text_column`` is missing or any processing step fails.
    """
    try:
        # Load the dataset from Hugging Face Hub
        dataset = load_dataset(dataset_name, split=split)

        # Verify the text column exists
        if text_column not in dataset.column_names:
            raise gr.Error(f"Column '{text_column}' not found in dataset")

        # Combine all texts into a single string without separating
        # datapoints. Rows whose value is None are skipped so that a single
        # empty cell does not make the join fail.
        combined_text = " ".join(
            example[text_column]
            for example in dataset
            if example[text_column] is not None
        )

        # Clean the text: remove non-English and **
        cleaned_text = clean_text(combined_text)

        # Insert a newline after each period (.) except for ."
        processed_text = re.sub(r'\.(?!")', '.\n', cleaned_text)

        # Write with an explicit UTF-8 encoding: the platform default may be
        # unable to encode non-ASCII characters present in the text.
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".txt", encoding="utf-8"
        ) as f:
            f.write(processed_text)
        return f.name
    except gr.Error:
        # Already a user-facing error (e.g. missing column); do not re-wrap it.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing dataset: {str(e)}") from e
# Build the Gradio UI: three text inputs, a trigger button and a file output.
# NOTE(review): the nesting below was reconstructed from flattened source —
# confirm that the button sits between the two rows rather than inside one.
with gr.Blocks() as demo:
    gr.Markdown(value="## Hugging Face Dataset Text Combiner")
    gr.Markdown(value="Combine all text files from a Hugging Face dataset into a single file")

    # Inputs: dataset identifier, split and text column.
    with gr.Row():
        dataset_input = gr.Textbox(
            label="Dataset Name",
            placeholder="username/dataset-name",
        )
        split_input = gr.Textbox(label="Split", value="train")
        column_input = gr.Textbox(label="Text Column", value="text")

    submit_btn = gr.Button(value="Combine Texts")

    # Outputs: the generated file plus a hidden textbox that is not wired
    # to any event in this block.
    with gr.Row():
        output_file = gr.File(label="Combined Text File")
        error_out = gr.Textbox(label="Error Output", visible=False)

    # Clicking the button runs the combiner and shows the resulting file.
    submit_btn.click(
        fn=combine_dataset_texts,
        inputs=[dataset_input, split_input, column_input],
        outputs=output_file,
        api_name="combine_texts",
    )

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()