# NOTE(review): removed non-Python scrape residue (file-listing header, commit
# hashes, and a bare line-number gutter) that preceded the actual source and
# made the file unparseable.
import gradio as gr
from datasets import load_dataset
import tempfile
import re
from langdetect import detect

def is_english(text):
    """Return True if *text* is detected as English, False otherwise.

    langdetect raises LangDetectException on inputs with no detectable
    language features (empty strings, pure punctuation/digits); any such
    detection failure is treated as "not English".
    """
    try:
        return detect(text) == 'en'
    # Bug fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt;
    # catch only real errors from the detector.
    except Exception:
        return False

def clean_text(text):
    """Strip literal '**' markers from *text* and drop non-English sentences.

    Sentences are split on whitespace that follows '.', '!' or '?', filtered
    through is_english(), and rejoined with single spaces.
    """
    # Remove every occurrence of the '**' marker first.
    without_markers = re.sub(r'\*\*', '', text)

    # Split on sentence-ending punctuation, keep only English sentences.
    sentences = re.split(r'(?<=[.!?])\s+', without_markers)
    english_only = filter(is_english, sentences)

    return ' '.join(english_only)

def combine_dataset_texts(dataset_name, split, text_column):
    """Load a Hugging Face dataset split, combine and clean its text column,
    and return the path of a temporary .txt file holding the result.

    Args:
        dataset_name: Hub dataset identifier, e.g. "username/dataset-name".
        split: Dataset split to load (e.g. "train").
        text_column: Name of the column containing the text.

    Returns:
        Path to a temporary UTF-8 .txt file with the processed text.

    Raises:
        gr.Error: if the column is missing or the dataset cannot be processed.
    """
    try:
        # Load the dataset from the Hugging Face Hub.
        dataset = load_dataset(dataset_name, split=split)

        # Verify the requested text column exists.
        if text_column not in dataset.column_names:
            raise gr.Error(f"Column '{text_column}' not found in dataset")

        # Combine all texts into a single string without separating datapoints.
        combined_text = " ".join(example[text_column] for example in dataset)

        # Clean the text: remove non-English sentences and '**' markers.
        cleaned_text = clean_text(combined_text)

        # Insert a newline after each period (.) except when followed by '"'.
        processed_text = re.sub(r'\.(?!")', '.\n', cleaned_text)

        # Persist to a temp file; delete=False so Gradio can serve it after
        # the handler returns. UTF-8 avoids platform-default encoding errors.
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".txt", encoding="utf-8"
        ) as f:
            f.write(processed_text)
            return f.name

    # Bug fix: let our own user-facing errors (e.g. the missing-column
    # message) propagate instead of being re-wrapped in a generic message.
    except gr.Error:
        raise
    except Exception as e:
        raise gr.Error(f"Error processing dataset: {str(e)}") from e

# --- Gradio interface --------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## Hugging Face Dataset Text Combiner")
    gr.Markdown("Combine all text files from a Hugging Face dataset into a single file")

    # Input row: dataset id, split name, and the column holding the text.
    with gr.Row():
        ds_name_box = gr.Textbox(
            label="Dataset Name",
            placeholder="username/dataset-name",
        )
        split_box = gr.Textbox(label="Split", value="train")
        column_box = gr.Textbox(label="Text Column", value="text")

    combine_button = gr.Button("Combine Texts")

    # Output row: the downloadable combined file plus a (hidden) error box.
    with gr.Row():
        result_file = gr.File(label="Combined Text File")
        error_box = gr.Textbox(label="Error Output", visible=False)

    combine_button.click(
        fn=combine_dataset_texts,
        inputs=[ds_name_box, split_box, column_box],
        outputs=result_file,
        api_name="combine_texts",
    )

if __name__ == "__main__":
    demo.launch()