"""Gradio demo: histogram of a per-document metric with a threshold marker.

For each dataset subset the app holds a few arrays of per-document metrics
(currently randomly generated placeholders). The user picks a subset, opens
the tab for a metric, chooses a threshold with a slider and presses
"Calculate" to see a histogram of that metric with the threshold drawn as a
vertical dashed line.
"""

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np

# Subsets offered in the UI. The real data for a subset is meant to come from
#     load_dataset("CarperAI/pile-v2-small", data_dir=f"data/{name}")
# Until that is wired in, every subset gets fake metrics (see _fake_metrics).
DATASET_NAMES = [
    "AI4Code",
    "AMPS",
    "ASFPublicMail",
    "Books3",
    "CPDataset",
    "DMMath",
    "Discourse",
    "Enwiki",
    "EuroParliamentProceedings",
    "FreeLaw_Options",
    "GitHubDiff",
    "GitHubIssues",
    "Gutenberg",
    "LeetCode",
    "PileOfLaw",
    "PubMed",
    "S2ORC",
    "StackExchange",
    "USENET",
    "USPTO",
    "UbuntuIRC",
    "arXiv",
]

# Number of fake documents generated per subset.
NUM_FAKE_SAMPLES = 1000

# One entry per tab: (tab label, key into a subset's metrics dict, slider max).
METRIC_TABS = [
    ("Character Repetition Ratio", "char_rep_ratios", 100),
    ("Word Repetition Ratio", "word_rep_ratios", 1),
    ("Flagged Word Ratio", "flagged_word_ratios", 1),
]


def _fake_metrics(num_samples=NUM_FAKE_SAMPLES):
    """Return placeholder metrics for one subset.

    The three ratio arrays are standard-normal samples and ``num_words`` is
    uniform integers in ``[0, 1000)``; none of it reflects real data.
    """
    return {
        "word_rep_ratios": np.random.randn(num_samples),
        "char_rep_ratios": np.random.randn(num_samples),
        "flagged_word_ratios": np.random.randn(num_samples),
        "num_words": np.random.randint(0, 1000, num_samples),
    }


# Maps subset name -> {metric name -> 1-D array with one value per document}.
dataset_data = {name: _fake_metrics() for name in DATASET_NAMES}


def plt_plot(threshold, x):
    """Plot a 50-bin histogram of ``x`` with a dashed red line at ``threshold``.

    Args:
        threshold: x-position of the vertical marker line.
        x: values to histogram (anything ``Axes.hist`` accepts).

    Returns:
        The matplotlib ``Figure`` containing the plot.
    """
    fig, ax = plt.subplots()
    ax.hist(x, bins=50)
    ax.axvline(threshold, color="r", linestyle="dashed", linewidth=2)
    # Label through the Axes object rather than plt.title()/plt.xlabel(): the
    # pyplot functions act on the global "current figure", which is not
    # guaranteed to be this one when several requests are served at once.
    ax.set_title("Histogram of random data")
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    return fig


def _make_plot_callback(metric):
    """Build the click handler for the tab that shows ``metric``.

    A factory is used instead of a lambda in the loop below so that each
    handler captures its own ``metric`` (no late-binding surprise).
    """

    def plot_metric(threshold, dataset_name):
        # Event inputs must be Gradio components, so the handler receives the
        # *selected value* of the radio and looks the data up at click time.
        if dataset_name not in dataset_data:
            raise gr.Error("Please select a dataset first.")
        return plt_plot(threshold, dataset_data[dataset_name][metric])

    return plot_metric


with gr.Blocks() as demo:
    # Pre-select the first subset so "Calculate" works without extra clicks.
    dataset = gr.Radio(DATASET_NAMES, value=DATASET_NAMES[0], label="Dataset")

    for tab_label, metric, slider_max in METRIC_TABS:
        with gr.Tab(tab_label):
            plot = gr.Plot()
            threshold = gr.Slider(minimum=0, maximum=slider_max, label="Threshold")
            calculate = gr.Button("Calculate")
            calculate.click(_make_plot_callback(metric), [threshold, dataset], plot)


if __name__ == "__main__":
    # share=True asks Gradio to also create a publicly reachable link.
    demo.launch(share=True)