Inoob committed · verified
Commit ede5a01 · 1 Parent(s): 77b09d4

Create app.py

Files changed (1):
app.py +131 -0
app.py ADDED
@@ -0,0 +1,131 @@
+
+ import itertools
+ import pickle
+
+
+ # Import and download necessary NLTK data for tokenization.
+ import nltk
+ from nltk.translate.bleu_score import sentence_bleu
+
+ nltk.download('punkt')
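+ # Note: on newer NLTK releases, word_tokenize may also require the
+ # 'punkt_tab' resource; if so, add: nltk.download('punkt_tab')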
+
+ # Import the ROUGE metric implementation.
+ from rouge import Rouge
+
+ rouge = Rouge()
+
+ from datasets import load_dataset
+ import streamlit as st
+
+ # Use name="sample-10BT" to use the 10BT sample.
+ fw = load_dataset("HuggingFaceFW/fineweb", name="CC-MAIN-2024-10", split="train", streaming=True)
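+ # (streaming=True returns an iterable dataset, so samples are fetched
+ # lazily rather than downloading the full dump up front.)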
+
+
+ # Define helper functions for character-level accuracy and precision.
+ def char_accuracy(true_output, model_output):
+     # Compare matching characters in corresponding positions.
+     matches = sum(1 for c1, c2 in zip(true_output, model_output) if c1 == c2)
+     # Account for extra characters by dividing by the longer length.
+     total = max(len(true_output), len(model_output))
+     return matches / total if total > 0 else 1.0
+
+
+ def char_precision(true_output, model_output):
+     # Precision is matching characters divided by the length of the model's output.
+     matches = sum(1 for c1, c2 in zip(true_output, model_output) if c1 == c2)
+     return matches / len(model_output) if len(model_output) > 0 else 0.0
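+
+ # A quick sanity check of the two metrics (illustrative values, not part of the UI):
+ #   char_accuracy("dlroW olleH", "dlroW olleh")  -> 10/11 (one character differs)
+ #   char_precision("dlroW olleH", "dlroW")       -> 1.0   (every emitted character matches)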
+
+
+ # Initialize the Streamlit app.
+ st.title("Model Evaluation App")
+ st.write("This app evaluates a model's ability to reverse input text character by character.")
+
+ # Sidebar parameters
+ word_threshold = st.sidebar.number_input("Word Threshold", value=100, step=10)
+ num_samples = st.sidebar.number_input("Number of Samples", value=1, step=1)
+
+ # Pull the requested number of documents from the streaming dataset.
+ samples = list(itertools.islice(fw, num_samples))
+ acc = []
+ pres = []
+ bleu = []
+ rouges = []
+
+ for x in samples:
+     words = x["text"].split(" ")
+     # Walk the document in word_threshold-sized chunks.
+     for n in range(len(words) // word_threshold):
+         inp = words[word_threshold * n: word_threshold * (n + 1)]
+         inp = " ".join(inp).replace("\n", "")
+
+         # Display the input text.
+         st.subheader("Input Text")
+         st.write(inp)
+
+         prompt = (
+             "You are a helpful assistant that echoes the user's input, but backwards; "
+             "do not simply rearrange the words, reverse the user's input down to the character "
+             "(e.g. reverse Hello World to dlroW olleH). Surround the backwards version of the "
+             "user's input with <back> </back> tags. " + inp
+         )
+
+         # Ground truth: reverse the input (character by character).
+         true_output = inp[::-1]
+         st.subheader("True Output")
+         st.write(true_output)
+
+         # Get the model output. For demonstration purposes we simulate a perfect
+         # model by reversing the input ourselves; replace this block with your
+         # model's actual response to `prompt`.
+         model_output_full = "<back>" + true_output + "</back>"
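+         # A sketch of what a real call might look like, assuming a hypothetical
+         # OpenAI-style `client` object (adapt to whatever inference API you use):
+         #   model_output_full = client.chat.completions.create(
+         #       model="your-model",
+         #       messages=[{"role": "user", "content": prompt}],
+         #   ).choices[0].message.content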
+
+         # Extract the text between the <back> and </back> tags.
+         tag1 = model_output_full.find("<back>")
+         tag2 = model_output_full.find("</back>")
+         model_output = model_output_full[tag1 + len("<back>"): tag2]
+         st.subheader("Model Output")
+         st.write(model_output)
+
+         # Tokenize both outputs for the BLEU calculation.
+         reference_tokens = nltk.word_tokenize(true_output)
+         candidate_tokens = nltk.word_tokenize(model_output)
+
+         # Compute the BLEU score (using the single reference).
+         bleu_score = sentence_bleu([reference_tokens], candidate_tokens)
+         st.write("**BLEU Score:**", bleu_score)
+
+         # Compute ROUGE scores.
+         rouge_scores = rouge.get_scores(model_output, true_output)
+         st.write("**ROUGE Scores:**")
+         st.json(rouge_scores)
+
+         # Compute character-level accuracy and precision.
+         accuracy_metric = char_accuracy(true_output, model_output)
+         precision_metric = char_precision(true_output, model_output)
+         st.write("**Character Accuracy:**", accuracy_metric)
+         st.write("**Character Precision:**", precision_metric)
+
+         st.markdown("---")
+
+         # Append this chunk's metrics to the running lists.
+         acc.append(accuracy_metric)
+         pres.append(precision_metric)
+         bleu.append(bleu_score)
+         rouges.append(rouge_scores)
+
+ # Allow the user to download the metrics.
+ if st.button("Download Metrics"):
+     with open('accuracy.pkl', 'wb') as file:
+         pickle.dump(acc, file)
+     with open('precision.pkl', 'wb') as file:
+         pickle.dump(pres, file)
+     with open('bleu.pkl', 'wb') as file:
+         pickle.dump(bleu, file)
+     with open('rouge.pkl', 'wb') as file:
+         pickle.dump(rouges, file)
+     st.success("Metrics saved successfully!")
+
+     # Provide download links (kept inside the button handler so the pickle
+     # files are guaranteed to exist before being read back).
+     st.download_button('Download Accuracy Metrics', data=open('accuracy.pkl', 'rb'), file_name='accuracy.pkl')
+     st.download_button('Download Precision Metrics', data=open('precision.pkl', 'rb'), file_name='precision.pkl')
+     st.download_button('Download BLEU Metrics', data=open('bleu.pkl', 'rb'), file_name='bleu.pkl')
+     st.download_button('Download ROUGE Metrics', data=open('rouge.pkl', 'rb'), file_name='rouge.pkl')
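
To try the app locally (a sketch, assuming the streamlit, nltk, rouge, and datasets packages are installed):

    streamlit run app.py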