File size: 7,488 Bytes
f771463
 
 
 
 
 
 
a177196
f771463
acccd9c
214dec4
f771463
a177196
f771463
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214dec4
a177196
f771463
 
 
 
 
214dec4
f771463
 
 
 
214dec4
 
 
 
 
f771463
 
a177196
 
f771463
 
 
 
 
 
 
 
 
a177196
f771463
 
 
 
 
a177196
f771463
a177196
 
 
f771463
 
a177196
f771463
a177196
 
 
 
f771463
 
214dec4
 
 
 
 
f771463
a177196
 
 
 
 
f771463
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214dec4
f771463
 
 
 
 
 
 
 
 
 
 
 
a177196
f771463
 
 
 
 
a177196
 
 
 
 
 
 
 
 
 
 
 
 
 
f771463
 
 
 
 
 
 
 
 
a177196
 
 
f771463
 
 
a177196
 
 
f771463
 
 
 
 
 
 
 
 
 
a177196
 
 
 
f771463
 
 
 
214dec4
 
 
f771463
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214dec4
 
 
f771463
 
 
 
 
4ddc7ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
"""
Dashboard for showcasing extraction of text metrics with textdescriptives.

"""

from io import StringIO

import pandas as pd
import streamlit as st
import textdescriptives as td

from data_viewer import DataViewer
from process_text import text_to_metrics
from options import (
    all_model_size_options_pretty_to_short,
    available_model_size_options,
    language_options,
    metrics_options,
)

################
# Introduction #
################


# Header row: a wide column holding the page title and a narrow one for the logo.
col1, col2 = st.columns([9, 2])
col1.title("Extract Text Statistics")
col2.image(
    "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
    width=125,
)

# What the app does, including the installed textdescriptives version.
package_blurb = (
    "Calculate a large variety of statistics from text via the "
    "[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) python package "
    f"(v/{td.__version__}) and download the results as a .csv file. "
    "Includes descriptive statistics and metrics related to readability, "
    "information theory, text coherence and text quality."
)

# Where to find the app's source code and where to report problems.
feedback_blurb = (
    "The source code for this application can be found on [**GitHub**](https://github.com/HLasse/TextDescriptives_app). "
    "If you have feedback, please open an [issue](https://github.com/HLasse/textdescriptives_app/issues)."
)

for blurb in (package_blurb, feedback_blurb):
    st.write(blurb)

# Citation for the underlying package, rendered as small caption text.
citation = (
    "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
    "calculating a large variety of metrics from text. [Journal of Open Source Software, 8(84), "
    "5153, https://doi.org/10.21105/joss.05153](https://doi.org/10.21105/joss.05153)"
)
st.caption(citation)


############
# Settings #
############


# Input mode selector. It is created outside the form below, so the form is
# rebuilt with the matching input widget (text area or file uploader).
input_choice = st.radio(
    label="Input", options=["Enter text", "Upload file(s)"], index=0, horizontal=True
)

with st.form(key="settings_form"):
    split_by_line = st.checkbox(label="Split by newline", value=True)

    # Maps a source name (uploaded file name, or "input" for typed text) to
    # the text that should be analysed.
    file_name_to_text_string = {}

    if input_choice == "Upload file(s)":
        uploaded_files = st.file_uploader(
            label="Choose a .txt file", type=["txt"], accept_multiple_files=True
        )

        # The mapping stays empty until at least one file has been uploaded.
        if uploaded_files:
            # "utf-8-sig" decodes exactly like "utf-8" but drops a leading
            # byte-order mark, so it does not end up as part of the text.
            # Files sharing a name overwrite each other in this mapping.
            file_name_to_text_string = {
                file.name: file.getvalue().decode("utf-8-sig")
                for file in uploaded_files
            }

    else:
        default_text = """Hello, morning dew. The grass whispers low.
I'm here to dance. The gentle breeze does show.
Good morning, world. The birds sing in delight.
Let's spread our wings. The butterflies take flight.
Nature's chorus sings, a symphony of light."""

        file_name_to_text_string = {
            "input": st.text_area(
                label="Enter text", value=default_text, height=145, max_chars=None
            )
        }

    # Row of selectors
    col1, col2 = st.columns([1, 1])

    with col1:
        # Selection of language: pretty names are shown, the short code is
        # what gets passed on.
        languages = language_options()

        # NOTE(review): index=5 preselects the sixth entry of
        # language_options(); presumably English - confirm, and consider
        # looking the default up by name so reordering cannot change it.
        language_pretty = st.selectbox(
            label="Language",
            options=list(languages),
            index=5,
            key="language_selector",
        )

        language_short = languages[language_pretty]

    with col2:
        # Selection of model size. Every size is offered here regardless of
        # language; the language/size combination is validated after submit.
        model_size_pretty = st.selectbox(
            label="Model Size",
            options=available_model_size_options(lang="all"),
            index=0,
            key="size_selector",
        )

        model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]

    # Multiselection of metrics, with every available metric preselected
    all_metrics = metrics_options()
    metrics = st.multiselect(label="Metrics", options=all_metrics, default=all_metrics)

    st.write(
        "See the [**documentation**](https://hlasse.github.io/TextDescriptives/) for "
        "information on the available metrics."
    )

    # Clearing the multiselect leaves an empty list; pass None on instead.
    # NOTE(review): presumably None makes the extraction fall back to its
    # default set of metrics - confirm against text_to_metrics.
    if not metrics:
        metrics = None

    apply_settings_button = st.form_submit_button(label="Apply")


#############
# Apply NLP #
#############


# Runs only on the script pass triggered by clicking "Apply" in the form.
if apply_settings_button:
    if not file_name_to_text_string:
        # Only reachable in upload mode: typed text always yields an "input"
        # entry. Tell the user why nothing happened instead of staying silent.
        st.warning("Please upload at least one .txt file before clicking `Apply`.")
    elif model_size_pretty not in available_model_size_options(lang=language_short):
        st.write(
            "**Sorry!** The chosen *model size* is not available in this language. Please try another."
        )
    else:
        # File names are only passed on for uploaded files; typed text is
        # processed without a name.
        pass_filename = "Upload" in input_choice

        # Extract metrics for each text and stack the per-text results
        # row-wise into one frame with a fresh index
        output_df = pd.concat(
            [
                text_to_metrics(
                    string=string,
                    language_short=language_short,
                    model_size_short=model_size_short,
                    metrics=metrics,
                    split_by_line=split_by_line,
                    filename=filename if pass_filename else None,
                )
                for filename, string in file_name_to_text_string.items()
            ],
            ignore_index=True,
        )

        ###################
        # Present Results #
        ###################

        # Create 2 columns with 1) the output header
        # and 2) a download button.
        # This receives the frame before it is transposed for display below.
        DataViewer()._header_and_download(
            header="The calculated metrics",
            data=output_df,
            file_name="text_metrics.csv",
        )

        st.write("**Note**: This data frame has been transposed for readability.")
        # After transposing, the former column names become the first column
        # ("Metric"); the remaining column labels are converted to strings.
        output_df = output_df.transpose().reset_index()
        output_df.columns = ["Metric", *map(str, output_df.columns[1:])]
        st.dataframe(data=output_df, use_container_width=True)


############################
# Code For Reproducibility #
############################


# Snippet shown to the user for reproducing the analysis outside the app.
# It is only displayed as text here and never executed.
reproducibility_snippet = """
# Note: This is the code for a single text file
# The actual code is slightly more complex
# to allow processing multiple files at once

import textdescriptives as td

# Given a string of text and the settings
text = "..."
language = "..."
model_size = "..."
metrics = [...]
split_by_newline = True

# Remove whitespace from both ends of the string
text = text.strip()

# When asked, split by newlines
if split_by_newline:
    lines = text.split("\\n")
else:
    lines = [text]

# Remove empty lines
# E.g. due to consecutive newlines
lines = [l for l in lines if l]

# Extract metrics for each line
extracted_metrics = td.extract_metrics(
    text=lines,
    lang=language,
    spacy_model_size=model_size,
    metrics=metrics
)

"""

# Collapsed by default; the snippet is rendered as a python code block.
st.expander("See python code").code(reproducibility_snippet, language="python")

#######
# FAQ #
#######

st.subheader("Frequently Asked Questions (FAQ)")

# Each entry pairs a question (used as the expander label) with its answer text.
faq_entries = (
    (
        "What does the 'Split by newline' option do?",
        """
    When the `Split by newline` option is `enabled`, the metrics calculation is 
    performed separately for each paragraph. I.e. whenever there's a line break, 
    we split the text.

    When this option is `disabled`, the entire text is processed at once.
    """,
    ),
    (
        "Why do I get a warning/error message for certain languages or model sizes?",
        """
    Some combinations of languages, model sizes, and metrics are not currently supported in the app. 
    While we *are* working on this, you may currently see a red box
    with an error message after clicking `Apply`. 

    If you need this language and/or model size to work for your project, 
    please open an [issue](https://github.com/HLasse/textdescriptives_app/issues). 
    This may cause us to prioritize supporting your use case.
    """,
    ),
)

# One collapsed expander per question, in the order listed above.
for question, answer in faq_entries:
    with st.expander(question):
        st.write(answer)