HLasse committed on
Commit f771463 · 1 Parent(s): 2d07679

init upload

Files changed (5)
  1. README.md +27 -13
  2. app.py +208 -0
  3. data_viewer.py +23 -0
  4. options.py +112 -0
  5. requirements.txt +5 -0
README.md CHANGED
@@ -1,13 +1,27 @@
- ---
- title: Textdescriptives
- emoji: 📈
- colorFrom: green
- colorTo: red
- sdk: streamlit
- sdk_version: 1.19.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # TextDescriptives Demo
+
+ A Streamlit dashboard for extracting text metrics with TextDescriptives.
+
+
+ ## TODO
+
+ - [ ] Add license
+
+ - [ ] Host on Hugging Face / Streamlit Cloud
+
+ - [ ] Change the default text in the text box to something lighter :-)
+
+ - [ ] Ensure environment.yaml works - currently dependencies were just added manually. When it works, perhaps update the installation notes below.
+
+ - [ ] Ensure models are pre-downloaded to speed up inference (a possible approach is sketched after this diff)
+
+ - [ ] When the transformer models are supported, pre-install their dependencies so the installation does not happen at runtime.
+
+
+ ## Installation
+
+ ```shell
+ conda create --name textdesc python==3.11
+ pip install textdescriptives streamlit watchdog
+ streamlit run app.py
+ ```
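The model pre-download TODO above could be handled by a small helper that runs at build or startup time. A minimal sketch, assuming a hypothetical `download_models.py` script and an assumed choice of pipelines (neither is part of this commit); it uses spaCy's `spacy.cli.download`, the same function behind `python -m spacy download`:

```python
# download_models.py - hypothetical helper, not part of this commit.
# Fetches spaCy pipelines ahead of time so td.extract_metrics() does not
# have to download them while a user is waiting.
from spacy.cli.download import download

# Assumed defaults; extend with whichever languages/sizes the Space should serve.
MODELS = ["en_core_web_lg", "da_core_news_lg"]

for model in MODELS:
    download(model)  # equivalent to `python -m spacy download <model>`
```

The two models listed here mirror the ones pinned in requirements.txt below.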
app.py ADDED
@@ -0,0 +1,208 @@
+ """
+ Dashboard for showcasing extraction of text metrics with textdescriptives.
+
+ """
+
+ from io import StringIO
+
+ import numpy as np
+ import streamlit as st
+ import textdescriptives as td
+
+ from data_viewer import DataViewer
+ from options import (
+     all_model_size_options_pretty_to_short,
+     available_model_size_options,
+     language_options,
+     metrics_options,
+ )
+
+ ################
+ # Introduction #
+ ################
+
+
+ col1, col2 = st.columns([9, 2])
+ with col1:
+     st.title("Extract Text Statistics")
+ with col2:
+     st.image(
+         "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png"
+     )
+
+ st.write(
+     "Calculate a large variety of statistics from text via the "
+     "[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) Python package "
+     f"(v/{td.__version__}) and download the results as a .csv file. "
+     "Includes descriptive statistics and metrics related to readability, "
+     "information theory, text coherence and text quality."
+ )
+
+ st.caption(
+     "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
+     "calculating a large variety of statistics from text. "
+     "[arXiv preprint arXiv:2301.02057](https://arxiv.org/abs/2301.02057)"
+ )
+
+
+ ############
+ # Settings #
+ ############
+
+
+ input_choice = st.radio(
+     label="Input", options=["Enter text", "Upload file"], index=0, horizontal=True
+ )
+
+ with st.form(key="settings_form"):
+     split_by_line = st.checkbox(label="Split by newline", value=True)
+
+     string_data = None
+
+     if input_choice == "Upload file":
+         uploaded_file = st.file_uploader(
+             label="Choose a .txt file", type=["txt"], accept_multiple_files=False
+         )
+
+         if uploaded_file is not None:
+             # To convert to a string based IO:
+             string_data = StringIO(uploaded_file.getvalue().decode("utf-8")).read()
+
+     else:
+         default_text = """Little interest or pleasure in doing things?
+ Feeling down, depressed, or hopeless?
+ Trouble falling or staying asleep, or sleeping too much?
+ Feeling tired or having little energy?
+ Poor appetite or overeating?
+ Feeling bad about yourself - or that you are a failure or have let yourself or your family down?"""
+
+         string_data = st.text_area(
+             label="Enter text", value=default_text, height=170, max_chars=None
+         )
+
+     # Row of selectors
+     col1, col2 = st.columns([1, 1])
+
+     with col1:
+         # Selection of language
+         language_pretty = st.selectbox(
+             label="Language",
+             options=list(language_options().keys()),
+             index=5,
+             key="language_selector",
+         )
+
+         language_short = language_options()[language_pretty]
+
+     with col2:
+         # Selection of model size
+         model_size_pretty = st.selectbox(
+             label="Model Size",
+             options=available_model_size_options(lang="all"),
+             index=0,
+             key="size_selector",
+         )
+
+         model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]
+
+     # Multiselection of metrics
+     metrics = st.multiselect(
+         label="Metrics", options=metrics_options(), default=metrics_options()
+     )
+
+     st.write(
+         "See the [**documentation**](https://hlasse.github.io/TextDescriptives/) for "
+         "information on the available metrics."
+     )
+     # This shouldn't happen but better safe than sorry
+     if isinstance(metrics, list) and not metrics:
+         metrics = None
+
+     apply_settings_button = st.form_submit_button(label="Apply")
+
+
+ #############
+ # Apply NLP #
+ #############
+
+
+ if apply_settings_button and string_data is not None and string_data:
+     if model_size_pretty not in available_model_size_options(lang=language_short):
+         st.write(
+             "**Sorry!** The chosen *model size* is not available in this language. Please try another."
+         )
+     else:
+         # Clean and (optionally) split the text
+         string_data = string_data.strip()
+         if split_by_line:
+             string_data = string_data.split("\n")
+         else:
+             string_data = [string_data]
+
+         # Remove empty strings
+         # E.g. due to consecutive newlines
+         string_data = [s for s in string_data if s]
+
+         # Will automatically download the relevant model and extract all metrics
+         # TODO: Download beforehand to speed up inference
+         df = td.extract_metrics(
+             text=string_data,
+             lang=language_short,
+             spacy_model_size=model_size_short,
+             metrics=metrics,
+         )
+
+         ###################
+         # Present Results #
+         ###################
+
+         # Create 2 columns with 1) the output header
+         # and 2) a download button
+         DataViewer()._header_and_download(
+             header="The calculated metrics", data=df, file_name="text_metrics.csv"
+         )
+
+         st.write("**Note**: This data frame has been transposed for readability.")
+         df = df.transpose().reset_index()
+         df.columns = ["Metric"] + [str(c) for c in list(df.columns)[1:]]
+         st.dataframe(data=df, use_container_width=True)
+
+
+ ############################
+ # Code For Reproducibility #
+ ############################
+
+
+ with st.expander("See python code"):
+     st.code(
+         """
+ import textdescriptives as td
+
+ # Given a string of text and the settings
+ text = "..."
+ model_name = "..."
+ split_by_newline = True
+
+ # Remove whitespace from both ends of the string
+ text = text.strip()
+
+ # When asked, split by newlines
+ if split_by_newline:
+     lines = text.split("\\n")
+ else:
+     lines = [text]
+
+ # Remove empty lines
+ # E.g. due to consecutive newlines
+ lines = [l for l in lines if l]
+
+ # Extract metrics for each line
+ extracted_metrics = td.extract_metrics(
+     text=lines,
+     spacy_model=model_name
+ )
+
+ """,
+         language="python",
+         line_numbers=True,
+     )
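Because Streamlit reruns the script on every interaction, the `td.extract_metrics` call above is a natural candidate for caching. A minimal sketch, assuming Streamlit ≥ 1.18 (where `st.cache_data` is available); the wrapper name is hypothetical and not part of this commit:

```python
import streamlit as st
import textdescriptives as td


@st.cache_data
def cached_extract_metrics(texts, lang, size, metrics):
    # Identical inputs return the cached DataFrame instead of re-running spaCy.
    return td.extract_metrics(
        text=texts,
        lang=lang,
        spacy_model_size=size,
        metrics=metrics,
    )


# In app.py this would replace the direct call:
# df = cached_extract_metrics(string_data, language_short, model_size_short, metrics)
```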
data_viewer.py ADDED
@@ -0,0 +1,23 @@
+
+ import streamlit as st
+
+
+ class DataViewer:
+
+     # @st.cache_data
+     def _convert_df_to_csv(self, data, **kwargs):
+         return data.to_csv(**kwargs).encode('utf-8')
+
+     def _header_and_download(self, header, data, file_name, key=None, label="Download", help="Download data"):
+         col1, col2 = st.columns([9, 2])
+         with col1:
+             st.subheader(header)
+         with col2:
+             st.write("")
+             st.download_button(
+                 label=label,
+                 data=self._convert_df_to_csv(data),
+                 file_name=file_name,
+                 key=key,
+                 help=help
+             )
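The `@st.cache_data` decorator above is left commented out, likely because caching a bound method makes Streamlit try to hash `self` as well. A minimal sketch of one workaround, assuming Streamlit ≥ 1.18: parameters whose names start with an underscore are excluded from hashing, so renaming `self` to `_self` caches on the DataFrame alone. This variant is an illustration, not part of the commit:

```python
import streamlit as st


class CachedDataViewer:
    # Hypothetical variant of DataViewer._convert_df_to_csv.
    @st.cache_data
    def _convert_df_to_csv(_self, data):
        # The leading underscore tells Streamlit not to hash the instance;
        # only `data` contributes to the cache key.
        return data.to_csv().encode("utf-8")
```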
options.py ADDED
@@ -0,0 +1,112 @@
+ from typing import Dict, List, Set
+
+ from spacy.cli.download import get_compatibility
+
+
+ def metrics_options() -> List[str]:
+     return [
+         "descriptive_stats",
+         "readability",
+         "dependency_distance",
+         "pos_proportions",
+         "coherence",
+         "quality",
+     ]
+
+
+ def language_options() -> Dict[str, str]:
+     return {
+         "Catalan": "ca",
+         "Chinese": "zh",
+         "Croatian": "hr",
+         "Danish": "da",
+         "Dutch": "nl",
+         "English": "en",
+         "Finnish": "fi",
+         "French": "fr",
+         "German": "de",
+         "Greek": "el",
+         "Italian": "it",
+         "Japanese": "ja",
+         "Korean": "ko",
+         "Lithuanian": "lt",
+         "Macedonian": "mk",
+         "Multi-language": "xx",
+         "Norwegian Bokmål": "nb",
+         "Polish": "pl",
+         "Portuguese": "pt",
+         "Romanian": "ro",
+         "Russian": "ru",
+         "Spanish": "es",
+         "Swedish": "sv",
+         "Ukrainian": "uk",
+     }
+
+
+ #################
+ # Model options #
+ #################
+
+
+ def all_model_size_options_pretty_to_short() -> Dict[str, str]:
+     return {
+         "Small": "sm",
+         "Medium": "md",
+         "Large": "lg",
+         # "Transformer": "trf" # Disabled for now
+     }
+
+
+ def all_model_size_options_short_to_pretty() -> Dict[str, str]:
+     return {
+         short: pretty
+         for pretty, short in all_model_size_options_pretty_to_short().items()
+     }
+
+
+ def available_model_size_options(lang) -> List[str]:
+     short_to_pretty = all_model_size_options_short_to_pretty()
+     if lang == "all":
+         return sorted(list(short_to_pretty.values()))
+     return sorted(
+         [
+             short_to_pretty[short]
+             for short in ModelAvailabilityChecker.available_model_sizes_for_language(
+                 lang
+             )
+         ]
+     )
+
+
+ class ModelAvailabilityChecker:
+     @staticmethod
+     def available_models() -> List[str]:
+         return list(get_compatibility().keys())
+
+     @staticmethod
+     def extract_language_and_size() -> List[List[str]]:
+         # [["ca", "sm"], ["en", "lg"], ...]
+         return list(
+             [
+                 list(map(m.split("_").__getitem__, [0, -1]))
+                 for m in ModelAvailabilityChecker.available_models()
+             ]
+         )
+
+     @staticmethod
+     def model_is_available(lang: str, size: str) -> bool:
+         lang_and_size = set(
+             [
+                 "_".join(lang_size)
+                 for lang_size in ModelAvailabilityChecker.extract_language_and_size()
+             ]
+         )
+         return f"{lang}_{size}" in lang_and_size
+
+     @staticmethod
+     def available_model_sizes_for_language(lang: str) -> Set[str]:
+         return set([
+             size
+             for (lang_, size) in ModelAvailabilityChecker.extract_language_and_size()
+             if lang_ == lang and size in all_model_size_options_pretty_to_short().values()
+         ])
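For context, a short usage sketch of these helpers, mirroring how app.py resolves the dropdown selections. The printed values are illustrative; the availability checks query spaCy's online compatibility table, so they require network access:

```python
from options import (
    ModelAvailabilityChecker,
    available_model_size_options,
    language_options,
)

# Pretty UI label -> spaCy language code.
lang = language_options()["Danish"]  # "da"

# Sizes offered before a language is chosen (alphabetically sorted labels).
print(available_model_size_options(lang="all"))  # ['Large', 'Medium', 'Small']

# Sizes that actually exist for the chosen language.
print(available_model_size_options(lang=lang))

# The check app.py performs after the "Apply" button.
print(ModelAvailabilityChecker.model_is_available(lang, "sm"))
```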
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ textdescriptives>=2.4.4
+ streamlit>=1.17.0
+ watchdog
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0.tar.gz
+ https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.5.0/da_core_news_lg-3.5.0.tar.gz