Spaces:
Runtime error
Runtime error
init upload
Browse files- README.md +27 -13
- app.py +208 -0
- data_viewer.py +23 -0
- options.py +112 -0
- requirements.txt +5 -0
README.md
CHANGED
@@ -1,13 +1,27 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# TextDescriptives Demo
|
2 |
+
|
3 |
+
A streamlit dashboard for extracting text metrics with TextDescriptives.
|
4 |
+
|
5 |
+
|
6 |
+
## TODO
|
7 |
+
|
8 |
+
- [ ] Add license
|
9 |
+
|
10 |
+
- [ ] Host on Hugging Face / Streamlit Cloud
|
11 |
+
|
12 |
+
- [ ] Change the default text in the text box to something lighter :-)
|
13 |
+
|
14 |
+
- [ ] Ensure `environment.yaml` works - so far, dependencies have just been added manually. Once it works, perhaps update the installation notes below.
|
15 |
+
|
16 |
+
- [ ] Ensure models are pre-downloaded to speed up inference
|
17 |
+
|
18 |
+
- [ ] When supporting the transformer models, pre-install their dependencies so they are not installed at runtime.
|
19 |
+
|
20 |
+
|
21 |
+
## Installation
|
22 |
+
|
23 |
+
```shell
|
24 |
+
conda create --name textdesc python=3.11
conda activate textdesc
|
25 |
+
pip install textdescriptives streamlit watchdog
|
26 |
+
streamlit run app.py
|
27 |
+
```
|
app.py
ADDED
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Dashboard for showcasing extraction of text metrics with textdescriptives.
|
3 |
+
|
4 |
+
"""
|
5 |
+
|
6 |
+
from io import StringIO
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import streamlit as st
|
10 |
+
import textdescriptives as td
|
11 |
+
|
12 |
+
from data_viewer import DataViewer
|
13 |
+
from options import (
|
14 |
+
all_model_size_options_pretty_to_short,
|
15 |
+
available_model_size_options,
|
16 |
+
language_options,
|
17 |
+
metrics_options,
|
18 |
+
)
|
19 |
+
|
20 |
+
################
|
21 |
+
# Introduction #
|
22 |
+
################
|
23 |
+
|
24 |
+
|
25 |
+
# Page header: the title in a wide left column, the package logo in a
# narrow right column.
col1, col2 = st.columns([9, 2])
with col1:
    st.title("Extract Text Statistics")
with col2:
    st.image(
        "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png"
    )

# Short description of the app. The version of the imported textdescriptives
# package is shown inline. (The sentence previously had a stray period
# before "and download".)
st.write(
    "Calculate a large variety of statistics from text via the "
    "[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) python package "
    f"(v/{td.__version__}) and download the results as a .csv file. "
    "Includes descriptive statistics and metrics related to readability, "
    "information theory, text coherence and text quality."
)

# Citation for the package.
st.caption(
    "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
    "calculating a large variety of statistics from text. "
    "[arXiv preprint arXiv:2301.02057](https://arxiv.org/abs/2301.02057)"
)
|
46 |
+
|
47 |
+
|
48 |
+
############
|
49 |
+
# Settings #
|
50 |
+
############
|
51 |
+
|
52 |
+
|
53 |
+
# The input source is chosen outside the form; everything else is collected
# inside the form and submitted with the "Apply" button.
input_choice = st.radio(
    label="Input",
    options=["Enter text", "Upload file"],
    index=0,
    horizontal=True,
)

with st.form(key="settings_form"):
    split_by_line = st.checkbox(label="Split by newline", value=True)

    # Stays None when "Upload file" is chosen but no file has been provided.
    string_data = None

    if input_choice != "Upload file":
        default_text = """Little interest or pleasure in doing things?
Feeling down, depressed, or hopeless?
Trouble falling or staying asleep, or sleeping too much?
Feeling tired or having little energy?
Poor appetite or overeating?
Feeling bad about yourself - or that you are a failure or have let yourself or your family down?"""

        string_data = st.text_area(
            label="Enter text", value=default_text, height=170, max_chars=None
        )
    else:
        uploaded_file = st.file_uploader(
            label="Choose a .txt file", type=["txt"], accept_multiple_files=False
        )
        if uploaded_file is not None:
            # The uploaded content is decoded from UTF-8 bytes into a string.
            string_data = uploaded_file.getvalue().decode("utf-8")

    # Row of selectors
    col1, col2 = st.columns([1, 1])

    with col1:
        # Selection of language; index 5 is "English" in language_options().
        language_lookup = language_options()
        language_pretty = st.selectbox(
            label="Language",
            options=list(language_lookup),
            index=5,
            key="language_selector",
        )
        language_short = language_lookup[language_pretty]

    with col2:
        # Selection of model size. Every size is listed here; whether the
        # size exists for the chosen language is checked after submission.
        model_size_pretty = st.selectbox(
            label="Model Size",
            options=available_model_size_options(lang="all"),
            index=0,
            key="size_selector",
        )
        model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]

    # Multiselection of metrics; all of them are selected by default.
    metrics = st.multiselect(
        label="Metrics", options=metrics_options(), default=metrics_options()
    )

    st.write(
        "See the [**documentation**](https://hlasse.github.io/TextDescriptives/) for "
        "information on the available metrics."
    )

    # An empty selection is replaced by None before it is handed on.
    # This shouldn't happen but better safe than sorry.
    if isinstance(metrics, list) and not metrics:
        metrics = None

    apply_settings_button = st.form_submit_button(label="Apply")
|
122 |
+
|
123 |
+
|
124 |
+
#############
|
125 |
+
# Apply NLP #
|
126 |
+
#############
|
127 |
+
|
128 |
+
|
129 |
+
# Run the extraction only when "Apply" was pressed and some text is available.
if apply_settings_button and string_data:
    if model_size_pretty not in available_model_size_options(lang=language_short):
        st.write(
            "**Sorry!** The chosen *model size* is not available in this language. Please try another."
        )
    else:
        # Clean and (optionally) split the text
        string_data = string_data.strip()
        if split_by_line:
            string_data = string_data.split("\n")
        else:
            string_data = [string_data]

        # Remove blank entries (e.g. due to consecutive newlines or lines
        # holding only whitespace) and the trailing "\r" that Windows line
        # endings leave behind after splitting on "\n".
        string_data = [s.rstrip("\r") for s in string_data if s.strip()]

        if not string_data:
            # Whitespace-only input: there is nothing to analyse, so say so
            # instead of passing an empty list to the extraction.
            st.write(
                "**Sorry!** No text was found in the input. "
                "Please enter some text or upload a non-empty file."
            )
        else:
            # Will automatically download the relevant model and extract all metrics
            # TODO: Download beforehand to speed up inference
            df = td.extract_metrics(
                text=string_data,
                lang=language_short,
                spacy_model_size=model_size_short,
                metrics=metrics,
            )

            ###################
            # Present Results #
            ###################

            # Create 2 columns with 1) the output header
            # and 2) a download button (offering the untransposed data)
            DataViewer()._header_and_download(
                header="The calculated metrics", data=df, file_name="text_metrics.csv"
            )

            st.write("**Note**: This data frame has been transposed for readability.")
            # After transposing, the first column holds the metric names and
            # the remaining column labels are converted to strings.
            df = df.transpose().reset_index()
            df.columns = ["Metric"] + [str(c) for c in list(df.columns)[1:]]
            st.dataframe(data=df, use_container_width=True)
|
169 |
+
|
170 |
+
|
171 |
+
############################
|
172 |
+
# Code For Reproducibility #
|
173 |
+
############################
|
174 |
+
|
175 |
+
|
176 |
+
# Collapsible section showing a Python snippet that mirrors the steps above.
# The snippet is displayed only, never executed, so its nested lines must be
# indented for the shown example to be valid Python.
with st.expander("See python code"):
    st.code(
        """
import textdescriptives as td

# Given a string of text and the settings
text = "..."
model_name = "..."
split_by_newline = True

# Remove whitespace from both ends of the string
text = text.strip()

# When asked, split by newlines
if split_by_newline:
    lines = text.split("\\n")
else:
    lines = [text]

# Remove empty lines
# E.g. due to consecutive newlines
lines = [l for l in lines if l]

# Extract metrics for each line
extracted_metrics = td.extract_metrics(
    text=lines,
    spacy_model=model_name
)

""",
        language="python",
        line_numbers=True,
    )
|
data_viewer.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
|
5 |
+
class DataViewer:
    """Streamlit helpers for showing a header next to a CSV download button."""

    # NOTE: caching with ``st.cache_data`` is currently switched off here.
    def _convert_df_to_csv(self, data, **kwargs):
        """Return ``data`` serialised as CSV and encoded to UTF-8 bytes.

        ``data`` must provide a ``to_csv`` method; ``kwargs`` are forwarded
        to it unchanged.
        """
        csv_text = data.to_csv(**kwargs)
        return csv_text.encode('utf-8')

    def _header_and_download(self, header, data, file_name, key=None, label="Download", help="Download data"):
        """Render a subheader and, beside it, a button that downloads ``data`` as CSV.

        ``header`` is the subheader text and ``file_name`` the name offered
        for the download. ``key``, ``label`` and ``help`` are passed straight
        to ``st.download_button``.
        """
        header_col, button_col = st.columns([9, 2])

        with header_col:
            st.subheader(header)

        with button_col:
            # Empty element above the button - presumably a spacer for
            # vertical alignment with the header.
            st.write("")
            button_settings = dict(
                label=label,
                data=self._convert_df_to_csv(data),
                file_name=file_name,
                key=key,
                help=help,
            )
            st.download_button(**button_settings)
|
options.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, List, Set
|
2 |
+
|
3 |
+
from spacy.cli.download import get_compatibility
|
4 |
+
|
5 |
+
|
6 |
+
def metrics_options() -> List[str]:
    """Return the names of the metric groups that can be selected.

    A fresh list is created on every call.
    """
    metric_groups = (
        "descriptive_stats",
        "readability",
        "dependency_distance",
        "pos_proportions",
        "coherence",
        "quality",
    )
    return list(metric_groups)
|
15 |
+
|
16 |
+
|
17 |
+
def language_options() -> Dict[str, str]:
    """Map the display name of each offered language to its two-letter code.

    The entries are listed alphabetically by display name and the returned
    dict keeps that order.
    """
    name_code_pairs = [
        ("Catalan", "ca"),
        ("Chinese", "zh"),
        ("Croatian", "hr"),
        ("Danish", "da"),
        ("Dutch", "nl"),
        ("English", "en"),
        ("Finnish", "fi"),
        ("French", "fr"),
        ("German", "de"),
        ("Greek", "el"),
        ("Italian", "it"),
        ("Japanese", "ja"),
        ("Korean", "ko"),
        ("Lithuanian", "lt"),
        ("Macedonian", "mk"),
        ("Multi-language", "xx"),
        ("Norwegian Bokmål", "nb"),
        ("Polish", "pl"),
        ("Portuguese", "pt"),
        ("Romanian", "ro"),
        ("Russian", "ru"),
        ("Spanish", "es"),
        ("Swedish", "sv"),
        ("Ukrainian", "uk"),
    ]
    return dict(name_code_pairs)
|
44 |
+
|
45 |
+
|
46 |
+
#################
|
47 |
+
# Model options #
|
48 |
+
#################
|
49 |
+
|
50 |
+
|
51 |
+
def all_model_size_options_pretty_to_short() -> Dict[str, str]:
    """Map the display name of each offered model size to its short code."""
    # "Transformer" ("trf") is disabled for now.
    return dict(Small="sm", Medium="md", Large="lg")
|
58 |
+
|
59 |
+
|
60 |
+
def all_model_size_options_short_to_pretty() -> Dict[str, str]:
    """Return the inverse of ``all_model_size_options_pretty_to_short``.

    Keys are the short size codes, values the display names.
    """
    inverted: Dict[str, str] = {}
    for pretty_name, short_code in all_model_size_options_pretty_to_short().items():
        inverted[short_code] = pretty_name
    return inverted
|
65 |
+
|
66 |
+
|
67 |
+
def available_model_size_options(lang) -> List[str]:
    """Return the sorted display names of the model sizes offered for ``lang``.

    The special value ``"all"`` returns every known size without any
    availability lookup. Any other value is passed to
    ``ModelAvailabilityChecker.available_model_sizes_for_language``.
    """
    pretty_by_short = all_model_size_options_short_to_pretty()

    if lang != "all":
        size_codes = ModelAvailabilityChecker.available_model_sizes_for_language(lang)
        return sorted(pretty_by_short[code] for code in size_codes)

    return sorted(pretty_by_short.values())
|
79 |
+
|
80 |
+
|
81 |
+
class ModelAvailabilityChecker:
    """Static helpers that answer availability questions from the keys of
    spaCy's ``get_compatibility()`` table."""

    @staticmethod
    def available_models() -> List[str]:
        """Return the keys of ``get_compatibility()`` as a list of model names."""
        compatibility = get_compatibility()
        return list(compatibility.keys())

    @staticmethod
    def extract_language_and_size() -> List[List[str]]:
        """Return ``[first, last]`` underscore-separated parts of every model name.

        The result looks like ``[["ca", "sm"], ["en", "lg"], ...]``.
        """
        pairs: List[List[str]] = []
        for model_name in ModelAvailabilityChecker.available_models():
            parts = model_name.split("_")
            pairs.append([parts[0], parts[-1]])
        return pairs

    @staticmethod
    def model_is_available(lang: str, size: str) -> bool:
        """Tell whether some model name starts with ``lang`` and ends with ``size``."""
        wanted = f"{lang}_{size}"
        known = {
            "_".join(pair)
            for pair in ModelAvailabilityChecker.extract_language_and_size()
        }
        return wanted in known

    @staticmethod
    def available_model_sizes_for_language(lang: str) -> Set[str]:
        """Return the short size codes found for ``lang``.

        Only sizes that appear among the values of
        ``all_model_size_options_pretty_to_short()`` are kept.
        """
        offered_sizes = all_model_size_options_pretty_to_short().values()
        return {
            size
            for lang_, size in ModelAvailabilityChecker.extract_language_and_size()
            if lang_ == lang and size in offered_sizes
        }
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
textdescriptives>=2.4.4
|
2 |
+
streamlit>=1.17.0
|
3 |
+
watchdog
|
4 |
+
https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0.tar.gz
|
5 |
+
https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.5.0/da_core_news_lg-3.5.0.tar.gz
|