Spaces:

HLasse
/

textdescriptives

Runtime error

App Files Files Community

HLasse committed on Apr 19, 2023

Commit

f771463

1 Parent(s): 2d07679

init upload

Browse files

Files changed (5) hide show

README.md +27 -13
app.py +208 -0
data_viewer.py +23 -0
options.py +112 -0
requirements.txt +5 -0

README.md CHANGED Viewed

@@ -1,13 +1,27 @@
----
-title: Textdescriptives
-emoji: 📈
-colorFrom: green
-colorTo: red
-sdk: streamlit
-sdk_version: 1.19.0
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# TextDescriptives Demo
+A streamlit dashboard for extracting text metrics with TextDescriptives.
+## TODO
+- [ ] Add license
+- [ ] Host on huggingface / streamlit cloud
+- [ ] Change default text in text box to something lighter :-)
+- [ ] Ensure environment.yaml works - currently just added stuff manually. When it works, perhaps update installation notes below.
+- [ ] Ensure models are pre-downloaded to speed up inference
+- [ ] When supporting the transformer models, we should pre-install the dependencies to avoid it happening at runtime.
+## Installation
+```shell
+conda create --name textdesc python==3.11
+pip install textdescriptives streamlit watchdog
+streamlit run app.py
+```

app.py ADDED Viewed

	@@ -0,0 +1,208 @@

+"""
+Dashboard for showcasing extraction of text metrics with textdescriptives.
+"""
+from io import StringIO
+import numpy as np
+import streamlit as st
+import textdescriptives as td
+from data_viewer import DataViewer
+from options import (
+    all_model_size_options_pretty_to_short,
+    available_model_size_options,
+    language_options,
+    metrics_options,
+)
+################
+# Introduction #
+################
+# Page header: the title gets the wide column, the package icon the narrow one.
+col1, col2 = st.columns([9, 2])
+with col1:
+    st.title("Extract Text Statistics")
+with col2:
+    st.image(
+        "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png"
+    )
+# Short description of the app, including the installed package version.
+st.write(
+    "Calculate a large variety of statistics from text via the "
+    "[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) python package "
+    f"(v/{td.__version__}) and download the results as a .csv file. "
+    "Includes descriptive statistics and metrics related to readability, "
+    "information theory, text coherence and text quality."
+)
+# Citation for the package.
+st.caption(
+    "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
+    "calculating a large variety of statistics from text. "
+    "[arXiv preprint arXiv:2301.02057](https://arxiv.org/abs/2301.02057)"
+)
+############
+# Settings #
+############
+# The input mode is chosen outside the form; the form below shows either the
+# file uploader or the text area depending on this choice.
+input_choice = st.radio(
+    label="Input", options=["Enter text", "Upload file"], index=0, horizontal=True
+)
+with st.form(key="settings_form"):
+    split_by_line = st.checkbox(label="Split by newline", value=True)
+    # Stays None until a file has been uploaded and decoded successfully
+    string_data = None
+    if input_choice == "Upload file":
+        uploaded_file = st.file_uploader(
+            label="Choose a .txt file", type=["txt"], accept_multiple_files=False
+        )
+        if uploaded_file is not None:
+            try:
+                # "utf-8-sig" decodes plain UTF-8 and also drops a leading
+                # byte-order mark, which would otherwise end up in the text
+                string_data = uploaded_file.getvalue().decode("utf-8-sig")
+            except UnicodeDecodeError:
+                # Report the problem instead of crashing the app;
+                # string_data stays None so nothing is processed
+                st.error(
+                    "The uploaded file could not be read as UTF-8 text. "
+                    "Please upload a UTF-8 encoded .txt file."
+                )
+    else:
+        default_text = """Little interest or pleasure in doing things?
+Feeling down, depressed, or hopeless?
+Trouble falling or staying asleep, or sleeping too much?
+Feeling tired or having little energy?
+Poor appetite or overeating?
+Feeling bad about yourself - or that you are a failure or have let yourself or your family down?"""
+        string_data = st.text_area(
+            label="Enter text", value=default_text, height=170, max_chars=None
+        )
+    # Row of selectors
+    col1, col2 = st.columns([1, 1])
+    with col1:
+        # Selection of language
+        # English is preselected by name, so the default does not silently
+        # change when entries are added to or removed from the options
+        languages = language_options()
+        language_names = list(languages)
+        language_pretty = st.selectbox(
+            label="Language",
+            options=language_names,
+            index=language_names.index("English"),
+            key="language_selector",
+        )
+        language_short = languages[language_pretty]
+    with col2:
+        # Selection of model size
+        # All sizes are offered here; whether the size exists for the chosen
+        # language is only checked after the form has been submitted
+        model_size_pretty = st.selectbox(
+            label="Model Size",
+            options=available_model_size_options(lang="all"),
+            index=0,
+            key="size_selector",
+        )
+        model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]
+    # Multiselection of metrics (everything is selected by default)
+    metrics = st.multiselect(
+        label="Metrics", options=metrics_options(), default=metrics_options()
+    )
+    st.write(
+        "See the [**documentation**](https://hlasse.github.io/TextDescriptives/) for "
+        "information on the available metrics."
+    )
+    # An empty selection is turned into None
+    # NOTE(review): presumably extract_metrics treats None as "all metrics" -
+    # confirm against the TextDescriptives documentation
+    if isinstance(metrics, list) and not metrics:
+        metrics = None
+    apply_settings_button = st.form_submit_button(label="Apply")
+#############
+# Apply NLP #
+#############
+# Run only when "Apply" was pressed and some text was provided
+# (string_data is None or empty otherwise)
+if apply_settings_button and string_data:
+    # Clean and (optionally) split the text
+    cleaned_text = string_data.strip()
+    texts = cleaned_text.split("\n") if split_by_line else [cleaned_text]
+    # Remove empty and whitespace-only strings
+    # E.g. due to consecutive newlines
+    texts = [text for text in texts if text.strip()]
+    if not texts:
+        # Nothing is left to analyse, e.g. the input consisted of whitespace only
+        st.write("**Sorry!** The input contains no text. Please enter some text.")
+    elif model_size_pretty not in available_model_size_options(lang=language_short):
+        st.write(
+            "**Sorry!** The chosen *model size* is not available in this language. Please try another."
+        )
+    else:
+        # Will automatically download the relevant model and extract the metrics
+        # TODO: Download beforehand to speed up inference
+        df = td.extract_metrics(
+            text=texts,
+            lang=language_short,
+            spacy_model_size=model_size_short,
+            metrics=metrics,
+        )
+        ###################
+        # Present Results #
+        ###################
+        # Create 2 columns with 1) the output header
+        # and 2) a download button
+        # The download receives the data frame as extracted; only the copy
+        # displayed below is transposed
+        DataViewer()._header_and_download(
+            header="The calculated metrics", data=df, file_name="text_metrics.csv"
+        )
+        st.write("**Note**: This data frame has been transposed for readability.")
+        df = df.transpose().reset_index()
+        # After reset_index the first column holds the metric names; the
+        # remaining column labels are converted to strings
+        df.columns = ["Metric"] + [str(c) for c in list(df.columns)[1:]]
+        st.dataframe(data=df, use_container_width=True)
+############################
+# Code For Reproducibility #
+############################
+# Static snippet that mirrors the preprocessing done above
+with st.expander("See python code"):
+    st.code(
+        """
+import textdescriptives as td
+# Given a string of text and the settings
+text = "..."
+model_name = "..."
+split_by_newline = True
+# Remove whitespace from both ends of the string
+text = text.strip()
+# When asked, split by newlines
+if split_by_newline:
+    lines = text.split("\\n")
+else:
+    lines = [text]
+# Remove empty and whitespace-only lines
+# E.g. due to consecutive newlines
+lines = [l for l in lines if l.strip()]
+# Extract metrics for each line
+extracted_metrics = td.extract_metrics(
+    text=lines,
+    spacy_model=model_name
+)
+""",
+        language="python",
+        line_numbers=True,
+    )

data_viewer.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import streamlit as st
+class DataViewer:
+    """Streamlit helpers for showing a header next to a CSV download button."""
+    def _convert_df_to_csv(self, data, **kwargs):
+        """Return `data.to_csv(**kwargs)` encoded as UTF-8 bytes."""
+        csv_text = data.to_csv(**kwargs)
+        return csv_text.encode("utf-8")
+    def _header_and_download(self, header, data, file_name, key=None, label="Download", help="Download data"):
+        """Render `header` as a subheader with a download button for `data` beside it.
+
+        `data` is converted with `_convert_df_to_csv` using its default
+        arguments; `file_name`, `key`, `label` and `help` are passed on to
+        `st.download_button`.
+        """
+        header_col, button_col = st.columns([9, 2])
+        with header_col:
+            st.subheader(header)
+        with button_col:
+            # Empty element written above the button
+            st.write("")
+            csv_bytes = self._convert_df_to_csv(data)
+            st.download_button(
+                label=label,
+                data=csv_bytes,
+                file_name=file_name,
+                key=key,
+                help=help,
+            )

options.py ADDED Viewed

	@@ -0,0 +1,112 @@

+from typing import Dict, List, Set
+from spacy.cli.download import get_compatibility
+def metrics_options() -> List[str]:
+    """Return the names of the selectable metric groups as a new list."""
+    metric_groups = (
+        "descriptive_stats",
+        "readability",
+        "dependency_distance",
+        "pos_proportions",
+        "coherence",
+        "quality",
+    )
+    return list(metric_groups)
+def language_options() -> Dict[str, str]:
+    """Map the display name of each language to its short code.
+
+    The entries keep the order in which they are listed here.
+    """
+    name_code_pairs = [
+        ("Catalan", "ca"),
+        ("Chinese", "zh"),
+        ("Croatian", "hr"),
+        ("Danish", "da"),
+        ("Dutch", "nl"),
+        ("English", "en"),
+        ("Finnish", "fi"),
+        ("French", "fr"),
+        ("German", "de"),
+        ("Greek", "el"),
+        ("Italian", "it"),
+        ("Japanese", "ja"),
+        ("Korean", "ko"),
+        ("Lithuanian", "lt"),
+        ("Macedonian", "mk"),
+        ("Multi-language", "xx"),
+        ("Norwegian Bokmål", "nb"),
+        ("Polish", "pl"),
+        ("Portuguese", "pt"),
+        ("Romanian", "ro"),
+        ("Russian", "ru"),
+        ("Spanish", "es"),
+        ("Swedish", "sv"),
+        ("Ukrainian", "uk"),
+    ]
+    return dict(name_code_pairs)
+#################
+# Model options #
+#################
+def all_model_size_options_pretty_to_short() -> Dict[str, str]:
+    """Map the display name of each model size to its short suffix."""
+    sizes = dict(Small="sm", Medium="md", Large="lg")
+    # The "Transformer" -> "trf" option is disabled for now
+    return sizes
+def all_model_size_options_short_to_pretty() -> Dict[str, str]:
+    """Return the inverse of `all_model_size_options_pretty_to_short()`."""
+    pretty_to_short = all_model_size_options_pretty_to_short()
+    return dict(zip(pretty_to_short.values(), pretty_to_short.keys()))
+def available_model_size_options(lang) -> List[str]:
+    """Return the display names of the model sizes for `lang`, sorted alphabetically.
+
+    The special value "all" returns every known size without consulting
+    `ModelAvailabilityChecker`.
+    """
+    pretty_names = all_model_size_options_short_to_pretty()
+    if lang != "all":
+        sizes = ModelAvailabilityChecker.available_model_sizes_for_language(lang)
+        return sorted(pretty_names[size] for size in sizes)
+    return sorted(pretty_names.values())
+class ModelAvailabilityChecker:
+    """Answer which model packages exist per language and size.
+
+    Every method goes through `get_compatibility()` from `spacy.cli.download`;
+    nothing is cached in this class.
+    NOTE(review): presumably `get_compatibility()` fetches spaCy's online
+    compatibility table on each call - confirm before calling this in a loop.
+    """
+    @staticmethod
+    def available_models() -> List[str]:
+        """Return the keys of the mapping returned by `get_compatibility()`."""
+        return list(get_compatibility())
+    @staticmethod
+    def extract_language_and_size() -> List[List[str]]:
+        """Return `[first, last]` of the underscore-separated parts of each model name."""
+        # [["ca", "sm"], ["en", "lg"], ...]
+        split_names = (
+            name.split("_") for name in ModelAvailabilityChecker.available_models()
+        )
+        return [[parts[0], parts[-1]] for parts in split_names]
+    @staticmethod
+    def model_is_available(lang: str, size: str) -> bool:
+        """Return True when some model name starts with `lang` and ends with `size`."""
+        return [lang, size] in ModelAvailabilityChecker.extract_language_and_size()
+    @staticmethod
+    def available_model_sizes_for_language(lang: str) -> Set[str]:
+        """Return the size suffixes found for `lang`, limited to the sizes the app offers."""
+        # Built once here instead of once per model name
+        supported_sizes = set(all_model_size_options_pretty_to_short().values())
+        return {
+            size
+            for lang_, size in ModelAvailabilityChecker.extract_language_and_size()
+            if lang_ == lang and size in supported_sizes
+        }

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+textdescriptives>=2.4.4
+streamlit>=1.17.0
+watchdog
+https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0.tar.gz
+https://github.com/explosion/spacy-models/releases/download/da_core_news_lg-3.5.0/da_core_news_lg-3.5.0.tar.gz