Spaces:

HLasse
/

textdescriptives

Runtime error

File size: 7,488 Bytes

"""
Dashboard for showcasing extraction of text metrics with textdescriptives.

"""

from io import StringIO

import pandas as pd
import streamlit as st
import textdescriptives as td

from data_viewer import DataViewer
from process_text import text_to_metrics
from options import (
    all_model_size_options_pretty_to_short,
    available_model_size_options,
    language_options,
    metrics_options,
)

################
# Introduction #
################


# Page header: the title goes in the wide left column, the package icon in
# the narrow right one.
title_column, icon_column = st.columns([9, 2])
title_column.title("Extract Text Statistics")
icon_column.image(
    "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
    width=125,
)

# Short description of the app; embeds the version of the imported package.
package_blurb = (
    "Calculate a large variety of statistics from text via the "
    "[**TextDescriptives**](https://github.com/HLasse/TextDescriptives) python package "
    f"(v/{td.__version__}) and download the results as a .csv file. "
    "Includes descriptive statistics and metrics related to readability, "
    "information theory, text coherence and text quality."
)
st.write(package_blurb)

# Links to the app's source repository and its issue tracker.
feedback_blurb = (
    "The source code for this application can be found on [**GitHub**](https://github.com/HLasse/TextDescriptives_app). "
    "If you have feedback, please open an [issue](https://github.com/HLasse/textdescriptives_app/issues)."
)
st.write(feedback_blurb)

# Citation for the package, rendered as small caption text.
package_citation = (
    "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
    "calculating a large variety of metrics from text. [Journal of Open Source Software, 8(84), "
    "5153, https://doi.org/10.21105/joss.05153](https://doi.org/10.21105/joss.05153)"
)
st.caption(package_citation)


############
# Settings #
############


# The input mode is chosen outside the form because it decides which input
# widget the form renders below.
input_choice = st.radio(
    label="Input", options=["Enter text", "Upload file(s)"], index=0, horizontal=True
)

with st.form(key="settings_form"):
    split_by_line = st.checkbox(label="Split by newline", value=True)

    # Maps a source name (the uploaded file's name, or "input" for typed
    # text) to the text that should be analysed.
    file_name_to_text_string = {}

    if input_choice == "Upload file(s)":
        uploaded_files = st.file_uploader(
            label="Choose a .txt file", type=["txt"], accept_multiple_files=True
        )

        if uploaded_files:
            # "utf-8-sig" decodes ordinary UTF-8 and additionally drops a
            # leading byte-order mark, which some editors prepend. With plain
            # "utf-8" the BOM would stay in the text as U+FEFF and end up in
            # the first token.
            file_name_to_text_string = {
                uploaded_file.name: uploaded_file.getvalue().decode("utf-8-sig")
                for uploaded_file in uploaded_files
            }

    else:
        default_text = """Hello, morning dew. The grass whispers low.
I'm here to dance. The gentle breeze does show.
Good morning, world. The birds sing in delight.
Let's spread our wings. The butterflies take flight.
Nature's chorus sings, a symphony of light."""

        file_name_to_text_string = {
            "input": st.text_area(
                label="Enter text", value=default_text, height=145, max_chars=None
            )
        }

    # Row of selectors
    col1, col2 = st.columns([1, 1])

    with col1:
        # Selection of language. The pretty name shown in the widget is
        # mapped back to the short code through the same lookup table.
        languages = language_options()
        language_pretty = st.selectbox(
            label="Language",
            options=list(languages),
            # NOTE(review): preselects the sixth entry; which language that is
            # depends on the ordering in the options module - confirm.
            index=5,
            key="language_selector",
        )

        language_short = languages[language_pretty]

    with col2:
        # Selection of model size. Sizes are listed for lang="all"; whether
        # the chosen size exists for the chosen language is checked only
        # after the form has been submitted.
        model_size_pretty = st.selectbox(
            label="Model Size",
            options=available_model_size_options(lang="all"),
            index=0,
            key="size_selector",
        )

        model_size_short = all_model_size_options_pretty_to_short()[model_size_pretty]

    # Multiselection of metrics; everything is selected by default.
    all_metrics = metrics_options()
    metrics = st.multiselect(label="Metrics", options=all_metrics, default=all_metrics)

    st.write(
        "See the [**documentation**](https://hlasse.github.io/TextDescriptives/) for "
        "information on the available metrics."
    )

    # If the user removes every metric the multiselect yields an empty list,
    # which is passed on as None instead.
    # NOTE(review): presumably None makes text_to_metrics fall back to its
    # default metric set - confirm against process_text.
    if isinstance(metrics, list) and not metrics:
        metrics = None

    apply_settings_button = st.form_submit_button(label="Apply")


#############
# Apply NLP #
#############


if apply_settings_button:
    # Require at least one input that contains non-whitespace text. Without
    # this, submitting with no uploads does nothing at all, and blank text is
    # handed to the extraction step as-is.
    has_text = any(text.strip() for text in file_name_to_text_string.values())

    if not has_text:
        st.warning(
            "Please enter some text or upload at least one non-empty .txt file."
        )
    elif model_size_pretty not in available_model_size_options(lang=language_short):
        st.write(
            "**Sorry!** The chosen *model size* is not available in this language. Please try another."
        )
    else:
        # File names are only passed along for uploads; typed text is stored
        # under the placeholder key "input" and gets no filename.
        include_filename = input_choice == "Upload file(s)"

        # Extract metrics for each text and stack the per-text frames
        per_text_frames = [
            text_to_metrics(
                string=text,
                language_short=language_short,
                model_size_short=model_size_short,
                metrics=metrics,
                split_by_line=split_by_line,
                filename=filename if include_filename else None,
            )
            for filename, text in file_name_to_text_string.items()
        ]
        output_df = pd.concat(per_text_frames, ignore_index=True)

        ###################
        # Present Results #
        ###################

        # Create 2 columns with 1) the output header
        # and 2) a download button.
        # The frame is handed over before it is transposed below, so the
        # download receives the untransposed data.
        DataViewer()._header_and_download(
            header="The calculated metrics",
            data=output_df,
            file_name="text_metrics.csv",
        )

        st.write("**Note**: This data frame has been transposed for readability.")
        # After transposing, the former column names become the first column
        # ("Metric"); the remaining column labels are converted to strings.
        output_df = output_df.transpose().reset_index()
        output_df.columns = ["Metric", *(str(c) for c in output_df.columns[1:])]
        st.dataframe(data=output_df, use_container_width=True)


############################
# Code For Reproducibility #
############################


# Example code displayed verbatim to the user inside a collapsible section.
# It is only shown, never executed by this script.
reproducibility_snippet = """
# Note: This is the code for a single text file
# The actual code is slightly more complex
# to allow processing multiple files at once

import textdescriptives as td

# Given a string of text and the settings
text = "..."
language = "..."
model_size = "..."
metrics = [...]
split_by_newline = True

# Remove whitespace from both ends of the string
text = text.strip()

# When asked, split by newlines
if split_by_newline:
    lines = text.split("\\n")
else:
    lines = [text]

# Remove empty lines
# E.g. due to consecutive newlines
lines = [l for l in lines if l]

# Extract metrics for each line
extracted_metrics = td.extract_metrics(
    text=lines,
    lang=language,
    spacy_model_size=model_size,
    metrics=metrics
)

"""

with st.expander("See python code"):
    st.code(reproducibility_snippet, language="python")

#######
# FAQ #
#######

# Question/answer pairs for the FAQ; each pair is rendered as one
# collapsible section, in the order listed here.
faq_entries = (
    (
        "What does the 'Split by newline' option do?",
        """
    When the `Split by newline` option is `enabled`, the metrics calculation is 
    performed separately for each paragraph. I.e. whenever there's a line break, 
    we split the text.

    When this option is `disabled`, the entire text is processed at once.
    """,
    ),
    (
        "Why do I get a warning/error message for certain languages or model sizes?",
        """
    Some combinations of languages, model sizes, and metrics are not currently supported in the app. 
    While we *are* working on this, you may currently see a red box
    with an error message after clicking `Apply`. 

    If you need this language and/or model size to work for your project, 
    please open an [issue](https://github.com/HLasse/textdescriptives_app/issues). 
    This may cause us to prioritize supporting your use case.
    """,
    ),
)

st.subheader("Frequently Asked Questions (FAQ)")

for faq_question, faq_answer in faq_entries:
    with st.expander(faq_question):
        st.write(faq_answer)