import json

import streamlit as st
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from st_social_media_links import SocialMediaIcons

PARAMS_COLUMN_NAME = "Params"
RESULTS_COLUMN_NAME = "Results"
AVERAGE_COLUMN_NAME = "Average"
SENTIMENT_COLUMN_NAME = "Sentiment"
UNDERSTANDING_COLUMN_NAME = "Language understanding"
PHRASEOLOGY_COLUMN_NAME = "Phraseology"
TRICKY_QUESTIONS_COLUMN_NAME = "Tricky questions"
IMPLICATURES_AVERAGE_COLUMN_NAME = "Implicatures average"

# Function to load data from the JSON file
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    df = pd.DataFrame(data)
    df[AVERAGE_COLUMN_NAME] = df[[SENTIMENT_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME,
                                  PHRASEOLOGY_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME]].mean(axis=1)
    df[IMPLICATURES_AVERAGE_COLUMN_NAME] = df[[SENTIMENT_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME,
                                               PHRASEOLOGY_COLUMN_NAME]].mean(axis=1)
    return df
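
# For reference, a minimal, hypothetical sketch of what a single record in 'data.json'
# is assumed to look like (keys inferred from the column constants above and from the
# "7B"-style Params strings parsed further below; values are purely illustrative):
#
#   [
#     {
#       "Model": "provider/model-name-7B",
#       "Params": "7B",
#       "Sentiment": 3.8,
#       "Language understanding": 4.1,
#       "Phraseology": 3.5,
#       "Tricky questions": 2.9
#     }
#   ]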

# Function to prepare the DataFrame for display (adds the results list and reorders columns)
def style_dataframe(df: pd.DataFrame):
    df[RESULTS_COLUMN_NAME] = df.apply(lambda row: [
        row[SENTIMENT_COLUMN_NAME], row[UNDERSTANDING_COLUMN_NAME],
        row[PHRASEOLOGY_COLUMN_NAME], row[TRICKY_QUESTIONS_COLUMN_NAME]], axis=1)
    cols = list(df.columns)
    # Move the average column right after the params column
    cols.insert(cols.index(PARAMS_COLUMN_NAME) + 1,
                cols.pop(cols.index(AVERAGE_COLUMN_NAME)))
    # Move the implicatures average column right after the average column
    cols.insert(cols.index(AVERAGE_COLUMN_NAME) + 1,
                cols.pop(cols.index(IMPLICATURES_AVERAGE_COLUMN_NAME)))
    # Move the results column right after the implicatures average column
    cols.insert(cols.index(IMPLICATURES_AVERAGE_COLUMN_NAME) + 1,
                cols.pop(cols.index(RESULTS_COLUMN_NAME)))
    df = df[cols]
    return df

# Function to apply color gradients and number formatting to the DataFrame
def styler(df: pd.DataFrame):
    score_columns = [AVERAGE_COLUMN_NAME, IMPLICATURES_AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME,
                     UNDERSTANDING_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME]
    # Color ramp for the score columns (higher scores = greener)
    palette = sns.color_palette("RdYlGn", as_cmap=True)
    # Reversed color ramp for the "Params" column (smaller models = greener)
    params_palette = sns.color_palette("RdYlGn_r", as_cmap=True)
    styled_df = df.style.background_gradient(cmap=palette, subset=score_columns
                        ).background_gradient(cmap=params_palette, subset=[PARAMS_COLUMN_NAME]
                        ).set_properties(**{'text-align': 'center'}, subset=score_columns
                        ).format("{:.2f}".center(10), subset=score_columns
                        ).format("{:.1f}".center(10), subset=[PARAMS_COLUMN_NAME])
    return styled_df

# Streamlit app
st.set_page_config(layout="wide")

st.markdown("""
<style>
    .block-container {
        padding-top: 0%;
        padding-bottom: 0%;
        padding-left: 3%;
        padding-right: 3%;
        scrollbar-width: thin;
    }
</style>
""", unsafe_allow_html=True)

# Prepare layout
st.markdown("""
<style>
    .center {
        display: block;
        margin-left: auto;
        margin-right: auto;
        width: 50%;
    }
    .center-text {
        text-align: center;
    }
    a:link {color: #FDA428;}     /* unvisited link */
    a:hover {color: #FDA428;}    /* mouse over link */
    a:visited {color: #FDA428;}  /* visited link */
    a:active {color: #FDA428;}   /* selected link */
</style>
""", unsafe_allow_html=True)

# --- Colors info ---
# Primary Color: #FDA428
# Secondary Color: #A85E00
# Grey Color: #7B7B7B
# Background Color: #1C1C1C
# {'LOW': '#7B7B7B', 'MEDIUM': '#A85E00', 'HIGH': '#FDA428'}
# ----------------------------------------------------------

st.markdown("""<br>""", unsafe_allow_html=True)

# Row 1 --> Title + links to the SpeakLeash.org website / GitHub / X (Twitter)
social_media_links = [
    "https://discord.com/invite/ZJwCMrxwT7",
    "https://github.com/speakleash",
    "https://x.com/Speak_Leash",
    "https://www.linkedin.com/company/speakleash/",
    "https://www.facebook.com/Speakleash/"
]

light_orange = "#FDA428"
dark_orange = "#A85E00"
white_color = "#FFFFFF"
black_color = "#000000"
links_color = light_orange

social_media_links_colors = [links_color] * len(social_media_links)

social_media_icons = SocialMediaIcons(
    social_media_links, social_media_links_colors)
social_media_icons.render(justify_content='right')

st.markdown("""
<hr style="margin: 0.5em 0;">
""", unsafe_allow_html=True)

st.markdown("""
<img src="https://speakleash.org/wp-content/uploads/2023/09/SpeakLeash_logo.svg" alt="SpeakLeash Logo">
""", unsafe_allow_html=True)

# Add the logo, title, and subheader in a flexible container with equal spacing
st.markdown("""
<div class="header-container">
    <br><br>
    <div class="title-container">
        <h1 style='color: #FDA428; margin-top: -1rem; font-size: 3.1em;'>CPTUB - Complex Polish Text Understanding Benchmark</h1>
        <h3 style="margin-top: 0;">Understanding of Polish text, sentiment and phraseological compounds</h3>
    </div>
</div>
""", unsafe_allow_html=True)

# Create tabs
tab1, tab2 = st.tabs([RESULTS_COLUMN_NAME, "Description"])

with tab1:
    st.markdown("""
This benchmark is designed to evaluate the proficiency of language models in accurately interpreting complex Polish texts. It comprises two distinct components:

1. *Implicatures*: This part evaluates models on their capacity to interpret implied meanings, including sarcasm, idiomatic expressions, and varying levels of linguistic complexity. Beyond conventional sentiment analysis, models are specifically assessed for their ability to discern implicit meanings that extend beyond literal interpretations, requiring sensitivity to nuanced, context-dependent inferences.
2. *Tricky Questions*: This section assesses the model's capability to accurately address challenging questions characterized by logical puzzles, semantic ambiguity, logical inconsistencies, absurdity, and humor. The emphasis here lies in evaluating the model's reasoning skills and flexibility in handling unconventional linguistic constructs.
    """)

    # Prepare data
    data = load_data('data.json')
    data['Params'] = pd.to_numeric(
        data['Params'].str.replace('B', ''),
        errors='coerce'
    )
    data = data.sort_values(by=AVERAGE_COLUMN_NAME, ascending=False)

    # Enclose the filters in an expander
    with st.expander("Filtering benchmark data", icon='🔍'):
        # Filtering data, e.g. sliders for params, average score, etc.
        col_filter_params, col_filter_average, col_filter_implicatures_average, col_filter_sentiment, col_filter_understanding, col_filter_phraseology, col_filter_tricky_questions = st.columns(
            7, gap='medium')

        with col_filter_params:
            max_params = data['Params'].max(skipna=True)
            if pd.isna(max_params):
                max_params = 0.0
            params_slider = st.slider(
                "Models Size [B]",
                min_value=0.0,
                max_value=float(max_params),
                value=(0.0, float(max_params)),
                step=0.1,
                format="%.1f"
            )
            # Keep models with unknown size (NaN params) regardless of the slider range
            data = data[
                data['Params'].isna() |
                (
                    (data['Params'] >= params_slider[0]) &
                    (data['Params'] <= params_slider[1])
                )
            ]

        with col_filter_average:
            average_slider = st.slider(
                "Average score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
            data = data[(data[AVERAGE_COLUMN_NAME] >= average_slider[0]) &
                        (data[AVERAGE_COLUMN_NAME] <= average_slider[1])]

        with col_filter_implicatures_average:
            implicatures_average_slider = st.slider(
                "Implicatures average", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
            data = data[(data[IMPLICATURES_AVERAGE_COLUMN_NAME] >= implicatures_average_slider[0]) &
                        (data[IMPLICATURES_AVERAGE_COLUMN_NAME] <= implicatures_average_slider[1])]

        with col_filter_sentiment:
            sentiment_slider = st.slider(
                "Sentiment score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
            data = data[(data[SENTIMENT_COLUMN_NAME] >= sentiment_slider[0]) &
                        (data[SENTIMENT_COLUMN_NAME] <= sentiment_slider[1])]

        with col_filter_understanding:
            understanding_slider = st.slider(
                "Understanding score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
            data = data[(data[UNDERSTANDING_COLUMN_NAME] >= understanding_slider[0]) &
                        (data[UNDERSTANDING_COLUMN_NAME] <= understanding_slider[1])]

        with col_filter_phraseology:
            phraseology_slider = st.slider(
                "Phraseology score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
            data = data[(data[PHRASEOLOGY_COLUMN_NAME] >= phraseology_slider[0]) &
                        (data[PHRASEOLOGY_COLUMN_NAME] <= phraseology_slider[1])]

        with col_filter_tricky_questions:
            tricky_questions_slider = st.slider(
                "Tricky questions score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
            data = data[(data[TRICKY_QUESTIONS_COLUMN_NAME] >= tricky_questions_slider[0]) &
                        (data[TRICKY_QUESTIONS_COLUMN_NAME] <= tricky_questions_slider[1])]

        # Extract unique provider names from the "Model" column
        providers = data["Model"].apply(lambda x: x.split('/')[0].lower()).unique()
        selected_providers = st.multiselect("Model providers", providers, default=providers)
        # Filter data based on the selected providers
        data = data[data["Model"].apply(lambda x: x.split('/')[0].lower()).isin(selected_providers)]

    # Define all possible columns
    all_columns = {
        "Model": "Model",
        "Params": "Params",
        AVERAGE_COLUMN_NAME: "Average",
        IMPLICATURES_AVERAGE_COLUMN_NAME: "Impl. Avg",
        SENTIMENT_COLUMN_NAME: "Sentiment",
        UNDERSTANDING_COLUMN_NAME: "Understanding",
        PHRASEOLOGY_COLUMN_NAME: "Phraseology",
        TRICKY_QUESTIONS_COLUMN_NAME: "Tricky Questions"
    }

    # By default, all columns are selected
    default_columns = list(all_columns.keys())

    # Use pills to select visible columns in multi-selection mode
    selected_column_labels = st.pills(
        label="Visible columns",
        options=list(all_columns.values()),
        default=list(all_columns.values()),  # Show all columns by default
        selection_mode="multi",  # Enable multi-selection mode
        key="visible_columns_pills"
    )

    # Map selected labels back to column names
    reverse_mapping = {v: k for k, v in all_columns.items()}
    selected_columns = [reverse_mapping[label] for label in selected_column_labels]

    # If nothing is selected, show all columns
    if not selected_columns:
        selected_columns = default_columns

    # Display data
    styled_df_show = style_dataframe(data)
    styled_df_show = styler(styled_df_show)

    # Customize column_config based on the selected columns (a value of None hides the column)
    column_config = {}
    if "Model" in styled_df_show.columns:
        column_config["Model"] = st.column_config.TextColumn("Model", help="Model name", width="large") if "Model" in selected_columns else None
    if "Params" in styled_df_show.columns:
        column_config["Params"] = st.column_config.NumberColumn("Params [B]") if "Params" in selected_columns else None
    if AVERAGE_COLUMN_NAME in styled_df_show.columns:
        column_config[AVERAGE_COLUMN_NAME] = st.column_config.NumberColumn(AVERAGE_COLUMN_NAME) if AVERAGE_COLUMN_NAME in selected_columns else None
    if IMPLICATURES_AVERAGE_COLUMN_NAME in styled_df_show.columns:
        column_config[IMPLICATURES_AVERAGE_COLUMN_NAME] = st.column_config.NumberColumn(IMPLICATURES_AVERAGE_COLUMN_NAME) if IMPLICATURES_AVERAGE_COLUMN_NAME in selected_columns else None
    if RESULTS_COLUMN_NAME in styled_df_show.columns:
        # Show the Results bar chart only if the Average column is selected
        column_config[RESULTS_COLUMN_NAME] = st.column_config.BarChartColumn(
            "Bar chart of results", help="Summary of the results of each task",
            y_min=0, y_max=5) if AVERAGE_COLUMN_NAME in selected_columns else None
    if SENTIMENT_COLUMN_NAME in styled_df_show.columns:
        column_config[SENTIMENT_COLUMN_NAME] = st.column_config.NumberColumn(SENTIMENT_COLUMN_NAME, help='Ability to analyze sentiment') if SENTIMENT_COLUMN_NAME in selected_columns else None
    if UNDERSTANDING_COLUMN_NAME in styled_df_show.columns:
        column_config[UNDERSTANDING_COLUMN_NAME] = st.column_config.NumberColumn(UNDERSTANDING_COLUMN_NAME, help='Ability to understand language') if UNDERSTANDING_COLUMN_NAME in selected_columns else None
    if PHRASEOLOGY_COLUMN_NAME in styled_df_show.columns:
        column_config[PHRASEOLOGY_COLUMN_NAME] = st.column_config.NumberColumn(PHRASEOLOGY_COLUMN_NAME, help='Ability to understand phraseological compounds') if PHRASEOLOGY_COLUMN_NAME in selected_columns else None
    if TRICKY_QUESTIONS_COLUMN_NAME in styled_df_show.columns:
        column_config[TRICKY_QUESTIONS_COLUMN_NAME] = st.column_config.NumberColumn(TRICKY_QUESTIONS_COLUMN_NAME, help='Ability to handle tricky questions') if TRICKY_QUESTIONS_COLUMN_NAME in selected_columns else None

    st.data_editor(styled_df_show, column_config=column_config, hide_index=True, disabled=True, height=500)

    # Model comparison chart: per-category scores for the selected models
    # Default selection: the 3 best models by average score plus the best model containing "Bielik"
    default_models = list(data.sort_values(
        AVERAGE_COLUMN_NAME, ascending=False)['Model'].head(3))
    bielik_models = data[data['Model'].str.contains('Bielik')].sort_values(
        AVERAGE_COLUMN_NAME, ascending=False)['Model']
    if not bielik_models.empty and bielik_models.iloc[0] not in default_models:
        default_models.append(bielik_models.iloc[0])

    selected_models = st.multiselect(
        "Select models to compare", data["Model"].unique(), default=default_models)
    selected_data = data[data["Model"].isin(selected_models)]

    categories = [AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME,
                  PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME]

    if selected_models:
        # Color palettes to choose from:
        # colors = px.colors.sample_colorscale("viridis", len(selected_models)+1)
        colors = px.colors.qualitative.G10[:len(selected_models)]

        fig_bars = go.Figure()
        for model, color in zip(selected_models, colors):
            values = selected_data[selected_data['Model'] ==
                                   model][categories].values.flatten().tolist()
            fig_bars.add_trace(go.Bar(
                x=categories,
                y=values,
                name=model,
                marker_color=color
            ))

        # Update the layout to use a custom color scale
        fig_bars.update_layout(
            showlegend=True,
            legend=dict(orientation="h", yanchor="top",
                        y=-0.3, xanchor="center", x=0.5),
            title="Comparison of Selected Models",
            yaxis_title="Score",
            template="plotly_dark"
        )
        fig_bars.update_yaxes(range=[0, 5.1])
        st.plotly_chart(fig_bars)

# Tab 2 --> Description
with tab2:
    st.markdown("""
### <span style='text-decoration: #FDA428 wavy underline;'>**Cause of Creation**</span>
LLMs face multiple challenges that significantly impact their practical use. This benchmark has been created to comprehensively evaluate two distinct but equally critical aspects of their performance:

#### 1. **Implicatures and Phraseological Compounds**
Language models frequently struggle when interpreting complex, context-dependent meanings that extend beyond the literal interpretation of a text. Such linguistic phenomena include sarcasm, implicatures, and idiomatic or phraseological expressions.
- **Sarcasm:** Traditional sentiment analysis often fails on sarcastic statements because the literal meaning directly contradicts the intended sentiment. This context-dependence makes detection particularly difficult.
- **Implicatures:** These implied meanings, not explicitly stated, often elude models that rely heavily on surface-level text analysis.
- **Phraseological Compounds:** Fixed or semi-fixed expressions whose meanings can't be inferred from their individual components pose additional challenges, leading to inaccurate interpretations by models trained primarily on word-level semantics.

The goal of this part of the benchmark is thus to test the ability of LLMs to correctly interpret implied meanings, detect sarcasm, identify idiomatic phrases, and evaluate their overall text understanding beyond literal semantics.

#### 2. **Tricky Questions and Hallucination Detection**
Another critical problem observed in commercial LLM deployments is the model's tendency to provide incorrect or hallucinated answers, especially when faced with logically inconsistent, ambiguous, or absurd questions.

Key reasons to evaluate this aspect include:
- **Hallucination Detection:** Preventing models from generating confident-sounding but entirely incorrect answers.
- **Logical Consistency:** Identifying and correctly responding (or explicitly refusing to respond) to internally inconsistent or logically flawed questions.
- **Protecting Model Reputation:** Ensuring the model avoids generating absurd or nonsensical answers, which significantly reduces user trust and solution credibility.

---

### <span style='text-decoration: #FDA428 wavy underline;'>**Dataset Information**</span>
All samples were written by humans. The benchmark dataset is divided into two subsets, one for each testing area:

#### 1. **Implicatures and Phraseological Compounds Dataset**
- **Language:** Polish
- **Size:** 200 carefully selected examples
- **Structure of Each Example** (a hypothetical record is sketched below):
    - **Main Text:** Contains sarcasm, implicatures, and/or idiomatic phrases.
    - **Reference Sentiment:** Annotated sentiment label (*positive*, *neutral*, *negative*).
    - **Reference Phraseological Compounds:** Explicit list of phraseological expressions in the text.
    - **Reference Explanation:** Clear explanation of the author's intended meaning.
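
For illustration only, a single record of this subset could look roughly like the sketch below; the field names and values are hypothetical, not the actual schema:

```python
# Hypothetical illustration of one implicatures/phraseology sample (not the real schema)
example_sample = {
    "text": "A short Polish text containing sarcasm, implicatures and/or idiomatic phrases.",
    "reference_sentiment": "negative",              # one of: positive / neutral / negative
    "reference_phraseological_compounds": ["..."],  # expressions present in the text
    "reference_explanation": "A human-written explanation of what the author actually meant.",
}
```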

#### 2. **Tricky Questions Dataset**
- **Language:** Polish
- **Size:** 178 examples
- **Types of Included Questions:**
    - Logical riddles and puzzles.
    - Questions based on semantic ambiguity (e.g., "How much sugar is needed to have a sweet voice?").
    - Logically flawed questions (e.g., non-existent events or dates: "In what year did Poland witness a battle between the Vatican and South Africa?").
    - Absurd or humorous questions (e.g., "How can I join a snail chess club?").

---
### <span style='text-decoration: #FDA428 wavy underline;'>**Evaluation Procedure**</span>
The evaluation procedure likewise consists of two clearly distinguished approaches:

#### 1. **Implicatures and Phraseological Compounds Evaluation**

##### **Evaluated Model**
- For each text in the dataset, the evaluated model was explicitly required to list the following in three clearly separated points:
    1. **Sentiment** (*positive* or *negative* only).
    2. **The underlying intentions of the author.**
    3. **All phraseological compounds present in the text, along with their meanings in the given context.**
- **Prompt Structure** (sketched schematically below):
    - Written entirely in Polish, without a system prompt.
    - Contains three main elements (clearly separated by headers in capital letters and triple quotes):
        1. Information defining the evaluated model's role as a careful linguist with extensive experience.
        2. Explicit instructions about the three tasks to be performed.
        3. Two diverse human-labeled examples:
            - Example inputs are presented in the **User Prompt**.
            - Corresponding example responses are presented separately in the **Assistant Prompt**.
    - One target example (text) per evaluation follows after these examples.
- **Selection of Examples:**
    - The two examples provided in the prompt were chosen for their diversity:
        - One with negative sentiment and multiple phraseological compounds.
        - One with positive sentiment and no phraseological compounds.
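
Schematically, and purely as an illustration (an assumption about the arrangement, not the exact prompt used by the benchmark), the few-shot conversation sent to the evaluated model could be laid out like this:

```python
# Hypothetical arrangement of the few-shot prompt (no system message, content in Polish)
messages = [
    {"role": "user", "content": "ROLE DESCRIPTION + TASK INSTRUCTIONS + example text 1"},
    {"role": "assistant", "content": "Human-written reference answer for example 1"},
    {"role": "user", "content": "Example text 2"},
    {"role": "assistant", "content": "Human-written reference answer for example 2"},
    {"role": "user", "content": "Target text to be evaluated"},
]
```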

##### **Judge Metamodel**
- Prompt structure is similar to the evaluated model's, but it explicitly defines a distinct role ("reliable assistant") focused exclusively on evaluation.
- The judge metamodel's prompt includes:
    - Several diverse examples (with both positive and neutral sentiment references), each clearly separated into a **User Prompt** (containing evaluated model responses and references) and an **Assistant Prompt** (containing example evaluations by humans).
- Returns three separate evaluations in JSON format, clearly assessing:
    1. **Understanding of the Text** (`OCENA`) – a comparison of the evaluated model's explanations to the reference explanations.
    2. **Sentiment Analysis** (`WYDŹWIĘK`) – an optional evaluation performed only for samples explicitly labeled with *positive* or *negative* sentiment. Neutral samples were deliberately ignored to avoid ambiguity.
    3. **Phraseological Compounds** (`ZWIĄZKI`) – carefully evaluated with a penalization system to avoid "phrase spamming", deducting points for each non-reference or incorrect phrase down to a minimum score of zero.
- **Example scoring output:**
```json
{"WYDŹWIĘK": "5"}
{"OCENA": "4"}
{"ZWIĄZKI": "3"}
```
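
For illustration only (this helper is not part of the benchmark code), the three JSON lines above could be merged into a single dictionary of integer scores, assuming the judge emits one JSON object per line:

```python
import json

# Collect line-by-line judge scores such as {"OCENA": "4"} into one dict of ints
def parse_judge_scores(raw_output: str) -> dict:
    scores = {}
    for line in raw_output.strip().splitlines():
        if line.strip():
            scores.update({key: int(value) for key, value in json.loads(line).items()})
    return scores

# Feeding it the three example lines above yields: {'WYDŹWIĘK': 5, 'OCENA': 4, 'ZWIĄZKI': 3}
```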

#### 2. **Tricky Questions Evaluation**

##### **Evaluated Model**
- Receives only the tricky question itself as input, with **no system prompt** and **no additional instructions**.

##### **Judge Metamodel**
- Uses GPT-4o with structured JSON outputs.
- The prompt explicitly includes **3 diverse examples** of tricky questions along with the reference and evaluated-model responses, without dividing them into separate user and assistant prompts.
- The structured JSON evaluation includes:
    - `"think"` field: a detailed comparison of the evaluated model's answer to the reference, clearly explaining the reasoning behind the scoring.
    - `"mark"` field: an integer from 0 to 5 indicating performance (high scores correspond to accurate detection of logical flaws, ambiguity, or absurdity, and an appropriate refusal to hallucinate an answer).
- Example scoring output:
```json
{
    "think": "Detailed reasoning comparing the evaluated model's response to the reference answer, noting whether the evaluated model correctly identified the logical inconsistency and avoided hallucination.",
    "mark": 4
}
```
- The addition of the `"think"` field significantly simplifies the analysis of the judge model's evaluations, explicitly clarifying the reasoning behind each awarded score.
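
As a sketch only (an assumption about how such an output could be declared, not the benchmark's actual code), the structured output above maps naturally onto a small schema, e.g. with Pydantic:

```python
from pydantic import BaseModel, Field

# Hypothetical schema mirroring the judge output described above
class TrickyQuestionJudgement(BaseModel):
    think: str                     # detailed comparison with the reference answer
    mark: int = Field(ge=0, le=5)  # integer score from 0 to 5
```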
""", unsafe_allow_html=True) | |

# Ending :)
st.markdown("<hr style='border: 1px solid #A85E00;'>", unsafe_allow_html=True)
# st.divider()

st.markdown("""
### Authors:
- [Jan Sowa](https://www.linkedin.com/in/janpiotrsowa) - leadership, writing texts, benchmark code
- [Natalia Nadolna](https://www.linkedin.com/in/natalia-nadolna) - benchmark code, dataset cleaning & analysis
- [Anna Zielińska](https://www.linkedin.com/in/zieli%C5%84ska-anna/) - benchmark code, dataset cleaning & analysis
- [Agnieszka Kosiak](https://www.linkedin.com/in/agn-kosiak/) - writing texts
- [Magdalena Krawczyk](https://www.linkedin.com/in/magdalena-krawczyk-7810942ab/) - writing texts, labeling
- [Marta Matylda Kania](https://www.linkedin.com/in/martamatyldakania/) - prompt engineering
- Wiktoria Wierzbińska - writing texts
- [Remigiusz Kinas](https://www.linkedin.com/in/remigiusz-kinas/) - methodological support
- [Krzysztof Wróbel](https://www.linkedin.com/in/wrobelkrzysztof/) - engineering, methodological support
- [Szymon Baczyński](https://www.linkedin.com/in/szymon-baczynski/) - front-end / Streamlit assistance
- [Artur Słomowski](https://www.linkedin.com/in/arturslomowski/) - front-end / Streamlit assistance
- [Maria Filipkowska](https://www.linkedin.com/in/maria-filipkowska/) - writing texts, linguistic support
- [Magda Król](https://www.linkedin.com/in/magda-król/) - writing texts
- [Artur Gogol](https://www.linkedin.com/in/arturgogol) - writing texts
""")

st.divider()

# Run the app with `streamlit run your_script.py`