# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "groq==0.18.0",
#     "pandas==2.2.3",
#     "marimo",
# ]
# ///

import marimo

# Version stamp stored with the notebook file.
# NOTE(review): presumably the marimo release that last wrote this file — confirm.
__generated_with = "0.11.6"
# The notebook application object; every `@app.cell` function below registers on it.
app = marimo.App(width="medium")


@app.cell
def _():
    # Expose marimo under the short alias `mo`, the name the other cells take as input.
    import marimo as mo

    return (mo,)


@app.cell
def _(mo):
    # Password-style text box for the Groq API key; its `.value` is read by the client cell.
    groq_api_key = mo.ui.text(
        kind='password',
        label='Enter your groq api key here',
    )
    # Trailing bare expression: this is what the cell displays.
    groq_api_key
    return (groq_api_key,)


# Intro cell: renders the notebook title and shows the default few-shot NER prompt.
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        # LLM for NER 

        - do a [Few-shot prompting](https://huggingface.co/docs/transformers/main/en/tasks/prompting#few-shot-prompting) and repeat multiple times.

            The default prompt is:
        ```python
        Return a list of named entities in the text with your confidence score on a scale of 0 to 1 for this tag.
        The available entities are: "ADE", "Dosage", "Drug", "Duration", "Form", "Frequency", "Reason", "Route" and "Strength".

        For example:
        Text: MEDICATIONS : Lipitor , Tylenol with Codeine , Dilantin , previously on Decadron q.i.d .
        Named Entities: <start> Lipitor (Drug: 0.87), Tylenol (Drug: 0.59) <end>

        Here is your task:
        Text: The patient then developed oral sores and rash in the chest the night before admission which rapidly spread to the face , trunk , and upper extremities within the last 24 hours.
        Named Entities:

        Remember to answer in the exact form of the example.
        ```
        """
    )
    return


@app.cell
def _(mo, models):
    # Experiment controls: number of API calls, model choice, optional custom
    # text / tag list, and the checkbox that gates the actual API calls.
    call_groq_times = mo.ui.slider(
        start=10,
        stop=100,
        step=10,
        value=30,
        label='How many times do you want to call groq',
    )
    model_ner = mo.ui.dropdown(
        options=models,
        value="llama3-8b-8192",
        label="Choose a LLM",
    )
    ner_text = mo.ui.text_area(
        value="",
        label='Type your text here or leave it to default:',
    )
    ner_tags = mo.ui.text_area(
        value="",
        label="Type the ner tags here or leave it to default:",
        placeholder='e.g. Country, Person',
    )
    checkbox_ner = mo.ui.checkbox(label=' Whether to call groq api')

    # Rows of the control panel, top to bottom.
    _rows = [
        mo.md("# Experiment"),
        mo.hstack([call_groq_times, model_ner]),
        mo.hstack([ner_text, ner_tags]),
        checkbox_ner,
    ]
    # Trailing bare expression: the stacked panel is what the cell displays.
    mo.vstack(_rows, align='center')
    return call_groq_times, checkbox_ner, model_ner, ner_tags, ner_text


@app.cell
def _(
    call_groq_times,
    chat_completion,
    checkbox_ner,
    client,
    extract_ner_from_assistant,
    mo,
    model_ner,
    prompt_ner,
):
    # calling groq
    # `result` accumulates the parsed entity records of every successful call.
    result = []
    # Bind `answer` up front so the name exists even when the checkbox is off
    # or every call fails; afterwards it holds the last raw reply received.
    answer = None
    _failed_calls = 0
    if checkbox_ner.value:
        for _ in mo.status.progress_bar(range(call_groq_times.value), title='In Progress …', completion_title='Finished.'):
            try:
                answer = chat_completion(client, prompt_ner, model_ner.value)
                result += extract_ner_from_assistant(answer)
            except Exception as _err:
                # Best effort: one bad call (network error, unparsable reply)
                # must not abort the remaining repetitions, but remember it.
                _failed_calls += 1
                _last_error = _err
        if _failed_calls:
            # Report failures once instead of swallowing them silently.
            print(
                f"{_failed_calls} of {call_groq_times.value} groq calls failed "
                f"or could not be parsed; last error: {_last_error!r}"
            )
    return answer, result


@app.cell
def _(mo, pd, result):
    # transform data
    # One row per extracted entity record.
    data = pd.DataFrame.from_dict(result)
    # `df` is only filled by the CSV fallback below; bind it first so the name
    # this cell returns is defined on the success path as well.
    df = None
    try:
        transformed_df = mo.ui.dataframe(data)
    except Exception:
        # Fallback: if the widget cannot be built from `data`, show the
        # contents of 'data.csv' instead (the file must exist next to the notebook).
        df = pd.read_csv('data.csv')
        transformed_df = mo.ui.dataframe(df)
    return data, df, transformed_df


@app.cell
def _(mo, transformed_df):
    # Caption rendered above the interactive dataframe widget.
    _caption = mo.md(
        r"""
        The results are shown below, use __+ Add__ to apply different transforms and explore more:
        ------
        """
    )
    _stacked = [_caption, transformed_df]
    # Trailing bare expression: caption on top, dataframe below.
    mo.vstack(_stacked)
    return


@app.cell
async def _():
    # Shared imports for the rest of the notebook.
    import os
    import sys as _sys
    import pandas as pd
    from functools import reduce

    if _sys.platform == "emscripten":
        # Running in the browser (Pyodide): packages have to be fetched at
        # runtime with micropip before `groq` can be imported.
        import micropip
        await micropip.install("ssl")
        await micropip.install("groq")
    else:
        # Regular interpreter: dependencies come from the environment (see the
        # script header), and micropip is not available there. Keep the name
        # defined so the cell still provides everything it returns.
        micropip = None
    from groq import Groq
    return Groq, micropip, os, pd, reduce


@app.cell
def _():
    # available models on groq, offered in the model dropdown in this order
    _llama_family = ["llama3-8b-8192", "llama3-70b-8192", "llama2-70b-4096"]
    _other_families = ["mixtral-8x7b-32768", "gemma-7b-it"]
    models = _llama_family + _other_families
    return (models,)


@app.cell
def _(Groq, groq_api_key):
    # Build the Groq client from whatever is currently typed into the key box.
    _api_key = groq_api_key.value
    client = Groq(api_key=_api_key)
    return (client,)


@app.cell
def _():
    # Fallback inputs used by the prompt cell when the text areas are left empty.
    default_sentence = "The patient then developed oral sores and rash in the chest the night before admission which rapidly spread to the face , trunk , and upper extremities within the last 24 hours."
    # One list element per tag. Every tag must be separated by a comma: writing
    # `"Route" and "Strength"` is a boolean expression that evaluates to just
    # "Strength", which silently drops "Route" from the list.
    default_tags = [
        "ADE",
        "Dosage",
        "Drug",
        "Duration",
        "Form",
        "Frequency",
        "Reason",
        "Route",
        "Strength",
    ]
    return default_sentence, default_tags


@app.cell
def _(default_sentence, default_tags, ner_tags, ner_text):
    # Build the few-shot NER prompt from the user's inputs, falling back to the
    # defaults when a field is empty.
    # The tags arrive as one comma-separated string: trim each tag and drop
    # empty pieces, so "Country, Person" gives ['Country', 'Person'] rather
    # than ['Country', ' Person'], and a trailing comma adds no '' tag.
    _custom_tags = [_tag.strip() for _tag in ner_tags.value.split(",") if _tag.strip()]
    _tags = _custom_tags or default_tags
    # Whitespace-only text counts as "left empty" too.
    _text = ner_text.value.strip() or default_sentence
    prompt_ner = fr"""Return a list of named entities in the text with your confidence score on a scale of 0 to 1 for this tag.
    The available entities are: {_tags}.

    For example:
    Text: MEDICATIONS : Lipitor , Tylenol with Codeine , Dilantin , previously on Decadron q.i.d .
    Named Entities: <start> Lipitor (Drug: 0.87), Tylenol (Drug: 0.59) <end>

    Here is your task:
    Text: {_text}
    Named Entities:

    Remember to answer in the exact form of the example.
    """
    # Trailing bare expression: show the final prompt as the cell output.
    prompt_ner
    return (prompt_ner,)


@app.cell
def _():
    def chat_completion(client, prompt, model):
        """Send one prompt to the chat-completions endpoint and return the reply text.

        Args:
            client: Object exposing ``chat.completions.create`` (the Groq client
                built elsewhere in this notebook).
            prompt: Content of the user message.
            model: Name of the language model that generates the completion.

        Returns:
            The ``message.content`` of the first choice in the response.
        """
        system_message = {
            "role": "system",
            "content": "you will help me with some NER tasks.",
        }
        # The user message is what the assistant responds to.
        user_message = {"role": "user", "content": prompt}
        request_options = {
            "model": model,
            "temperature": 0.5,
            "max_tokens": 100,
            "top_p": 1,
            # Generation stops at the closing marker used in the prompt's example.
            "stop": '<end>',
            # No partial message deltas: wait for the complete reply.
            "stream": False,
        }
        response = client.chat.completions.create(
            messages=[system_message, user_message],
            **request_options,
        )
        first_choice = response.choices[0]
        return first_choice.message.content

    return (chat_completion,)


@app.cell
def _(reduce):
    # `reduce` stays in the signature so the cell's declared inputs are
    # unchanged, although the parser below no longer needs it.
    def extract_ner_from_assistant(answer: str) -> list[dict]:
        """Parse the assistant's reply into one record per named entity.

        The reply is expected to follow the prompt's example, e.g.
        ``<start> oral sores (ADE: 0.98), rash (ADE: 0.98)``. Only the text
        after the first ``<start>`` marker, and before ``<end>`` if the reply
        contains it, is parsed.

        Args:
            answer: Raw text returned by the model.

        Returns:
            A list of dicts with the keys ``"named entity"`` (str, whitespace
            normalised), ``"tag"`` (str) and ``"score"`` (float). Pieces that
            do not have the ``entity (tag: score)`` shape are skipped instead
            of discarding the whole reply. A reply without a ``<start>``
            marker yields an empty list.
        """
        import re

        _before, marker, body = answer.partition("<start>")
        if not marker:
            # The model ignored the requested format; nothing reliable to parse.
            return []
        body = body.partition("<end>")[0]

        # entity text, then "(tag: score)"; the entity may not contain commas
        # or parentheses, the score is a plain decimal number.
        entity_pattern = re.compile(
            r"([^,()]+?)\s*\(\s*([^:()]+?)\s*:\s*(\d+(?:\.\d+)?|\.\d+)\s*\)"
        )
        records = []
        for match in entity_pattern.finditer(body):
            entity = " ".join(match.group(1).split())
            if not entity:
                # e.g. a bare "(Drug: 0.5)" with no entity text in front of it
                continue
            records.append(
                {
                    "named entity": entity,
                    "tag": match.group(2).strip(),
                    "score": float(match.group(3)),
                }
            )
        return records

    return (extract_ner_from_assistant,)


# When the file is executed directly as a script, run the notebook app.
if __name__ == "__main__":
    app.run()