File size: 3,875 Bytes
8739181
 
 
 
 
 
 
3f556fb
8739181
a188b38
8739181
a188b38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8739181
 
 
 
3f556fb
 
 
 
 
3b3110b
 
 
 
 
 
 
3f556fb
 
 
 
 
8739181
 
3f556fb
 
 
 
 
 
 
 
 
 
 
 
3b3110b
 
 
3f556fb
8739181
 
 
 
3f556fb
8739181
3f556fb
8739181
3f556fb
 
a188b38
 
 
3b3110b
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Streamlit app to highlight NER entities
import random
import streamlit as st
from datasets import load_dataset
from annotated_text import annotated_text

# Load data
# Runs at module top level; the script at the bottom of this file samples
# from and iterates over the "train" split of the returned object.
ds = load_dataset("hs-knowledge/hateval_enriched")


# Show highlighted ner entities in a tweet
def display_ner(example):
    """Render one example's tokens with its NER entities highlighted.

    Consecutive tokens that belong to the same span are joined with single
    spaces. The resulting spans are passed to ``annotated_text`` as plain
    strings (spans without an entity type) or ``(text, entity_type)`` tuples.

    Labels are interpreted as follows:

    * ``None``  -- the token is skipped entirely.
    * ``"O"``   -- plain text (no entity type).
    * ``"B-X"`` -- always starts a new span of type ``X``.
    * ``"I-X"`` -- continues the current span when it already has type ``X``;
      otherwise it starts a new span of type ``X`` so that preceding text is
      never relabelled.
    * anything else -- used verbatim as the entity type; adjacent tokens
      carrying the same label are merged into one span.

    Args:
        example: Mapping with a ``"ner_output"`` entry that holds parallel
            ``"tokens"`` and ``"labels"`` sequences.
    """
    ner_output = example["ner_output"]
    chunks = []  # plain strings or (text, entity_type) tuples, in order
    current_tokens = []  # tokens of the span currently being built
    current_type = None  # entity type of that span; None means plain text

    def flush():
        # Emit the span under construction, if any. Guarding on
        # current_tokens avoids emitting empty chunks (e.g. when the very
        # first token starts an entity).
        if not current_tokens:
            return
        span_text = " ".join(current_tokens).strip()
        chunks.append(span_text if current_type is None else (span_text, current_type))
        current_tokens.clear()

    for token, label in zip(ner_output["tokens"], ner_output["labels"]):
        if label is None:
            # Token has no label at all; it is left out of the rendering.
            continue

        if label == "O":
            token_type = None
        elif label.startswith(("B-", "I-")):
            token_type = label[2:]
        else:
            # Label without a B-/I- prefix: the label itself is the type.
            token_type = label

        # A "B-" tag always opens a new span; every other tag opens one only
        # when the entity type changes.
        if label.startswith("B-") or token_type != current_type:
            flush()

        current_tokens.append(token)
        current_type = token_type

    flush()

    # Display text
    annotated_text(*chunks)


def display_text(example):
    """Render an example's raw text with its entities highlighted.

    Entities are visited in order of their stored ``"start"`` value, but the
    actual offsets are re-derived by searching for each entity's text inside
    ``example["text"]``.
    NOTE(review): presumably the stored offsets do not line up with
    ``example["text"]``, which is why they are only used for ordering --
    confirm against the dataset.

    The search for each entity begins where the previous entity ended, so a
    repeated mention is matched to its own occurrence instead of the first
    one. Entities whose text is empty or cannot be found from that position
    are not highlighted. The entity dicts of ``example`` are not modified.

    Args:
        example: Mapping with a ``"text"`` string and an ``"entities"``
            sequence of mappings with ``"text"``, ``"type"`` and ``"start"``
            keys.
    """
    text = example["text"]

    if not example["entities"]:
        annotated_text(text)
        return

    # Sort entities by start
    entities = sorted(example["entities"], key=lambda x: x["start"])

    chunks = []  # plain strings or (text, entity_type) tuples, in order
    cursor = 0  # index in `text` up to which everything has been emitted
    for entity in entities:
        entity_text = entity["text"]
        start = text.find(entity_text, cursor) if entity_text else -1
        if start == -1:
            # Not locatable after the cursor: leave the text unhighlighted
            # rather than emitting the entity at a bogus position.
            continue

        if cursor < start:
            chunks.append(text[cursor:start])
        chunks.append((entity_text, entity["type"]))
        cursor = start + len(entity_text)

    # Trailing text after the last highlighted entity.
    if cursor < len(text):
        chunks.append(text[cursor:])

    annotated_text(*chunks)


# Show a random sample of up to 50 distinct training examples.
# random.sample draws without replacement, so no example is rendered twice,
# and the sample size is capped at the split size so it cannot raise.
sample_size = min(50, len(ds["train"]))
elements = random.sample(range(len(ds["train"])), k=sample_size)
ds["train"] = ds["train"].select(elements)

for ex in ds["train"]:
    # display_text(ex) is an alternative renderer based on the raw text.
    st.markdown("---")
    display_ner(ex)
    with st.expander("Show entities"):
        for ent in ex["entities"] or []:
            entity_name = ent["text"]
            entity_type = ent["type"]
            # NOTE(review): guards against a missing/None knowledge-graph
            # result or description -- whether the dataset actually contains
            # such rows is not visible here; confirm against the schema.
            kg_result = ent.get("kg_result") or {}
            detailed_description = kg_result.get("detailedDescription") or {}
            entity_description = detailed_description.get("articleBody")
            if entity_description:
                st.write(f"{entity_name} ({entity_type}): {entity_description}")
            else:
                st.write(f"{entity_name} ({entity_type})")