File size: 8,061 Bytes
e58707f
 
 
 
 
 
 
5842223
e58707f
 
00b1038
 
 
 
e58707f
 
 
 
 
 
 
 
 
00b1038
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e58707f
 
00b1038
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e58707f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00b1038
 
 
e58707f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00b1038
 
 
e58707f
 
 
 
00b1038
e58707f
 
 
 
 
 
 
 
00b1038
 
 
 
e58707f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00b1038
 
 
 
e58707f
5842223
 
 
 
 
 
 
e58707f
5842223
e58707f
 
 
 
 
 
5842223
e58707f
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
import html

from pandas import DataFrame

from src.application.config import WORD_BREAK
from src.application.formatting import (
    color_text,
    format_entity_count,
)
from src.application.image.helper import encode_image
from src.application.image.image import ImageDetector
from src.application.text.entity import apply_highlight
from src.application.text.helper import (
    extract_equal_text,
    replace_leading_spaces,
)
from src.application.text.text import TextDetector


def create_fact_checker_table(
    aligned_sentences_df: DataFrame,
    text: TextDetector,
    image: ImageDetector,
):
    """
    Build the HTML comparison table between the input news and its sources.

    An image row is emitted first when ``image.input`` is set. When
    ``text.input`` is set, every aligned sentence whose ``input`` is not
    ``None`` is appended to ``text.fact_checker_table`` and rendered as one
    table row. Consecutive entries sharing the same URL form a group: the
    first row of a group receives the group size as ``span_row`` so its
    label and URL cells can span the whole group.

    Args:
        aligned_sentences_df: Rows are read by the keys ``input``,
            ``source``, ``entities`` and ``url``.
        text: Text detector; its ``fact_checker_table`` list is extended
            in place.
        image: Image detector used for the optional image row.

    Returns:
        The HTML markup (heading plus a four-column table) as a string.
    """
    rows = []
    if image.input is not None:
        rows.append(format_image_fact_checker_row(image))

    if text.input is not None:
        # NOTE(review): entries are appended without clearing the list, so
        # anything already stored in text.fact_checker_table is rendered
        # again - confirm callers always pass a fresh detector.
        for _, row in aligned_sentences_df.iterrows():
            if row["input"] is None:
                continue

            if row["source"] is None:
                # Two distinct lists, so the stored entries never alias.
                equal_idx_1, equal_idx_2 = [], []
            else:  # Get index of equal phrases in input and source sentences
                equal_idx_1, equal_idx_2 = extract_equal_text(
                    row["input"],
                    row["source"],
                )

            text.fact_checker_table.append(
                [
                    row,  # aligned_sentences_df
                    equal_idx_1,  # index of equal text in input
                    equal_idx_2,  # index of equal text in source
                    row["entities"],
                    row["url"],
                ],
            )

        table_rows = text.fact_checker_table
        total_rows = len(table_rows)
        previous_url = None
        span_row = 1
        for index, row in enumerate(table_rows):
            current_url = row[4]
            first_url_row = index == 0 or current_url != previous_url

            if first_url_row:
                previous_url = current_url
                # Start every group from 1 explicitly, then count how many
                # of the following rows carry the same URL.
                span_row = 1
                while (
                    index + span_row < total_rows
                    and table_rows[index + span_row][4] == current_url
                ):
                    span_row += 1
            else:
                # Inside a group: one fewer row left until the group ends.
                span_row -= 1

            last_url_row = span_row == 1

            rows.append(
                format_text_fact_checker_row(
                    text,
                    row,
                    first_url_row,
                    last_url_row,
                    span_row,
                ),
            )

    table = "\n".join(rows)
    # The markup used to end with a dangling "<style>" tag; it carried no
    # CSS and was never closed, so it has been dropped.
    return f"""
<h5>Comparison between input news and source news:</h5>
<table border="1" style="width:100%; text-align:left;">
<col style="width: 170px;">
<col style="width: 170px;">
<col style="width: 30px;">
<col style="width: 75px;">
    <thead>
        <tr>
            <th>Input news</th>
            <th>Source (URL in Originality)</th>
            <th>Forensic</th>
            <th>Originality</th>
        </tr>
    </thead>
    <tbody>
        {table}
    </tbody>
</table>
"""


def _restore_span_markup(sentence: str) -> str:
    """Turn the underscore placeholders back into real HTML/CSS spacing."""
    # The underscore forms keep each tag a single token while words are
    # being counted; the real markup needs spaces.
    return sentence.replace("span_style", "span style").replace(
        "1px_4px",
        "1px 4px",
    )


def format_text_fact_checker_row(
    text: TextDetector,
    row: list,
    first_url_row: bool = True,
    last_url_row: bool = True,
    span_row: int = 1,
):
    """
    Render one aligned input/source sentence pair as an HTML ``<tr>``.

    Args:
        text: Detector providing ``grouped_url_df`` (label and score per
            URL) and the fallback ``prediction_label`` /
            ``prediction_score``.
        row: ``[aligned_row, equal_idx_input, equal_idx_source, entities,
            url]``; ``aligned_row`` is read by the keys ``input``,
            ``source`` and ``url``.
        first_url_row: Whether this is the first row of its URL group. Only
            the first row renders the label and URL cells.
        last_url_row: Whether this is the last row of its URL group.
        span_row: ``rowspan`` applied to the label and URL cells.

    Returns:
        The ``<tr>`` markup, or ``""`` when the row has no input sentence.
    """
    aligned = row[0]
    if aligned["input"] is None:
        return ""

    input_sentence = aligned["input"]
    source_sentence = aligned["source"]
    entities = row[3]
    entity_count = 0

    if source_sentence is not None:
        highlight_idx_input = []
        highlight_idx_source = []
        if entities is not None:
            entity_count = len(entities)
            # highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                input_sentence,
                entities,
                "input",
            )
            source_sentence, highlight_idx_source = apply_highlight(
                source_sentence,
                entities,
                "source",
            )

        # Color overlapping words, then restore the real span markup.
        input_sentence = _restore_span_markup(
            color_text(input_sentence, row[1], highlight_idx_input),
        )
        source_sentence = _restore_span_markup(
            color_text(source_sentence, row[2], highlight_idx_source),
        )

    input_sentence = replace_leading_spaces(input_sentence)
    source_sentence = replace_leading_spaces(source_sentence)
    if source_sentence is None:
        # A missing source should give an empty cell, not the text "None".
        source_sentence = ""

    # Borders between rows of the same URL group are made transparent so
    # the group reads as one block next to its spanning cells.
    border_top = "border-top: 1px solid transparent;"
    border_bottom = "border-bottom: 1px solid transparent;"
    hidden_borders = []
    if not first_url_row:
        hidden_borders.append(border_top)
    if not last_url_row:
        hidden_borders.append(border_bottom)
    cell_style = " ".join(hidden_borders)
    style_attr = f' style="{cell_style}"' if cell_style else ""

    group_cells = ""
    if first_url_row:
        url = aligned["url"]

        # Displayed label and score by url
        by_url = text.grouped_url_df[text.grouped_url_df["url"] == url]
        if len(by_url) > 0:
            label = by_url["label"].values[0]
            score = by_url["score"].values[0]
        else:
            label = text.prediction_label[0]
            score = text.prediction_score[0]

        # Format displayed url; escaped because it is placed both inside
        # an attribute and as visible text.
        if url is None:
            source_text_url = ""
        else:
            safe_url = html.escape(str(url))
            source_text_url = f'<a href="{safe_url}">{safe_url}</a>'

        # Format displayed entity count
        entity_count_text = format_entity_count(entity_count)

        group_cells = f"""
    <td rowspan="{span_row}">{label}<br>
    ({score * 100:.2f}%)<br><br>
    {entity_count_text}</td>
    <td rowspan="{span_row}" style="{WORD_BREAK}">{source_text_url}</td>"""

    return f"""
<tr>
    <td{style_attr}>{input_sentence}</td>
    <td{style_attr}>{source_sentence}</td>{group_cells}
</tr>
"""


def format_image_fact_checker_row(image: ImageDetector):
    """
    Render the image comparison as an HTML ``<tr>``.

    The input cell is a link when ``image.input`` contains ``"http"``,
    otherwise an inline base64 ``<img>`` built with ``encode_image``. The
    source cells show the referent image and its link, or
    ``"Image not found"`` when ``image.referent_url`` is ``None`` or empty.

    Args:
        image: Detector providing ``input``, ``referent_url``,
            ``prediction_label`` and ``prediction_score``.

    Returns:
        The ``<tr>`` markup, or ``""`` when there is no input image.
    """
    if image.input is None:
        return ""

    # Built up front: the input cell is needed whether or not a source
    # image exists. Previously it was only assigned in the "found" branch.
    # NOTE(review): assumes image.input is a str (URL or file path).
    if "http" in image.input:
        safe_input = html.escape(str(image.input))
        input_image = f'<a href="{safe_input}">{safe_input}</a>'
    else:
        base64_image = encode_image(image.input)
        input_image = f"""<img src="data:image/jpeg;base64,{base64_image}" width="100" height="150">"""  # noqa: E501

    # "and", not "or": with "or" the test was always true, so a missing
    # referent URL was rendered as a link/image pointing at "None".
    if image.referent_url is not None and image.referent_url != "":
        safe_referent = html.escape(str(image.referent_url))
        source_image_url = f'<a href="{safe_referent}">{safe_referent}</a>'
        source_image = (
            f'<img src="{safe_referent}" width="100" height="150">'
        )
    else:
        source_image = "Image not found"
        source_image_url = ""

    # NOTE(review): unlike the text row, the score is not multiplied by
    # 100 here - confirm prediction_score is already a percentage.
    return f"""
<tr>
    <td>{input_image}</td>
    <td>{source_image}</td>
    <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
    <td style="{WORD_BREAK}">{source_image_url}</td>
</tr>
"""