news_verification / src /application /formatting_fact_checker.py
pmkhanh7890's picture
fix bug of showing forensic
5842223
raw
history blame
8.06 kB
from pandas import DataFrame
from src.application.config import WORD_BREAK
from src.application.formatting import (
color_text,
format_entity_count,
)
from src.application.image.helper import encode_image
from src.application.image.image import ImageDetector
from src.application.text.entity import apply_highlight
from src.application.text.helper import (
extract_equal_text,
replace_leading_spaces,
)
from src.application.text.text import TextDetector
def create_fact_checker_table(
    aligned_sentences_df: DataFrame,
    text: TextDetector,
    image: ImageDetector,
):
    """
    Build the HTML table comparing the input news with its source news.

    One image row is emitted when ``image.input`` is set.  When
    ``text.input`` is set, every row of ``aligned_sentences_df`` that has
    an ``"input"`` value is appended to ``text.fact_checker_table`` and
    then rendered; consecutive rows sharing the same URL are grouped so
    that their "Forensic" and "Originality" cells can be merged with
    ``rowspan``.

    Args:
        aligned_sentences_df: Frame read through the columns ``"input"``,
            ``"source"``, ``"entities"`` and ``"url"``.
        text: Text detector; its ``fact_checker_table`` list is extended
            in place by this call.
        image: Image detector used for the optional image row.

    Returns:
        str: HTML fragment holding a heading and the comparison table.
    """
    rows = []

    if image.input is not None:
        rows.append(format_image_fact_checker_row(image))

    if text.input is not None:
        for _, row in aligned_sentences_df.iterrows():
            if row["input"] is None:
                continue

            if row["source"] is None:
                # Two distinct lists: the input and source sides must not
                # alias one another.
                equal_idx_1, equal_idx_2 = [], []
            else:
                # Get index of equal phrases in input and source sentences
                equal_idx_1, equal_idx_2 = extract_equal_text(
                    row["input"],
                    row["source"],
                )

            text.fact_checker_table.append(
                [
                    row,  # aligned_sentences_df
                    equal_idx_1,  # index of equal text in input
                    equal_idx_2,  # index of equal text in source
                    row["entities"],
                    row["url"],
                ],
            )

        table_rows = text.fact_checker_table
        previous_url = None
        span_row = 1
        for index, row in enumerate(table_rows):
            current_url = row[4]

            if index == 0 or current_url != previous_url:
                # First row of a URL group: count how many consecutive
                # rows share this URL to get the rowspan.
                first_url_row = True
                previous_url = current_url
                # Restart the count explicitly rather than relying on the
                # previous group having counted down to exactly 1.
                span_row = 1
                while (
                    index + span_row < len(table_rows)
                    and table_rows[index + span_row][4] == current_url
                ):
                    span_row += 1
            else:
                # Later row of the group: one fewer row remains.
                first_url_row = False
                span_row -= 1

            # The group ends on the row where a single row remains.
            last_url_row = span_row == 1

            rows.append(
                format_text_fact_checker_row(
                    text,
                    row,
                    first_url_row,
                    last_url_row,
                    span_row,
                ),
            )

    table = "\n".join(rows)

    # NOTE: no trailing <style> tag -- an unclosed <style> element would
    # swallow any markup rendered after this fragment.
    return f"""
        <h5>Comparison between input news and source news:</h5>
        <table border="1" style="width:100%; text-align:left;">
            <col style="width: 170px;">
            <col style="width: 170px;">
            <col style="width: 30px;">
            <col style="width: 75px;">
            <thead>
                <tr>
                    <th>Input news</th>
                    <th>Source (URL in Originality)</th>
                    <th>Forensic</th>
                    <th>Originality</th>
                </tr>
            </thead>
            <tbody>
                {table}
            </tbody>
        </table>
        """
def format_text_fact_checker_row(
    text: TextDetector,
    row: list,
    first_url_row: bool = True,
    last_url_row: bool = True,
    span_row: int = 1,
):
    """
    Render one aligned sentence pair as an HTML ``<tr>``.

    Args:
        text: Text detector; ``grouped_url_df`` (columns ``"url"``,
            ``"label"``, ``"score"``) supplies the label and score for the
            row's URL, with ``prediction_label[0]`` and
            ``prediction_score[0]`` as the fallback when no row matches.
        row: ``[aligned_row, equal_idx_input, equal_idx_source, entities,
            url]`` as stored in ``text.fact_checker_table``.
        first_url_row: True when this is the first row of a URL group; only
            such rows carry the merged "Forensic"/"Originality" cells.
        last_url_row: True when this is the last row of a URL group.
        span_row: ``rowspan`` used for the merged cells.

    Returns:
        str: The ``<tr>`` markup, or ``""`` when the row has no input
        sentence.
    """
    aligned = row[0]
    if aligned["input"] is None:
        return ""

    entities = row[3]
    entity_count = 0

    if aligned["source"] is not None:  # source is not empty
        if entities is not None:
            # highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                aligned["input"],
                entities,
                "input",
            )
            source_sentence, highlight_idx_source = apply_highlight(
                aligned["source"],
                entities,
                "source",
            )
            entity_count = len(entities)
        else:
            input_sentence = aligned["input"]
            source_sentence = aligned["source"]
            highlight_idx_input = []
            highlight_idx_source = []

        # Color overlapping words
        input_sentence = color_text(
            input_sentence,
            row[1],
            highlight_idx_input,
        )  # text, index of highlight words
        source_sentence = color_text(
            source_sentence,
            row[2],
            highlight_idx_source,
        )  # text, index of highlight words

        # Replace _ to get correct formatting
        # Original one having _ for correct word counting
        input_sentence = input_sentence.replace(
            "span_style",
            "span style",
        ).replace("1px_4px", "1px 4px")
        source_sentence = source_sentence.replace(
            "span_style",
            "span style",
        ).replace("1px_4px", "1px 4px")
    else:
        input_sentence = aligned["input"]
        source_sentence = aligned["source"]

    input_sentence = replace_leading_spaces(input_sentence)
    source_sentence = replace_leading_spaces(source_sentence)

    url = aligned["url"]

    # Displayed label and score by url
    filterby_url = text.grouped_url_df[text.grouped_url_df["url"] == url]
    if len(filterby_url) > 0:
        label = filterby_url["label"].values[0]
        score = filterby_url["score"].values[0]
    else:
        label = text.prediction_label[0]
        score = text.prediction_score[0]

    # Format displayed url; a missing URL gives an empty cell rather than
    # the literal text "None".
    if url is None:
        source_text_url = ""
    else:
        source_text_url = f"""<a href="{url}">{url}</a>"""

    # Format displayed entity count
    entity_count_text = format_entity_count(entity_count)

    border_top = "border-top: 1px solid transparent;"
    border_bottom = "border-bottom: 1px solid transparent;"

    # Borders between rows of the same URL group are made transparent so
    # the group reads as one block next to its merged cells.
    if first_url_row and last_url_row:
        # First & Last of the group: no transparent border
        sentence_style = ""
    elif first_url_row:
        # First row of the group: transparent bottom border
        sentence_style = f' style="{border_bottom}"'
    elif last_url_row:
        # NOT First row, Last row: transparent top border
        sentence_style = f' style="{border_top}"'
    else:
        # NOT First & NOT Last row: transparent top & bottom borders
        sentence_style = f' style="{border_top} {border_bottom}"'

    if first_url_row:
        # Only the first row of a group carries the rowspan cells.
        merged_cells = f"""
                <td rowspan="{span_row}">{label}<br>
                ({score * 100:.2f}%)<br><br>
                {entity_count_text}</td>
                <td rowspan="{span_row}" style="{WORD_BREAK}">{source_text_url}</td>"""  # noqa: E501
    else:
        merged_cells = ""

    return f"""
            <tr>
                <td{sentence_style}>{input_sentence}</td>
                <td{sentence_style}>{source_sentence}</td>{merged_cells}
            </tr>
            """
def format_image_fact_checker_row(image: ImageDetector):
    """
    Render the image comparison as an HTML ``<tr>``.

    Args:
        image: Image detector; reads ``input``, ``referent_url``,
            ``prediction_label`` and ``prediction_score``.

    Returns:
        str: The ``<tr>`` markup, or ``""`` when there is no input image.
    """
    if image.input is None:
        return ""

    # The input cell does not depend on whether a source image was found,
    # so it is built unconditionally.
    if "http" in image.input:
        input_image = (
            f"""<a href="{image.input}">{image.input}</a>"""  # noqa: E501
        )
    else:
        base64_image = encode_image(image.input)
        input_image = f"""<img src="data:image/jpeg;base64,{base64_image}" width="100" height="150">"""  # noqa: E501

    # A source image exists only when referent_url is neither None nor "".
    # (The previous `is not None or != ""` test was always true.)
    if image.referent_url is not None and image.referent_url != "":
        source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
        source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
    else:
        source_image = "Image not found"
        source_image_url = ""

    return f"""
        <tr>
            <td>{input_image}</td>
            <td>{source_image}</td>
            <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
            <td style="{WORD_BREAK}">{source_image_url}</td>
        </tr>
        """