Spaces:
Sleeping
Sleeping
from pandas import DataFrame | |
from src.application.config import WORD_BREAK | |
from src.application.formatting import ( | |
color_text, | |
format_entity_count, | |
) | |
from src.application.image.helper import encode_image | |
from src.application.image.image import ImageDetector | |
from src.application.text.entity import apply_highlight | |
from src.application.text.helper import ( | |
extract_equal_text, | |
replace_leading_spaces, | |
) | |
from src.application.text.text import TextDetector | |
def create_fact_checker_table(
    aligned_sentences_df: DataFrame,
    text: TextDetector,
    image: ImageDetector,
):
    """
    Build the HTML comparison table between the input news and its sources.

    Args:
        aligned_sentences_df: One row per aligned sentence pair; the columns
            "input", "source", "entities" and "url" are read.
        text: Text detector state. ``text.input`` gates the text rows and
            ``text.fact_checker_table`` is appended to (mutated) with one
            ``[row, equal_idx_input, equal_idx_source, entities, url]``
            entry per sentence pair whose input is not None.
        image: Image detector state. When ``image.input`` is not None, one
            image row is placed before the text rows.

    Returns:
        An HTML fragment: an ``<h5>`` heading followed by a four-column
        ``<table>`` whose body holds the formatted rows.
    """
    rows = []
    if image.input is not None:
        rows.append(format_image_fact_checker_row(image))

    if text.input is not None:
        # NOTE(review): entries are appended without clearing the list first;
        # presumably `text` is fresh for every call - confirm with callers.
        for _, row in aligned_sentences_df.iterrows():
            if row["input"] is None:
                continue

            if row["source"] is None:
                # Two distinct lists, so mutating one later cannot leak
                # into the other.
                equal_idx_1, equal_idx_2 = [], []
            else:
                # Indices of equal phrases in the input and source sentences
                equal_idx_1, equal_idx_2 = extract_equal_text(
                    row["input"],
                    row["source"],
                )

            text.fact_checker_table.append(
                [
                    row,  # row of aligned_sentences_df
                    equal_idx_1,  # index of equal text in input
                    equal_idx_2,  # index of equal text in source
                    row["entities"],
                    row["url"],
                ],
            )

        # Consecutive entries sharing a URL form one visual group: the first
        # row of the group carries the label/URL cells with a rowspan that
        # covers the whole group.
        previous_url = None
        span_row = 1
        for index, row in enumerate(text.fact_checker_table):
            current_url = row[4]

            if index == 0 or current_url != previous_url:
                # First row of a group: look ahead to count how many of the
                # following rows share this URL.
                first_url_row = True
                previous_url = current_url
                while (
                    index + span_row < len(text.fact_checker_table)
                    and text.fact_checker_table[index + span_row][4]
                    == current_url
                ):
                    span_row += 1
            else:
                # Inside a group: count down; the counter is back at 1 on
                # the group's last row, ready for the next look-ahead.
                first_url_row = False
                span_row -= 1

            last_url_row = span_row == 1

            rows.append(
                format_text_fact_checker_row(
                    text,
                    row,
                    first_url_row,
                    last_url_row,
                    span_row,
                ),
            )

    table = "\n".join(rows)
    return f"""
<h5>Comparison between input news and source news:</h5>
<table border="1" style="width:100%; text-align:left;">
<col style="width: 170px;">
<col style="width: 170px;">
<col style="width: 30px;">
<col style="width: 75px;">
<thead>
<tr>
<th>Input news</th>
<th>Source (URL in Originality)</th>
<th>Forensic</th>
<th>Originality</th>
</tr>
</thead>
<tbody>
{table}
</tbody>
</table>
"""
def format_text_fact_checker_row(
    text: TextDetector,
    row: list,
    first_url_row: bool = True,
    last_url_row: bool = True,
    span_row: int = 1,
):
    """
    Render one aligned input/source sentence pair as an HTML ``<tr>``.

    Args:
        text: Text detector state. ``text.grouped_url_df`` (columns "url",
            "label", "score") supplies the label and score for the row's
            URL; when no entry matches, ``text.prediction_label[0]`` and
            ``text.prediction_score[0]`` are used instead.
        row: ``[aligned_row, equal_idx_input, equal_idx_source, entities,
            url]``; ``aligned_row`` is indexed by "input", "source", "url".
        first_url_row: True when this is the first row of a same-URL group;
            only such rows carry the label and URL cells.
        last_url_row: True when this is the last row of a same-URL group.
        span_row: ``rowspan`` of the label and URL cells (used on the first
            row of a group only).

    Returns:
        "" when the input sentence is None, otherwise the ``<tr>`` markup.
    """
    aligned = row[0]
    if aligned["input"] is None:
        return ""

    input_sentence = aligned["input"]
    source_sentence = aligned["source"]
    entity_count = 0

    if source_sentence is not None:
        entities = row[3]
        highlight_idx_input = []
        highlight_idx_source = []
        if entities is not None:
            # Highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                input_sentence,
                entities,
                "input",
            )
            source_sentence, highlight_idx_source = apply_highlight(
                source_sentence,
                entities,
                "source",
            )
            entity_count = len(entities)

        # Color overlapping words: text, equal-text index, highlight index
        input_sentence = color_text(
            input_sentence,
            row[1],
            highlight_idx_input,
        )
        source_sentence = color_text(
            source_sentence,
            row[2],
            highlight_idx_source,
        )

        # The markup uses "_" in place of spaces so that word counting stays
        # correct; restore the spaces to get valid formatting.
        input_sentence = input_sentence.replace(
            "span_style",
            "span style",
        ).replace("1px_4px", "1px 4px")
        source_sentence = source_sentence.replace(
            "span_style",
            "span style",
        ).replace("1px_4px", "1px 4px")

    input_sentence = replace_leading_spaces(input_sentence)
    source_sentence = replace_leading_spaces(source_sentence)

    # Displayed label and score by url
    url = aligned["url"]
    filterby_url = text.grouped_url_df[text.grouped_url_df["url"] == url]
    if len(filterby_url) > 0:
        label = filterby_url["label"].values[0]
        score = filterby_url["score"].values[0]
    else:
        label = text.prediction_label[0]
        score = text.prediction_score[0]

    # Format displayed url
    if url is None:
        # NOTE(review): this is rendered as the literal text "None" in the
        # URL cell - confirm that is the intended display.
        source_text_url = None
    else:
        source_text_url = f"""<a href="{url}">{url}</a>"""

    # Format displayed entity count
    entity_count_text = format_entity_count(entity_count)

    # Borders between rows of the same URL group are made transparent so the
    # group reads as a single block.
    border_top = "border-top: 1px solid transparent;"
    border_bottom = "border-bottom: 1px solid transparent;"
    if first_url_row and last_url_row:
        # Single-row group: keep both borders
        cell_attr = ""
    elif first_url_row:
        # First row of the group: transparent bottom border
        cell_attr = f' style="{border_bottom}";'
    elif last_url_row:
        # Last row of the group: transparent top border
        cell_attr = f' style="{border_top}";'
    else:
        # Middle row: transparent top & bottom borders
        cell_attr = f' style="{border_top} {border_bottom}";'

    sentence_cells = (
        f"<td{cell_attr}>{input_sentence}</td>\n"
        f"<td{cell_attr}>{source_sentence}</td>"
    )

    if not first_url_row:
        # The label and URL cells are spanned down from the group's first row
        return f"""
<tr>
{sentence_cells}
</tr>
"""

    return f"""
<tr>
{sentence_cells}
<td rowspan="{span_row}">{label}<br>
({score * 100:.2f}%)<br><br>
{entity_count_text}</td>
<td rowspan="{span_row}"; style="{WORD_BREAK}";>{source_text_url}</td>
</tr>
"""
def format_image_fact_checker_row(image: ImageDetector):
    """
    Render the image comparison as an HTML ``<tr>``.

    Args:
        image: Image detector state. Reads ``input``, ``referent_url``,
            ``prediction_label`` and ``prediction_score``.

    Returns:
        "" when ``image.input`` is None, otherwise a four-cell row: input
        image, source image, label with score, and source URL. When
        ``image.referent_url`` is None or empty, the source cell shows
        "Image not found" and the URL cell is empty.
    """
    if image.input is None:
        return ""

    # The input cell does not depend on whether a source image was found, so
    # it is built first and is always defined.
    if "http" in image.input:
        input_image = (
            f"""<a href="{image.input}">{image.input}</a>"""  # noqa: E501
        )
    else:
        base64_image = encode_image(image.input)
        input_image = f"""<img src="data:image/jpeg;base64,{base64_image}" width="100" height="150">"""  # noqa: E501

    # Both conditions must hold: with `or` the test is true for every value
    # and the "Image not found" branch could never run.
    if image.referent_url is not None and image.referent_url != "":
        source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
        source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
    else:
        source_image = "Image not found"
        source_image_url = ""

    # NOTE(review): the score is printed as-is followed by "%", whereas the
    # text rows multiply their score by 100 - confirm prediction_score is
    # already a percentage here.
    return f"""
<tr>
<td>{input_image}</td>
<td>{source_image}</td>
<td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
<td style="{WORD_BREAK}";>{source_image_url}</td>
</tr>
"""