Spaces:

pmkhanh7890
/

news_verification

Sleeping

App Files Files

news_verification / src /application /formatting_governor.py

pmkhanh7890

fix bug of showing forensic

5842223 3 months ago

raw

history blame

5.75 kB

	from pandas import DataFrame

	from src.application.config import WORD_BREAK
	from src.application.formatting import (
	color_text,
	format_entity_count,
	)
	from src.application.image.helper import encode_image
	from src.application.image.image import ImageDetector
	from src.application.text.entity import apply_highlight
	from src.application.text.helper import (
	extract_equal_text,
	replace_leading_spaces,
	)
	from src.application.text.text import TextDetector


	def create_governor_table(
	    aligned_sentences_df: DataFrame,
	    text: TextDetector,
	    image: ImageDetector,
	):
	    """
	    Build the HTML table comparing the input news with its source news.

	    The image row (if any) comes first, followed by a single text row that
	    aggregates every aligned sentence.

	    Args:
	        aligned_sentences_df: Frame iterated row by row; each row is read
	            through its "input", "source" and "entities" keys.
	        text: Text detector. When ``text.input`` is set, one
	            ``[row, input_equal_idx, source_equal_idx, entities]`` record
	            per sentence with a non-None input is appended to
	            ``text.governor_table`` before the text row is rendered.
	        image: Image detector. When ``image.input`` is set, an image row
	            is rendered.

	    Returns:
	        An HTML string holding a heading and the four-column table.

	    Note:
	        ``text.governor_table`` is only appended to here, never cleared,
	        so records accumulate if the same detector is passed in again.
	    """
	    rows = []
	    if image.input is not None:
	        rows.append(format_image_governor_row(image))

	    if text.input is not None:
	        for _, row in aligned_sentences_df.iterrows():
	            if row["input"] is None:
	                continue

	            if row["source"] is None:
	                # Two distinct lists: a chained `a = b = []` would make
	                # both stored entries the very same object.
	                equal_idx_1, equal_idx_2 = [], []
	            else:
	                # Indices of the phrases shared by input and source.
	                equal_idx_1, equal_idx_2 = extract_equal_text(
	                    row["input"],
	                    row["source"],
	                )

	            text.governor_table.append(
	                [
	                    row,
	                    equal_idx_1,
	                    equal_idx_2,
	                    row["entities"],
	                ],
	            )

	        rows.append(format_text_governor_row(text))

	    table = "\n".join(rows)
	    # The markup ends at </table>: a trailing, never-closed <style> tag
	    # would make the browser treat any HTML placed after it as CSS text.
	    return f"""
	    <h5>Comparison between input news and source news:</h5>
	    <table border="1" style="width:100%; text-align:left;">
	    <col style="width: 170px;">
	    <col style="width: 170px;">
	    <col style="width: 30px;">
	    <col style="width: 75px;">
	    <thead>
	    <tr>
	    <th>Input news</th>
	    <th>Source (URL in Originality)</th>
	    <th>Forensic</th>
	    <th>Originality</th>
	    </tr>
	    </thead>
	    <tbody>
	    {table}
	    </tbody>
	    </table>
	    """


	def format_text_governor_row(text):
	    """
	    Render the single text row (``<tr>``) of the governor table.

	    Every record of ``text.governor_table`` whose input sentence is not
	    None contributes one line to the input cell and one line to the
	    source cell. Records are indexed as ``[row, input_equal_idx,
	    source_equal_idx, entities]``.

	    Args:
	        text: Object exposing ``governor_table``, ``prediction_label`` and
	            ``prediction_score``; only element 0 of the latter two is
	            displayed.

	    Returns:
	        An HTML ``<tr>`` string with four cells: input sentences, source
	        sentences, forensic result plus entity count, and source URLs.
	    """
	    input_cells = []
	    source_cells = []
	    url_links = []
	    urls = []
	    # Seeded with two zeros so entity_count[-2] is defined before any row
	    # has been counted.
	    entity_count = [0, 0]
	    for row in text.governor_table:
	        sentence = row[0]
	        if sentence["input"] is None:
	            continue

	        entities = row[3]
	        if sentence["source"] is not None:
	            # NOTE(review): the current row is not appended to
	            # entity_count until the end of the iteration, so [-2] is the
	            # second-to-last recorded count, not a running total. Confirm
	            # this is the offset apply_highlight expects.
	            offset = entity_count[-2]

	            # Highlight entities in both sentences.
	            input_sentence, highlight_idx_input = apply_highlight(
	                sentence["input"],
	                entities,
	                "input",
	                offset,
	            )
	            source_sentence, highlight_idx_source = apply_highlight(
	                sentence["source"],
	                entities,
	                "source",
	                offset,
	            )

	            # Color the words shared by input and source.
	            input_sentence = color_text(
	                input_sentence,
	                row[1],
	                highlight_idx_input,
	            )
	            source_sentence = color_text(
	                source_sentence,
	                row[2],
	                highlight_idx_source,
	            )

	            # Turn the underscore placeholders back into real spaces.
	            input_sentence = input_sentence.replace(
	                "span_style",
	                "span style",
	            ).replace("1px_4px", "1px 4px")
	            source_sentence = source_sentence.replace(
	                "span_style",
	                "span style",
	            ).replace("1px_4px", "1px 4px")
	        else:
	            # No source sentence: show the raw input beside an empty cell.
	            input_sentence = sentence["input"]
	            source_sentence = ""

	        input_cells.append(replace_leading_spaces(input_sentence) + "<br>")
	        source_cells.append(replace_leading_spaces(source_sentence) + "<br>")

	        # List each distinct URL once, in first-seen order.
	        url = sentence["url"]
	        if url not in urls:
	            urls.append(url)
	            url_links.append(f"""<a href="{url}">{url}</a><br><br>""")

	        if entities is not None:
	            entity_count.append(len(entities))

	    input_sentences = "".join(input_cells)
	    source_sentences = "".join(source_cells)
	    source_text_urls = "".join(url_links)
	    entity_count_text = format_entity_count(sum(entity_count))
	    return f"""
	    <tr>
	    <td>{input_sentences}</td>
	    <td>{source_sentences}</td>
	    <td>{text.prediction_label[0]}<br>
	    ({text.prediction_score[0] * 100:.2f}%)<br><br>
	    {entity_count_text}</td>
	    <td style="{WORD_BREAK}">{source_text_urls}</td>
	    </tr>
	    """


	def format_image_governor_row(image: ImageDetector):
	    """
	    Render the image row (``<tr>``) of the governor table.

	    Args:
	        image: Image detector; ``input``, ``referent_url``,
	            ``prediction_label`` and ``prediction_score`` are read.

	    Returns:
	        An empty string when ``image.input`` is None. Otherwise an HTML
	        ``<tr>`` string with four cells: the input image (a link when the
	        input contains "http", else an inline base64 ``<img>``), the
	        source image or "Image not found", the forensic result, and the
	        source URL.
	    """
	    if image.input is None:
	        return ""

	    # The input cell is built unconditionally: the returned row always
	    # uses it, whether or not a source image was found.
	    if "http" in image.input:
	        input_image = (
	            f"""<a href="{image.input}">{image.input}</a>"""  # noqa: E501
	        )
	    else:
	        base64_image = encode_image(image.input)
	        input_image = f"""<img src="data:image/jpeg;base64,{base64_image}" width="100" height="150">"""  # noqa: E501

	    # Both checks must hold; joined with `or` the test is true for every
	    # value and the "Image not found" branch can never run.
	    has_source = image.referent_url is not None and image.referent_url != ""
	    if has_source:
	        source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
	        source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
	    else:
	        source_image = "Image not found"
	        source_image_url = ""

	    return f"""
	    <tr>
	    <td>{input_image}</td>
	    <td>{source_image}</td>
	    <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
	    <td style="{WORD_BREAK}">{source_image_url}</td>
	    </tr>"""