Spaces:

pmkhanh7890
/

news_verification

Sleeping

1st

22e1b62 4 months ago

1.54 kB

	import re
	from bs4 import BeautifulSoup
	from newspaper import article, ArticleException
	import pandas as pd
	import requests
	from sentence_transformers import SentenceTransformer, util
	from search_text import DEVICE, PARAPHASE_MODEL, extract_text

	#news = article('https://www.bbc.co.uk/news/education-51094279')
	#print(news.text)

	def extract_human_data(file_path, output_path="data/test_data/MAGE_xsum_human.csv"):
	    """Copy the ``xsum_human`` rows of a CSV file into a separate CSV file.

	    The input is loaded with ``pandas.read_csv``; only rows whose ``src``
	    column equals ``"xsum_human"`` are kept, and they are written without
	    the index column.

	    Args:
	        file_path: Path of the CSV file to read. It must contain a ``src``
	            column.
	        output_path: Path of the CSV file to write. Defaults to the
	            location this function originally hard-coded.

	    Raises:
	        KeyError: If the input has no ``src`` column.
	    """
	    # Imported locally so this function does not depend on a file-level import.
	    from pathlib import Path

	    df = pd.read_csv(file_path)
	    human_rows = df[df["src"] == "xsum_human"]

	    # to_csv does not create missing directories, so make sure the target
	    # folder exists before writing.
	    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
	    human_rows.to_csv(output_path, index=False)

	def connect_lines_without_dot_regex(text):
	    """Remove every newline that is not directly preceded by a period.

	    Lines that do not end in ``.`` are joined to the following line with
	    nothing inserted between them; a newline right after a period is kept.
	    Values that are not strings are returned unchanged.
	    """
	    if isinstance(text, str):
	        # Negative lookbehind: match a newline only when the character
	        # before it is not a literal dot.
	        newline_without_dot = re.compile(r'(?<!\.)\n')
	        return newline_without_dot.sub('', text)
	    return text

	if __name__ == "__main__":
	    # Manual smoke test: fetch one article through extract_text and print it.
	    # The dataset extraction step is kept commented out:
	    # extract_human_data("data/test_data/MAGE_test.csv")
	    sample_url = "https://www.bbc.co.uk/news/entertainment-arts-51355206"
	    print(extract_text(sample_url, newspapers=False))

	# # Encode sentences into embeddings
	# input_sentences = ["A major incident has been declared by police following flooding in Shropshire."]
	# page_sentences = ["A major incident has been declared by police following flooding in England."]
	# embeddings1 = PARAPHASE_MODEL.encode(input_sentences, convert_to_tensor=True, device=DEVICE)
	# embeddings2 = PARAPHASE_MODEL.encode(page_sentences, convert_to_tensor=True, device=DEVICE)

	# # Compute cosine similarity matrix
	# similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
	# print(similarity_matrix)