Spaces:
Sleeping
Sleeping
import re | |
from bs4 import BeautifulSoup | |
from newspaper import article, ArticleException | |
import pandas as pd | |
import requests | |
from sentence_transformers import SentenceTransformer, util | |
from search_text import DEVICE, PARAPHASE_MODEL, extract_text | |
#news = article('https://www.bbc.co.uk/news/education-51094279') | |
#print(news.text) | |
def extract_human_data(file_path, output_path="data/test_data/MAGE_xsum_human.csv"):
    """Write the rows whose ``src`` column equals ``"xsum_human"`` to a CSV file.

    Args:
        file_path: Path of the CSV file to read. It must contain a ``src``
            column.
        output_path: Destination CSV path. The default is the location that
            used to be hard-coded, so existing one-argument calls behave as
            before.

    Raises:
        KeyError: If the input file has no ``src`` column.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from pathlib import Path

    df = pd.read_csv(file_path)
    human_data = df[df["src"] == "xsum_human"]

    # Create the target directory up front so the write does not fail on a
    # fresh checkout where it does not exist yet.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # index=False: do not add the pandas row index as an extra column.
    human_data.to_csv(destination, index=False)
def connect_lines_without_dot_regex(text):
    """Remove every newline that is not immediately preceded by a dot.

    A line that does not end in ``.`` is therefore merged with the line that
    follows it; nothing (not even a space) is inserted at the join. Newlines
    that directly follow a ``.`` are kept.

    Args:
        text: The text to process. Non-string values are returned unchanged.

    Returns:
        The processed string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text
    # (?<!\.) is a negative lookbehind: match "\n" only when the previous
    # character is not a literal dot.
    return re.sub(r'(?<!\.)\n', '', text)
if __name__ == "__main__":
    # Manual smoke test: fetch one article with extract_text and print what
    # comes back.
    # extract_human_data("data/test_data/MAGE_test.csv")
    text = extract_text(
        "https://www.bbc.co.uk/news/entertainment-arts-51355206",
        newspapers=False,
    )
    print(text)

    # Disabled experiment, kept for reference: encode two sentences and
    # print their cosine-similarity matrix.
    # input_sentences = ["A major incident has been declared by police following flooding in Shropshire."]
    # page_sentences = ["A major incident has been declared by police following flooding in England."]
    # embeddings1 = PARAPHASE_MODEL.encode(input_sentences, convert_to_tensor=True, device=DEVICE)
    # embeddings2 = PARAPHASE_MODEL.encode(page_sentences, convert_to_tensor=True, device=DEVICE)
    # similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    # print(similarity_matrix)