Spaces:
Sleeping
Sleeping
File size: 1,537 Bytes
22e1b62 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
import re
from bs4 import BeautifulSoup
from newspaper import article, ArticleException
import pandas as pd
import requests
from sentence_transformers import SentenceTransformer, util
from search_text import DEVICE, PARAPHASE_MODEL, extract_text
#news = article('https://www.bbc.co.uk/news/education-51094279')
#print(news.text)
def extract_human_data(file_path, output_path="data/test_data/MAGE_xsum_human.csv"):
    """Save the rows whose ``src`` column equals ``"xsum_human"`` to a CSV file.

    Args:
        file_path: Path of the CSV file to read. It must contain a ``src``
            column.
        output_path: Path of the CSV file to write. Defaults to
            ``data/test_data/MAGE_xsum_human.csv``.

    Returns:
        None. The selected rows are written to ``output_path`` without the
        DataFrame index.

    Raises:
        KeyError: If the input file has no ``src`` column.
    """
    df = pd.read_csv(file_path)
    # Boolean mask keeps only the rows labelled "xsum_human" in the src column.
    human_rows = df[df["src"] == "xsum_human"]
    human_rows.to_csv(output_path, index=False)
def connect_lines_without_dot_regex(text):
    """Remove every newline that is not immediately preceded by a dot.

    Lines ending in ``.`` keep their line break; any other line is joined
    directly to the following one, with nothing inserted in place of the
    removed newline. Non-string input is returned unchanged.
    """
    if isinstance(text, str):
        # Negative lookbehind: match "\n" only when the previous char is not ".".
        newline_not_after_dot = re.compile(r'(?<!\.)\n')
        return newline_not_after_dot.sub('', text)
    return text
if __name__ == "__main__":
    # Manual check: run extract_text (imported from search_text) on one BBC
    # news URL and print whatever it returns.
    # NOTE(review): newspapers=False presumably selects the extraction path
    # that does not use the newspaper library -- confirm in search_text.
    article_url = "https://www.bbc.co.uk/news/entertainment-arts-51355206"
    print(extract_text(article_url, newspapers=False))

    # Disabled: one-off export of the xsum_human rows.
    # extract_human_data("data/test_data/MAGE_test.csv")

    # Disabled: sentence-similarity experiment.
    # # Encode sentences into embeddings
    # input_sentences = ["A major incident has been declared by police following flooding in Shropshire."]
    # page_sentences = ["A major incident has been declared by police following flooding in England."]
    # embeddings1 = PARAPHASE_MODEL.encode(input_sentences, convert_to_tensor=True, device=DEVICE)
    # embeddings2 = PARAPHASE_MODEL.encode(page_sentences, convert_to_tensor=True, device=DEVICE)
    # # Compute cosine similarity matrix
    # similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    # print(similarity_matrix)