def extract_human_data(
    file_path,
    output_path="data/test_data/MAGE_xsum_human.csv",
):
    """Extract the rows labelled ``xsum_human`` from a CSV file and save them.

    Reads ``file_path`` into a DataFrame, keeps only the rows whose ``src``
    column equals ``"xsum_human"``, and writes them (without the index
    column) to ``output_path``.

    Args:
        file_path: Path of the input CSV. It must contain a ``src`` column.
        output_path: Destination CSV path. Defaults to the location that was
            previously hard-coded, so existing callers are unaffected.

    Raises:
        KeyError: If the input CSV has no ``src`` column.
    """
    # Local import keeps this block self-contained; the file's import
    # section does not bring ``os`` into scope.
    import os

    df = pd.read_csv(file_path)
    human_data = df[df["src"] == "xsum_human"]

    # Make sure the destination directory exists so that the write does not
    # fail with an OSError on a fresh checkout.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    human_data.to_csv(output_path, index=False)