# File size: 1,537 Bytes
# 22e1b62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
from bs4 import BeautifulSoup
from newspaper import article, ArticleException
import pandas as pd
import requests
from sentence_transformers import SentenceTransformer, util
from search_text import DEVICE, PARAPHASE_MODEL, extract_text

#news = article('https://www.bbc.co.uk/news/education-51094279')
#print(news.text)

def extract_human_data(file_path, output_path="data/test_data/MAGE_xsum_human.csv"):
    """Copy the rows whose ``src`` column equals ``"xsum_human"`` to a new CSV.

    Args:
        file_path: Path of the CSV file to read. It must contain a ``src``
            column.
        output_path: Destination CSV path. Defaults to the location this
            function has always written to, so existing callers are
            unaffected.

    Raises:
        KeyError: If the input CSV has no ``src`` column.
    """
    # Imported locally so this function does not rely on a file-level import.
    from pathlib import Path

    df = pd.read_csv(file_path)
    human_rows = df[df["src"] == "xsum_human"]

    # Make sure the target directory exists; to_csv does not create it.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # index=False keeps the pandas row index out of the written file.
    human_rows.to_csv(destination, index=False)

def connect_lines_without_dot_regex(text):
    """Join lines that do not end with a dot by deleting their newline.

    Every newline character that is not immediately preceded by ``.`` is
    removed (nothing is inserted in its place); newlines that follow a dot
    are kept. Non-string input is returned unchanged.
    """
    if isinstance(text, str):
        # Negative lookbehind: match "\n" only when the previous char is not ".".
        newline_without_dot = re.compile(r'(?<!\.)\n')
        return newline_without_dot.sub('', text)
    return text

if __name__ == "__main__":
    # Manual smoke run: pass one BBC article URL to extract_text with
    # newspapers=False and print whatever it returns.
    # To regenerate the human-only CSV instead, run:
    #   extract_human_data("data/test_data/MAGE_test.csv")
    print(
        extract_text(
            "https://www.bbc.co.uk/news/entertainment-arts-51355206",
            newspapers=False,
        )
    )

    # Disabled experiment, kept for reference: encode two near-identical
    # sentences and print their cosine-similarity matrix.
    # input_sentences = ["A major incident has been declared by police following flooding in Shropshire."]
    # page_sentences =  ["A major incident has been declared by police following flooding in England."]
    # embeddings1 = PARAPHASE_MODEL.encode(input_sentences, convert_to_tensor=True, device=DEVICE)
    # embeddings2 = PARAPHASE_MODEL.encode(page_sentences, convert_to_tensor=True, device=DEVICE)
    # similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
    # print(similarity_matrix)