import re
import string
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from flair.data import Sentence
from flair.models import SequenceTagger
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
URL = "https://www.formula1.com/content/fom-website/en/latest/all.xml" | |
def get_xml(url):
    # the xpath below is specific to the formula1.com feed; use urllib.parse
    # to distinguish formula1.com from other news sites before parsing
    xml = pd.read_xml(url, xpath='channel/item')
    # one row per <item>; downstream code relies on the 'guid' column
    return xml
# only keep quotes that run longer than a few words, not single-word quotes
def extract_quote(string):
    # use re.findall to extract text enclosed in straight or curly double quotes
    results = re.findall(r'[“"](.*?)[”"]', string)
    quotes = []
    for result in results:
        split_result = result.split()
        if len(split_result) > 3:
            quotes.append(result)
    return quotes
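# Quick sanity check for extract_quote (the sample sentence is illustrative,
# not taken from the feed):
#   extract_quote('Hamilton said "it was a difficult race for the team" today.')
#   -> ['it was a difficult race for the team']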
def get_names(text):
    # load the pre-trained NER tagger
    tagger = SequenceTagger.load('ner')
    sentence = Sentence(text)
    tagger.predict(sentence)
    names = []
    for label in sentence.get_labels('ner'):
        # keep only PERSON entities
        if label.value == "PER":
            names.append(label.data_point.text)
    # convert to a set to remove duplicate mentions
    names = list(set(names))
    return names
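# Note: SequenceTagger.load('ner') initialises the model on every call to
# get_names; if that proves slow, one option is to load the tagger once at
# module level and reuse it inside the function.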
def get_text(new_articles_df):
    """
    Scrapes each new article and returns a dict keyed by article link, holding
    the full article text ('context'), a list of 'quotes' and a list of 'speakers'.
    """
    dfs_dict = {}
    for article in tqdm(new_articles_df.iterrows()):
        link = article[1]["guid"]
        request = requests.get(link)
        soup = BeautifulSoup(request.content, "html.parser")
        # the class_ below is specific to formula1.com and will differ for other websites
        s = soup.find("div", class_="col-lg-8 col-xl-7 offset-xl-1 f1-article--content")
        lines = s.find_all("p")
        text_content = pd.DataFrame(data={"text": []})
        for i, line in enumerate(lines):
            df = pd.DataFrame(data={"text": [line.text]})
            text_content = pd.concat([text_content, df], ignore_index=True)
        strongs = s.find_all("strong")
        strong_content = pd.DataFrame(data={"text": []})
        for i, strong in enumerate(strongs):
            if i > 0:
                df = pd.DataFrame(data={"text": [strong.text]})
                strong_content = pd.concat([strong_content, df], ignore_index=True)
        # keep only the paragraphs that are not <strong> headings
        df = text_content[~text_content["text"].isin(strong_content["text"])].reset_index(
            drop=True
        )
        # combine all rows into a single context string
        context = ""
        for i, row in df.iterrows():
            context += f" {row['text']}"
        quotes = extract_quote(context)
        # skip the (slow) NER step when an article contains no quotes
        if len(quotes) != 0:
            speakers = get_names(context)
        else:
            speakers = []
        dfs_dict[link] = {'context': context, 'quotes': quotes, 'speakers': speakers}
    return dfs_dict
def load_speaker_model():
    model_name = "microsoft/deberta-v2-large"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)
    return question_answerer
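# Hedged usage sketch: one way to attribute each extracted quote to a speaker
# is to ask the QA pipeline "Who said <quote>?" against the article context.
# The loop below is illustrative, not part of the original pipeline:
#
#   question_answerer = load_speaker_model()
#   for link, article in dfs_dict.items():
#       for quote in article['quotes']:
#           answer = question_answerer(question=f"Who said {quote}?",
#                                      context=article['context'])
#           print(remove_punctuations(answer['answer']), '->', quote)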
def remove_punctuations(text):
    # drop punctuation characters, then trim surrounding spaces
    modified_text = "".join([character for character in text if character not in string.punctuation])
    return modified_text.strip(" ")
def check_updates(every=300):
    # previous_xml holds the feed snapshot from the previous poll
    global previous_xml
    while True:
        time.sleep(every)
        latest_xml = get_xml(URL)
        # `not` (rather than the bitwise `~`) is required here: ~True is -2,
        # which is still truthy, so the original condition was always true
        if not previous_xml.equals(latest_xml):
            print('New articles found')
            new_articles_df = latest_xml[~latest_xml["guid"].isin(previous_xml["guid"])]
            # loop through the new articles and collect the text, quotes and speakers
            dfs_dict = get_text(new_articles_df)
            previous_xml = latest_xml
        else:
            print('No new articles found')
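# Minimal entry-point sketch (an assumption about how the module is run:
# previous_xml is seeded once, then check_updates polls the feed):
if __name__ == "__main__":
    previous_xml = get_xml(URL)
    check_updates(every=300)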