from transformers import AutoModelForSequenceClassification, AutoTokenizer
import datasets
import pandas as pd
from deployment import preprocess, detect
# init
device = 'cpu'  # use 'cuda:0' if a GPU is available
# model_dir = "nealcly/detection-longformer"  # model used in the MAGE paper
model_dir = "yaful/MAGE"  # model used in the online demo
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)
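# If the MAGE repo's deployment.py is not on the path, the two helpers can be
# approximated as below. This is a hedged sketch, not the repo's exact code:
# the real preprocessing and the decision rule (label order / threshold)
# should be checked against deployment.py in the MAGE repository.
#
# import torch
#
# def preprocess(text):
#     return " ".join(text.split())  # assumption: collapse whitespace/newlines
#
# def detect(text, tokenizer, model, device):
#     inputs = tokenizer(text, truncation=True, return_tensors="pt").to(device)
#     with torch.no_grad():
#         logits = model(**inputs).logits
#     pred = logits.argmax(dim=-1).item()
#     # assumption: index 0 = machine-generated, index 1 = human-written
#     return "machine-generated" if pred == 0 else "human-written"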
# text = "Apple's new credit card will begin a preview roll out today and will become available to all iPhone owners in the US later this month. A random selection of people will be allowed to go through the application process, which involves entering personal details which are sent to Goldman Sachs and TransUnion. Applications are approved or declined in less than a minute. The Apple Card is meant to be broadly accessible to every iPhone user, so the approval requirements will not be as strict as other credit cards. Once the application has been approved, users will be able to use the card immediately from the Apple Wallet app. The physical titanium card can be requested during setup for free, and it can be activated with NFC once it arrives." | |
# # preprocess | |
# text = preprocess(text) | |
# # detection | |
# result = detect(text,tokenizer,model,device) | |
# print(result) | |
# ds = datasets.load_dataset('RealTimeData/bbc_news_alltime', '2020-02')
# test 100 samples from RealTimeData/bbc_news_alltime, '2020-02' (read from query_result.csv)
# df = pd.read_csv('query_result.csv')
# content_column = df['content']
# count = 0
# for content in content_column:
#     # preprocess
#     text = preprocess(content)
#     # detection
#     result = detect(text, tokenizer, model, device)
#     if result == "human-written":
#         count += 1
#     print(count)  # running count of "human-written" predictions
# print(count)
# ds = datasets.load_dataset('yaful/MAGE', 'test')
# ds.save_to_disk("MAGE_data")
# splits = list(ds.keys())
# print(splits)
ds = datasets.load_from_disk("MAGE_data")
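# Optional sanity check (hedged helper): list the distinct src labels in the
# test split before filtering, e.g.
# print(sorted({example['src'] for example in ds['test']}))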
# filtered_data = ds['test'].filter(lambda x: x['src'] == 'xsum_human')
human_data = [example['text'] for example in ds['test'] if example['src'] == 'xsum_human']
human_data = human_data[:100]
# 'gpt-3.5-trubo' is spelled as it appears in the dataset's src labels
machine_data = [example['text'] for example in ds['test'] if example['src'] == 'xsum_machine_topical_gpt-3.5-trubo']
machine_data = machine_data[:100]
count = 0  # machine-generated samples misclassified as human-written
for content in machine_data:
    # preprocess
    text = preprocess(content)
    # detection
    result = detect(text, tokenizer, model, device)
    print(result)
    if result == "human-written":  # misclassification: these samples are machine-generated
        count += 1
    print(count)  # running misclassification count
print(count)
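
# Symmetric check on the human-written subset, plus a simple summary.
# Hedged sketch: assumes detect returns exactly "human-written" or
# "machine-generated" for every input.
human_correct = 0
for content in human_data:
    text = preprocess(content)
    result = detect(text, tokenizer, model, device)
    if result == "human-written":  # correct prediction on human-written text
        human_correct += 1
machine_correct = len(machine_data) - count  # count holds the misclassifications
total = len(human_data) + len(machine_data)
print(f"human recall: {human_correct}/{len(human_data)}")
print(f"machine recall: {machine_correct}/{len(machine_data)}")
print(f"overall accuracy: {(human_correct + machine_correct) / total:.2%}")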