from transformers import AutoModelForSequenceClassification, AutoTokenizer
import datasets
import pandas as pd
from deployment import preprocess, detect
# init
device = 'cpu'  # use 'cuda:0' if a GPU is available
# model_dir = "nealcly/detection-longformer"  # model used in the MAGE paper
model_dir = "yaful/MAGE"  # model used in the online demo
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)
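# If the MAGE repo's deployment.py is not on the path, the two helpers can be
# approximated as below. This is a hedged sketch, not the repo's exact code:
# the real preprocessing and the decision rule (label order / threshold)
# should be checked against deployment.py in the MAGE repository.
#
# import torch
#
# def preprocess(text):
#     return " ".join(text.split())  # assumption: collapse whitespace/newlines
#
# def detect(text, tokenizer, model, device):
#     inputs = tokenizer(text, truncation=True, return_tensors="pt").to(device)
#     with torch.no_grad():
#         logits = model(**inputs).logits
#     pred = logits.argmax(dim=-1).item()
#     # assumption: index 0 = machine-generated, index 1 = human-written
#     return "machine-generated" if pred == 0 else "human-written"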
# text = "Apple's new credit card will begin a preview roll out today and will become available to all iPhone owners in the US later this month. A random selection of people will be allowed to go through the application process, which involves entering personal details which are sent to Goldman Sachs and TransUnion. Applications are approved or declined in less than a minute. The Apple Card is meant to be broadly accessible to every iPhone user, so the approval requirements will not be as strict as other credit cards. Once the application has been approved, users will be able to use the card immediately from the Apple Wallet app. The physical titanium card can be requested during setup for free, and it can be activated with NFC once it arrives." | |
# # preprocess | |
# text = preprocess(text) | |
# # detection | |
# result = detect(text,tokenizer,model,device) | |
# print(result) | |
# ds = datasets.load_dataset('RealTimeData/bbc_news_alltime', '2020-02')
# test 100 samples from RealTimeData/bbc_news_alltime, '2020-02' (read from query_result.csv)
# df = pd.read_csv('query_result.csv')
# content_column = df['content']
# count = 0
# for content in content_column:
#     # preprocess
#     text = preprocess(content)
#     # detection
#     result = detect(text, tokenizer, model, device)
#     if result == "human-written":
#         count += 1
#     print(count)  # running count of "human-written" predictions
# print(count)
# ds = datasets.load_dataset('yaful/MAGE', 'test')
# ds.save_to_disk("MAGE_data")
# splits = list(ds.keys())
# print(splits)
ds = datasets.load_from_disk("MAGE_data")
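# Optional sanity check (hedged helper): list the distinct src labels in the
# test split before filtering, e.g.
# print(sorted({example['src'] for example in ds['test']}))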
# filtered_data = ds['test'].filter(lambda x: x['src'] == 'xsum_human')
human_data = [example['text'] for example in ds['test'] if example['src'] == 'xsum_human']
human_data = human_data[:100]
# 'gpt-3.5-trubo' is spelled as it appears in the dataset's src labels
machine_data = [example['text'] for example in ds['test'] if example['src'] == 'xsum_machine_topical_gpt-3.5-trubo']
machine_data = machine_data[:100]
count = 0  # machine-generated samples misclassified as human-written
for content in machine_data:
    # preprocess
    text = preprocess(content)
    # detection
    result = detect(text, tokenizer, model, device)
    print(result)
    if result == "human-written":  # misclassification: these samples are machine-generated
        count += 1
    print(count)  # running misclassification count
print(count)
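
# Symmetric check on the human-written subset, plus a simple summary.
# Hedged sketch: assumes detect returns exactly "human-written" or
# "machine-generated" for every input.
human_correct = 0
for content in human_data:
    text = preprocess(content)
    result = detect(text, tokenizer, model, device)
    if result == "human-written":  # correct prediction on human-written text
        human_correct += 1
machine_correct = len(machine_data) - count  # count holds the misclassifications
total = len(human_data) + len(machine_data)
print(f"human recall: {human_correct}/{len(human_data)}")
print(f"machine recall: {machine_correct}/{len(machine_data)}")
print(f"overall accuracy: {(human_correct + machine_correct) / total:.2%}")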