from transformers import AutoModelForSequenceClassification, AutoTokenizer
import datasets
from deployment import preprocess, detect
import pandas as pd  # only needed for the commented-out CSV example below

# init
device = 'cpu'  # use 'cuda:0' if GPU is available
# model_dir = "nealcly/detection-longformer"  # model in our paper
model_dir = "yaful/MAGE"  # model in the online demo
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)

# Single-text example:
# text = "Apple's new credit card will begin a preview roll out today and will become available to all iPhone owners in the US later this month. A random selection of people will be allowed to go through the application process, which involves entering personal details which are sent to Goldman Sachs and TransUnion. Applications are approved or declined in less than a minute. The Apple Card is meant to be broadly accessible to every iPhone user, so the approval requirements will not be as strict as other credit cards. Once the application has been approved, users will be able to use the card immediately from the Apple Wallet app. The physical titanium card can be requested during setup for free, and it can be activated with NFC once it arrives."
# # preprocess
# text = preprocess(text)
# # detection
# result = detect(text, tokenizer, model, device)
# print(result)

# Test 100 samples from RealTimeData/bbc_news_alltime ('2020-02'):
# ds = datasets.load_dataset('RealTimeData/bbc_news_alltime', '2020-02')
# df = pd.read_csv('query_result.csv')
# content_column = df['content']
# count = 0
# for content in content_column:
#     # preprocess
#     text = preprocess(content)
#     # detection
#     result = detect(text, tokenizer, model, device)
#     if result == "human-written":
#         count += 1
#         print(count)  # running count
# print(count)  # total classified as human-written

# Download the MAGE test data and cache it locally:
# ds = datasets.load_dataset('yaful/MAGE', 'test')
# ds.save_to_disk("MAGE_data")
# splits = list(ds.keys())
# print(splits)

ds = datasets.load_from_disk("MAGE_data")
# filtered_data = ds['test'].filter(lambda x: x['src'] == 'xsum_human')
human_data = [example['text'] for example in ds['test'] if example['src'] == 'xsum_human']
human_data = human_data[:100]
# 'trubo' below is kept as written; it matches the src label used for this subset of the dataset
machine_data = [example['text'] for example in ds['test'] if example['src'] == 'xsum_machine_topical_gpt-3.5-trubo']
machine_data = machine_data[:100]

count = 0
for content in machine_data:
    # preprocess
    text = preprocess(content)
    # detection
    result = detect(text, tokenizer, model, device)
    print(result)
    if result == "human-written":  # machine-generated text misclassified as human
        count += 1
        print(count)  # running count of misclassifications
print(count)  # total machine-generated samples labeled human-written
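
# --- Sketch (not part of the original script): human_data is built above but never
# scored. A symmetric pass over it, assuming detect() returns the same
# "human-written" / "machine-generated" string labels used above, would give
# per-class recall for both subsets. The name human_correct is introduced here
# for illustration only.
human_correct = 0
for content in human_data:
    text = preprocess(content)
    result = detect(text, tokenizer, model, device)
    if result == "human-written":  # human text correctly labeled
        human_correct += 1
print(f"human recall: {human_correct}/{len(human_data)}")
print(f"machine recall: {len(machine_data) - count}/{len(machine_data)}")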