from transformers import AutoModelForSequenceClassification, AutoTokenizer
import datasets
from deployment import preprocess, detect
import pandas as pd  # only needed for the commented-out CSV example below

# init
device = 'cpu'  # use 'cuda:0' if GPU is available
# model_dir = "nealcly/detection-longformer"  # model in our paper
model_dir = "yaful/MAGE"  # model in the online demo
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)

# Single-text example:
# text = "Apple's new credit card will begin a preview roll out today and will become available to all iPhone owners in the US later this month. A random selection of people will be allowed to go through the application process, which involves entering personal details which are sent to Goldman Sachs and TransUnion. Applications are approved or declined in less than a minute. The Apple Card is meant to be broadly accessible to every iPhone user, so the approval requirements will not be as strict as other credit cards. Once the application has been approved, users will be able to use the card immediately from the Apple Wallet app. The physical titanium card can be requested during setup for free, and it can be activated with NFC once it arrives."
# # preprocess
# text = preprocess(text)
# # detection
# result = detect(text, tokenizer, model, device)
# print(result)

# Test 100 samples from RealTimeData/bbc_news_alltime ('2020-02'):
# ds = datasets.load_dataset('RealTimeData/bbc_news_alltime', '2020-02')
# df = pd.read_csv('query_result.csv')
# content_column = df['content']
# count = 0
# for content in content_column:
#     # preprocess
#     text = preprocess(content)
#     # detection
#     result = detect(text, tokenizer, model, device)
#     if result == "human-written":
#         count += 1
#         print(count)  # running count
# print(count)  # total classified as human-written

# Download the MAGE test data and cache it locally:
# ds = datasets.load_dataset('yaful/MAGE', 'test')
# ds.save_to_disk("MAGE_data")
# splits = list(ds.keys())
# print(splits)

ds = datasets.load_from_disk("MAGE_data")
# filtered_data = ds['test'].filter(lambda x: x['src'] == 'xsum_human')
human_data = [example['text'] for example in ds['test'] if example['src'] == 'xsum_human']
human_data = human_data[:100]
# 'trubo' below is kept as written; it matches the src label used for this subset of the dataset
machine_data = [example['text'] for example in ds['test'] if example['src'] == 'xsum_machine_topical_gpt-3.5-trubo']
machine_data = machine_data[:100]

count = 0
for content in machine_data:
    # preprocess
    text = preprocess(content)
    # detection
    result = detect(text, tokenizer, model, device)
    print(result)
    if result == "human-written":  # machine-generated text misclassified as human
        count += 1
        print(count)  # running count of misclassifications
print(count)  # total machine-generated samples labeled human-written
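
# --- Sketch (not part of the original script): human_data is built above but never
# scored. A symmetric pass over it, assuming detect() returns the same
# "human-written" / "machine-generated" string labels used above, would give
# per-class recall for both subsets. The name human_correct is introduced here
# for illustration only.
human_correct = 0
for content in human_data:
    text = preprocess(content)
    result = detect(text, tokenizer, model, device)
    if result == "human-written":  # human text correctly labeled
        human_correct += 1
print(f"human recall: {human_correct}/{len(human_data)}")
print(f"machine recall: {len(machine_data) - count}/{len(machine_data)}")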