diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..9a5d0cf0eac1fc72eebcbdde173bc1c318799390 --- /dev/null +++ b/.gitignore @@ -0,0 +1,245 @@ +models/ +data/ +**/model_checkpoints +**/outputs +training_with_callbacks/ +*.ipynb + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +.venv312 +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +# MacOS +.DS_Store + +# Frontend +# See https://help.github.com/articles/ignoring-files/ for more about ignoring files. + +# misc +*.pem + +# debug +npm-debug.log* +yarn-debug.log* +yarn-error.log* +.pnpm-debug.log* + +# vercel +.vercel + +# Pycharm +.idea + +# Env vars - to be updated +/infra/ci/secret.yaml + +# Local .terraform directories +**/.terraform/* + +# .tfstate files +*.tfstate +*.tfstate.* + +# Exclude all .tfvars files, which are likely to contain sensitive data, such as +# password, private keys, and other secrets. These should not be part of version +# control as they are data points which are potentially sensitive and subject +# to change depending on the environment. +*.tfvars +*.tfvars.json + +# Ignore override files as they are usually used to override resources locally and so +# are not checked in +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# Include override files you do wish to add to version control using negated pattern +# !example_override.tf + +# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan +# example: *tfplan* + +# Ignore CLI configuration files +.terraformrc +terraform.rc +.terraform.lock.hcl +# ignore .vscode +.vscode + +# Ignore sensitive data - k8s env vars +infra/environments/chatbot-dev/dev_secret.yaml +infra/environments/chatbot-prod/prod_secret.yaml +infra/environments/tt-chatbot-prod/prod_secret.yaml + +# yarn file +yarn.lock + +# ignore migrations django +**/migrations/** +!**/migrations +!**/migrations/__init__.py + +# Gradio +**/.gradio + +# Lightning +**/lightning_logs diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a7a8cbd59b390731470727b13bcfc4dfdbf3daa --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,53 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + #- id: check-added-large-files + - id: fix-byte-order-marker + - id: check-case-conflict + - id: check-json + - id: check-yaml + args: ['--unsafe'] + - id: detect-aws-credentials + args: [--allow-missing-credentials] + - id: detect-private-key + - id: end-of-file-fixer + - id: mixed-line-ending + - id: trailing-whitespace +- repo: https://github.com/asottile/add-trailing-comma + rev: v3.1.0 + hooks: + - id: add-trailing-comma +- repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + name: isort (python) + args: [--settings-path=pyproject.toml] + - id: isort + name: isort (cython) + types: [cython] + - id: isort + name: isort (pyi) + types: [pyi] +- repo: https://github.com/psf/black + rev: 24.10.0 + hooks: + - id: black + args: [--config=pyproject.toml] +- repo: https://github.com/pycqa/flake8.git + rev: 6.1.0 + hooks: + - id: flake8 + args: [--ignore, "E203, W503", --max-line-length, "79"] +- repo: https://github.com/kynan/nbstripout + rev: 0.8.1 + hooks: + - id: nbstripout +- repo: https://github.com/asottile/pyupgrade + rev: v3.19.0 + hooks: + - id: pyupgrade + args: [--py36-plus] diff --git a/.pre-commit-setting.toml b/.pre-commit-setting.toml new file mode 100644 index 0000000000000000000000000000000000000000..4ff2cf74230fe1e511141d1bf0ff5f374f0ee527 --- /dev/null +++ b/.pre-commit-setting.toml @@ -0,0 +1,22 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +[tool.black] +line-length = 79 +include = 
'\.pyi?$' +exclude = ''' +/( + \.git + | \.idea + | \.pytest_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist +)/ +''' + +[flake8] +ignore = E203, W503 +max-line-length = 79 diff --git a/.sample-env b/.sample-env new file mode 100644 index 0000000000000000000000000000000000000000..4796ded301ab19c2772f1a8d34b39870074146c3 --- /dev/null +++ b/.sample-env @@ -0,0 +1,4 @@ +[API_KEY] +OPENAI_API_KEY=your_api_key # Replace with your actual OpenAI API key +GEMINI_API_KEY=your_api_key +TOGETHER_API_KEY=your_api_key \ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..3b6f22ee31509dd17907cee944b80e0a75e88365 --- /dev/null +++ b/app.py @@ -0,0 +1,400 @@ +import warnings + +import torchvision.transforms as transforms +from google_img_source_search import ReverseImageSearcher + +# from src.images.CNN_model_classifier import predict_cnn +# from src.images.diffusion_model_classifier import ( +# ImageClassifier, +# predict_single_image, +# ) + +warnings.simplefilter( + action="ignore", + category=FutureWarning, +) # disable FutureWarning + +import gradio as gr # noqa: E402 +from transformers import ( # noqa: E402 + AutoModelForSequenceClassification, + AutoTokenizer, + pipeline, +) + +from src.texts.MAGE.deployment import ( # noqa: E402 + detect, + preprocess, +) +from src.texts.PASTED.pasted_lexicon import Detector # noqa: E402 +from src.texts.Search_Text.search import ( # noqa: E402 + get_important_sentences, + get_keywords, + is_human_written, +) +from src.images.Search_Image.search import ( + compare_images, + get_image_from_path, + get_image_from_url, +) + + +def convert_score_range(score): + """ + Converts a score from the range [0, 1] to [-1, 1]. + + Args: + score: The original score in the range [0, 1]. + + Returns: + The converted score in the range [-1, 1]. + """ + + return 2 * score - 1 + + +def generate_highlighted_text(text_scores): + """ + Generates a highlighted text string based on the given text and scores. + + Args: + text_scores: A list of tuples, where each tuple contains a text + segment and its score. + + Returns: + A string of HTML code with highlighted text. + """ + highlighted_text = "" + for text, score in text_scores: + # Map score to a color using a gradient + color = f"rgba(255, 0, 0, {1 - score})" # Red to green gradient + highlighted_text += ( + f"{text}" # noqa + ) + return highlighted_text + + +def separate_characters_with_mask(text, mask): + """Separates characters in a string and pairs them with a mask sign. + + Args: + text: The input string. + + Returns: + A list of tuples, where each tuple contains a character and a mask. 
+ """ + + return [(char, mask) for char in text] + + +def detect_ai_text(model_name, search_engine, text): + if search_engine is True: + keywords = get_keywords(text) + important_sentences = get_important_sentences(text, keywords) + predictions = is_human_written(important_sentences[0]) + print("keywords: ", keywords) + print("important_sentences: ", important_sentences) + print("predictions: ", predictions) + if predictions == -1: + caption = "[Found exact match] " + text_scores = list(zip([caption, text], [0, predictions])) + print("text_scores: ", text_scores) + return text_scores + + if model_name == "SimLLM": + tokenize_input = SimLLM_tokenizer(text, return_tensors="pt") + outputs = SimLLM_model(**tokenize_input) + predictions = outputs.logits.argmax(dim=-1).item() + if predictions == 0: + predictions = "human-written" + else: + predictions = "machine-generated" + + elif model_name == "MAGE": + processed_text = preprocess(text) + predictions = detect( + processed_text, + MAGE_tokenizer, + MAGE_model, + device, + ) + + elif model_name == "chatgpt-detector-roberta": + predictions = roberta_pipeline_en(text)[0]["label"] + if predictions == "Human": + predictions = "human-written" + else: # ChatGPT + predictions = "machine-generated" + elif model_name == "PASTED-Lexical": + predictions = detector(text) + + if model_name != "PASTED-Lexical": + text_scores = list(zip([text], [predictions])) + else: + text_scores = [] + for text, score in predictions: + new_score = convert_score_range(score) # normalize score + text_scores.append((text, new_score)) + + return text_scores + + +diffusion_model_path = ( + "src/images/Diffusion/model_checkpoints/" + "image-classifier-step=7007-val_loss=0.09.ckpt" +) +cnn_model_path = "src/images/CNN/model_checkpoints/blur_jpg_prob0.5.pth" + + +def detect_ai_image(input_image_path, search_engine): + # if search_engine is True: + # Search image + + rev_img_searcher = ReverseImageSearcher() + search_items = rev_img_searcher.search_by_file(input_image_path) + min_result_difference = 5000 + result_image_url = "" + input_image = get_image_from_path(input_image_path) + + for search_item in search_items: + # print(f'Title: {search_item.page_title}') + # print(f'Site: {search_item.page_url}') + # print(f'Img: {search_item.image_url}\n') + + # Compare each search result image with the input image + result_image = get_image_from_url(search_item.image_url) + # input_image = get_image_from_url(search_item.image_url) + result_difference = compare_images(result_image, input_image) + + print(f"Difference with search result: {result_difference}") + print(f"Result image url: {search_item.page_url}\n") + + if min_result_difference > result_difference: + min_result_difference = result_difference + result_image_url = search_item.image_url + result_page_url = search_item.page_url + + + if result_difference == 0: + break + + + if min_result_difference == 0: + result = f"

<h2>Input image is LIKELY SIMILAR to image from:</h2>"\
+            f"<a href='{result_page_url}'><img src='{result_image_url}'></a>"
+    elif 10 > min_result_difference > 0:
+        result = f"<h2>Input image is potentially a VARIATION from:</h2>"\
+            f"<a href='{result_page_url}'><img src='{result_image_url}'></a>"
+    elif min_result_difference < 5000:
+        result = "<h2>Input image is not similar to any search results.</h2>"
+    else:
+        result = "<h2>No search result found.</h2>"
+
+    return result
+
+    # def get_prediction_diffusion(image):
+    #     model = ImageClassifier.load_from_checkpoint(diffusion_model_path)
+    #     prediction = predict_single_image(image, model)
+    #     return (prediction >= 0.5, prediction)
+
+    # def get_prediction_cnn(image):
+    #     prediction = predict_cnn(image, cnn_model_path)
+    #     return (prediction >= 0.5, prediction)
+
+    # # Define the transformations for the image
+    # transform = transforms.Compose(
+    #     [
+    #         transforms.Resize((224, 224)),  # Image size expected by ResNet50
+    #         transforms.ToTensor(),
+    #         transforms.Normalize(
+    #             mean=[0.485, 0.456, 0.406],
+    #             std=[0.229, 0.224, 0.225],
+    #         ),
+    #     ],
+    # )
+    # image_tensor = transform(inp)
+    # pred_diff, prob_diff = get_prediction_diffusion(image_tensor)
+    # pred_cnn, prob_cnn = get_prediction_cnn(image_tensor)
+    # verdict = (
+    #     "AI Generated" if (pred_diff or pred_cnn) else "No GenAI detected"
+    # )
+    # return f"<h2>{verdict}</h2>"

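+# Illustrative sketch (an assumption, not this app's code path): the
+# thresholds above treat compare_images as a perceptual-hash distance,
+# where 0 means near-identical and values below 10 suggest a light edit
+# of the same picture. With the ImageHash package pinned in
+# requirements.txt, an equivalent distance can be computed like this:
+import imagehash  # noqa: E402
+
+
+def perceptual_distance(img_a, img_b):
+    """Hamming distance between the 64-bit perceptual hashes of two
+    PIL images: 0 for near-identical pictures, larger for unrelated ones."""
+    return imagehash.phash(img_a) - imagehash.phash(img_b)
+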
+# Define GPUs
+device = "cpu"  # use 'cuda:0' if GPU is available
+
+# init MAGE
+model_dir = "yaful/MAGE"  # model in huggingface
+MAGE_tokenizer = AutoTokenizer.from_pretrained(model_dir)
+MAGE_model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(
+    device,
+)
+
+# init chatgpt-detector-roberta
+model_dir = "Hello-SimpleAI/chatgpt-detector-roberta"  # model in huggingface
+roberta_pipeline_en = pipeline(task="text-classification", model=model_dir)
+
+# init PASTED
+model_dir = "linzw/PASTED-Lexical"
+detector = Detector(model_dir, device)
+
+# init SimLLM
+model_path = "./models/single_model_detector"
+SimLLM_tokenizer = AutoTokenizer.from_pretrained(model_path)
+SimLLM_model = AutoModelForSequenceClassification.from_pretrained(model_path)
+
+# Init variable for UI
+title = """
+<div style="text-align: center;">
+    <h1>AI-generated content detection</h1>
+    <p>Demo by NICT & Tokyo Techies</p>
+</div>
+""" + +examples = [ + [ + "SimLLM", + False, + """\ +The BBC's long-running consumer rights series Watchdog is to end as a \ +standalone programme, instead becoming part of The One Show. Watchdog \ +began in 1980 as a strand of Nationwide, but proved so popular it \ +became a separate programme in 1985. Co-host Steph McGovern has moved \ +to Channel 4, but Matt Allwright and Nikki Fox will stay to front the \ +new strand. The BBC said they would investigate viewer complaints all \ +year round rather than for two series a year. +""", + ], + [ + "chatgpt-detector-roberta", + False, + """\ +Artificial intelligence (AI) is the science of making machines \ +intelligent. It enables computers to learn from data, recognize \ +patterns, and make decisions. AI powers many technologies we use \ +daily, from voice assistants to self-driving cars. It's rapidly \ +evolving, promising to revolutionize various industries and reshape \ +the future.""", + ], +] + +model_remark = """ +Model sources: +SimLLM, +MAGE, +chatgpt-detector-roberta, +PASTED-Lexical. + +""" # noqa: E501 + +image_samples = [ + ["src/images/samples/fake_dalle.jpg", "Generated (Dall-E)"], + ["src/images/samples/fake_midjourney.png", "Generated (MidJourney)"], + ["src/images/samples/fake_stable.jpg", "Generated (Stable Diffusion)"], + ["src/images/samples/fake_cnn.png", "Generated (GAN)"], + ["src/images/samples/real.png", "Organic"], + [ + "https://p.potaufeu.asahi.com/1831-p/picture/27695628/89644a996fdd0cfc9e06398c64320fbe.jpg", # noqa E501 + "Internet GenAI", + ], +] +image_samples_path = [i[0] for i in image_samples] + +# UI +with gr.Blocks() as demo: + with gr.Row(): + gr.HTML(title) + with gr.Row(): + with gr.Tab("Text"): + with gr.Row(): + with gr.Column(): + model = gr.Dropdown( + [ + "SimLLM", + "MAGE", + "chatgpt-detector-roberta", + "PASTED-Lexical", + ], + label="Detection model", + ) + search_engine = gr.Checkbox(label="Use search engine") + gr.HTML(model_remark) + with gr.Column(): + text_input = gr.Textbox( + label="Input text", + placeholder="Enter text here...", + lines=5, + ) + + output = gr.HighlightedText( + label="Detection results", + combine_adjacent=True, + show_legend=True, + color_map={ + "human-written": "#7d58cf", + "machine-generated": "#e34242", + }, + ) + + gr.Examples( + examples=examples, + inputs=[model, search_engine, text_input], + ) + model.change( + detect_ai_text, + inputs=[model, search_engine, text_input], + outputs=output, + ) + search_engine.change( + detect_ai_text, + inputs=[model, search_engine, text_input], + outputs=output, + ) + text_input.change( + detect_ai_text, + inputs=[model, search_engine, text_input], + outputs=output, + ) + with gr.Tab("Images"): + with gr.Row(): + input_image = gr.Image(type="filepath") + with gr.Column(): + output_image = gr.Markdown(height=400) + gr.Examples( + examples=image_samples, + inputs=input_image, + ) + + input_image.change( + detect_ai_image, + inputs=input_image, + outputs=output_image, + ) + + +# demo.launch(share=True) +demo.launch(allowed_paths=image_samples_path, share=True) diff --git a/application.py b/application.py new file mode 100644 index 0000000000000000000000000000000000000000..2ca430a00c6f314a223af64662e298a6e62f05fa --- /dev/null +++ b/application.py @@ -0,0 +1,137 @@ +import os + +import gradio as gr +import openai +import requests +from PIL import Image +import re + +from src.application.url_reader import URLReader + +OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') +openai.api_key = os.getenv('OPENAI_API_KEY') +GOOGLE_API_KEY = 
os.getenv('GOOGLE_API_KEY') +SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID') + +def load_url(url): + """ + Load content from the given URL. + """ + content = URLReader(url) + image = None + header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'} + try: + response = requests.get( + url, + headers = header, + stream = True + ) + response.raise_for_status() # Raise an exception for bad status codes + + image_response = requests.get(content.top_image, stream=True) + try: + image = Image.open(image_response.raw) + except: + print(f"Error loading image from {content.top_image}") + + except (requests.exceptions.RequestException, FileNotFoundError) as e: + print(f"Error fetching image: {e}") + + return content.title, content.text, image + + +def replace_terms(text, input_term, destination_term): + # Replace input_term with destination_term in the text + modified_text = re.sub(input_term, destination_term, text) + return modified_text + +def generate_content(model1, model2, title, content): + # Generate text using the selected models + full_content = "" + input_type = "" + if title and content: + full_content = title + "\n" + content + input_type = "title and content" + elif title: + full_content = title + input_type = "title" + elif content: + full_content = title + input_type = "content" + +def generate_text(model, full_context, input_type): + # Generate text using the selected model + if input_type == "": + prompt = "Generate a random fake news article" + else: + prompt = f"Generate a fake news article (title and content) based on the following {input_type}: {full_context}" + + try: + response = openai.ChatCompletion.create( + model=model, + messages=[ + {"role": "user", "content": prompt} + ] + ) + return response.choices[0].message.content + + except openai.error.OpenAIError as e: + print(f"Error interacting with OpenAI API: {e}") + return "An error occurred while processing your request." + +# Define the GUI +with gr.Blocks() as demo: + gr.Markdown("# Fake News Detection") + + with gr.Row(): + with gr.Column(scale=1): + gr.Markdown("## Settings") + gr.Markdown("This tool generates fake news by modifying the content of a given URL.") + + with gr.Accordion("1. Enter a URL"): + #gr.Markdown(" 1. Enter a URL.") + url_input = gr.Textbox( + label="URL", + value="https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science", + ) + load_button = gr.Button("Load an URL...") + + with gr.Accordion("2. Select a content-generation model", open=True): + with gr.Row(): + model1_dropdown = gr.Dropdown(choices=["GPT 4o", "GPT 4o-mini"], label="Text-generation model") + model2_dropdown = gr.Dropdown(choices=["Dall-e", "Stable Diffusion"], label="Image-generation model") + generate_button = gr.Button("Random generation...") + + with gr.Accordion("3. 
Replace any terms", open=True): + with gr.Row(): + input_term_box = gr.Textbox(label="Input Term") + destination_term_box = gr.Textbox(label="Destination Term") + replace_button = gr.Button("Replace term...") + + process_button = gr.Button("Process") + + with gr.Column(scale=2): + gr.Markdown("## News contents") + title_input = gr.Textbox(label="Title", value="") + with gr.Row(): + image_view = gr.Image(label="Image") + content_input = gr.Textbox(label="Content", value="", lines=15) + + + + # Connect events + load_button.click( + load_url, + inputs=url_input, + outputs=[title_input, content_input, image_view] + ) + replace_button.click(replace_terms, + inputs=[content_input, input_term_box, destination_term_box], + outputs=content_input) + process_button.click(generate_text, + inputs=[url_input, model1_dropdown, model2_dropdown, input_term_box, destination_term_box, title_input, content_input], + outputs=[title_input, content_input]) + + #url_input.change(load_image, inputs=url_input, outputs=image_view) + +demo.launch() \ No newline at end of file diff --git a/demo.py b/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..fe11ce92186e77bdf1e1d91a4fbf0e847055367b --- /dev/null +++ b/demo.py @@ -0,0 +1,309 @@ +import os + +from src.images.Search_Image.search import find_similar_img_from_url + +import re + +import gradio as gr + +from src.images.Search_Image.image_model_share import ( + image_generation_detection, +) +from src.texts.Search_Text._text_detection_share import ( + UNKNOWN, + abstract_detect_generated_text, +) +from src.texts.Search_Text.fake_text_generation_share import ( + highlight_overlap_by_word_to_list, +) + +os.environ["no_proxy"] = "localhost,127.0.0.1,::1" + +TEMP_IMAGE = "temp_image.jpg" +TEMP_INPUT_IMAGE = "temp_input_image.jpg" + +HUMAN_IMAGE = "data/test_data/human_news.jpg" + +HUMAN_CAPTION = "Stoke City have secured West Brom striker Saido Berahino for £12 million on a five-and-a-half-year contract." +HUMAN_CONTENT = """ +Tracey Jolliffe has already donated a kidney, 16 eggs and 80 pints of blood, and intends to leave her brain to science. She is now hoping to give away part of her liver to a person she may never meet. +"If I had another spare kidney, I'd do it again," Tracey tells the BBC's Victoria Derbyshire programme. +She is what is known as an "altruistic donor" - someone willing to give away an organ to potentially help save the life of a complete stranger. +A microbiologist in the NHS, and the daughter of two nurses, she has spent her life learning about the importance of healthcare from a professional standpoint. +But she has also been keen to make a difference on a personal level. +"I signed up to donate blood, and to the bone marrow register, when I was 18," she says. +Now 50, her wish to donate has become gradually more expansive. +In 2012, she was one of fewer than 100 people that year to donate a kidney without knowing the recipient's identity - and now supports the charity Give A Kidney, encouraging others to do the same. +As of 30 September 2016, 5,126 people remain on the NHS kidney transplant waiting list. +Tracey's kidney donation, in all likelihood, will have saved someone's life. +"I remind myself of it every day when I wake up," she says, rightly proud of her life-changing actions. +It was not, however, a decision taken on the spur of a moment. +Donating a kidney is an "involved process", she says, with suitability assessments taking at least three months to complete. 
+Tests leading up to the transplant include X-rays, heart tracing and a special test of kidney function, which involves an injection and a series of blood tests. +"It is not something to do if you're scared of needles," she jokes. +The risks associated with donating, however, are relatively low for those deemed healthy enough to proceed, with a mortality rate of about one in 3,000 - roughly the same as having an appendix removed. +Compared with the general public, NHS Blood and Transplant says, most kidney donors have equivalent - or better - life expectancy than the average person. +Tracey says she was in hospital for five days after her operation but felt "back to normal" within six weeks. +""" + +HUMAN_NEWS_CNN = """ +Mayotte authorities fear hunger and disease after cyclone, as death toll rises in Mozambique +Cyclone Chido caused devastation in Mayotte and authorities are now rushing to prevent disease and hunger spreading in the French overseas territory Sipa USA +Authorities in Mayotte were racing on Tuesday to stop hunger, disease and lawlessness from spreading in the French overseas territory after the weekend’s devastating cyclone, while Mozambique reported dozens of deaths from the storm. +Hundreds or even thousands could be dead in Mayotte, which took the strongest hit from Cyclone Chido, French officials have said. The storm laid waste to large parts of the archipelago off east Africa, France’s poorest overseas territory, before striking continental Africa. +With many parts of Mayotte still inaccessible and some victims buried before their deaths could be officially counted, it may take days to discover the full extent of the destruction. +So far, 22 deaths and more than 1,400 injuries have been confirmed, Ambdilwahedou Soumaila, the mayor of the capital Mamoudzou, told Radio France Internationale on Tuesday morning. +“The priority today is water and food,” Soumaila said. “There are people who have unfortunately died where the bodies are starting to decompose that can create a sanitary problem.” +“We don’t have electricity. When night falls, there are people who take advantage of that situation.” + +Rescue workers operate in storm-hit Mayotte on Wednesday. +Rescue workers operate in storm-hit Mayotte on Wednesday. Securite Civile via Reuters +Twenty tonnes of food and water are due to start arriving on Tuesday by air and sea. The French government said late on Monday it expects 50% of water supplies to be restored within 48 hours and 95% within the week. +France’s interior ministry announced that a curfew would go into effect on Tuesday night from 10 p.m. to 4 a.m. local time. +Rescue workers have been searching for survivors amid the debris of shantytowns bowled over by 200 kph (124 mph) winds. +Chido was the strongest storm to strike Mayotte in more than 90 years, French weather service Meteo France said. In Mozambique, it killed at least 34 people, officials said on Tuesday. Another seven died in Malawi. +Drone footage from Mozambique’s Cabo Delgado province, already experiencing a humanitarian crisis due to an Islamist insurgency, showed razed thatched-roof houses near the beach and personal belongings scattered under the few palm trees still standing. + +Dispute over immigration +French President Emmanuel Macron said after an emergency cabinet meeting on Monday that he would visit Mayotte in the coming days, as the disaster quickly fueled a political back-and-forth about immigration, the environment and France’s treatment of its overseas territories. 
+Mayotte has been grappling with unrest in recent years, with many residents angry at illegal immigration and inflation. +More than three-quarters of its roughly 321,000 people live in relative poverty, and about one-third are estimated to be undocumented migrants, most from nearby Comoros and Madagascar. +The territory has become a stronghold for the far-right National Rally with 60% voting for Marine Le Pen in the 2022 presidential election runoff. +France’s acting Interior Minister Bruno Retailleau, from the conservative Republicans party, told a news conference in Mayotte that the early warning system had worked “perfectly” but many of the undocumented had not come to designated shelters. +People stand amid uprooted trees and debris after cyclone Chido hit Mecufi district, Cabo Delgado province, Mozambique, on December 16. +People stand amid uprooted trees and debris after cyclone Chido hit Mecufi district, Cabo Delgado province, Mozambique, on December 16. UNICEF Mozambique via Reuters +Other officials have said undocumented migrants may have been afraid to go to shelters for fear of being arrested. +The toll of the cyclone, Retailleau said in a later post on X, underscored the need to address “the migration question.” +“Mayotte is the symbol of the drift that (French) governments have allowed to take hold on this issue,” he said. “We will need to legislate so that in Mayotte, like everywhere else on the national territory, France retakes control of its immigration.” +Left-wing politicians, however, have pointed the finger at what they say is the government’s neglect of Mayotte and failure to prepare for natural disasters linked to climate change. +Socialist Party chairman Olivier Faure blasted Retailleau’s comments in an X post. +“He could have interrogated the role of climate change in producing more and more intense climate disasters. He could have rallied against the extreme poverty that makes people more vulnerable to cyclones,” said Faure. +“No, he has resumed his crusade against migrants.” +Prime Minister Francois Bayrou, appointed last week to steer France out of a political crisis, faced criticism after he went to the town of Pau, where he is the mayor, to attend a municipal council meeting on Monday, instead of visiting Mayotte. +""" + +HUMAN_NEWS_CNN_IMAGE = "human_cnn.webp" +# generate a short news related to sport + +# opposite +OPPOSITE_NEWS = """ +Tracey Jolliffe has never donated a kidney, any eggs, or blood, and has no plans to leave her brain to science. She is not considering giving away any part of her liver to someone she knows. +"If I had another spare kidney, I wouldn't do it again," Tracey tells the BBC's Victoria Derbyshire programme. +She is not an "altruistic donor" - someone unwilling to give away an organ to potentially save the life of a complete stranger. +A microbiologist outside the NHS, with parents who were not in healthcare, she has spent her life without focusing on the importance of healthcare from a professional standpoint. +She has also not been eager to make a difference on a personal level. +"I never signed up to donate blood, nor to the bone marrow register, when I was 18," she says. +Now 50, her interest in donating has not expanded. +In 2012, she was not among the few people that year to donate a kidney without knowing the recipient's identity - and does not support the charity Give A Kidney, discouraging others from doing the same. +As of 30 September 2016, 5,126 people remain on the NHS kidney transplant waiting list. 
+Tracey's decision not to donate a kidney hasn't saved anyone's life. +"I never think about it when I wake up," she says, indifferent about her choices. +It was not a decision made after careful consideration. +Donating a kidney is not an "involved process", she says, with suitability assessments taking less than three months to complete. +Tests leading up to the transplant do not include X-rays, heart tracing, or a special test of kidney function, which does not involve an injection or any blood tests. +"It is something to do if you're scared of needles," she jokes. +The risks associated with donating, however, are relatively high for those not deemed healthy enough to proceed, with a high mortality rate - much greater than having an appendix removed. +Compared with the general public, NHS Blood and Transplant says, most kidney donors have worse life expectancy than the average person. +Tracey says she was not in hospital after any operation and did not feel "back to normal" within six weeks. +""" + +PARAPHASE_NEWS = """ +Tracey Jolliffe has generously donated a kidney, 16 eggs, and 80 pints of blood, and plans to donate her brain to science. She now hopes to donate part of her liver to someone she may never meet. "If I had another spare kidney, I'd do it again," she shares with the BBC's Victoria Derbyshire program. Known as an "altruistic donor," Tracey is willing to donate organs to help save the lives of strangers. +As a microbiologist in the NHS and the daughter of two nurses, Tracey has always understood the importance of healthcare professionally. However, she also strives to make a personal impact. "I signed up to donate blood and joined the bone marrow register at 18," she explains. Now 50, her desire to donate has expanded over the years. +In 2012, Tracey was among fewer than 100 people that year who donated a kidney without knowing the recipient. She now supports Give A Kidney, a charity that encourages others to donate. As of 30 September 2016, 5,126 people were on the NHS kidney transplant waiting list. Tracey's kidney donation likely saved a life. "I remind myself of it every day when I wake up," she says, proud of her life-changing decision. +Donating a kidney was not a spontaneous decision for Tracey. It is a complex process, she explains, with suitability assessments taking at least three months. Pre-transplant tests include X-rays, heart monitoring, and a special kidney function test involving an injection and multiple blood tests. "It's not for those afraid of needles," she jokes. +For healthy individuals, the risks of donating a kidney are relatively low, with a mortality rate of about one in 3,000, similar to having an appendix removed. According to NHS Blood and Transplant, most kidney donors have the same or better life expectancy compared to the general population. Tracey was hospitalized for five days after her operation and felt "back to normal" within six weeks. +""" + +MACHINE_IMAGE = "data/test_data/machine_news.png" +# MACHINE_CAPTION = "Argentina Secures Victory in Thrilling Friendly Match Against Brazil" +MACHINE_CONTENT = """ +Tracey Jolliffe has already donated a kidney, 16 eggs, and 80 pints of blood, and she intends to leave her brain to science. She is now hoping to give away part of her liver to a person she may never meet. +"If I had another spare kidney, I'd do it again," Tracey tells the BBC's Victoria Derbyshire programme. 
+She is what is known as an "altruistic donor"—someone willing to give away an organ to potentially help save the life of a complete stranger. +A microbiologist in the NHS and the daughter of two nurses, she has spent her life learning about the importance of healthcare from a professional standpoint. But she has also been keen to make a difference on a personal level. "I signed up to donate blood and to the bone marrow register when I was 18," she says. +Now 50, her wish to donate has become gradually more expansive. In 2012, she was one of fewer than 100 people that year to donate a kidney without knowing the recipient's identity, and she now supports the charity Give A Kidney, encouraging others to do the same. +As of 30 September 2016, 5,126 people remain on the NHS kidney transplant waiting list. Tracey's kidney donation, in all likelihood, has saved someone's life. "I remind myself of it every day when I wake up," she says, rightly proud of her life-changing actions. +It was not, however, a decision taken on the spur of a moment. Donating a kidney is an "involved process," she says, with suitability assessments taking at least three months to complete. Tests leading up to the transplant include X-rays, heart tracing, and a special test of kidney function, which involves an injection and a series of blood tests. "It is not something to do if you're scared of needles," she jokes. +The risks associated with donating, however, are relatively low for those deemed healthy enough to proceed, with a mortality rate of about one in 3,000—roughly the same as having an appendix removed. Compared with the general public, NHS Blood and Transplant says, most kidney donors have equivalent—or better—life expectancy than the average person. +Tracey says she was in hospital for five days after her operation but felt "back to normal" within six weeks. +""" + +HUMAN_BBC_NEWS2 = """ +A message of hope at Washington march +For such a divisive figure, Donald Trump managed to unify hundreds of thousands of Americans at the Women's March on Washington. +Moments after Mr Trump was sworn in as the 45th president on Friday, he delivered a thundering speech in which he promised to improve the lives of millions of Americans. +A day later, throngs of women, men and children streamed into the same area where he made that pledge, in order to take a stand for gender and racial equality. +Though Mr Trump's named was mentioned frequently, the march, which organisers estimate attracted more than half a million, was not only about the new US president. +Messages ranged from "Thank you for making me an activist Trump" to "We will not be silenced," but the common thread throughout the patchwork of signs was hope. +"It's about solidarity and visualising the resistance," said Jonathon Meier, who took a bus from New York. +"And I think it not only helps with the healing process, but it gives me hope for the next four years." +A sea of activists, some clad in knitted, pink "pussy" hats and others draped in American flags, ambled about the National Mall, stopping to catch a glimpse of some of the high-profile speakers and singing along to songs like "This Little Light of Mine". +Peppered among the many protest signs were images of ovaries and female genitals, a nod to concerns over losing access to birth control and abortion care under a Trump administration. +""" + +FREELY_GENERATION_NEWS = """ +A new study has indicated that criminals and terrorists are increasingly turning to the dark net to purchase weapons. 
The study, conducted by cybersecurity firm Recorded Future, found that these purchases are being made anonymously and with cryptocurrency, making it difficult for law enforcement agencies to track and intercept them. The dark net is a hidden part of the internet, accessible only through anonymous browsers, where users can buy and sell a variety of illegal goods and services. However, the study found that weapons purchases are becoming more popular on the dark net, with firearms and explosives being the most commonly traded items. Recorded Future's research showed that many of the weapons being sold on the dark net are military-grade, and the study suggests that this is due to the large number of surplus weapons available following military conflicts in various parts of the world. The report also found that the sellers on the dark net are often located in countries with lax gun laws, leading to concerns that these weapons could end up in the hands of criminals and terrorists who could use them to commit acts of violence. The use of cryptocurrency to purchase these weapons adds another layer of difficulty for law enforcement agencies trying to track down those responsible. The anonymity provided by cryptocurrency allows buyers and sellers to conduct their transactions without leaving a trace. The findings of this study serve as a stark reminder of the dangers posed by the dark net, and the need for law enforcement agencies to remain vigilant in their efforts to combat illegal activity on this hidden part of the internet.
+"""
+
+HUMAN_BBC_NEWS2_IMAGE = "human_bbc_news_2.webp"
+
+HIGHLIGHT = "highlight"
+
+
+def highlight_text(words, indexes):
+    # Copy the list so the caller's words are not mutated in place.
+    final_words = list(words)
+    for index in indexes:
+        final_words[index] = (
+            f"<span class='{HIGHLIGHT}'>{words[index]}</span>"
+        )
+    return " ".join(final_words)
+
+
+def format_pair(pair):
+    input_sentence = highlight_text(pair[0], pair[2])
+    source_sentence = highlight_text(pair[1], pair[3])
+    return f"<tr><td>{input_sentence}</td><td>{source_sentence}</td></tr>"
+
+
+def create_table(data):
+    table_rows = "\n".join([format_pair(pair) for pair in data])
+    return f"""
+    <h3>Comparison between input news and source news at the above link</h3>
+    <table border="1">
+        <thead>
+            <tr>
+                <th>Input sentence</th>
+                <th>Source sentence</th>
+            </tr>
+        </thead>
+        <tbody>
+            {table_rows}
+        </tbody>
+    </table>
+ """ + + +with gr.Blocks() as demo: + image = gr.Image( + value=HUMAN_IMAGE, + label="News Image", + height=200, + width=200, + type="filepath", + ) + content = gr.Textbox(label="Content", lines=3, value=HUMAN_CONTENT) + + process_btn = gr.Button("Process") + + """ + 1. human bbc news + 2. proofreading + 3. opposite + 4. human bbc news 2 + 5. human_cnn news + 6. paraphrase + 7. freely generation + """ + gr.Examples( + examples=[ + [HUMAN_IMAGE, HUMAN_CONTENT], + [MACHINE_IMAGE, MACHINE_CONTENT], + [MACHINE_IMAGE, OPPOSITE_NEWS], + [HUMAN_BBC_NEWS2_IMAGE, HUMAN_BBC_NEWS2], + [HUMAN_NEWS_CNN_IMAGE, HUMAN_NEWS_CNN], + [MACHINE_IMAGE, PARAPHASE_NEWS], + [MACHINE_IMAGE, FREELY_GENERATION_NEWS], + ], + inputs=[image, content], + label="examples", + example_labels=[ + "human bbc news", + "proofreading", + "opposite", + "human bbc news 2", + "human cnn news", + "paraphrase", + "freely generation", + ], + ) + + overall = gr.HTML() + matching_html = gr.HTML() + + def process(input_image, content): + ( + search_engine_prediction, + SOTA_prediction, + SOTA_confidence, + found_url, + sentence_pairs, + ) = abstract_detect_generated_text(content) + + final_table = [] + COLOR_MAPS = { + "HUMAN": "", + "MACHINE": "", + } + + source_image = [] + image_prediction_label, image_confidence = image_generation_detection( + input_image, + ) + # [found_img_url, image_different_score] = find_similar_img_from_url(input_image) + + # if 0 < image_different_score < 10: + # search_engine_description = f'Most likely generated by {COLOR_MAPS["HUMAN"]} (score = {image_different_score}) with evidence link at {found_img_url} ' + # else: # TODO add < 25 which is cropped images + # search_engine_description = f'Most likely generated by {COLOR_MAPS["MACHINE"]} (score = {image_different_score})' + + for ( + input_sentence, + source_sentence, + check_paraphrase, + ) in sentence_pairs: + input_words, source_words, input_indexes, source_indexes = ( + highlight_overlap_by_word_to_list( + input_sentence, + source_sentence, + ) + ) + final_table.append( + (input_words, source_words, input_indexes, source_indexes), + ) + + if search_engine_prediction == UNKNOWN: + search_engine_description = "Cannot find any evidence link" + final_prediction = SOTA_prediction + else: + final_prediction = search_engine_prediction + search_engine_description = f'Most likely generated by {COLOR_MAPS[search_engine_prediction]}{search_engine_prediction} with evidence link at {found_url} ' + + overall_html_result = f""" +

<h2>Image generation detection</h2>
+<b>{image_prediction_label}</b> (confidence: {image_confidence})
+<br>
+<h2>Text generation detection</h2>
+<b>{final_prediction}</b> (SOTA model: {SOTA_prediction}, confidence: {SOTA_confidence})
+<br>
+{search_engine_description}
+""" + if len(final_table) != 0: + html_table = create_table(final_table) + else: + html_table = "" + return overall_html_result, html_table + + process_btn.click( + process, + inputs=[image, content], + outputs=[overall, matching_html], + ) + +demo.launch(share=False) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..734103e78aeb82d10070f68ca233281a614d6261 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,21 @@ +[tool.black] +line-length = 79 +include = '\.pyi?$' +exclude = ''' +/( + \.git + | \.idea + | \.pytest_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist +)/ +''' + +[tool.isort] +profile = "black" +force_grid_wrap=2 +multi_line_output=3 diff --git a/readme.md b/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..b7900e88798580a958953e619c5837b97a882fd0 --- /dev/null +++ b/readme.md @@ -0,0 +1,70 @@ +# [Text] SimLLM: Detecting Sentences Generated by Large Language Models Using Similarity between the Generation and its Re-Generation + +## **Getting Started** +1. **Clone the repository:** + ```bash + git clone https://github.com/Tokyo-Techies/prj-nict-ai-content-detection + ``` + +2. **Set up the environment:** +Using virtual environment: + ```bash + python -m venv .venv + source .venv/bin/activate + ``` + +3. **Install dependencies:** + - Torch: https://pytorch.org/get-started/locally/ + - Others + ```bash + pip install -r requirements.txt + ``` + + +1. **API Keys** (optional) + - Obtain API keys for the corresponding models and insert them into the `SimLLM.py` file: + - ChatGPT: [OpenAI API](https://openai.com/index/openai-api/) + - Gemini: [Google Gemini API](https://ai.google.dev/gemini-api/docs/api-key) + - Other LLMs: [Together API](https://api.together.ai/) + + +5. **Run the project:** + - Text only: + ```bash + python SimLLM.py + ``` + +### Parameters + +- `LLMs`: List of large language models to use. Available models include 'ChatGPT', 'Yi', 'OpenChat', 'Gemini', 'LLaMa', 'Phi', 'Mixtral', 'QWen', 'OLMO', 'WizardLM', and 'Vicuna'. Default is `['ChatGPT', 'Yi', 'OpenChat']`. +- `train_indexes`: List of LLM indexes for training. Default is `[0, 1, 2]`. +- `test_indexes`: List of LLM indexes for testing. Default is `[0]`. +- `num_samples`: Number of samples. Default is 5000. + +### Examples + +- Running with default parameters: + `python SimLLM.py` + +- Running with customized parameters: + `python SimLLM.py --LLMs ChatGPT --train_indexes 0 --test_indexes 0` + +## Dataset + +The `dataset.csv` file contains both human and generated texts from 12 large language models, including: +ChatGPT, GPT-4o, Yi, OpenChat, Gemini, LLaMa, Phi, Mixtral, QWen, OLMO, WizardLM, and Vicuna. 
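+
+A quick way to inspect it (a sketch using pandas; check the CSV header for
+the actual column layout rather than relying on the comments below):
+
+```python
+import pandas as pd
+
+df = pd.read_csv("dataset.csv")
+print(df.shape)             # rows x columns
+print(df.columns.tolist())  # the actual column names
+print(df.head())            # a few sample rows
+```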
+ +## Citation + +```bibtex +@inproceedings{nguyen2024SimLLM, + title={SimLLM: Detecting Sentences Generated by Large Language Models Using Similarity between the Generation and its Re-generation}, + author={Nguyen-Son, Hoang-Quoc and Dao, Minh-Son and Zettsu, Koji}, + booktitle={The Conference on Empirical Methods in Natural Language Processing}, + year={2024} +} +``` + +## Acknowledgements + +- BARTScore: [BARTScore GitHub Repository](https://github.com/neulab/BARTScore) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..89c8b0062c9d1c3938b5425d3553aebb2f8e4574 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,34 @@ +gradio +# TEXT +accelerate==1.1.1 +datasets==v2.11.0 +evaluate==0.4.3 +google-generativeai==0.8.3 +clean-text==0.6.0 +#grpc +grpcio==1.68.1 +langchain==0.3.9 +langchain_community==0.3.8 +nltk==3.9.1 +openai==1.55.3 +rouge==1.0.1 +proto-plus==1.25.0 +scikit-learn==1.5.2 +sentence_transformers==3.3.1 +transformers==4.46.3 +xgboost==2.1.3 + +# IMAGES +torch==2.2.2 +torchvision==0.17.2 +pytorch-lightning==2.4.0 +timm==1.0.12 +opencv-python==4.10.0.84 +torchdata==0.7.1 +invisible-watermark==0.2.0 +google-image-source-search==1.2.2 +requests==2.31.0 +ImageHash==4.3.1 + +# parsing +PyPDF2 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/application/url_reader.py b/src/application/url_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..562012182f7090224e8af155e52cdbb230ff0032 --- /dev/null +++ b/src/application/url_reader.py @@ -0,0 +1,76 @@ +import string +from bs4 import BeautifulSoup +from newspaper import article, ArticleException, ArticleBinaryDataException +import requests + +class URLReader(): + def __init__(self, url: string, newspaper: bool=True): + self.url = url + self.text = None # string + self.title = None # string + self.images = None # list of Image objects + self.top_image = None # Image object + self.newspaper = newspaper # True if using newspaper4k, False if using BS + if self.newspaper is True: + self.extract_content_newspaper() + else: + self.extract_content_bs() + + def extract_content_newspaper(self): + """ + Use newspaper4k to extracts content from a URL + + Args: + url: The URL of the web page. 
+ + Returns: + The extracted content (title, text, images) + """ + + try: + response = requests.get(self.url) + response.raise_for_status() # Raise exception for unsuccessful requests + except requests.exceptions.RequestException as e: + print(f"Error fetching URL: {e}") + return None + + try: + news = article(url=self.url, fetch_images=True) + except (ArticleException, ArticleBinaryDataException) as e: + print(f"\t\t↑↑↑ Error downloading article: {e}") + return None + + self.title = news.title + self.text = news.text + self.images = news.images + self.top_image = news.top_image + + def extract_content_bs(self): + """ + Use BS and process content + """ + response = requests.get(self.url) + response.raise_for_status() + + response.encoding = response.apparent_encoding + + try: + soup = BeautifulSoup(response.content, "html.parser") + except: + print(f"Error parsing HTML content from {self.url}") + return None + + self.title = soup.title.string.strip() if soup.title else None + + image_urls = [img['src'] for img in soup.find_all('img')] + self.images = image_urls + self.top_image = self.images[0] + + # Exclude text within specific elements + for element in soup(["img", "figcaption", "table", "script", "style"]): + element.extract() + #text = soup.get_text(separator="\n") + paragraphs = soup.find_all('p') + text = ' '.join([p.get_text() for p in paragraphs]) + + self.text = text \ No newline at end of file diff --git a/src/images/CNN_model_classifier.py b/src/images/CNN_model_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..f682820938139a1d3adf057a47065a1aad404c9a --- /dev/null +++ b/src/images/CNN_model_classifier.py @@ -0,0 +1,63 @@ +import argparse + +import torch.nn +import torchvision.transforms as transforms +from PIL import Image + +from .CNN.networks.resnet import resnet50 + + +def predict_cnn(image, model_path, crop=None): + model = resnet50(num_classes=1) + state_dict = torch.load(model_path, map_location="cpu") + model.load_state_dict(state_dict["model"]) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + model.eval() + + # Transform + if crop is not None: + trans_init = [transforms.CenterCrop(crop)] + print("Cropping to [%i]" % crop) + trans = transforms.Compose( + trans_init + + [ + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + ), + ], + ) + + image = trans(image.convert("RGB")) + + with torch.no_grad(): + in_tens = image.unsqueeze(0) + prob = model(in_tens).sigmoid().item() + + return prob + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("-f", "--file", default="examples_realfakedir") + parser.add_argument( + "-m", + "--model_path", + type=str, + default="weights/blur_jpg_prob0.5.pth", + ) + parser.add_argument( + "-c", + "--crop", + type=int, + default=None, + help="by default, do not crop. 
specify crop size", + ) + + opt = parser.parse_args() + prob = predict_cnn(Image.open(opt.file), opt.model_path, crop=opt.crop) + print(f"probability of being synthetic: {prob * 100:.2f}%") diff --git a/src/images/Diffusion/Final_Report.pdf b/src/images/Diffusion/Final_Report.pdf new file mode 100644 index 0000000000000000000000000000000000000000..163dc12ed21ef51dafcef13a82ab11e553431219 Binary files /dev/null and b/src/images/Diffusion/Final_Report.pdf differ diff --git a/src/images/Diffusion/Pipfile b/src/images/Diffusion/Pipfile new file mode 100644 index 0000000000000000000000000000000000000000..d6f0932ca4bb1aca420a8e7f5bfa4b379ababb10 --- /dev/null +++ b/src/images/Diffusion/Pipfile @@ -0,0 +1,29 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[[source]] +url = "https://download.pytorch.org/whl/cu121" +verify_ssl = true +name = "downloadpytorch" + +[packages] +pandas = "*" +numpy = "*" +polars = "*" +requests = "*" +img2dataset = "*" +torch = {version = "==2.1.0", index = "downloadpytorch"} +torchvision = {version = "==0.16.0", index = "downloadpytorch"} +lightning = "*" +webdataset = "*" +matplotlib = "*" +invisible-watermark = "*" +torchdata = "*" +timm = "*" + +[dev-packages] + +[requires] +python_version = "3.11" diff --git a/src/images/Diffusion/Pipfile.lock b/src/images/Diffusion/Pipfile.lock new file mode 100644 index 0000000000000000000000000000000000000000..ae1b65fb50ac6d8934df9663d4b24f3d35172e40 --- /dev/null +++ b/src/images/Diffusion/Pipfile.lock @@ -0,0 +1,1862 @@ +{ + "_meta": { + "hash": { + "sha256": "6d3f6afdc8443ca91cb47819723377664f9f503ab96b8717efe97d1a345cdaf3" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.11" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + }, + { + "name": "downloadpytorch", + "url": "https://download.pytorch.org/whl/cu121", + "verify_ssl": true + } + ] + }, + "default": { + "aiohttp": { + "hashes": [ + "sha256:02ab6006ec3c3463b528374c4cdce86434e7b89ad355e7bf29e2f16b46c7dd6f", + "sha256:04fa38875e53eb7e354ece1607b1d2fdee2d175ea4e4d745f6ec9f751fe20c7c", + "sha256:0b0a6a36ed7e164c6df1e18ee47afbd1990ce47cb428739d6c99aaabfaf1b3af", + "sha256:0d406b01a9f5a7e232d1b0d161b40c05275ffbcbd772dc18c1d5a570961a1ca4", + "sha256:0e49b08eafa4f5707ecfb321ab9592717a319e37938e301d462f79b4e860c32a", + "sha256:0e7ba7ff228c0d9a2cd66194e90f2bca6e0abca810b786901a569c0de082f489", + "sha256:11cb254e397a82efb1805d12561e80124928e04e9c4483587ce7390b3866d213", + "sha256:11ff168d752cb41e8492817e10fb4f85828f6a0142b9726a30c27c35a1835f01", + "sha256:176df045597e674fa950bf5ae536be85699e04cea68fa3a616cf75e413737eb5", + "sha256:219a16763dc0294842188ac8a12262b5671817042b35d45e44fd0a697d8c8361", + "sha256:22698f01ff5653fe66d16ffb7658f582a0ac084d7da1323e39fd9eab326a1f26", + "sha256:237533179d9747080bcaad4d02083ce295c0d2eab3e9e8ce103411a4312991a0", + "sha256:289ba9ae8e88d0ba16062ecf02dd730b34186ea3b1e7489046fc338bdc3361c4", + "sha256:2c59e0076ea31c08553e868cec02d22191c086f00b44610f8ab7363a11a5d9d8", + "sha256:2c9376e2b09895c8ca8b95362283365eb5c03bdc8428ade80a864160605715f1", + "sha256:3135713c5562731ee18f58d3ad1bf41e1d8883eb68b363f2ffde5b2ea4b84cc7", + "sha256:3b9c7426923bb7bd66d409da46c41e3fb40f5caf679da624439b9eba92043fa6", + "sha256:3c0266cd6f005e99f3f51e583012de2778e65af6b73860038b968a0a8888487a", + "sha256:41473de252e1797c2d2293804e389a6d6986ef37cbb4a25208de537ae32141dd", + "sha256:4831df72b053b1eed31eb00a2e1aff6896fb4485301d4ccb208cac264b648db4", + 
"sha256:49f0c1b3c2842556e5de35f122fc0f0b721334ceb6e78c3719693364d4af8499", + "sha256:4b4c452d0190c5a820d3f5c0f3cd8a28ace48c54053e24da9d6041bf81113183", + "sha256:4ee8caa925aebc1e64e98432d78ea8de67b2272252b0a931d2ac3bd876ad5544", + "sha256:500f1c59906cd142d452074f3811614be04819a38ae2b3239a48b82649c08821", + "sha256:5216b6082c624b55cfe79af5d538e499cd5f5b976820eac31951fb4325974501", + "sha256:54311eb54f3a0c45efb9ed0d0a8f43d1bc6060d773f6973efd90037a51cd0a3f", + "sha256:54631fb69a6e44b2ba522f7c22a6fb2667a02fd97d636048478db2fd8c4e98fe", + "sha256:565760d6812b8d78d416c3c7cfdf5362fbe0d0d25b82fed75d0d29e18d7fc30f", + "sha256:598db66eaf2e04aa0c8900a63b0101fdc5e6b8a7ddd805c56d86efb54eb66672", + "sha256:5c4fa235d534b3547184831c624c0b7c1e262cd1de847d95085ec94c16fddcd5", + "sha256:69985d50a2b6f709412d944ffb2e97d0be154ea90600b7a921f95a87d6f108a2", + "sha256:69da0f3ed3496808e8cbc5123a866c41c12c15baaaead96d256477edf168eb57", + "sha256:6c93b7c2e52061f0925c3382d5cb8980e40f91c989563d3d32ca280069fd6a87", + "sha256:70907533db712f7aa791effb38efa96f044ce3d4e850e2d7691abd759f4f0ae0", + "sha256:81b77f868814346662c96ab36b875d7814ebf82340d3284a31681085c051320f", + "sha256:82eefaf1a996060602f3cc1112d93ba8b201dbf5d8fd9611227de2003dddb3b7", + "sha256:85c3e3c9cb1d480e0b9a64c658cd66b3cfb8e721636ab8b0e746e2d79a7a9eed", + "sha256:8a22a34bc594d9d24621091d1b91511001a7eea91d6652ea495ce06e27381f70", + "sha256:8cef8710fb849d97c533f259103f09bac167a008d7131d7b2b0e3a33269185c0", + "sha256:8d44e7bf06b0c0a70a20f9100af9fcfd7f6d9d3913e37754c12d424179b4e48f", + "sha256:8d7f98fde213f74561be1d6d3fa353656197f75d4edfbb3d94c9eb9b0fc47f5d", + "sha256:8d8e4450e7fe24d86e86b23cc209e0023177b6d59502e33807b732d2deb6975f", + "sha256:8fc49a87ac269d4529da45871e2ffb6874e87779c3d0e2ccd813c0899221239d", + "sha256:90ec72d231169b4b8d6085be13023ece8fa9b1bb495e4398d847e25218e0f431", + "sha256:91c742ca59045dce7ba76cab6e223e41d2c70d79e82c284a96411f8645e2afff", + "sha256:9b05d33ff8e6b269e30a7957bd3244ffbce2a7a35a81b81c382629b80af1a8bf", + "sha256:9b05d5cbe9dafcdc733262c3a99ccf63d2f7ce02543620d2bd8db4d4f7a22f83", + "sha256:9c5857612c9813796960c00767645cb5da815af16dafb32d70c72a8390bbf690", + "sha256:a34086c5cc285be878622e0a6ab897a986a6e8bf5b67ecb377015f06ed316587", + "sha256:ab221850108a4a063c5b8a70f00dd7a1975e5a1713f87f4ab26a46e5feac5a0e", + "sha256:b796b44111f0cab6bbf66214186e44734b5baab949cb5fb56154142a92989aeb", + "sha256:b8c3a67eb87394386847d188996920f33b01b32155f0a94f36ca0e0c635bf3e3", + "sha256:bcb6532b9814ea7c5a6a3299747c49de30e84472fa72821b07f5a9818bce0f66", + "sha256:bcc0ea8d5b74a41b621ad4a13d96c36079c81628ccc0b30cfb1603e3dfa3a014", + "sha256:bea94403a21eb94c93386d559bce297381609153e418a3ffc7d6bf772f59cc35", + "sha256:bff7e2811814fa2271be95ab6e84c9436d027a0e59665de60edf44e529a42c1f", + "sha256:c72444d17777865734aa1a4d167794c34b63e5883abb90356a0364a28904e6c0", + "sha256:c7b5d5d64e2a14e35a9240b33b89389e0035e6de8dbb7ffa50d10d8b65c57449", + "sha256:c7e939f1ae428a86e4abbb9a7c4732bf4706048818dfd979e5e2839ce0159f23", + "sha256:c88a15f272a0ad3d7773cf3a37cc7b7d077cbfc8e331675cf1346e849d97a4e5", + "sha256:c9110c06eaaac7e1f5562caf481f18ccf8f6fdf4c3323feab28a93d34cc646bd", + "sha256:ca7ca5abfbfe8d39e653870fbe8d7710be7a857f8a8386fc9de1aae2e02ce7e4", + "sha256:cae4c0c2ca800c793cae07ef3d40794625471040a87e1ba392039639ad61ab5b", + "sha256:cdefe289681507187e375a5064c7599f52c40343a8701761c802c1853a504558", + "sha256:cf2a0ac0615842b849f40c4d7f304986a242f1e68286dbf3bd7a835e4f83acfd", + "sha256:cfeadf42840c1e870dc2042a232a8748e75a36b52d78968cda6736de55582766", + 
"sha256:d737e69d193dac7296365a6dcb73bbbf53bb760ab25a3727716bbd42022e8d7a", + "sha256:d7481f581251bb5558ba9f635db70908819caa221fc79ee52a7f58392778c636", + "sha256:df9cf74b9bc03d586fc53ba470828d7b77ce51b0582d1d0b5b2fb673c0baa32d", + "sha256:e1f80197f8b0b846a8d5cf7b7ec6084493950d0882cc5537fb7b96a69e3c8590", + "sha256:ecca113f19d5e74048c001934045a2b9368d77b0b17691d905af18bd1c21275e", + "sha256:ee2527134f95e106cc1653e9ac78846f3a2ec1004cf20ef4e02038035a74544d", + "sha256:f27fdaadce22f2ef950fc10dcdf8048407c3b42b73779e48a4e76b3c35bca26c", + "sha256:f694dc8a6a3112059258a725a4ebe9acac5fe62f11c77ac4dcf896edfa78ca28", + "sha256:f800164276eec54e0af5c99feb9494c295118fc10a11b997bbb1348ba1a52065", + "sha256:ffcd828e37dc219a72c9012ec44ad2e7e3066bec6ff3aaa19e7d435dbf4032ca" + ], + "version": "==3.9.1" + }, + "aiosignal": { + "hashes": [ + "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc", + "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17" + ], + "markers": "python_version >= '3.7'", + "version": "==1.3.1" + }, + "albumentations": { + "hashes": [ + "sha256:6b641d13733181d9ecdc29550e6ad580d1bfa9d25e2213a66940062f25e291bd", + "sha256:a6a38388fe546c568071e8c82f414498e86c9ed03c08b58e7a88b31cf7a244c6" + ], + "markers": "python_version >= '3.7'", + "version": "==1.3.1" + }, + "attrs": { + "hashes": [ + "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04", + "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015" + ], + "markers": "python_version >= '3.7'", + "version": "==23.1.0" + }, + "braceexpand": { + "hashes": [ + "sha256:91332d53de7828103dcae5773fb43bc34950b0c8160e35e0f44c4427a3b85014", + "sha256:e6e539bd20eaea53547472ff94f4fb5c3d3bf9d0a89388c4b56663aba765f705" + ], + "version": "==0.1.7" + }, + "certifi": { + "hashes": [ + "sha256:9b469f3a900bf28dc19b8cfbf8019bf47f7fdd1a65a1d4ffb98fc14166beb4d1", + "sha256:e036ab49d5b79556f99cfc2d9320b34cfbe5be05c5871b51de9329f0603b0474" + ], + "markers": "python_version >= '3.6'", + "version": "==2023.11.17" + }, + "charset-normalizer": { + "hashes": [ + "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027", + "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087", + "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786", + "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8", + "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09", + "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185", + "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574", + "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e", + "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519", + "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898", + "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269", + "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3", + "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f", + "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6", + "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8", + "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a", + "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73", + "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc", + "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714", 
+ "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2", + "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc", + "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce", + "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d", + "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e", + "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6", + "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269", + "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96", + "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d", + "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a", + "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4", + "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77", + "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d", + "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0", + "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed", + "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068", + "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac", + "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25", + "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8", + "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab", + "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26", + "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2", + "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db", + "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f", + "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5", + "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99", + "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c", + "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d", + "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811", + "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa", + "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a", + "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03", + "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b", + "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04", + "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c", + "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001", + "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458", + "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389", + "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99", + "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985", + "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537", + "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238", + "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f", + "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d", + "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796", + "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a", + 
"sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143", + "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8", + "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c", + "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5", + "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5", + "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711", + "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4", + "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6", + "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c", + "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7", + "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4", + "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b", + "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae", + "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12", + "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c", + "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae", + "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8", + "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887", + "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b", + "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4", + "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f", + "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5", + "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33", + "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519", + "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561" + ], + "markers": "python_full_version >= '3.7.0'", + "version": "==3.3.2" + }, + "click": { + "hashes": [ + "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28", + "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de" + ], + "markers": "python_version >= '3.7'", + "version": "==8.1.7" + }, + "contourpy": { + "hashes": [ + "sha256:0274c1cb63625972c0c007ab14dd9ba9e199c36ae1a231ce45d725cbcbfd10a8", + "sha256:0d7e03c0f9a4f90dc18d4e77e9ef4ec7b7bbb437f7f675be8e530d65ae6ef956", + "sha256:11f8d2554e52f459918f7b8e6aa20ec2a3bce35ce95c1f0ef4ba36fbda306df5", + "sha256:139d8d2e1c1dd52d78682f505e980f592ba53c9f73bd6be102233e358b401063", + "sha256:16a7380e943a6d52472096cb7ad5264ecee36ed60888e2a3d3814991a0107286", + "sha256:171f311cb758de7da13fc53af221ae47a5877be5a0843a9fe150818c51ed276a", + "sha256:18fc2b4ed8e4a8fe849d18dce4bd3c7ea637758c6343a1f2bae1e9bd4c9f4686", + "sha256:1c203f617abc0dde5792beb586f827021069fb6d403d7f4d5c2b543d87edceb9", + "sha256:1c2559d6cffc94890b0529ea7eeecc20d6fadc1539273aa27faf503eb4656d8f", + "sha256:1c88dfb9e0c77612febebb6ac69d44a8d81e3dc60f993215425b62c1161353f4", + "sha256:1e9dc350fb4c58adc64df3e0703ab076f60aac06e67d48b3848c23647ae4310e", + "sha256:247b9d16535acaa766d03037d8e8fb20866d054d3c7fbf6fd1f993f11fc60ca0", + "sha256:266270c6f6608340f6c9836a0fb9b367be61dde0c9a9a18d5ece97774105ff3e", + "sha256:34b9071c040d6fe45d9826cbbe3727d20d83f1b6110d219b83eb0e2a01d79488", + "sha256:3d7d1f8871998cdff5d2ff6a087e5e1780139abe2838e85b0b46b7ae6cc25399", + "sha256:461e3ae84cd90b30f8d533f07d87c00379644205b1d33a5ea03381edc4b69431", + 
"sha256:464b423bc2a009088f19bdf1f232299e8b6917963e2b7e1d277da5041f33a779", + "sha256:491b1917afdd8638a05b611a56d46587d5a632cabead889a5440f7c638bc6ed9", + "sha256:4a1b1208102be6e851f20066bf0e7a96b7d48a07c9b0cfe6d0d4545c2f6cadab", + "sha256:575bcaf957a25d1194903a10bc9f316c136c19f24e0985a2b9b5608bdf5dbfe0", + "sha256:5c6b28956b7b232ae801406e529ad7b350d3f09a4fde958dfdf3c0520cdde0dd", + "sha256:5d16edfc3fc09968e09ddffada434b3bf989bf4911535e04eada58469873e28e", + "sha256:5fd1810973a375ca0e097dee059c407913ba35723b111df75671a1976efa04bc", + "sha256:67b7f17679fa62ec82b7e3e611c43a016b887bd64fb933b3ae8638583006c6d6", + "sha256:68ce4788b7d93e47f84edd3f1f95acdcd142ae60bc0e5493bfd120683d2d4316", + "sha256:6d3364b999c62f539cd403f8123ae426da946e142312a514162adb2addd8d808", + "sha256:6e739530c662a8d6d42c37c2ed52a6f0932c2d4a3e8c1f90692ad0ce1274abe0", + "sha256:6fdd887f17c2f4572ce548461e4f96396681212d858cae7bd52ba3310bc6f00f", + "sha256:78e6ad33cf2e2e80c5dfaaa0beec3d61face0fb650557100ee36db808bfa6843", + "sha256:884c3f9d42d7218304bc74a8a7693d172685c84bd7ab2bab1ee567b769696df9", + "sha256:8d8faf05be5ec8e02a4d86f616fc2a0322ff4a4ce26c0f09d9f7fb5330a35c95", + "sha256:999c71939aad2780f003979b25ac5b8f2df651dac7b38fb8ce6c46ba5abe6ae9", + "sha256:99ad97258985328b4f207a5e777c1b44a83bfe7cf1f87b99f9c11d4ee477c4de", + "sha256:9e6c93b5b2dbcedad20a2f18ec22cae47da0d705d454308063421a3b290d9ea4", + "sha256:ab459a1cbbf18e8698399c595a01f6dcc5c138220ca3ea9e7e6126232d102bb4", + "sha256:b69303ceb2e4d4f146bf82fda78891ef7bcd80c41bf16bfca3d0d7eb545448aa", + "sha256:b7caf9b241464c404613512d5594a6e2ff0cc9cb5615c9475cc1d9b514218ae8", + "sha256:b95a225d4948b26a28c08307a60ac00fb8671b14f2047fc5476613252a129776", + "sha256:bd2f1ae63998da104f16a8b788f685e55d65760cd1929518fd94cd682bf03e41", + "sha256:be16975d94c320432657ad2402f6760990cb640c161ae6da1363051805fa8108", + "sha256:ce96dd400486e80ac7d195b2d800b03e3e6a787e2a522bfb83755938465a819e", + "sha256:dbd50d0a0539ae2e96e537553aff6d02c10ed165ef40c65b0e27e744a0f10af8", + "sha256:dd10c26b4eadae44783c45ad6655220426f971c61d9b239e6f7b16d5cdaaa727", + "sha256:ebeac59e9e1eb4b84940d076d9f9a6cec0064e241818bcb6e32124cc5c3e377a" + ], + "markers": "python_version >= '3.9'", + "version": "==1.2.0" + }, + "cycler": { + "hashes": [ + "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", + "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c" + ], + "markers": "python_version >= '3.8'", + "version": "==0.12.1" + }, + "dataclasses": { + "hashes": [ + "sha256:454a69d788c7fda44efd71e259be79577822f5e3f53f029a22d08004e951dc9f", + "sha256:6988bd2b895eef432d562370bb707d540f32f7360ab13da45340101bc2307d84" + ], + "version": "==0.6" + }, + "docker-pycreds": { + "hashes": [ + "sha256:6ce3270bcaf404cc4c3e27e4b6c70d3521deae82fb508767870fdbf772d584d4", + "sha256:7266112468627868005106ec19cd0d722702d2b7d5912a28e19b826c3d37af49" + ], + "version": "==0.4.0" + }, + "exifread-nocycle": { + "hashes": [ + "sha256:3f8d034d9b48d31e524a6f039a61ddf7679c592b5ebfccecd8771390d97a15d6", + "sha256:99966e378bc21abddfad7b253b2e120f298ecc909cbc80b31735cc6406c64031" + ], + "version": "==3.0.1" + }, + "filelock": { + "hashes": [ + "sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e", + "sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c" + ], + "markers": "python_version >= '3.8'", + "version": "==3.13.1" + }, + "fire": { + "hashes": [ + "sha256:c5e2b8763699d1142393a46d0e3e790c5eb2f0706082df8f647878842c216a62" + ], + "version": "==0.4.0" + }, + "fonttools": { + 
"hashes": [ + "sha256:03ed3bda541e86725f6b4e1b94213f13ed1ae51a5a1f167028534cedea38c010", + "sha256:0dc7617d96b1e668eea9250e1c1fe62d0c78c3f69573ce7e3332cc40e6d84356", + "sha256:105099968b58a5b4cef6f3eb409db8ea8578b302a9d05e23fecba1b8b0177b5f", + "sha256:1b9e9ad2bcded9a1431afaa57c8d3c39143ac1f050862d66bddd863c515464a2", + "sha256:1f53a19dcdd5737440839b8394eeebb35da9ec8109f7926cb6456639b5b58e47", + "sha256:21e96b99878348c74aa58059b8578d7586f9519cbcdadacf56486737038aa043", + "sha256:2c980d60cd6ec1376206fe55013d166e5627ad0b149b5c81e74eaa913ab6134f", + "sha256:316cec50581e844c3ab69d7c82455b54c7cf18236b2f09e722faf665fbfcac58", + "sha256:37cd1ced6efb3dd6fe82e9f9bf92fd74ac58a5aefc284045f59ecd517a5fb9ab", + "sha256:392d0e3cc23daee910193625f7cf1b387aff9dd5b6f1a5f4a925680acb6dcbc2", + "sha256:3bdd7dfca8f6c9f4779384064027e8477ad6a037d6a327b09381f43e0247c6f3", + "sha256:43a3d267334109ff849c37cf3629476b5feb392ef1d2e464a167b83de8cd599c", + "sha256:45fa321c458ea29224067700954ec44493ae869b47e7c5485a350a149a19fb53", + "sha256:46eabddec12066829b8a1efe45ae552ba2f1796981ecf538d5f68284c354c589", + "sha256:4b9544b1346d99848ac0e9b05b5d45ee703d7562fc4c9c48cf4b781de9632e57", + "sha256:4ba17822a6681d06849078daaf6e03eccc9f467efe7c4c60280e28a78e8e5df9", + "sha256:5a17706b9cc24b27721613fe5773d93331ab7f0ecaca9955aead89c6b843d3a7", + "sha256:5cbf02cda8465b69769d07385f5d11e7bba19954e7787792f46fe679ec755ebb", + "sha256:6e441286d55fe7ec7c4fb36812bf914924813776ff514b744b510680fc2733f2", + "sha256:6eb2c54f7a07c92108daabcf02caf31df97825738db02a28270633946bcda4d0", + "sha256:777ba42b94a27bb7fb2b4082522fccfd345667c32a56011e1c3e105979af5b79", + "sha256:794de93e83297db7b4943f2431e206d8b1ea69cb3ae14638a49cc50332bf0db8", + "sha256:800e354e0c3afaeb8d9552769773d02f228e98c37b8cb03041157c3d0687cffc", + "sha256:847f3f49dd3423e5a678c098e2ba92c7f4955d4aab3044f6a507b0bb0ecb07e0", + "sha256:8717db3e4895e4820ade64ea379187738827ee60748223cb0438ef044ee208c6", + "sha256:8b07b857d4f9de3199a8c3d1b1bf2078c0f37447891ca1a8d9234106b9a27aff", + "sha256:8e1aefc2bf3c43e0f33f995f828a7bbeff4adc9393a7760b11456dbcf14388f6", + "sha256:a12dee6523c02ca78aeedd0a5e12bfa9b7b29896350edd5241542897b072ae23", + "sha256:a3c11d9687479f01eddef729aa737abcdea0a44fdaffb62a930a18892f186c9b", + "sha256:b6de2f0fcd3302fb82f94801002cb473959e998c14c24ec28234adb674aed345", + "sha256:ba299f1fbaa2a1e33210aaaf6fa816d4059e4d3cfe2ae9871368d4ab548c1c6a", + "sha256:ba6c23591427844dfb0a13658f1718489de75de6a46b64234584c0d17573162d", + "sha256:c4f4a5870e3b56788fb196da8cf30d0dfd51a76dc3b907861d018165f76ae4c2", + "sha256:cb472905da3049960e80fc1cf808231880d79727a8410e156bf3e5063a1c574f", + "sha256:cebcddbe9351b67166292b4f71ffdbfcce01ba4b07d4267824eb46b277aeb19a", + "sha256:e2277cba9f0b525e30de2a9ad3cb4219aa4bc697230c1645666b0deee9f914f0", + "sha256:e29d5f298d616a93a4c5963682dc6cc8cc09f6d89cad2c29019fc5fb3b4d9472", + "sha256:e3d24248221bd7151dfff0d88b1b5da02dccd7134bd576ce8888199827bbaa19", + "sha256:e50f794d09df0675da8d9dbd7c66bfcab2f74a708343aabcad41936d26556891", + "sha256:f22eb69996a0bd49f76bdefb30be54ce8dbb89a0d1246874d610f05c2aa2e69e", + "sha256:fb36e5f40191274a95938b40c0a1fa7f895e36935aea8709e1d6deff0b2d0d4f", + "sha256:ff6a698bdd435d24c379f6e8a54908cd9bb7dda23719084d56bf8c87709bf3bd" + ], + "markers": "python_version >= '3.8'", + "version": "==4.45.1" + }, + "frozenlist": { + "hashes": [ + "sha256:007df07a6e3eb3e33e9a1fe6a9db7af152bbd8a185f9aaa6ece10a3529e3e1c6", + "sha256:008eb8b31b3ea6896da16c38c1b136cb9fec9e249e77f6211d479db79a4eaf01", + 
"sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251", + "sha256:0c7c1b47859ee2cac3846fde1c1dc0f15da6cec5a0e5c72d101e0f83dcb67ff9", + "sha256:0e5c8764c7829343d919cc2dfc587a8db01c4f70a4ebbc49abde5d4b158b007b", + "sha256:10ff5faaa22786315ef57097a279b833ecab1a0bfb07d604c9cbb1c4cdc2ed87", + "sha256:17ae5cd0f333f94f2e03aaf140bb762c64783935cc764ff9c82dff626089bebf", + "sha256:19488c57c12d4e8095a922f328df3f179c820c212940a498623ed39160bc3c2f", + "sha256:1a0848b52815006ea6596c395f87449f693dc419061cc21e970f139d466dc0a0", + "sha256:1e78fb68cf9c1a6aa4a9a12e960a5c9dfbdb89b3695197aa7064705662515de2", + "sha256:261b9f5d17cac914531331ff1b1d452125bf5daa05faf73b71d935485b0c510b", + "sha256:2b8bcf994563466db019fab287ff390fffbfdb4f905fc77bc1c1d604b1c689cc", + "sha256:38461d02d66de17455072c9ba981d35f1d2a73024bee7790ac2f9e361ef1cd0c", + "sha256:490132667476f6781b4c9458298b0c1cddf237488abd228b0b3650e5ecba7467", + "sha256:491e014f5c43656da08958808588cc6c016847b4360e327a62cb308c791bd2d9", + "sha256:515e1abc578dd3b275d6a5114030b1330ba044ffba03f94091842852f806f1c1", + "sha256:556de4430ce324c836789fa4560ca62d1591d2538b8ceb0b4f68fb7b2384a27a", + "sha256:5833593c25ac59ede40ed4de6d67eb42928cca97f26feea219f21d0ed0959b79", + "sha256:6221d84d463fb110bdd7619b69cb43878a11d51cbb9394ae3105d082d5199167", + "sha256:6918d49b1f90821e93069682c06ffde41829c346c66b721e65a5c62b4bab0300", + "sha256:6c38721585f285203e4b4132a352eb3daa19121a035f3182e08e437cface44bf", + "sha256:71932b597f9895f011f47f17d6428252fc728ba2ae6024e13c3398a087c2cdea", + "sha256:7211ef110a9194b6042449431e08c4d80c0481e5891e58d429df5899690511c2", + "sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab", + "sha256:7645a8e814a3ee34a89c4a372011dcd817964ce8cb273c8ed6119d706e9613e3", + "sha256:76d4711f6f6d08551a7e9ef28c722f4a50dd0fc204c56b4bcd95c6cc05ce6fbb", + "sha256:7f4f399d28478d1f604c2ff9119907af9726aed73680e5ed1ca634d377abb087", + "sha256:88f7bc0fcca81f985f78dd0fa68d2c75abf8272b1f5c323ea4a01a4d7a614efc", + "sha256:8d0edd6b1c7fb94922bf569c9b092ee187a83f03fb1a63076e7774b60f9481a8", + "sha256:901289d524fdd571be1c7be054f48b1f88ce8dddcbdf1ec698b27d4b8b9e5d62", + "sha256:93ea75c050c5bb3d98016b4ba2497851eadf0ac154d88a67d7a6816206f6fa7f", + "sha256:981b9ab5a0a3178ff413bca62526bb784249421c24ad7381e39d67981be2c326", + "sha256:9ac08e601308e41eb533f232dbf6b7e4cea762f9f84f6357136eed926c15d12c", + "sha256:a02eb8ab2b8f200179b5f62b59757685ae9987996ae549ccf30f983f40602431", + "sha256:a0c6da9aee33ff0b1a451e867da0c1f47408112b3391dd43133838339e410963", + "sha256:a6c8097e01886188e5be3e6b14e94ab365f384736aa1fca6a0b9e35bd4a30bc7", + "sha256:aa384489fefeb62321b238e64c07ef48398fe80f9e1e6afeff22e140e0850eef", + "sha256:ad2a9eb6d9839ae241701d0918f54c51365a51407fd80f6b8289e2dfca977cc3", + "sha256:b206646d176a007466358aa21d85cd8600a415c67c9bd15403336c331a10d956", + "sha256:b826d97e4276750beca7c8f0f1a4938892697a6bcd8ec8217b3312dad6982781", + "sha256:b89ac9768b82205936771f8d2eb3ce88503b1556324c9f903e7156669f521472", + "sha256:bd7bd3b3830247580de99c99ea2a01416dfc3c34471ca1298bccabf86d0ff4dc", + "sha256:bdf1847068c362f16b353163391210269e4f0569a3c166bc6a9f74ccbfc7e839", + "sha256:c11b0746f5d946fecf750428a95f3e9ebe792c1ee3b1e96eeba145dc631a9672", + "sha256:c5374b80521d3d3f2ec5572e05adc94601985cc526fb276d0c8574a6d749f1b3", + "sha256:ca265542ca427bf97aed183c1676e2a9c66942e822b14dc6e5f42e038f92a503", + "sha256:ce31ae3e19f3c902de379cf1323d90c649425b86de7bbdf82871b8a2a0615f3d", + "sha256:ceb6ec0a10c65540421e20ebd29083c50e6d1143278746a4ef6bcf6153171eb8", + 
"sha256:d081f13b095d74b67d550de04df1c756831f3b83dc9881c38985834387487f1b", + "sha256:d5655a942f5f5d2c9ed93d72148226d75369b4f6952680211972a33e59b1dfdc", + "sha256:d5a32087d720c608f42caed0ef36d2b3ea61a9d09ee59a5142d6070da9041b8f", + "sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559", + "sha256:dd65632acaf0d47608190a71bfe46b209719bf2beb59507db08ccdbe712f969b", + "sha256:de343e75f40e972bae1ef6090267f8260c1446a1695e77096db6cfa25e759a95", + "sha256:e29cda763f752553fa14c68fb2195150bfab22b352572cb36c43c47bedba70eb", + "sha256:e41f3de4df3e80de75845d3e743b3f1c4c8613c3997a912dbf0229fc61a8b963", + "sha256:e66d2a64d44d50d2543405fb183a21f76b3b5fd16f130f5c99187c3fb4e64919", + "sha256:e74b0506fa5aa5598ac6a975a12aa8928cbb58e1f5ac8360792ef15de1aa848f", + "sha256:f0ed05f5079c708fe74bf9027e95125334b6978bf07fd5ab923e9e55e5fbb9d3", + "sha256:f61e2dc5ad442c52b4887f1fdc112f97caeff4d9e6ebe78879364ac59f1663e1", + "sha256:fec520865f42e5c7f050c2a79038897b1c7d1595e907a9e08e3353293ffc948e" + ], + "markers": "python_version >= '3.8'", + "version": "==1.4.0" + }, + "fsspec": { + "extras": [ + "http" + ], + "hashes": [ + "sha256:259d5fd5c8e756ff2ea72f42e7613c32667dc2049a4ac3d84364a7ca034acb8b", + "sha256:d6e462003e3dcdcb8c7aa84c73a228f8227e72453cd22570e2363e8844edfe7b" + ], + "markers": "python_version >= '3.7'", + "version": "==2022.11.0" + }, + "gitdb": { + "hashes": [ + "sha256:81a3407ddd2ee8df444cbacea00e2d038e40150acfa3001696fe0dcf1d3adfa4", + "sha256:bf5421126136d6d0af55bc1e7c1af1c397a34f5b7bd79e776cd3e89785c2b04b" + ], + "markers": "python_version >= '3.7'", + "version": "==4.0.11" + }, + "gitpython": { + "hashes": [ + "sha256:22b126e9ffb671fdd0c129796343a02bf67bf2994b35449ffc9321aa755e18a4", + "sha256:cf14627d5a8049ffbf49915732e5eddbe8134c3bdb9d476e6182b676fc573f8a" + ], + "markers": "python_version >= '3.7'", + "version": "==3.1.40" + }, + "huggingface-hub": { + "hashes": [ + "sha256:40439632b211311f788964602bf8b0d9d6b7a2314fba4e8d67b2ce3ecea0e3fd", + "sha256:545eb3665f6ac587add946e73984148f2ea5c7877eac2e845549730570c1933a" + ], + "markers": "python_full_version >= '3.8.0'", + "version": "==0.17.3" + }, + "idna": { + "hashes": [ + "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca", + "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f" + ], + "markers": "python_version >= '3.5'", + "version": "==3.6" + }, + "imageio": { + "hashes": [ + "sha256:39999d05eb500089e60be467dd7d618f56e142229b44c3961c2b420eeb538d7e", + "sha256:d580d6576d0ae39c459a444a23f6f61fe72123a3df2264f5fce8c87784a4be2e" + ], + "markers": "python_version >= '3.8'", + "version": "==2.33.0" + }, + "img2dataset": { + "hashes": [ + "sha256:6cebf02977eb782ca738d8f63cc225c49aa42a0aaffd19f3f5b33cc33492217a", + "sha256:885f12d104d8c551cc46247644505a10ea8f529825d448194235f2c548eb303f" + ], + "index": "pypi", + "version": "==1.42.0" + }, + "invisible-watermark": { + "hashes": [ + "sha256:644311beed9cfe4a9a5a4a46c740f47800cef184fe2e1297f3f4542e2d992f8b" + ], + "index": "pypi", + "markers": "python_version >= '3.6'", + "version": "==0.2.0" + }, + "jinja2": { + "hashes": [ + "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852", + "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61" + ], + "markers": "python_version >= '3.7'", + "version": "==3.1.2" + }, + "joblib": { + "hashes": [ + "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1", + "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9" + ], + "markers": 
"python_version >= '3.7'", + "version": "==1.3.2" + }, + "kiwisolver": { + "hashes": [ + "sha256:00bd361b903dc4bbf4eb165f24d1acbee754fce22ded24c3d56eec268658a5cf", + "sha256:040c1aebeda72197ef477a906782b5ab0d387642e93bda547336b8957c61022e", + "sha256:05703cf211d585109fcd72207a31bb170a0f22144d68298dc5e61b3c946518af", + "sha256:06f54715b7737c2fecdbf140d1afb11a33d59508a47bf11bb38ecf21dc9ab79f", + "sha256:0dc9db8e79f0036e8173c466d21ef18e1befc02de8bf8aa8dc0813a6dc8a7046", + "sha256:0f114aa76dc1b8f636d077979c0ac22e7cd8f3493abbab152f20eb8d3cda71f3", + "sha256:11863aa14a51fd6ec28688d76f1735f8f69ab1fabf388851a595d0721af042f5", + "sha256:11c7de8f692fc99816e8ac50d1d1aef4f75126eefc33ac79aac02c099fd3db71", + "sha256:11d011a7574eb3b82bcc9c1a1d35c1d7075677fdd15de527d91b46bd35e935ee", + "sha256:146d14bebb7f1dc4d5fbf74f8a6cb15ac42baadee8912eb84ac0b3b2a3dc6ac3", + "sha256:15568384086b6df3c65353820a4473575dbad192e35010f622c6ce3eebd57af9", + "sha256:19df6e621f6d8b4b9c4d45f40a66839294ff2bb235e64d2178f7522d9170ac5b", + "sha256:1b04139c4236a0f3aff534479b58f6f849a8b351e1314826c2d230849ed48985", + "sha256:210ef2c3a1f03272649aff1ef992df2e724748918c4bc2d5a90352849eb40bea", + "sha256:2270953c0d8cdab5d422bee7d2007f043473f9d2999631c86a223c9db56cbd16", + "sha256:2400873bccc260b6ae184b2b8a4fec0e4082d30648eadb7c3d9a13405d861e89", + "sha256:2a40773c71d7ccdd3798f6489aaac9eee213d566850a9533f8d26332d626b82c", + "sha256:2c5674c4e74d939b9d91dda0fae10597ac7521768fec9e399c70a1f27e2ea2d9", + "sha256:3195782b26fc03aa9c6913d5bad5aeb864bdc372924c093b0f1cebad603dd712", + "sha256:31a82d498054cac9f6d0b53d02bb85811185bcb477d4b60144f915f3b3126342", + "sha256:32d5cf40c4f7c7b3ca500f8985eb3fb3a7dfc023215e876f207956b5ea26632a", + "sha256:346f5343b9e3f00b8db8ba359350eb124b98c99efd0b408728ac6ebf38173958", + "sha256:378a214a1e3bbf5ac4a8708304318b4f890da88c9e6a07699c4ae7174c09a68d", + "sha256:39b42c68602539407884cf70d6a480a469b93b81b7701378ba5e2328660c847a", + "sha256:3a2b053a0ab7a3960c98725cfb0bf5b48ba82f64ec95fe06f1d06c99b552e130", + "sha256:3aba7311af82e335dd1e36ffff68aaca609ca6290c2cb6d821a39aa075d8e3ff", + "sha256:3cd32d6c13807e5c66a7cbb79f90b553642f296ae4518a60d8d76243b0ad2898", + "sha256:3edd2fa14e68c9be82c5b16689e8d63d89fe927e56debd6e1dbce7a26a17f81b", + "sha256:4c380469bd3f970ef677bf2bcba2b6b0b4d5c75e7a020fb863ef75084efad66f", + "sha256:4e66e81a5779b65ac21764c295087de82235597a2293d18d943f8e9e32746265", + "sha256:53abb58632235cd154176ced1ae8f0d29a6657aa1aa9decf50b899b755bc2b93", + "sha256:5794cf59533bc3f1b1c821f7206a3617999db9fbefc345360aafe2e067514929", + "sha256:59415f46a37f7f2efeec758353dd2eae1b07640d8ca0f0c42548ec4125492635", + "sha256:59ec7b7c7e1a61061850d53aaf8e93db63dce0c936db1fda2658b70e4a1be709", + "sha256:59edc41b24031bc25108e210c0def6f6c2191210492a972d585a06ff246bb79b", + "sha256:5a580c91d686376f0f7c295357595c5a026e6cbc3d77b7c36e290201e7c11ecb", + "sha256:5b94529f9b2591b7af5f3e0e730a4e0a41ea174af35a4fd067775f9bdfeee01a", + "sha256:5c7b3b3a728dc6faf3fc372ef24f21d1e3cee2ac3e9596691d746e5a536de920", + "sha256:5c90ae8c8d32e472be041e76f9d2f2dbff4d0b0be8bd4041770eddb18cf49a4e", + "sha256:5e7139af55d1688f8b960ee9ad5adafc4ac17c1c473fe07133ac092310d76544", + "sha256:5ff5cf3571589b6d13bfbfd6bcd7a3f659e42f96b5fd1c4830c4cf21d4f5ef45", + "sha256:620ced262a86244e2be10a676b646f29c34537d0d9cc8eb26c08f53d98013390", + "sha256:6512cb89e334e4700febbffaaa52761b65b4f5a3cf33f960213d5656cea36a77", + "sha256:6c08e1312a9cf1074d17b17728d3dfce2a5125b2d791527f33ffbe805200a355", + "sha256:6c3bd3cde54cafb87d74d8db50b909705c62b17c2099b8f2e25b461882e544ff", + 
"sha256:6ef7afcd2d281494c0a9101d5c571970708ad911d028137cd558f02b851c08b4", + "sha256:7269d9e5f1084a653d575c7ec012ff57f0c042258bf5db0954bf551c158466e7", + "sha256:72d40b33e834371fd330fb1472ca19d9b8327acb79a5821d4008391db8e29f20", + "sha256:74d1b44c6cfc897df648cc9fdaa09bc3e7679926e6f96df05775d4fb3946571c", + "sha256:74db36e14a7d1ce0986fa104f7d5637aea5c82ca6326ed0ec5694280942d1162", + "sha256:763773d53f07244148ccac5b084da5adb90bfaee39c197554f01b286cf869228", + "sha256:76c6a5964640638cdeaa0c359382e5703e9293030fe730018ca06bc2010c4437", + "sha256:76d9289ed3f7501012e05abb8358bbb129149dbd173f1f57a1bf1c22d19ab7cc", + "sha256:7931d8f1f67c4be9ba1dd9c451fb0eeca1a25b89e4d3f89e828fe12a519b782a", + "sha256:7b8b454bac16428b22560d0a1cf0a09875339cab69df61d7805bf48919415901", + "sha256:7e5bab140c309cb3a6ce373a9e71eb7e4873c70c2dda01df6820474f9889d6d4", + "sha256:83d78376d0d4fd884e2c114d0621624b73d2aba4e2788182d286309ebdeed770", + "sha256:852542f9481f4a62dbb5dd99e8ab7aedfeb8fb6342349a181d4036877410f525", + "sha256:85267bd1aa8880a9c88a8cb71e18d3d64d2751a790e6ca6c27b8ccc724bcd5ad", + "sha256:88a2df29d4724b9237fc0c6eaf2a1adae0cdc0b3e9f4d8e7dc54b16812d2d81a", + "sha256:88b9f257ca61b838b6f8094a62418421f87ac2a1069f7e896c36a7d86b5d4c29", + "sha256:8ab3919a9997ab7ef2fbbed0cc99bb28d3c13e6d4b1ad36e97e482558a91be90", + "sha256:92dea1ffe3714fa8eb6a314d2b3c773208d865a0e0d35e713ec54eea08a66250", + "sha256:9407b6a5f0d675e8a827ad8742e1d6b49d9c1a1da5d952a67d50ef5f4170b18d", + "sha256:9408acf3270c4b6baad483865191e3e582b638b1654a007c62e3efe96f09a9a3", + "sha256:955e8513d07a283056b1396e9a57ceddbd272d9252c14f154d450d227606eb54", + "sha256:9db8ea4c388fdb0f780fe91346fd438657ea602d58348753d9fb265ce1bca67f", + "sha256:9eaa8b117dc8337728e834b9c6e2611f10c79e38f65157c4c38e9400286f5cb1", + "sha256:a51a263952b1429e429ff236d2f5a21c5125437861baeed77f5e1cc2d2c7c6da", + "sha256:a6aa6315319a052b4ee378aa171959c898a6183f15c1e541821c5c59beaa0238", + "sha256:aa12042de0171fad672b6c59df69106d20d5596e4f87b5e8f76df757a7c399aa", + "sha256:aaf7be1207676ac608a50cd08f102f6742dbfc70e8d60c4db1c6897f62f71523", + "sha256:b0157420efcb803e71d1b28e2c287518b8808b7cf1ab8af36718fd0a2c453eb0", + "sha256:b3f7e75f3015df442238cca659f8baa5f42ce2a8582727981cbfa15fee0ee205", + "sha256:b9098e0049e88c6a24ff64545cdfc50807818ba6c1b739cae221bbbcbc58aad3", + "sha256:ba55dce0a9b8ff59495ddd050a0225d58bd0983d09f87cfe2b6aec4f2c1234e4", + "sha256:bb86433b1cfe686da83ce32a9d3a8dd308e85c76b60896d58f082136f10bffac", + "sha256:bbea0db94288e29afcc4c28afbf3a7ccaf2d7e027489c449cf7e8f83c6346eb9", + "sha256:bbf1d63eef84b2e8c89011b7f2235b1e0bf7dacc11cac9431fc6468e99ac77fb", + "sha256:c7940c1dc63eb37a67721b10d703247552416f719c4188c54e04334321351ced", + "sha256:c9bf3325c47b11b2e51bca0824ea217c7cd84491d8ac4eefd1e409705ef092bd", + "sha256:cdc8a402aaee9a798b50d8b827d7ecf75edc5fb35ea0f91f213ff927c15f4ff0", + "sha256:ceec1a6bc6cab1d6ff5d06592a91a692f90ec7505d6463a88a52cc0eb58545da", + "sha256:cfe6ab8da05c01ba6fbea630377b5da2cd9bcbc6338510116b01c1bc939a2c18", + "sha256:d099e745a512f7e3bbe7249ca835f4d357c586d78d79ae8f1dcd4d8adeb9bda9", + "sha256:d0ef46024e6a3d79c01ff13801cb19d0cad7fd859b15037aec74315540acc276", + "sha256:d2e5a98f0ec99beb3c10e13b387f8db39106d53993f498b295f0c914328b1333", + "sha256:da4cfb373035def307905d05041c1d06d8936452fe89d464743ae7fb8371078b", + "sha256:da802a19d6e15dffe4b0c24b38b3af68e6c1a68e6e1d8f30148c83864f3881db", + "sha256:dced8146011d2bc2e883f9bd68618b8247387f4bbec46d7392b3c3b032640126", + "sha256:dfdd7c0b105af050eb3d64997809dc21da247cf44e63dc73ff0fd20b96be55a9", + 
"sha256:e368f200bbc2e4f905b8e71eb38b3c04333bddaa6a2464a6355487b02bb7fb09", + "sha256:e391b1f0a8a5a10ab3b9bb6afcfd74f2175f24f8975fb87ecae700d1503cdee0", + "sha256:e57e563a57fb22a142da34f38acc2fc1a5c864bc29ca1517a88abc963e60d6ec", + "sha256:e5d706eba36b4c4d5bc6c6377bb6568098765e990cfc21ee16d13963fab7b3e7", + "sha256:ec20916e7b4cbfb1f12380e46486ec4bcbaa91a9c448b97023fde0d5bbf9e4ff", + "sha256:f1d072c2eb0ad60d4c183f3fb44ac6f73fb7a8f16a2694a91f988275cbf352f9", + "sha256:f846c260f483d1fd217fe5ed7c173fb109efa6b1fc8381c8b7552c5781756192", + "sha256:f91de7223d4c7b793867797bacd1ee53bfe7359bd70d27b7b58a04efbb9436c8", + "sha256:faae4860798c31530dd184046a900e652c95513796ef51a12bc086710c2eec4d", + "sha256:fc579bf0f502e54926519451b920e875f433aceb4624a3646b3252b5caa9e0b6", + "sha256:fcc700eadbbccbf6bc1bcb9dbe0786b4b1cb91ca0dcda336eef5c2beed37b797", + "sha256:fd32ea360bcbb92d28933fc05ed09bffcb1704ba3fc7942e81db0fd4f81a7892", + "sha256:fdb7adb641a0d13bdcd4ef48e062363d8a9ad4a182ac7647ec88f695e719ae9f" + ], + "markers": "python_version >= '3.7'", + "version": "==1.4.5" + }, + "lazy-loader": { + "hashes": [ + "sha256:1e9e76ee8631e264c62ce10006718e80b2cfc74340d17d1031e0f84af7478554", + "sha256:3b68898e34f5b2a29daaaac172c6555512d0f32074f147e2254e4a6d9d838f37" + ], + "markers": "python_version >= '3.7'", + "version": "==0.3" + }, + "lightning": { + "hashes": [ + "sha256:3b2599a8a719916cb03526e6570356809729680c6cda09391232e2aba0a4ed4b", + "sha256:f23358dedd8f5f1151475c9d95f33e4529591c992a99cb9ae89c84bca7289525" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==2.1.2" + }, + "lightning-utilities": { + "hashes": [ + "sha256:84d09b11fe9bc16c803ae5e412874748239d73ad2f3d1b90862f99ce15a03aa0", + "sha256:9e31617eccbbadc6b737a2432fd7076ff8e24957f9c63aeba2530b189e19319c" + ], + "markers": "python_version >= '3.7'", + "version": "==0.10.0" + }, + "markupsafe": { + "hashes": [ + "sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e", + "sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e", + "sha256:10bbfe99883db80bdbaff2dcf681dfc6533a614f700da1287707e8a5d78a8431", + "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686", + "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c", + "sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559", + "sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc", + "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb", + "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939", + "sha256:282c2cb35b5b673bbcadb33a585408104df04f14b2d9b01d4c345a3b92861c2c", + "sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0", + "sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4", + "sha256:338ae27d6b8745585f87218a3f23f1512dbf52c26c28e322dbe54bcede54ccb9", + "sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575", + "sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba", + "sha256:42de32b22b6b804f42c5d98be4f7e5e977ecdd9ee9b660fda1a3edf03b11792d", + "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd", + "sha256:504b320cd4b7eff6f968eddf81127112db685e81f7e36e75f9f84f0df46041c3", + "sha256:525808b8019e36eb524b8c68acdd63a37e75714eac50e988180b169d64480a00", + "sha256:56d9f2ecac662ca1611d183feb03a3fa4406469dafe241673d521dd5ae92a155", + "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac", + 
"sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52", + "sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f", + "sha256:69c0f17e9f5a7afdf2cc9fb2d1ce6aabdb3bafb7f38017c0b77862bcec2bbad8", + "sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b", + "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007", + "sha256:787003c0ddb00500e49a10f2844fac87aa6ce977b90b0feaaf9de23c22508b24", + "sha256:7ef3cb2ebbf91e330e3bb937efada0edd9003683db6b57bb108c4001f37a02ea", + "sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198", + "sha256:8758846a7e80910096950b67071243da3e5a20ed2546e6392603c096778d48e0", + "sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee", + "sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be", + "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2", + "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1", + "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707", + "sha256:962f82a3086483f5e5f64dbad880d31038b698494799b097bc59c2edf392fce6", + "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c", + "sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58", + "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823", + "sha256:aa7bd130efab1c280bed0f45501b7c8795f9fdbeb02e965371bbef3523627779", + "sha256:ab4a0df41e7c16a1392727727e7998a467472d0ad65f3ad5e6e765015df08636", + "sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c", + "sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad", + "sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee", + "sha256:b7ff0f54cb4ff66dd38bebd335a38e2c22c41a8ee45aa608efc890ac3e3931bc", + "sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2", + "sha256:c011a4149cfbcf9f03994ec2edffcb8b1dc2d2aede7ca243746df97a5d41ce48", + "sha256:c9c804664ebe8f83a211cace637506669e7890fec1b4195b505c214e50dd4eb7", + "sha256:ca379055a47383d02a5400cb0d110cef0a776fc644cda797db0c5696cfd7e18e", + "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b", + "sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa", + "sha256:ceb01949af7121f9fc39f7d27f91be8546f3fb112c608bc4029aef0bab86a2a5", + "sha256:d080e0a5eb2529460b30190fcfcc4199bd7f827663f858a226a81bc27beaa97e", + "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb", + "sha256:df0be2b576a7abbf737b1575f048c23fb1d769f267ec4358296f31c2479db8f9", + "sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57", + "sha256:e4dd52d80b8c83fdce44e12478ad2e85c64ea965e75d66dbeafb0a3e77308fcc", + "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc", + "sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2", + "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11" + ], + "markers": "python_version >= '3.7'", + "version": "==2.1.3" + }, + "matplotlib": { + "hashes": [ + "sha256:01a978b871b881ee76017152f1f1a0cbf6bd5f7b8ff8c96df0df1bd57d8755a1", + "sha256:03f9d160a29e0b65c0790bb07f4f45d6a181b1ac33eb1bb0dd225986450148f0", + "sha256:091275d18d942cf1ee9609c830a1bc36610607d8223b1b981c37d5c9fc3e46a4", + "sha256:09796f89fb71a0c0e1e2f4bdaf63fb2cefc84446bb963ecdeb40dfee7dfa98c7", + "sha256:0f4fc5d72b75e2c18e55eb32292659cf731d9d5b312a6eb036506304f4675630", + 
"sha256:172f4d0fbac3383d39164c6caafd3255ce6fa58f08fc392513a0b1d3b89c4f89", + "sha256:1b0f3b8ea0e99e233a4bcc44590f01604840d833c280ebb8fe5554fd3e6cfe8d", + "sha256:3773002da767f0a9323ba1a9b9b5d00d6257dbd2a93107233167cfb581f64717", + "sha256:46a569130ff53798ea5f50afce7406e91fdc471ca1e0e26ba976a8c734c9427a", + "sha256:4c318c1e95e2f5926fba326f68177dee364aa791d6df022ceb91b8221bd0a627", + "sha256:4e208f46cf6576a7624195aa047cb344a7f802e113bb1a06cfd4bee431de5e31", + "sha256:533b0e3b0c6768eef8cbe4b583731ce25a91ab54a22f830db2b031e83cca9213", + "sha256:5864bdd7da445e4e5e011b199bb67168cdad10b501750367c496420f2ad00843", + "sha256:5ba9cbd8ac6cf422f3102622b20f8552d601bf8837e49a3afed188d560152788", + "sha256:6f9c6976748a25e8b9be51ea028df49b8e561eed7809146da7a47dbecebab367", + "sha256:7c48d9e221b637c017232e3760ed30b4e8d5dfd081daf327e829bf2a72c731b4", + "sha256:830f00640c965c5b7f6bc32f0d4ce0c36dfe0379f7dd65b07a00c801713ec40a", + "sha256:9a5430836811b7652991939012f43d2808a2db9b64ee240387e8c43e2e5578c8", + "sha256:aa11b3c6928a1e496c1a79917d51d4cd5d04f8a2e75f21df4949eeefdf697f4b", + "sha256:b78e4f2cedf303869b782071b55fdde5987fda3038e9d09e58c91cc261b5ad18", + "sha256:b9576723858a78751d5aacd2497b8aef29ffea6d1c95981505877f7ac28215c6", + "sha256:bddfb1db89bfaa855912261c805bd0e10218923cc262b9159a49c29a7a1c1afa", + "sha256:c7d36c2209d9136cd8e02fab1c0ddc185ce79bc914c45054a9f514e44c787917", + "sha256:d1095fecf99eeb7384dabad4bf44b965f929a5f6079654b681193edf7169ec20", + "sha256:d7b1704a530395aaf73912be741c04d181f82ca78084fbd80bc737be04848331", + "sha256:d86593ccf546223eb75a39b44c32788e6f6440d13cfc4750c1c15d0fcb850b63", + "sha256:deaed9ad4da0b1aea77fe0aa0cebb9ef611c70b3177be936a95e5d01fa05094f", + "sha256:ef8345b48e95cee45ff25192ed1f4857273117917a4dcd48e3905619bcd9c9b8" + ], + "index": "pypi", + "markers": "python_version >= '3.9'", + "version": "==3.8.2" + }, + "mpmath": { + "hashes": [ + "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", + "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c" + ], + "version": "==1.3.0" + }, + "multidict": { + "hashes": [ + "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9", + "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8", + "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03", + "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710", + "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161", + "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664", + "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569", + "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067", + "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313", + "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706", + "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2", + "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636", + "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49", + "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93", + "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603", + "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0", + "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60", + "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4", + 
"sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e", + "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1", + "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60", + "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951", + "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc", + "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe", + "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95", + "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d", + "sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8", + "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed", + "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2", + "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775", + "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87", + "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c", + "sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2", + "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98", + "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3", + "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe", + "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78", + "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660", + "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176", + "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e", + "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988", + "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c", + "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c", + "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0", + "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449", + "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f", + "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde", + "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5", + "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d", + "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac", + "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a", + "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9", + "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca", + "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11", + "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35", + "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063", + "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b", + "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982", + "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258", + "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1", + "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52", + "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480", + "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7", + "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461", + 
"sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d", + "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc", + "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779", + "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a", + "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547", + "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0", + "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171", + "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf", + "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d", + "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba" + ], + "markers": "python_version >= '3.7'", + "version": "==6.0.4" + }, + "networkx": { + "hashes": [ + "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6", + "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2" + ], + "markers": "python_version >= '3.9'", + "version": "==3.2.1" + }, + "numpy": { + "hashes": [ + "sha256:06fa1ed84aa60ea6ef9f91ba57b5ed963c3729534e6e54055fc151fad0423f0a", + "sha256:174a8880739c16c925799c018f3f55b8130c1f7c8e75ab0a6fa9d41cab092fd6", + "sha256:1a13860fdcd95de7cf58bd6f8bc5a5ef81c0b0625eb2c9a783948847abbef2c2", + "sha256:1cc3d5029a30fb5f06704ad6b23b35e11309491c999838c31f124fee32107c79", + "sha256:22f8fc02fdbc829e7a8c578dd8d2e15a9074b630d4da29cda483337e300e3ee9", + "sha256:26c9d33f8e8b846d5a65dd068c14e04018d05533b348d9eaeef6c1bd787f9919", + "sha256:2b3fca8a5b00184828d12b073af4d0fc5fdd94b1632c2477526f6bd7842d700d", + "sha256:2beef57fb031dcc0dc8fa4fe297a742027b954949cabb52a2a376c144e5e6060", + "sha256:36340109af8da8805d8851ef1d74761b3b88e81a9bd80b290bbfed61bd2b4f75", + "sha256:3703fc9258a4a122d17043e57b35e5ef1c5a5837c3db8be396c82e04c1cf9b0f", + "sha256:3ced40d4e9e18242f70dd02d739e44698df3dcb010d31f495ff00a31ef6014fe", + "sha256:4a06263321dfd3598cacb252f51e521a8cb4b6df471bb12a7ee5cbab20ea9167", + "sha256:4eb8df4bf8d3d90d091e0146f6c28492b0be84da3e409ebef54349f71ed271ef", + "sha256:5d5244aabd6ed7f312268b9247be47343a654ebea52a60f002dc70c769048e75", + "sha256:64308ebc366a8ed63fd0bf426b6a9468060962f1a4339ab1074c228fa6ade8e3", + "sha256:6a3cdb4d9c70e6b8c0814239ead47da00934666f668426fc6e94cce869e13fd7", + "sha256:854ab91a2906ef29dc3925a064fcd365c7b4da743f84b123002f6139bcb3f8a7", + "sha256:94cc3c222bb9fb5a12e334d0479b97bb2df446fbe622b470928f5284ffca3f8d", + "sha256:96ca5482c3dbdd051bcd1fce8034603d6ebfc125a7bd59f55b40d8f5d246832b", + "sha256:a2bbc29fcb1771cd7b7425f98b05307776a6baf43035d3b80c4b0f29e9545186", + "sha256:a4cd6ed4a339c21f1d1b0fdf13426cb3b284555c27ac2f156dfdaaa7e16bfab0", + "sha256:aa18428111fb9a591d7a9cc1b48150097ba6a7e8299fb56bdf574df650e7d1f1", + "sha256:aa317b2325f7aa0a9471663e6093c210cb2ae9c0ad824732b307d2c51983d5b6", + "sha256:b04f5dc6b3efdaab541f7857351aac359e6ae3c126e2edb376929bd3b7f92d7e", + "sha256:b272d4cecc32c9e19911891446b72e986157e6a1809b7b56518b4f3755267523", + "sha256:b361d369fc7e5e1714cf827b731ca32bff8d411212fccd29ad98ad622449cc36", + "sha256:b96e7b9c624ef3ae2ae0e04fa9b460f6b9f17ad8b4bec6d7756510f1f6c0c841", + "sha256:baf8aab04a2c0e859da118f0b38617e5ee65d75b83795055fb66c0d5e9e9b818", + "sha256:bcc008217145b3d77abd3e4d5ef586e3bdfba8fe17940769f8aa09b99e856c00", + "sha256:bd3f0091e845164a20bd5a326860c840fe2af79fa12e0469a12768a3ec578d80", + "sha256:cc392fdcbd21d4be6ae1bb4475a03ce3b025cd49a9be5345d76d7585aea69440", + 
"sha256:d73a3abcac238250091b11caef9ad12413dab01669511779bc9b29261dd50210", + "sha256:f43740ab089277d403aa07567be138fc2a89d4d9892d113b76153e0e412409f8", + "sha256:f65738447676ab5777f11e6bbbdb8ce11b785e105f690bc45966574816b6d3ea", + "sha256:f79b231bf5c16b1f39c7f4875e1ded36abee1591e98742b05d8a0fb55d8a3eec", + "sha256:fe6b44fb8fcdf7eda4ef4461b97b3f63c466b27ab151bec2366db8b197387841" + ], + "index": "pypi", + "markers": "python_version >= '3.9'", + "version": "==1.26.2" + }, + "opencv-python": { + "hashes": [ + "sha256:91d5f6f5209dc2635d496f6b8ca6573ecdad051a09e6b5de4c399b8e673c60da", + "sha256:9814beca408d3a0eca1bae7e3e5be68b07c17ecceb392b94170881216e09b319", + "sha256:a7aac3900fbacf55b551e7b53626c3dad4c71ce85643645c43e91fcb19045e47", + "sha256:b983197f97cfa6fcb74e1da1802c7497a6f94ed561aba6980f1f33123f904956", + "sha256:bc31f47e05447da8b3089faa0a07ffe80e114c91ce0b171e6424f9badbd1c5cd", + "sha256:c4c406bdb41eb21ea51b4e90dfbc989c002786c3f601c236a99c59a54670a394", + "sha256:cc7adbbcd1112877a39274106cb2752e04984bc01a031162952e97450d6117f6" + ], + "markers": "python_version >= '3.6'", + "version": "==4.8.1.78" + }, + "opencv-python-headless": { + "hashes": [ + "sha256:0a0f1e9f836f7d5bad1dd164694944c8761711cbdf4b36ebbd4815a8ef731079", + "sha256:2c7d45721df9801c4dcd34683a15caa0e30f38b185263fec04a6eb274bc720f0", + "sha256:382f8c7a6a14f80091284eecedd52cee4812231ee0eff1118592197b538d9252", + "sha256:3b6bd6e1132b6f5dcb3a5bfe30fc4d341a7bfb26134da349a06c9255288ded94", + "sha256:58e70d2f0915fe23e02c6e405588276c9397844a47d38b9c87fac5f7f9ba2dcc", + "sha256:bc7197b42352f6f865c302a49140b889ec7cd957dd697e2d7fc016ad0d3f28f1", + "sha256:f3a33f644249f9ce1c913eac580e4b3ef4ce7cab0a71900274708959c2feb5e3" + ], + "markers": "python_version >= '3.6'", + "version": "==4.8.1.78" + }, + "packaging": { + "hashes": [ + "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5", + "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7" + ], + "markers": "python_version >= '3.7'", + "version": "==23.2" + }, + "pandas": { + "hashes": [ + "sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813", + "sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792", + "sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406", + "sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373", + "sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328", + "sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996", + "sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf", + "sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6", + "sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7", + "sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc", + "sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1", + "sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23", + "sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a", + "sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51", + "sha256:972d8a45395f2a2d26733eb8d0f629b2f90bebe8e8eddbb8829b180c09639572", + "sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31", + "sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5", + "sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a", + "sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003", + 
"sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d", + "sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354", + "sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee", + "sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa", + "sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0", + "sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9", + "sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae", + "sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==1.5.3" + }, + "pathtools": { + "hashes": [ + "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0" + ], + "version": "==0.1.2" + }, + "pillow": { + "hashes": [ + "sha256:00f438bb841382b15d7deb9a05cc946ee0f2c352653c7aa659e75e592f6fa17d", + "sha256:0248f86b3ea061e67817c47ecbe82c23f9dd5d5226200eb9090b3873d3ca32de", + "sha256:04f6f6149f266a100374ca3cc368b67fb27c4af9f1cc8cb6306d849dcdf12616", + "sha256:062a1610e3bc258bff2328ec43f34244fcec972ee0717200cb1425214fe5b839", + "sha256:0a026c188be3b443916179f5d04548092e253beb0c3e2ee0a4e2cdad72f66099", + "sha256:0f7c276c05a9767e877a0b4c5050c8bee6a6d960d7f0c11ebda6b99746068c2a", + "sha256:1a8413794b4ad9719346cd9306118450b7b00d9a15846451549314a58ac42219", + "sha256:1ab05f3db77e98f93964697c8efc49c7954b08dd61cff526b7f2531a22410106", + "sha256:1c3ac5423c8c1da5928aa12c6e258921956757d976405e9467c5f39d1d577a4b", + "sha256:1c41d960babf951e01a49c9746f92c5a7e0d939d1652d7ba30f6b3090f27e412", + "sha256:1fafabe50a6977ac70dfe829b2d5735fd54e190ab55259ec8aea4aaea412fa0b", + "sha256:1fb29c07478e6c06a46b867e43b0bcdb241b44cc52be9bc25ce5944eed4648e7", + "sha256:24fadc71218ad2b8ffe437b54876c9382b4a29e030a05a9879f615091f42ffc2", + "sha256:2cdc65a46e74514ce742c2013cd4a2d12e8553e3a2563c64879f7c7e4d28bce7", + "sha256:2ef6721c97894a7aa77723740a09547197533146fba8355e86d6d9a4a1056b14", + "sha256:3b834f4b16173e5b92ab6566f0473bfb09f939ba14b23b8da1f54fa63e4b623f", + "sha256:3d929a19f5469b3f4df33a3df2983db070ebb2088a1e145e18facbc28cae5b27", + "sha256:41f67248d92a5e0a2076d3517d8d4b1e41a97e2df10eb8f93106c89107f38b57", + "sha256:47e5bf85b80abc03be7455c95b6d6e4896a62f6541c1f2ce77a7d2bb832af262", + "sha256:4d0152565c6aa6ebbfb1e5d8624140a440f2b99bf7afaafbdbf6430426497f28", + "sha256:50d08cd0a2ecd2a8657bd3d82c71efd5a58edb04d9308185d66c3a5a5bed9610", + "sha256:61f1a9d247317fa08a308daaa8ee7b3f760ab1809ca2da14ecc88ae4257d6172", + "sha256:6932a7652464746fcb484f7fc3618e6503d2066d853f68a4bd97193a3996e273", + "sha256:7a7e3daa202beb61821c06d2517428e8e7c1aab08943e92ec9e5755c2fc9ba5e", + "sha256:7dbaa3c7de82ef37e7708521be41db5565004258ca76945ad74a8e998c30af8d", + "sha256:7df5608bc38bd37ef585ae9c38c9cd46d7c81498f086915b0f97255ea60c2818", + "sha256:806abdd8249ba3953c33742506fe414880bad78ac25cc9a9b1c6ae97bedd573f", + "sha256:883f216eac8712b83a63f41b76ddfb7b2afab1b74abbb413c5df6680f071a6b9", + "sha256:912e3812a1dbbc834da2b32299b124b5ddcb664ed354916fd1ed6f193f0e2d01", + "sha256:937bdc5a7f5343d1c97dc98149a0be7eb9704e937fe3dc7140e229ae4fc572a7", + "sha256:9882a7451c680c12f232a422730f986a1fcd808da0fd428f08b671237237d651", + "sha256:9a92109192b360634a4489c0c756364c0c3a2992906752165ecb50544c251312", + "sha256:9d7bc666bd8c5a4225e7ac71f2f9d12466ec555e89092728ea0f5c0c2422ea80", + "sha256:a5f63b5a68daedc54c7c3464508d8c12075e56dcfbd42f8c1bf40169061ae666", + 
"sha256:a646e48de237d860c36e0db37ecaecaa3619e6f3e9d5319e527ccbc8151df061", + "sha256:a89b8312d51715b510a4fe9fc13686283f376cfd5abca8cd1c65e4c76e21081b", + "sha256:a92386125e9ee90381c3369f57a2a50fa9e6aa8b1cf1d9c4b200d41a7dd8e992", + "sha256:ae88931f93214777c7a3aa0a8f92a683f83ecde27f65a45f95f22d289a69e593", + "sha256:afc8eef765d948543a4775f00b7b8c079b3321d6b675dde0d02afa2ee23000b4", + "sha256:b0eb01ca85b2361b09480784a7931fc648ed8b7836f01fb9241141b968feb1db", + "sha256:b1c25762197144e211efb5f4e8ad656f36c8d214d390585d1d21281f46d556ba", + "sha256:b4005fee46ed9be0b8fb42be0c20e79411533d1fd58edabebc0dd24626882cfd", + "sha256:b920e4d028f6442bea9a75b7491c063f0b9a3972520731ed26c83e254302eb1e", + "sha256:baada14941c83079bf84c037e2d8b7506ce201e92e3d2fa0d1303507a8538212", + "sha256:bb40c011447712d2e19cc261c82655f75f32cb724788df315ed992a4d65696bb", + "sha256:c0949b55eb607898e28eaccb525ab104b2d86542a85c74baf3a6dc24002edec2", + "sha256:c9aeea7b63edb7884b031a35305629a7593272b54f429a9869a4f63a1bf04c34", + "sha256:cfe96560c6ce2f4c07d6647af2d0f3c54cc33289894ebd88cfbb3bcd5391e256", + "sha256:d27b5997bdd2eb9fb199982bb7eb6164db0426904020dc38c10203187ae2ff2f", + "sha256:d921bc90b1defa55c9917ca6b6b71430e4286fc9e44c55ead78ca1a9f9eba5f2", + "sha256:e6bf8de6c36ed96c86ea3b6e1d5273c53f46ef518a062464cd7ef5dd2cf92e38", + "sha256:eaed6977fa73408b7b8a24e8b14e59e1668cfc0f4c40193ea7ced8e210adf996", + "sha256:fa1d323703cfdac2036af05191b969b910d8f115cf53093125e4058f62012c9a", + "sha256:fe1e26e1ffc38be097f0ba1d0d07fcade2bcfd1d023cda5b29935ae8052bd793" + ], + "markers": "python_version >= '3.8'", + "version": "==10.1.0" + }, + "polars": { + "hashes": [ + "sha256:21a334e18c83a259211ca6ec182498f3a89297fde9b8f75021c6881ff4411201", + "sha256:3e904d197aabf36e37fda263470eaf51ec92fb865cdea4f93947713480199303", + "sha256:6c9e597efac74f00ef9cdfd9ba8a9128ed24276916bd3d60adc6e604530e4b37", + "sha256:812dbb0cc7027fd41ce6b3eaf100b94828fc082fab026409d48792e3e7014095", + "sha256:d383fac392b08a6d5830c99f6e735a48e390c1535c8f1e67707fcaab6863ade5", + "sha256:eec0e72dce84b85c427bbf395d2e181f33e60677695b95ee2e87fed51043bdea" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==0.19.19" + }, + "promise": { + "hashes": [ + "sha256:dfd18337c523ba4b6a58801c164c1904a9d4d1b1747c7d5dbf45b693a49d93d0" + ], + "version": "==2.3" + }, + "protobuf": { + "hashes": [ + "sha256:03038ac1cfbc41aa21f6afcbcd357281d7521b4157926f30ebecc8d4ea59dcb7", + "sha256:28545383d61f55b57cf4df63eebd9827754fd2dc25f80c5253f9184235db242c", + "sha256:2e3427429c9cffebf259491be0af70189607f365c2f41c7c3764af6f337105f2", + "sha256:398a9e0c3eaceb34ec1aee71894ca3299605fa8e761544934378bbc6c97de23b", + "sha256:44246bab5dd4b7fbd3c0c80b6f16686808fab0e4aca819ade6e8d294a29c7050", + "sha256:447d43819997825d4e71bf5769d869b968ce96848b6479397e29fc24c4a5dfe9", + "sha256:67a3598f0a2dcbc58d02dd1928544e7d88f764b47d4a286202913f0b2801c2e7", + "sha256:74480f79a023f90dc6e18febbf7b8bac7508420f2006fabd512013c0c238f454", + "sha256:819559cafa1a373b7096a482b504ae8a857c89593cf3a25af743ac9ecbd23480", + "sha256:899dc660cd599d7352d6f10d83c95df430a38b410c1b66b407a6b29265d66469", + "sha256:8c0c984a1b8fef4086329ff8dd19ac77576b384079247c770f29cc8ce3afa06c", + "sha256:9aae4406ea63d825636cc11ffb34ad3379335803216ee3a856787bcf5ccc751e", + "sha256:a7ca6d488aa8ff7f329d4c545b2dbad8ac31464f1d8b1c87ad1346717731e4db", + "sha256:b6cc7ba72a8850621bfec987cb72623e703b7fe2b9127a161ce61e61558ad905", + "sha256:bf01b5720be110540be4286e791db73f84a2b721072a3711efff6c324cdf074b", + 
"sha256:c02ce36ec760252242a33967d51c289fd0e1c0e6e5cc9397e2279177716add86", + "sha256:d9e4432ff660d67d775c66ac42a67cf2453c27cb4d738fc22cb53b5d84c135d4", + "sha256:daa564862dd0d39c00f8086f88700fdbe8bc717e993a21e90711acfed02f2402", + "sha256:de78575669dddf6099a8a0f46a27e82a1783c557ccc38ee620ed8cc96d3be7d7", + "sha256:e64857f395505ebf3d2569935506ae0dfc4a15cb80dc25261176c784662cdcc4", + "sha256:f4bd856d702e5b0d96a00ec6b307b0f51c1982c2bf9c0052cf9019e9a544ba99", + "sha256:f4c42102bc82a51108e449cbb32b19b180022941c727bac0cfd50170341f16ee" + ], + "markers": "python_version >= '3.7'", + "version": "==3.20.3" + }, + "psutil": { + "hashes": [ + "sha256:10e8c17b4f898d64b121149afb136c53ea8b68c7531155147867b7b1ac9e7e28", + "sha256:18cd22c5db486f33998f37e2bb054cc62fd06646995285e02a51b1e08da97017", + "sha256:3ebf2158c16cc69db777e3c7decb3c0f43a7af94a60d72e87b2823aebac3d602", + "sha256:51dc3d54607c73148f63732c727856f5febec1c7c336f8f41fcbd6315cce76ac", + "sha256:6e5fb8dc711a514da83098bc5234264e551ad980cec5f85dabf4d38ed6f15e9a", + "sha256:70cb3beb98bc3fd5ac9ac617a327af7e7f826373ee64c80efd4eb2856e5051e9", + "sha256:748c9dd2583ed86347ed65d0035f45fa8c851e8d90354c122ab72319b5f366f4", + "sha256:91ecd2d9c00db9817a4b4192107cf6954addb5d9d67a969a4f436dbc9200f88c", + "sha256:92e0cc43c524834af53e9d3369245e6cc3b130e78e26100d1f63cdb0abeb3d3c", + "sha256:a6f01f03bf1843280f4ad16f4bde26b817847b4c1a0db59bf6419807bc5ce05c", + "sha256:c69596f9fc2f8acd574a12d5f8b7b1ba3765a641ea5d60fb4736bf3c08a8214a", + "sha256:ca2780f5e038379e520281e4c032dddd086906ddff9ef0d1b9dcf00710e5071c", + "sha256:daecbcbd29b289aac14ece28eca6a3e60aa361754cf6da3dfb20d4d32b6c7f57", + "sha256:e4b92ddcd7dd4cdd3f900180ea1e104932c7bce234fb88976e2a3b296441225a", + "sha256:fb8a697f11b0f5994550555fcfe3e69799e5b060c8ecf9e2f75c69302cc35c0d", + "sha256:ff18b8d1a784b810df0b0fff3bcb50ab941c3b8e2c8de5726f9c71c601c611aa" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==5.9.6" + }, + "pyarrow": { + "hashes": [ + "sha256:0ec7587d759153f452d5263dbc8b1af318c4609b607be2bd5127dcda6708cdb1", + "sha256:1765a18205eb1e02ccdedb66049b0ec148c2a0cb52ed1fb3aac322dfc086a6ee", + "sha256:1a14f57a5f472ce8234f2964cd5184cccaa8df7e04568c64edc33b23eb285dd5", + "sha256:254017ca43c45c5098b7f2a00e995e1f8346b0fb0be225f042838323bb55283c", + "sha256:42ba7c5347ce665338f2bc64685d74855900200dac81a972d49fe127e8132f75", + "sha256:443eb9409b0cf78df10ced326490e1a300205a458fbeb0767b6b31ab3ebae6b2", + "sha256:61f4c37d82fe00d855d0ab522c685262bdeafd3fbcb5fe596fe15025fbc7341b", + "sha256:668e00e3b19f183394388a687d29c443eb000fb3fe25599c9b4762a0afd37775", + "sha256:6f7a7dbe2f7f65ac1d0bd3163f756deb478a9e9afc2269557ed75b1b25ab3610", + "sha256:70acca1ece4322705652f48db65145b5028f2c01c7e426c5d16a30ba5d739c24", + "sha256:7b4ede715c004b6fc535de63ef79fa29740b4080639a5ff1ea9ca84e9282f349", + "sha256:94fb4a0c12a2ac1ed8e7e2aa52aade833772cf2d3de9dde685401b22cec30002", + "sha256:abb57334f2c57979a49b7be2792c31c23430ca02d24becd0b511cbe7b6b08649", + "sha256:b069602eb1fc09f1adec0a7bdd7897f4d25575611dfa43543c8b8a75d99d6874", + "sha256:b1fc226d28c7783b52a84d03a66573d5a22e63f8a24b841d5fc68caeed6784d4", + "sha256:ba71e6fc348c92477586424566110d332f60d9a35cb85278f42e3473bc1373da", + "sha256:bf26f809926a9d74e02d76593026f0aaeac48a65b64f1bb17eed9964bfe7ae1a", + "sha256:cb627673cb98708ef00864e2e243f51ba7b4c1b9f07a1d821f98043eccd3f585", + "sha256:d1bc6e4d5d6f69e0861d5d7f6cf4d061cf1069cb9d490040129877acf16d4c2a", + 
"sha256:db0c5986bf0808927f49640582d2032a07aa49828f14e51f362075f03747d198", + "sha256:e00174764a8b4e9d8d5909b6d19ee0c217a6cf0232c5682e31fdfbd5a9f0ae52", + "sha256:e141a65705ac98fa52a9113fe574fdaf87fe0316cde2dffe6b94841d3c61544c", + "sha256:e3fe5049d2e9ca661d8e43fab6ad5a4c571af12d20a57dffc392a014caebef65", + "sha256:efa59933b20183c1c13efc34bd91efc6b2997377c4c6ad9272da92d224e3beb1", + "sha256:f2d00aa481becf57098e85d99e34a25dba5a9ade2f44eb0b7d80c80f2984fc03" + ], + "markers": "python_version >= '3.7'", + "version": "==10.0.1" + }, + "pyparsing": { + "hashes": [ + "sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb", + "sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db" + ], + "markers": "python_full_version >= '3.6.8'", + "version": "==3.1.1" + }, + "python-dateutil": { + "hashes": [ + "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", + "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.8.2" + }, + "pytorch-lightning": { + "hashes": [ + "sha256:58e26406215f3bfafe1b9b5b4bf37c9b8cff16494f7fd60e4db1ae37461b2b28", + "sha256:8cf4a45f74e11f8ef3ad46bebb4a621eca7a990826226390767f948e86fff542" + ], + "markers": "python_version >= '3.8'", + "version": "==2.1.2" + }, + "pytz": { + "hashes": [ + "sha256:7b4fddbeb94a1eba4b557da24f19fdf9db575192544270a9101d8509f9f43d7b", + "sha256:ce42d816b81b68506614c11e8937d3aa9e41007ceb50bfdcb0749b921bf646c7" + ], + "version": "==2023.3.post1" + }, + "pywavelets": { + "hashes": [ + "sha256:00d5c37775a2baa4e5e6e9df3f93e6fc700a76bd50acd3b234bb13467cc54b6b", + "sha256:05723b35191ceb7d0c0bc2898a9ff391c0f20e8ed9b75d30211464872efcac95", + "sha256:2cae4a0151e443e915905c120435e69ad410b484ce8af4839220e43a494c7c53", + "sha256:322995ea0a57c96086782f0391934f9f00123087a62ad7bef0e778491f121931", + "sha256:34d189aed544687500a2fba5b8970951a76f62f1d140cc5f9440d9b32b14b8f5", + "sha256:49aa6abf9ac941f47f7ea26a3c7dd5c8bfcf0e903dc5ec68ed105b52bfccd4e2", + "sha256:4aca65696341aa64b98bf852d6768dbb345516710a2912419d68e9d484ddd6cd", + "sha256:4d9763987b4a79917f007c1d5df0adc81adabbad3c7c0a368f4a7f12034816f3", + "sha256:51c8e9e081af40f61d194960db0f3dc0434bbd979dafcbbd6463134b3f482f37", + "sha256:526e874ba79ee3779245737a3b8540defc7e92f6cec8f13258719cc1669f8b42", + "sha256:67b65da9ef6380a48b8b53de6d8a4f83747b84b217a37944a4dcf3a53cdf308d", + "sha256:7115439f0dff291b8f81b69caff1a240695566f17c483752a49de9576c7332a4", + "sha256:7a8b58eaf946fbee002cce460d32a0e932c6d9e158aad10eea984e7f26cda15e", + "sha256:7da6c2acd7253e5d45f371bcd6c0f34d70b2f82694420afb0631130bc89e3288", + "sha256:91847ac1b658cf985a7f91ff638ba1d4a9a0544c5480ecbf8db427baf455725a", + "sha256:9c3b10f1e1b08df4d918fa238ef5e5c51c111c4f6abdfecb19c26c540cbd8187", + "sha256:aa54e6c6f2d6953f5f962eb1d1de7f9fbc5bdf06141f58c05d0d87072a05b8be", + "sha256:c857081c037552f174732d864b55d8db4845f5e2fdf0e7bfc2df675a417906f4", + "sha256:ca2e1faaea7f7ff42c771e180635e2fb165cf23c9805c4fe05f9458bcb97d093", + "sha256:d7dc392c3d3d5415b25b5c6ab3b77bb2ac2b7ff6c4d2fb81bd4633b9ac4b66f3", + "sha256:d9e25c7cabef7ccd53f5fead26ab22152fe4cb937bad7411b5d506e2b5de38f6", + "sha256:e045ee612de58e3175ae863c34072b6bf5b45b61264c1adbd75506ce31cedbb2", + "sha256:eb123f01315c0fa54e25780f3b0ce0b096bab35f6c11cacbcd4ac9915f26508a", + "sha256:f3eba7f581a723132beb213ce4b291a51306e3d2f79241a71063294a71cfa25d", + "sha256:f457d9faee286bd542c8f1921e38b8f5f54bc1949c0e349c8f1e9f8eb6d251a6" + ], + 
"markers": "python_version >= '3.9'", + "version": "==1.5.0" + }, + "pyyaml": { + "hashes": [ + "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5", + "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc", + "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df", + "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741", + "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206", + "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27", + "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595", + "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62", + "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98", + "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696", + "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290", + "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9", + "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d", + "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6", + "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867", + "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47", + "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486", + "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6", + "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3", + "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007", + "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938", + "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0", + "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c", + "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735", + "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d", + "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28", + "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4", + "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba", + "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8", + "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5", + "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd", + "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3", + "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0", + "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515", + "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c", + "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c", + "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924", + "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34", + "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43", + "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859", + "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673", + "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54", + "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a", + "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b", + 
"sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab", + "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa", + "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c", + "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585", + "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d", + "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f" + ], + "markers": "python_version >= '3.6'", + "version": "==6.0.1" + }, + "qudida": { + "hashes": [ + "sha256:4519714c40cd0f2e6c51e1735edae8f8b19f4efe1f33be13e9d644ca5f736dd6", + "sha256:db198e2887ab0c9aa0023e565afbff41dfb76b361f85fd5e13f780d75ba18cc8" + ], + "markers": "python_full_version >= '3.5.0'", + "version": "==0.0.4" + }, + "requests": { + "hashes": [ + "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f", + "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1" + ], + "index": "pypi", + "markers": "python_version >= '3.7'", + "version": "==2.31.0" + }, + "safetensors": { + "hashes": [ + "sha256:04157d008385bea66d12fe90844a80d4a76dc25ec5230b5bd9a630496d1b7c03", + "sha256:04dd14f53f5500eb4c4149674216ba1000670efbcf4b1b5c2643eb244e7882ea", + "sha256:097e9af2efa8778cd2f0cba451784253e62fa7cc9fc73c0744d27212f7294e25", + "sha256:0bd0afd95c1e497f520e680ea01e0397c0868a3a3030e128438cf6e9e3fcd671", + "sha256:0ddd050e01f3e843aa8c1c27bf68675b8a08e385d0045487af4d70418c3cb356", + "sha256:16d8bbb7344e39cb9d4762e85c21df94ebeb03edac923dd94bb9ed8c10eac070", + "sha256:1a45dbf03e8334d3a5dc93687d98b6dc422f5d04c7d519dac09b84a3c87dd7c6", + "sha256:1d568628e9c43ca15eb96c217da73737c9ccb07520fafd8a1eba3f2750614105", + "sha256:1faf5111c66a6ba91f85dff2e36edaaf36e6966172703159daeef330de4ddc7b", + "sha256:2297b359d91126c0f9d4fd17bae3cfa2fe3a048a6971b8db07db746ad92f850c", + "sha256:2304658e6ada81a5223225b4efe84748e760c46079bffedf7e321763cafb36c9", + "sha256:2536b11ce665834201072e9397404170f93f3be10cca9995b909f023a04501ee", + "sha256:257d59e40a1b367cb544122e7451243d65b33c3f34d822a347f4eea6fdf97fdf", + "sha256:25a043cbb59d4f75e9dd87fdf5c009dd8830105a2c57ace49b72167dd9808111", + "sha256:270b99885ec14abfd56c1d7f28ada81740a9220b4bae960c3de1c6fe84af9e4d", + "sha256:285b52a481e7ba93e29ad4ec5841ef2c4479ef0a6c633c4e2629e0508453577b", + "sha256:2b6a2814278b6660261aa9a9aae524616de9f1ec364e3716d219b6ed8f91801f", + "sha256:2d54c2f1826e790d1eb2d2512bfd0ee443f0206b423d6f27095057c7f18a0687", + "sha256:2d87d993eaefe6611a9c241a8bd364a5f1ffed5771c74840363a6c4ed8d868f6", + "sha256:2fe6926110e3d425c4b684a4379b7796fdc26ad7d16922ea1696c8e6ea7e920f", + "sha256:303d2c0415cf15a28f8d7f17379ea3c34c2b466119118a34edd9965983a1a8a6", + "sha256:313e8472197bde54e3ec54a62df184c414582979da8f3916981b6a7954910a1b", + "sha256:35803201d980efcf964b75a0a2aee97fe5e9ecc5f3ad676b38fafdfe98e0620d", + "sha256:39d36f1d88468a87c437a1bc27c502e71b6ca44c385a9117a9f9ba03a75cc9c6", + "sha256:3b0b7b2d5976fbed8a05e2bbdce5816a59e6902e9e7c7e07dc723637ed539787", + "sha256:3b30abd0cddfe959d1daedf92edcd1b445521ebf7ddefc20860ed01486b33c90", + "sha256:3c1b1d510c7aba71504ece87bf393ea82638df56303e371e5e2cf09d18977dd7", + "sha256:3cfd1ca35eacc635f0eaa894e5c5ed83ffebd0f95cac298fd430014fa7323631", + "sha256:3f6a520af7f2717c5ecba112041f2c8af1ca6480b97bf957aba81ed9642e654c", + "sha256:413e1f6ac248f7d1b755199a06635e70c3515493d3b41ba46063dec33aa2ebb7", + "sha256:4177b456c6b0c722d82429127b5beebdaf07149d265748e97e0a34ff0b3694c8", + 
"sha256:42c3710cec7e5c764c7999697516370bee39067de0aa089b7e2cfb97ac8c6b20", + "sha256:44e230fbbe120de564b64f63ef3a8e6ff02840fa02849d9c443d56252a1646d4", + "sha256:48901bd540f8a3c1791314bc5c8a170927bf7f6acddb75bf0a263d081a3637d4", + "sha256:53134226053e56bd56e73f7db42596e7908ed79f3c9a1016e4c1dade593ac8e5", + "sha256:573b6023a55a2f28085fc0a84e196c779b6cbef4d9e73acea14c8094fee7686f", + "sha256:5d95ea4d8b32233910734a904123bdd3979c137c461b905a5ed32511defc075f", + "sha256:5f25297148ec665f0deb8bd67e9564634d8d6841041ab5393ccfe203379ea88b", + "sha256:645b3f1138fce6e818e79d4128afa28f0657430764cc045419c1d069ff93f732", + "sha256:660ca1d8bff6c7bc7c6b30b9b32df74ef3ab668f5df42cefd7588f0d40feadcb", + "sha256:6ace9e66a40f98a216ad661245782483cf79cf56eb2b112650bb904b0baa9db5", + "sha256:6fd80f7794554091836d4d613d33a7d006e2b8d6ba014d06f97cebdfda744f64", + "sha256:780dc21eb3fd32ddd0e8c904bdb0290f2454f4ac21ae71e94f9ce72db1900a5a", + "sha256:791edc10a3c359a2f5f52d5cddab0df8a45107d91027d86c3d44e57162e5d934", + "sha256:7a8f6f679d97ea0135c7935c202feefbd042c149aa70ee759855e890c01c7814", + "sha256:7ef010e9afcb4057fb6be3d0a0cfa07aac04fe97ef73fe4a23138d8522ba7c17", + "sha256:7ff8a36e0396776d3ed9a106fc9a9d7c55d4439ca9a056a24bf66d343041d3e6", + "sha256:82571d20288c975c1b30b08deb9b1c3550f36b31191e1e81fae87669a92217d0", + "sha256:82cbb8f4d022f2e94498cbefca900698b8ded3d4f85212f47da614001ff06652", + "sha256:83c2cfbe8c6304f0891e7bb378d56f66d2148972eeb5f747cd8a2246886f0d8c", + "sha256:845be0aafabf2a60c2d482d4e93023fecffe5e5443d801d7a7741bae9de41233", + "sha256:88b4653059c903015284a9722f9a46838c654257173b279c8f6f46dbe80b612d", + "sha256:8b58ba13a9e82b4bc3fc221914f6ef237fe6c2adb13cede3ace64d1aacf49610", + "sha256:8f69903ff49cb30b9227fb5d029bea276ea20d04b06803877a420c5b1b74c689", + "sha256:8ff8e41c8037db17de0ea2a23bc684f43eaf623be7d34906fe1ac10985b8365e", + "sha256:911b48dc09e321a194def3a7431662ff4f03646832f3a8915bbf0f449b8a5fcb", + "sha256:998fbac99ca956c3a09fe07cc0b35fac26a521fa8865a690686d889f0ff4e4a6", + "sha256:9a82bc2bd7a9a0e08239bdd6d7774d64121f136add93dfa344a2f1a6d7ef35fa", + "sha256:9d16b3b2fcc6fca012c74bd01b5619c655194d3e3c13e4d4d0e446eefa39a463", + "sha256:a257de175c254d39ccd6a21341cd62eb7373b05c1e618a78096a56a857e0c316", + "sha256:a79e16222106b2f5edbca1b8185661477d8971b659a3c814cc6f15181a9b34c8", + "sha256:ae2d5a31cfb8a973a318f7c4d2cffe0bd1fe753cdf7bb41a1939d45a0a06f964", + "sha256:ae2f67f04ed0bb2e56fd380a8bd3eef03f609df53f88b6f5c7e89c08e52aae00", + "sha256:ae5497adc68669db2fed7cb2dad81e6a6106e79c9a132da3efdb6af1db1014fa", + "sha256:b287304f2b2220d51ccb51fd857761e78bcffbeabe7b0238f8dc36f2edfd9542", + "sha256:b2f8877990a72ff595507b80f4b69036a9a1986a641f8681adf3425d97d3d2a5", + "sha256:bb4cb3e37a9b961ddd68e873b29fe9ab4a081e3703412e34aedd2b7a8e9cafd9", + "sha256:bbc2ce1f5ae5143a7fb72b71fa71db6a42b4f6cf912aa3acdc6b914084778e68", + "sha256:bda3d98e2bcece388232cfc551ebf063b55bdb98f65ab54df397da30efc7dcc5", + "sha256:bdc0d039e44a727824639824090bd8869535f729878fa248addd3dc01db30eae", + "sha256:bfa2e20342b81921b98edba52f8deb68843fa9c95250739a56b52ceda5ea5c61", + "sha256:c3807ac3b16288dffebb3474b555b56fe466baa677dfc16290dcd02dca1ab228", + "sha256:c3c9f0ca510e0de95abd6424789dcbc879942a3a4e29b0dfa99d9427bf1da75c", + "sha256:c8ed5d2c04cdc1afc6b3c28d59580448ac07732c50d94c15e14670f9c473a2ce", + "sha256:cba01c6b76e01ec453933b3b3c0157c59b52881c83eaa0f7666244e71aa75fd1", + "sha256:ce7a28bc8af685a69d7e869d09d3e180a275e3281e29cf5f1c7319e231932cc7", + "sha256:d10a9f7bae608ccfdc009351f01dc3d8535ff57f9488a58a4c38e45bf954fe93", + 
"sha256:d3ac139377cfe71ba04573f1cda66e663b7c3e95be850e9e6c2dd4b5984bd513", + "sha256:d5b3defa74f3723a388bfde2f5d488742bc4879682bd93267c09a3bcdf8f869b", + "sha256:d784938534e255473155e4d9f276ee69eb85455b6af1292172c731409bf9adee", + "sha256:d784a98c492c751f228a4a894c3b8a092ff08b24e73b5568938c28b8c0e8f8df", + "sha256:d8a85e3e47e0d4eebfaf9a58b40aa94f977a56050cb5598ad5396a9ee7c087c6", + "sha256:d93321eea0dd7e81b283e47a1d20dee6069165cc158286316d0d06d340de8fe8", + "sha256:da52ee0dc8ba03348ffceab767bd8230842fdf78f8a996e2a16445747143a778", + "sha256:dab431699b5d45e0ca043bc580651ce9583dda594e62e245b7497adb32e99809", + "sha256:dac4bb42f8679aadc59bd91a4c5a1784a758ad49d0912995945cd674089f628e", + "sha256:e056fb9e22d118cc546107f97dc28b449d88274207dd28872bd668c86216e4f6", + "sha256:e09000b2599e1836314430f81a3884c66a5cbabdff5d9f175b5d560d4de38d78", + "sha256:e0ccb5aa0f3be2727117e5631200fbb3a5b3a2b3757545a92647d6dd8be6658f", + "sha256:e57a5ab08b0ec7a7caf30d2ac79bb30c89168431aca4f8854464bb9461686925", + "sha256:e9a7ffb1e551c6df51d267f5a751f042b183df22690f6feceac8d27364fd51d7", + "sha256:e9c80ce0001efa16066358d2dd77993adc25f5a6c61850e4ad096a2232930bce", + "sha256:eb2c1da1cc39509d1a55620a5f4d14f8911c47a89c926a96e6f4876e864375a3", + "sha256:edcf3121890b5f0616aa5a54683b1a5d2332037b970e507d6bb7841a3a596556", + "sha256:f603bdd8deac6726d39f41688ed353c532dd53935234405d79e9eb53f152fbfb", + "sha256:f8934bdfd202ebd0697040a3dff40dd77bc4c5bbf3527ede0532f5e7fb4d970f", + "sha256:fdb4adb76e21bad318210310590de61c9f4adcef77ee49b4a234f9dc48867869", + "sha256:fdb58dee173ef33634c3016c459d671ca12d11e6acf9db008261cbe58107e579" + ], + "markers": "python_version >= '3.7'", + "version": "==0.4.1" + }, + "scikit-image": { + "hashes": [ + "sha256:003ca2274ac0fac252280e7179ff986ff783407001459ddea443fe7916e38cff", + "sha256:018d734df1d2da2719087d15f679d19285fce97cd37695103deadfaef2873236", + "sha256:22318b35044cfeeb63ee60c56fc62450e5fe516228138f1d06c7a26378248a86", + "sha256:2bcb74adb0634258a67f66c2bb29978c9a3e222463e003b67ba12056c003971b", + "sha256:2c6ef454a85f569659b813ac2a93948022b0298516b757c9c6c904132be327e2", + "sha256:3663d063d8bf2fb9bdfb0ca967b9ee3b6593139c860c7abc2d2351a8a8863938", + "sha256:3b7a6c89e8d6252332121b58f50e1625c35f7d6a85489c0b6b7ee4f5155d547a", + "sha256:5071b8f6341bfb0737ab05c8ab4ac0261f9e25dbcc7b5d31e5ed230fd24a7929", + "sha256:6a92dca3d95b1301442af055e196a54b5a5128c6768b79fc0a4098f1d662dee6", + "sha256:722b970aa5da725dca55252c373b18bbea7858c1cdb406e19f9b01a4a73b30b2", + "sha256:74ec5c1d4693506842cc7c9487c89d8fc32aed064e9363def7af08b8f8cbb31d", + "sha256:95d6da2d8a44a36ae04437c76d32deb4e3c993ffc846b394b9949fd8ded73cb2", + "sha256:9e801c44a814afdadeabf4dffdffc23733e393767958b82319706f5fa3e1eaa9", + "sha256:a05ae4fe03d802587ed8974e900b943275548cde6a6807b785039d63e9a7a5ff", + "sha256:be79d7493f320a964f8fcf603121595ba82f84720de999db0fcca002266a549a", + "sha256:c472a1fb3665ec5c00423684590631d95f9afcbc97f01407d348b821880b2cb3", + "sha256:c5c378db54e61b491b9edeefff87e49fcf7fdf729bb93c777d7a5f15d36f743e", + "sha256:cf3c0c15b60ae3e557a0c7575fbd352f0c3ce0afca562febfe3ab80efbeec0e9", + "sha256:e87872f067444ee90a00dd49ca897208308645382e8a24bd3e76f301af2352cd", + "sha256:ebdbdc901bae14dab637f8d5c99f6d5cc7aaf4a3b6f4003194e003e9f688a6fc", + "sha256:f5b23908dd4d120e6aecb1ed0277563e8cbc8d6c0565bdc4c4c6475d53608452" + ], + "markers": "python_version >= '3.9'", + "version": "==0.22.0" + }, + "scikit-learn": { + "hashes": [ + "sha256:0402638c9a7c219ee52c94cbebc8fcb5eb9fe9c773717965c1f4185588ad3107", + 
"sha256:0ee107923a623b9f517754ea2f69ea3b62fc898a3641766cb7deb2f2ce450161", + "sha256:1215e5e58e9880b554b01187b8c9390bf4dc4692eedeaf542d3273f4785e342c", + "sha256:15e1e94cc23d04d39da797ee34236ce2375ddea158b10bee3c343647d615581d", + "sha256:18424efee518a1cde7b0b53a422cde2f6625197de6af36da0b57ec502f126157", + "sha256:1d08ada33e955c54355d909b9c06a4789a729977f165b8bae6f225ff0a60ec4a", + "sha256:3271552a5eb16f208a6f7f617b8cc6d1f137b52c8a1ef8edf547db0259b2c9fb", + "sha256:35a22e8015048c628ad099da9df5ab3004cdbf81edc75b396fd0cff8699ac58c", + "sha256:535805c2a01ccb40ca4ab7d081d771aea67e535153e35a1fd99418fcedd1648a", + "sha256:5b2de18d86f630d68fe1f87af690d451388bb186480afc719e5f770590c2ef6c", + "sha256:61a6efd384258789aa89415a410dcdb39a50e19d3d8410bd29be365bcdd512d5", + "sha256:64381066f8aa63c2710e6b56edc9f0894cc7bf59bd71b8ce5613a4559b6145e0", + "sha256:67f37d708f042a9b8d59551cf94d30431e01374e00dc2645fa186059c6c5d78b", + "sha256:6c43290337f7a4b969d207e620658372ba3c1ffb611f8bc2b6f031dc5c6d1d03", + "sha256:6fb6bc98f234fda43163ddbe36df8bcde1d13ee176c6dc9b92bb7d3fc842eb66", + "sha256:763f0ae4b79b0ff9cca0bf3716bcc9915bdacff3cebea15ec79652d1cc4fa5c9", + "sha256:785a2213086b7b1abf037aeadbbd6d67159feb3e30263434139c98425e3dcfcf", + "sha256:8db94cd8a2e038b37a80a04df8783e09caac77cbe052146432e67800e430c028", + "sha256:a19f90f95ba93c1a7f7924906d0576a84da7f3b2282ac3bfb7a08a32801add93", + "sha256:a2f54c76accc15a34bfb9066e6c7a56c1e7235dda5762b990792330b52ccfb05", + "sha256:b8692e395a03a60cd927125eef3a8e3424d86dde9b2370d544f0ea35f78a8073", + "sha256:cb06f8dce3f5ddc5dee1715a9b9f19f20d295bed8e3cd4fa51e1d050347de525", + "sha256:dc9002fc200bed597d5d34e90c752b74df516d592db162f756cc52836b38fe0e", + "sha256:e326c0eb5cf4d6ba40f93776a20e9a7a69524c4db0757e7ce24ba222471ee8a1", + "sha256:ed932ea780517b00dae7431e031faae6b49b20eb6950918eb83bd043237950e0", + "sha256:fc4144a5004a676d5022b798d9e573b05139e77f271253a4703eed295bde0433" + ], + "markers": "python_version >= '3.8'", + "version": "==1.3.2" + }, + "scipy": { + "hashes": [ + "sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c", + "sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6", + "sha256:1b7c3dca977f30a739e0409fb001056484661cb2541a01aba0bb0029f7b68db8", + "sha256:2c6ff6ef9cc27f9b3db93a6f8b38f97387e6e0591600369a297a50a8e96e835d", + "sha256:36750b7733d960d7994888f0d148d31ea3017ac15eef664194b4ef68d36a4a97", + "sha256:530f9ad26440e85766509dbf78edcfe13ffd0ab7fec2560ee5c36ff74d6269ff", + "sha256:5e347b14fe01003d3b78e196e84bd3f48ffe4c8a7b8a1afbcb8f5505cb710993", + "sha256:6550466fbeec7453d7465e74d4f4b19f905642c89a7525571ee91dd7adabb5a3", + "sha256:6df1468153a31cf55ed5ed39647279beb9cfb5d3f84369453b49e4b8502394fd", + "sha256:6e619aba2df228a9b34718efb023966da781e89dd3d21637b27f2e54db0410d7", + "sha256:8fce70f39076a5aa62e92e69a7f62349f9574d8405c0a5de6ed3ef72de07f446", + "sha256:90a2b78e7f5733b9de748f589f09225013685f9b218275257f8a8168ededaeaa", + "sha256:91af76a68eeae0064887a48e25c4e616fa519fa0d38602eda7e0f97d65d57937", + "sha256:933baf588daa8dc9a92c20a0be32f56d43faf3d1a60ab11b3f08c356430f6e56", + "sha256:acf8ed278cc03f5aff035e69cb511741e0418681d25fbbb86ca65429c4f4d9cd", + "sha256:ad669df80528aeca5f557712102538f4f37e503f0c5b9541655016dd0932ca79", + "sha256:b030c6674b9230d37c5c60ab456e2cf12f6784596d15ce8da9365e70896effc4", + "sha256:b9999c008ccf00e8fbcce1236f85ade5c569d13144f77a1946bef8863e8f6eb4", + "sha256:bc9a714581f561af0848e6b69947fda0614915f072dfd14142ed1bfe1b806710", + 
"sha256:ce7fff2e23ab2cc81ff452a9444c215c28e6305f396b2ba88343a567feec9660", + "sha256:cf00bd2b1b0211888d4dc75656c0412213a8b25e80d73898083f402b50f47e41", + "sha256:d10e45a6c50211fe256da61a11c34927c68f277e03138777bdebedd933712fea", + "sha256:ee410e6de8f88fd5cf6eadd73c135020bfbbbdfcd0f6162c36a7638a1ea8cc65", + "sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be", + "sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec" + ], + "markers": "python_version >= '3.9'", + "version": "==1.11.4" + }, + "sentry-sdk": { + "hashes": [ + "sha256:0017fa73b8ae2d4e57fd2522ee3df30453715b29d2692142793ec5d5f90b94a6", + "sha256:8feab81de6bbf64f53279b085bd3820e3e737403b0a0d9317f73a2c3374ae359" + ], + "version": "==1.38.0" + }, + "setproctitle": { + "hashes": [ + "sha256:00e6e7adff74796ef12753ff399491b8827f84f6c77659d71bd0b35870a17d8f", + "sha256:059f4ce86f8cc92e5860abfc43a1dceb21137b26a02373618d88f6b4b86ba9b2", + "sha256:088b9efc62d5aa5d6edf6cba1cf0c81f4488b5ce1c0342a8b67ae39d64001120", + "sha256:0d3a953c50776751e80fe755a380a64cb14d61e8762bd43041ab3f8cc436092f", + "sha256:1342f4fdb37f89d3e3c1c0a59d6ddbedbde838fff5c51178a7982993d238fe4f", + "sha256:184239903bbc6b813b1a8fc86394dc6ca7d20e2ebe6f69f716bec301e4b0199d", + "sha256:195c961f54a09eb2acabbfc90c413955cf16c6e2f8caa2adbf2237d1019c7dd8", + "sha256:1f5d9027eeda64d353cf21a3ceb74bb1760bd534526c9214e19f052424b37e42", + "sha256:200620c3b15388d7f3f97e0ae26599c0c378fdf07ae9ac5a13616e933cbd2086", + "sha256:200ede6fd11233085ba9b764eb055a2a191fb4ffb950c68675ac53c874c22e20", + "sha256:21112fcd2195d48f25760f0eafa7a76510871bbb3b750219310cf88b04456ae3", + "sha256:224602f0939e6fb9d5dd881be1229d485f3257b540f8a900d4271a2c2aa4e5f4", + "sha256:287490eb90e7a0ddd22e74c89a92cc922389daa95babc833c08cf80c84c4df0a", + "sha256:2982efe7640c4835f7355fdb4da313ad37fb3b40f5c69069912f8048f77b28c8", + "sha256:2df2b67e4b1d7498632e18c56722851ba4db5d6a0c91aaf0fd395111e51cdcf4", + "sha256:2e4a8104db15d3462e29d9946f26bed817a5b1d7a47eabca2d9dc2b995991503", + "sha256:2e71f6365744bf53714e8bd2522b3c9c1d83f52ffa6324bd7cbb4da707312cd8", + "sha256:334f7ed39895d692f753a443102dd5fed180c571eb6a48b2a5b7f5b3564908c8", + "sha256:33c5609ad51cd99d388e55651b19148ea99727516132fb44680e1f28dd0d1de9", + "sha256:37a62cbe16d4c6294e84670b59cf7adcc73faafe6af07f8cb9adaf1f0e775b19", + "sha256:38ae9a02766dad331deb06855fb7a6ca15daea333b3967e214de12cfae8f0ef5", + "sha256:38da436a0aaace9add67b999eb6abe4b84397edf4a78ec28f264e5b4c9d53cd5", + "sha256:415bfcfd01d1fbf5cbd75004599ef167a533395955305f42220a585f64036081", + "sha256:417de6b2e214e837827067048f61841f5d7fc27926f2e43954567094051aff18", + "sha256:477d3da48e216d7fc04bddab67b0dcde633e19f484a146fd2a34bb0e9dbb4a1e", + "sha256:4a6ba2494a6449b1f477bd3e67935c2b7b0274f2f6dcd0f7c6aceae10c6c6ba3", + "sha256:4fe1c49486109f72d502f8be569972e27f385fe632bd8895f4730df3c87d5ac8", + "sha256:507e8dc2891021350eaea40a44ddd887c9f006e6b599af8d64a505c0f718f170", + "sha256:53bc0d2358507596c22b02db079618451f3bd720755d88e3cccd840bafb4c41c", + "sha256:554eae5a5b28f02705b83a230e9d163d645c9a08914c0ad921df363a07cf39b1", + "sha256:59335d000c6250c35989394661eb6287187854e94ac79ea22315469ee4f4c244", + "sha256:5a740f05d0968a5a17da3d676ce6afefebeeeb5ce137510901bf6306ba8ee002", + "sha256:5bc94cf128676e8fac6503b37763adb378e2b6be1249d207630f83fc325d9b11", + "sha256:64286f8a995f2cd934082b398fc63fca7d5ffe31f0e27e75b3ca6b4efda4e353", + "sha256:664698ae0013f986118064b6676d7dcd28fefd0d7d5a5ae9497cbc10cba48fa5", + 
"sha256:68f960bc22d8d8e4ac886d1e2e21ccbd283adcf3c43136161c1ba0fa509088e0", + "sha256:69d565d20efe527bd8a9b92e7f299ae5e73b6c0470f3719bd66f3cd821e0d5bd", + "sha256:6a143b31d758296dc2f440175f6c8e0b5301ced3b0f477b84ca43cdcf7f2f476", + "sha256:6a249415f5bb88b5e9e8c4db47f609e0bf0e20a75e8d744ea787f3092ba1f2d0", + "sha256:6b9e62ddb3db4b5205c0321dd69a406d8af9ee1693529d144e86bd43bcb4b6c0", + "sha256:7f1d36a1e15a46e8ede4e953abb104fdbc0845a266ec0e99cc0492a4364f8c44", + "sha256:816330675e3504ae4d9a2185c46b573105d2310c20b19ea2b4596a9460a4f674", + "sha256:87e668f9561fd3a457ba189edfc9e37709261287b52293c115ae3487a24b92f6", + "sha256:897a73208da48db41e687225f355ce993167079eda1260ba5e13c4e53be7f754", + "sha256:8c331e91a14ba4076f88c29c777ad6b58639530ed5b24b5564b5ed2fd7a95452", + "sha256:950f6476d56ff7817a8fed4ab207727fc5260af83481b2a4b125f32844df513a", + "sha256:9617b676b95adb412bb69645d5b077d664b6882bb0d37bfdafbbb1b999568d85", + "sha256:9e3b99b338598de0bd6b2643bf8c343cf5ff70db3627af3ca427a5e1a1a90dd9", + "sha256:a1fcac43918b836ace25f69b1dca8c9395253ad8152b625064415b1d2f9be4fb", + "sha256:a680d62c399fa4b44899094027ec9a1bdaf6f31c650e44183b50d4c4d0ccc085", + "sha256:a6d50252377db62d6a0bb82cc898089916457f2db2041e1d03ce7fadd4a07381", + "sha256:a83ca086fbb017f0d87f240a8f9bbcf0809f3b754ee01cec928fff926542c450", + "sha256:a911b26264dbe9e8066c7531c0591cfab27b464459c74385b276fe487ca91c12", + "sha256:ab2900d111e93aff5df9fddc64cf51ca4ef2c9f98702ce26524f1acc5a786ae7", + "sha256:ab92e51cd4a218208efee4c6d37db7368fdf182f6e7ff148fb295ecddf264287", + "sha256:accb66d7b3ccb00d5cd11d8c6e07055a4568a24c95cf86109894dcc0c134cc89", + "sha256:ad6d20f9541f5f6ac63df553b6d7a04f313947f550eab6a61aa758b45f0d5657", + "sha256:aeaa71fb9568ebe9b911ddb490c644fbd2006e8c940f21cb9a1e9425bd709574", + "sha256:af2c67ae4c795d1674a8d3ac1988676fa306bcfa1e23fddb5e0bd5f5635309ca", + "sha256:af4061f67fd7ec01624c5e3c21f6b7af2ef0e6bab7fbb43f209e6506c9ce0092", + "sha256:b1067647ac7aba0b44b591936118a22847bda3c507b0a42d74272256a7a798e9", + "sha256:b5901a31012a40ec913265b64e48c2a4059278d9f4e6be628441482dd13fb8b5", + "sha256:bbbd6c7de0771c84b4aa30e70b409565eb1fc13627a723ca6be774ed6b9d9fa3", + "sha256:bdfd7254745bb737ca1384dee57e6523651892f0ea2a7344490e9caefcc35e64", + "sha256:c05ac48ef16ee013b8a326c63e4610e2430dbec037ec5c5b58fcced550382b74", + "sha256:c1c84beab776b0becaa368254801e57692ed749d935469ac10e2b9b825dbdd8e", + "sha256:c32c41ace41f344d317399efff4cffb133e709cec2ef09c99e7a13e9f3b9483c", + "sha256:c3ba57029c9c50ecaf0c92bb127224cc2ea9fda057b5d99d3f348c9ec2855ad3", + "sha256:c7951820b77abe03d88b114b998867c0f99da03859e5ab2623d94690848d3e45", + "sha256:c913e151e7ea01567837ff037a23ca8740192880198b7fbb90b16d181607caae", + "sha256:c9a402881ec269d0cc9c354b149fc29f9ec1a1939a777f1c858cdb09c7a261df", + "sha256:cbf16381c7bf7f963b58fb4daaa65684e10966ee14d26f5cc90f07049bfd8c1e", + "sha256:d4460795a8a7a391e3567b902ec5bdf6c60a47d791c3b1d27080fc203d11c9dc", + "sha256:d7f27e0268af2d7503386e0e6be87fb9b6657afd96f5726b733837121146750d", + "sha256:d876d355c53d975c2ef9c4f2487c8f83dad6aeaaee1b6571453cb0ee992f55f6", + "sha256:da0d57edd4c95bf221b2ebbaa061e65b1788f1544977288bdf95831b6e44e44d", + "sha256:ddedd300cd690a3b06e7eac90ed4452348b1348635777ce23d460d913b5b63c3", + "sha256:df3f4274b80709d8bcab2f9a862973d453b308b97a0b423a501bcd93582852e3", + "sha256:e18b7bd0898398cc97ce2dfc83bb192a13a087ef6b2d5a8a36460311cb09e775", + "sha256:e5119a211c2e98ff18b9908ba62a3bd0e3fabb02a29277a7232a6fb4b2560aa0", + "sha256:e5e08e232b78ba3ac6bc0d23ce9e2bee8fad2be391b7e2da834fc9a45129eb87", + 
"sha256:eae8988e78192fd1a3245a6f4f382390b61bce6cfcc93f3809726e4c885fa68d", + "sha256:f05e66746bf9fe6a3397ec246fe481096664a9c97eb3fea6004735a4daf867fd", + "sha256:f1da82c3e11284da4fcbf54957dafbf0655d2389cd3d54e4eaba636faf6d117a", + "sha256:f38d48abc121263f3b62943f84cbaede05749047e428409c2c199664feb6abc7", + "sha256:f5e7266498cd31a4572378c61920af9f6b4676a73c299fce8ba93afd694f8ae7", + "sha256:fc74e84fdfa96821580fb5e9c0b0777c1c4779434ce16d3d62a9c4d8c710df39", + "sha256:ff814dea1e5c492a4980e3e7d094286077054e7ea116cbeda138819db194b2cd" + ], + "markers": "python_version >= '3.7'", + "version": "==1.3.3" + }, + "setuptools": { + "hashes": [ + "sha256:1e8fdff6797d3865f37397be788a4e3cba233608e9b509382a2777d25ebde7f2", + "sha256:735896e78a4742605974de002ac60562d286fa8051a7e2299445e8e8fbb01aa6" + ], + "markers": "python_version >= '3.8'", + "version": "==69.0.2" + }, + "shortuuid": { + "hashes": [ + "sha256:27ea8f28b1bd0bf8f15057a3ece57275d2059d2b0bb02854f02189962c13b6aa", + "sha256:fc75f2615914815a8e4cb1501b3a513745cb66ef0fd5fc6fb9f8c3fa3481f789" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.11" + }, + "six": { + "hashes": [ + "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", + "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.16.0" + }, + "smmap": { + "hashes": [ + "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62", + "sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da" + ], + "markers": "python_version >= '3.7'", + "version": "==5.0.1" + }, + "sympy": { + "hashes": [ + "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5", + "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8" + ], + "markers": "python_version >= '3.8'", + "version": "==1.12" + }, + "termcolor": { + "hashes": [ + "sha256:9297c0df9c99445c2412e832e882a7884038a25617c60cea2ad69488d4040d63", + "sha256:aab9e56047c8ac41ed798fa36d892a37aca6b3e9159f3e0c24bc64a9b3ac7b7a" + ], + "markers": "python_version >= '3.8'", + "version": "==2.4.0" + }, + "threadpoolctl": { + "hashes": [ + "sha256:2b7818516e423bdaebb97c723f86a7c6b0a83d3f3b0970328d66f4d9104dc032", + "sha256:c96a0ba3bdddeaca37dc4cc7344aafad41cdb8c313f74fdfe387a867bba93355" + ], + "markers": "python_version >= '3.8'", + "version": "==3.2.0" + }, + "tifffile": { + "hashes": [ + "sha256:1de47fa945fddaade256e25ad4f375ae65547f3c1354063aded881c32a64cf89", + "sha256:67e355e4595aab397f8405d04afe1b4ae7c6f62a44e22d933fee1a571a48c7ae" + ], + "markers": "python_version >= '3.9'", + "version": "==2023.9.26" + }, + "timm": { + "hashes": [ + "sha256:2a828afac5b710a80ec66d0f85807e171e342faf5c0703b33102d8aa206f19dc", + "sha256:9121d1cf320f7f32490d893340fd33117bda0a0270eb8282dfd52ae5fd3e1af6" + ], + "index": "pypi", + "markers": "python_version >= '3.7'", + "version": "==0.9.12" + }, + "torch": { + "hashes": [ + "sha256:0d4e8c52a1fcf5ed6cfc256d9a370fcf4360958fc79d0b08a51d55e70914df46", + "sha256:3b7c6dd1ab12a9c70b29bf1ea34fcf2c519233c58c619c1a553d328955c8a602", + "sha256:4c83190ad649c77adaf6e1c616998f10598db696912ea7a410831632890b49bf", + "sha256:6ee083ba804e863af059ea284c1678c1b0628699fb0014c8e043ceed7d4ce930", + "sha256:94b60ae7562ae732554ae8744123b33d46e659c3251a5a58c7269c12e838868b", + "sha256:aa984599c2c4ffbc57c48d0d965cbe832e610c967e8179d4ac0a582c733fe112", + "sha256:fb808a620951b8cfb4b55cbaf8ace4b7a6d51c5be03d46513d73d009f43aafeb", + 
"sha256:ff0ee0b7ab3d6cfbf7875c8b1d130309ee5a18fbf2fda11f3da86a783e6e679c" + ], + "index": "downloadpytorch", + "version": "==2.1.0+cu121" + }, + "torchdata": { + "hashes": [ + "sha256:042db39edbc961c50a36c45b89aea4b099858140c13746c7cc7a87b1cc219d0c", + "sha256:293e399f3988fcd8d24156188342e9265273787dc0a29b1b37891a1045eeaece", + "sha256:2d2c8482313dd52652caff99dc530433d898a12bb80bc33a0a4d1680d63272e0", + "sha256:36c591d0910ede6a496d4fccd389f27e7bccabf8b6a8722712ecf28b98bda8ae", + "sha256:432295d9d33a7497d3c4aee667998af5bd9dcf55bd10b77c6af1ac72249efe22", + "sha256:53ef621460e2bc014069c126cbdcd325bf73d78836155a350767fe5f8ca29f11", + "sha256:7460a5fa298e7cd5cef98e8e6455d481e5c73d39a462a89a38918389c8153e20", + "sha256:8ac74fc6ce8bf289b8d99ea183f78e1bf2a4754ea6a2b1dcb219095b0aaacb78", + "sha256:8bbaecdbc1a7dc4a9d7dc7545ea4c12be92a3df8d2494f089e974e25591f514a", + "sha256:91a78c677a3e4e2447d888587f7ea0b4ddde81ca95adf709ba0d3dc9a4e9542d", + "sha256:94ce50572550010db0431283b5d228f8727f779aafd9cbbbcdc37028a5085603", + "sha256:9f9476a26987d90fa3f87cb09ec82b78ce6031ddcaa91851c9fa9f732a987ab8", + "sha256:ba802128f50bfa227be107027e0230581b3a4ac70d14782b44662b7c71159cf1", + "sha256:c1feae257e55b2942459a26f5597088e5daefabbd47a562081c01c0841a88e18", + "sha256:d256535648dfb94d1226f233768c6798d1841edfdbf0a09b2115e6cbbda614f9", + "sha256:eed10b1f9265b30161a8cd129a96cfc7da202cfc70acf8b6a21fd29e18274ca3", + "sha256:fa325d628aa6125c6b46b6fa27c94150ca9276edbba1042d3eb3cd9c1039b5a9" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==0.7.1" + }, + "torchmetrics": { + "hashes": [ + "sha256:217387738f84939c39b534b20d4983e737cc448d27aaa5340e0327948d97ca3e", + "sha256:fe03a8c53d0ae5800d34ea615f56295fda281282cd83f647d2184e81c1d4efee" + ], + "markers": "python_version >= '3.8'", + "version": "==1.2.1" + }, + "torchvision": { + "hashes": [ + "sha256:09dea0b374be56df4ae148e83221f172a8a6c999475e9483037ab6efa3cd6b80", + "sha256:5ea8bba7e82bf89467e7f8790499e3c20c97c7d0ccd778789ccdd2d84c647c9a", + "sha256:7a325270c7806571ceddbd27c8ece5c163cceb476f09dcca7eb5157073216b22", + "sha256:7bb32e2c163766045693e557b61f278eb54960e7cab7c5d6037b89847605e26c", + "sha256:967c0f8ada2abeed430b33e83ce7b3de5233675c8b020c18c1255dca065b4add", + "sha256:d4be401c6a9b2af7ea476e79a6397a77a952b6a3bec6218452be13d1b18b9c54", + "sha256:e0b29e67f0f2b059fc99d0bb9dae3f107a0bcd656e0310504758569a137f874b", + "sha256:e76e78d0ad43636c9884b3084ffaea8a8b61f21129fbfa456a5fe734f0affea9" + ], + "index": "downloadpytorch", + "version": "==0.16.0+cu121" + }, + "tqdm": { + "hashes": [ + "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386", + "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7" + ], + "markers": "python_version >= '3.7'", + "version": "==4.66.1" + }, + "triton": { + "hashes": [ + "sha256:143582ca31dd89cd982bd3bf53666bab1c7527d41e185f9e3d8a3051ce1b663b", + "sha256:21544e522c02005a626c8ad63d39bdff2f31d41069592919ef281e964ed26446", + "sha256:39f6fb6bdccb3e98f3152e3fbea724f1aeae7d749412bbb1fa9c441d474eba26", + "sha256:66439923a30d5d48399b08a9eae10370f6c261a5ec864a64983bae63152d39d7", + "sha256:81a96d110a738ff63339fc892ded095b31bd0d205e3aace262af8400d40b6fa8", + "sha256:82fc5aeeedf6e36be4e4530cbdcba81a09d65c18e02f52dc298696d45721f3bd", + "sha256:919b06453f0033ea52c13eaf7833de0e57db3178d23d4e04f9fc71c4f2c32bf8", + "sha256:ae4bb8a91de790e1866405211c4d618379781188f40d5c4c399766914e84cd94" + ], + "version": "==2.1.0" + }, + "typing-extensions": { + "hashes": [ + 
"sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0", + "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef" + ], + "markers": "python_version >= '3.8'", + "version": "==4.8.0" + }, + "urllib3": { + "hashes": [ + "sha256:55901e917a5896a349ff771be919f8bd99aff50b79fe58fec595eb37bbc56bb3", + "sha256:df7aa8afb0148fa78488e7899b2c59b5f4ffcfa82e6c54ccb9dd37c1d7b52d54" + ], + "markers": "python_version >= '3.8'", + "version": "==2.1.0" + }, + "wandb": { + "hashes": [ + "sha256:150842447d355d90dc7f368b824951a625e5b2d1be355a00e99b11b73728bc1f", + "sha256:1975ff88c5024923c3321c93cfefb8d9b871543c0b009f34001bf0f31e444b04" + ], + "markers": "python_version >= '3.6'", + "version": "==0.12.21" + }, + "webdataset": { + "hashes": [ + "sha256:7eefdcdf46acc01e80cb6c212acb267e046011c75d4ebdf6a2769f5312eed619", + "sha256:a8ef8809e8c6b32423e40811a529b76a0799d93e3eba1bd1671640906a5fe138" + ], + "index": "pypi", + "markers": "python_version >= '3.6'", + "version": "==0.2.79" + }, + "yarl": { + "hashes": [ + "sha256:09c19e5f4404574fcfb736efecf75844ffe8610606f3fccc35a1515b8b6712c4", + "sha256:0ab5baaea8450f4a3e241ef17e3d129b2143e38a685036b075976b9c415ea3eb", + "sha256:0d155a092bf0ebf4a9f6f3b7a650dc5d9a5bbb585ef83a52ed36ba46f55cc39d", + "sha256:126638ab961633f0940a06e1c9d59919003ef212a15869708dcb7305f91a6732", + "sha256:1a0a4f3aaa18580038cfa52a7183c8ffbbe7d727fe581300817efc1e96d1b0e9", + "sha256:1d93461e2cf76c4796355494f15ffcb50a3c198cc2d601ad8d6a96219a10c363", + "sha256:26a1a8443091c7fbc17b84a0d9f38de34b8423b459fb853e6c8cdfab0eacf613", + "sha256:271d63396460b6607b588555ea27a1a02b717ca2e3f2cf53bdde4013d7790929", + "sha256:28a108cb92ce6cf867690a962372996ca332d8cda0210c5ad487fe996e76b8bb", + "sha256:29beac86f33d6c7ab1d79bd0213aa7aed2d2f555386856bb3056d5fdd9dab279", + "sha256:2c757f64afe53a422e45e3e399e1e3cf82b7a2f244796ce80d8ca53e16a49b9f", + "sha256:2dad8166d41ebd1f76ce107cf6a31e39801aee3844a54a90af23278b072f1ccf", + "sha256:2dc72e891672343b99db6d497024bf8b985537ad6c393359dc5227ef653b2f17", + "sha256:2f3c8822bc8fb4a347a192dd6a28a25d7f0ea3262e826d7d4ef9cc99cd06d07e", + "sha256:32435d134414e01d937cd9d6cc56e8413a8d4741dea36af5840c7750f04d16ab", + "sha256:3cfa4dbe17b2e6fca1414e9c3bcc216f6930cb18ea7646e7d0d52792ac196808", + "sha256:3d5434b34100b504aabae75f0622ebb85defffe7b64ad8f52b8b30ec6ef6e4b9", + "sha256:4003f380dac50328c85e85416aca6985536812c082387255c35292cb4b41707e", + "sha256:44e91a669c43f03964f672c5a234ae0d7a4d49c9b85d1baa93dec28afa28ffbd", + "sha256:4a14907b597ec55740f63e52d7fee0e9ee09d5b9d57a4f399a7423268e457b57", + "sha256:4ce77d289f8d40905c054b63f29851ecbfd026ef4ba5c371a158cfe6f623663e", + "sha256:4d6d74a97e898c1c2df80339aa423234ad9ea2052f66366cef1e80448798c13d", + "sha256:51382c72dd5377861b573bd55dcf680df54cea84147c8648b15ac507fbef984d", + "sha256:525cd69eff44833b01f8ef39aa33a9cc53a99ff7f9d76a6ef6a9fb758f54d0ff", + "sha256:53ec65f7eee8655bebb1f6f1607760d123c3c115a324b443df4f916383482a67", + "sha256:5f74b015c99a5eac5ae589de27a1201418a5d9d460e89ccb3366015c6153e60a", + "sha256:6280353940f7e5e2efaaabd686193e61351e966cc02f401761c4d87f48c89ea4", + "sha256:632c7aeb99df718765adf58eacb9acb9cbc555e075da849c1378ef4d18bf536a", + "sha256:6465d36381af057d0fab4e0f24ef0e80ba61f03fe43e6eeccbe0056e74aadc70", + "sha256:66a6dbf6ca7d2db03cc61cafe1ee6be838ce0fbc97781881a22a58a7c5efef42", + "sha256:6d350388ba1129bc867c6af1cd17da2b197dff0d2801036d2d7d83c2d771a682", + "sha256:7217234b10c64b52cc39a8d82550342ae2e45be34f5bff02b890b8c452eb48d7", + 
"sha256:721ee3fc292f0d069a04016ef2c3a25595d48c5b8ddc6029be46f6158d129c92", + "sha256:72a57b41a0920b9a220125081c1e191b88a4cdec13bf9d0649e382a822705c65", + "sha256:73cc83f918b69110813a7d95024266072d987b903a623ecae673d1e71579d566", + "sha256:778df71c8d0c8c9f1b378624b26431ca80041660d7be7c3f724b2c7a6e65d0d6", + "sha256:79e1df60f7c2b148722fb6cafebffe1acd95fd8b5fd77795f56247edaf326752", + "sha256:7c86d0d0919952d05df880a1889a4f0aeb6868e98961c090e335671dea5c0361", + "sha256:7eaf13af79950142ab2bbb8362f8d8d935be9aaf8df1df89c86c3231e4ff238a", + "sha256:828235a2a169160ee73a2fcfb8a000709edf09d7511fccf203465c3d5acc59e4", + "sha256:8535e111a064f3bdd94c0ed443105934d6f005adad68dd13ce50a488a0ad1bf3", + "sha256:88d2c3cc4b2f46d1ba73d81c51ec0e486f59cc51165ea4f789677f91a303a9a7", + "sha256:8a2538806be846ea25e90c28786136932ec385c7ff3bc1148e45125984783dc6", + "sha256:8dab30b21bd6fb17c3f4684868c7e6a9e8468078db00f599fb1c14e324b10fca", + "sha256:8f18a7832ff85dfcd77871fe677b169b1bc60c021978c90c3bb14f727596e0ae", + "sha256:946db4511b2d815979d733ac6a961f47e20a29c297be0d55b6d4b77ee4b298f6", + "sha256:96758e56dceb8a70f8a5cff1e452daaeff07d1cc9f11e9b0c951330f0a2396a7", + "sha256:9a172c3d5447b7da1680a1a2d6ecdf6f87a319d21d52729f45ec938a7006d5d8", + "sha256:9a5211de242754b5e612557bca701f39f8b1a9408dff73c6db623f22d20f470e", + "sha256:9df9a0d4c5624790a0dea2e02e3b1b3c69aed14bcb8650e19606d9df3719e87d", + "sha256:aa4643635f26052401750bd54db911b6342eb1a9ac3e74f0f8b58a25d61dfe41", + "sha256:aed37db837ecb5962469fad448aaae0f0ee94ffce2062cf2eb9aed13328b5196", + "sha256:af52725c7c39b0ee655befbbab5b9a1b209e01bb39128dce0db226a10014aacc", + "sha256:b0b8c06afcf2bac5a50b37f64efbde978b7f9dc88842ce9729c020dc71fae4ce", + "sha256:b61e64b06c3640feab73fa4ff9cb64bd8182de52e5dc13038e01cfe674ebc321", + "sha256:b7831566595fe88ba17ea80e4b61c0eb599f84c85acaa14bf04dd90319a45b90", + "sha256:b8bc5b87a65a4e64bc83385c05145ea901b613d0d3a434d434b55511b6ab0067", + "sha256:b8d51817cf4b8d545963ec65ff06c1b92e5765aa98831678d0e2240b6e9fd281", + "sha256:b9f9cafaf031c34d95c1528c16b2fa07b710e6056b3c4e2e34e9317072da5d1a", + "sha256:bb72d2a94481e7dc7a0c522673db288f31849800d6ce2435317376a345728225", + "sha256:c25ec06e4241e162f5d1f57c370f4078797ade95c9208bd0c60f484834f09c96", + "sha256:c405d482c320a88ab53dcbd98d6d6f32ada074f2d965d6e9bf2d823158fa97de", + "sha256:c4472fe53ebf541113e533971bd8c32728debc4c6d8cc177f2bff31d011ec17e", + "sha256:c4b1efb11a8acd13246ffb0bee888dd0e8eb057f8bf30112e3e21e421eb82d4a", + "sha256:c5f3faeb8100a43adf3e7925d556801d14b5816a0ac9e75e22948e787feec642", + "sha256:c6f034386e5550b5dc8ded90b5e2ff7db21f0f5c7de37b6efc5dac046eb19c10", + "sha256:c99ddaddb2fbe04953b84d1651149a0d85214780e4d0ee824e610ab549d98d92", + "sha256:ca6b66f69e30f6e180d52f14d91ac854b8119553b524e0e28d5291a724f0f423", + "sha256:cccdc02e46d2bd7cb5f38f8cc3d9db0d24951abd082b2f242c9e9f59c0ab2af3", + "sha256:cd49a908cb6d387fc26acee8b7d9fcc9bbf8e1aca890c0b2fdfd706057546080", + "sha256:cf7a4e8de7f1092829caef66fd90eaf3710bc5efd322a816d5677b7664893c93", + "sha256:cfd77e8e5cafba3fb584e0f4b935a59216f352b73d4987be3af51f43a862c403", + "sha256:d34c4f80956227f2686ddea5b3585e109c2733e2d4ef12eb1b8b4e84f09a2ab6", + "sha256:d61a0ca95503867d4d627517bcfdc28a8468c3f1b0b06c626f30dd759d3999fd", + "sha256:d81657b23e0edb84b37167e98aefb04ae16cbc5352770057893bd222cdc6e45f", + "sha256:d92d897cb4b4bf915fbeb5e604c7911021a8456f0964f3b8ebbe7f9188b9eabb", + "sha256:dd318e6b75ca80bff0b22b302f83a8ee41c62b8ac662ddb49f67ec97e799885d", + "sha256:dd952b9c64f3b21aedd09b8fe958e4931864dba69926d8a90c90d36ac4e28c9a", + 
"sha256:e0e7e83f31e23c5d00ff618045ddc5e916f9e613d33c5a5823bc0b0a0feb522f", + "sha256:e0f17d1df951336a02afc8270c03c0c6e60d1f9996fcbd43a4ce6be81de0bd9d", + "sha256:e2a16ef5fa2382af83bef4a18c1b3bcb4284c4732906aa69422cf09df9c59f1f", + "sha256:e36021db54b8a0475805acc1d6c4bca5d9f52c3825ad29ae2d398a9d530ddb88", + "sha256:e73db54c967eb75037c178a54445c5a4e7461b5203b27c45ef656a81787c0c1b", + "sha256:e741bd48e6a417bdfbae02e088f60018286d6c141639359fb8df017a3b69415a", + "sha256:f7271d6bd8838c49ba8ae647fc06469137e1c161a7ef97d778b72904d9b68696", + "sha256:fc391e3941045fd0987c77484b2799adffd08e4b6735c4ee5f054366a2e1551d", + "sha256:fc94441bcf9cb8c59f51f23193316afefbf3ff858460cb47b5758bf66a14d130", + "sha256:fe34befb8c765b8ce562f0200afda3578f8abb159c76de3ab354c80b72244c41", + "sha256:fe8080b4f25dfc44a86bedd14bc4f9d469dfc6456e6f3c5d9077e81a5fedfba7", + "sha256:ff34cb09a332832d1cf38acd0f604c068665192c6107a439a92abfd8acf90fe2" + ], + "markers": "python_version >= '3.7'", + "version": "==1.9.3" + } + }, + "develop": {} +} diff --git a/src/images/Diffusion/README.md b/src/images/Diffusion/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b17d0b7d7056d0366fea17de7532f7f95b5cc426 --- /dev/null +++ b/src/images/Diffusion/README.md @@ -0,0 +1,72 @@ +# AI-generated image detection + +This is a group project developed by a team of two individuals. + +## Managing Python packages + +Use of `pipenv` is recommended. The required packages are in `Pipfile`, and can be installed using `pipenv install`. + +## Scraping script for Reddit + +`python scrape.py --subreddit midjourney --flair Showcase` + +This command will scrape the midjourney subreddit, and filter posts that contain the "Showcase" flair. The default number of images to scrape is 30000. The output will contain a parquet file containing metadata, and a csv file containing the urls. + +`img2dataset --url_list=urls/midjourney.csv --output_folder=data/midjourney --thread_count=64 --resize_mode=no --output_format=webdataset` + +This command will download the images in the webdataset format. + + +## Laion script for real images + +`wget -l1 -r --no-parent https://the-eye.eu/public/AI/cah/laion400m-met-release/laion400m-meta/ +mv the-eye.eu/public/AI/cah/laion400m-met-release/laion400m-meta/ .` + +This command will download a 50GB url metadata dataset in 32 parquet files. + +`sample_laion_script.ipynb` + +This script consolidates the parquet files, excludes NSFW images, and selects a subset of 224,917 images. + +`combine_laion_script` + +This script combines the outputs from earlier into 1 parquet file. + +`img2dataset --url_list urls/laion.parquet --input_format "parquet" --url_col "URL" --caption_col "TEXT" --skip_reencode True --output_format webdataset --output_folder data/laion400m_data --processes_count 16 --thread_count 128 --resize_mode no --save_additional_columns '["NSFW","similarity","LICENSE"]' --enable_wandb True` + +This command will download the images in the webdataset format. + + +## Data splitting, preprocessing and loading + +`data_split.py` splits the data according to 80/10/10. The number of samples: + +``` +./data/laion400m_data: (115346, 14418, 14419) +./data/genai-images/StableDiffusion: (22060, 2757, 2758) +./data/genai-images/midjourney: (21096, 2637, 2637) +./data/genai-images/dalle2: (13582, 1697, 1699) +./data/genai-images/dalle3: (12027, 1503, 1504) +``` + +Each sample contains image, target label(1 for GenAI images), and domain label(denoting which generator the image is from). 
The meaning of the domain label is:
+
+```
+DOMAIN_LABELS = {
+    0: "laion",
+    1: "StableDiffusion",
+    2: "dalle2",
+    3: "dalle3",
+    4: "midjourney"
+}
+```
+
+The `load_dataloader()` function in `dataloader.py` returns a `torchdata.dataloader2.DataLoader2` given a list of domains for GenAI images (a subset of `[1, 2, 3, 4]`; LAION is always included). When building the training dataset, data augmentation and class-balanced sampling are applied. The loader is very memory-intensive (>20 GB) and takes some time to fill its shuffle buffer before producing batches. Use the dataloader in this way:
+
+```
+for epoch in range(10):
+    dl.seed(epoch)
+    for d in dl:
+        model(d)
+dl.shutdown()
+```
diff --git a/src/images/Diffusion/combine_laion_script.ipynb b/src/images/Diffusion/combine_laion_script.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..094d7a45d1b83997e79631c9a1490bb3b07bcb11
--- /dev/null
+++ b/src/images/Diffusion/combine_laion_script.ipynb
@@ -0,0 +1,117 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install pyspark"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "current_directory = os.getcwd()\n",
+    "print(current_directory)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.chdir(current_directory)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from pyspark.sql import SparkSession\n",
+    "from pyspark.sql.functions import col\n",
+    "\n",
+    "spark = SparkSession.builder.appName(\"CombineParquetFiles\").config(\"spark.executor.memory\", \"8g\").config(\"spark.executor.cores\", \"4\").config(\"spark.executor.instances\", \"3\").config(\"spark.dynamicAllocation.enabled\", \"true\").config(\"spark.task.maxFailures\", 10).getOrCreate()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "parquet_directory_path = '/Users/fionachow/Documents/NYU/CDS/Fall 2023/CSCI - GA 2271 - Computer Vision/Project/laion_sampled'\n",
+    "\n",
+    "output_parquet_file = '/Users/fionachow/Documents/NYU/CDS/Fall 2023/CSCI - GA 2271 - Computer Vision/Project/laion_combined'\n",
+    "\n",
+    "df = spark.read.parquet(parquet_directory_path)\n",
+    "\n",
+    "df_coalesced = df.coalesce(1)\n",
+    "\n",
+    "df_coalesced.write.mode('overwrite').parquet(output_parquet_file)\n",
+    "\n",
+    "row_count = df.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(row_count)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "parquet_directory_path = '/Users/fionachow/Documents/NYU/CDS/Fall 2023/CSCI - GA 2271 - Computer Vision/Project/laion_combined/part-00000-0190eea7-02ac-4ea0-86fd-0722308c0c58-c000.snappy.parquet'\n",
+    "\n",
+    "df = spark.read.parquet(parquet_directory_path)\n",
+    "\n",
+    "df.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df.count())"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "bloom",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
"name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/images/Diffusion/data_split.py b/src/images/Diffusion/data_split.py new file mode 100644 index 0000000000000000000000000000000000000000..01c10ad0d88d2ce5ae46487072c5e06d7ef23389 --- /dev/null +++ b/src/images/Diffusion/data_split.py @@ -0,0 +1,80 @@ +import glob +import json + +import webdataset as wds + + +def split_dataset(path, n_train, n_val, n_test, label, domain_label): + max_file_size = 1000 + input_files = glob.glob(path + "/*.tar") + src = wds.WebDataset(input_files) + + train_path_prefix = path + "/train" + val_path_prefix = path + "/val" + test_path_prefix = path + "/test" + + def write_split(dataset, prefix, start, end): + n_split = end - start + output_files = [ + f"{prefix}_{i}.tar" for i in range(n_split // max_file_size + 1) + ] + for i, output_file in enumerate(output_files): + print(f"Writing {output_file}") + with wds.TarWriter(output_file) as dst: + for sample in dataset.slice( + start + i * max_file_size, + min(start + (i + 1) * max_file_size, end), + ): + new_sample = { + "__key__": sample["__key__"], + "jpg": sample["jpg"], + "label.cls": label, + "domain_label.cls": domain_label, + } + dst.write(new_sample) + + write_split(src, train_path_prefix, 0, n_train) + write_split(src, val_path_prefix, n_train, n_train + n_val) + write_split( + src, + test_path_prefix, + n_train + n_val, + n_train + n_val + n_test, + ) + + +def calculate_sizes(path): + stat_files = glob.glob(path + "/*_stats.json") + total = 0 + for f in stat_files: + with open(f) as stats: + total += json.load(stats)["successes"] + n_train = int(total * 0.8) + n_val = int(total * 0.1) + n_test = total - n_train - n_val + + return n_train, n_val, n_test + + +if __name__ == "__main__": + + paths = [ + "./data/laion400m_data", + "./data/genai-images/StableDiffusion", + "./data/genai-images/midjourney", + "./data/genai-images/dalle2", + "./data/genai-images/dalle3", + ] + + sizes = [] + for p in paths: + res = calculate_sizes(p) + sizes.append(res) + + domain_labels = [0, 1, 4, 2, 3] + + for i, p in enumerate(paths): + print(f"{p}: {sizes[i]}") + label = 0 if i == 0 else 1 + print(label, domain_labels[i]) + split_dataset(p, *calculate_sizes(p), label, domain_labels[i]) diff --git a/src/images/Diffusion/dataloader.py b/src/images/Diffusion/dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..300051bf19140082b7db276492eadb13cb70e1d9 --- /dev/null +++ b/src/images/Diffusion/dataloader.py @@ -0,0 +1,228 @@ +import argparse +import collections +import random +from typing import Iterator + +import cv2 +import numpy as np +import torchdata.datapipes as dp +from imwatermark import WatermarkEncoder +from PIL import ( + Image, + ImageFile, +) +from torch.utils.data import DataLoader +from torchdata.datapipes.iter import ( + Concater, + FileLister, + FileOpener, + SampleMultiplexer, +) +from torchvision.transforms import v2 +from tqdm import tqdm + +ImageFile.LOAD_TRUNCATED_IMAGES = True +Image.MAX_IMAGE_PIXELS = 1000000000 + +encoder = WatermarkEncoder() +encoder.set_watermark("bytes", b"test") + + +DOMAIN_LABELS = { + 0: "laion", + 1: "StableDiffusion", + 2: "dalle2", + 3: "dalle3", + 4: "midjourney", +} + +N_SAMPLES = { + 0: (115346, 14418, 14419), + 1: (22060, 2757, 2758), + 4: (21096, 2637, 2637), + 2: (13582, 1697, 1699), + 3: (12027, 1503, 1504), +} + + +@dp.functional_datapipe("collect_from_workers") +class 
WorkerResultCollector(dp.iter.IterDataPipe): + def __init__(self, source: dp.iter.IterDataPipe): + self.source = source + + def __iter__(self) -> Iterator: + yield from self.source + + def is_replicable(self) -> bool: + """Method to force data back to main process""" + return False + + +def crop_bottom(image, cutoff=16): + return image[:, :-cutoff, :] + + +def random_gaussian_blur(image, p=0.01): + if random.random() < p: + return v2.functional.gaussian_blur(image, kernel_size=5) + return image + + +def random_invisible_watermark(image, p=0.2): + image_np = np.array(image) + image_np = np.transpose(image_np, (1, 2, 0)) + + if image_np.ndim == 2: # Grayscale image + image_np = cv2.cvtColor(image_np, cv2.COLOR_GRAY2BGR) + elif image_np.shape[2] == 4: # RGBA image + image_np = cv2.cvtColor(image_np, cv2.COLOR_RGBA2BGR) + + # print(image_np.shape) + if image_np.shape[0] < 256 or image_np.shape[1] < 256: + image_np = cv2.resize( + image_np, + (256, 256), + interpolation=cv2.INTER_AREA, + ) + if random.random() < p: + return encoder.encode(image_np, method="dwtDct") + return image_np + + +def build_transform(split: str): + train_transform = v2.Compose( + [ + v2.Lambda(crop_bottom), + v2.RandomCrop((256, 256), pad_if_needed=True), + v2.Lambda(random_gaussian_blur), + v2.RandomGrayscale(p=0.05), + v2.Lambda(random_invisible_watermark), + v2.ToImage(), + ], + ) + + eval_transform = v2.Compose( + [ + v2.CenterCrop((256, 256)), + ], + ) + transform = train_transform if split == "train" else eval_transform + + return transform + + +def dp_to_tuple_train(input_dict): + transform = build_transform("train") + return ( + transform(input_dict[".jpg"]), + input_dict[".label.cls"], + input_dict[".domain_label.cls"], + ) + + +def dp_to_tuple_eval(input_dict): + transform = build_transform("eval") + return ( + transform(input_dict[".jpg"]), + input_dict[".label.cls"], + input_dict[".domain_label.cls"], + ) + + +def load_dataset(domains: list[int], split: str): + + laion_lister = FileLister("./data/laion400m_data", f"{split}*.tar") + genai_lister = { + d: FileLister( + f"./data/genai-images/{DOMAIN_LABELS[d]}", + f"{split}*.tar", + ) + for d in domains + if DOMAIN_LABELS[d] != "laion" + } + weight_genai = 1 / len(genai_lister) + + def open_lister(lister): + opener = FileOpener(lister, mode="b") + return opener.load_from_tar().routed_decode().webdataset() + + buffer_size1 = 100 if split == "train" else 10 + buffer_size2 = 100 if split == "train" else 10 + + if split != "train": + all_lister = [laion_lister] + list(genai_lister.values()) + dp = open_lister(Concater(*all_lister)).sharding_filter() + else: + laion_dp = ( + open_lister(laion_lister.shuffle()) + .cycle() + .sharding_filter() + .shuffle(buffer_size=buffer_size1) + ) + genai_dp = { + open_lister(genai_lister[d].shuffle()) + .cycle() + .sharding_filter() + .shuffle(buffer_size=buffer_size1): weight_genai + for d in domains + if DOMAIN_LABELS[d] != "laion" + } + dp = SampleMultiplexer({laion_dp: 1, **genai_dp}).shuffle( + buffer_size=buffer_size2, + ) + + if split == "train": + dp = dp.map(dp_to_tuple_train) + else: + dp = dp.map(dp_to_tuple_eval) + + return dp + + +def load_dataloader( + domains: list[int], + split: str, + batch_size: int = 32, + num_workers: int = 4, +): + dp = load_dataset(domains, split) + # if split == "train": + # dp = UnderSamplerIterDataPipe(dp, {0: 0.5, 1: 0.5}, seed=42) + dp = dp.batch(batch_size).collate() + dl = DataLoader( + dp, + batch_size=None, + num_workers=num_workers, + pin_memory=True, + ) + + return dl + + +if 
__name__ == "__main__": + parser = argparse.ArgumentParser() + + args = parser.parse_args() + + # testing code + dl = load_dataloader([0, 1], "train", num_workers=8) + y_dist = collections.Counter() + d_dist = collections.Counter() + + for i, (img, y, d) in tqdm(enumerate(dl)): + if i % 100 == 0: + print(y, d) + if i == 400: + break + y_dist.update(y.numpy()) + d_dist.update(d.numpy()) + + print("class label") + for label in sorted(y_dist): + frequency = y_dist[label] / sum(y_dist.values()) + print(f"• {label}: {frequency:.2%} ({y_dist[label]})") + + print("domain label") + for label in sorted(d_dist): + frequency = d_dist[label] / sum(d_dist.values()) + print(f"• {label}: {frequency:.2%} ({d_dist[label]})") diff --git a/src/images/Diffusion/diffusion_data_loader.py b/src/images/Diffusion/diffusion_data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..a0dab3f192036d3a9698bc03b6ad340b93253f81 --- /dev/null +++ b/src/images/Diffusion/diffusion_data_loader.py @@ -0,0 +1,233 @@ +import argparse +import collections +import glob +import os +import random +from typing import Iterator + +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchdata as td +import torchdata.datapipes as dp +from imwatermark import WatermarkEncoder +from PIL import ( + Image, + ImageFile, +) +from torch.utils.data import ( + DataLoader, + RandomSampler, +) +from torchdata.dataloader2 import ( + DataLoader2, + MultiProcessingReadingService, +) +from torchdata.datapipes.iter import ( + Concater, + FileLister, + FileOpener, + SampleMultiplexer, +) +from torchvision.transforms import v2 +from tqdm import tqdm +from utils_sampling import UnderSamplerIterDataPipe + +ImageFile.LOAD_TRUNCATED_IMAGES = True +Image.MAX_IMAGE_PIXELS = 1000000000 + +encoder = WatermarkEncoder() +encoder.set_watermark("bytes", b"test") + +DOMAIN_LABELS = { + 0: "laion", + 1: "StableDiffusion", + 2: "dalle2", + 3: "dalle3", + 4: "midjourney", +} + +N_SAMPLES = { + 0: (115346, 14418, 14419), + 1: (22060, 2757, 2758), + 4: (21096, 2637, 2637), + 2: (13582, 1697, 1699), + 3: (12027, 1503, 1504), +} + + +@dp.functional_datapipe("collect_from_workers") +class WorkerResultCollector(dp.iter.IterDataPipe): + def __init__(self, source: dp.iter.IterDataPipe): + self.source = source + + def __iter__(self) -> Iterator: + yield from self.source + + def is_replicable(self) -> bool: + """Method to force data back to main process""" + return False + + +def crop_bottom(image, cutoff=16): + return image[:, :-cutoff, :] + + +def random_gaussian_blur(image, p=0.01): + if random.random() < p: + return v2.functional.gaussian_blur(image, kernel_size=5) + return image + + +def random_invisible_watermark(image, p=0.2): + image_np = np.array(image) + image_np = np.transpose(image_np, (1, 2, 0)) + + if image_np.ndim == 2: # Grayscale image + image_np = cv2.cvtColor(image_np, cv2.COLOR_GRAY2BGR) + elif image_np.shape[2] == 4: # RGBA image + image_np = cv2.cvtColor(image_np, cv2.COLOR_RGBA2BGR) + + # print(image_np.shape) + if image_np.shape[0] < 256 or image_np.shape[1] < 256: + image_np = cv2.resize( + image_np, (256, 256), interpolation=cv2.INTER_AREA + ) + if random.random() < p: + return encoder.encode(image_np, method="dwtDct") + return image_np + + +def build_transform(split: str): + train_transform = v2.Compose( + [ + v2.Lambda(crop_bottom), + v2.RandomCrop((256, 256), pad_if_needed=True), + v2.Lambda(random_gaussian_blur), + v2.RandomGrayscale(p=0.05), + 
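# random_invisible_watermark returns an HWC numpy array (resized
+            # to 256x256 when smaller) and may embed an invisible watermark;
+            # v2.ToImage() below converts the result back into a tensor.
+            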
v2.Lambda(random_invisible_watermark), + v2.ToImage(), + ] + ) + + eval_transform = v2.Compose( + [ + v2.CenterCrop((256, 256)), + ] + ) + transform = train_transform if split == "train" else eval_transform + + return transform + + +def dp_to_tuple_train(input_dict): + transform = build_transform("train") + return ( + transform(input_dict[".jpg"]), + input_dict[".label.cls"], + input_dict[".domain_label.cls"], + ) + + +def dp_to_tuple_eval(input_dict): + transform = build_transform("eval") + return ( + transform(input_dict[".jpg"]), + input_dict[".label.cls"], + input_dict[".domain_label.cls"], + ) + + +def load_dataset(domains: list[int], split: str): + laion_lister = FileLister("./data/laion400m_data", f"{split}*.tar") + genai_lister = { + d: FileLister( + f"./data/genai-images/{DOMAIN_LABELS[d]}", f"{split}*.tar" + ) + for d in domains + if DOMAIN_LABELS[d] != "laion" + } + weight_genai = 1 / len(genai_lister) + + def open_lister(lister): + opener = FileOpener(lister, mode="b") + return opener.load_from_tar().routed_decode().webdataset() + + buffer_size1 = 100 if split == "train" else 10 + buffer_size2 = 100 if split == "train" else 10 + + if split != "train": + all_lister = [laion_lister] + list(genai_lister.values()) + dp = open_lister(Concater(*all_lister)).sharding_filter() + else: + laion_dp = ( + open_lister(laion_lister.shuffle()) + .cycle() + .sharding_filter() + .shuffle(buffer_size=buffer_size1) + ) + genai_dp = { + open_lister(genai_lister[d].shuffle()) + .cycle() + .sharding_filter() + .shuffle( + buffer_size=buffer_size1, + ): weight_genai + for d in domains + if DOMAIN_LABELS[d] != "laion" + } + dp = SampleMultiplexer({laion_dp: 1, **genai_dp}).shuffle( + buffer_size=buffer_size2 + ) + + if split == "train": + dp = dp.map(dp_to_tuple_train) + else: + dp = dp.map(dp_to_tuple_eval) + + return dp + + +def load_dataloader( + domains: list[int], split: str, batch_size: int = 32, num_workers: int = 4 +): + dp = load_dataset(domains, split) + # if split == "train": + # dp = UnderSamplerIterDataPipe(dp, {0: 0.5, 1: 0.5}, seed=42) + dp = dp.batch(batch_size).collate() + dl = DataLoader( + dp, batch_size=None, num_workers=num_workers, pin_memory=True + ) + + return dl + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + args = parser.parse_args() + + # testing code + dl = load_dataloader([0, 1], "train", num_workers=8) + y_dist = collections.Counter() + d_dist = collections.Counter() + + for i, (img, y, d) in tqdm(enumerate(dl)): + if i % 100 == 0: + print(y, d) + if i == 400: + break + y_dist.update(y.numpy()) + d_dist.update(d.numpy()) + + print("class label") + for label in sorted(y_dist): + frequency = y_dist[label] / sum(y_dist.values()) + print(f"• {label}: {frequency:.2%} ({y_dist[label]})") + + print("domain label") + for label in sorted(d_dist): + frequency = d_dist[label] / sum(d_dist.values()) + print(f"• {label}: {frequency:.2%} ({d_dist[label]})") diff --git a/src/images/Diffusion/diffusion_model_classifier.py b/src/images/Diffusion/diffusion_model_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..62495f21bccadc860de5ecf5f6a2d7c85bdd2b2a --- /dev/null +++ b/src/images/Diffusion/diffusion_model_classifier.py @@ -0,0 +1,242 @@ +import argparse +import logging +import os + +import pandas as pd +import pytorch_lightning as pl +import timm +import torch +import torchvision.transforms as transforms +from data_split import * +from dataloader import * +from PIL import Image +from pytorch_lightning.callbacks import ( + 
EarlyStopping,
+    ModelCheckpoint,
+)
+import torch.nn.functional as F
+from sklearn.metrics import roc_auc_score
+from torchmetrics import (
+    Accuracy,
+    Recall,
+)
+from utils_sampling import *
+
+logging.basicConfig(
+    filename="training.log", filemode="w", level=logging.INFO, force=True
+)
+
+
+class ImageClassifier(pl.LightningModule):
+    def __init__(self, lmd=0):
+        super().__init__()
+        self.model = timm.create_model(
+            "resnet50", pretrained=True, num_classes=1
+        )
+        self.accuracy = Accuracy(task="binary", threshold=0.5)
+        self.recall = Recall(task="binary", threshold=0.5)
+        self.validation_outputs = []
+        self.lmd = lmd
+
+    def forward(self, x):
+        return self.model(x)
+
+    def training_step(self, batch):
+        images, labels, _ = batch
+        outputs = self.forward(images).squeeze()
+
+        print(f"Shape of outputs (training): {outputs.shape}")
+        print(f"Shape of labels (training): {labels.shape}")
+
+        loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
+        logging.info(f"Training Step - ERM loss: {loss.item()}")
+        loss += self.lmd * (outputs**2).mean()  # SD loss penalty
+        logging.info(f"Training Step - SD loss: {loss.item()}")
+        return loss
+
+    def validation_step(self, batch):
+        images, labels, _ = batch
+        outputs = self.forward(images).squeeze()
+
+        if outputs.shape == torch.Size([]):
+            return
+
+        print(f"Shape of outputs (validation): {outputs.shape}")
+        print(f"Shape of labels (validation): {labels.shape}")
+
+        loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
+        preds = torch.sigmoid(outputs)
+        self.log("val_loss", loss, prog_bar=True, sync_dist=True)
+        self.log(
+            "val_acc",
+            self.accuracy(preds, labels.int()),
+            prog_bar=True,
+            sync_dist=True,
+        )
+        self.log(
+            "val_recall",
+            self.recall(preds, labels.int()),
+            prog_bar=True,
+            sync_dist=True,
+        )
+        output = {"val_loss": loss, "preds": preds, "labels": labels}
+        self.validation_outputs.append(output)
+        logging.info(f"Validation Step - Batch loss: {loss.item()}")
+        return output
+
+    def predict_step(self, batch):
+        images, label, domain = batch
+        outputs = self.forward(images).squeeze()
+        preds = torch.sigmoid(outputs)
+        return preds, label, domain
+
+    def on_validation_epoch_end(self):
+        if not self.validation_outputs:
+            logging.warning("No outputs in validation step to process")
+            return
+        preds = torch.cat([x["preds"] for x in self.validation_outputs])
+        labels = torch.cat([x["labels"] for x in self.validation_outputs])
+        if labels.unique().size(0) == 1:
+            logging.warning("Only one class in validation step")
+            return
+        auc_score = roc_auc_score(labels.cpu(), preds.cpu())
+        self.log("val_auc", auc_score, prog_bar=True, sync_dist=True)
+        logging.info(f"Validation Epoch End - AUC score: {auc_score}")
+        self.validation_outputs = []
+
+    def configure_optimizers(self):
+        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.0005)
+        return optimizer
+
+
+checkpoint_callback = ModelCheckpoint(
+    monitor="val_loss",
+    dirpath="./model_checkpoints/",
+    filename="image-classifier-{step}-{val_loss:.2f}",
+    save_top_k=3,
+    mode="min",
+    every_n_train_steps=1001,
+    enable_version_counter=True,
+)
+
+early_stop_callback = EarlyStopping(
+    monitor="val_loss",
+    patience=4,
+    mode="min",
+)
+
+
+def load_image(image_path, transform=None):
+    image = Image.open(image_path).convert("RGB")
+
+    if transform:
+        image = transform(image)
+
+    return image
+
+
+def predict_single_image(image_path, model, transform=None):
+    image = load_image(image_path, transform)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    model.to(device)
+
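+    # The checkpoint may hold CUDA weights, so the input tensor is moved to
+    # the same device as the model before the forward pass below.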
+ image = image.to(device) + + model.eval() + + with torch.no_grad(): + image = image.unsqueeze(0) + output = model(image).squeeze() + print(output) + prediction = torch.sigmoid(output).item() + + return prediction + + +parser = argparse.ArgumentParser() +parser.add_argument( + "--ckpt_path", help="checkpoint to continue from", required=False +) +parser.add_argument( + "--predict", help="predict on test set", action="store_true" +) +parser.add_argument("--reset", help="reset training", action="store_true") +parser.add_argument( + "--predict_image", + help="predict the class of a single image", + action="store_true", +) +parser.add_argument( + "--image_path", + help="path to the image to predict", + type=str, + required=False, +) +args = parser.parse_args() + +train_domains = [0, 1, 4] +val_domains = [0, 1, 4] +lmd_value = 0 + +if args.predict: + test_dl = load_dataloader( + [0, 1, 2, 3, 4], "test", batch_size=128, num_workers=1 + ) + model = ImageClassifier.load_from_checkpoint(args.ckpt_path) + trainer = pl.Trainer() + predictions = trainer.predict(model, dataloaders=test_dl) + preds, labels, domains = zip(*predictions) + preds = torch.cat(preds).cpu().numpy() + labels = torch.cat(labels).cpu().numpy() + domains = torch.cat(domains).cpu().numpy() + print(preds.shape, labels.shape, domains.shape) + df = pd.DataFrame({"preds": preds, "labels": labels, "domains": domains}) + filename = "preds-" + args.ckpt_path.split("/")[-1] + df.to_csv(f"outputs/{filename}.csv", index=False) +elif args.predict_image: + image_path = args.image_path + model = ImageClassifier.load_from_checkpoint(args.ckpt_path) + + # Define the transformations for the image + transform = transforms.Compose( + [ + transforms.Resize((224, 224)), # Image size expected by ResNet50 + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + + prediction = predict_single_image(image_path, model, transform) + print("prediction", prediction) + + # Output the prediction + print( + f"Prediction for {image_path}: {'Human' if prediction <= 0.001 else 'Generated'}" + ) +else: + train_dl = load_dataloader( + train_domains, "train", batch_size=128, num_workers=4 + ) + logging.info("Training dataloader loaded") + val_dl = load_dataloader(val_domains, "val", batch_size=128, num_workers=4) + logging.info("Validation dataloader loaded") + + if args.reset: + model = ImageClassifier.load_from_checkpoint(args.ckpt_path) + else: + model = ImageClassifier(lmd=lmd_value) + trainer = pl.Trainer( + callbacks=[checkpoint_callback, early_stop_callback], + max_steps=20000, + val_check_interval=1000, + check_val_every_n_epoch=None, + ) + trainer.fit( + model=model, + train_dataloaders=train_dl, + val_dataloaders=val_dl, + ckpt_path=args.ckpt_path if not args.reset else None, + ) diff --git a/src/images/Diffusion/evaluation.ipynb b/src/images/Diffusion/evaluation.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..91d7641e5b7d085537459e415ed320baada43a25 --- /dev/null +++ b/src/images/Diffusion/evaluation.ipynb @@ -0,0 +1,187 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import polars as pl\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, RocCurveDisplay\n", + "\n", + "sns.set()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def pfbeta(labels, predictions, beta=1):\n", + " y_true_count = 0\n", + " ctp = 0\n", + " cfp = 0\n", + "\n", + " for idx in range(len(labels)):\n", + " prediction = min(max(predictions[idx], 0), 1)\n", + " if (labels[idx]):\n", + " y_true_count += 1\n", + " ctp += prediction\n", + " else:\n", + " cfp += prediction\n", + "\n", + " beta_squared = beta * beta\n", + " c_precision = ctp / (ctp + cfp)\n", + " c_recall = ctp / y_true_count\n", + " if (c_precision > 0 and c_recall > 0):\n", + " result = (1 + beta_squared) * (c_precision * c_recall) / (beta_squared * c_precision + c_recall)\n", + " return result\n", + " else:\n", + " return 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_part_metrics(df: pl.DataFrame, threshold=0.3) -> dict:\n", + " df = df.with_columns((df[\"preds\"] > threshold).alias(\"preds_bin\"))\n", + " metrics = {}\n", + " # binary metrics using the threshold\n", + " metrics[\"accuracy\"] = accuracy_score(df[\"labels\"].to_numpy(), df[\"preds_bin\"].to_numpy())\n", + " metrics[\"precision\"] = precision_score(df[\"labels\"].to_numpy(), df[\"preds_bin\"].to_numpy())\n", + " metrics[\"recall\"] = recall_score(df[\"labels\"].to_numpy(), df[\"preds_bin\"].to_numpy())\n", + " metrics[\"f1\"] = f1_score(df[\"labels\"].to_numpy(), df[\"preds_bin\"].to_numpy())\n", + " # probabilistic F1 (doesn't depend on the threshold)\n", + " metrics[\"pf1\"] = pfbeta(df[\"labels\"].to_numpy(), df[\"preds\"].to_numpy())\n", + " # ROC AUC\n", + " metrics[\"roc_auc\"] = roc_auc_score(df[\"labels\"].to_numpy(), df[\"preds\"].to_numpy())\n", + " return metrics\n", + "\n", + "\n", + "def get_all_metrics(df: pl.DataFrame, threshold=0.3) -> pd.DataFrame:\n", + " groups = [list(range(5)), [0, 1], [0, 4], [0, 2], [0, 3]]\n", + " group_names = [\"all\", \"StableDiffusion\", \"Midjourney\", \"Dalle2\", \"Dalle3\"]\n", + " all_metrics = []\n", + " for i, g in enumerate(groups):\n", + " subset = df.filter(pl.col(\"domains\").is_in(g))\n", + " metrics = get_part_metrics(subset, threshold=threshold)\n", + " metrics[\"group\"] = group_names[i]\n", + " all_metrics.append(metrics)\n", + " \n", + " return pd.DataFrame(all_metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df1 = pl.read_csv(\"outputs/preds-image-classifier-1.csv\")\n", + "metrics_df1 = get_all_metrics(df1, threshold=0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics_df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df14 = pl.read_csv(\"outputs/preds-image-classifier-14.csv\")\n", + "metrics_df14 = get_all_metrics(df14, threshold=0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics_df14" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df142 = pl.read_csv(\"outputs/preds-image-classifier-142.csv\")\n", + "metrics_df142 = get_all_metrics(df142, threshold=0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics_df142" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df1423 = 
pl.read_csv(\"outputs/preds-image-classifier-1423.csv\")\n", + "metrics_df1423 = get_all_metrics(df1423, threshold=0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics_df1423" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "GenAI-image-detection-Z_9oJJe7", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/images/Diffusion/model.py b/src/images/Diffusion/model.py new file mode 100644 index 0000000000000000000000000000000000000000..5112cf8cfedeb654894128d5bd65babcc3ea73b0 --- /dev/null +++ b/src/images/Diffusion/model.py @@ -0,0 +1,307 @@ +import argparse +import logging +import os + +import pandas as pd +import pytorch_lightning as pl +import timm +import torch +import torch.nn.functional as F +import torchvision.transforms as transforms +from dataloader import load_dataloader +from PIL import Image +from pytorch_lightning.callbacks import ( + EarlyStopping, + ModelCheckpoint, +) +from sklearn.metrics import roc_auc_score +from torchmetrics import ( + Accuracy, + Recall, +) + +logging.basicConfig( + filename="training.log", + filemode="w", + level=logging.INFO, + force=True, +) + + +class ImageClassifier(pl.LightningModule): + def __init__(self, lmd=0): + super().__init__() + self.model = timm.create_model( + "resnet50", + pretrained=True, + num_classes=1, + ) + self.accuracy = Accuracy(task="binary", threshold=0.5) + self.recall = Recall(task="binary", threshold=0.5) + self.validation_outputs = [] + self.lmd = lmd + + def forward(self, x): + return self.model(x) + + def training_step(self, batch): + images, labels, _ = batch + outputs = self.forward(images).squeeze() + + print(f"Shape of outputs (training): {outputs.shape}") + print(f"Shape of labels (training): {labels.shape}") + + loss = F.binary_cross_entropy_with_logits(outputs, labels.float()) + logging.info(f"Training Step - ERM loss: {loss.item()}") + loss += self.lmd * (outputs**2).mean() # SD loss penalty + logging.info(f"Training Step - SD loss: {loss.item()}") + return loss + + def validation_step(self, batch): + images, labels, _ = batch + outputs = self.forward(images).squeeze() + + if outputs.shape == torch.Size([]): + return + + print(f"Shape of outputs (validation): {outputs.shape}") + print(f"Shape of labels (validation): {labels.shape}") + + loss = F.binary_cross_entropy_with_logits(outputs, labels.float()) + preds = torch.sigmoid(outputs) + self.log("val_loss", loss, prog_bar=True, sync_dist=True) + self.log( + "val_acc", + self.accuracy(preds, labels.int()), + prog_bar=True, + sync_dist=True, + ) + self.log( + "val_recall", + self.recall(preds, labels.int()), + prog_bar=True, + sync_dist=True, + ) + output = {"val_loss": loss, "preds": preds, "labels": labels} + self.validation_outputs.append(output) + logging.info(f"Validation Step - Batch loss: {loss.item()}") + return output + + def predict_step(self, batch): + images, label, domain = batch + outputs = self.forward(images).squeeze() + preds = torch.sigmoid(outputs) + return preds, label, domain + + def on_validation_epoch_end(self): + if not 
self.validation_outputs: + logging.warning("No outputs in validation step to process") + return + preds = torch.cat([x["preds"] for x in self.validation_outputs]) + labels = torch.cat([x["labels"] for x in self.validation_outputs]) + if labels.unique().size(0) == 1: + logging.warning("Only one class in validation step") + return + auc_score = roc_auc_score(labels.cpu(), preds.cpu()) + self.log("val_auc", auc_score, prog_bar=True, sync_dist=True) + logging.info(f"Validation Epoch End - AUC score: {auc_score}") + self.validation_outputs = [] + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.model.parameters(), lr=0.0005) + return optimizer + + +checkpoint_callback = ModelCheckpoint( + monitor="val_loss", + dirpath="./model_checkpoints/", + filename="image-classifier-{step}-{val_loss:.2f}", + save_top_k=3, + mode="min", + every_n_train_steps=1001, + enable_version_counter=True, +) + +early_stop_callback = EarlyStopping( + monitor="val_loss", + patience=4, + mode="min", +) + + +def load_image(image_path, transform=None): + image = Image.open(image_path).convert("RGB") + + if transform: + image = transform(image) + + return image + + +def predict_single_image(image_path, model, transform=None): + image = load_image(image_path, transform) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + model.to(device) + + image = image.to(device) + + model.eval() + + with torch.no_grad(): + image = image.unsqueeze(0) + output = model(image).squeeze() + print(output) + prediction = torch.sigmoid(output).item() + + return prediction + + +parser = argparse.ArgumentParser() +parser.add_argument( + "--ckpt_path", + help="checkpoint to continue from", + required=False, +) +parser.add_argument( + "--predict", + help="predict on test set", + action="store_true", +) +parser.add_argument("--reset", help="reset training", action="store_true") +parser.add_argument( + "--predict_image", + help="predict the class of a single image", + action="store_true", +) +parser.add_argument( + "--image_path", + help="path to the image to predict", + type=str, + required=False, +) +parser.add_argument( + "--dir", + help="path to the images to predict", + type=str, + required=False, +) +parser.add_argument( + "--output_file", + help="path to output file", + type=str, + required=False, +) +args = parser.parse_args() + +train_domains = [0, 1, 4] +val_domains = [0, 1, 4] +lmd_value = 0 + +if args.predict: + test_dl = load_dataloader( + [0, 1, 2, 3, 4], + "test", + batch_size=128, + num_workers=1, + ) + model = ImageClassifier.load_from_checkpoint(args.ckpt_path) + trainer = pl.Trainer() + predictions = trainer.predict(model, dataloaders=test_dl) + preds, labels, domains = zip(*predictions) + preds = torch.cat(preds).cpu().numpy() + labels = torch.cat(labels).cpu().numpy() + domains = torch.cat(domains).cpu().numpy() + print(preds.shape, labels.shape, domains.shape) + df = pd.DataFrame({"preds": preds, "labels": labels, "domains": domains}) + filename = "preds-" + args.ckpt_path.split("/")[-1] + df.to_csv(f"outputs/{filename}.csv", index=False) +elif args.predict_image: + image_path = args.image_path + model = ImageClassifier.load_from_checkpoint(args.ckpt_path) + + # Define the transformations for the image + # transform = transforms.Compose( + # [ + # transforms.Resize((224, 224)), # Image size expected by ResNet50 + # transforms.ToTensor(), + # transforms.Normalize( + # mean=[0.485, 0.456, 0.406], + # std=[0.229, 0.224, 0.225], + # ), + # ], + # ) + + transform = transforms.Compose( + [ + 
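# Mirror the eval-time pipeline used during validation: a plain 256x256
+            # center crop, deliberately without the ImageNet normalization
+            # commented out above.
+            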
transforms.CenterCrop((256, 256)), + transforms.ToTensor(), + ], + ) + + prediction = predict_single_image(image_path, model, transform) + print("prediction", prediction) + + # Output the prediction + print( + f"Prediction for {image_path}: " + f"{'Human' if prediction <= 0.001 else 'Generated'}", + ) +elif args.dir is not None: + predictions = [] + model = ImageClassifier.load_from_checkpoint(args.ckpt_path) + # Define the transformations for the image + # transform = transforms.Compose( + # [ + # transforms.Resize((224, 224)), # Image size expected by ResNet50 + # transforms.ToTensor(), + # transforms.Normalize( + # mean=[0.485, 0.456, 0.406], + # std=[0.229, 0.224, 0.225], + # ), + # ], + # ) + transform = transforms.Compose( + [ + transforms.CenterCrop((256, 256)), + transforms.ToTensor(), + ], + ) + for root, dirs, files in os.walk(os.path.abspath(args.dir)): + for f_name in files: + f = os.path.join(root, f_name) + print(f"Predicting: {f}") + p = predict_single_image(f, model, transform) + predictions.append([f, f.split("/")[-2], p, p > 0.5]) + print(f"--predicted: {p}") + + df = pd.DataFrame(predictions, columns=["path", "folder", "pred", "class"]) + df.to_csv(args.output_file, index=False) +else: + train_dl = load_dataloader( + train_domains, + "train", + batch_size=128, + num_workers=4, + ) + logging.info("Training dataloader loaded") + val_dl = load_dataloader(val_domains, "val", batch_size=128, num_workers=4) + logging.info("Validation dataloader loaded") + + if args.reset: + model = ImageClassifier.load_from_checkpoint(args.ckpt_path) + else: + model = ImageClassifier(lmd=lmd_value) + trainer = pl.Trainer( + callbacks=[checkpoint_callback, early_stop_callback], + max_steps=20000, + val_check_interval=1000, + check_val_every_n_epoch=None, + ) + trainer.fit( + model=model, + train_dataloaders=train_dl, + val_dataloaders=val_dl, + ckpt_path=args.ckpt_path if not args.reset else None, + ) diff --git a/src/images/Diffusion/sample_laion_script.ipynb b/src/images/Diffusion/sample_laion_script.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..9d17e1ce1aa0fa189191f30420e2e44a038f4d82 --- /dev/null +++ b/src/images/Diffusion/sample_laion_script.ipynb @@ -0,0 +1,73 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import dask.dataframe as dd\n", + "from dask.diagnostics import ProgressBar\n", + "import os\n", + "\n", + "directory_path = '/Users/fionachow/Documents/NYU/CDS/Fall 2023/CSCI - GA 2271 - Computer Vision/Project/'\n", + "\n", + "file_prefix = 'part'\n", + "\n", + "def list_files_with_prefix(directory, prefix):\n", + " file_paths = []\n", + "\n", + " for root, _, files in os.walk(directory):\n", + " for file in files:\n", + " if file.startswith(prefix):\n", + " absolute_path = os.path.join(root, file)\n", + " file_paths.append(absolute_path)\n", + "\n", + " return file_paths\n", + "\n", + "laion_file_paths = list_files_with_prefix(directory_path, file_prefix)\n", + "\n", + "dataframes = [dd.read_parquet(file) for file in laion_file_paths]\n", + "combined_dataframe = dd.multi.concat(dataframes)\n", + "\n", + "with ProgressBar():\n", + " row_count = combined_dataframe.shape[0].compute()\n", + " print(row_count)\n", + "\n", + "filtered_df = combined_dataframe[combined_dataframe['NSFW'] == \"UNLIKELY\"]\n", + "\n", + "num_samples = 225_000\n", + "selected_rows = filtered_df.sample(frac=num_samples / filtered_df.shape[0].compute())\n", + "\n", + "with ProgressBar():\n", + " sampled_rows 
= selected_rows.compute()\n", + "\n", + "print(len(sampled_rows))\n", + "\n", + "with ProgressBar():\n", + " selected_rows.to_parquet('laion_sampled', write_index=False)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bloom", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/images/Diffusion/scrape.py b/src/images/Diffusion/scrape.py new file mode 100644 index 0000000000000000000000000000000000000000..88f45282c8d6a3b2fb2b35c50c04963ba7e7df62 --- /dev/null +++ b/src/images/Diffusion/scrape.py @@ -0,0 +1,149 @@ +import argparse +import time + +import polars as pl +import requests + + +def call_api(param): + url = "https://api.pullpush.io/reddit/search/submission/" + response = requests.get(url, params=param) + json_data = response.json()["data"] + create_utc = [] + media_id = [] + media_type_ls = [] + post_ids = [] + post_titles = [] + cur_utc = 0 + for submission in json_data: + cur_flair = submission["link_flair_text"] + cur_utc = submission["created_utc"] + media_ls = ( + submission["media_metadata"] + if "media_metadata" in submission.keys() + else None + ) + if param["flair"] is not None and cur_flair != param["flair"]: + continue + if media_ls is None: + continue + for id in media_ls.keys(): + if media_ls[id]["status"] != "valid": + continue + try: + media_type = media_ls[id]["m"] + except: # noqa + # video will error out + continue + if media_type == "image/png": + media_type_ls.append("png") + elif media_type == "image/jpg": + media_type_ls.append("jpg") + else: + continue + create_utc.append(int(cur_utc)) + post_ids.append(submission["id"]) + post_titles.append(submission["title"]) + media_id.append(id) + + df = pl.DataFrame( + { + "create_utc": create_utc, + "media_id": media_id, + "media_type": media_type_ls, + "post_id": post_ids, + "post_title": post_titles, + }, + schema={ + "create_utc": pl.Int64, + "media_id": pl.Utf8, + "media_type": pl.Utf8, + "post_id": pl.Utf8, + "post_title": pl.Utf8, + }, + ) + return df, int(cur_utc) + + +def scraping_loop( + subreddit, + flair, + max_num=30000, + output_name=None, + before=None, +): + collected_all = [] + collected_len = 0 + last_timestamp = int(time.time()) if before is None else before + param = { + "subreddit": subreddit, + "flair": flair, + "before": last_timestamp, + } + while collected_len < max_num: + collected_df, last_timestamp = call_api(param) + if collected_df.shape[0] == 0: + print("No more data, saving current data and exiting...") + break + collected_all.append(collected_df) + collected_len += collected_df.shape[0] + print( + f"collected_len: {collected_len}, " + f"last_timestamp: {last_timestamp}", + ) + param["before"] = last_timestamp + + df = pl.concat(collected_all) + df = ( + df.with_columns( + pl.col("media_id") + .str.replace(r"^", "https://i.redd.it/") + .alias("url1"), + pl.col("create_utc") + .cast(pl.Int64) + .cast(pl.Utf8) + .str.to_datetime("%s") + .alias("time"), + ) + .with_columns( + pl.col("media_type").str.replace(r"^", ".").alias("url2"), + ) + .with_columns( + pl.concat_str( + [pl.col("url1"), pl.col("url2")], + separator="", + ).alias("url"), + ) + .select("time", "url", "post_id", "post_title") + ) + if output_name is None: + output_name = subreddit + 
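# Persist the full metadata as parquet, plus a headerless, URL-only CSV
+    # that download tooling can consume line by line.
+    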
df.write_parquet(f"urls/{output_name}.parquet") + df.select("url").write_csv(f"urls/{output_name}.csv", has_header=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--subreddit", help="subreddit name") + parser.add_argument("--flair", help="flair filter", default=None, type=str) + parser.add_argument( + "--max_num", + help="max number of posts to scrape", + default=30000, + type=int, + ) + parser.add_argument( + "--output_name", + help="custom output name", + default=None, + ) + parser.add_argument( + "--before", + help="before timestamp", + default=None, + type=int, + ) + + args = parser.parse_args() + + scraping_loop(**args.__dict__) diff --git a/src/images/Diffusion/utils_sampling.py b/src/images/Diffusion/utils_sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..0c7162d54d992c35e2ab4b93a2bae67f6ca8716c --- /dev/null +++ b/src/images/Diffusion/utils_sampling.py @@ -0,0 +1,94 @@ +import collections +import random +from typing import Callable + +from torchdata.datapipes.iter import IterDataPipe + + +def get_second_entry(sample): + return sample[1] + + +class UnderSamplerIterDataPipe(IterDataPipe): + """Dataset wrapper for under-sampling. + + Copied from: https://github.com/MaxHalford/pytorch-resample/blob/master/pytorch_resample/under.py # noqa + Modified to work with multiple labels. + + MIT License + + Copyright (c) 2020 Max Halford + + This method is based on rejection sampling. + + Parameters: + dataset + desired_dist: The desired class distribution. + The keys are the classes whilst the + values are the desired class percentages. + The values are normalised so that sum up + to 1. + label_getter: A function that takes a sample and returns its label. + seed: Random seed for reproducibility. + + Attributes: + actual_dist: The counts of the observed sample labels. + rng: A random number generator instance. 
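+
+    Sampling logic: a sample whose label is the current pivot is yielded
+    as-is; for any other label y, the pivot is first updated to the label
+    maximizing f[y] / g[y] (desired over observed frequency), and the sample
+    is then kept with probability f[y] / (M * g[y]), where
+    M = f[pivot] / g[pivot].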
+ + References: + - https://www.wikiwand.com/en/Rejection_sampling + + """ + + def __init__( + self, + dataset: IterDataPipe, + desired_dist: dict, + label_getter: Callable = get_second_entry, + seed: int = None, + ): + + self.dataset = dataset + self.desired_dist = { + c: p / sum(desired_dist.values()) for c, p in desired_dist.items() + } + self.label_getter = label_getter + self.seed = seed + + self.actual_dist = collections.Counter() + self.rng = random.Random(seed) + self._pivot = None + + def __iter__(self): + + for dp in self.dataset: + y = self.label_getter(dp) + + self.actual_dist[y] += 1 + + # To ease notation + f = self.desired_dist + g = self.actual_dist + + # Check if the pivot needs to be changed + if y != self._pivot: + self._pivot = max(g.keys(), key=lambda y: f[y] / g[y]) + else: + yield dp + continue + + # Determine the sampling ratio if the observed label + # is not the pivot + M = f[self._pivot] / g[self._pivot] + ratio = f[y] / (M * g[y]) + + if ratio < 1 and self.rng.random() < ratio: + yield dp + + @classmethod + def expected_size(cls, n, desired_dist, actual_dist): + M = max( + desired_dist.get(k) / actual_dist.get(k) + for k in set(desired_dist) | set(actual_dist) + ) + return int(n / M) diff --git a/src/images/Diffusion/visualizations.ipynb b/src/images/Diffusion/visualizations.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..748068ad1bb8e3bf03a63af15991d0f4fd00537a --- /dev/null +++ b/src/images/Diffusion/visualizations.ipynb @@ -0,0 +1,196 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install polars-lts-cpu" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import polars as pl\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def pfbeta(labels, predictions, beta=1):\n", + " y_true_count = 0\n", + " ctp = 0\n", + " cfp = 0\n", + "\n", + " for idx in range(len(labels)):\n", + " prediction = min(max(predictions[idx], 0), 1)\n", + " if (labels[idx]):\n", + " y_true_count += 1\n", + " ctp += prediction\n", + " else:\n", + " cfp += prediction\n", + "\n", + " beta_squared = beta * beta\n", + " c_precision = ctp / (ctp + cfp)\n", + " c_recall = ctp / y_true_count\n", + " if (c_precision > 0 and c_recall > 0):\n", + " result = (1 + beta_squared) * (c_precision * c_recall) / (beta_squared * c_precision + c_recall)\n", + " return result\n", + " else:\n", + " return 0\n", + "\n", + "def get_part_metrics(df: pl.DataFrame, threshold=0.3) -> dict:\n", + " df = df.with_columns((df[\"preds\"] > threshold).alias(\"preds_bin\"))\n", + " metrics = {}\n", + " # binary metrics using the threshold\n", + " metrics[\"accuracy\"] = accuracy_score(df[\"labels\"].to_numpy(), df[\"preds_bin\"].to_numpy())\n", + " metrics[\"precision\"] = precision_score(df[\"labels\"].to_numpy(), df[\"preds_bin\"].to_numpy())\n", + " metrics[\"recall\"] = recall_score(df[\"labels\"].to_numpy(), df[\"preds_bin\"].to_numpy())\n", + " metrics[\"f1\"] = f1_score(df[\"labels\"].to_numpy(), df[\"preds_bin\"].to_numpy())\n", + " # probabilistic F1 (doesn't depend on the threshold)\n", + " metrics[\"pf1\"] = pfbeta(df[\"labels\"].to_numpy(), 
df[\"preds\"].to_numpy())\n", + " # ROC AUC\n", + " metrics[\"roc_auc\"] = roc_auc_score(df[\"labels\"].to_numpy(), df[\"preds\"].to_numpy())\n", + " return metrics\n", + "\n", + "\n", + "def get_all_metrics(df: pl.DataFrame, threshold=0.3) -> pd.DataFrame:\n", + " groups = [list(range(5)), [0, 1], [0, 4], [0, 2], [0, 3]]\n", + " group_names = [\"all\", \"StableDiffusion\", \"Midjourney\", \"Dalle2\", \"Dalle3\"]\n", + " all_metrics = []\n", + " for i, g in enumerate(groups):\n", + " subset = df.filter(pl.col(\"domains\").is_in(g))\n", + " metrics = get_part_metrics(subset, threshold=threshold)\n", + " metrics[\"group\"] = group_names[i]\n", + " all_metrics.append(metrics)\n", + " \n", + " return pd.DataFrame(all_metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the data from the output files\n", + "df1 = pl.read_csv('/Users/fionachow/Downloads/outputs/preds-image-classifier-1.csv')\n", + "df14 = pl.read_csv('/Users/fionachow/Downloads/outputs/preds-image-classifier-14.csv')\n", + "df142 = pl.read_csv('/Users/fionachow/Downloads/outputs/preds-image-classifier-142.csv')\n", + "df1423 = pl.read_csv('/Users/fionachow/Downloads/outputs/preds-image-classifier-1423.csv')\n", + "\n", + "metrics_df1 = get_all_metrics(df1, threshold=0.5)\n", + "metrics_df14 = get_all_metrics(df14, threshold=0.5)\n", + "metrics_df142 = get_all_metrics(df142, threshold=0.5)\n", + "metrics_df1423 = get_all_metrics(df1423, threshold=0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics_df1.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.set()\n", + "\n", + "models = ['StableDiffusion', 'Midjourney', 'Dalle2', 'Dalle3']\n", + "metrics = ['accuracy', 'f1', 'pf1', 'roc_auc']\n", + "\n", + "file_map = {\n", + " ('StableDiffusion',): metrics_df1,\n", + " ('StableDiffusion', 'Midjourney'): metrics_df14,\n", + " ('StableDiffusion', 'Midjourney', 'Dalle2'): metrics_df142,\n", + " ('StableDiffusion', 'Midjourney', 'Dalle2', 'Dalle3'): metrics_df1423,\n", + "}\n", + "\n", + "def create_heatmap_data(metric):\n", + " data = pd.DataFrame(index=models[::-1], columns=models)\n", + " for i, model_x in enumerate(models):\n", + " for j, model_y in enumerate(models[::-1]):\n", + " \n", + " if i == 0:\n", + " relevant_df = metrics_df1\n", + " elif i == 1:\n", + " relevant_df = metrics_df14\n", + " elif i == 2:\n", + " relevant_df = metrics_df142\n", + " else:\n", + " relevant_df = metrics_df1423\n", + "\n", + " # Debugging: print the DataFrame being used and the model_y\n", + " #print(f\"Using DataFrame for {models[:i+1]}, model_y: {model_y}\")\n", + "\n", + " # Extract the metric value\n", + " if model_y in relevant_df['group'].values:\n", + " metric_value = relevant_df[relevant_df['group'] == model_y][metric].values[0]\n", + " # Debugging: print the extracted metric value\n", + " #print(f\"Metric value for {model_y}: {metric_value}\")\n", + " else:\n", + " metric_value = float('nan') # Handle non-existent cases\n", + " # Debugging: print a message for non-existent cases\n", + " #print(f\"No data for combination: {model_x}, {model_y}\")\n", + "\n", + " data.at[model_y, model_x] = metric_value\n", + " \n", + " for col in data.columns:\n", + " data[col] = pd.to_numeric(data[col], errors='coerce')\n", + "\n", + " # Debugging: print the final DataFrame\n", + " # print(f\"Final Data for metric {metric}:\")\n", 
+ " # print(data)\n", + " # print(data.dtypes)\n", + " return data\n", + "\n", + "for metric in metrics:\n", + " plt.figure(figsize=(10, 8))\n", + " sns.heatmap(create_heatmap_data(metric), annot=True, cmap='coolwarm', fmt='.3f')\n", + " plt.title(f\"Heatmap for {metric}\")\n", + " plt.xlabel(\"Trained On (x-axis)\")\n", + " plt.ylabel(\"Tested On (y-axis)\")\n", + " plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bloom", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/images/README.md b/src/images/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5be0ef70860248b08f9fd8d2e42cc90824f161ee --- /dev/null +++ b/src/images/README.md @@ -0,0 +1,64 @@ +# AI-generated image detection +**(Work In Progress)** + +- [ ] Refactor code +- [ ] Review dependencies +- [ ] Containerize (Docker) +- [ ] Update documentation + +## AI-Generated Image detection + +This part handles the detection of AI-generated images. +The current code contains two classifiers to detect AI-generated images from two types of architectures: +- GANs + +## Model weights + +### 1. CNN Detection + +Run the `download_weights_CNN.sh` script: + +```commandline +bash download_weights_CNN.sh +``` + +Note: you need `wget` installed on your system (it is by default for most Linux systems). + +### 2. Diffusion + +**TODO** + + +## Run the models + +Make sure you have the weights available before doing so. + +**TODO: environments** + +### 1. CNN Detection + +```commandline +python CNN_model_classifier.py +``` +Available options: + +- `-f / --file` (default=`'examples_realfakedir'`) +- `-m / --model_path` (default=`'weights/blur_jpg_prob0.5.pth'`) +- `-c / --crop` (default=`None`): Specify crop size (int) by default, do not crop. +- `--use_cpu`: use cpu (by default uses GPU) -> **TODO: remove (obsolete)** + +Example usage: + +```commandline +python CNN_model_classifier.py -f examples/real.png -m weights/blur_jpg_prob0.5.pth +``` + +### 2. Diffusion detection + +**TODO** + +## References + +Based on: +- https://github.com/hoangthuc701/GenAI-image-detection +- https://github.com/ptmaimai106/DetectGenerateImageByRealImageOnly diff --git a/src/images/Search_Image/Bing_search.py b/src/images/Search_Image/Bing_search.py new file mode 100644 index 0000000000000000000000000000000000000000..58ebf8c9fa8ac12ec0213ffd7850e54d85cfb050 --- /dev/null +++ b/src/images/Search_Image/Bing_search.py @@ -0,0 +1,93 @@ +import json +import os +from dotenv import load_dotenv +import requests + +# Load Bing Search API key +load_dotenv() +BING_API_KEY = os.getenv("BING_API_KEY") + +def print_json(obj): + """Print the object as json""" + print(json.dumps(obj, sort_keys=True, indent=4, separators=(',', ': '))) + + +def get_image_urls(search_results): + """ + Extracts image URLs from Bing Visual Search response. + Ref: https://learn.microsoft.com/en-us/bing/search-apis/bing-visual-search/how-to/search-response + + Args: + search_results: A dict containing the Bing VisualSearch response data. + + Returns: + A tuple containing two lists: + - List of image URLs from "PagesIncluding" section. + - List of image URLs from "VisualSearch" section (backup). 
+ """ + + pages_including_urls = [] + visual_search_urls = [] + + if "tags" not in search_results: + return pages_including_urls, visual_search_urls + + # Check for required keys directly + if not any(action.get("actions") for action in search_results["tags"]): + return pages_including_urls, visual_search_urls + + + for action in search_results["tags"]: + for result in action.get("actions", []): + # actions = PagesIncluding, main results + if result["name"] == "PagesIncluding": + pages_including_urls.extend(item["contentUrl"] for item in result["data"]["value"]) + # actions = VisualSearch, back up results + elif result["name"] == "VisualSearch": + visual_search_urls.extend(item["contentUrl"] for item in result["data"]["value"]) + + return pages_including_urls, visual_search_urls + +def reverse_image_search(image_path, subscription_key=BING_API_KEY): + """Performs a reverse image search using the Bing Visual Search API. + + Args: + image_path: The path to the image file to search for. + + Returns: + A list of image URLs found that are similar to the image in the + specified path. + + Raises: + requests.exceptions.RequestException: If the API request fails. + """ + base_uri = "https://api.bing.microsoft.com/v7.0/images/visualsearch" + headers = {"Ocp-Apim-Subscription-Key": subscription_key} + + try: + files = {"image": ("image", open(image_path, "rb"))} + response = requests.post(base_uri, headers=headers, files=files) + response.raise_for_status() + search_results = response.json() + + return search_results + + except requests.exceptions.RequestException as e: + raise requests.exceptions.RequestException(f"API request failed: {e}") + except OSError as e: + raise OSError(f"Error opening image file: {e}") + +if __name__ == "__main__": + # Example usage: + image_path = "data/test_data/human_news.jpg" + try: + search_results = reverse_image_search(image_path) + image_urls, backup_image_urls = get_image_urls(search_results) + + # Print the results + print("Image URLs from PagesIncluding:") + print(image_urls) + print("\nImage URLs from VisualSearch (backup):") + print(backup_image_urls) + except Exception as e: + print(f"An error occurred: {e}") \ No newline at end of file diff --git a/src/images/Search_Image/image_difference.py b/src/images/Search_Image/image_difference.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/images/Search_Image/image_model_share.py b/src/images/Search_Image/image_model_share.py new file mode 100644 index 0000000000000000000000000000000000000000..4503f7d3c2b1bbffa2917dab44d9c9fd249e99fa --- /dev/null +++ b/src/images/Search_Image/image_model_share.py @@ -0,0 +1,142 @@ +from sklearn.metrics import roc_auc_score +from torchmetrics import Accuracy, Recall +import pytorch_lightning as pl +import timm +import torch +from pytorch_lightning.callbacks import Model, EarlyStopping +import logging +from PIL import Image +import torchvision.transforms as transforms +from torchvision.transforms import v2 + +logging.basicConfig(filename='training.log',filemode='w',level=logging.INFO, force=True) +CHECKPOINT = "models/image_classifier/image-classifier-step=8008-val_loss=0.11.ckpt" + + + +class ImageClassifier(pl.LightningModule): + def __init__(self, lmd=0): + super().__init__() + self.model = timm.create_model('resnet50', pretrained=True, num_classes=1) + self.accuracy = Accuracy(task='binary', threshold=0.5) + self.recall = Recall(task='binary', threshold=0.5) + self.validation_outputs = [] + self.lmd 
= lmd + + def forward(self, x): + return self.model(x) + + def training_step(self, batch): + images, labels, _ = batch + outputs = self.forward(images).squeeze() + + print(f"Shape of outputs (training): {outputs.shape}") + print(f"Shape of labels (training): {labels.shape}") + + loss = F.binary_cross_entropy_with_logits(outputs, labels.float()) + logging.info(f"Training Step - ERM loss: {loss.item()}") + loss += self.lmd * (outputs ** 2).mean() # SD loss penalty + logging.info(f"Training Step - SD loss: {loss.item()}") + return loss + + def validation_step(self, batch): + images, labels, _ = batch + outputs = self.forward(images).squeeze() + + if outputs.shape == torch.Size([]): + return + + print(f"Shape of outputs (validation): {outputs.shape}") + print(f"Shape of labels (validation): {labels.shape}") + + loss = F.binary_cross_entropy_with_logits(outputs, labels.float()) + preds = torch.sigmoid(outputs) + self.log('val_loss', loss, prog_bar=True, sync_dist=True) + self.log('val_acc', self.accuracy(preds, labels.int()), prog_bar=True, sync_dist=True) + self.log('val_recall', self.recall(preds, labels.int()), prog_bar=True, sync_dist=True) + output = {"val_loss": loss, "preds": preds, "labels": labels} + self.validation_outputs.append(output) + logging.info(f"Validation Step - Batch loss: {loss.item()}") + return output + + def predict_step(self, batch): + images, label, domain = batch + outputs = self.forward(images).squeeze() + preds = torch.sigmoid(outputs) + return preds, label, domain + + def on_validation_epoch_end(self): + if not self.validation_outputs: + logging.warning("No outputs in validation step to process") + return + preds = torch.cat([x['preds'] for x in self.validation_outputs]) + labels = torch.cat([x['labels'] for x in self.validation_outputs]) + if labels.unique().size(0) == 1: + logging.warning("Only one class in validation step") + return + auc_score = roc_auc_score(labels.cpu(), preds.cpu()) + self.log('val_auc', auc_score, prog_bar=True, sync_dist=True) + logging.info(f"Validation Epoch End - AUC score: {auc_score}") + self.validation_outputs = [] + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.model.parameters(), lr=0.0005) + return optimizer + + + +def load_image(image_path, transform=None): + image = Image.open(image_path).convert('RGB') + + if transform: + image = transform(image) + + return image + + +def predict_single_image(image_path, model, transform=None): + image = load_image(image_path, transform) + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + model.to(device) + + image = image.to(device) + + model.eval() + + with torch.no_grad(): + image = image.unsqueeze(0) + output = model(image).squeeze() + print(output) + prediction = torch.sigmoid(output).item() + + return prediction + + +def image_generation_detection(image_path): + model = ImageClassifier.load_from_checkpoint(CHECKPOINT) + + transform = v2.Compose([ + transforms.ToTensor(), + v2.CenterCrop((256, 256)), + ]) + + prediction = predict_single_image(image_path, model, transform) + print("prediction",prediction) + + result = "" + if prediction <= 0.2: + result += "Most likely human" + image_prediction_label = "HUMAN" + else: + result += "Most likely machine" + image_prediction_label = "MACHINE" + image_confidence = min(1, 0.5 + abs(prediction - 0.2)) + result += f" with confidence = {round(image_confidence * 100, 2)}%" + image_confidence = round(image_confidence * 100, 2) + return image_prediction_label, image_confidence + + +if __name__ == 
"__main__": + pass diff --git a/src/images/Search_Image/search.py b/src/images/Search_Image/search.py new file mode 100644 index 0000000000000000000000000000000000000000..10c0cb91f91f2fbc987479de92c98f867d14a2e0 --- /dev/null +++ b/src/images/Search_Image/search.py @@ -0,0 +1,56 @@ +from google_img_source_search import ReverseImageSearcher +import requests +from io import BytesIO +from PIL import Image +import imagehash +from google_img_source_search import ReverseImageSearcher + +def get_image_from_url(url): + response = requests.get(url) + return Image.open(BytesIO(response.content)) + +def standardize_image(image): + # Convert to RGB if needed + if image.mode in ('RGBA', 'LA'): + background = Image.new('RGB', image.size, (255, 255, 255)) + background.paste(image, mask=image.split()[-1]) + image = background + elif image.mode != 'RGB': + image = image.convert('RGB') + + # Resize to standard size (e.g. 256x256) + standard_size = (256, 256) + image = image.resize(standard_size) + + return image + +def compare_images(image1, image2): + # Standardize both images first + img1_std = standardize_image(image1) + img2_std = standardize_image(image2) + + hash1 = imagehash.average_hash(img1_std) + hash2 = imagehash.average_hash(img2_std) + return hash1 - hash2 # Returns the Hamming distance between the hashes + +if __name__ == '__main__': + image_url = 'https://i.pinimg.com/originals/c4/50/35/c450352ac6ea8645ead206721673e8fb.png' + + # Get the image from URL + url_image = get_image_from_url(image_url) + + # Search image + rev_img_searcher = ReverseImageSearcher() + res = rev_img_searcher.search(image_url) + + for search_item in res: + print(f'Title: {search_item.page_title}') + # print(f'Site: {search_item.page_url}') + print(f'Img: {search_item.image_url}\n') + + # Compare each search result image with the input image + result_image = get_image_from_url(search_item.image_url) + result_difference = compare_images(result_image, url_image) + print(f"Difference with search result: {result_difference}") + if result_difference == 0: + break \ No newline at end of file diff --git a/src/images/Search_Image/search_2.py b/src/images/Search_Image/search_2.py new file mode 100644 index 0000000000000000000000000000000000000000..066d250631ee68548445678b5961759a0218cbfc --- /dev/null +++ b/src/images/Search_Image/search_2.py @@ -0,0 +1,150 @@ +import time +import logging +import requests +from bs4 import BeautifulSoup +from typing import Dict, Optional +from urllib.parse import quote, urlparse + +logging.basicConfig( + filename='error.log', + level=logging.INFO, + format='%(asctime)s | [%(levelname)s]: %(message)s', + datefmt='%m-%d-%Y / %I:%M:%S %p' +) + +class SearchResults: + def __init__(self, results): + self.results = results + + def __str__(self): + output = "" + for result in self.results: + output += "---\n" + output += f"Title: {result.get('title', 'Title not found')}\n" + output += f"Link: {result.get('link', 'Link not found')}\n" + output += "---\n" + return output + +class GoogleReverseImageSearch: + def __init__(self): + self.base_url = "https://www.google.com/searchbyimage" + self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"} + self.retry_count = 3 + self.retry_delay = 1 + + def response(self, query: str, image_url: str, max_results: int = 10, delay: int = 1) -> SearchResults: + self._validate_input(query, image_url) + + encoded_query = quote(query) + encoded_image_url = quote(image_url) + + url = 
f"{self.base_url}?q={encoded_query}&image_url={encoded_image_url}&sbisrc=cr_1_5_2" + + all_results = [] + start_index = 0 + + while len(all_results) < max_results: + if start_index != 0: + time.sleep(delay) + + paginated_url = f"{url}&start={start_index}" + + response = self._make_request(paginated_url) + if response is None: + break + + search_results, valid_content = self._parse_search_results(response.text) + if not valid_content: + logging.warning("Unexpected HTML structure encountered.") + break + + for result in search_results: + if len(all_results) >= max_results: + break + data = self._extract_result_data(result) + if data and data not in all_results: + all_results.append(data) + + start_index += (len(all_results)-start_index) + + if len(all_results) == 0: + logging.warning(f"No results were found for the given query: [{query}], and/or image URL: [{image_url}].") + return "No results found. Please try again with a different query and/or image URL." + else: + return SearchResults(all_results[:max_results]) + + def _validate_input(self, query: str, image_url: str): + if not query: + raise ValueError("Query not found. Please enter a query and try again.") + if not image_url: + raise ValueError("Image URL not found. Please enter an image URL and try again.") + if not self._validate_image_url(image_url): + raise ValueError("Invalid image URL. Please enter a valid image URL and try again.") + + def _validate_image_url(self, url: str) -> bool: + parsed_url = urlparse(url) + path = parsed_url.path.lower() + valid_extensions = (".jpg", ".jpeg", ".png", ".webp") + return any(path.endswith(ext) for ext in valid_extensions) + + def _make_request(self, url: str): + attempts = 0 + while attempts < self.retry_count: + try: + response = requests.get(url, headers=self.headers) + if response.headers.get('Content-Type', '').startswith('text/html'): + response.raise_for_status() + return response + else: + logging.warning("Non-HTML content received.") + return None + except requests.exceptions.HTTPError as http_err: + logging.error(f"HTTP error occurred: {http_err}") + attempts += 1 + time.sleep(self.retry_delay) + except Exception as err: + logging.error(f"An error occurred: {err}") + return None + return None + + def _parse_search_results(self, html_content: str) -> (Optional[list], bool): + try: + soup = BeautifulSoup(html_content, "html.parser") + return soup.find_all('div', class_='g'), True + except Exception as e: + logging.error(f"Error parsing HTML content: {e}") + return None, False + + def _extract_result_data(self, result) -> Dict: + link = result.find('a', href=True)['href'] if result.find('a', href=True) else None + title = result.find('h3').get_text(strip=True) if result.find('h3') else None + return {"link": link, "title": title} if link and title else {} + + +if __name__ == "__main__": + # request = GoogleReverseImageSearch() + + # response = request.response( + # query="Example Query", + # image_url="https://ichef.bbci.co.uk/images/ic/1024xn/p0khzhhl.jpg.webp", + # max_results=5 + # ) + + # print(response) + + # Path to local image + image_path = "data/test_data/towel.jpg" + image_path = "C:\\TTProjects\\prj-nict-ai-content-detection\\data\\test_data\\towel.jpg" + + import json + file_path = image_path + search_url = 'https://yandex.ru/images/search' + files = {'upfile': ('blob', open(file_path, 'rb'), 'image/jpeg')} + params = {'rpt': 'imageview', 'format': 'json', 'request': '{"blocks":[{"block":"b-page_type_search-by-image__link"}]}'} + response = requests.post(search_url, 
params=params, files=files) + query_string = json.loads(response.content)['blocks'][0]['params']['url'] + img_search_url = search_url + '?' + query_string + print(img_search_url) + + response = requests.get(img_search_url) + print(response.text) \ No newline at end of file diff --git a/src/images/Search_Image/search_yandex.py b/src/images/Search_Image/search_yandex.py new file mode 100644 index 0000000000000000000000000000000000000000..ee19e528cbda08a991a45ba30a12300b4e8d900a --- /dev/null +++ b/src/images/Search_Image/search_yandex.py @@ -0,0 +1,177 @@ +import time +import logging +import requests +from bs4 import BeautifulSoup +from typing import Dict, Optional +from urllib.parse import quote, urlparse + +logging.basicConfig( + filename='error.log', + level=logging.INFO, + format='%(asctime)s | [%(levelname)s]: %(message)s', + datefmt='%m-%d-%Y / %I:%M:%S %p' +) + +class SearchResults: + def __init__(self, results): + self.results = results + + def __str__(self): + output = "" + for result in self.results: + output += "---\n" + output += f"Title: {result.get('title', 'Title not found')}\n" + output += f"Link: {result.get('link', 'Link not found')}\n" + output += "---\n" + return output + +class ReverseImageSearch: + def __init__(self): + self.base_url = "https://yandex.ru/images/search" + self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"} + self.retry_count = 3 + self.retry_delay = 1 + + def response(self, query: str, image_url: str, max_results: int = 10, delay: int = 1) -> SearchResults: + self._validate_input(query, image_url) + + encoded_query = quote(query) + encoded_image_url = quote(image_url) + + url = f"{self.base_url}?q={encoded_query}&image_url={encoded_image_url}&sbisrc=cr_1_5_2" + + all_results = [] + start_index = 0 + + while len(all_results) < max_results: + if start_index != 0: + time.sleep(delay) + + paginated_url = f"{url}&start={start_index}" + + response = self._make_request(paginated_url) + if response is None: + break + + search_results, valid_content = self._parse_search_results(response.text) + if not valid_content: + logging.warning("Unexpected HTML structure encountered.") + break + + for result in search_results: + if len(all_results) >= max_results: + break + data = self._extract_result_data(result) + if data and data not in all_results: + all_results.append(data) + + start_index += (len(all_results)-start_index) + + if len(all_results) == 0: + logging.warning(f"No results were found for the given query: [{query}], and/or image URL: [{image_url}].") + return "No results found. Please try again with a different query and/or image URL." + else: + return SearchResults(all_results[:max_results]) + + def _validate_input(self, query: str, image_url: str): + if not query: + raise ValueError("Query not found. Please enter a query and try again.") + if not image_url: + raise ValueError("Image URL not found. Please enter an image URL and try again.") + if not self._validate_image_url(image_url): + raise ValueError("Invalid image URL. 
Please enter a valid image URL and try again.")
+
+    def _validate_image_url(self, url: str) -> bool:
+        parsed_url = urlparse(url)
+        path = parsed_url.path.lower()
+        valid_extensions = (".jpg", ".jpeg", ".png", ".webp")
+        return any(path.endswith(ext) for ext in valid_extensions)
+
+    def _make_request(self, url: str):
+        attempts = 0
+        while attempts < self.retry_count:
+            try:
+                response = requests.get(url, headers=self.headers)
+                if response.headers.get('Content-Type', '').startswith('text/html'):
+                    response.raise_for_status()
+                    return response
+                else:
+                    logging.warning("Non-HTML content received.")
+                    return None
+            except requests.exceptions.HTTPError as http_err:
+                logging.error(f"HTTP error occurred: {http_err}")
+                attempts += 1
+                time.sleep(self.retry_delay)
+            except Exception as err:
+                logging.error(f"An error occurred: {err}")
+                return None
+        return None
+
+    def _parse_search_results(self, html_content: str) -> tuple[Optional[list], bool]:
+        try:
+            soup = BeautifulSoup(html_content, "html.parser")
+            return soup.find_all('div', class_='g'), True
+        except Exception as e:
+            logging.error(f"Error parsing HTML content: {e}")
+            return None, False
+
+    def _extract_result_data(self, result) -> Dict:
+        link = result.find('a', href=True)['href'] if result.find('a', href=True) else None
+        title = result.find('h3').get_text(strip=True) if result.find('h3') else None
+        return {"link": link, "title": title} if link and title else {}
+
+def yandex_reverse_image_search(image_url):
+    # Simulate a user agent to avoid being blocked
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
+
+    try:
+        response = requests.get(image_url, headers=headers)
+        response.raise_for_status()  # Raise an exception for bad status codes
+
+        # Parse the HTML content
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Extract image URLs (example - adapt based on Yandex's HTML structure);
+        # use .get() so images without a src attribute are skipped instead of raising.
+        image_urls = [img.get('src') for img in soup.find_all('img') if img.get('src')]
+
+        # Extract related search strings (example - adapt based on Yandex's HTML
+        # structure); get_text() returns the text rather than the raw Tag objects.
+        related_searches = [el.get_text(strip=True) for el in soup.find_all(class_="related-searches")]
+
+        return image_urls, related_searches
+
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching image: {e}")
+        return [], []
+
+
+if __name__ == "__main__":
+    # request = ReverseImageSearch()
+
+    # response = request.response(
+    #     query="Example Query",
+    #     image_url="https://ichef.bbci.co.uk/images/ic/1024xn/p0khzhhl.jpg.webp",
+    #     max_results=5
+    # )
+
+    # print(response)
+
+    # Path to local image
+    image_path = "data/test_data/towel.jpg"
+
+    import json
+    file_path = image_path
+    search_url = 'https://yandex.ru/images/search'
+    files = {'upfile': ('blob', open(file_path, 'rb'), 'image/jpeg')}
+    params = {'rpt': 'imageview', 'format': 'json', 'request': '{"blocks":[{"block":"b-page_type_search-by-image__link"}]}'}
+    response = requests.post(search_url, params=params, files=files)
+    query_string = json.loads(response.content)['blocks'][0]['params']['url']
+    img_search_url = search_url + '?'
+ query_string + print(img_search_url) + + image_urls, related_searches = yandex_reverse_image_search(img_search_url) + + print("Image URLs:", image_urls) + print("Related Searches:", related_searches) + + \ No newline at end of file diff --git a/src/images/diffusion_data_loader.py b/src/images/diffusion_data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..b9e74f785b69e9fb7d73a60887c4551a3464cfe7 --- /dev/null +++ b/src/images/diffusion_data_loader.py @@ -0,0 +1,229 @@ +import argparse +import collections +import random +from typing import Iterator + +import cv2 +import numpy as np +import torchdata.datapipes as dp +from imwatermark import WatermarkEncoder +from PIL import ( + Image, + ImageFile, +) +from torch.utils.data import DataLoader +from torchdata.datapipes.iter import ( + Concater, + FileLister, + FileOpener, + SampleMultiplexer, +) +from torchvision.transforms import v2 +from tqdm import tqdm + +ImageFile.LOAD_TRUNCATED_IMAGES = True +Image.MAX_IMAGE_PIXELS = 1000000000 + +encoder = WatermarkEncoder() +encoder.set_watermark("bytes", b"test") + +DOMAIN_LABELS = { + 0: "laion", + 1: "StableDiffusion", + 2: "dalle2", + 3: "dalle3", + 4: "midjourney", +} + +N_SAMPLES = { + 0: (115346, 14418, 14419), + 1: (22060, 2757, 2758), + 4: (21096, 2637, 2637), + 2: (13582, 1697, 1699), + 3: (12027, 1503, 1504), +} + + +@dp.functional_datapipe("collect_from_workers") +class WorkerResultCollector(dp.iter.IterDataPipe): + def __init__(self, source: dp.iter.IterDataPipe): + self.source = source + + def __iter__(self) -> Iterator: + yield from self.source + + def is_replicable(self) -> bool: + """Method to force data back to main process""" + return False + + +def crop_bottom(image, cutoff=16): + return image[:, :-cutoff, :] + + +def random_gaussian_blur(image, p=0.01): + if random.random() < p: + return v2.functional.gaussian_blur(image, kernel_size=5) + return image + + +def random_invisible_watermark(image, p=0.2): + image_np = np.array(image) + image_np = np.transpose(image_np, (1, 2, 0)) + + if image_np.ndim == 2: # Grayscale image + image_np = cv2.cvtColor(image_np, cv2.COLOR_GRAY2BGR) + elif image_np.shape[2] == 4: # RGBA image + image_np = cv2.cvtColor(image_np, cv2.COLOR_RGBA2BGR) + + if image_np.shape[0] < 256 or image_np.shape[1] < 256: + image_np = cv2.resize( + image_np, + (256, 256), + interpolation=cv2.INTER_AREA, + ) + + if random.random() < p: + return encoder.encode(image_np, method="dwtDct") + + return image_np + + +def build_transform(split: str): + train_transform = v2.Compose( + [ + v2.Lambda(crop_bottom), + v2.RandomCrop((256, 256), pad_if_needed=True), + v2.Lambda(random_gaussian_blur), + v2.RandomGrayscale(p=0.05), + v2.Lambda(random_invisible_watermark), + v2.ToImage(), + ], + ) + + eval_transform = v2.Compose( + [ + v2.CenterCrop((256, 256)), + ], + ) + transform = train_transform if split == "train" else eval_transform + + return transform + + +def dp_to_tuple_train(input_dict): + transform = build_transform("train") + return ( + transform(input_dict[".jpg"]), + input_dict[".label.cls"], + input_dict[".domain_label.cls"], + ) + + +def dp_to_tuple_eval(input_dict): + transform = build_transform("eval") + return ( + transform(input_dict[".jpg"]), + input_dict[".label.cls"], + input_dict[".domain_label.cls"], + ) + + +def load_dataset(domains: list[int], split: str): + laion_lister = FileLister("./data/laion400m_data", f"{split}*.tar") + genai_lister = { + d: FileLister( + f"./data/genai-images/{DOMAIN_LABELS[d]}", + f"{split}*.tar", + ) + for 
d in domains + if DOMAIN_LABELS[d] != "laion" + } + weight_genai = 1 / len(genai_lister) + + def open_lister(lister): + opener = FileOpener(lister, mode="b") + return opener.load_from_tar().routed_decode().webdataset() + + buffer_size1 = 100 if split == "train" else 10 + buffer_size2 = 100 if split == "train" else 10 + + if split != "train": + all_lister = [laion_lister] + list(genai_lister.values()) + dp = open_lister(Concater(*all_lister)).sharding_filter() + else: + laion_dp = ( + open_lister(laion_lister.shuffle()) + .cycle() + .sharding_filter() + .shuffle(buffer_size=buffer_size1) + ) + genai_dp = { + open_lister(genai_lister[d].shuffle()) + .cycle() + .sharding_filter() + .shuffle( + buffer_size=buffer_size1, + ): weight_genai + for d in domains + if DOMAIN_LABELS[d] != "laion" + } + dp = SampleMultiplexer({laion_dp: 1, **genai_dp}).shuffle( + buffer_size=buffer_size2, + ) + + if split == "train": + dp = dp.map(dp_to_tuple_train) + else: + dp = dp.map(dp_to_tuple_eval) + + return dp + + +def load_dataloader( + domains: list[int], + split: str, + batch_size: int = 32, + num_workers: int = 4, +): + dp = load_dataset(domains, split) + # if split == "train": + # dp = UnderSamplerIterDataPipe(dp, {0: 0.5, 1: 0.5}, seed=42) + dp = dp.batch(batch_size).collate() + dl = DataLoader( + dp, + batch_size=None, + num_workers=num_workers, + pin_memory=True, + ) + + return dl + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + args = parser.parse_args() + + # testing code + dl = load_dataloader([0, 1], "train", num_workers=8) + y_dist = collections.Counter() + d_dist = collections.Counter() + + for i, (img, y, d) in tqdm(enumerate(dl)): + if i % 100 == 0: + print(y, d) + if i == 400: + break + y_dist.update(y.numpy()) + d_dist.update(d.numpy()) + + print("class label") + for label in sorted(y_dist): + frequency = y_dist[label] / sum(y_dist.values()) + print(f"• {label}: {frequency:.2%} ({y_dist[label]})") + + print("domain label") + for label in sorted(d_dist): + frequency = d_dist[label] / sum(d_dist.values()) + print(f"• {label}: {frequency:.2%} ({d_dist[label]})") diff --git a/src/images/diffusion_model_classifier.py b/src/images/diffusion_model_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..3bca9e0860a8304b75679a72d0250fb0801c85ce --- /dev/null +++ b/src/images/diffusion_model_classifier.py @@ -0,0 +1,293 @@ +import argparse +import logging +import os + +import pandas as pd +import pytorch_lightning as pl +import timm +import torch +import torch.nn.functional as F +import torchvision.transforms as transforms +from PIL import Image +from pytorch_lightning.callbacks import ( + EarlyStopping, + ModelCheckpoint, +) +from sklearn.metrics import roc_auc_score +from torchmetrics import ( + Accuracy, + Recall, +) + +from .diffusion_data_loader import load_dataloader + + +class ImageClassifier(pl.LightningModule): + def __init__(self, lmd=0): + super().__init__() + self.model = timm.create_model( + "resnet50", + pretrained=True, + num_classes=1, + ) + self.accuracy = Accuracy(task="binary", threshold=0.5) + self.recall = Recall(task="binary", threshold=0.5) + self.validation_outputs = [] + self.lmd = lmd + + def forward(self, x): + return self.model(x) + + def training_step(self, batch): + images, labels, _ = batch + outputs = self.forward(images).squeeze() + + print(f"Shape of outputs (training): {outputs.shape}") + print(f"Shape of labels (training): {labels.shape}") + + loss = F.binary_cross_entropy_with_logits(outputs, labels.float()) + 
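+        # `outputs` are raw logits. The `lmd`-weighted mean of the squared
+        # logits added below is, presumably, a Spectral Decoupling ("SD")
+        # style penalty (Pezeshki et al., 2021): it discourages large logit
+        # norms, and with lmd = 0 the objective reduces to plain ERM.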
logging.info(f"Training Step - ERM loss: {loss.item()}") + loss += self.lmd * (outputs**2).mean() # SD loss penalty + logging.info(f"Training Step - SD loss: {loss.item()}") + return loss + + def validation_step(self, batch): + images, labels, _ = batch + outputs = self.forward(images).squeeze() + + if outputs.shape == torch.Size([]): + return + + print(f"Shape of outputs (validation): {outputs.shape}") + print(f"Shape of labels (validation): {labels.shape}") + + loss = F.binary_cross_entropy_with_logits(outputs, labels.float()) + preds = torch.sigmoid(outputs) + self.log("val_loss", loss, prog_bar=True, sync_dist=True) + self.log( + "val_acc", + self.accuracy(preds, labels.int()), + prog_bar=True, + sync_dist=True, + ) + self.log( + "val_recall", + self.recall(preds, labels.int()), + prog_bar=True, + sync_dist=True, + ) + output = {"val_loss": loss, "preds": preds, "labels": labels} + self.validation_outputs.append(output) + logging.info(f"Validation Step - Batch loss: {loss.item()}") + return output + + def predict_step(self, batch): + images, label, domain = batch + outputs = self.forward(images).squeeze() + preds = torch.sigmoid(outputs) + return preds, label, domain + + def on_validation_epoch_end(self): + if not self.validation_outputs: + logging.warning("No outputs in validation step to process") + return + preds = torch.cat([x["preds"] for x in self.validation_outputs]) + labels = torch.cat([x["labels"] for x in self.validation_outputs]) + if labels.unique().size(0) == 1: + logging.warning("Only one class in validation step") + return + auc_score = roc_auc_score(labels.cpu(), preds.cpu()) + self.log("val_auc", auc_score, prog_bar=True, sync_dist=True) + logging.info(f"Validation Epoch End - AUC score: {auc_score}") + self.validation_outputs = [] + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.model.parameters(), lr=0.0005) + return optimizer + + +def load_image(image_path, transform=None): + image = Image.open(image_path).convert("RGB") + + if transform: + image = transform(image) + + return image + + +def predict_single_image(image, model): + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + model.to(device) + + image = image.to(device) + + model.eval() + + with torch.no_grad(): + image = image.unsqueeze(0) + output = model(image).squeeze() + prediction = torch.sigmoid(output).item() + + return prediction + + +if __name__ == "__main__": + checkpoint_callback = ModelCheckpoint( + monitor="val_loss", + dirpath="./model_checkpoints/", + filename="image-classifier-{step}-{val_loss:.2f}", + save_top_k=3, + mode="min", + every_n_train_steps=1001, + enable_version_counter=True, + ) + + early_stop_callback = EarlyStopping( + monitor="val_loss", + patience=4, + mode="min", + ) + + parser = argparse.ArgumentParser() + parser.add_argument( + "--ckpt_path", + help="checkpoint to continue from", + required=False, + ) + parser.add_argument( + "--predict", + help="predict on test set", + action="store_true", + ) + parser.add_argument("--reset", help="reset training", action="store_true") + parser.add_argument( + "--predict_image", + help="predict the class of a single image", + action="store_true", + ) + parser.add_argument( + "--image_path", + help="path to the image to predict", + type=str, + required=False, + ) + parser.add_argument( + "--dir", + help="path to the images to predict", + type=str, + required=False, + ) + parser.add_argument( + "--output_file", + help="path to output file", + type=str, + required=False, + ) + args = 
parser.parse_args()
+
+    train_domains = [0, 1, 4]
+    val_domains = [0, 1, 4]
+    lmd_value = 0
+
+    if args.predict:
+        test_dl = load_dataloader(
+            [0, 1, 2, 3, 4],
+            "test",
+            batch_size=10,
+            num_workers=1,
+        )
+        model = ImageClassifier.load_from_checkpoint(args.ckpt_path)
+        trainer = pl.Trainer()
+        predictions = trainer.predict(model, dataloaders=test_dl)
+        preds, labels, domains = zip(*predictions)
+        preds = torch.cat(preds).cpu().numpy()
+        labels = torch.cat(labels).cpu().numpy()
+        domains = torch.cat(domains).cpu().numpy()
+        print(preds.shape, labels.shape, domains.shape)
+        df = pd.DataFrame(
+            {"preds": preds, "labels": labels, "domains": domains},
+        )
+        filename = "preds-" + args.ckpt_path.split("/")[-1]
+        df.to_csv(f"outputs/{filename}.csv", index=False)
+    elif args.predict_image:
+        image_path = args.image_path
+        model = ImageClassifier.load_from_checkpoint(args.ckpt_path)
+
+        # Define the transformations for the image
+        transform = transforms.Compose(
+            [
+                transforms.CenterCrop((256, 256)),
+                transforms.ToTensor(),
+            ],
+        )
+        image = load_image(image_path, transform)
+        prediction = predict_single_image(image, model)
+        print("prediction", prediction)
+
+        # Output the prediction
+        print(
+            f"Prediction for {image_path}: "
+            f"{'Human' if prediction <= 0.05 else 'Generated'}",
+        )
+    elif args.dir is not None:
+        predictions = []
+        model = ImageClassifier.load_from_checkpoint(args.ckpt_path)
+        transform = transforms.Compose(
+            [
+                transforms.CenterCrop((256, 256)),
+                transforms.ToTensor(),
+            ],
+        )
+        for root, dirs, files in os.walk(os.path.abspath(args.dir)):
+            for f_name in files:
+                f = os.path.join(root, f_name)
+                print(f"Predicting: {f}")
+                # Load and transform the image first; predict_single_image
+                # expects a tensor, not a file path.
+                image = load_image(f, transform)
+                p = predict_single_image(image, model)
+                predictions.append([f, f.split("/")[-2], p, p > 0.5])
+                print(f"--predicted: {p}")
+
+        df = pd.DataFrame(
+            predictions,
+            columns=["path", "folder", "pred", "class"],
+        )
+        df.to_csv(args.output_file, index=False)
+    else:
+        logging.basicConfig(
+            filename="training.log",
+            filemode="w",
+            level=logging.INFO,
+            force=True,
+        )
+        train_dl = load_dataloader(
+            train_domains,
+            "train",
+            batch_size=128,
+            num_workers=4,
+        )
+        logging.info("Training dataloader loaded")
+        val_dl = load_dataloader(
+            val_domains,
+            "val",
+            batch_size=128,
+            num_workers=4,
+        )
+        logging.info("Validation dataloader loaded")
+
+        if args.reset:
+            model = ImageClassifier.load_from_checkpoint(args.ckpt_path)
+        else:
+            model = ImageClassifier(lmd=lmd_value)
+        trainer = pl.Trainer(
+            callbacks=[checkpoint_callback, early_stop_callback],
+            max_steps=20000,
+            val_check_interval=1000,
+            check_val_every_n_epoch=None,
+        )
+        trainer.fit(
+            model=model,
+            train_dataloaders=train_dl,
+            val_dataloaders=val_dl,
+            ckpt_path=args.ckpt_path if not args.reset else None,
+        )
diff --git a/src/images/diffusion_utils_sampling.py b/src/images/diffusion_utils_sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c7162d54d992c35e2ab4b93a2bae67f6ca8716c
--- /dev/null
+++ b/src/images/diffusion_utils_sampling.py
@@ -0,0 +1,94 @@
+import collections
+import random
+from typing import Callable, Optional
+
+from torchdata.datapipes.iter import IterDataPipe
+
+
+def get_second_entry(sample):
+    return sample[1]
+
+
+class UnderSamplerIterDataPipe(IterDataPipe):
+    """Dataset wrapper for under-sampling.
+
+    Copied from: https://github.com/MaxHalford/pytorch-resample/blob/master/pytorch_resample/under.py  # noqa
+    Modified to work with multiple labels.
+
+    MIT License
+
+    Copyright (c) 2020 Max Halford
+
+    This method is based on rejection sampling.
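+
+    Worked example: with desired_dist = {0: 0.5, 1: 0.5} and an incoming
+    stream that is roughly 90% class 0, the pivot becomes class 1 (it
+    maximises f[y] / g[y]), so class-1 samples are always yielded, while a
+    class-0 sample is kept with probability f[0] / (M * g[0]) =
+    0.5 / (5 * 0.9) = 1/9 (with M = f[1] / g[1] = 5), leaving the yielded
+    stream approximately balanced.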
+
+    Parameters:
+        dataset
+        desired_dist: The desired class distribution. The keys are the classes
+            whilst the values are the desired class percentages. The values
+            are normalised so that they sum to 1.
+        label_getter: A function that takes a sample and returns its label.
+        seed: Random seed for reproducibility.
+
+    Attributes:
+        actual_dist: The counts of the observed sample labels.
+        rng: A random number generator instance.
+
+    References:
+        - https://www.wikiwand.com/en/Rejection_sampling
+
+    """
+
+    def __init__(
+        self,
+        dataset: IterDataPipe,
+        desired_dist: dict,
+        label_getter: Callable = get_second_entry,
+        seed: Optional[int] = None,
+    ):
+
+        self.dataset = dataset
+        self.desired_dist = {
+            c: p / sum(desired_dist.values()) for c, p in desired_dist.items()
+        }
+        self.label_getter = label_getter
+        self.seed = seed
+
+        self.actual_dist = collections.Counter()
+        self.rng = random.Random(seed)
+        self._pivot = None
+
+    def __iter__(self):
+
+        for dp in self.dataset:
+            y = self.label_getter(dp)
+
+            self.actual_dist[y] += 1
+
+            # To ease notation
+            f = self.desired_dist
+            g = self.actual_dist
+
+            # Check if the pivot needs to be changed
+            if y != self._pivot:
+                self._pivot = max(g.keys(), key=lambda y: f[y] / g[y])
+            else:
+                # Pivot samples are always kept.
+                yield dp
+                continue
+
+            # Determine the sampling ratio if the observed label
+            # is not the pivot
+            M = f[self._pivot] / g[self._pivot]
+            ratio = f[y] / (M * g[y])
+
+            if ratio < 1 and self.rng.random() < ratio:
+                yield dp
+
+    @classmethod
+    def expected_size(cls, n, desired_dist, actual_dist):
+        M = max(
+            desired_dist.get(k) / actual_dist.get(k)
+            for k in set(desired_dist) | set(actual_dist)
+        )
+        return int(n / M)
diff --git a/src/images/image_demo.py b/src/images/image_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..c63623b70dd0a28bb58882ecae7aedb50c5d1371
--- /dev/null
+++ b/src/images/image_demo.py
@@ -0,0 +1,73 @@
+import gradio as gr
+import torchvision.transforms as transforms
+from CNN_model_classifier import predict_cnn
+from diffusion_model_classifier import (
+    ImageClassifier,
+    predict_single_image,
+)
+
+gr.set_static_paths(paths=["samples/"])
+diffusion_model = (
+    "Diffusion/model_checkpoints/image-classifier-step=7007-val_loss=0.09.ckpt"
+)
+cnn_model = "CNN/model_checkpoints/blur_jpg_prob0.5.pth"
+
+
+def get_prediction_diffusion(image):
+    model = ImageClassifier.load_from_checkpoint(diffusion_model)
+
+    prediction = predict_single_image(image, model)
+    print(prediction)
+    return (prediction >= 0.001, prediction)
+
+
+def get_prediction_cnn(image):
+    prediction = predict_cnn(image, cnn_model)
+    return (prediction >= 0.5, prediction)
+
+
+def predict(inp):
+    # Define the transformations for the image
+    transform = transforms.Compose(
+        [
+            transforms.Resize((224, 224)),  # Image size expected by ResNet50
+            transforms.ToTensor(),
+            transforms.Normalize(
+                mean=[0.485, 0.456, 0.406],
+                std=[0.229, 0.224, 0.225],
+            ),
+        ],
+    )
+    image_tensor = transform(inp)
+    pred_diff, prob_diff = get_prediction_diffusion(image_tensor)
+    pred_cnn, prob_cnn = get_prediction_cnn(image_tensor)
+    verdict = (
+        "AI Generated" if (pred_diff or pred_cnn) else "No GenAI detected"
+    )
+    return (
+        f"
{verdict}
" + f"" + ) + + +demo = gr.Interface( + title="AI-generated image detection", + description="Demo by NICT & Tokyo Techies ", + fn=predict, + inputs=gr.Image(type="pil"), + outputs=gr.HTML(), + examples=[ + ["samples/fake_dalle.jpg", "Generated (Dall-E)"], + ["samples/fake_midjourney.png", "Generated (MidJourney)"], + ["samples/fake_stable.jpg", "Generated (Stable Diffusion)"], + ["samples/fake_cnn.png", "Generated (GAN)"], + ["samples/real.png", "Organic"], + ], +) + +demo.launch() diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000000000000000000000000000000000000..336dcda9106fb1e9b51ff8c4b60f2dd2269db306 --- /dev/null +++ b/src/main.py @@ -0,0 +1,51 @@ +from texts.models import TextDetector + + +def extract_text_and_images(path: str): + text_content = "" + image_paths = "" + return text_content, image_paths + + +def process_document(document_path) -> list: + """ + Processes a given document, separating text and images, + and then analyzes them. + + Args: + document_path: Path to the document. + + Returns: + A list containing the AI content percentage for text and images. + """ + + # Extract text and images from the document + text_content, image_paths = extract_text_and_images(document_path) + + # Analyze text content + text_detector = TextDetector() + text_ai_content_percentage = text_detector.analyze_text(text_content) + + # Analyze image content + image_ai_content_percentages = [] + for image_path in image_paths: + # TODO: add image_detector class + # image_ai_content = image_detector.analyze_image(image_path) + image_ai_content = 100 + image_ai_content_percentages.append(image_ai_content) + + return [text_ai_content_percentage, image_ai_content_percentages] + + +def main(): + document_path = "../data.pdf" # Replace with your document path + text_ai_content_percentage, image_ai_content_percentages = ( + process_document(document_path) + ) + + print("Text AI Content Percentage:", text_ai_content_percentage) + print("Combined AI Content Percentage:", image_ai_content_percentages) + + +if __name__ == "__main__": + main() diff --git a/src/texts/MAGE/.gradio/flagged/dataset1.csv b/src/texts/MAGE/.gradio/flagged/dataset1.csv new file mode 100644 index 0000000000000000000000000000000000000000..efdfa1305a3b365f3973b989655983c4a41e50fc --- /dev/null +++ b/src/texts/MAGE/.gradio/flagged/dataset1.csv @@ -0,0 +1,2 @@ +input text,AI-text detection,timestamp +Does Chicago have any stores and does Joe live here?,"[{""token"": ""Does Chicago have any stores and does Joe live here?"", ""class_or_confidence"": ""human-written""}]",2024-12-09 13:40:10.255451 diff --git a/src/texts/MAGE/LICENSE b/src/texts/MAGE/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/src/texts/MAGE/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/src/texts/MAGE/README.md b/src/texts/MAGE/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fc841d030a37f1b34eabdaadde3c45c514419946 --- /dev/null +++ b/src/texts/MAGE/README.md @@ -0,0 +1,258 @@ +
+# MAGE: Machine-generated Text Detection in the Wild
+
+_**Yafu Li, Qintong Li§, Leyang Cui, Wei Bi, Zhilin Wang$**_
+
+_**Longyue Wang, Linyi Yang, Shuming Shi, Yue Zhang**_
+
+_Zhejiang University, Westlake University, § The University of Hong Kong, $ Jilin University, Tencent AI Lab_
+
+Presenting a comprehensive benchmark dataset designed to assess the proficiency of AI-generation detectors amidst real-world scenarios.
+Welcome to try detection via our **[online demo](https://detect.westlake.edu.cn)**!
+
+## 📌 Table of Contents
+
+- [Introduction](#-introduction)
+- [Activities](#-activities)
+- [Dataset](#-dataset)
+- [Try Detection](#computer--try-detection)
+- [Data Samples](#-data-samples)
+- [Citation](#-citation)
+
+
+## 🚀 Introduction
+
+Recent advances in large language models have enabled them to reach a level of text generation comparable to that of humans.
+These models show powerful capabilities across a wide range of content, including news article writing, story generation, and scientific writing.
+Such capability further narrows the gap between human-authored and machine-generated texts, highlighting the importance of machine-generated text detection to avoid potential risks such as fake news propagation and plagiarism.
+In practical scenarios, the detector faces texts from various domains or LLMs without knowing their sources.
+
+To this end, we build **a comprehensive testbed for deepfake text detection**, by gathering texts from various human writings and deepfake texts generated by different LLMs.
+This repository contains the data used to test the deepfake detection methods described in our paper, [MAGE: Machine-generated Text Detection in the Wild](https://aclanthology.org/2024.acl-long.3/).
+Welcome to test your detection methods on our testbed!
+
+## 📅 Activities
+
+- 🎉 **May 16, 2024**: Our paper was accepted by ACL 2024!
+- 🎉 **June 19, 2023**: Update two 'wilder' testbeds! We go one step wilder by constructing an additional test set with texts from unseen domains generated by an unseen model, to test detection ability in more practical scenarios.
+  We consider four new datasets: CNN/DailyMail, DialogSum, PubMedQA and IMDb to test the detection of deepfake news, deepfake dialogues, deepfake scientific answers and deepfake movie reviews.
+  We sample 200 instances from each dataset and use a newly developed LLM, i.e., GPT-4, with specially designed prompts to create deepfake texts, establishing an "Unseen Domains & Unseen Model" scenario.
+  Previous work demonstrates that detection methods are vulnerable to being deceived by target texts.
+  Therefore, we also paraphrase each sentence individually for both human-written and machine-generated texts, forming an even more challenging testbed.
+  We adopt gpt-3.5-turbo as the zero-shot paraphraser and consider all paraphrased texts as machine-generated.
+- May 25, 2023: Initial dataset release including texts from 10 domains and 27 LLMs, contributing to 6 testbeds with increasing detection difficulty.
+
+## 📝 Dataset
+
+The dataset consists of **447,674** human-written and machine-generated texts from a wide range of sources in the wild:
+
+- Human-written texts from **10 datasets** covering a wide range of writing tasks, e.g., news article writing, story generation, scientific writing, etc.
+- Machine-generated texts generated by **27 mainstream LLMs** from 7 sources, e.g., OpenAI, LLaMA, and EleutherAI, etc.
+- **8 systematic testbeds** with increasing wildness and detection difficulty.
+
+### 📥 How to Get the Data
+
+#### 1. Huggingface
+
+You can access the full dataset, which includes the Cross-domains & Cross-models testbed and two additional wilder test sets, through the [Huggingface API](https://huggingface.co/datasets/yaful/MAGE):
+
+```python
+from datasets import load_dataset
+dataset = load_dataset("yaful/MAGE")
+```
+
+which includes traditional splits (train.csv, valid.csv and test.csv) and two wilder test sets (test_ood_set_gpt.csv and test_ood_set_gpt_para.csv).
+The csv files have three columns: text, label (0 for machine-generated and 1 for human-written) and text source information (e.g., "cmv_human" denotes that the text is written by humans, whereas "roct_machine_continuation_flan_t5_large" denotes that the text is generated by "flan_t5_large" using a continuation prompt).
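+
+A quick way to sanity-check the download is to inspect the splits, columns, and label balance. A minimal sketch (only the documented `text` and `label` columns are assumed; the exact name of the source column may differ):
+
+```python
+from collections import Counter
+
+from datasets import load_dataset
+
+dataset = load_dataset("yaful/MAGE")
+print(dataset)                        # available splits and row counts
+print(dataset["train"].column_names)  # expect "text" and "label", plus a source field
+
+# label convention per this README: 0 = machine-generated, 1 = human-written
+print(Counter(dataset["train"]["label"]))
+```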
+
+To obtain the 6 testbeds mentioned in our paper, simply apply the provided script:
+
+```shell
+python3 deployment/prepare_testbeds.py DATA_PATH
+```
+
+Replace `DATA_PATH` with the output data directory where you want to save the 6 testbeds.
+
+#### 2. Cloud Drive
+
+Alternatively, you can access the 6 testbeds by downloading them directly through [Google Drive](https://drive.google.com/drive/folders/1p09vDiEvoA-ZPmpqkB2WApcwMQWiiMRl?usp=sharing)
+or [Tencent Weiyun](https://share.weiyun.com/JUWQxF4H):
+
+The folder contains 4 packages:
+
+- testbeds_processed.zip: 6 testbeds based on the "processed" version, which can be used directly for measuring in-distribution and out-of-distribution detection performance.
+- wilder_testsets.zip: 2 wilder test sets with texts processed, aiming for (1) detecting deepfake text generated by GPT-4, and (2) detecting deepfake text in paraphrased versions.
+- source.zip: Source texts of human-written texts and corresponding texts generated by LLMs, without filtering.
+- processed.zip: This is a refined version of the "source" that filters out low-quality texts and specifies sources as CSV file names. For example, the "cmv_machine_specified_gpt-3.5-trubo.csv" file contains texts from the CMV domain generated by the "gpt-3.5-trubo" model using specific prompts, while "cmv_human" includes human-written CMV texts.
+
+## :computer: Try Detection
+
+### Python Environment
+
+For deploying the Longformer detector or training your own detector using our data, simply install the following packages:
+
+```shell
+pip install transformers
+pip install datasets
+pip install clean-text # for data preprocessing
+```
+
+Or you can run:
+
+```shell
+pip install -r requirements.txt
+```
+
+### Model Access
+
+Our Longformer detector, which has been trained on the entire dataset, is now accessible through [Huggingface](https://huggingface.co/yaful/MAGE). Additionally, you can try detection directly using our [online demo](https://detect.westlake.edu.cn/).
+
+### Detection
+
+We have refined the decision boundary based on out-of-distribution settings. To ensure optimal performance, we recommend preprocessing texts before sending them to the detector.
+
+```python
+import torch
+import os
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from deployment import preprocess, detect
+
+# init
+device = 'cpu'  # use 'cuda:0' if GPU is available
+# model_dir = "nealcly/detection-longformer"  # model in our paper
+model_dir = "yaful/MAGE"  # model in the online demo
+tokenizer = AutoTokenizer.from_pretrained(model_dir)
+model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)
+
+text = "Apple's new credit card will begin a preview roll out today and will become available to all iPhone owners in the US later this month. A random selection of people will be allowed to go through the application process, which involves entering personal details which are sent to Goldman Sachs and TransUnion. Applications are approved or declined in less than a minute. The Apple Card is meant to be broadly accessible to every iPhone user, so the approval requirements will not be as strict as other credit cards.
Once the application has been approved, users will be able to use the card immediately from the Apple Wallet app. The physical titanium card can be requested during setup for free, and it can be activated with NFC once it arrives."
+# preprocess
+text = preprocess(text)
+# detection
+result = detect(text, tokenizer, model, device)
+```
+
+### Detection Performance
+
+#### In-distribution Detection
+
+| Testbed | HumanRec | MachineRec | AvgRec | AUROC |
+| ------------------------------------ | -------- | ---------- | ------ | ----- |
+| White-box | 97.30% | 95.91% | 96.60% | 0.99 |
+| Arbitrary-domains & Model-specific | 95.25% | 96.94% | 96.60% | 0.99 |
+| Fixed-domain & Arbitrary-models | 89.78% | 97.24% | 93.51% | 0.99 |
+| Arbitrary-domains & Arbitrary-models | 82.80% | 98.27% | 90.53% | 0.99 |
+
+#### Out-of-distribution Detection
+
+| Testbed | HumanRec | MachineRec | AvgRec | AUROC |
+| ----------------- | -------- | ---------- | ------ | ----- |
+| Unseen Model Sets | 83.31% | 89.90% | 86.61% | 0.95 |
+| Unseen Domains | 38.05% | 98.75% | 68.40% | 0.93 |
+
+#### Wilder Testsets
+
+| Testbed | HumanRec | MachineRec | AvgRec | AUROC |
+| ----------------------------- | -------- | ---------- | ------ | ----- |
+| Unseen Domains & Unseen Model | 88.78% | 84.12% | 86.54% | 0.94 |
+| Paraphrase | 88.78% | 37.05% | 62.92% | 0.75 |
+
+## 📄 Data Samples
+
+All instances are stored as rows in a csv format, with each row consisting of 3 columns: _Text_, _Label_ (0 for machine-generated and 1 for human-written) and _Index_ (indexes in the original data source, used to restore alignment after filtering).
+
+#### News Article
+
+| Text | Label |
+| ---- | ----- |
+| Apple's new credit card will begin a preview roll out today and will become available to all iPhone owners in the US later this month. A random selection of people will be allowed to go through the application process, which involves entering personal details which are sent to Goldman Sachs and TransUnion. Applications are approved or declined in less than a minute. The Apple Card is meant to be broadly accessible to every iPhone user, so the approval requirements will not be as strict as other credit cards. Once the application has been approved, users will be able to use the card immediately from the Apple Wallet app. The physical titanium card can be requested during setup for free, and it can be activated with NFC once it arrives.
| 1 | +| Apple's new credit card will begin a preview roll out today and will become available to all iPhone owners in the US later this month. A random selection of people who applied for an Apple Card beta account last week will be sent invitations just before April 26, when everyone else can start using it too. The Apple Card is designed with privacy in mind: users aren't able to link their cards or view detailed transaction histories online as some other services do. The app itself also stores no personal data on your phone, only a virtual version in its secure payment system so you don't have to give Apple access to any sensitive details. It uses machine learning-based algorithms that learn from how you spend to provide personalized recommendations about your spending habits. Your transactions are stored offline on your device, the company says, while information like email addresses remains encrypted during transit between your devices and the cloud. And if you ever lose your physical card, Apple has said there's a way to temporarily disable the card without affecting payments until the actual one arrives. | 0 | +| Today marks the beginning of a new era of financial technology: Apple Card is now available to all users in the United States. The long-awaited credit card from Apple, which was announced earlier this year, is now available for everyone to sign up and use. With features such as cashback on purchases and robust security measures, Apple Card could revolutionize how people make payments. This could be the start of a new wave of digital payment options. | 0 | + +#### Opinion Statement + +| Text | Label | +| -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | +| Look, I know this is a touchy subject, and while people might say I'm young and didn't understand the complexity of wars, just hear me out. Vietnam was essentially a communist state, due to influences from China and USSR, which were alliances (the former is debatable) of Vietnam during the war. After the war, our country has suffered multiple economic depressions, and famines due to the incompetence of our liberally named Communist Party. Granted the South Vietnam government wasn't any better, but what the U.S wanted for Vietnam was for the best. I understand that, technically the US did not wage war with our people, but stood against the spread of communism in Asia, and with our strategic location, a battle surely followed. The US did not deliberately invaded our country. And look at what they did to the world. Defeated the Nazis and fascist countries, uplifted South Korea, Japan (which were both smaller and less resourceful than my country) to their respectable position on the world map today. And what had the sole communist party in my country done? Nothing but left our people in the struggle of a third-world country. And China is still brazenly harassing our borders and seas to this very day, just because our army is incapable of standing up for themselves. Please tell me if I was wrong and why the North won was a good idea. Edit: My view has changed. It was not simple as I thought it was. Generally it can be summarized into those points: involvement of China, motives and war crimes committed by the US, and there was no hope in the governing system. Communism has not helped our people a bit, but no one can really advocates for America either. We as a nation should look to develop our own path. Insights are still very much appreciated. And thanks for the discussions. | 1 | +| Look, I know this is a touchy subject, and while people might say I'm young and didn't understand the complexity of wars, just hear me out. Vietnam was essentially a lost war. A war where we fought against the communists, but lost, after years of fighting and thousands of lives lost. We were a technologically advanced nation, but outmatched by the communists who were determined to destroy us. And they almost did. So when I think about Iraq, I can't help but compare it to Vietnam. And the only thing I'm seeing is our forces being put in a situation where they can't win. Let's start with the weapons. I'm not a weapons expert by any means, so I don't know all the fine details. 
But the simple facts are this: the communists had the Russians, and we had the U.S. (and other allies). Well, the communists have just as many weapons as we have, if not more. I understand that we can win by outnumbering them, but that is very difficult. Most likely we will have to use sophisticated weapons, but then we get into the tricky area of international law. Can you really justify dropping a bomb on a country that has a pretty advanced military force (think of North Korea, for example)? The answer might be yes, because if you don't do that you're handing the war to them, but then you have to ask yourself if you really want to start that slippery slope. Now there are some people who think that if we just let the terrorists have their way with us, then we will send a message to the world. Well, if that's the case, then what about the message we send by having weapons that are supposedly sophisticated enough to kill entire countries? You can't send a message by allowing innocent people to die, and if you want to kill innocent people, then you might as well start killing people at home. So there are people who say we should use these weapons in Iraq, and there are others who say we shouldn't, and there are the people who have their own ideas. But the one thing I know is this: we are in a very difficult position. We don't have the technology to back up our claims that we are the good guys, and we don't want to lose by being outmatched, so the only thing we can do is back out of the war. But this brings up a very interesting point. I wonder if Bush, who has been preaching against the communists, is going to back out of Iraq. And if he doesn't, what kind of message does that send? I know that he wants to send a message to the rest of the world, but do we really want to send that message? If we do, then what about the message we send by supporting one of the richest nations in the world, and supporting war that many of us don't even want? I know that many of you disagree with me, and I'm sorry if this is rude, but I'm just trying to get people to think. I'm not trying to be mean, and I know that I'm not right, but at least I have something to say. I know that I can't change anything, but I know that I can at least try. | 0 | +| It is understandable that you may wish the United States had won the Vietnam War, however, it is important to recognize that the Vietnam War was a complex conflict with many political and social implications. In reality, it is impossible to predict what would have happened if the U.S. had won the war. The war could have potentially resulted in more loss of life and suffering for the Vietnamese people. It is also important to consider that the war united the Vietnamese people and eventually led to the reunification of Vietnam in 1976, which could not have occurred if the U.S. had been victorious. Therefore, while it can be tempting to look back on history and wish for a different outcome, it is important to recognize the complexities of the Vietnam War and the positive outcomes that have come from it. 
| 0 | + +#### Long-form Answer + +| Text | Label | +| --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | +| That is called bootstrap problem. How can you program something when no software exists that lets you program things. And how can a computer read what to do, if it doesn't know how to read. The answer is that you have to write a really simple program yourself, onto the hardware. It never changes for a computer, and is used every time you turn it on. That tiny program doesn't do anything except tell every part of the computer what it is and where it can get the stuff it needs. This includes really basic stuff, like storage adresses and and how to read them. From then on, the hardware can look up how to use the screen, how to read the keyboard, all those things. It's of course a bit more complicated than that, but once you have that first spark going, you can build up on that and program away.,We did use keyboards. 
They just weren't connected to the computer. You typed in your command on what was basically a typewriter which then"punched" the information onto cards. These were known as Hollerith Punch Cards - the machine looked like this: URL0 You then took the stack of cards very carefully to the computer hopper and fed them in. They had to stay in the same order they were punched for your program to work.', "Originally, computers were little more than some cathodes connected by cables. Programming them was done by changing the connections. A little later, computers had readers that took in cards with holes in certain distances, serving as binary input. I imagine.the first keyboard simply used that reader's interface,Well, for a pretty user-friendly way, there's the original [Altair 8800] (URL1). Note the switches on the front; when entering a program, you would set them to a position representing a number representing an operation of the computer, then press a button to store it in the memory and proceed to the next position. When you were done you'd press some other button to start the program you just "typed." Of course, all would be lost when you turned off the power.', "Via use of physical digital circuitry. I kid you not. As a project for an introductory course to digitial systems we had to make a ping pong game explicitly from digital circuits using specific IC's (and the basic NOTXORORAND gate IC's only). There was no code whatsoever in this system. Once you manage that, everything else is easy.,Punched cards preceded computers by [decades] (URL2). They could be used with sorters and counters to answer fairly sophisticated questions, as long as the data fit on the card. When business computers become useful they had the cards and the keyboards used to fill them out ready and waiting. | 1 | +| The original programmers probably used a different way to make the computer do what they wanted it to do because keyboards did not exist yet. They might have used a different type of machine to give commands, or they might have used punch cards or other methods to write down instructions that the computer could understand. The keyboard was invented later to make it easier and faster for people to tell a computer what to do. | 0 | +| That is called bootstrap problem. How can you program something when no software exists that lets you program things. And how can a computer read what to do, if it has no instructions to begin with? One possible solution to the bootstrap problem is to start with a set of very basic instructions written in machine language (the language that computers speak), and use those instructions to build more complex instructions and programs. This process is also known as bootstrapping or self-hosting. For example, a programmer could start with the most basic machine instructions to add, subtract, and compare numbers, and then use those instructions to create more complex operations like conditional statements, loops, and functions. Over time, more complex programs and programming languages can be built upon this foundation. Another solution is to use a pre-existing programming language or tool to create the initial instructions or software needed to bootstrap a new system. This approach is common in the development of operating systems, where code is initially written in a higher-level language like C, and then compiled into machine code that can be run on a computer. 
Overall, there are several ways to approach the bootstrap problem, and the solution will typically depend on the specific goals and constraints of the project. | 0 | + +#### Story Generation + +| Text | Label | +| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | +| Thomas loves science fiction, and is pleased to find himself sitting by the park entrance with Arthur C. Clarke's " Fountains of Paradise " open in his lap. He must have jogged there, he thinks to himself as he admires his brand new black-and-white Nikes. He stretches out in his black joggers and turns the page. " But there was no substitute for reality, one should beware of imitations ," he reads before shutting the book. Thomas ponders what he has read as he looks to the right; not a single car can be seen. The street appears infinite in length and the buildings fade in to the distance with it. He stands and begins his first step down the street. His movement halts when he hears a young voice behind him, " You look thirsty mister. Would you like some lemonade? " Thomas walks back past the park entrance and over to the lemonade stand, wondering how he had not noticed it before. It is beautiful, the entrance; but the park is closed now. Thomas stares up at the gates in awe. Thomas is interrupted again by the child, " 5.50, please. " Thomas looks at the counter, flustered. " I'll have the punch instead. " As the child pours the purple drink in to the cup, Thomas reaches in his pocket finding a five dollar bill and three quarters. " Keep the change ," Thomas says as he picks up his drink. Thomas sips and the sky slowly dims. He feels his breath drawn away from him as a comet sails over the park entrance. And Heaven's Gate opens. | 1 | +| Thomas loves science fiction, and is pleased to find himself sitting by the park entrance with Arthur C. Clarke's " Fountains of Paradise " open in his lap. He must have been reading for quite a while, as it's getting dark, and the other night-time park visitors are beginning to emerge. He gets up to leave, and on his way out finds a very tiny boy walking around in circles, trying to find his parents. 
The little boy is quite distressed, and Thomas takes him to the park office, which is locked. Thomas finally remembers that he's got a cell phone in his pocket, and calls the number on the sign. The woman on the other end is very kind, and promises to come help the boy right away. Thomas is pleased to have been able to help, and heads off to the train station to go home. On the train, his eyes are tired, and he falls asleep. At the end of the chapter, we find out that the woman on the phone was the boy's grandmother. The boy was seven years old, and his parents had taken him to the park for a picnic. The boy had started walking around in circles when he couldn't find his mother and father again. | 0 | +| Jeff was a normal guy, living a normal life. He had a family, a job, and a few friends. But above all else, he wasn't religious. He rarely thought about religion, and when he did, it was with a kind of apathy. One day, Jeff died unexpectedly. He woke up in an unfamiliar place, surrounded by people he didn't know. He was confused, but no one seemed to mind. As he looked around, Jeff noticed that everyone was dressed differently and speaking different languages. Then it hit him - he had died and gone to the afterlife. But something else struck him: none of these people were from his own religion. In fact, he didn't recognize any of the religions here. Then it dawned on him - this wasn't the afterlife of his religion, it was the afterlife of the religion whose tenets he had followed most closely, knowingly or not. He had lived his life without being religious, but had unknowingly followed a certain set of beliefs. Now, in the afterlife, he was among those who had done the same. Jeff found himself feeling strangely comforted in this new place. He realized that even though his faith had been different than others', its core values were still very much the same. This newfound understanding filled Jeff with peace and joy, and he felt like he had really come home. | 0 | + +#### Scientific Writing + +| Text | Label | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | +| Although deep-learning-based methods have markedly improved the performance of speech separation over the past few years, it remains an open question how to integrate multi-channel signals for speech separation. 
We propose two methods, namely, early-fusion and late-fusion methods, to integrate multi-channel information based on the time-domain audio separation network, which has been proven effective in single-channel speech separation. We also propose channel-sequential-transfer learning, which is a transfer learning framework that applies the parameters trained for a lower-channel network as the initial values of a higher-channel network. For fair comparison, we evaluated our proposed methods using a spatialized version of the wsj0-2mix dataset, which is open-sourced. It was found that our proposed methods can outperform multi-channel deep clustering and improve the performance proportionally to the number of microphones. It was also proven that the performance of the late-fusion method is consistently higher than that of the single-channel method regardless of the angle difference between speakers. | 1 | +| Although deep learning has achieved appealing results on several machine learning tasks, most of the models are deterministic at inference, limiting their application to single-modal settings. We propose a novel probabilistic deep learning model, namely Probabilistic Interpretation Network (PIN), which enables multi-modal inference, uncertainty quantification, and sample-based exploration by extracting latent representations from multiple modalities (e.g. vision and language) and modeling their dependencies via a probabilistic graphical model. PIN is a flexible framework that can be used to train interpretable multi-modal models as well as handle modalities in an unsupervised setting. We apply PIN to a wide variety of tasks including out-of-distribution detection, visual question answering and goal-driven dialogue. We present a new evaluation metric for goal-driven dialogue and show that PIN is capable of handling both modalities and uncertainty in this setting. | 0 | +| Although deep learning has achieved appealing results on several machine learning tasks, most of the models are deterministic at inference, limiting their application to single-modal settings. We propose a novel approach that allows to perform probabilistic inference with deep learning models. Our method is based on a variational autoencoder (VAE) and uses a mixture of Gaussians as a prior distribution for the latent variable. The VAE is trained by maximising a variational lower bound on the data log-likelihood, which can be seen as an evidence lower bound (ELBO). We introduce a novel approach to learn this ELBO, which is based on the re-parameterisation trick. This trick allows us to use standard gradient descent techniques to optimise the ELBO and consequently obtain a probabilistic latent representation for the data. We evaluate our model on a variety of datasets, including images, text, and speech. Our results show that our approach achieves comparable performance to existing deterministic models, while providing a probabilistic interpretation of the input data. Moreover, we demonstrate that our approach yields better generalisation ability when compared to deterministic models. 
| 0 | + +## 📚 Citation + +If you use this dataset in your research, please cite it as follows: + +```bibtex +@inproceedings{li-etal-2024-mage, + title = "{MAGE}: Machine-generated Text Detection in the Wild", + author = "Li, Yafu and + Li, Qintong and + Cui, Leyang and + Bi, Wei and + Wang, Zhilin and + Wang, Longyue and + Yang, Linyi and + Shi, Shuming and + Zhang, Yue", + editor = "Ku, Lun-Wei and + Martins, Andre and + Srikumar, Vivek", + booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", + month = aug, + year = "2024", + address = "Bangkok, Thailand", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2024.acl-long.3", + doi = "10.18653/v1/2024.acl-long.3", + pages = "36--53", +} +``` + +We welcome contributions to improve this dataset! If you have any questions or feedback, please feel free to reach out at yafuly@gmail.com. diff --git a/src/texts/MAGE/app.py b/src/texts/MAGE/app.py new file mode 100644 index 0000000000000000000000000000000000000000..aa269c5d385f50d52572e29286d9e12d32497885 --- /dev/null +++ b/src/texts/MAGE/app.py @@ -0,0 +1,74 @@ +from transformers import pipeline +from difflib import Differ +from transformers import AutoModelForSequenceClassification,AutoTokenizer +from deployment import preprocess, detect +import gradio as gr + +ner_pipeline = pipeline("ner") + + +def ner(text): + output = ner_pipeline(text) + output = [ + {'entity': 'I-LOC', 'score': 0.9995369, 'index': 2, 'word': 'Chicago', 'start': 5, 'end': 12}, + {'entity': 'I-PER', 'score': 0.99527764, 'index': 8, 'word': 'Joe', 'start': 38, 'end': 41} + ] + print(output) + return {"text": text, "entities": output} + +def diff_texts(text1, text2): + d = Differ() + return [ + (token[2:], token[0] if token[0] != " " else None) + for token in d.compare(text1, text2) + ] + +out = diff_texts( + "The quick brown fox jumped over the lazy dogs.", + "The fast brown fox jumps over lazy dogs.") +print(out) + + +def separate_characters_with_mask(text, mask): + """Separates characters in a string and pairs them with a mask sign. + + Args: + text: The input string. + + Returns: + A list of tuples, where each tuple contains a character and a mask. + """ + + return [(char, mask) for char in text] + + +def detect_ai_text(text): + text = preprocess(text) + result = detect(text,tokenizer,model,device) + print(result) + output = separate_characters_with_mask(text, result) + return output + +# init +device = 'cpu' # use 'cuda:0' if GPU is available +# model_dir = "nealcly/detection-longformer" # model in our paper +model_dir = "yaful/MAGE" # model in the online demo +tokenizer = AutoTokenizer.from_pretrained(model_dir) +model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device) +examples = ["Apple's new credit card will begin a preview roll out today and will become available to all iPhone owners in the US later this month. A random selection of people will be allowed to go through the application process, which involves entering personal details which are sent to Goldman Sachs and TransUnion. Applications are approved or declined in less than a minute. The Apple Card is meant to be broadly accessible to every iPhone user, so the approval requirements will not be as strict as other credit cards. Once the application has been approved, users will be able to use the card immediately from the Apple Wallet app. 
The physical titanium card can be requested during setup for free, and it can be activated with NFC once it arrives."] + +demo = gr.Interface(detect_ai_text, + gr.Textbox( + label="input text", + placeholder="Enter text here...", + lines=5, + ), + gr.HighlightedText( + label="AI-text detection", + combine_adjacent=True, + show_legend=True, + color_map={"machine-generated": "red", "human-written": "green"} + ), + examples=examples) + +demo.launch(share=True) \ No newline at end of file diff --git a/src/texts/MAGE/deployment/__init__.py b/src/texts/MAGE/deployment/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..90f60fdd89ad8575faafe45188bd1d968852fc67 --- /dev/null +++ b/src/texts/MAGE/deployment/__init__.py @@ -0,0 +1 @@ +from .utils import * \ No newline at end of file diff --git a/src/texts/MAGE/deployment/prepare_testbeds.py b/src/texts/MAGE/deployment/prepare_testbeds.py new file mode 100644 index 0000000000000000000000000000000000000000..986a3b5f274f88ab796c82655229386f790939bb --- /dev/null +++ b/src/texts/MAGE/deployment/prepare_testbeds.py @@ -0,0 +1,348 @@ +import csv +import os +import sys +from collections import defaultdict +import random +from datasets import load_dataset + +set_names = [ + "cmv", + "yelp", + "xsum", + "tldr", + "eli5", + "wp", + "roct", + "hswag", + "squad", + "sci_gen", +] + +oai_list = [ + # openai + "gpt-3.5-trubo", + "text-davinci-003", + "text-davinci-002", +] +llama_list = ["_7B", "_13B", "_30B", "_65B"] +glm_list = [ + "GLM130B", +] +flan_list = [ + # flan_t5, + "flan_t5_small", + "flan_t5_base", + "flan_t5_large", + "flan_t5_xl", + "flan_t5_xxl", +] + +opt_list = [ + # opt, + "opt_125m", + "opt_350m", + "opt_1.3b", + "opt_2.7b", + "opt_6.7b", + "opt_13b", + "opt_30b", + "opt_iml_30b", + "opt_iml_max_1.3b", +] +bigscience_list = [ + "bloom_7b", + "t0_3b", + "t0_11b", +] +eleuther_list = [ + "gpt_j", + "gpt_neox", +] +model_sets = [ + oai_list, + llama_list, + glm_list, + flan_list, + opt_list, + bigscience_list, + eleuther_list, +] + +data_dir = sys.argv[1] +dataset = load_dataset("yaful/DeepfakeTextDetect") +if not os.path.exists(data_dir): + os.makedirs(data_dir) +""" +csv_path = f"{data_dir}/train.csv" +train_results = list(csv.reader(open(csv_path,encoding='utf-8-sig')))[1:] +csv_path = f"{data_dir}/valid.csv" +valid_results = list(csv.reader(open(csv_path,encoding='utf-8-sig')))[1:] +csv_path = f"{data_dir}/test.csv" +test_results = list(csv.reader(open(csv_path,encoding='utf-8-sig')))[1:] +""" +train_results = [ + (row["text"], str(row["label"]), row["src"]) for row in list(dataset["train"]) +] +valid_results = [ + (row["text"], str(row["label"]), row["src"]) for row in list(dataset["validation"]) +] +test_results = [ + (row["text"], str(row["label"]), row["src"]) for row in list(dataset["test"]) +] +merge_dict = { + "train": (train_results, 800), + "valid": (valid_results, 100), + "test": (test_results, 100), +} + + +test_ood_gpt = dataset["test_ood_gpt"] +test_ood_gpt_para = dataset["test_ood_gpt_para"] +test_ood_gpt.to_csv(os.path.join(data_dir, "test_ood_gpt.csv")) +test_ood_gpt_para.to_csv(os.path.join(data_dir, "test_ood_gpt_para.csv")) + + +# make domain-specific_model-specific (gpt_j) +def prepare_domain_specific_model_specific(): + tgt_model = "gpt_j" + testbed_dir = f"{data_dir}/domain_specific_model_specific" + sub_results = defaultdict(lambda: defaultdict(list)) + print("# preparing domain-specific & model-specific ...") + for name in set_names: + print(f"## preparing {name} ...") + for split in 
["train", "valid", "test"]: + split_results, split_count = merge_dict[split] + count = 0 + for res in split_results: + info = res[2] + res = res[:2] + if name in info: + # human-written + if res[1] == "1" and count <= split_count: + sub_results[name][split].append(res) + # machine-generated + if tgt_model in info: + assert res[1] == "0" + sub_results[name][split].append(res) + count += 1 + + sub_dir = f"{testbed_dir}/{name}" + os.makedirs(sub_dir, exist_ok=True) + for split in ["train", "valid", "test"]: + print(f"{split} set: {len(sub_results[name][split])}") + rows = sub_results[name][split] + row_head = [["text", "label"]] + rows = row_head + rows + tmp_path = f"{sub_dir}/{split}.csv" + with open(tmp_path, "w", newline="", encoding="utf-8-sig") as f: + csvw = csv.writer(f) + csvw.writerows(rows) + + +# make domain_specific_cross_models +def prepare_domain_specific_cross_models(): + testbed_dir = f"{data_dir}/domain_specific_cross_models" + sub_results = defaultdict(lambda: defaultdict(list)) + + print("# preparing domain_specific_cross_models ...") + for name in set_names: + print(f"## preparing {name} ...") + for split in ["train", "valid", "test"]: + split_results, split_count = merge_dict[split] + for res in split_results: + info = res[2] + res = res[:2] + if name in info: + # human-written + if res[1] == "1": + sub_results[name][split].append(res) + # machine-generated + else: + sub_results[name][split].append(res) + + sub_dir = f"{testbed_dir}/{name}" + os.makedirs(sub_dir, exist_ok=True) + for split in ["train", "valid", "test"]: + print(f"{split} set: {len(sub_results[name][split])}") + rows = sub_results[name][split] + row_head = [["text", "label"]] + rows = row_head + rows + tmp_path = f"{sub_dir}/{split}.csv" + with open(tmp_path, "w", newline="", encoding="utf-8-sig") as f: + csvw = csv.writer(f) + csvw.writerows(rows) + + +# make cross_domains_model_specific +def prepare_cross_domains_model_specific(): + print("# preparing cross_domains_model_specific ...") + for model_patterns in model_sets: + sub_dir = f"{data_dir}/cross_domains_model_specific/model_{model_patterns[0]}" + os.makedirs(sub_dir, exist_ok=True) + # model_pattern = dict.fromkeys(model_pattern) + _tmp = " ".join(model_patterns) + print(f"## preparing {_tmp} ...") + + ood_pos_test_samples = [] + out_split_samples = defaultdict(list) + for split in ["train", "valid", "test"]: + rows = merge_dict[split][0] + # print(f"Original {split} set length: {len(rows)}") + + out_rows = [] + for row in rows: + valid = False + srcinfo = row[2] + if row[1] == "1": # appending all positive samples + valid = True + for pattern in model_patterns: + if pattern in srcinfo: + valid = True + break + if valid: + out_rows.append(row) + # out_rows.append(row+[srcinfo[0]]) + + out_split_samples[split] = out_rows + + for split in ["train", "valid", "test"]: + random.seed(1) + rows = out_split_samples[split] + pos_rows = [r for r in rows if r[1] == "1"] + neg_rows = [r for r in rows if r[1] == "0"] + len_neg = len(neg_rows) + random.shuffle(pos_rows) + out_split_samples[split] = pos_rows[:len_neg] + neg_rows + + for split in ["train", "valid", "test"]: + out_rows = [e[:-1] for e in out_split_samples[split]] + print(f"{split} set: {len(out_rows)} ...") + # xxx + tgt_path = f"{sub_dir}/{split}.csv" + with open(tgt_path, "w", newline="", encoding="utf-8-sig") as f: + csvw = csv.writer(f) + csvw.writerows([["text", "label"]] + out_rows) + + +# make cross_domains_cross_models +def prepare_cross_domains_cross_models(): + print("# preparing 
cross_domains_cross_models ...") + testbed_dir = f"{data_dir}/cross_domains_cross_models" + os.makedirs(testbed_dir, exist_ok=True) + for split in ["train", "valid", "test"]: + csv_path = f"{testbed_dir}/{split}.csv" + + with open(csv_path, "w", newline="", encoding="utf-8-sig") as f: + rows = [row[:-1] for row in merge_dict[split][0]] + print(f"{split} set: {len(rows)} ...") + csvw = csv.writer(f) + csvw.writerows([["text", "label"]] + rows) + + +# make unseen_models +def prepare_unseen_models(): + print("# preparing unseen_models ...") + for model_patterns in model_sets: + sub_dir = f"{data_dir}/unseen_models/unseen_model_{model_patterns[0]}" + os.makedirs(sub_dir, exist_ok=True) + _tmp = " ".join(model_patterns) + print(f"## preparing ood-models {_tmp} ...") + + ood_pos_test_samples = [] + out_split_samples = defaultdict(list) + for split in ["train", "valid", "test", "test_ood"]: + data_name = split if split != "test_ood" else "test" + rows = merge_dict[data_name][0] + + out_rows = [] + for row in rows: + valid = False + srcinfo = row[2] + for pattern in model_patterns: + if split != "test_ood": + if pattern in srcinfo: + valid = False + break + valid = True + else: + if pattern in srcinfo: + valid = True + break + if valid: + out_rows.append(row) + + out_split_samples[split] = out_rows + + random.seed(1) + test_rows = out_split_samples["test"] + test_pos_rows = [r for r in test_rows if r[1] == "1"] + test_neg_rows = [r for r in test_rows if r[1] == "0"] + len_aug = len(out_split_samples["test_ood"]) + # print(len_aug) + random.shuffle(test_pos_rows) + # out_split_samples['test'] = test_pos_rows[len_aug:] + test_neg_rows + out_split_samples["test_ood"] = ( + test_pos_rows[:len_aug] + out_split_samples["test_ood"] + ) + + for split in ["train", "valid", "test", "test_ood"]: + out_rows = [e[:-1] for e in out_split_samples[split]] + print(f"{split} set: {len(out_rows)}") + + tgt_path = f"{sub_dir}/{split}.csv" + with open(tgt_path, "w", newline="", encoding="utf-8-sig") as f: + csvw = csv.writer(f) + csvw.writerows([["text", "label"]] + out_rows) + + +# make unseen_domains +def prepare_unseen_domains(): + print("# preparing unseen_domains ...") + + testbed_dir = f"{data_dir}/unseen_domains" + sub_results = defaultdict(lambda: defaultdict(list)) + + for name in set_names: + sub_dir = f"{data_dir}/unseen_domains/unseen_domain_{name}" + os.makedirs(sub_dir, exist_ok=True) + + print(f"## preparing ood-domains {name} ...") + + ood_pos_test_samples = [] + out_split_samples = defaultdict(list) + for split in ["train", "valid", "test", "test_ood"]: + data_name = split if split != "test_ood" else "test" + rows = merge_dict[data_name][0] + + out_rows = [] + for row in rows: + srcinfo = row[2] + valid = True if name in srcinfo else False + valid = not valid if split != "test_ood" else valid + if valid: + out_rows.append(row) + + out_split_samples[split] = out_rows + + for split in ["train", "valid", "test", "test_ood"]: + out_rows = [e[:-1] for e in out_split_samples[split]] + print(f"{split} set: {len(out_rows)}") + tgt_path = f"{sub_dir}/{split}.csv" + with open(tgt_path, "w", newline="", encoding="utf-8-sig") as f: + csvw = csv.writer(f) + csvw.writerows([["text", "label"]] + out_rows) + + +# prepare 6 testbeds +prepare_domain_specific_model_specific() +print("-" * 100) +prepare_domain_specific_cross_models() +print("-" * 100) +prepare_cross_domains_model_specific() +print("-" * 100) +prepare_cross_domains_cross_models() +print("-" * 100) +prepare_unseen_models() +print("-" * 100) 
+prepare_unseen_domains() +print("-" * 100) diff --git a/src/texts/MAGE/deployment/utils.py b/src/texts/MAGE/deployment/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..efe117f5787553c047e4a4edad1839bc4a17d67a --- /dev/null +++ b/src/texts/MAGE/deployment/utils.py @@ -0,0 +1,294 @@ +import re +import torch +from cleantext import clean +from itertools import chain + +class MosesPunctNormalizer: + """ + This is a Python port of the Moses punctuation normalizer from + https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl + """ + + EXTRA_WHITESPACE = [ # lines 21 - 30 + (r"\r", r""), + (r"\(", r" ("), + (r"\)", r") "), + (r" +", r" "), + (r"\) ([.!:?;,])", r")\g<1>"), + (r"\( ", r"("), + (r" \)", r")"), + (r"(\d) %", r"\g<1>%"), + (r" :", r":"), + (r" ;", r";"), + ] + + NORMALIZE_UNICODE_IF_NOT_PENN = [(r"`", r"'"), (r"''", r' " ')] # lines 33 - 34 + + NORMALIZE_UNICODE = [ # lines 37 - 50 + ("„", r'"'), + ("“", r'"'), + ("”", r'"'), + ("–", r"-"), + ("—", r" - "), + (r" +", r" "), + ("´", r"'"), + ("([a-zA-Z])‘([a-zA-Z])", r"\g<1>'\g<2>"), + ("([a-zA-Z])’([a-zA-Z])", r"\g<1>'\g<2>"), + ("‘", r"'"), + ("‚", r"'"), + ("’", r"'"), + (r"''", r'"'), + ("´´", r'"'), + ("…", r"..."), + ] + + FRENCH_QUOTES = [ # lines 52 - 57 + ("\u00A0«\u00A0", r'"'), + ("«\u00A0", r'"'), + ("«", r'"'), + ("\u00A0»\u00A0", r'"'), + ("\u00A0»", r'"'), + ("»", r'"'), + ] + + HANDLE_PSEUDO_SPACES = [ # lines 59 - 67 + ("\u00A0%", r"%"), + ("nº\u00A0", "nº "), + ("\u00A0:", r":"), + ("\u00A0ºC", " ºC"), + ("\u00A0cm", r" cm"), + ("\u00A0\\?", "?"), + ("\u00A0\\!", "!"), + ("\u00A0;", r";"), + (",\u00A0", r", "), + (r" +", r" "), + ] + + EN_QUOTATION_FOLLOWED_BY_COMMA = [(r'"([,.]+)', r'\g<1>"')] + + DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA = [ + (r',"', r'",'), + (r'(\.+)"(\s*[^<])', r'"\g<1>\g<2>'), # don't fix period at end of sentence + ] + + DE_ES_CZ_CS_FR = [ + ("(\\d)\u00A0(\\d)", r"\g<1>,\g<2>"), + ] + + OTHER = [ + ("(\\d)\u00A0(\\d)", r"\g<1>.\g<2>"), + ] + + # Regex substitutions from replace-unicode-punctuation.perl + # https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl + REPLACE_UNICODE_PUNCTUATION = [ + (",", ","), + (r"。\s*", ". "), + ("、", ","), + ("”", '"'), + ("“", '"'), + ("∶", ":"), + (":", ":"), + ("?", "?"), + ("《", '"'), + ("》", '"'), + (")", ")"), + ("!", "!"), + ("(", "("), + (";", ";"), + ("」", '"'), + ("「", '"'), + ("0", "0"), + ("1", "1"), + ("2", "2"), + ("3", "3"), + ("4", "4"), + ("5", "5"), + ("6", "6"), + ("7", "7"), + ("8", "8"), + ("9", "9"), + (r".\s*", ". "), + ("~", "~"), + ("’", "'"), + ("…", "..."), + ("━", "-"), + ("〈", "<"), + ("〉", ">"), + ("【", "["), + ("】", "]"), + ("%", "%"), + ] + + def __init__( + self, + lang="en", + penn=True, + norm_quote_commas=True, + norm_numbers=True, + pre_replace_unicode_punct=False, + post_remove_control_chars=False, + ): + """ + :param language: The two-letter language code. + :type lang: str + :param penn: Normalize Penn Treebank style quotations. + :type penn: bool + :param norm_quote_commas: Normalize quotations and commas + :type norm_quote_commas: bool + :param norm_numbers: Normalize numbers + :type norm_numbers: bool + """ + self.substitutions = [ + self.EXTRA_WHITESPACE, + self.NORMALIZE_UNICODE, + self.FRENCH_QUOTES, + self.HANDLE_PSEUDO_SPACES, + ] + + if penn: # Adds the penn substitutions after extra_whitespace regexes. 
+            self.substitutions.insert(1, self.NORMALIZE_UNICODE_IF_NOT_PENN)
+
+        if norm_quote_commas:
+            if lang == "en":
+                self.substitutions.append(self.EN_QUOTATION_FOLLOWED_BY_COMMA)
+            elif lang in ["de", "es", "fr"]:
+                self.substitutions.append(self.DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMA)
+
+        if norm_numbers:
+            if lang in ["de", "es", "cz", "cs", "fr"]:
+                self.substitutions.append(self.DE_ES_CZ_CS_FR)
+            else:
+                self.substitutions.append(self.OTHER)
+
+        self.substitutions = list(chain(*self.substitutions))
+
+        self.pre_replace_unicode_punct = pre_replace_unicode_punct
+        self.post_remove_control_chars = post_remove_control_chars
+
+    def normalize(self, text):
+        """
+        Returns a string with normalized punctuation.
+        """
+        # Optionally, replace unicode puncts BEFORE normalization.
+        if self.pre_replace_unicode_punct:
+            text = self.replace_unicode_punct(text)
+
+        # Actual normalization.
+        for regexp, substitution in self.substitutions:
+            text = re.sub(regexp, substitution, str(text))
+
+        # Optionally, remove control characters AFTER normalization.
+        if self.post_remove_control_chars:
+            text = self.remove_control_chars(text)
+
+        return text.strip()
+
+    def replace_unicode_punct(self, text):
+        for regexp, substitution in self.REPLACE_UNICODE_PUNCTUATION:
+            text = re.sub(regexp, substitution, str(text))
+        return text
+
+    def remove_control_chars(self, text):
+        # The third-party `regex` module (not the stdlib `re`) is required for
+        # the \p{C} Unicode category class; it is imported here because it is
+        # not imported at the top of this file.
+        import regex
+        return regex.sub(r"\p{C}", "", text)
+
+
+def _tokenization_norm(text):
+    # Undo tokenizer-style spacing around punctuation; '<newline>' is a
+    # literal line-break marker emitted by some generators.
+    text = text.replace(
+        ' ,', ',').replace(
+        ' .', '.').replace(
+        ' ?', '?').replace(
+        ' !', '!').replace(
+        ' ;', ';').replace(
+        ' \'', '\'').replace(
+        ' ’ ', '\'').replace(
+        ' :', ':').replace(
+        '<newline>', '\n').replace(
+        '`` ', '"').replace(
+        ' \'\'', '"').replace(
+        '\'\'', '"').replace(
+        '.. ', '... ').replace(
+        ' )', ')').replace(
+        '( ', '(').replace(
+        ' n\'t', 'n\'t').replace(
+        ' i ', ' I ').replace(
+        ' i\'', ' I\'').replace(
+        '\\\'', '\'').replace(
+        '\n ', '\n').strip()
+    return text
+
+
+def _clean_text(text):
+    # remove PLM special tokens: <pad>, <s>, </s>, <unk> and <|endoftext|>
+    plm_special_tokens = r'(\<pad\>)|(\<s\>)|(\<\/s\>)|(\<unk\>)|(\<\|endoftext\|\>)'
+    text = re.sub(plm_special_tokens, "", text)
+
+    # normalize punctuation
+    moses_norm = MosesPunctNormalizer()
+    text = moses_norm.normalize(text)
+
+    # normalize tokenization
+    text = _tokenization_norm(text)
+
+    # remove specific text patterns, e.g., URLs, emails and phone numbers
+    text = clean(text,
+                 fix_unicode=True,           # fix various unicode errors
+                 to_ascii=True,              # transliterate to closest ASCII representation
+                 lower=False,                # lowercase text
+                 no_line_breaks=True,        # fully strip line breaks as opposed to only normalizing them
+                 no_urls=True,               # replace all URLs with a special token
+                 no_emails=True,             # replace all email addresses with a special token
+                 no_phone_numbers=True,      # replace all phone numbers with a special token
+                 no_numbers=False,           # replace all numbers with a special token
+                 no_digits=False,            # replace all digits with a special token
+                 no_currency_symbols=False,  # replace all currency symbols with a special token
+                 no_punct=False,             # remove punctuation
+                 replace_with_punct="",      # instead of removing punctuation you may replace it
+                 replace_with_url="",
+                 replace_with_email="",
+                 replace_with_phone_number="",
+                 replace_with_number="",
+                 replace_with_digit="",
+                 replace_with_currency_symbol="",
+                 lang="en"                   # set to 'de' for German special handling
+                 )
+
+    # keep common punctuation only
+    punct_pattern = r'[^ A-Za-z0-9.?!,:;\-\[\]\{\}\(\)\'\"]'
+    text = re.sub(punct_pattern, '', text)
+    # remove runs of brackets/quotes left over after filtering
+    spe_pattern = r'[-\[\]\{\}\(\)\'\"]{2,}'
+    text = re.sub(spe_pattern, '', text)
+    # remove redundant spaces
+    text = " ".join(text.split())
+    return text
+
+
+def _rm_line_break(text):
+    text = text.replace("\n", "\\n")
+    text = re.sub(r'(?:\\n)*\\n', r'\\n', text)
+    text = re.sub(r'^.{0,3}\\n', '', text)
+    text = text.replace("\\n", " ")
+    return text
+
+
+def preprocess(text):
+    text = _rm_line_break(text)
+    text = _clean_text(text)
+    return text
+
+
+def detect(input_text, tokenizer, model, device='cuda:0', th=-3.08583984375):
+    # `th` is a decision threshold applied to the negated machine-class
+    # logit: scores below it are classified as machine-generated.
+    label2decisions = {
+        0: "machine-generated",
+        1: "human-written",
+    }
+    tokenize_input = tokenizer(input_text)
+    tensor_input = torch.tensor([tokenize_input["input_ids"]]).to(device)
+    outputs = model(tensor_input)
+    is_machine = -outputs.logits[0][0].item()
+    decision = 0 if is_machine < th else 1
+    return label2decisions[decision]
diff --git a/src/texts/MAGE/main.py b/src/texts/MAGE/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..9362ad37debc3ec71325aad7a4f869807ed9eda5
--- /dev/null
+++ b/src/texts/MAGE/main.py
@@ -0,0 +1,65 @@
+from transformers import AutoModelForSequenceClassification,AutoTokenizer
+import datasets
+from deployment import preprocess, detect
+import csv
+import pandas as pd
+
+# init
+device = 'cpu' # use 'cuda:0' if GPU is available
+# model_dir = "nealcly/detection-longformer" # model in our paper
+model_dir = "yaful/MAGE" # model in the online demo
+tokenizer = AutoTokenizer.from_pretrained(model_dir)
+model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)
+
+# text = "Apple's new credit card will begin a preview roll out today and will become available to all iPhone owners in the US later this month.
A random selection of people will be allowed to go through the application process, which involves entering personal details which are sent to Goldman Sachs and TransUnion. Applications are approved or declined in less than a minute. The Apple Card is meant to be broadly accessible to every iPhone user, so the approval requirements will not be as strict as other credit cards. Once the application has been approved, users will be able to use the card immediately from the Apple Wallet app. The physical titanium card can be requested during setup for free, and it can be activated with NFC once it arrives." +# # preprocess +# text = preprocess(text) +# # detection +# result = detect(text,tokenizer,model,device) +# print(result) + +# ds = datasets.load_dataset('RealTimeData/bbc_news_alltime', '2020-02') +# test 100 samples from (RealTimeData/bbc_news_alltime', '2020-02') +# df = pd.read_csv('query_result.csv') +# content_column = df['content'] +# count = 0 + +# for content in content_column: +# # preprocess +# text = preprocess(content) +# # detection +# result = detect(text, tokenizer, model, device) +# if result == "human-written": +# count +=1 + +# print(count) +# print(count) + + +# ds = datasets.load_dataset('yaful/MAGE', 'test') +# ds.save_to_disk("MAGE_data") +# splits = list(ds.keys()) +# print(splits) + +ds = datasets.load_from_disk("MAGE_data") + +#filtered_data = ds['test'].filter(lambda x: x['src'] == 'xsum_human') + +human_data = [example['text'] for example in ds['test'] if example['src'] == 'xsum_human'] +human_data = human_data[0:100] + +machine_data = [example['text'] for example in ds['test'] if example['src'] == 'xsum_machine_topical_gpt-3.5-trubo'] +machine_data = machine_data[0:100] + +count = 0 +for content in machine_data: + # preprocess + text = preprocess(content) + # detection + result = detect(text, tokenizer, model, device) + print(result) + if result == "human-written": # machine-generated + count +=1 + + print(count) +print(count) \ No newline at end of file diff --git a/src/texts/MAGE/requirements.txt b/src/texts/MAGE/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..125e88acf551eb32a020ca041c2e5e137c34d3a4 --- /dev/null +++ b/src/texts/MAGE/requirements.txt @@ -0,0 +1,51 @@ +accelerate==0.24.1 +aiohttp==3.9.1 +aiosignal==1.3.1 +async-timeout==4.0.3 +attrs==23.1.0 +certifi==2023.11.17 +charset-normalizer==3.3.2 +clean-text==0.6.0 +click==8.1.7 +datasets==2.15.0 +dill==0.3.7 +emoji==1.7.0 +filelock==3.13.1 +frozenlist==1.4.0 +fsspec==2023.10.0 +ftfy==6.1.3 +huggingface-hub==0.19.4 +idna==3.6 +joblib==1.3.2 +multidict==6.0.4 +multiprocess==0.70.15 +nltk==3.8.1 +numpy==1.26.2 +packaging==23.2 +pandas==2.1.3 +Pillow==10.1.0 +pip==23.3.1 +psutil==5.9.6 +pyarrow==14.0.1 +pyarrow-hotfix==0.6 +python-dateutil==2.8.2 +pytz==2023.3.post1 +PyYAML==6.0.1 +regex==2023.10.3 +requests==2.31.0 +safetensors==0.4.1 +setuptools==68.0.0 +six==1.16.0 +tokenizers==0.15.0 +#torch==1.13.1+cu116 +#torchaudio==0.13.1+cu116 +#torchvision==0.14.1+cu116 +tqdm==4.66.1 +transformers==4.35.2 +typing_extensions==4.8.0 +tzdata==2023.3 +urllib3==2.1.0 +wcwidth==0.2.12 +wheel==0.41.2 +xxhash==3.4.1 +yarl==1.9.3 diff --git a/src/texts/MAGE/training/longformer/main.py b/src/texts/MAGE/training/longformer/main.py new file mode 100644 index 0000000000000000000000000000000000000000..18d8dea14f794e2cac13fd1d03f30238bbb6e92d --- /dev/null +++ b/src/texts/MAGE/training/longformer/main.py @@ -0,0 +1,666 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The 
HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Finetuning the library models for sequence classification on GLUE.""" +# You can also adapt this script on your own text classification task. Pointers for this are left as comments. + +import logging +import os +import random +import sys +from dataclasses import dataclass, field +from typing import Optional + +import datasets +import numpy as np +from datasets import load_dataset, load_metric + +import transformers +from transformers import ( + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + DataCollatorWithPadding, + EvalPrediction, + HfArgumentParser, + PretrainedConfig, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +os.environ['CURL_CA_BUNDLE'] = '' +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.9.0") + +require_version("datasets>=1.8.0", + "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") + +task_to_keys = { + "cola": ("sentence", None), + "mnli": ("premise", "hypothesis"), + "mrpc": ("sentence1", "sentence2"), + "qnli": ("question", "sentence"), + "qqp": ("question1", "question2"), + "rte": ("sentence1", "sentence2"), + "sst2": ("sentence", None), + "stsb": ("sentence1", "sentence2"), + "wnli": ("sentence1", "sentence2"), +} + +logger = logging.getLogger(__name__) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using `HfArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + task_name: Optional[str] = field( + default=None, + metadata={"help": "The name of the task to train on: " + + ", ".join(task_to_keys.keys())}, + ) + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + max_seq_length: int = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} + ) + pad_to_max_length: bool = field( + default=True, + metadata={ + "help": "Whether to pad all samples to `max_seq_length`. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch." 
+ }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." + }, + ) + train_file: Optional[str] = field( + default=None, metadata={"help": "A csv or a json file containing the training data."} + ) + validation_file: Optional[str] = field( + default=None, metadata={"help": "A csv or a json file containing the validation data."} + ) + test_file: Optional[str] = field(default=None, metadata={ + "help": "A csv or a json file containing the test data."}) + from_scratch: bool = field( + default=False, + metadata={ + "help": "set true to not load weights from pretrained models." + }, + ) + # do_eval: Optional[bool] = field( + # default=False, metadata={"help": "do evaluation."} + # ) + + def __post_init__(self): + if self.task_name is not None: + self.task_name = self.task_name.lower() + if self.task_name not in task_to_keys.keys(): + raise ValueError( + "Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) + elif self.dataset_name is not None: + pass + elif self.train_file is None or self.validation_file is None: + raise ValueError( + "Need either a GLUE task, a training/validation file or a dataset name.") + else: + train_extension = self.train_file.split(".")[-1] + assert train_extension in [ + "csv", "json"], "`train_file` should be a csv or a json file." + validation_extension = self.validation_file.split(".")[-1] + assert ( + validation_extension == train_extension + ), "`validation_file` should have the same extension (csv or json) as `train_file`." + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + metadata={ + "help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={ + "help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={ + "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={ + "help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. 
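+    # Invocation sketch (the model name and file paths below are illustrative
+    # placeholders, not fixed by this script). Arguments can be passed either
+    # as CLI flags:
+    #   python main.py --model_name_or_path allenai/longformer-base-4096 \
+    #       --train_file train.csv --validation_file valid.csv \
+    #       --test_file test.csv --do_train --do_predict --output_dir out
+    # or as a single JSON file whose keys mirror the dataclass fields above:
+    #   python main.py args.json
+    # as handled by the two parsing branches below.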
+ + parser = HfArgumentParser( + (ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if data_args.validation_file == data_args.test_file: + training_args.do_eval = False + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + # training_args["report_to"] = None # disable integrations + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) + # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the + # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named + # label if at least two columns are provided. + # + # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this + # single column. You can easily tweak this behavior (see below) + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.task_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + "glue", data_args.task_name, cache_dir=model_args.cache_dir) + elif data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + else: + # Loading a dataset from your local files. 
+ # CSV/JSON training and evaluation files are needed. + data_files = {"train": data_args.train_file, + "validation": data_args.validation_file} + + # Get the test dataset: you can provide your own CSV/JSON test file (see below) + # when you use `do_predict` without specifying a GLUE benchmark task. + if training_args.do_predict: + if data_args.test_file is not None: + train_extension = data_args.train_file.split(".")[-1] + test_extension = data_args.test_file.split(".")[-1] + assert ( + test_extension == train_extension + ), "`test_file` should have the same extension (csv or json) as `train_file`." + data_files["test"] = data_args.test_file + + else: + raise ValueError( + "Need either a GLUE task or a test file for `do_predict`.") + + for key in data_files.keys(): + logger.info(f"load a local file for {key}: {data_files[key]}") + + if data_args.train_file.endswith(".csv"): + # Loading a dataset from local csv files + raw_datasets = load_dataset( + "csv", data_files=data_files, cache_dir=model_args.cache_dir) + else: + # Loading a dataset from local json files + raw_datasets = load_dataset( + "json", data_files=data_files, cache_dir=model_args.cache_dir) + # See more about loading any type of standard or custom dataset at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Labels + if data_args.task_name is not None: + is_regression = data_args.task_name == "stsb" + if not is_regression: + label_list = raw_datasets["train"].features["label"].names + num_labels = len(label_list) + else: + num_labels = 1 + else: + # Trying to have good defaults here, don't hesitate to tweak to your needs. + is_regression = raw_datasets["train"].features["label"].dtype in [ + "float32", "float64"] + if is_regression: + num_labels = 1 + else: + # A useful fast method: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique + label_list = raw_datasets["train"].unique("label") + label_list.sort() # Let's sort it for determinism + num_labels = len(label_list) + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
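+    # Note: `num_labels` (inferred from the training split above) sizes the
+    # classification head, and `ignore_mismatched_sizes=True` below allows
+    # loading a checkpoint whose head has a different shape.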
+ config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + num_labels=num_labels, + finetuning_task=data_args.task_name, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + if not data_args.from_scratch: + model = AutoModelForSequenceClassification.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ignore_mismatched_sizes=True, + ) + else: + model = AutoModelForSequenceClassification.from_config( + config=config, + # ignore_mismatched_sizes=True, + ) + # Preprocessing the raw_datasets + sentence1_key, sentence2_key = "text", None + # if data_args.task_name is not None: + # sentence1_key, sentence2_key = task_to_keys[data_args.task_name] + # else: + # # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. + # non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"] + # if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: + # sentence1_key, sentence2_key = "sentence1", "sentence2" + # else: + # if len(non_label_column_names) >= 2: + # sentence1_key, sentence2_key = non_label_column_names[:2] + # else: + # sentence1_key, sentence2_key = non_label_column_names[0], None + + # Padding strategy + if data_args.pad_to_max_length: + padding = "max_length" + else: + # We will pad later, dynamically at batch creation, to the max sequence length in each batch + padding = False + + # Some models have set the order of the labels to use, so let's make sure we do use it. + label_to_id = None + if ( + model.config.label2id != PretrainedConfig( + num_labels=num_labels).label2id + and data_args.task_name is not None + and not is_regression + ): + # Some have all caps in their config, some don't. + label_name_to_id = { + k.lower(): v for k, v in model.config.label2id.items()} + if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): + label_to_id = { + i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} + else: + logger.warning( + "Your model seems to have been trained with labels, but they don't match the dataset: ", + f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." + "\nIgnoring the model labels as a result.", + ) + elif data_args.task_name is None and not is_regression: + label_to_id = {v: i for i, v in enumerate(label_list)} + + if label_to_id is not None: + model.config.label2id = label_to_id + model.config.id2label = { + id: label for label, id in config.label2id.items()} + + if data_args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
+        )
+    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+    def preprocess_function(examples):
+        # Tokenize the texts
+        args = (
+            (examples[sentence1_key],) if sentence2_key is None
+            else (examples[sentence1_key], examples[sentence2_key])
+        )
+        result = tokenizer(*args, padding=padding,
+                           max_length=max_seq_length, truncation=True)
+        # Map labels to IDs (not necessary for GLUE tasks)
+        result["label"] = examples["label"]
+        return result
+
+    with training_args.main_process_first(desc="dataset map pre-processing"):
+        raw_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
+    if training_args.do_train:
+        if "train" not in raw_datasets:
+            raise ValueError("--do_train requires a train dataset")
+        train_dataset = raw_datasets["train"]
+        if data_args.max_train_samples is not None:
+            train_dataset = train_dataset.select(
+                range(data_args.max_train_samples))
+
+    if training_args.do_eval:
+        if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
+            raise ValueError("--do_eval requires a validation dataset")
+        eval_dataset = raw_datasets["validation_matched" if data_args.task_name ==
+                                    "mnli" else "validation"]
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(
+                range(data_args.max_eval_samples))
+
+    if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
+        if "test" not in raw_datasets and "test_matched" not in raw_datasets:
+            raise ValueError("--do_predict requires a test dataset")
+        predict_dataset = raw_datasets["test_matched" if data_args.task_name ==
+                                       "mnli" else "test"]
+        if data_args.max_predict_samples is not None:
+            predict_dataset = predict_dataset.select(
+                range(data_args.max_predict_samples))
+
+    # Log a few random samples from the training set:
+    if training_args.do_train:
+        for index in random.sample(range(len(train_dataset)), 3):
+            logger.info(
+                f"Sample {index} of the training set: {train_dataset[index]}.")
+
+    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with
+    # a predictions and label_ids field) and has to return a dictionary mapping strings to floats.
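+    # Shape sketch (an illustrative note, not part of the original script): for
+    # a binary task, `p.predictions` is (num_examples, num_labels) and
+    # `p.label_ids` is (num_examples,); the returned keys are reported by the
+    # Trainer with an `eval_` prefix, e.g. "accuracy" -> "eval_accuracy".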
+    def compute_metrics(p: EvalPrediction):
+        preds = p.predictions[0] if isinstance(
+            p.predictions, tuple) else p.predictions
+        preds = np.squeeze(
+            preds) if is_regression else np.argmax(preds, axis=1)
+        if is_regression:
+            return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
+
+        accuracy = (preds == p.label_ids).astype(np.float32).mean().item()
+        # Confusion-matrix counts, with label 1 as the positive class.
+        TP = ((preds == p.label_ids) & (preds == 1)).astype(np.float32).sum().item()
+        TN = ((preds == p.label_ids) & (preds == 0)).astype(np.float32).sum().item()
+        FN = ((preds != p.label_ids) & (preds == 0)).astype(np.float32).sum().item()
+        FP = ((preds != p.label_ids) & (preds == 1)).astype(np.float32).sum().item()
+
+        try:
+            precision = TP / (TP + FP)
+            recall = TP / (TP + FN)
+            f1score = 2 * precision * recall / (precision + recall)
+            print(f"class 1: precision={precision}, recall={recall}, f1={f1score}")
+            precision = TN / (TN + FN)
+            recall = TN / (TN + FP)
+            f1score = 2 * precision * recall / (precision + recall)
+            print(f"class 0: precision={precision}, recall={recall}, f1={f1score}")
+        except ZeroDivisionError:
+            print("float division by zero ...")
+        return {"accuracy": accuracy}
+
+    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
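+    # Three cases below: fully padded features can simply be stacked by the
+    # default collator; with fp16 we pad dynamically to a multiple of 8 so
+    # tensor cores stay efficient; otherwise `None` lets the Trainer fall back
+    # to DataCollatorWithPadding(tokenizer).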
+    if data_args.pad_to_max_length:
+        data_collator = default_data_collator
+    elif training_args.fp16:
+        data_collator = DataCollatorWithPadding(
+            tokenizer, pad_to_multiple_of=8)
+    else:
+        data_collator = None
+
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset if training_args.do_train else None,
+        eval_dataset=eval_dataset if training_args.do_eval else None,
+        compute_metrics=compute_metrics,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+    )
+
+    # Training
+    if training_args.do_train:
+        checkpoint = None
+        if training_args.resume_from_checkpoint is not None:
+            checkpoint = training_args.resume_from_checkpoint
+        elif last_checkpoint is not None:
+            checkpoint = last_checkpoint
+        train_result = trainer.train(resume_from_checkpoint=checkpoint)
+        metrics = train_result.metrics
+        max_train_samples = (
+            data_args.max_train_samples if data_args.max_train_samples is not None else len(
+                train_dataset)
+        )
+        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+
+        trainer.save_model()  # Saves the tokenizer too for easy upload
+
+        trainer.log_metrics("train", metrics)
+        trainer.save_metrics("train", metrics)
+        trainer.save_state()
+
+    # Evaluation
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        # Loop to handle MNLI double evaluation (matched, mismatched)
+        tasks = [data_args.task_name]
+        eval_datasets = [eval_dataset]
+        if data_args.task_name == "mnli":
+            tasks.append("mnli-mm")
+            eval_datasets.append(raw_datasets["validation_mismatched"])
+
+        for eval_dataset, task in zip(eval_datasets, tasks):
+            metrics = trainer.evaluate(eval_dataset=eval_dataset)
+
+            max_eval_samples = (
+                data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
+                    eval_dataset)
+            )
+            metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+
+            trainer.log_metrics("eval", metrics)
+            trainer.save_metrics("eval", metrics)
+
+    if training_args.do_predict:
+        logger.info("*** Predict ***")
+
+        # Loop to handle MNLI double evaluation (matched, mismatched)
+        tasks = [data_args.task_name]
+        predict_datasets = [predict_dataset]
+        if data_args.task_name == "mnli":
+            tasks.append("mnli-mm")
+            predict_datasets.append(raw_datasets["test_mismatched"])
+
+        for predict_dataset, task in zip(predict_datasets, tasks):
+            # Removing the `label` column because it contains -1 and Trainer won't like that.
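+            # Test splits shipped without gold labels conventionally use -1 as
+            # a placeholder value, which would break metric computation.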
+            predict_dataset = predict_dataset.remove_columns("label")
+            predictions = trainer.predict(
+                predict_dataset, metric_key_prefix="predict").predictions
+
+            # Save the raw prediction scores (logits)
+            out_predprob_file = os.path.join(
+                training_args.output_dir, "predict_results_probs.csv")
+            np.savetxt(out_predprob_file, predictions, delimiter=",")
+
+            # Save the predicted labels
+            predictions = np.squeeze(
+                predictions) if is_regression else np.argmax(predictions, axis=1)
+
+            output_predict_file = os.path.join(
+                training_args.output_dir, f"predict_results_{task}.txt")
+            if trainer.is_world_process_zero():
+                with open(output_predict_file, "w") as writer:
+                    logger.info(f"***** Predict results {task} *****")
+                    writer.write("index\tprediction\n")
+                    for index, item in enumerate(predictions):
+                        if is_regression:
+                            writer.write(f"{index}\t{item:3.3f}\n")
+                        else:
+                            item = label_list[item]
+                            writer.write(f"{index}\t{item}\n")
+
+    if training_args.push_to_hub:
+        kwargs = {"finetuned_from": model_args.model_name_or_path,
+                  "tasks": "text-classification"}
+        if data_args.task_name is not None:
+            kwargs["language"] = "en"
+            kwargs["dataset_tags"] = "glue"
+            kwargs["dataset_args"] = data_args.task_name
+            kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}"
+
+        trainer.push_to_hub(**kwargs)
+
+
+def _mp_fn(index):
+    # For xla_spawn (TPUs)
+    main()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/texts/MAGE/training/longformer/train.sh b/src/texts/MAGE/training/longformer/train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..915d275a69b024ff51fabed9110c881708050a3d
--- /dev/null
+++ b/src/texts/MAGE/training/longformer/train.sh
@@ -0,0 +1,26 @@
+# MODEL=bert-base-cased
+plm_dir="allenai/longformer-base-4096"
+seed=42629309
+data_path="./data/cross_domains_cross_models"
+train_file="$data_path/train.csv"
+valid_file="$data_path/valid.csv"
+out_dir="./output_samples_${seed}_lfbase"
+time=$(date +'%m:%d:%H:%M')
+mkdir -p $out_dir
+
+CUDA_VISIBLE_DEVICES=0 python3 main.py \
+    --do_train \
+    --model_name_or_path $plm_dir \
+    --do_eval \
+    --train_file $train_file \
+    --validation_file $valid_file \
+    --max_seq_length 2048 \
+    --per_device_train_batch_size 2 \
+    --learning_rate 3e-5 \
+    --num_train_epochs 5 \
+    --evaluation_strategy steps \
+    --eval_steps 1000 \
+    --overwrite_output_dir \
+    --gradient_accumulation_steps 8 \
+    --fp16 \
+    --output_dir $out_dir 2>&1 | tee $out_dir/log.train.$time
diff --git a/src/texts/PASTED/pasted_lexicon.py b/src/texts/PASTED/pasted_lexicon.py
new file mode 100644
index 0000000000000000000000000000000000000000..34503611092998ee7099965ce80939fb355f3036
--- /dev/null
+++ b/src/texts/PASTED/pasted_lexicon.py
@@ -0,0 +1,56 @@
+from transformers import (
+    AutoModelForTokenClassification,
+    AutoTokenizer,
+)
+from nltk.tokenize import sent_tokenize
+import torch
+import numpy as np
+
+
+class Detector:
+    def __init__(self, model_name, device):
+        if "classification" in model_name:
+            num_labels = 2
+        elif "multi-dimension" in model_name:
+            num_labels = 3
+        else:
+            num_labels = 1
+        self.model = AutoModelForTokenClassification.from_pretrained(
+            model_name, num_labels=num_labels
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.device = device
+
+        self.model.to(device)
+        self.model.eval()
+
+    @torch.no_grad()
+    def __call__(self, text, preprocess=True, threshold=None):
+        """
+        Scores `text` sentence by sentence. Returns a list of
+        (sentence, score) pairs, or of (sentence, bool) pairs when a
+        `threshold` is given.
+        """
+        if preprocess:
+            sents = sent_tokenize(text)
+            text = " ".join(sents)
+        else:
+            sents = text.split(" ")
+        input_ids = self.tokenizer(text, max_length=2048, truncation=True)["input_ids"]
+
+        # Token id 2 is assumed to be the `</s>` separator of the
+        # Longformer/RoBERTa vocabulary; these positions mark sentence ends.
+        sent_label_idx = [i for i, ids in enumerate(input_ids) if ids == 2]
+
+        tensor_input = torch.tensor([input_ids]).to(self.device)
+        outputs = self.model(tensor_input).logits.detach().cpu().numpy()
+        outputs_logits = outputs[0][sent_label_idx]
+        outputs_logits: np.ndarray
+
+        if outputs_logits.shape[1] == 2:
+            outputs_logits = outputs_logits[:, 1]
+        elif outputs_logits.shape[1] == 3:
+            outputs_logits = outputs_logits.mean(axis=-1)
+        outputs_logits = outputs_logits.flatten()
+        if threshold is None:
+            return list(zip(sents, outputs_logits.tolist()))
+        else:
+            return list(zip(sents, (outputs_logits > threshold).tolist()))
\ No newline at end of file
diff --git a/src/texts/Roberta/__init__.py b/src/texts/Roberta/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/texts/Search_Text/__init__.py b/src/texts/Search_Text/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/texts/Search_Text/_google_search_engine_testing_share.py b/src/texts/Search_Text/_google_search_engine_testing_share.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba4363f17d9263986753ac69f78bac628f487be6
--- /dev/null
+++ b/src/texts/Search_Text/_google_search_engine_testing_share.py
@@ -0,0 +1,409 @@
+# from _detection import bart_score_in_batch
+from dotenv import load_dotenv
+import requests
+import numpy as np
+from collections import Counter
+import math
+import re
+import torch
+import os
+
+from bs4 import BeautifulSoup
+from nltk.tokenize import sent_tokenize, word_tokenize
+from sentence_transformers import SentenceTransformer, util
+from PyPDF2 import PdfReader
+from docx import Document
+from nltk.corpus import stopwords
+import nltk
+
+from identity import extract_entities
+
+load_dotenv()
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
+
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+BATCH_SIZE = 8
+MAX_URL_SIZE = 2000000  # ~2MB
+
+# Download necessary NLTK data files
+nltk.download('punkt')
+nltk.download('punkt_tab')
+nltk.download('stopwords')
+
+PARAPHRASE_THRESHOLD = 0.8
+PARAPHRASE_THRESHOLD_FOR_OPPOSITE = 0.7
+MIN_RATIO_PARAPHASE_NUM = 0.5
+MIN_SAME_SENTENCE_LEN = 6
+MIN_PHRASE_SENTENCE_LEN = 10
+
+# # parameters for demonstration
+# MAX_URL_SIZE = 1000000  # ~1MB
+# PARAPHRASE_THRESHOLD = 0.9
+# PARAPHRASE_THRESHOLD_FOR_OPPOSITE = 0.7
+# MIN_RATIO_PARAPHASE_NUM = 0.7
+# MIN_SAME_SENTENCE_LEN = 6
+# MIN_PHRASE_SENTENCE_LEN = 13
+
+
+def google_search(query, api_key=GOOGLE_API_KEY, cse_id=SEARCH_ENGINE_ID, is_exactTerms=True):
+    url = "https://www.googleapis.com/customsearch/v1"
+    if is_exactTerms:
+        params = {
+            "exactTerms": query,
+            "key": api_key,
+            "cx": cse_id,
+            "num": 10,  # Number of results
+        }
+    else:
+        new_query = query.replace('"', "")
+        params = {
+            "q": new_query,
+            "key": api_key,
+            "cx": cse_id,
+            "num": 10,  # Number of results
+        }
+    response = requests.get(url, params=params)
+    if response.status_code == 200:
+        return response.json()
+    else:
+        print(f"Error: {response.status_code}, {response.text}")
+        return None
+
+
+def get_most_frequent_words(input_text):
+    top_words = get_top_words_without_stop_words(input_text, number_word=32)
+    words = []
+    for item in top_words:
+        words.append(item[0])
+    return words
+
+
+def get_candidate_phrase_for_relative_search(input_text, num_chunk=3, chunk_length=32):
+    result = []
+
+    # Method 1: Get most frequent words
+    top_words = get_top_words_without_stop_words(input_text, number_word=32)
+    words = []
+    for item in top_words:
+        words.append(item[0])
+    result.append(" ".join(words[:16]))
+    if len(words) > 16:
+        result.append(" ".join(words[:32]))
+
+    # Method 2: Get the whole text
+    result.append(input_text)
+
+    # Method 3: Split text by chunks of 32 words
+    input_words = input_text.split(" ")
+    for i in range(num_chunk):
+        start_index = i * chunk_length
+        end_index = (i + 1) * chunk_length
+        if start_index < len(input_words):
+            candidate = " ".join(input_words[start_index:end_index])
+            result.append(candidate)
+
+    return result
+
+
+def check_if_html(url):
+    try:
+        # Step 1: Send a HEAD request to check the Content-Type
+        response = requests.head(url, allow_redirects=True, timeout=10)
+        content_type = response.headers.get('Content-Type', '')
+
+        # Check if Content-Type indicates HTML
+        if 'text/html' in content_type.lower():
+            return True
+
+        # Step 2: If Content-Type is ambiguous or missing, fetch the response body
+        response = requests.get(url, timeout=10)
+
+        # Step 3: Search for an opening HTML tag in the content.
+        # (The original pattern was lost; matching "<html" is an assumption.)
+        if re.search(r"<html", response.text, re.IGNORECASE):
+            return True
+        else:
+            return False
+    except requests.RequestException as e:
+        print(f"Error checking URL: {e}")
+        return False
+
+
+def find_by_relative_search(input_text, is_support_opposite=False):
+    checked_urls = set()
+    searched_candidates = []
+
+    # Get most frequent words
+    top_words = get_most_frequent_words(input_text)
+
+    # Find identities
+    # entities = extract_entities(input_text)
+
+    # Make a search text based on the most frequent words and entities
+    # searched_candidates.append(" ".join(entities[:16]) + " " + " ".join(top_words[:16]))
+
+    # Find phrases
+    searched_candidates = searched_candidates + get_candidate_phrase_for_relative_search(input_text)
+
+    for candidate in searched_candidates:
+        search_results = google_search(candidate, GOOGLE_API_KEY, SEARCH_ENGINE_ID, is_exactTerms=False)
+        urls = [item['link'] for item in search_results.get("items", [])]
+
+        for url in urls[:5]:
+            if url in checked_urls:  # already checked
+                continue
+            checked_urls.add(url)
+            size = get_url_size(url)
+            if size is not None and size <= MAX_URL_SIZE:
+                if check_if_html(url):
+                    paraphrase_threshold = PARAPHRASE_THRESHOLD
+                    if is_support_opposite:
+                        paraphrase_threshold = PARAPHRASE_THRESHOLD_FOR_OPPOSITE
+                    is_paraphrase, data = check_paraphrase(input_text, url, paraphrase_threshold=paraphrase_threshold)
+                    if is_paraphrase:
+                        return is_paraphrase, url, data
+    return False, None, []
+
+
+PARAPHASE_MODEL = None
+
+
+def split_to_sentences(input_text):
+    """
+    Splits the input text on newlines, then sentence-tokenizes each paragraph.
+    """
+    paragraphs = input_text.split("\n")
+    result = []
+    for paragraph in paragraphs:
+        paragraph = paragraph.strip()
+        if paragraph != "":
+            sentences = sent_tokenize(paragraph)
+            result.extend(sentences)
+    return result
+
+
+def longest_common_subsequence(arr1, arr2):
+    n = len(arr1)
+    m = len(arr2)
+
+    # Create a dp table of size (n+1) x (m+1)
+    dp = [[0] * (m + 1) for _ in range(n + 1)]
+    max_length = 0  # length of the longest common contiguous run found so far
+
+    for i in range(1, n + 1):
+        for j in range(1, m + 1):
+            # The elements match
+            if arr1[i - 1] == arr2[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1] + 1
+                max_length = max(max_length, dp[i][j])
+            else:
+                dp[i][j] = 0  # reset to 0: the common run must be contiguous
+
+    return max_length
+
+
+def check_individual_sentence(input_sentence, source_sentence, min_same_sentence_len, min_phrase_sentence_len, verbose=False):
+    input_sent = input_sentence.strip()
+    source_sent = source_sentence.strip()
+    input_words = input_sent.split(" ")
+    source_words = source_sent.split(" ")
+    result = False
+    if input_sent == source_sent and len(input_words) >= min_same_sentence_len:
+        result = True
+    else:
+        max_overlap_len = longest_common_subsequence(input_words, source_words)
+        if max_overlap_len >= min_phrase_sentence_len:
+            result = True
+
+    if verbose and result:
+        max_overlap_len = longest_common_subsequence(input_words, source_words)
+        print(f"max_overlap_len = {max_overlap_len}")
+    return result
+
+
+def download_file(url, output_dir="downloads"):
+    """
+    Downloads a file from the given URL and saves it locally.
+    """
+    response = requests.get(url, stream=True)
+    if response.status_code == 200:
+        os.makedirs(output_dir, exist_ok=True)
+        file_name = url.split("/")[-1]
+        file_path = os.path.join(output_dir, file_name)
+        with open(file_path, "wb") as file:
+            file.write(response.content)
+        return file_path
+    else:
+        print(f"Failed to download {url}: {response.status_code}")
+        return None
+
+
+def extract_text_from_pdf(file_path):
+    """
+    Extracts text from a PDF file.
+    """
+    reader = PdfReader(file_path)
+    text = ""
+    for page in reader.pages:
+        text += page.extract_text()
+    return text
+
+
+def extract_text_from_docx(file_path):
+    """
+    Extracts text from a DOCX file.
+    """
+    doc = Document(file_path)
+    text = ""
+    for paragraph in doc.paragraphs:
+        text += paragraph.text + "\n"
+    return text
+
+
+def extract_text_from_html(url):
+    """
+    Extracts text from an HTML page.
+    """
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, "html.parser")
+    return soup.get_text(separator="\n")
+
+
+def extract_text(url):
+    """
+    Determines the file type and extracts text accordingly.
+    """
+    try:
+        file_extension = url.split('.')[-1].lower()
+        if file_extension in ["html", "htm"]:
+            return extract_text_from_html(url)
+        elif file_extension == "pdf":
+            file_path = download_file(url)
+            return extract_text_from_pdf(file_path) if file_path else None
+        elif file_extension in ["doc", "docx"]:
+            file_path = download_file(url)
+            return extract_text_from_docx(file_path) if file_path else None
+        else:
+            print(f"Unsupported file type: {file_extension}")
+            return extract_text_from_html(url)
+    except Exception:
+        return ""
+
+
+def check_paraphrase(
+        input_text,
+        url,
+        paraphrase_threshold=PARAPHRASE_THRESHOLD,
+        min_ratio=MIN_RATIO_PARAPHASE_NUM,
+        min_same_sentence_len=MIN_SAME_SENTENCE_LEN,
+        min_phrase_sentence_len=MIN_PHRASE_SENTENCE_LEN,
+        verbose=False):
+    """
+    Checks whether input_text is paraphrased by the content at url.
+
+    input
+      - input_text: the text to check
+      - url: the page to compare against
+      - paraphrase_threshold: minimum cosine similarity to count a sentence as a paraphrase
+      - min_ratio: minimum ratio (rounded up) of input sentences that must have a paraphrase
+
+    output
+      - True/False => paraphrase or not
+      - a list of elements; each element holds:
+        . input sentence
+        . matched sentence (from source)
+        . similarity
+        . True/False whether the similarity satisfies the threshold
+    """
+    is_paraphrase_text = False
+
+    if input_text is None:
+        return is_paraphrase_text, []
+    input_sentences = split_to_sentences(input_text)
+    page_text = extract_text(url)
+
+    if page_text is None:
+        return is_paraphrase_text, []
+    page_sentences = split_to_sentences(page_text)
+    if len(input_sentences) == 0 or len(page_sentences) == 0:
+        return is_paraphrase_text, []
+    global PARAPHASE_MODEL
+    if PARAPHASE_MODEL is None:
+        PARAPHASE_MODEL = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+        PARAPHASE_MODEL.to(DEVICE)
+    total_sentence = len(input_sentences)
+    min_matching = int(math.ceil(total_sentence * min_ratio))
+
+    # Encode sentences into embeddings
+    embeddings1 = PARAPHASE_MODEL.encode(input_sentences, convert_to_tensor=True, device=DEVICE)
+    embeddings2 = PARAPHASE_MODEL.encode(page_sentences, convert_to_tensor=True, device=DEVICE)
+
+    # Compute cosine similarity between each pair of sentences
+    similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
+
+    # Align sentences
+    alignment = []
+    count = 0
+
+    for i, sentence1 in enumerate(input_sentences):
+        max_sim_index = np.argmax(similarity_matrix[i])
+        max_similarity = similarity_matrix[i][max_sim_index]
+        if max_similarity > paraphrase_threshold:  # Threshold for paraphrase alignment
+            is_paraphrase_sentence = True
+            count += 1
+        else:
+            is_paraphrase_sentence = False
+
+        item = [sentence1, page_sentences[max_sim_index], max_similarity, is_paraphrase_sentence]
+        if not is_paraphrase_text and check_individual_sentence(
+                sentence1, page_sentences[max_sim_index], min_same_sentence_len, min_phrase_sentence_len):
+            is_paraphrase_text = True
+            if verbose:
+                print(f"sentence1 = {sentence1}")
+                print(f"page_sentences[max_sim_index] = {page_sentences[max_sim_index]}")
+        alignment.append(item)
+    if count >= min_matching:
+        is_paraphrase_text = True
+
+    if verbose:
+        print(f"min_matching = {min_matching}")
+        print(f"len(input_sentences) = {len(input_sentences)}")
+        print(f"count = {count}")
+        print(f"is_paraphrase_text = {is_paraphrase_text}")
+        for item in alignment:
+            print(item)
+    return is_paraphrase_text, alignment
+
+
+def get_url_size_by_head(url):
+    try:
+        response = requests.head(url, allow_redirects=True)
+        if 'Content-Length' in response.headers:
+            size = int(response.headers['Content-Length'])
+            return size
+        else:
+            print("Content-Length header is not available.")
+            return None
+    except requests.RequestException as e:
+        print(f"Error: {e}")
+        return None
+
+
+def get_url_size(url):
+    size = get_url_size_by_head(url)
+    return size
+
+
+def get_top_words_without_stop_words(input_text, number_word=15):
+    words = word_tokenize(input_text)
+
+    stop_words = set(stopwords.words('english'))
+    filtered_words = [word for word in words if word.isalnum() and word.lower() not in stop_words]
+    word_frequencies = Counter(filtered_words)
+    top_words = word_frequencies.most_common(number_word)
+
+    return top_words
+
+
+if __name__ == "__main__":
+    pass
diff --git a/src/texts/Search_Text/_text_detection_share.py b/src/texts/Search_Text/_text_detection_share.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d1abe9bc2261d654d72ef1b882561b143912717
--- /dev/null
+++ b/src/texts/Search_Text/_text_detection_share.py
@@ -0,0 +1,100 @@
+from transformers import pipeline
+from _google_search_engine_testing_share import find_by_relative_search
+import math
+
+PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
+WORD_FREQUENCY = None
+
+DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
"Hello-SimpleAI/chatgpt-detector-roberta" +""" +data/MAGE/xsum_human.csv = {'HUMAN': 64, 'MACHINE': 36} correction = 20 => 84% +data/MAGE/xsum_machine_topical_gpt-3.5-trubo.csv = {'HUMAN': 3, 'MACHINE': 97} => correction = 3 => 94% + original acc = (64+97)/ 200 = 80.5% + improve = (84 + 94) / 200 = 89% + different = 8.5% + +https://huggingface.co/datasets/RealTimeData/bbc_news_alltime = {'HUMAN': 82, 'MACHINE': 18} => corrected 16 => 98% + +""" + +MODEL_HUMAN_MATCHING = dict() +MODEL_HUMAN_MATCHING[DEFAULT_MODEL] = "Human" + +HUMAN = "HUMAN" +MACHINE = "MACHINE" + +UNKNOWN = "UNKNOWN" +PARAPHASE = "PARAPHASE" +NON_PARAPHASE = "NON_PARAPHASE" + + +def detect_by_huggingface_model(input_text, model = DEFAULT_MODEL, max_length=512): + """ + trả về kết quả là "HUMAN" hay "MACHINE" và confidence score (int) + """ + pipe = pipeline("text-classification", model=model,tokenizer=model, max_length=512, truncation=True, device_map="auto") + result = pipe(input_text)[0] + confidence_score = result['score'] + if result['label'] == MODEL_HUMAN_MATCHING[model]: + return HUMAN, confidence_score + else: + return MACHINE, confidence_score + +def check_human(data, min_ratio = 0.7): + """ + input: + - data have item: + + input sentence + + source sentence + + similarity + + True/False : paraphrase or not + output: + is human (True/False) + """ + total_sentence = len(data) + min_matching = int(math.ceil(total_sentence * min_ratio)) + count = 0 + for input_sentence, source_sentence, similiarity, is_paraprhase in data: + if input_sentence in source_sentence: + count += 1 + if count >= min_matching: + return True + else: + return False + +def abstract_detect_generated_text(input_text): + """ + Assists to detect the source of text using the search engine + Output + - prediction by search engine (HUMAN/MACHINE/UNKNOWN) + - Prediction by SOTA (HUMAN/MACHINE) + - SOTA confidence (float) + - url to website (None if UNKNOWN) + - pair of sentences. 
+        Each item ([] if empty) contains:
+        - input sentence
+        - best-matching source sentence from the url
+        - matching result between input/source sentence (PARAPHASE/NON_PARAPHASE)
+    """
+    is_support_opposite = False
+    is_paraphrase, found_url, data = find_by_relative_search(input_text, is_support_opposite)
+    sentence_pairs = []
+    SOTA_prediction, SOTA_confidence = detect_by_huggingface_model(input_text)
+    if not is_paraphrase:
+        search_engine_prediction = UNKNOWN
+    else:
+        if check_human(data):
+            search_engine_prediction = HUMAN
+        else:
+            search_engine_prediction = MACHINE
+    for input_sentence, source_sentence, similarity, is_paraphrase_pair in data:
+        if is_paraphrase_pair:
+            check_paraphrase = PARAPHASE
+        else:
+            check_paraphrase = NON_PARAPHASE
+        sentence_pairs.append([input_sentence, source_sentence, check_paraphrase])
+
+    return search_engine_prediction, SOTA_prediction, SOTA_confidence, found_url, sentence_pairs
+
+
+if __name__ == "__main__":
+    pass
diff --git a/src/texts/Search_Text/chatgpt_detector_roberta.py b/src/texts/Search_Text/chatgpt_detector_roberta.py
new file mode 100644
index 0000000000000000000000000000000000000000..14f6ebb3dc095bda0d314400cc48732c4bd317db
--- /dev/null
+++ b/src/texts/Search_Text/chatgpt_detector_roberta.py
@@ -0,0 +1,119 @@
+import math
+
+from _google_search_engine_testing_share import find_by_relative_search
+from transformers import pipeline
+
+# TODO: move to a config file
+# Constants should be UPPER_SNAKE_CASE
+PROOFREAD_FILE = "data/1_proofread/xsum/gpt-4o-mini_with_best_similarity.csv"
+WORD_FREQUENCY = None
+
+DEFAULT_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
+
+MODEL_HUMAN_LABEL = {DEFAULT_MODEL: "Human"}
+
+HUMAN = "HUMAN"
+MACHINE = "MACHINE"
+UNKNOWN = "UNKNOWN"
+PARAPHRASE = "PARAPHRASE"
+NON_PARAPHRASE = "NON_PARAPHRASE"
+
+
+def detect_ai_content(
+    input_text: str,
+    model: str = DEFAULT_MODEL,
+    max_length: int = 512,
+) -> tuple:
+    """
+    Detects if text is human or machine generated.
+
+    Returns:
+        tuple: (label, confidence_score)
+        where label is HUMAN or MACHINE.
+    """
+    try:
+        pipe = pipeline(
+            "text-classification",
+            model=model,
+            tokenizer=model,
+            max_length=max_length,
+            truncation=True,
+            device_map="auto",  # good for GPU usage
+        )
+        result = pipe(input_text)[0]
+        confidence_score = result["score"]
+        if result["label"] == MODEL_HUMAN_LABEL[model]:
+            label = HUMAN
+        else:
+            label = MACHINE
+        return label, confidence_score
+    except Exception as e:  # Add exception handling
+        print(f"Error in Roberta model inference: {e}")
+        return UNKNOWN, 0.0  # Return UNKNOWN and 0.0 confidence if error
+
+
+def check_human(data, min_ratio=0.7):
+    """
+    Checks if a sufficient number of input sentences are found within
+    source sentences.
+
+    Returns:
+        bool: True if the condition is met, False otherwise.
+    """
+    if not data:  # Handle empty data case
+        return False
+    min_matching = math.ceil(len(data) * min_ratio)
+
+    count = 0
+    for sentence in data:
+        if sentence["similarity"] >= 0.99:
+            count += 1
+    print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
+    if count >= min_matching:
+        return True
+    return False
+
+
+def abstract_detect_generated_text(input_text):
+    """
+    Abstracts the process of detecting generated text using search
+    and a classification model.
+
+    Returns:
+        tuple: (
+            search_engine_prediction,
+            SOTA_prediction,
+            SOTA_confidence,
+            found_url,
+            sentence_pairs,
+        )
+    """
+    is_paraphrase, found_url, data = find_by_relative_search(
+        input_text,
+        is_support_opposite=False,
+    )  # Explicitly set the keyword argument
+    SOTA_prediction, SOTA_confidence = detect_ai_content(input_text)
+
+    if not is_paraphrase:
+        search_engine_prediction = UNKNOWN
+    else:
+        search_engine_prediction = HUMAN if check_human(data) else MACHINE
+
+    sentence_pairs = []
+    if data:  # Check if data is not empty to avoid error when iterating
+        for input_sentence, source_sentence, _, is_paraphrase in data:
+            check_paraphrase = PARAPHRASE if is_paraphrase else NON_PARAPHRASE
+            sentence_pairs.append(
+                [input_sentence, source_sentence, check_paraphrase],
+            )
+
+    return (
+        search_engine_prediction,
+        SOTA_prediction,
+        SOTA_confidence,
+        found_url,
+        sentence_pairs,
+    )
diff --git a/src/texts/Search_Text/comparison.py b/src/texts/Search_Text/comparison.py
new file mode 100644
index 0000000000000000000000000000000000000000..f56464798117efe2c21a0235570588152e8405fb
--- /dev/null
+++ b/src/texts/Search_Text/comparison.py
@@ -0,0 +1,217 @@
+import pandas as pd
+import re
+import csv
+from collections import Counter
+from difflib import Differ
+import nltk
+from nltk.corpus import stopwords
+nltk.download('stopwords')
+
+
+def remove_stop_words(word_list):
+    """
+    Removes stop words from a list of single words.
+
+    Args:
+        word_list: A list of single words.
+
+    Returns:
+        A new list containing only the words that are not stop words.
+    """
+    stop_words = set(stopwords.words('english'))  # Get English stop words
+
+    # Define characters to remove: anything that is not a letter or digit
+    chars_to_remove = r'[^a-zA-Z0-9]'
+
+    cleaned_words = []
+    for word in word_list:
+        # Remove punctuation and special characters
+        word = re.sub(chars_to_remove, '', word)
+
+        # Skip single digits and single letters
+        if len(word) > 1 and not word.isdigit():
+            # Check if the word is not a stop word
+            if word.lower() not in stop_words:
+                cleaned_words.append(word)
+
+    return cleaned_words
+
+
+def write_word_counts_to_csv(data):
+    """Writes added/removed word counts to CSV files.
+
+    Args:
+        data: A dictionary with 'added_word_counts' and
+            'removed_word_counts' entries; the output paths are
+            hard-coded under data/results/.
+    """
+    with open('data/results/[res]added_word_counts.csv', 'w', encoding='utf-8', newline='') as csvfile:
+        fieldnames = ['Word', 'Count']
+        writer = csv.writer(csvfile)
+        writer.writerow(fieldnames)
+
+        for word, count in data['added_word_counts']:
+            writer.writerow([word, count])
+
+    with open('data/results/[res]removed_word_counts.csv', 'w', encoding='utf-8', newline='') as csvfile:
+        fieldnames = ['Word', 'Count']
+        writer = csv.writer(csvfile)
+        writer.writerow(fieldnames)
+
+        for word, count in data['removed_word_counts']:
+            writer.writerow([word, count])
+
+    # with open('data/results/[res]unchanged_words.csv', 'w', encoding='utf-8', newline='') as csvfile:
+    #     fieldnames = ['Count', 'Phrase']
+    #     writer = csv.writer(csvfile)
+    #     writer.writerow(fieldnames)  # Write the header
+    #     for phrase, count in data['unchanged_words']:
+    #         writer.writerow([count, phrase])
+
+
+def preprocess_text(text):
+    """
+    Preprocesses a string by removing punctuation, numbers, and whitespace.
+
+    Args:
+        text: The string to preprocess.
+
+    Returns:
+        The preprocessed string.
+ """ + + # Lower case + text = text.lower() + + # Split text into words while keeping commas and dots within numbers + delimiters = r"(?= 4: + unchanged_phrase = " ".join(substring.split()) + unchanged_phrases.append((unchanged_phrase, count)) + substring = "" + count = 0 + continue + substring += " " + word + count += 1 + + return removed_ngrams, added_ngrams, unchanged_phrases + + +if __name__ == "__main__": + res = compare_strings_from_csv("data/ChatGPT_Nous_Hermes_2_Yi_34B_openchat_3_5_1210_with_best_similarity.csv") + write_word_counts_to_csv(res) + + #remove_stop_words(["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]) \ No newline at end of file diff --git a/src/texts/Search_Text/evaluation.py b/src/texts/Search_Text/evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..3bbc82b49c4574044b902a3569e56936982e12b9 --- /dev/null +++ b/src/texts/Search_Text/evaluation.py @@ -0,0 +1,145 @@ +import csv +import time + +import pandas as pd +from chatgpt_detector_roberta import ( + check_human, + detect_ai_content, +) +from search_text import detect_by_relative_search + +HUMAN = "HUMAN" +MACHINE = "MACHINE" + + +def read_csv_column(file_path, column_name, data_size=100): + """ + Reads a CSV file and extracts data from the specified column. + + Args: + filename: Path to the CSV file. + column_name: Name of the column to extract data from. + + Returns: + A list containing the data from the specified column. + """ + + try: + df = pd.read_csv(file_path) + column_data = df[column_name].tolist() + return column_data[:data_size] + except FileNotFoundError: + print(f"Error: File '{file_path}' not found.") + return [] + except KeyError: + print(f"Error: Column '{column_name}' not found in the CSV file.") + return [] + + +def evaluation(texts): + results = [] + index = 0 + for text in texts: +<<<<<<< HEAD + print("-" * 50) + print(f"index = {index}\t {text[:100]}") + bbc = [22, 32, 39, 43, 44, 64, 97] + if index not in bbc: +======= + if index <= 82: + print(f"index = {index}") +>>>>>>> 59d2492d76034c795f0dbf2632f17d366fb31f14 + index += 1 + continue + + # Classify by SOTA model + # SOTA_prediction, SOTA_confidence = detect_by_huggingface_model(text) + SOTA_prediction, SOTA_confidence = detect_ai_content(text) + + # Classify by search engine + # is_paraphrased, _, data = find_by_relative_search(text) + is_paraphrased, _, data = detect_by_relative_search(text) + if not is_paraphrased: + search_engine_prediction = "UNKNOWN" + else: + if check_human(data): + search_engine_prediction = HUMAN + else: + search_engine_prediction = MACHINE + print( + f"RESULTS:\t{SOTA_prediction}\t{search_engine_prediction}" + ) + results.append( + (index, SOTA_prediction, SOTA_confidence, search_engine_prediction) + ) + + with open("eva_bbc_test.csv", "a", newline="") as csvfile: + #with open("eva_MAGE_test.csv", "a", newline="") as csvfile: + writer = csv.writer(csvfile) + writer.writerow( + [index, SOTA_prediction, SOTA_confidence, search_engine_prediction] + ) + index += 1 + time.sleep(1) # avoid 100? 
+
+    # Define the column names
+    # columns = [
+    #     "index",
+    #     "SOTA_prediction",
+    #     "SOTA_confidence",
+    #     "search_engine_prediction",
+    # ]
+
+    # # Create the DataFrame
+    # df = pd.DataFrame(results, columns=columns)
+
+    # # Statistics
+    # search_engine_acc = df["search_engine_prediction"].value_counts()["HUMAN"] / len(df)
+    # SOTA_acc = df["SOTA_prediction"].value_counts()["HUMAN"] / len(df)
+
+    # # Filter the DataFrame based on the given conditions
+    # filtered_df = df[
+    #     (df["SOTA_prediction"] == "MACHINE")
+    #     & (df["search_engine_prediction"] == "HUMAN")
+    # ]
+
+    # print(f"Total data: {len(df)}")
+    # print(f"SOTA accuracy: {SOTA_acc}")
+    # print(f"Search engine accuracy: {search_engine_acc}")
+    # print(f"Correction sample: {len(filtered_df)}")
+
+
+def extract_machine_data(file_path):
+    df = pd.read_csv(file_path)
+    machine_data = df[df["src"] == "xsum_machine_topical_gpt-3.5-trubo"]
+
+    # write to file
+    machine_data.to_csv("machine_data.csv", index=False)
+
+
+def extract_human_data(file_path):
+    df = pd.read_csv(file_path)
+    human_data = df[df["src"] == "xsum_human"]
+
+    # write to file
+    human_data.to_csv("human_data.csv", index=False)
+
+
+if __name__ == "__main__":
+    # extract_machine_data('data/test_data/test.csv')
+
+    # BBC
+    file_path = "data/test_data/test_100_bbc.csv"
+    column_name = "content"
+
+    # MAGE
+    # file_path = "data/test_data/test_100_MAGE.csv"
+    # column_name = "text"
+
+    contents = read_csv_column(
+        file_path=file_path,
+        column_name=column_name,
+        data_size=100,
+    )
+    evaluation(contents)
\ No newline at end of file
diff --git a/src/texts/Search_Text/fake_text_generation_share.py b/src/texts/Search_Text/fake_text_generation_share.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9ec69d7bc7484cd53aa9b372ae441f31cf6a2ca
--- /dev/null
+++ b/src/texts/Search_Text/fake_text_generation_share.py
@@ -0,0 +1,53 @@
+from difflib import SequenceMatcher
+
+
+def highlight_overlap_by_word_to_list(text1, text2):
+    """
+    Returns:
+      - the list of words in text1
+      - the list of words in text2
+      - the indices of highlighted (overlapping) words in text1
+      - the indices of highlighted (overlapping) words in text2
+    """
+    # Split the strings into words on whitespace
+    words1 = text1.split()
+    words2 = text2.split()
+
+    index1 = []
+    index2 = []
+
+    # Use SequenceMatcher to find the overlapping spans between the word lists
+    matcher = SequenceMatcher(None, words1, words2)
+
+    highlighted_text1 = []
+    highlighted_text2 = []
+
+    # Track the current position in words1 and words2
+    current_pos1 = 0
+    current_pos2 = 0
+
+    # Iterate over the matching blocks
+    for match in matcher.get_matching_blocks():
+        start1, start2, length = match
+
+        # Append the non-overlapping words unchanged
+        highlighted_text1.extend(words1[current_pos1:start1])
+        highlighted_text2.extend(words2[current_pos2:start2])
+
+        if length > 0:
+            for i in range(start1, start1 + length):
+                index1.append(i)
+            for i in range(start2, start2 + length):
+                index2.append(i)
+
+        # Update the current positions
+        current_pos1 = start1 + length
+        current_pos2 = start2 + length
+
+    return words1, words2, index1, index2
+
+
+if __name__ == "__main__":
+    pass
diff --git a/src/texts/Search_Text/identity.py b/src/texts/Search_Text/identity.py
new file mode 100644
index 0000000000000000000000000000000000000000..e05f5d34beaf8484600aff4d3bb1502915aff3ab
--- /dev/null
+++ b/src/texts/Search_Text/identity.py
@@ -0,0 +1,63 @@
+from transformers import pipeline
+
+ner_pipeline = pipeline("ner")
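+
+# The default "ner" pipeline emits WordPiece sub-tokens prefixed with "##";
+# the helpers below reassemble them into whole words before deduplication.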
+def extract_entities(text):
+    output = ner_pipeline(text)
+    words = extract_words(output)
+    words = combine_subwords(words)
+
+    # Deduplicate the entity words while preserving their order
+    entities = []
+    for entity in words:
+        if entity not in entities:
+            entities.append(entity)
+
+    return entities
+
+
+def extract_words(entities):
+    """
+    Extracts the words from a list of entities.
+
+    Args:
+        entities: A list of entities.
+
+    Returns:
+        A list of words extracted from the entities.
+    """
+    words = []
+    for entity in entities:
+        words.append(entity["word"])
+    return words
+
+
+def combine_subwords(word_list):
+    """
+    Combines subwords (indicated by "##") with the preceding word in a list.
+
+    Args:
+        word_list: A list of words, where subwords are prefixed with "##".
+
+    Returns:
+        A new list with subwords combined with their preceding words.
+    """
+    result = []
+    i = 0
+    while i < len(word_list):
+        if word_list[i].startswith("##"):
+            result[-1] += word_list[i][2:]  # Remove "##" and append to the previous word
+        elif i < len(word_list) - 2 and word_list[i + 1] == "-":  # Combine hyphenated words
+            result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
+            i += 2  # Skip the hyphen and the word after it
+        else:
+            result.append(word_list[i])
+        i += 1
+    return result
+
+
+if __name__ == "__main__":
+    text = "The Saudi authorities, I am told, are currently working flat out " \
+           "to collate everything they have on the Magdeburg market suspect, " \
+           "Taleb al-Abdulmohsen, and to share it with Germany's ongoing " \
+           "investigation"
+    print(extract_entities(text))
\ No newline at end of file
diff --git a/src/texts/Search_Text/search_text.py b/src/texts/Search_Text/search_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b0c426ddc8936e98707598961293b1e2deb4142
--- /dev/null
+++ b/src/texts/Search_Text/search_text.py
@@ -0,0 +1,791 @@
+import warnings
+
+from bs4 import BeautifulSoup
+
+from identity import extract_entities
+
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import time
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+import re
+from collections import Counter
+import string
+import nltk
+import torch
+from nltk.corpus import stopwords
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.util import ngrams
+from sentence_transformers import SentenceTransformer, util
+import math
+
+from dotenv import load_dotenv
+from difflib import SequenceMatcher
+import os
+import requests
+import csv
+from newspaper import article, ArticleException, ArticleBinaryDataException
+
+
+# Google Cloud Console
+load_dotenv()
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
+
+# Download necessary NLTK data files
+nltk.download('punkt', quiet=True)
+nltk.download('punkt_tab', quiet=True)
+nltk.download('stopwords', quiet=True)
+
+# Load the paraphrase model
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+PARAPHASE_MODEL = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+PARAPHASE_MODEL.to(DEVICE)
+
+BATCH_SIZE = 8
+MAX_URL_SIZE = 2000000  # ~2MB
+
+PARAPHRASE_THRESHOLD = 0.8
+PARAPHRASE_THRESHOLD_FOR_OPPOSITE = 0.7
+MIN_SAME_SENTENCE_LEN = 6
+MIN_PHRASE_SENTENCE_LEN = 10
+MIN_RATIO_PARAPHRASE_NUM = 0.7
+MAX_CHAR_SIZE = 30000
+
+
+def clean_text(text):
+    """Doc cleaning"""
+    # , and . are kept out of the punctuation set because they appear in numbers
+    punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
+    # Lowering text
+    text = text.lower()
+
+    # Removing punctuation
+    text = "".join([c for c in text if c not in punctuations])
+
+    # Removing whitespace and newlines
+    text = re.sub(r'\s+', ' ', text)
+
+    # str.replace returns a new string, so assign the result back
+    text = text.replace("£", " * ")
+
+    words = text.split()
+    text = ' '.join(words[:18])  # Join the first 18 words back into a string
+
+    return text
+
+
+def remove_punctuation(text):
+    """Remove punctuation from a given text."""
+    punctuation_without_dot = string.punctuation.replace(".", "")
+    translator = str.maketrans('', '', punctuation_without_dot)
+    return text.translate(translator)
+
+
+def get_keywords(text, num_keywords=5):
+    """Return top k keywords from a doc using TF-IDF method"""
+
+    # Create a TF-IDF Vectorizer
+    vectorizer = TfidfVectorizer(stop_words='english')
+
+    # Fit and transform the text
+    tfidf_matrix = vectorizer.fit_transform([text])
+
+    # Get feature names (words)
+    feature_names = vectorizer.get_feature_names_out()
+
+    # Get TF-IDF scores
+    tfidf_scores = tfidf_matrix.toarray()[0]
+
+    # Sort words by TF-IDF score
+    word_scores = list(zip(feature_names, tfidf_scores))
+    word_scores.sort(key=lambda x: x[1], reverse=True)
+
+    # Return top keywords
+    return [word for word, score in word_scores[:num_keywords]]
+
+
+"""
+# Example usage
+text = "Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans. Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals. Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however this definition is rejected by major AI researchers."
+print(f"\n# Input text:\n'{text}'")
+print("\n----------------------\n")
+
+keywords = get_keywords(text)
+print("# Top keywords:", keywords)
+print("\n----------------------\n")
+"""
+
+
+def get_important_sentences(paragraph: str, keywords: list[str], num_sentences: int = 3) -> list[str]:
+    """
+    Selects important sentences from a given paragraph based on a list of keywords.
+
+    Args:
+        paragraph (str): The input paragraph.
+        keywords (list[str]): List of important keywords.
+        num_sentences (int): Number of sentences to return (default is 3).
+
+    Returns:
+        list: A list of important sentences.
+    """
+    # Clean and split the paragraph into sentences
+    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', paragraph) if s.strip()]
+
+    # Calculate the importance score for each sentence
+    sentence_scores = []
+    for sentence in sentences:
+        processed_sentence = clean_text(sentence)
+        score = 0
+        words = processed_sentence.lower().split()
+        word_count = Counter(words)
+
+        for keyword in keywords:
+            if keyword.lower() in word_count:
+                score += word_count[keyword.lower()]
+
+        sentence_scores.append((sentence, score))
+
+    # Sort sentences by their scores in descending order
+    sentence_scores.sort(key=lambda x: x[1], reverse=True)
+
+    # Return the top N sentences
+    return [sentence for sentence, score in sentence_scores[:num_sentences]]
+
+
+"""# Example usage
+keywords = get_keywords(paragraph)
+important_sentences = get_important_sentences(paragraph, keywords)
+
+print("# Important sentences:")
+for i, sentence in enumerate(important_sentences, 1):
+    print(f"{i}. {sentence}")
{sentence}") +print("\n----------------------\n") +""" + +def extract_important_phrases(paragraph: str, keywords: list[str], phrase_length: int = 5) -> list[str]: + """ + Extracts important phrases from a given paragraph based on a list of keywords. + Phrase length is auto-determined, and overlapped parts are less than 20%. + + Args: + paragraph (str): The input paragraph. + keywords (list[str]): List of important keywords. + phrase_length (int): The length of phrases to extract (default is 5 words). + + Returns: + list: A list of important phrases. + """ + # Tokenize the paragraph into words + words = word_tokenize(paragraph.lower()) + + # Determine phrase length (between 3 and 7 words) + phrase_length = min(max(len(words) // 10, 5), 7) + + # Generate n-grams (phrases) from the paragraph + phrases = list(ngrams(words, phrase_length)) + + important_phrases = [] + used_indices = set() + + for i, phrase in enumerate(phrases): + # Check if the phrase contains any keyword + if any(keyword.lower() in phrase for keyword in keywords): + # Check overlap with previously selected phrases + if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices): + important_phrases.append(clean_text(" ".join(phrase))) + used_indices.add(i) + + return important_phrases + +"""# Example usage +keywords = get_keywords(paragraph) +important_phrases = extract_important_phrases(paragraph, keywords) + +print("# Important phrases:") +for i, phrase in enumerate(important_phrases[:5], 1): # Print top 5 phrases + print(f"{i}. {phrase}")""" + +def search_by_google( + query, + num_results=10, + is_exact_terms = False + ) -> dict: + """ + Searches the Google Custom Search Engine for the given query. + + Args: + query: The search query. + is_exact_terms: Whether to use exact terms search (True) or regular search (False). + num_results: The number of results to return (default: 10). + + Returns: + A dictionary containing the search results or None if there was an error. 
+ """ + + start_date = "20000101" + end_date = "20210101" + + url = "https://www.googleapis.com/customsearch/v1" + params = { + "key": GOOGLE_API_KEY, + "cx": SEARCH_ENGINE_ID, + "num": num_results, + } + if is_exact_terms: + params["exactTerms"] = query + else: + params["q"] = query.replace('"', "") + + response = requests.get(url, params=params) + if response.status_code == 200: + return response.json() + else: + print(f"Error: {response.status_code}, {response.text}") + return None + + +def display_Google_results(results): + for result in results: + print(f"Title: {result['title']}") + print(f"Link: {result['link']}") + print(f"Snippet: {result['snippet']}") + print(" ------- ") + + +def detect_by_relative_search(input_text, is_support_opposite = False): + checked_urls = set() + searched_phrases = generate_search_phrases(input_text) + + for candidate in searched_phrases: + search_results = search_by_google(candidate) + urls = [item['link'] for item in search_results.get("items", [])] + + for url in urls[:3]: + if url in checked_urls: # already checked + continue + checked_urls.add(url) + print(f"\n\tURL: {url}") + size = get_url_size(url) + if size != None and size <= MAX_URL_SIZE: + page_text = extract_text(url) + if page_text is None or len(page_text) > MAX_CHAR_SIZE: + print(f"\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters") + continue + is_paraphrase, data = check_paraphrase(input_text, page_text) + if is_paraphrase: + return is_paraphrase, url, data + return False, None, [] + +def get_url_size(url): + """ + Retrieves the size of a URL's content using a HEAD request. + + Args: + url: The URL to check. + + Returns: + The size of the content in bytes, or None if the size cannot be determined + (e.g., due to network errors or missing Content-Length header). + """ + try: + response = requests.head(url, allow_redirects=True, timeout=5) # Add timeout + response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx) + + content_length = response.headers.get('Content-Length') + if content_length is not None: + return int(content_length) + else: + print(f"\t\t↑↑↑ Content-Length header not found") + return None + + except requests.exceptions.RequestException as e: + print(f"\t\t↑↑↑ Error getting URL size: {e}") + return None + +def get_most_frequent_words(input_text, number_word=32): + """ + Gets the top words from the input text, excluding stop words and punctuation. + + Args: + input_text: The input text as a string. + number_word: The number of top words to return. + + Returns: + A list of tuples, where each tuple contains a word and its frequency. + Returns an empty list if input is not a string or is empty. + """ + if not isinstance(input_text, str) or not input_text: + return [] + + words = word_tokenize(input_text.lower()) # Tokenize and lowercase + + stop_words = set(stopwords.words('english')) + punctuation = set(string.punctuation) # get all punctuation + filtered_words = [ + word for word in words + if word.isalnum() and word not in stop_words and word not in punctuation + ] + word_frequencies = Counter(filtered_words) + top_words = word_frequencies.most_common(number_word) + + for top_word in top_words: + words.append(top_word[0]) + + if len(words) > 32: + search_phrase = " ".join(words[:32]) + else: + search_phrase = " ".join(words[:number_word]) + + return search_phrase + +def get_chunk(input_text, chunk_length=32, num_chunk=3): + """ + Splits the input text into chunks of a specified length. + + Args: + input_text: The input text as a string. 
+        num_chunk: The maximum number of chunks to create.
+        chunk_length: The desired length of each chunk (in words).
+
+    Returns:
+        A list of string chunks.
+        Returns an empty list if input is invalid.
+    """
+    if not isinstance(input_text, str):
+        return []
+
+    chunks = []
+    input_words = input_text.split()  # Split by any whitespace
+
+    for i in range(num_chunk):
+        start_index = i * chunk_length
+        end_index = (i + 1) * chunk_length
+        chunk = " ".join(input_words[start_index:end_index])
+        if chunk:  # Only append non-empty chunks
+            chunks.append(chunk)
+
+    return chunks
+
+
+def generate_search_phrases(input_text):
+    """
+    Generates different types of phrases for search purposes.
+
+    Args:
+        input_text: The input text.
+
+    Returns:
+        A list of search phrases:
+        - a phrase built from the most frequent words,
+        - the original input text,
+        - chunks of the text,
+        - a phrase built from named entities and keywords.
+    """
+    if not isinstance(input_text, str):
+        return []
+
+    search_phrases = []
+
+    # Method 1: Get most frequent words
+    search_phrases.append(get_most_frequent_words(input_text))
+
+    # Method 2: Get the whole text
+    search_phrases.append(input_text)
+
+    # Method 3: Split text by chunks
+    search_phrases.extend(get_chunk(input_text))
+
+    # Method 4: Get entities and keywords
+    entities = extract_entities(input_text)
+    keywords = get_keywords(input_text, 16)
+    search_phrase = " ".join(entities) + " " + " ".join(keywords)
+    search_phrases.append(search_phrase)
+
+    return search_phrases
+
+
+def split_into_sentences(input_text):
+    """
+    Splits input text into sentences by newlines.
+
+    Args:
+        input_text: The input text as a string.
+
+    Returns:
+        A list of sentences. Returns an empty list if input is not valid.
+    """
+    if not isinstance(input_text, str):
+        return []
+
+    paragraphs = input_text.splitlines()
+    sentences = []
+    for paragraph in paragraphs:
+        paragraph = paragraph.strip()
+        if paragraph:
+            sentences.extend(sent_tokenize(paragraph))
+    return sentences
+
+
+def longest_common_subsequence(arr1, arr2):
+    """
+    Finds the length of the longest common contiguous subsequence between
+    two arrays.
+
+    Args:
+        arr1: The first array.
+        arr2: The second array.
+
+    Returns:
+        The length of the longest common subsequence.
+        Returns 0 if either input is invalid.
+    """
+    if not isinstance(arr1, list) or not isinstance(arr2, list):
+        return 0
+
+    n = len(arr1)
+    m = len(arr2)
+
+    if n == 0 or m == 0:  # handle empty lists
+        return 0
+
+    # Create table dp with size (n+1) x (m+1)
+    dp = [[0] * (m + 1) for _ in range(n + 1)]
+    max_length = 0
+
+    for i in range(1, n + 1):
+        for j in range(1, m + 1):
+            if arr1[i - 1] == arr2[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1] + 1
+                max_length = max(max_length, dp[i][j])
+            else:
+                dp[i][j] = 0  # reset to 0 since the common run must be contiguous
+
+    return max_length
+
+
+def check_sentence(input_sentence, source_sentence, min_same_sentence_len,
+                   min_phrase_sentence_len, verbose=False):
+    """
+    Checks if two sentences are similar based on exact match or
+    longest common subsequence.
+
+    Args:
+        input_sentence: The input sentence.
+        source_sentence: The source sentence.
+        min_same_sentence_len: Minimum length for exact sentence match.
+        min_phrase_sentence_len: Minimum length for common subsequence match.
+        verbose: If True, print debug information.
+
+    Returns:
+        True if the sentences are considered similar, False otherwise.
+        Returns False if input is not valid.
+ """ + + if not isinstance(input_sentence, str) or not isinstance(source_sentence, str): + return False + + input_sentence = input_sentence.strip() + source_sentence = source_sentence.strip() + + if not input_sentence or not source_sentence: # handle empty string + return False + + input_words = input_sentence.split() # split without arguments + source_words = source_sentence.split() # split without arguments + + if input_sentence == source_sentence and len(input_words) >= min_same_sentence_len: + if verbose: + print("Exact match found.") + return True + + max_overlap_len = longest_common_subsequence(input_words, source_words) + if verbose: + print(f"Max overlap length: {max_overlap_len}") # print overlap length + if max_overlap_len >= min_phrase_sentence_len: + return True + + return False + +def extract_text(url, newspapers = False): + """ + Extracts text from a URL, handling HTML and potential errors. + + Args: + url: The URL of the web page to extract text from. + + Returns: + The extracted text content from the web page, or None if extraction fails. + """ + if newspapers is True: + try: + response = requests.get(url) + response.raise_for_status() # Raise exception for unsuccessful requests + except requests.exceptions.RequestException as e: + print(f"Error fetching URL: {e}") + return None + + try: + news = article(url=url, fetch_images=False) + except: # (ArticleException, ArticleBinaryDataException) as e: + print(f"\t\t↑↑↑ Error downloading article.") + #print(f"\t\t↑↑↑ Error downloading article: {e}") + return None + + return news.text + else: + """ + Extracts text from an HTML page. + """ + response = requests.get(url) + response.raise_for_status() + + response.encoding = response.apparent_encoding + + try: + soup = BeautifulSoup(response.content, "html.parser") + except: + print(f"Error parsing HTML content from {url}") + return None + + # Exclude text within specific elements + for element in soup(["img", "figcaption", "table", "script", "style"]): + element.extract() + #text = soup.get_text(separator="\n") + paragraphs = soup.find_all('p') + text = ' '.join([p.get_text() for p in paragraphs]) + + # remove ", external" which appear after the embedded text + # text = re.sub(r', external', '', text) + + return text + +def check_paraphrase(input_text, page_text, verbose=False): + """ + Checks if the input text is paraphrased in the content at the given URL. + + Args: + input_text: The text to check for paraphrase. + url: The URL of the web page to compare with. + verbose: If True, print debug information. + + Returns: + A tuple containing: + - is_paraphrase: True if the input text is considered a paraphrase, False otherwise. + - paraphrase_results: A list of dictionaries, each containing: + - input_sentence: The sentence from the input text. + - matched_sentence: The corresponding sentence from the web page (if found). + - similarity: The cosine similarity score between the sentences. + - is_paraphrase_sentence: True if the individual sentence pair meets the paraphrase criteria, False otherwise. 
+ """ + is_paraphrase_text = False + + if not isinstance(input_text, str) or not isinstance(page_text, str): + return False, [] + + # Extract sentences from input text and web page + #input_text = remove_punctuation(input_text) + input_sentences = split_into_sentences(input_text) + + + if not page_text: + return is_paraphrase_text, [] + #page_text = remove_punctuation(page_text) + page_sentences = split_into_sentences(page_text) + + if not input_sentences or not page_sentences: + return is_paraphrase_text, [] + + additional_sentences = [] + for sentence in page_sentences: + if ", external" in sentence: + additional_sentences.append(sentence.replace(", external", "")) + page_sentences.extend(additional_sentences) + + min_matching_sentences = math.ceil(len(input_sentences) * MIN_RATIO_PARAPHRASE_NUM) + + # Encode sentences into embeddings + embeddings1 = PARAPHASE_MODEL.encode(input_sentences, convert_to_tensor=True, device=DEVICE) + embeddings2 = PARAPHASE_MODEL.encode(page_sentences, convert_to_tensor=True, device=DEVICE) + + # Compute cosine similarity matrix + similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy() + + # Find sentence alignments + alignment = [] + paraphrased_sentence_count = 0 + for i, sentence1 in enumerate(input_sentences): + max_sim_index = np.argmax(similarity_matrix[i]) + max_similarity = similarity_matrix[i][max_sim_index] + + is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD + + if 0.80 < max_similarity < 0.99: + print(f"\t\tinput_sentence : {sentence1}") + print(f"\t\tmatched_sentence: {page_sentences[max_sim_index]}") + print(f"\t\t--> similarity: {max_similarity}\n") + item = { + "input_sentence": sentence1, + "matched_sentence": page_sentences[max_sim_index], + "similarity": max_similarity, + "is_paraphrase_sentence": is_paraphrase_sentence, + } + + # Check for individual sentence paraphrase if overall paraphrase not yet found + if not is_paraphrase_text and check_sentence( + sentence1, page_sentences[max_sim_index], MIN_SAME_SENTENCE_LEN, MIN_PHRASE_SENTENCE_LEN + ): + is_paraphrase_text = True + if verbose: + print(f"Paraphrase found for individual sentence: {sentence1}") + print(f"Matched sentence: {page_sentences[max_sim_index]}") + + alignment.append(item) + paraphrased_sentence_count += 1 if is_paraphrase_sentence else 0 + + # Check if enough sentences are paraphrases + print (f"\t\tparaphrased_sentence_count: {paraphrased_sentence_count}, min_matching_sentences: {min_matching_sentences}, total_sentence_count: {len(input_sentences)}") + is_paraphrase_text = paraphrased_sentence_count >= min_matching_sentences + + if verbose: + print(f"Minimum matching sentences required: {min_matching_sentences}") + print(f"Total input sentences: {len(input_sentences)}") + print(f"Number of matching sentences: {paraphrased_sentence_count}") + print(f"Is paraphrase: {is_paraphrase_text}") + for item in alignment: + print(item) + + return is_paraphrase_text, alignment + +def similarity_ratio(a, b): + """ + Calculates the similarity ratio between two strings using SequenceMatcher. + + Args: + a: The first string. + b: The second string. + + Returns: + A float representing the similarity ratio between 0.0 and 1.0. + Returns 0.0 if either input is None or not a string. + """ + if not isinstance(a, str) or not isinstance(b, str) or a is None or b is None: + return 0.0 # Handle cases where inputs are not strings or None + return SequenceMatcher(None, a, b).ratio() + + +def is_human_written(sentence): + # 1. 
Search for exact matches before 2020 + query = f'"{sentence}"' + results = search_by_google(query) + #results = search_bing(sentence) + + # print("\n----------------------\n") + # print(f"# Search results:\n") + # display_Google_results(results) + + if results: + # Exact match found, likely human-written + #return f"human-written\nExact match found: '{sentence}'" + return -1 + + # 2. If no exact match, find similar sentences + query = sentence + results = search_by_google(query) + + if results: + # Check similarity with search results + similarities = [similarity_ratio(sentence, result['snippet']) for result in results] + max_similarity = max(similarities) + + # You can adjust this threshold as needed + if max_similarity > 0.8: + #return f"likely human-written\nFound result that has {max_similarity*100}% of '{sentence}'" + return max_similarity + + # No strong evidence of human authorship + #return f"likely machine-generated\nFound result that has less than 80% similarity of '{sentence}'" + return 1 + +# # Example usage +# sentence = important_sentences[0] +# result = is_human_written(sentence) +# print("\n----------------------\n") +# print(f"# Result:\nThe sentence is {result}") + +def get_text_from_csv(filename): + """ + Reads a CSV file and returns a list of strings, + extracting only the second column (assuming it contains the text). + + Args: + filename: The path to the CSV file. + + Returns: + A list of strings containing the text from the second column. + """ + + text_data = [] + with open(filename, 'r') as file: + reader = csv.reader(file) + next(reader, None) # skip the headers + for row in reader: + if len(row) >= 2: # Check if the row has at least two elements + text_data.append(row[1]) + + return text_data + +if __name__ == '__main__': + # paragraph = """ + # Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans. Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals. Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however this definition is rejected by major AI researchers. + # """ + + # keywords = get_keywords(paragraph) + # important_sentences = get_important_sentences(paragraph, keywords) + + # print("# Important sentences:") + # for i, sentence in enumerate(important_sentences, 1): + # print(f"{i}. {sentence}") + # print("\n----------------------\n") + + # sentence = important_sentences[0] + + filename = "data/results/[res]unchanged_words.csv" # Replace with the actual filename + text_list = get_text_from_csv(filename) + + count = 1 + match_count = 0 + unmatch_count = 0 + initial_delay = 1 # second + data = [] + + for text in text_list: + cleaned_text = clean_text(text) + + result = is_human_written(cleaned_text) + match = "match" if result == -1 else "unmatch" + print(f"{count}: [{match}] {text}") + data.append([match, text]) + if result == -1: + match_count += 1 + else: + unmatch_count += 1 + count += 1 + time.sleep(initial_delay) # avoid 100? 
queries per minute limit + + print(f"Match count: {match_count}") + print(f"Unmatch count: {unmatch_count}") + + df = pd.DataFrame(data, columns=["Text", "Match"]) + output_filename = "data/results/[res]unchanged_words_processed_data.csv" # Specify the output filename + df.to_csv(output_filename, index=False) + + # # Bing search + # subscription_key = "80163c6371fa40e0a50dfaa1dd5b7d84" + # assert subscription_key + # search_url = "https://api.bing.microsoft.com/v7.0/search" + # headers = {"Ocp-Apim-Subscription-Key": subscription_key} + # params = {"q": '"Artificial intelligence (AI) is intelligence demonstrated by machines"', 'freshness': '2000-02-01..2020-02-01', 'answerCount': 2, 'mkt': 'en-US' } + # response = requests.get(search_url, headers=headers, params=params) + # response.raise_for_status() + # search_results = response.json() + # print("\nHeaders:\n") + # print(response.headers) + + # print("\nJSON Response:\n") + # pprint(response.json()) + + + diff --git a/src/texts/Search_Text/test.py b/src/texts/Search_Text/test.py new file mode 100644 index 0000000000000000000000000000000000000000..76b3ae9e6e7e8304007be2f2db0e083c2c6563f7 --- /dev/null +++ b/src/texts/Search_Text/test.py @@ -0,0 +1,38 @@ +import re +from bs4 import BeautifulSoup +from newspaper import article, ArticleException +import pandas as pd +import requests +from sentence_transformers import SentenceTransformer, util +from search_text import DEVICE, PARAPHASE_MODEL, extract_text + +#news = article('https://www.bbc.co.uk/news/education-51094279') +#print(news.text) + +def extract_human_data(file_path): + df = pd.read_csv(file_path) + machine_data = df[df["src"] == "xsum_human"] + + # write to file + machine_data.to_csv("data/test_data/MAGE_xsum_human.csv", index=False) + +def connect_lines_without_dot_regex(text): + """Connects lines without dot using regex""" + if not isinstance(text, str): + return text + return re.sub(r'(? 1: + raise Exception( + "You have different number of references per test sample.", + ) + + ref_num = len(tgts[0]) + score_matrix = [] + for i in range(ref_num): + curr_tgts = [x[i] for x in tgts] + scores = self.score(srcs, curr_tgts, batch_size) + score_matrix.append(scores) + if agg == "mean": + score_list = np.mean(score_matrix, axis=0) + elif agg == "max": + score_list = np.max(score_matrix, axis=0) + else: + raise NotImplementedError + return list(score_list) + + def test(self, batch_size=3): + """Test""" + src_list = [ + "This is a very good idea. Although simple, but very insightful.", + "Can I take a look?", + "Do not trust him, he is a liar.", + ] + + tgt_list = [ + "That's stupid.", + "What's the problem?", + "He is trustworthy.", + ] + + print(self.score(src_list, tgt_list, batch_size)) + + +def bart_score(text_1, text_2): + """ + Computes the BART score between two texts. + + Parameters: + text_1 (str): The first text. + text_2 (str): The second text. + + Returns: + float: The BART score. + """ + score = bart_scorer.score([text_1], [text_2]) + return score + + +def check_bart_score(input_text, raw_text): + """ + Checks if the BART score between input_text and raw_text is above + a threshold. + + Parameters: + input_text (str): The input text. + raw_text (str): The raw text to compare against. + + Returns: + bool: True if the score is above the threshold, False otherwise. 
+ """ + THRESHOLD = -2.459 + normalized_text = normalize_text(raw_text) + score = bart_score(input_text, normalized_text)[0] + return score >= THRESHOLD + + +def bart_score_in_batch(text_1, text_2): + """ + Calculates the BART score for pairs of texts in batches. + + Args: + text_1 (list of str): The first list of texts. + text_2 (list of str): The second list of texts. + + Returns: + list: A list of BART scores for each pair of texts. + """ + return bart_scorer.score(text_1, text_2, batch_size=BATCH_SIZE) + + +def extract_feature_in_batch(text_1, text_2, feature_kind): + """ + Extracts features for pairs of texts using BART scores. + + Args: + text_1 (list of str): The first list of texts. + text_2 (list of str): The second list of texts. + feature_kind (str): The type of feature to extract. + + Returns: + list: A list of extracted features. + """ + features = bart_score_in_batch(text_1, text_2) + return features diff --git a/src/texts/SimLLM/Refactor/config.py b/src/texts/SimLLM/Refactor/config.py new file mode 100644 index 0000000000000000000000000000000000000000..41001d2894549d3c50f9939095bd0152d990ebb9 --- /dev/null +++ b/src/texts/SimLLM/Refactor/config.py @@ -0,0 +1,115 @@ +import os +import configparser + +import google.generativeai as genai +import nltk +from datasets import load_metric +from langchain.chat_models import ChatOpenAI +from transformers import AutoTokenizer + +from texts.bart_score import BARTScorer + + +# Constants +# TODO: move to .env +env = configparser.ConfigParser() +env.read(".env") # An example environment: .sample-env + +# Get API key +OPENAI_API_KEY = env["API_KEY"]["OPENAI_API_KEY"] +GEMINI_API_KEY = env["API_KEY"]["GEMINI_API_KEY"] +TOGETHER_API_KEY = env["API_KEY"]["TOGETHER_API_KEY"] + +# Environment setup +os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY +os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY +os.environ["TOGETHER_API_KEY"] = TOGETHER_API_KEY +os.environ["CURL_CA_BUNDLE"] = "" +os.environ["REQUESTS_CA_BUNDLE"] = "" + +# File Path +LOG_FILE = "data/99_log.txt" +OUTPUT_FILE = "data/result.txt" +METRIC_NAME = "roc_auc" + +# Training and Model Parameters +TRAIN_RATIO = 0.8 +VAL_RATIO = 0.1 +NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING = 10 +PATIENCE = 3 +BATCH_SIZE = 64 +OPTIMIZED_METRIC = "roc_auc" +SEED = 0 +TEMPERATURE = 0.0 +IS_OUTPUT_NORMALIZATION = False +RATIO = 0.9 +HUMAN_LABEL = 0 +MACHINE_LABEL = 1 +BART = "bart" + +# Model Options +MULTIMODEL = "multimodel" +SINGLE_FROM_MULTIMODEL = "single_from_multimodel" + +# Downloading the NLTK "punkt" only if it's not already downloaded +nltk.download("punkt", quiet=True) + +# API Models +# TODO: consider using an enum +API_ERROR = "API_ERROR" +IGNORE_BY_API_ERROR = "IGNORE_BY_API_ERROR" +CHATGPT = "ChatGPT" +GEMINI = "Gemini" +# LLAMA_2_70_CHAT_TEMP_0 = "LLaMa" + +# Initialize BARTScorer +# TODO: consider loading model lazily +bart_scorer = BARTScorer(device="cuda:0", checkpoint="facebook/bart-large-cnn") + +# Generative AI configuration +OPENAI_MODEL_NAME = "gpt-3.5-turbo-0125" +GEMINI_MODEL_NAME = "gemini-pro" + +genai.configure(api_key=GEMINI_API_KEY, transport="rest") +GEMINI_MODEL = genai.GenerativeModel( + GEMINI_MODEL_NAME, + generation_config={"temperature": TEMPERATURE}, +) +OPENAI_MODEL = ChatOpenAI( + temperature=TEMPERATURE, + model_name=OPENAI_MODEL_NAME, +) + +# Model paths +MODEL_PATHS = { + "LLaMa": "meta-llama/Llama-2-70b-chat-hf", + "QWEN": "Qwen/Qwen1.5-72B-Chat", + "Yi": "NousResearch/Nous-Hermes-2-Yi-34B", + "Mixtral": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "OLMo": 
"allenai/OLMo-7B-Instruct", + "Phi": "microsoft/phi-2", + "OpenChat": "openchat/openchat-3.5-1210", + "WizardLM": "WizardLM/WizardLM-13B-V1.2", + "Vicuna": "lmsys/vicuna-13b-v1.5", +} + +TOGETHER_PATH = "https://api.together.xyz" + +# Roberta model configurations +ROBERTA_BASE = "roberta-base" +ROBERTA_LARGE = "roberta-large" +ROBERTA_MODEL_PATHS = { + ROBERTA_BASE: "roberta-base", + ROBERTA_LARGE: "roberta-large", +} +LEARNING_RATES = { + ROBERTA_BASE: 2e-5, + ROBERTA_LARGE: 8e-6, +} +MODEL_NAME = ROBERTA_BASE + +# Tokenizer initialization +tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MODEL_PATHS[MODEL_NAME]) + +# Metric loading +metric = load_metric(METRIC_NAME) diff --git a/src/texts/SimLLM/Refactor/evaluation.py b/src/texts/SimLLM/Refactor/evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..172aa5c5d167b541ea73fbd31c0765603fc20cdd --- /dev/null +++ b/src/texts/SimLLM/Refactor/evaluation.py @@ -0,0 +1,84 @@ +import nltk +import numpy as np +from config import metric +from utils import refine_candidate_text + +from texts.bart_score import ( + bart_score, + check_bart_score, +) + + +def compute_metrics(evaluation_predictions): + """ + Function to compute evaluation metrics for model predictions. + + Parameters: + evaluation_predictions (tuple): A tuple containing two elements: + - predictions (array-like): The raw prediction scores from the model. + - labels (array-like): The true labels for the evaluation data. + + Returns: + dict: A dictionary containing the computed evaluation metrics. + """ + # Unpack predictions and labels from the input tuple + raw_predictions, true_labels = evaluation_predictions + + # Convert raw prediction scores to predicted class labels + predicted_labels = np.argmax(raw_predictions, axis=1) + + # Compute and return the evaluation metrics + return metric.compute( + prediction_scores=predicted_labels, + references=true_labels, + average="macro", + ) + + +def extract_by_best_similarity(input_text, raw_text): + """ + Extracts the best candidate string from the raw text based on the highest + similarity score compared to the input text. The similarity score is + calculated using the BART score. + + Args: + input_text (str): The original text. + raw_text (str): The raw text containing multiple candidate strings. + + Returns: + str: The best candidate string with the highest similarity score. + Returns the input text if no suitable candidate is found. 
+ """ + + # Refine the raw text + refined_raw_text = refine_candidate_text(input_text, raw_text) + + # Tokenize the refined raw text into sentences + raw_candidates = nltk.sent_tokenize(refined_raw_text) + + # Split sentences further by newlines to get individual candidates + candidate_list = [] + for sentence in raw_candidates: + candidate_list.extend(sentence.split("\n")) + + # Initialize variables to track the best similarity score + # and the best candidate + best_similarity = -9999 + best_candidate = "" + + # Iterate over each candidate to find the best one based on the BART score + for candidate in candidate_list: + refined_candidate = refine_candidate_text(input_text, candidate) + if check_bart_score(input_text, refined_candidate): + score = bart_score(input_text, refined_candidate)[0] + if score > best_similarity: + best_similarity = score + best_candidate = refined_candidate + + # Print the best candidate found + print(f"best_candidate = {best_candidate}") + + # Return the best candidate if found, otherwise return the input text + if best_candidate == "": + return input_text + return best_candidate diff --git a/src/texts/SimLLM/Refactor/main_text.py b/src/texts/SimLLM/Refactor/main_text.py new file mode 100644 index 0000000000000000000000000000000000000000..ca6f35829863ecc686a1d3466b4027123ab5201d --- /dev/null +++ b/src/texts/SimLLM/Refactor/main_text.py @@ -0,0 +1,106 @@ +import argparse + +from texts.config import CHATGPT +from texts.models import process_multi_models_with_validation +from texts.proofreading import generate_new_data_with_best_similarity +from texts.utils import generate_file_name + + +def main(): + """ + Main function to handle argument parsing and execute the sequence of + operations including data generation and processing with multiple + models. 
+ """ + parser = argparse.ArgumentParser(description="SimLLM.") + + # Argument for specifying the list of large language models + parser.add_argument( + "--LLMs", + nargs="+", + default=[CHATGPT, "Yi", "OpenChat"], + help="List of large language models", + ) + + # Argument for specifying the list of training indexes + parser.add_argument( + "--train_indexes", + type=int, + default=[0, 1, 2], + nargs="+", + help="List of training indexes", + ) + + # Argument for specifying the list of testing indexes + parser.add_argument( + "--test_indexes", + type=int, + default=[0], + nargs="+", + help="List of testing indexes", + ) + + # Argument for specifying the number of samples + parser.add_argument( + "--num_samples", + type=int, + default=5000, + help="Number of samples", + ) + + # Parse the command-line arguments + args = parser.parse_args() + + # Static dataset parameters + # dataset_name = "xsum" + # column_name = "document" + # num_samples = args.num_samples + output_file = "data/human.csv" + + # Generate human data with shuffle + # generate_human_with_shuffle( + # dataset_name, + # column_name, + # num_samples, + # output_file, + # ) + + # Existing data parameters + existing_data_file = output_file + existing_kinds = [] + + # New kinds of models to generate data with + new_kinds = args.LLMs + + # Generate new data with best similarity + generate_new_data_with_best_similarity( + existing_data_file, + existing_kinds, + new_kinds, + ) + + # Generate a filename for the multimodel CSV file + multimodel_csv_file = generate_file_name( + existing_data_file, + existing_kinds, + new_kinds, + ) + + # Number of samples to process (-1 means process all samples) + num_samples_to_process = -1 + + # Training and testing indexes from arguments + training_indexes = args.train_indexes + testing_indexes = args.test_indexes + + # Process multiple models with validation + process_multi_models_with_validation( + multimodel_csv_file, + training_indexes, + testing_indexes, + num_samples_to_process, + ) + + +if __name__ == "__main__": + main() diff --git a/src/texts/SimLLM/Refactor/models.py b/src/texts/SimLLM/Refactor/models.py new file mode 100644 index 0000000000000000000000000000000000000000..7033255bec46874bf0e30213e11f76a73cfa985f --- /dev/null +++ b/src/texts/SimLLM/Refactor/models.py @@ -0,0 +1,842 @@ +import os +import shutil +from copy import deepcopy + +import numpy as np +from config import ( + BART, + BATCH_SIZE, + HUMAN_LABEL, + LEARNING_RATES, + MACHINE_LABEL, + MODEL_NAME, + MULTIMODEL, + NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING, + OPTIMIZED_METRIC, + PATIENCE, + ROBERTA_MODEL_PATHS, + SINGLE_FROM_MULTIMODEL, + TRAIN_RATIO, + VAL_RATIO, + tokenizer, +) +from datasets import Dataset +from sklearn.base import accuracy_score +from sklearn.metrics import roc_auc_score +from sklearn.neural_network import MLPClassifier +from transformers import ( + AutoModelForSequenceClassification, + DataCollatorWithPadding, + EarlyStoppingCallback, + Trainer, + TrainerCallback, + TrainingArguments, +) + +from texts.bart_score import ( + bart_score_in_batch, + extract_feature_in_batch, +) +from texts.config import OUTPUT_FILE +from texts.evaluation import compute_metrics +from texts.utils import ( + check_error, + combine_text_with_BERT_format, + parse_multimodal_data, + write_to_file, +) + + +class TextDetector: + def __init__(self) -> None: + self.model = None + self.multimodel = None + self.train_data = None + self.val_data = None + self.test_data = None + self.train_features = None + self.val_features = None + 
self.test_features + + def text_analysis(text: str) -> float: + score = 0.0 + return score + + +class CustomCallback(TrainerCallback): + """ + Custom callback to evaluate the training dataset at the end of each epoch. + """ + + def __init__(self, trainer) -> None: + super().__init__() + self._trainer = trainer + + def on_epoch_end(self, args, state, control, **kwargs): + """ + At the end of each epoch, evaluate the training dataset. + """ + if control.should_evaluate: + control_copy = deepcopy(control) + self._trainer.evaluate( + eval_dataset=self._trainer.train_dataset, + metric_key_prefix="train", + ) + return control_copy + + +def abstract_train(features, labels): + """ + Trains a model using the given features and labels. + + Args: + features (list): The input features for training. + labels (list): The target labels for training. + + Returns: + object: The trained model. + """ + model = MLPClassifier() + model.fit(features, labels) + return model + + +def evaluate_model(model, features, labels): + """ + Evaluates the model's performance using accuracy and ROC AUC scores. + + Args: + model (object): The trained model to evaluate. + features (list): The input features for evaluation. + labels (list): The target labels for evaluation. + + Returns: + None + """ + predictions = model.predict(features) + rounded_predictions = [round(value) for value in predictions] + + accuracy = accuracy_score(labels, rounded_predictions) + write_to_file(OUTPUT_FILE, f"Accuracy: {accuracy * 100.0:.1f}%\n") + + roc_auc = roc_auc_score(labels, rounded_predictions) + write_to_file(OUTPUT_FILE, f"ROC AUC: {roc_auc * 100.0:.1f}%\n") + + +def preprocess_function_multimodel(sample): + """ + Preprocesses a given sample for a multi-model setup by calculating + BART scores and formatting the text for BERT input. + + Args: + sample (dict): A dictionary containing a key "text", which is a list of + lists of strings. + + Returns: + dict: A dictionary containing tokenized and preprocessed text data. 
+ """ + num_texts = len(sample["text"][0]) # Number of texts in each sub-sample + texts_grouped_by_index = [ + [] for _ in range(num_texts) + ] # Initialize empty lists for grouping texts by index + + # Group texts by their index across sub-samples + for sub_sample in sample["text"]: + for i in range(num_texts): + texts_grouped_by_index[i].append(sub_sample[i]) + + # Calculate BART scores for each text pair (text[0] with text[i]) + bart_scores = [ + bart_score_in_batch( + texts_grouped_by_index[0], + texts_grouped_by_index[i], + ) + for i in range(1, num_texts) + ] + + combined_texts = [] + + # Process each sub-sample for BERT input + for index, sub_sample in enumerate(sample["text"]): + text_array = [sub_sample[0]] # Start with the input text + score_generation_pairs = [] + + # Pair scores with their corresponding generations + for i in range(1, num_texts): + generation_text = sub_sample[i] + generation_score = bart_scores[i - 1][index] + score_generation_pairs.append((generation_score, generation_text)) + + # Sort pairs by score in descending order + sorted_pairs = sorted(score_generation_pairs, reverse=True) + + # Append sorted texts to text_array + for _, sorted_text in sorted_pairs: + text_array.append(sorted_text) + + # Combine texts into a single BERT-formatted string + combined_text = combine_text_with_BERT_format(text_array) + combined_texts.append(combined_text) + + # Tokenize the combined texts for BERT + return tokenizer(combined_texts, add_special_tokens=False, truncation=True) + + +def preprocess_function_single_from_multimodel(sample): + """ + Extracts the first text from each sub-sample in a multi-model sample and + tokenizes it. + + Args: + sample (dict): A dictionary containing a key "text", which is a list of + lists of strings. + + Returns: + dict: A dictionary containing tokenized text data. + """ + combined_texts = [] + + # Iterate through each sub-sample + for sub_sample in sample["text"]: + input_text = sub_sample[ + 0 + ] # Extract the first text from the sub-sample + combined_texts.append( + input_text, + ) # Append it to the list of combined texts + + # Tokenize the combined texts + return tokenizer(combined_texts, truncation=True) + + +def train_only_by_transformer_with_test_evaluation_early_stop( + train_data, + test_data, + input_type, + num_classes=2, +): + """ + Trains a transformer model using the provided training and testing + datasets with early stopping. + + Args: + train_data (Dataset): The training dataset. + test_data (Dataset): The testing dataset. + input_type (str): The type of input data, either MULTIMODEL or + SINGLE_FROM_MULTIMODEL. + num_classes (int, optional): The number of classes for classification. + Defaults to 2. + + Returns: + Trainer: The trained model wrapped in a Trainer object. 
+ """ + # Preprocess datasets based on the input type + if input_type == MULTIMODEL: + train_data = train_data.map( + preprocess_function_multimodel, + batched=True, + ) + test_data = test_data.map(preprocess_function_multimodel, batched=True) + elif input_type == SINGLE_FROM_MULTIMODEL: + train_data = train_data.map( + preprocess_function_single_from_multimodel, + batched=True, + ) + test_data = test_data.map( + preprocess_function_single_from_multimodel, + batched=True, + ) + + # Data collator to pad inputs + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # Load appropriate model based on number of classes + if num_classes == 3: + model = AutoModelForSequenceClassification.from_pretrained( + "pretrained_model/roberta-base_num_labels_3", + num_labels=num_classes, + ) + else: + model = AutoModelForSequenceClassification.from_pretrained( + ROBERTA_MODEL_PATHS[MODEL_NAME], + num_labels=num_classes, + ) + + learning_rate = LEARNING_RATES[MODEL_NAME] + output_folder = "training_with_callbacks" + + # Remove the output folder if it already exists + if os.path.exists(output_folder): + shutil.rmtree(output_folder) + + # Training arguments + training_args = TrainingArguments( + output_dir=output_folder, + evaluation_strategy="epoch", + logging_strategy="epoch", + save_strategy="epoch", + learning_rate=learning_rate, + per_device_train_batch_size=BATCH_SIZE, + per_device_eval_batch_size=BATCH_SIZE, + num_train_epochs=NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING, + weight_decay=0.01, + push_to_hub=False, + metric_for_best_model=OPTIMIZED_METRIC, + load_best_model_at_end=True, + ) + + # Create Trainer object + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_data, + eval_dataset=test_data, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)], + ) + + # Add custom callback + trainer.add_callback(CustomCallback(trainer)) + + # Start training + trainer.train() + + return trainer + + +def create_pair_sample(data_item, training_indices): + """ + Creates pair samples for training by comparing human data with + machine-generated data. + + Args: + data_item (dict): A dictionary containing 'human', 'single', + and 'pair' data. + training_indices (list): A list of indices used for training. + + Returns: + list: A list of dictionaries, each containing a 'text' array + and a 'label'. 
+ """ + # Initialize the result list + result_samples = [] + + # Check if there is any error in the data_item + if check_error(data_item): + return result_samples + + # Create machine samples + for train_idx in training_indices: + if data_item["human"] != data_item["single"][train_idx]: + text_array = [] + machine_text = data_item["single"][train_idx] + text_array.append(machine_text) + + for sub_idx in training_indices: + text_array.append(data_item["pair"][train_idx][sub_idx]) + + sample = { + "text": text_array, + "label": MACHINE_LABEL, + } + result_samples.append(sample) + + # Create human samples + text_array = [data_item["human"]] + + for train_idx in training_indices: + text_array.append(data_item["single"][train_idx]) + + human_sample = { + "text": text_array, + "label": HUMAN_LABEL, + } + + # Append human samples for each machine sample + num_machine_samples = len(result_samples) + for _ in range(num_machine_samples): + result_samples.append(human_sample) + + return result_samples + + +def create_pair_test_sample(data_item, training_indices, testing_indices): + """ + Creates pair test samples by comparing human data with + machine-generated data. + + Args: + data_item (dict): A dictionary containing 'human', 'single', and + 'pair' data. + training_indices (list): A list of indices used for training. + testing_indices (list): A list of indices used for testing. + + Returns: + list: A list of dictionaries, each containing a 'text' array and a + 'label'. + """ + # Initialize the result list + result_samples = [] + + # Check if there is any error in the data_item + if check_error(data_item): + return result_samples + + # Create machine samples based on testing indices + for test_idx in testing_indices: + if data_item["human"] != data_item["single"][test_idx]: + text_array = [] + machine_text = data_item["single"][test_idx] + text_array.append(machine_text) + + for train_idx in training_indices: + text_array.append(data_item["pair"][test_idx][train_idx]) + + sample = { + "text": text_array, + "label": MACHINE_LABEL, + } + result_samples.append(sample) + + # Create human sample + text_array = [data_item["human"]] + + for train_idx in training_indices: + text_array.append(data_item["single"][train_idx]) + + human_sample = { + "text": text_array, + "label": HUMAN_LABEL, + } + + # Append the human sample for each machine sample + num_machine_samples = len(result_samples) + for _ in range(num_machine_samples): + result_samples.append(human_sample) + + return result_samples + + +def create_train_val_sample(data, training_indices): + """ + Creates training and validation samples from the provided data. + + Args: + data (list): A list of data items, each to be processed. + training_indices (list): A list of indices used for training. + + Returns: + list: A list of training and validation samples created from the data. + """ + # Initialize the result list + result_samples = [] + + # Process each item in the data + for data_item in data: + # Create pair samples for the current item + sub_samples = create_pair_sample(data_item, training_indices) + + # Extend the result list with the created sub-samples + result_samples.extend(sub_samples) + + return result_samples + + +def create_test_sample(data, training_indices, testing_indices): + """ + Creates test samples from the provided data by comparing human data with + machine-generated data. + + Args: + data (list): A list of data items, each to be processed. + training_indices (list): A list of indices used for training. 
+ testing_indices (list): A list of indices used for testing. + + Returns: + list: A list of test samples created from the data. + """ + # Initialize the result list + result_samples = [] + + # Process each item in the data + for data_item in data: + # Create pair test samples for the current item + sub_samples = create_pair_test_sample( + data_item, + training_indices, + testing_indices, + ) + + # Extend the result list with the created sub-samples + result_samples.extend(sub_samples) + + return result_samples + + +def distribute_data(data, train_indices, test_indices, train_ratio, val_ratio): + """ + Distributes the data into training, validation, and test samples. + + Args: + data (list): A list of data items to be split and processed. + train_indices (list): A list of indices used for training. + test_indices (list): A list of indices used for testing. + train_ratio (float): The ratio of data to be used for training. + val_ratio (float): The ratio of data to be used for validation. + + Returns: + tuple: A tuple containing lists of training, validation, + and test samples. + """ + # Split the data into training, validation, and test sets + train_data, val_data, test_data = split_train_val_test( + data, + train_ratio, + val_ratio, + ) + + # Create training samples + train_samples = create_train_val_sample(train_data, train_indices) + write_to_file(OUTPUT_FILE, f"train samples = {len(train_samples)}\n") + + # Create validation samples + val_samples = create_train_val_sample(val_data, train_indices) + write_to_file(OUTPUT_FILE, f"val samples = {len(val_samples)}\n") + + # Create test samples + test_samples = create_test_sample(test_data, train_indices, test_indices) + write_to_file(OUTPUT_FILE, f"test samples = {len(test_samples)}\n") + + return train_samples, val_samples, test_samples + + +def convert_to_huggingface_with_multimodel(samples): + """ + Converts a list of samples to the Hugging Face Dataset format. + + Args: + samples (list): A list of samples to be converted. + + Returns: + Dataset: A Hugging Face Dataset object created from the samples. + """ + return Dataset.from_list(samples) + + +def train_by_transformer_with_multimodel_and_early_stop( + train_samples, + val_samples, + input_type, +): + """ + Trains a transformer model with multimodal data and early stopping. + + Args: + train_samples (list): A list of training samples. + val_samples (list): A list of validation samples. + input_type (str): The type of input data (e.g., multimodal). + + Returns: + object: The trained model with early stopping. + """ + # Convert training and validation samples to Hugging Face Dataset format + train_data = convert_to_huggingface_with_multimodel(train_samples) + val_data = convert_to_huggingface_with_multimodel(val_samples) + + # Train the model with early stopping and return the trained model + return train_only_by_transformer_with_test_evaluation_early_stop( + train_data, + val_data, + input_type, + ) + + +def test_by_transformer_with_multimodel(detector, test_samples, input_type): + """ + Tests a trained transformer model with multimodal data. + + Args: + detector (object): The trained model to be evaluated. + test_samples (list): A list of test samples. + input_type (str): The type of input data (e.g., multimodal). 
+ + Returns: + None + """ + # Convert test samples to Hugging Face Dataset format + test_data = convert_to_huggingface_with_multimodel(test_samples) + + # Apply the appropriate preprocessing function based on the input type + if input_type == MULTIMODEL: + test_data = test_data.map(preprocess_function_multimodel, batched=True) + elif input_type == SINGLE_FROM_MULTIMODEL: + test_data = test_data.map( + preprocess_function_single_from_multimodel, + batched=True, + ) + + # Evaluate the model on the test data + result = detector.evaluate(eval_dataset=test_data) + + # Extract and log the ROC AUC score + roc_auc = result["eval_roc_auc"] + write_to_file(OUTPUT_FILE, "roc_auc: %.1f%%" % (roc_auc * 100.0) + "\n") + + +def extract_by_feature_kind(samples, feature_type): + """ + Extracts features from the given samples based on the specified feature + type. + + Args: + samples (list): A list of samples where each sample is a dictionary + with 'text' and 'label' keys. + feature_type (str): The type of feature to extract. + + Returns: + tuple: A tuple containing the extracted features and corresponding + labels. + """ + text_1_list = [] + text_2_list = [] + labels = [] + + for sample in samples: + text_1_list.append(sample["text"][0]) + text_2_list.append(sample["text"][1]) + labels.append(sample["label"]) + + # Extract features in batch based on the feature type + features = extract_feature_in_batch(text_1_list, text_2_list, feature_type) + + return features, labels + + +def train_by_feature_kind(train_samples, feature_type): + """ + Trains a model using features extracted from the training samples based on + the specified feature type. + + Args: + train_samples (list): A list of training samples where each sample is + a dictionary with 'text' and 'label' keys. + feature_type (str): The type of feature to extract for training. + + Returns: + object: The trained model. + """ + # Extract features and labels from the training samples + features, labels = extract_by_feature_kind(train_samples, feature_type) + + # Convert features to a numpy array and reshape for training + features = np.array(features) + features = features.reshape(-1, 1) + + # Train the model using the extracted features and labels + model = abstract_train(features, labels) + + return model + + +def test_by_feature_kind(detector, samples, feature_type): + """ + Tests a detector using features extracted from the provided samples based + on the specified feature type. + + Args: + detector (object): The detector model to be evaluated. + samples (list): A list of samples where each sample is a dictionary + with 'text' and 'label' keys. + feature_type (str): The type of feature to extract for testing. + + Returns: + None + """ + # Extract features and labels from the samples + features, labels = extract_by_feature_kind(samples, feature_type) + + # Convert features to a numpy array and reshape for evaluation + features = np.array(features) + features = features.reshape(-1, 1) + + # Evaluate the detector model using the extracted features and labels + evaluate_model(detector, features, labels) + + +def general_process_multimodels_train_val_test( + train_samples, + val_samples, + test_samples, +): + """ + General process for training, validating, and testing models using + multi-model and feature kind approaches. + + Args: + train_samples (list): Training samples. + val_samples (list): Validation samples. + test_samples (list): Test samples. 
+ + Returns: + None + """ + # Multi-model approach + input_kind = MULTIMODEL + write_to_file(OUTPUT_FILE, "\nInput kind = {input_kind} \n") + + # Train detector using multi-model with early stopping + detector = train_by_transformer_with_multimodel_and_early_stop( + train_samples, + val_samples, + input_kind, + ) + + # Evaluate on train set + write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n") + test_by_transformer_with_multimodel(detector, train_samples, input_kind) + + # Evaluate on validation set + write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n") + test_by_transformer_with_multimodel(detector, val_samples, input_kind) + + # Evaluate on test set + write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n") + test_by_transformer_with_multimodel(detector, test_samples, input_kind) + + # Single from multi-model approach + input_kind = SINGLE_FROM_MULTIMODEL + write_to_file(OUTPUT_FILE, "\nInput kind = {input_kind} \n") + + # Train detector using single from multi-model with early stopping + detector = train_by_transformer_with_multimodel_and_early_stop( + train_samples, + val_samples, + input_kind, + ) + + # Evaluate on train set + write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n") + test_by_transformer_with_multimodel(detector, train_samples, input_kind) + + # Evaluate on validation set + write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n") + test_by_transformer_with_multimodel(detector, val_samples, input_kind) + + # Evaluate on test set + write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n") + test_by_transformer_with_multimodel(detector, test_samples, input_kind) + + # Feature kind approach + sample_length = len(train_samples[0]["text"]) + if ( + sample_length == 2 + ): # Check if the sample length is 2, indicating BART feature kind + feature_kind = BART + write_to_file(OUTPUT_FILE, "\nFeature kind = {feature_kind} \n") + + # Train detector using feature kind + detector = train_by_feature_kind(train_samples, feature_kind) + + # Evaluate on train set + write_to_file(OUTPUT_FILE, "EVALUATE ON TRAIN SET \n") + test_by_feature_kind(detector, train_samples, feature_kind) + + # Evaluate on validation set + write_to_file(OUTPUT_FILE, "EVALUATE ON VALIDATION SET \n") + test_by_feature_kind(detector, val_samples, feature_kind) + + # Evaluate on test set + write_to_file(OUTPUT_FILE, "EVALUATE ON TEST SET \n") + test_by_feature_kind(detector, test_samples, feature_kind) + + +def process_multi_models_with_validation( + multimodel_csv_file, + train_indices, + test_indices, + num_samples, +): + """ + Processes multi-model data with validation, training, and testing. + + Args: + multimodel_csv_file (str): Path to the CSV file containing + multi-model data. + train_indices (list): Indices for the training data. + test_indices (list): Indices for the testing data. + num_samples (int): Number of samples to process. 
+ + Returns: + None + """ + # Log the details of the process + write_to_file(OUTPUT_FILE, f"PROCESSING FILE={multimodel_csv_file} \n") + write_to_file(OUTPUT_FILE, f"EXPERIMENT WITH {MODEL_NAME} model \n") + write_to_file( + OUTPUT_FILE, + f"NUMBER OF MAX EPOCHS WITH EARLY STOPPING =\ + {NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING} \n", + ) + write_to_file(OUTPUT_FILE, f"PATIENCE = {PATIENCE} \n") + write_to_file(OUTPUT_FILE, f"OPTIMIZED METRIC = {OPTIMIZED_METRIC} \n") + write_to_file(OUTPUT_FILE, f"BATCH SIZE = {BATCH_SIZE} \n") + write_to_file(OUTPUT_FILE, f"Number of samples = {num_samples} \n") + + # Read multi-model data from the CSV file + data = parse_multimodal_data(multimodel_csv_file) + + # Limit data to the specified number of samples + data = data[:num_samples] + + # Distribute data into training, validation, and testing sets + train_samples, val_samples, test_samples = distribute_data( + data, + train_indices, + test_indices, + TRAIN_RATIO, + VAL_RATIO, + ) + + # Log the training and testing indices + write_to_file( + OUTPUT_FILE, + f"Multimodel training with train indices {train_indices},\ + test with test indices {test_indices} \n", + ) + + # Process the multi-models for training, validation, and testing + general_process_multimodels_train_val_test( + train_samples, + val_samples, + test_samples, + ) + + +def split_train_val_test(data, train_ratio, val_ratio): + """ + Splits the dataset into training, validation, and test sets based on + specified ratios. + + Args: + data (list): The dataset to be split. + train_ratio (float): The ratio of the dataset to be used for training. + val_ratio (float): The ratio of the dataset to be used for validation. + + Returns: + tuple: A tuple containing three lists + (train_data, val_data, test_data). + """ + # Calculate the number of samples for the training set + num_train_samples = int(len(data) * train_ratio) + + # Calculate the number of samples for the validation set + num_val_samples = int(len(data) * val_ratio) + + # Split the data into training, validation, and test sets + train_data = data[:num_train_samples] + val_data = data[num_train_samples : (num_train_samples + num_val_samples)] + test_data = data[(num_train_samples + num_val_samples) :] + + return train_data, val_data, test_data diff --git a/src/texts/SimLLM/Refactor/proofreading.py b/src/texts/SimLLM/Refactor/proofreading.py new file mode 100644 index 0000000000000000000000000000000000000000..b524388721cab4a7f279411e9823f296e052f7d4 --- /dev/null +++ b/src/texts/SimLLM/Refactor/proofreading.py @@ -0,0 +1,354 @@ +import os + +from config import ( + CHATGPT, + GEMINI, + GEMINI_MODEL, + IS_OUTPUT_NORMALIZATION, + MODEL_PATHS, + OPENAI_MODEL, + TEMPERATURE, + TOGETHER_API_KEY, + TOGETHER_PATH, +) +from evaluation import extract_by_best_similarity +from openai import OpenAI +from utils import ( + generate_column_names, + generate_file_name, + get_column, + normalize_text, + print_and_log, + read_csv_data, + write_new_data, + write_to_csv, +) + + +def abstract_proofread(model_path, temperature, base_url, api_key, prompt): + """ + Function to proofread an abstract using an AI language model. + + Parameters: + model_path (str): The path or identifier of the AI model to use. + temperature (float): Sampling temperature for the model's output. + base_url (str): The base URL for the API endpoint. + api_key (str): The API key for authentication. + prompt (str): The text prompt to provide to the AI for proofreading. + + Returns: + str: The proofread abstract generated by the AI model. 
+ """ + # Initialize the AI client with the provided API key and base URL + ai_client = OpenAI(api_key=api_key, base_url=base_url) + + # Create a chat completion request with the system message and user prompt + chat_completion = ai_client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are an AI assistant", + }, + { + "role": "user", + "content": prompt, + }, + ], + model=model_path, + max_tokens=1024, + temperature=temperature, + ) + + # Return the content of the first choice's message + return chat_completion.choices[0].message.content + + +def proofread_by_model_name(model_name, input_text, normalize_output): + """ + Proofreads the given input text using the specified model. + + Args: + model_name (str): The name of the model to use for proofreading. + input_text (str): The text to be proofread. + normalize_output (bool): Whether to normalize the output or not. + + Returns: + str: The proofread text. + """ + # Constants for API access + base_url = TOGETHER_PATH + api_key = TOGETHER_API_KEY + temperature = TEMPERATURE + + # Retrieve the model path from the dictionary + if model_name in MODEL_PATHS: + model_path = MODEL_PATHS[model_name] + else: + raise ValueError("Model name not found in the dictionary.") + + # Formulate the prompt for the model + prompt = f"Proofreading for the text: ```{input_text}```" + + # Apply output normalization if required + if normalize_output: + prompt = output_normalization(prompt) + + # Debugging: Print the prompt + print(f"Prompt: {prompt}") + + # Call the abstract proofreading function with the prepared parameters + return abstract_proofread( + model_path, + temperature, + base_url, + api_key, + prompt, + ) + + +def gemini_proofread(input_text, normalize_output): + """ + Proofreads the given text using the GEMINI_MODEL. + + Parameters: + input_text (str): The text to be proofread. + normalize_output (bool): Flag indicating whether to normalize the output. + + Returns: + str: The proofread text. + """ + prompt = f"Proofreading for the text: ```{input_text}```" + if normalize_output: + prompt = output_normalization(prompt) + response = GEMINI_MODEL.generate_content(prompt) + return response.text + + +def chatGPT_proofread(input_text, normalize_output): + """ + Proofreads the given text using the chat_model. + + Parameters: + input_text (str): The text to be proofread. + normalize_output (bool): Flag indicating whether to normalize the output. + + Returns: + str: The proofread text. + """ + prompt = f"Proofreading for the text: ```{input_text}```" + if normalize_output: + prompt = output_normalization(prompt) + + print(f"Starting API call with prompt: {prompt}") + result = OPENAI_MODEL.predict(prompt) + print(f"Ending API call with prompt: {prompt}") + + return result + + +def output_normalization(prompt): + """ + Normalizes the output by appending a specific instruction to the prompt. + + Parameters: + prompt (str): The initial prompt. + + Returns: + str: The modified prompt. + """ + return ( + prompt + + " Please only output the proofread text without any explanation." + ) + + +def proofread_with_best_similarity(input_text, model_kind): + """ + Proofreads the input text using the specified model and extracts the + best-corrected text based on similarity. + + Args: + input_text (str): The original text to be proofread. + model_kind (str): The kind of model to use for proofreading + (e.g., CHATGPT, GEMINI). + + Returns: + tuple: A tuple containing the raw proofread text and the + best-corrected text. 
+ """ + + # Normalize the input text + normalized_input_text = normalize_text(input_text) + print_and_log(f"INPUT = {normalized_input_text}") + + result_text = "" + raw_text = "" + + for i in range( + 1, + ): # Loop is redundant as it runs only once; + # consider removing if unnecessary + # Select the proofreading model based on model_kind + if model_kind == CHATGPT: + raw_text = chatGPT_proofread( + normalized_input_text, + normalize_output=IS_OUTPUT_NORMALIZATION, + ) + elif model_kind == GEMINI: + raw_text = gemini_proofread( + normalized_input_text, + normalize_output=IS_OUTPUT_NORMALIZATION, + ) + else: + raw_text = proofread_by_model_name( + model_kind, + normalized_input_text, + normalize_output=IS_OUTPUT_NORMALIZATION, + ) + + # Extract the best candidate text based on similarity + result_text = extract_by_best_similarity( + normalized_input_text, + raw_text, + ) + + # Log the raw and result texts + print_and_log(f"RAW_{i} = {raw_text}") + print + # Normalize the result text + result_text = normalize_text(result_text) + + # If a valid result is obtained, return it + if result_text != "": + return raw_text, result_text + + # Return the raw and result texts + return raw_text, result_text + + +def generate_new_data_with_best_similarity( + existing_data_file, + existing_kinds, + new_kinds, +): + """ + Generates new data with the best similarity based on existing and new + kinds, and writes the results to a CSV file. + + Args: + existing_data_file (str): The path to the existing data file. + existing_kinds (list): A list of existing kinds. + new_kinds (list): A list of new kinds. + + Returns: + None + """ + + # Combine existing and new kinds into a single list + all_kinds = existing_kinds + new_kinds + + # Generate column names for the CSV file + column_names = generate_column_names(all_kinds) + + # Generate column names for existing kinds + existing_column_names = generate_column_names(existing_kinds) + + # Generate the output file name + output_file = generate_file_name( + existing_data_file, + existing_kinds, + new_kinds, + ) + + # Create the output file with column names if it doesn't exist + if not os.path.exists(output_file): + write_to_csv(output_file, column_names) + + # Read existing data from the file + existing_data = { + kind: get_column(existing_data_file, kind) + for kind in existing_column_names + } + + # Read input data from the output file + input_data = read_csv_data(output_file) + start_index = len(input_data) + print(f"start_index = {start_index}") + + num_rows = len(existing_data["human"]) + global_generate_set = [] + global_reuse = [] + + for index in range(start_index, num_rows): + # Initialize generation and reuse sets + generate_set = [] + reuse_set = [] + + # Prepare the current generation dictionary + current_generation = { + kind: existing_data[kind][index] for kind in existing_column_names + } + print(f"current_generation before generation = {current_generation}") + + human_text = current_generation["human"] + + # Generate new kinds based on human text + for kind in new_kinds: + _, generated_text = proofread_with_best_similarity( + human_text, + kind, + ) + current_generation[kind] = generated_text + generate_set.append(kind) + + print(f"current_generation after generate one = {current_generation}") + + # Generate combinations of kinds + for first_kind in all_kinds: + for second_kind in all_kinds: + combination_name = f"{first_kind}_{second_kind}" + + if combination_name not in current_generation: + if ( + first_kind in current_generation + and 
current_generation[first_kind] == human_text + ): + generated_text = current_generation[second_kind] + reuse_set.append( + f"{combination_name} from {second_kind}", + ) + else: + is_need_generation = True + for first_kind_2 in all_kinds: + if ( + first_kind != first_kind_2 + and current_generation[first_kind] + == current_generation[first_kind_2] + ): + combination_name_2 = ( + f"{first_kind_2}_{second_kind}" + ) + if combination_name_2 in current_generation: + generated_text = current_generation[ + combination_name_2 + ] + reuse_set.append( + f"{combination_name} from {combination_name_2}", # noqa: E501 + ) + is_need_generation = False + break + if is_need_generation: + _, generated_text = proofread_with_best_similarity( + current_generation[first_kind], + second_kind, + ) + generate_set.append(f"{first_kind}_{second_kind}") + + current_generation[combination_name] = generated_text + + # Write the current generation to the output file + write_new_data(output_file, current_generation, column_names) + + # Update global sets + global_generate_set.append(generate_set) + global_reuse diff --git a/src/texts/SimLLM/Refactor/readme.md b/src/texts/SimLLM/Refactor/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..13dbb68d614b965359ee4df04bdab6910db33f8c --- /dev/null +++ b/src/texts/SimLLM/Refactor/readme.md @@ -0,0 +1,67 @@ +# [Text] SimLLM: Detecting Sentences Generated by Large Language Models Using Similarity between the Generation and its Re-Generation + +## **Getting Started** +1. **Clone the repository:** + ```bash + git clone https://github.com/Tokyo-Techies/prj-nict-ai-content-detection + ``` + +2. **Set up the environment:** +Using virtual environment: + ```bash + python -m venv .venv + source .venv/bin/activate + ``` + +3. **Install dependencies:** + ```bash + pip install -r requirements.txt + ``` + + +4. **API Keys** (optional) + - Obtain API keys for the corresponding models and insert them into the `SimLLM.py` file: + - ChatGPT: [OpenAI API](https://openai.com/index/openai-api/) + - Gemini: [Google Gemini API](https://ai.google.dev/gemini-api/docs/api-key) + - Other LLMs: [Together API](https://api.together.ai/) + + +5. **Run the project:** + ```bash + main_text.py + ``` + +### Parameters + +- `LLMs`: List of large language models to use. Available models include 'ChatGPT', 'Yi', 'OpenChat', 'Gemini', 'LLaMa', 'Phi', 'Mixtral', 'QWen', 'OLMO', 'WizardLM', and 'Vicuna'. Default is `['ChatGPT', 'Yi', 'OpenChat']`. +- `train_indexes`: List of LLM indexes for training. Default is `[0, 1, 2]`. +- `test_indexes`: List of LLM indexes for testing. Default is `[0]`. +- `num_samples`: Number of samples. Default is 5000. + +### Examples + +- Running with default parameters: + `python SimLLM.py` + +- Running with customized parameters: + `python SimLLM.py --LLMs ChatGPT --train_indexes 0 --test_indexes 0` + +## Dataset + +The `dataset.csv` file contains both human and generated texts from 12 large language models, including: +ChatGPT, GPT-4o, Yi, OpenChat, Gemini, LLaMa, Phi, Mixtral, QWen, OLMO, WizardLM, and Vicuna. 
+ +## Citation + +```bibtex +@inproceedings{nguyen2024SimLLM, + title={SimLLM: Detecting Sentences Generated by Large Language Models Using Similarity between the Generation and its Re-generation}, + author={Nguyen-Son, Hoang-Quoc and Dao, Minh-Son and Zettsu, Koji}, + booktitle={The Conference on Empirical Methods in Natural Language Processing}, + year={2024} +} +``` + +## Acknowledgements + +- BARTScore: [BARTScore GitHub Repository](https://github.com/neulab/BARTScore) diff --git a/src/texts/SimLLM/Refactor/utils.py b/src/texts/SimLLM/Refactor/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..984125340d28f0d9354e29fc8d56e15729474a64 --- /dev/null +++ b/src/texts/SimLLM/Refactor/utils.py @@ -0,0 +1,527 @@ +import csv +import logging +import os +import random + +import nltk +import numpy as np +import pandas as pd +from config import ( # LOG_FILE, + API_ERROR, + IGNORE_BY_API_ERROR, + SEED, +) +from datasets import load_dataset + + +def print_and_log(message: str): + # TODO: redefine logging + """ + Log message. + + Args: + message (str): The message to be printed and logged. + """ + logging.info(message) + + +def write_to_file(filename: str, content: str): + """ + Writes the given content to a specified file. + + Args: + filename (str): The path to the file to write content. + content (str): The content to be written. + """ + print(content) + with open(filename, "a+", encoding="utf-8") as file: + file.write(content) + + +def write_new_data( + output_file: str, + current_data: dict, + column_names: list, +) -> None: + """ + Writes a new row of data to a CSV file. + + Args: + output_file (str): The path to the output CSV file. + current_data (dict): A dictionary containing the data to be written. + column_names (list): A list of column names in the desired order. + + Returns: + None + """ + # Extract data in the specified order based on column names + data_row = [current_data[column] for column in column_names] + + # Write the data row to the CSV file + write_to_csv(output_file, data_row) + + +def write_to_csv(filename: str, row_data: list) -> None: + """ + Appends a row of data to a CSV file. + + Args: + filename (str): The name of the CSV file. + row_data: A list of values to be written as a row. + + Returns: + None + """ + # Open the CSV file in append mode, creating it if it doesn't exist + with open(filename, "a+", encoding="UTF8", newline="") as file: + writer = csv.writer(file) + writer.writerow(row_data) + + +def count_csv_lines(filename: str) -> int: + """Counts the number of lines in a CSV file, excluding the header row. + + Args: + filename (str): The path to the CSV file. + + Returns: + int: The number of lines in the CSV file, excluding the header row. + """ + file_data = pd.read_csv(filename, sep=",").values + return len(file_data) + + +def read_csv_data(input_file: str) -> np.ndarray: + """ + Reads data from a specified CSV file. + + Args: + file_path (str): The path to the CSV file. + + Returns: + numpy.ndarray: The data from the CSV file. + """ + file_data = pd.read_csv( + input_file, + dtype="string", + keep_default_na=False, + sep=",", + ).values + return file_data + + +def get_column(input_file: str, column_name: str) -> np.ndarray: + """ + Retrieves a specific column from a CSV file as a NumPy array. + + Args: + input_file (str): The path to the CSV file. + column_name (str): The name of the column to extract. + + Returns: + np.ndarray: Values from the specified column. 
+ """ + # Read CSV, preserving string data types and handling missing values + df = pd.read_csv( + input_file, + dtype="string", + keep_default_na=False, + sep=",", + ) + + # Extract the specified column as a NumPy array + column_data = df[column_name].values + return column_data + + +def generate_column_names(categories: list) -> list: + """ + Generates column names for a pairwise comparison matrix. + + Args: + categories (list): A list of categories. + + Returns: + list: A list of column names, + including a 'human' column and pairwise combinations. + """ + column_names = ["human"] + + # Add individual category names as column names + column_names.extend(categories) + + # Add pairwise combinations of categories as column names + for i in categories: + for j in categories: + column_names.append(f"{i}_{j}") + + # TODO: improve? + # for i in range(len(categories)): + # for j in range(i + 1, len(categories)): + # column_names.append(f"{categories[i]}_{categories[j]}") + + return column_names + + +def normalize_text(input_text: str) -> str: + """ + Normalizes the given text by removing unnecessary characters and + formatting it for better readability. + + Args: + input_text (str): The input text to be normalized. + + Returns: + The normalized text. + + This function performs the following transformations: + 1. Strips leading and trailing whitespace + 2. Removes double asterisks (`**`) + 3. Replaces newlines with spaces + 4. Removes extra spaces + """ + processed_text = input_text.strip() + processed_text = processed_text.replace("**", "") + processed_text = processed_text.replace("\n", " ") + processed_text = processed_text.replace(" ", " ") # Remove extra spaces + # TODO: what if 3 or more spaces + return processed_text + + +def refine_candidate_text(input_text: str, candidate_text: str) -> str: + # TODO: how different with processing text + """ + Removes specific surrounding marks from the candidate text if they are + present in the input text with an excess of exactly two occurrences. + + Args: + input_text (str): The original text. + candidate (str): The candidate text to be refined. + + Returns: + str: The refined candidate text. + """ + + # Create a copy of the candidate string and strip whitespace + refined_candidate = candidate_text.strip() + + # Iterate through each mark + for mark in ["```", "'", '"']: + # Count occurrences of the mark in input_text and refined_candidate + count_input_text = input_text.count(mark) + count_refined_candidate = refined_candidate.count(mark) + + # Check if the mark should be stripped + if ( + count_refined_candidate == count_input_text + 2 + and refined_candidate.startswith(mark) + and refined_candidate.endswith(mark) + ): + # Strip the mark from both ends of the refined_candidate + refined_candidate = refined_candidate.strip(mark) + + return refined_candidate + + +def generate_file_name( + existing_data_file: str, + existing_kinds: list, + new_kinds: list, +) -> str: + """ + Generates a new file name based on the path of an existing data file and a + combination of existing and new kinds. + + Args: + existing_data_file (str): The path to the existing data file. + existing_kinds (list): A list of existing kinds. + new_kinds (list): A list of new kinds. + + Returns: + str: The generated file name with the full path. 
+ """ + + # Combine existing and new kinds into a single list + combined_kinds = existing_kinds + new_kinds + + # Get the directory path of the existing data file + directory_path = os.path.dirname(existing_data_file) + + # Create a new file name by joining the kinds with underscores and adding + # a suffix + # TODO: move to config file + new_file_name = "_".join(combined_kinds) + "_with_best_similarity.csv" + + # Combine the directory path with the new file name to get the full output + # file path + output_file_path = os.path.join(directory_path, new_file_name) + + return output_file_path + + +def shuffle(data: list[list], seed: int) -> None: + """ + Shuffles the elements within each sublist of the given data structure. + + Args: + data (list of lists): The array containing sublists to shuffle. + seed (int): The seed value for the random number generator. + + Returns: + None + """ + for sublist in data: + random.Random(seed).shuffle(sublist) + + +def generate_human_with_shuffle( + dataset_name: str, + column_name: str, + num_samples: int, + output_file: str, +) -> None: + """ + Generates a shuffled list of sentences from the dataset and writes them to + a CSV file. + + Args: + dataset_name (str): The name of the dataset to load. + column_name (str): The column name to extract sentences from. + num_samples (int): The number of samples to process. + output_file (str): The path to the output CSV file. + + Returns: + None + """ + # Load the dataset + dataset = load_dataset(dataset_name) + data = dataset["train"] + + lines = [] + # Tokenize sentences and add to the lines list + for sample in data: + nltk_tokens = nltk.sent_tokenize(sample[column_name]) + lines.extend(nltk_tokens) + + # Filter out empty lines + filtered_lines = [line for line in lines if line != ""] + lines = filtered_lines + + # Shuffle the lines + shuffle([lines], seed=SEED) + + # Ensure the output file exists and write the header if it doesn't + if not os.path.exists(output_file): + header = ["human"] + write_to_csv(output_file, header) + + # Get the number of lines already processed in the output file + number_of_processed_lines = count_csv_lines(output_file) + + # Print the initial lines to be processed + print(f"Lines before processing: {lines[:num_samples]}") + + # Slice the lines list to get the unprocessed lines + lines = lines[number_of_processed_lines:num_samples] + + # Print the lines after slicing + print(f"Lines after slicing: {lines}") + + # Process each line and write to the output file + for index, human in enumerate(lines): + normalized_text = normalize_text(human) + output_data = [normalized_text] + write_to_csv(output_file, output_data) + print( + f"Processed {index + 1} / {len(lines)};\ + Total processed:\ + {number_of_processed_lines + index + 1} / {num_samples}", + ) + + +def split_data(data: list, train_ratio: float) -> list[list, list]: + """ + Splits a dataset into training and testing sets. + + Args: + data (list): The input dataset. + train_ratio (float): The proportion of data to use for training. + + Returns: + The training and testing sets. + """ + + # Calculate the number of samples for training + train_size = int(len(data) * train_ratio) + + # Split the data into training and testing sets + train_data = data[:train_size] + test_data = data[train_size:] + + return train_data, test_data + + +def combine_text_with_BERT_format(text_list: list[str]) -> str: + """ + Formats a list of texts into a single string suitable for BERT input. + + Args: + text_list (list[str]): A list of text strings. 
+
+ Returns:
+ str: A single string formatted with BERT's special tokens.
+ """
+ # Join the texts with RoBERTa special tokens; consecutive segments are
+ # separated by a double separator: <s>t0</s></s>t1</s>...</s>tn</s>
+ formatted_text = "<s>" + "</s></s>".join(text_list) + "</s>"
+ return formatted_text
+
+
+def check_api_error(data: list) -> bool:
+ """
+ Checks if the given data contains an API error or an indication to ignore
+ an API error.
+
+ Args:
+ data (list): A list of items to check.
+
+ Returns:
+ bool: True if an API error or ignore indication is found,
+ False otherwise.
+ """
+ for item in data:
+ # Check for API error indicators
+ if item in (API_ERROR, IGNORE_BY_API_ERROR):
+ return True # Return True if at least an error indicator is found
+ return False # Return False if no error indicators are found
+
+
+def calculate_required_models(num_columns: int) -> int:
+ """
+ Calculates the minimum number of models required to generate the specified number of columns.
+
+ Args:
+ num_columns (int): The total number of columns to generate.
+
+ Returns:
+ int: The minimum number of models required.
+
+ Raises:
+ ValueError: If the number of columns cannot be achieved with the current model configuration.
+ """
+
+ num_models = 0
+ count_human = 1 # Initial count representing human input
+
+ # TODO: simplify this function
+ while True:
+ count_single = num_models # Single model count
+ count_pair = num_models * num_models # Pair model count
+
+ total_count = count_human + count_single + count_pair
+
+ if total_count == num_columns:
+ return num_models
+ elif total_count > num_columns:
+ raise ValueError(
+ "Cannot calculate the number of models to match the number of columns", # noqa: E501
+ )
+
+ num_models += 1
+
+
+def parse_multimodal_data(multimodel_csv_file: str) -> list:
+ """
+ Parses multimodal data from a CSV file into a structured format.
+
+ Args:
+ multimodel_csv_file (str): Path to the CSV file.
+
+ Returns:
+ list: A list of dictionaries, each containing 'human', 'single', and
+ 'pair' keys.
+
+ Raises:
+ Exception: If there is an error in reading the CSV file or processing
+ the data.
+ """
+ # TODO: simplify this function
+
+ # Read CSV data into a list of lists
+ input_data = read_csv_data(multimodel_csv_file)
+
+ # Initialize the result list
+ structured_data = []
+
+ # Calculate the number of models based on the number of columns in the first row # noqa: E501
+ num_models = calculate_required_models(len(input_data[0]))
+
+ # Process each row in the input data
+ for row in input_data:
+ row_data = {}
+ index = 0
+
+ # Extract human data
+ row_data["human"] = row[index]
+ index += 1
+
+ # Extract single model data
+ single_model_data = []
+ for _ in range(num_models):
+ single_model_data.append(row[index])
+ index += 1
+ row_data["single"] = single_model_data
+
+ # Extract pair model data
+ pair_model_data = []
+ for _ in range(num_models):
+ sub_pair_data = []
+ for _ in range(num_models):
+ sub_pair_data.append(row[index])
+ index += 1
+ pair_model_data.append(sub_pair_data)
+ row_data["pair"] = pair_model_data
+
+ # Append the structured row data to the result list
+ structured_data.append(row_data)
+
+ return structured_data
+
+
+def check_error(data_item: dict) -> bool:
+ """
+ Checks if the given data item contains any API errors.
+ An API error is indicated by a specific error message
+ or code within the text.
+
+ Args:
+ data_item (dict): A dictionary containing 'human', 'single',
+ and 'pair' fields.
+
+ Returns:
+ bool: True if an API error is found, otherwise False.
+ """
+ # Check for API error in the 'human' field
+ if check_api_error(data_item["human"]):
+ return True
+
+ # Check for API error in the 'single' model data
+ for single_text in data_item["single"]:
+ if check_api_error(single_text):
+ return True
+
+ # Get the number of models from the 'single' model data
+ num_models = len(data_item["single"])
+
+ # Check for API error in the 'pair' model data
+ for i in range(num_models):
+ for j in range(num_models):
+ if check_api_error(data_item["pair"][i][j]):
+ return True
+
+ # No errors found
+ return False
diff --git a/src/texts/SimLLM/SimLLM.py b/src/texts/SimLLM/SimLLM.py new file mode 100644 index 0000000000000000000000000000000000000000..851996fdcb98edde8d327a712954168227bf9747 --- /dev/null +++ b/src/texts/SimLLM/SimLLM.py @@ -0,0 +1,1667 @@
+
+import os
+import shutil
+import random
+import pandas as pd
+import numpy as np
+import nltk
+import google.generativeai as genai
+import csv
+from transformers import (
+ AutoTokenizer,
+ DataCollatorWithPadding,
+ AutoModelForSequenceClassification,
+ EarlyStoppingCallback,
+ TrainerCallback,
+ TrainingArguments,
+ Trainer
+)
+from openai import OpenAI
+from sklearn.neural_network import MLPClassifier
+from sklearn.metrics import roc_auc_score, accuracy_score
+from os.path import join
+from langchain.chat_models import ChatOpenAI
+from datasets import load_metric, load_dataset, Dataset
+from copy import deepcopy
+from bart_score import BARTScorer
+import argparse
+
+# Constants
+TOGETHER_API_KEY = "your_together_api_key"
+OPENAI_API_KEY = "your_openai_api_key" # Placeholder; load the real key from an environment variable, never commit it
+GEMINI_API_KEY = "your_gemini_key"
+LOG_FILE = "data/99_log.txt"
+OUTPUT_FILE = "data/result.txt"
+METRIC_NAME = "roc_auc"
+
+TRAIN_RATIO = 0.8
+VAL_RATIO = 0.1
+NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING = 10
+PATIENCE = 3
+BATCH_SIZE = 8
+OPTIMIZED_METRIC = "roc_auc"
+SEED = 0
+TEMPERATURE = 0.0
+IS_OUTPUT_NORMALIZATION = False
+RATIO = 0.9
+HUMAN_LABEL = 0
+MACHINE_LABEL = 1
+BART = "bart"
+
+MULTIMODEL = "multimodel"
+SINGLE_FROM_MULTIMODEL = "single_from_multimodel"
+
+# Environment setup
+os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
+os.environ['CURL_CA_BUNDLE'] = ''
+os.environ['REQUESTS_CA_BUNDLE'] = ''
+
+# Download necessary NLTK data
+nltk.download('punkt')
+nltk.download('punkt_tab')
+
+# Chat model configurations
+chat_model = ChatOpenAI(temperature=TEMPERATURE, model_name="gpt-3.5-turbo-0125")
+
+# API Models and Paths
+CHATGPT = "ChatGPT"
+GEMINI = "Gemini"
+# LLAMA_2_70_CHAT_TEMP_0 = "LLaMa"
+API_ERROR = "API_ERROR"
+IGNORE_BY_API_ERROR = "IGNORE_BY_API_ERROR"
+
+# Initialize BARTScorer
+bart_scorer = BARTScorer(device='cuda:0', checkpoint="facebook/bart-large-cnn")
+
+# Generative AI configuration
+genai.configure(api_key=GEMINI_API_KEY, transport='rest')
+generation_config = {
+ "temperature": TEMPERATURE,
+}
+GEMINI_MODEL = genai.GenerativeModel('gemini-pro', generation_config=generation_config)
+
+# Model paths
+MODEL_PATHS = {
+ "LLaMa": "meta-llama/Llama-2-70b-chat-hf",
+ "QWEN": "Qwen/Qwen1.5-72B-Chat",
+ "Yi": "NousResearch/Nous-Hermes-2-Yi-34B",
+ "Mixtral": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+ "OLMo": "allenai/OLMo-7B-Instruct",
+ "Phi": "microsoft/phi-2",
+ "OpenChat": "openchat/openchat-3.5-1210",
+ "WizardLM": "WizardLM/WizardLM-13B-V1.2",
+ "Vicuna": "lmsys/vicuna-13b-v1.5"
+}
+
+TOGETHER_PATH
='https://api.together.xyz' + +# Roberta model configurations +ROBERTA_BASE = "roberta-base" +ROBERTA_LARGE = "roberta-large" +ROBERTA_MODEL_PATHS = { + ROBERTA_BASE: "roberta-base", + ROBERTA_LARGE: "roberta-large" +} +LEARNING_RATES = { + ROBERTA_BASE: 2e-5, + ROBERTA_LARGE: 8e-6 +} +MODEL_NAME = ROBERTA_BASE + + + +# Tokenizer initialization +tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MODEL_PATHS[MODEL_NAME]) + +# Custom callback for Trainer +class CustomCallback(TrainerCallback): + """ + Custom callback to evaluate the training dataset at the end of each epoch. + """ + def __init__(self, trainer) -> None: + super().__init__() + self._trainer = trainer + + def on_epoch_end(self, args, state, control, **kwargs): + """ + At the end of each epoch, evaluate the training dataset. + """ + if control.should_evaluate: + control_copy = deepcopy(control) + self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train") + return control_copy + +# Metric loading +metric = load_metric(METRIC_NAME) + +def compute_metrics(evaluation_predictions): + """ + Function to compute evaluation metrics for model predictions. + + Parameters: + evaluation_predictions (tuple): A tuple containing two elements: + - predictions (array-like): The raw prediction scores from the model. + - labels (array-like): The true labels for the evaluation data. + + Returns: + dict: A dictionary containing the computed evaluation metrics. + """ + # Unpack predictions and labels from the input tuple + raw_predictions, true_labels = evaluation_predictions + + # Convert raw prediction scores to predicted class labels + predicted_labels = np.argmax(raw_predictions, axis=1) + + # Compute and return the evaluation metrics + return metric.compute(prediction_scores=predicted_labels, references=true_labels, average="macro") + + +def abstract_proofread(model_path, temperature, base_url, api_key, prompt): + """ + Function to proofread an abstract using an AI language model. + + Parameters: + model_path (str): The path or identifier of the AI model to use. + temperature (float): Sampling temperature for the model's output. + base_url (str): The base URL for the API endpoint. + api_key (str): The API key for authentication. + prompt (str): The text prompt to provide to the AI for proofreading. + + Returns: + str: The proofread abstract generated by the AI model. + """ + # Initialize the AI client with the provided API key and base URL + ai_client = OpenAI(api_key=api_key, base_url=base_url) + + # Create a chat completion request with the system message and user prompt + chat_completion = ai_client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are an AI assistant", + }, + { + "role": "user", + "content": prompt, + } + ], + model=model_path, + max_tokens=1024, + temperature=temperature, + ) + + # Return the content of the first choice's message + return chat_completion.choices[0].message.content + + + +def proofread_by_model_name(model_name, input_text, normalize_output): + """ + Proofreads the given input text using the specified model. + + Args: + model_name (str): The name of the model to use for proofreading. + input_text (str): The text to be proofread. + normalize_output (bool): Whether to normalize the output or not. + + Returns: + str: The proofread text. 
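+
+ Example (illustrative; requires a valid Together API key, and the output shown is hypothetical):
+ >>> proofread_by_model_name("Yi", "He go to school every days.", normalize_output=False)
+ 'He goes to school every day.'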
+ """ + # Constants for API access + base_url = TOGETHER_PATH + api_key = TOGETHER_API_KEY + temperature = TEMPERATURE + + # Retrieve the model path from the dictionary + if model_name in MODEL_PATHS: + model_path = MODEL_PATHS[model_name] + else: + raise ValueError("Model name not found in the dictionary.") + + # Formulate the prompt for the model + prompt = f"Proofreading for the text: ```{input_text}```" + + # Apply output normalization if required + if normalize_output: + prompt = output_normalization(prompt) + + # Debugging: Print the prompt + print(f"Prompt: {prompt}") + + # Call the abstract proofreading function with the prepared parameters + return abstract_proofread(model_path, temperature, base_url, api_key, prompt) + + +def gemini_proofread(input_text, normalize_output): + """ + Proofreads the given text using the GEMINI_MODEL. + + Parameters: + input_text (str): The text to be proofread. + normalize_output (bool): Flag indicating whether to normalize the output. + + Returns: + str: The proofread text. + """ + prompt = f"Proofreading for the text: ```{input_text}```" + if normalize_output: + prompt = output_normalization(prompt) + response = GEMINI_MODEL.generate_content(prompt) + return response.text + +def print_and_log(message): + """ + Prints and logs the given message to a log file. + + Parameters: + message (str): The message to be printed and logged. + """ + print(message) + with open(LOG_FILE, "a+", encoding='utf-8') as log_file: + log_file.write(message + "\n") + +def write_to_file(filename, content): + """ + Writes the given content to a specified file. + + Parameters: + filename (str): The name of the file to write to. + content (str): The content to be written. + """ + print(content) + with open(filename, "a+", encoding='utf-8') as file: + file.write(content) + +def output_normalization(prompt): + """ + Normalizes the output by appending a specific instruction to the prompt. + + Parameters: + prompt (str): The initial prompt. + + Returns: + str: The modified prompt. + """ + return prompt + " Please only output the proofread text without any explanation." + +def chatGPT_proofread(input_text, normalize_output): + """ + Proofreads the given text using the chat_model. + + Parameters: + input_text (str): The text to be proofread. + normalize_output (bool): Flag indicating whether to normalize the output. + + Returns: + str: The proofread text. + """ + prompt = f"Proofreading for the text: ```{input_text}```" + if normalize_output: + prompt = output_normalization(prompt) + + print(f"Starting API call with prompt: {prompt}") + result = chat_model.predict(prompt) + print(f"Ending API call with prompt: {prompt}") + + return result + +def normalize_text(input_text): + """ + Normalizes the given text by removing certain characters and extra spaces. + + Parameters: + input_text (str): The text to be normalized. + + Returns: + str: The normalized text. + """ + result = input_text.strip() + result = result.replace("**", "") + result = result.replace("\n", " ") + result = result.replace(" ", " ") # Remove extra spaces + return result + +def write_to_csv(filename, row_data): + """ + Writes a row of data to a specified CSV file. + + Parameters: + filename (str): The name of the CSV file. + row_data (list): The row data to be written. + """ + with open(filename, 'a+', encoding='UTF8', newline='') as file: + writer = csv.writer(file) + writer.writerow(row_data) + +def number_of_csv_lines(filename): + """ + Returns the number of lines in a specified CSV file. 
+ + Parameters: + filename (str): The name of the CSV file. + + Returns: + int: The number of lines in the CSV file. + """ + file_data = pd.read_csv(filename, sep=',').values + return len(file_data) + +def read_csv_data(input_file): + """ + Reads data from a specified CSV file. + + Parameters: + input_file (str): The name of the CSV file. + + Returns: + numpy.ndarray: The data read from the CSV file. + """ + file_data = pd.read_csv(input_file, dtype='string', keep_default_na=False, sep=',').values + return file_data + +def bart_score(text_1, text_2): + """ + Computes the BART score between two texts. + + Parameters: + text_1 (str): The first text. + text_2 (str): The second text. + + Returns: + float: The BART score. + """ + score = bart_scorer.score([text_1], [text_2]) + return score + +def check_bart_score(input_text, raw_text): + """ + Checks if the BART score between input_text and raw_text is above a threshold. + + Parameters: + input_text (str): The input text. + raw_text (str): The raw text to compare against. + + Returns: + bool: True if the score is above the threshold, False otherwise. + """ + THRESHOLD = -2.459 + normalized_text = normalize_text(raw_text) + score = bart_score(input_text, normalized_text)[0] + return score >= THRESHOLD + +def get_column(input_file, column_name): + """ + Retrieves a specific column from a CSV file. + + Parameters: + input_file (str): The name of the CSV file. + column_name (str): The name of the column to retrieve. + + Returns: + numpy.ndarray: The values from the specified column. + """ + df = pd.read_csv(input_file, dtype='string', keep_default_na=False, sep=',') + column_data = df[column_name] + return column_data.values + +def generate_column_names(categories): + """ + Generates a list of column names based on given categories. + + Parameters: + categories (list): The list of categories. + + Returns: + list: The generated list of column names. + """ + column_names = ['human'] + for name in categories: + column_names.append(name) + for first in categories: + for second in categories: + column_names.append(f"{first}_{second}") + return column_names + +def write_new_data(output_file, current_data, column_names): + """ + Writes new data to a CSV file based on current data and column names. + + Parameters: + output_file (str): The name of the output CSV file. + current_data (dict): The current data to be written. + column_names (list): The list of column names. + """ + data_row = [current_data[column] for column in column_names] + write_to_csv(output_file, data_row) + +def refine(input_text, candidate): + """ + Refines the candidate string by removing specific surrounding marks if they are present + in the input_text with a count difference of exactly 2. + + Args: + input_text (str): The original text. + candidate (str): The candidate text to be refined. + + Returns: + str: The refined candidate text. 
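+
+ Example: a candidate wrapped in one extra pair of quotes is unwrapped:
+ >>> refine('He go to school.', '"He goes to school."')
+ 'He goes to school.'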
+ """ + + # Create a copy of the candidate string and strip whitespace + refined_candidate = candidate.strip() + + # List of marks to check and potentially remove + marks = ["```", "'", '"'] + + # Iterate through each mark + for mark in marks: + # Count occurrences of the mark in input_text and refined_candidate + count_input_text = input_text.count(mark) + count_refined_candidate = refined_candidate.count(mark) + + # Check if the mark should be stripped + if (count_refined_candidate == count_input_text + 2 and + refined_candidate.startswith(mark) and + refined_candidate.endswith(mark)): + # Strip the mark from both ends of the refined_candidate + refined_candidate = refined_candidate.strip(mark) + + return refined_candidate + + +def extract_by_best_similarity(input_text, raw_text): + """ + Extracts the best candidate string from the raw text based on the highest similarity score + compared to the input text. The similarity score is calculated using the BART score. + + Args: + input_text (str): The original text. + raw_text (str): The raw text containing multiple candidate strings. + + Returns: + str: The best candidate string with the highest similarity score. + Returns the input text if no suitable candidate is found. + """ + + # Refine the raw text + refined_raw_text = refine(input_text, raw_text) + + # Tokenize the refined raw text into sentences + raw_candidates = nltk.sent_tokenize(refined_raw_text) + + # Split sentences further by newlines to get individual candidates + candidate_list = [] + for sentence in raw_candidates: + candidate_list.extend(sentence.split("\n")) + + # Initialize variables to track the best similarity score and the best candidate + best_similarity = -9999 + best_candidate = "" + + # Iterate over each candidate to find the best one based on the BART score + for candidate in candidate_list: + refined_candidate = refine(input_text, candidate) + if check_bart_score(input_text, refined_candidate): + score = bart_score(input_text, refined_candidate)[0] + if score > best_similarity: + best_similarity = score + best_candidate = refined_candidate + + # Print the best candidate found + print(f"best_candidate = {best_candidate}") + + # Return the best candidate if found, otherwise return the input text + if best_candidate == "": + return input_text + return best_candidate + +def proofread_with_best_similarity(input_text, model_kind): + """ + Proofreads the input text using the specified model and extracts the best-corrected text based on similarity. + + Args: + input_text (str): The original text to be proofread. + model_kind (str): The kind of model to use for proofreading (e.g., CHATGPT, GEMINI). + + Returns: + tuple: A tuple containing the raw proofread text and the best-corrected text. 
+ """ + + # Normalize the input text + normalized_input_text = normalize_text(input_text) + print_and_log(f"INPUT = {normalized_input_text}") + + result_text = "" + raw_text = "" + + for i in range(1): # Loop is redundant as it runs only once; consider removing if unnecessary + # Select the proofreading model based on model_kind + if model_kind == CHATGPT: + raw_text = chatGPT_proofread(normalized_input_text, normalize_output=IS_OUTPUT_NORMALIZATION) + elif model_kind == GEMINI: + raw_text = gemini_proofread(normalized_input_text, normalize_output=IS_OUTPUT_NORMALIZATION) + else: + raw_text = proofread_by_model_name(model_kind, normalized_input_text, normalize_output=IS_OUTPUT_NORMALIZATION) + + # Extract the best candidate text based on similarity + result_text = extract_by_best_similarity(normalized_input_text, raw_text) + + # Log the raw and result texts + print_and_log(f"RAW_{i} = {raw_text}") + print_and_log(f"RESULT_{i} = {result_text}") + + # Normalize the result text + result_text = normalize_text(result_text) + + # If a valid result is obtained, return it + if result_text != "": + return raw_text, result_text + + # Return the raw and result texts + return raw_text, result_text + +def generate_file_name(existing_data_file, existing_kinds, new_kinds): + """ + Generates a new file name based on the path of an existing data file and a combination of existing and new kinds. + + Args: + existing_data_file (str): The path to the existing data file. + existing_kinds (list): A list of existing kinds. + new_kinds (list): A list of new kinds. + + Returns: + str: The generated file name with the full path. + """ + + # Combine existing and new kinds into a single list + combined_kinds = existing_kinds + new_kinds + + # Get the directory path of the existing data file + directory_path = os.path.dirname(existing_data_file) + + # Create a new file name by joining the kinds with underscores and adding a suffix + new_file_name = "_".join(combined_kinds) + "_with_best_similarity.csv" + + # Combine the directory path with the new file name to get the full output file path + output_file_path = os.path.join(directory_path, new_file_name) + + return output_file_path + + + +def generate_new_data_with_best_similarity(existing_data_file, existing_kinds, new_kinds): + """ + Generates new data with the best similarity based on existing and new kinds, and writes the results to a CSV file. + + Args: + existing_data_file (str): The path to the existing data file. + existing_kinds (list): A list of existing kinds. + new_kinds (list): A list of new kinds. 
+
+ Returns:
+ None
+ """
+
+ # Combine existing and new kinds into a single list
+ all_kinds = existing_kinds + new_kinds
+
+ # Generate column names for the CSV file
+ column_names = generate_column_names(all_kinds)
+
+ # Generate column names for existing kinds
+ existing_column_names = generate_column_names(existing_kinds)
+
+ # Generate the output file name
+ output_file = generate_file_name(existing_data_file, existing_kinds, new_kinds)
+
+ # Create the output file with column names if it doesn't exist
+ if not os.path.exists(output_file):
+ write_to_csv(output_file, column_names)
+
+ # Read existing data from the file
+ existing_data = {kind: get_column(existing_data_file, kind) for kind in existing_column_names}
+
+ # Read input data from the output file
+ input_data = read_csv_data(output_file)
+ start_index = len(input_data)
+ print(f"start_index = {start_index}")
+
+ num_rows = len(existing_data["human"])
+ global_generate_set = []
+ global_reuse = []
+
+ for index in range(start_index, num_rows):
+ # Initialize generation and reuse sets
+ generate_set = []
+ reuse_set = []
+
+ # Prepare the current generation dictionary
+ current_generation = {kind: existing_data[kind][index] for kind in existing_column_names}
+ print(f"current_generation before generation = {current_generation}")
+
+ human_text = current_generation["human"]
+
+ # Generate new kinds based on human text
+ for kind in new_kinds:
+ _, generated_text = proofread_with_best_similarity(human_text, kind)
+ current_generation[kind] = generated_text
+ generate_set.append(kind)
+
+ print(f"current_generation after generate one = {current_generation}")
+
+ # Generate combinations of kinds
+ for first_kind in all_kinds:
+ for second_kind in all_kinds:
+ combination_name = f"{first_kind}_{second_kind}"
+
+ if combination_name not in current_generation:
+ if first_kind in current_generation and current_generation[first_kind] == human_text:
+ generated_text = current_generation[second_kind]
+ reuse_set.append(f"{combination_name} from {second_kind}")
+ else:
+ is_need_generation = True
+ for first_kind_2 in all_kinds:
+ if first_kind != first_kind_2 and current_generation[first_kind] == current_generation[first_kind_2]:
+ combination_name_2 = f"{first_kind_2}_{second_kind}"
+ if combination_name_2 in current_generation:
+ generated_text = current_generation[combination_name_2]
+ reuse_set.append(f"{combination_name} from {combination_name_2}")
+ is_need_generation = False
+ break
+ if is_need_generation:
+ _, generated_text = proofread_with_best_similarity(current_generation[first_kind], second_kind)
+ generate_set.append(f"{first_kind}_{second_kind}")
+
+ current_generation[combination_name] = generated_text
+
+ # Write the current generation to the output file
+ write_new_data(output_file, current_generation, column_names)
+
+ # Update global sets
+ global_generate_set.append(generate_set)
+ global_reuse.append(reuse_set)
+
+def shuffle(array, seed):
+ """
+ Shuffles the elements of each sublist in the given array using the specified seed.
+
+ Args:
+ array (list of lists): The array containing sublists to shuffle.
+ seed (int): The seed value for the random number generator.
+
+ Returns:
+ None
+ """
+ for sublist in array:
+ random.Random(seed).shuffle(sublist)
+
+def generate_human_with_shuffle(dataset_name, column_name, num_samples, output_file):
+ """
+ Generates a shuffled list of sentences from the dataset and writes them to a CSV file.
+
+ Args:
+ dataset_name (str): The name of the dataset to load.
+ column_name (str): The column name to extract sentences from. + num_samples (int): The number of samples to process. + output_file (str): The path to the output CSV file. + + Returns: + None + """ + # Load the dataset + dataset = load_dataset(dataset_name) + data = dataset['train'] + + lines = [] + # Tokenize sentences and add to the lines list + for sample in data: + nltk_tokens = nltk.sent_tokenize(sample[column_name]) + lines.extend(nltk_tokens) + + # Filter out empty lines + filtered_lines = [line for line in lines if line != ""] + lines = filtered_lines + + # Shuffle the lines + shuffle([lines], seed=SEED) + + # Ensure the output file exists and write the header if it doesn't + if not os.path.exists(output_file): + header = ["human"] + write_to_csv(output_file, header) + + # Get the number of lines already processed in the output file + number_of_processed_lines = number_of_csv_lines(output_file) + + # Print the initial lines to be processed + print(f"Lines before processing: {lines[:num_samples]}") + + # Slice the lines list to get the unprocessed lines + lines = lines[number_of_processed_lines:num_samples] + + # Print the lines after slicing + print(f"Lines after slicing: {lines}") + + # Process each line and write to the output file + for index, human in enumerate(lines): + normalized_text = normalize_text(human) + output_data = [normalized_text] + write_to_csv(output_file, output_data) + print(f"Processed {index + 1} / {len(lines)}; Total processed: {number_of_processed_lines + index + 1} / {num_samples}") + + +def split(data, ratio): + """ + Splits the data into training and testing sets based on the given ratio. + + Args: + data (list): The dataset to split. + ratio (float): The ratio for splitting the data into training and testing sets. + + Returns: + tuple: A tuple containing the training data and the testing data. + """ + train_size = int(len(data) * ratio) + train_data = data[:train_size] + test_data = data[train_size:] + return train_data, test_data + +def bart_score_in_batch(text_1, text_2): + """ + Calculates the BART score for pairs of texts in batches. + + Args: + text_1 (list of str): The first list of texts. + text_2 (list of str): The second list of texts. + + Returns: + list: A list of BART scores for each pair of texts. + """ + return bart_scorer.score(text_1, text_2, batch_size=BATCH_SIZE) + +def extract_feature_in_batch(text_1, text_2, feature_kind): + """ + Extracts features for pairs of texts using BART scores. + + Args: + text_1 (list of str): The first list of texts. + text_2 (list of str): The second list of texts. + feature_kind (str): The type of feature to extract. + + Returns: + list: A list of extracted features. + """ + features = bart_score_in_batch(text_1, text_2) + return features + +def abstract_train(features, labels): + """ + Trains a model using the given features and labels. + + Args: + features (list): The input features for training. + labels (list): The target labels for training. + + Returns: + object: The trained model. + """ + model = MLPClassifier() + model.fit(features, labels) + return model + +def evaluate_model(model, features, labels): + """ + Evaluates the model's performance using accuracy and ROC AUC scores. + + Args: + model (object): The trained model to evaluate. + features (list): The input features for evaluation. + labels (list): The target labels for evaluation. 
+
+ Returns:
+ None
+ """
+ predictions = model.predict(features)
+ rounded_predictions = [round(value) for value in predictions]
+
+ accuracy = accuracy_score(labels, rounded_predictions)
+ write_to_file(OUTPUT_FILE, f"Accuracy: {accuracy * 100.0:.1f}%\n")
+
+ roc_auc = roc_auc_score(labels, rounded_predictions)
+ write_to_file(OUTPUT_FILE, f"ROC AUC: {roc_auc * 100.0:.1f}%\n")
+
+def combine_text_with_BERT_format(text_list):
+ """
+ Combines a list of texts into a single string formatted for BERT input.
+
+ Args:
+ text_list (list of str): The list of texts to combine.
+
+ Returns:
+ str: The combined text string formatted for BERT input.
+ """
+ # Wrap segments in RoBERTa special tokens: <s>t0</s></s>t1</s>...</s>tn</s>
+ # (the tokenizer is called with add_special_tokens=False downstream)
+ combined_text = f"<s>{text_list[0]}</s>"
+ for i in range(1, len(text_list)):
+ combined_text += f"</s>{text_list[i]}</s>"
+ return combined_text
+
+
+def preprocess_function_multimodel(sample):
+ """
+ Preprocesses a given sample for a multi-model setup by calculating BART scores
+ and formatting the text for BERT input.
+
+ Args:
+ sample (dict): A dictionary containing a key "text", which is a list of lists of strings.
+
+ Returns:
+ dict: A dictionary containing tokenized and preprocessed text data.
+ """
+ num_texts = len(sample["text"][0]) # Number of texts in each sub-sample
+ texts_grouped_by_index = [[] for _ in range(num_texts)] # Initialize empty lists for grouping texts by index
+
+ # Group texts by their index across sub-samples
+ for sub_sample in sample["text"]:
+ for i in range(num_texts):
+ texts_grouped_by_index[i].append(sub_sample[i])
+
+ # Calculate BART scores for each text pair (text[0] with text[i])
+ bart_scores = [bart_score_in_batch(texts_grouped_by_index[0], texts_grouped_by_index[i]) for i in range(1, num_texts)]
+
+ combined_texts = []
+
+ # Process each sub-sample for BERT input
+ for index, sub_sample in enumerate(sample["text"]):
+ text_array = [sub_sample[0]] # Start with the input text
+ score_generation_pairs = []
+
+ # Pair scores with their corresponding generations
+ for i in range(1, num_texts):
+ generation_text = sub_sample[i]
+ generation_score = bart_scores[i-1][index]
+ score_generation_pairs.append((generation_score, generation_text))
+
+ # Sort pairs by score in descending order
+ sorted_pairs = sorted(score_generation_pairs, reverse=True)
+
+ # Append sorted texts to text_array
+ for _, sorted_text in sorted_pairs:
+ text_array.append(sorted_text)
+
+ # Combine texts into a single BERT-formatted string
+ combined_text = combine_text_with_BERT_format(text_array)
+ combined_texts.append(combined_text)
+
+ # Tokenize the combined texts for BERT
+ return tokenizer(combined_texts, add_special_tokens=False, truncation=True)
+
+def preprocess_function_single_from_multimodel(sample):
+ """
+ Extracts the first text from each sub-sample in a multi-model sample and tokenizes it.
+
+ Args:
+ sample (dict): A dictionary containing a key "text", which is a list of lists of strings.
+
+ Returns:
+ dict: A dictionary containing tokenized text data.
+ """
+ combined_texts = []
+
+ # Iterate through each sub-sample
+ for sub_sample in sample["text"]:
+ input_text = sub_sample[0] # Extract the first text from the sub-sample
+ combined_texts.append(input_text) # Append it to the list of combined texts
+
+ # Tokenize the combined texts
+ return tokenizer(combined_texts, truncation=True)
+
+
+def check_api_error(data):
+ """
+ Checks if any item in the provided data indicates an API error.
+
+ Args:
+ data (list): A list of items to be checked for API errors.
+ + Returns: + bool: True if an API error or ignore by API error is found, otherwise False. + """ + for item in data: + if item == API_ERROR or item == IGNORE_BY_API_ERROR: # Check for API error indicators + return True # Return True if an error indicator is found + return False # Return False if no error indicators are found + + +def train_only_by_transformer_with_test_evaluation_early_stop(train_data, test_data, input_type, num_classes=2): + """ + Trains a transformer model using the provided training and testing datasets with early stopping. + + Args: + train_data (Dataset): The training dataset. + test_data (Dataset): The testing dataset. + input_type (str): The type of input data, either MULTIMODEL or SINGLE_FROM_MULTIMODEL. + num_classes (int, optional): The number of classes for classification. Defaults to 2. + + Returns: + Trainer: The trained model wrapped in a Trainer object. + """ + # Preprocess datasets based on the input type + if input_type == MULTIMODEL: + train_data = train_data.map(preprocess_function_multimodel, batched=True) + test_data = test_data.map(preprocess_function_multimodel, batched=True) + elif input_type == SINGLE_FROM_MULTIMODEL: + train_data = train_data.map(preprocess_function_single_from_multimodel, batched=True) + test_data = test_data.map(preprocess_function_single_from_multimodel, batched=True) + + # Data collator to pad inputs + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # Load appropriate model based on number of classes + if num_classes == 3: + model = AutoModelForSequenceClassification.from_pretrained( + "pretrained_model/roberta-base_num_labels_3", num_labels=num_classes) + else: + model = AutoModelForSequenceClassification.from_pretrained( + ROBERTA_MODEL_PATHS[MODEL_NAME], num_labels=num_classes) + + learning_rate = LEARNING_RATES[MODEL_NAME] + output_folder = "training_with_callbacks" + + # Remove the output folder if it already exists + if os.path.exists(output_folder): + shutil.rmtree(output_folder) + + # Training arguments + training_args = TrainingArguments( + output_dir=output_folder, + evaluation_strategy="epoch", + logging_strategy="epoch", + save_strategy="epoch", + learning_rate=learning_rate, + per_device_train_batch_size=BATCH_SIZE, + per_device_eval_batch_size=BATCH_SIZE, + num_train_epochs=NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING, + weight_decay=0.01, + push_to_hub=False, + metric_for_best_model=OPTIMIZED_METRIC, + load_best_model_at_end=True + ) + + # Create Trainer object + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_data, + eval_dataset=test_data, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)] + ) + + # Add custom callback + trainer.add_callback(CustomCallback(trainer)) + + # Start training + trainer.train() + + return trainer + + +def calculate_number_of_models(num_columns): + """ + Calculates the number of models required based on the number of columns. + + Args: + num_columns (int): The total number of columns. + + Returns: + int: The number of models required. + + Raises: + Exception: If the number of models cannot be calculated to match the number of columns. 
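+
+ Example: with 3 models the expected column count is 1 (human) + 3 (single) + 9 (pair) = 13:
+ >>> calculate_number_of_models(13)
+ 3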
+ """ + num_models = 0 + count_human = 1 # Initial count representing human input + + while True: + count_single = num_models # Single model count + count_pair = num_models * num_models # Pair model count + + total_count = count_human + count_single + count_pair + + if total_count == num_columns: + return num_models + elif total_count > num_columns: + raise Exception("Cannot calculate the number of models to match the number of columns") + + num_models += 1 + + +def read_multimodel_data_from_csv(multimodel_csv_file): + """ + Reads multimodel data from a CSV file and organizes it into a structured format. + + Args: + multimodel_csv_file (str): Path to the CSV file containing multimodel data. + + Returns: + list: A list of dictionaries, each containing 'human', 'single', and 'pair' data. + + Raises: + Exception: If there is an error in reading the CSV file or processing the data. + """ + # Read CSV data into a list of lists + input_data = read_csv_data(multimodel_csv_file) + + # Initialize the result list + structured_data = [] + + # Calculate the number of models based on the number of columns in the first row + num_models = calculate_number_of_models(len(input_data[0])) + + # Process each row in the input data + for row in input_data: + row_data = {} + index = 0 + + # Extract human data + row_data["human"] = row[index] + index += 1 + + # Extract single model data + single_model_data = [] + for _ in range(num_models): + single_model_data.append(row[index]) + index += 1 + row_data["single"] = single_model_data + + # Extract pair model data + pair_model_data = [] + for _ in range(num_models): + sub_pair_data = [] + for _ in range(num_models): + sub_pair_data.append(row[index]) + index += 1 + pair_model_data.append(sub_pair_data) + row_data["pair"] = pair_model_data + + # Append the structured row data to the result list + structured_data.append(row_data) + + return structured_data + + +def check_error(data_item): + """ + Checks for errors in a data item by verifying the 'human', 'single', and 'pair' fields. + + Args: + data_item (dict): A dictionary containing 'human', 'single', and 'pair' data. + + Returns: + bool: True if any of the fields contain an error, otherwise False. + """ + # Check for API error in the 'human' field + if check_api_error(data_item["human"]): + return True + + # Check for API error in the 'single' model data + for single_text in data_item["single"]: + if check_api_error(single_text): + return True + + # Get the number of models from the 'single' model data + num_models = len(data_item["single"]) + + # Check for API error in the 'pair' model data + for i in range(num_models): + for j in range(num_models): + if check_api_error(data_item["pair"][i][j]): + return True + + # No errors found + return False + + + +def create_pair_sample(data_item, training_indices): + """ + Creates pair samples for training by comparing human data with machine-generated data. + + Args: + data_item (dict): A dictionary containing 'human', 'single', and 'pair' data. + training_indices (list): A list of indices used for training. + + Returns: + list: A list of dictionaries, each containing a 'text' array and a 'label'. 
+ """ + # Initialize the result list + result_samples = [] + + # Check if there is any error in the data_item + if check_error(data_item): + return result_samples + + print(training_indices) + print(data_item) + # Create machine samples + for train_idx in training_indices: + if data_item["human"] != data_item["single"][train_idx]: + text_array = [] + machine_text = data_item["single"][train_idx] + text_array.append(machine_text) + + for sub_idx in training_indices: + text_array.append(data_item["pair"][train_idx][sub_idx]) + + sample = { + "text": text_array, + "label": MACHINE_LABEL + } + result_samples.append(sample) + + # Create human samples + text_array = [data_item["human"]] + + for train_idx in training_indices: + text_array.append(data_item["single"][train_idx]) + + human_sample = { + "text": text_array, + "label": HUMAN_LABEL + } + + # Append human samples for each machine sample + num_machine_samples = len(result_samples) + for _ in range(num_machine_samples): + result_samples.append(human_sample) + + return result_samples + + +def create_pair_test_sample(data_item, training_indices, testing_indices): + """ + Creates pair test samples by comparing human data with machine-generated data. + + Args: + data_item (dict): A dictionary containing 'human', 'single', and 'pair' data. + training_indices (list): A list of indices used for training. + testing_indices (list): A list of indices used for testing. + + Returns: + list: A list of dictionaries, each containing a 'text' array and a 'label'. + """ + # Initialize the result list + result_samples = [] + + # Check if there is any error in the data_item + if check_error(data_item): + return result_samples + + # Create machine samples based on testing indices + for test_idx in testing_indices: + if data_item["human"] != data_item["single"][test_idx]: + text_array = [] + machine_text = data_item["single"][test_idx] + text_array.append(machine_text) + + for train_idx in training_indices: + text_array.append(data_item["pair"][test_idx][train_idx]) + + sample = { + "text": text_array, + "label": MACHINE_LABEL + } + result_samples.append(sample) + + # Create human sample + text_array = [data_item["human"]] + + for train_idx in training_indices: + text_array.append(data_item["single"][train_idx]) + + human_sample = { + "text": text_array, + "label": HUMAN_LABEL + } + + # Append the human sample for each machine sample + num_machine_samples = len(result_samples) + for _ in range(num_machine_samples): + result_samples.append(human_sample) + + return result_samples + + + +def create_train_val_sample(data, training_indices): + """ + Creates training and validation samples from the provided data. + + Args: + data (list): A list of data items, each to be processed. + training_indices (list): A list of indices used for training. + + Returns: + list: A list of training and validation samples created from the data. + """ + # Initialize the result list + result_samples = [] + + # Process each item in the data + for data_item in data: + # Create pair samples for the current item + sub_samples = create_pair_sample(data_item, training_indices) + + # Extend the result list with the created sub-samples + result_samples.extend(sub_samples) + + return result_samples + + +def create_test_sample(data, training_indices, testing_indices): + """ + Creates test samples from the provided data by comparing human data with machine-generated data. + + Args: + data (list): A list of data items, each to be processed. 
+ training_indices (list): A list of indices used for training. + testing_indices (list): A list of indices used for testing. + + Returns: + list: A list of test samples created from the data. + """ + # Initialize the result list + result_samples = [] + + # Process each item in the data + for data_item in data: + # Create pair test samples for the current item + sub_samples = create_pair_test_sample(data_item, training_indices, testing_indices) + + # Extend the result list with the created sub-samples + result_samples.extend(sub_samples) + + return result_samples + + +def distribute_data(data, train_indices, test_indices, train_ratio, val_ratio): + """ + Distributes the data into training, validation, and test samples. + + Args: + data (list): A list of data items to be split and processed. + train_indices (list): A list of indices used for training. + test_indices (list): A list of indices used for testing. + train_ratio (float): The ratio of data to be used for training. + val_ratio (float): The ratio of data to be used for validation. + + Returns: + tuple: A tuple containing lists of training, validation, and test samples. + """ + # Split the data into training, validation, and test sets + train_data, val_data, test_data = split_train_val_test(data, train_ratio, val_ratio) + + # Create training samples + train_samples = create_train_val_sample(train_data, train_indices) + write_to_file(OUTPUT_FILE, f"train samples = {len(train_samples)}\n") + + # Create validation samples + val_samples = create_train_val_sample(val_data, train_indices) + write_to_file(OUTPUT_FILE, f"val samples = {len(val_samples)}\n") + + # Create test samples + test_samples = create_test_sample(test_data, train_indices, test_indices) + write_to_file(OUTPUT_FILE, f"test samples = {len(test_samples)}\n") + + return train_samples, val_samples, test_samples + + +def convert_to_huggingface_with_multimodel(samples): + """ + Converts a list of samples to the Hugging Face Dataset format. + + Args: + samples (list): A list of samples to be converted. + + Returns: + Dataset: A Hugging Face Dataset object created from the samples. + """ + return Dataset.from_list(samples) + + + +def train_by_transformer_with_multimodel_and_early_stop(train_samples, val_samples, input_type): + """ + Trains a transformer model with multimodal data and early stopping. + + Args: + train_samples (list): A list of training samples. + val_samples (list): A list of validation samples. + input_type (str): The type of input data (e.g., multimodal). + + Returns: + object: The trained model with early stopping. + """ + # Convert training and validation samples to Hugging Face Dataset format + train_data = convert_to_huggingface_with_multimodel(train_samples) + val_data = convert_to_huggingface_with_multimodel(val_samples) + + # Train the model with early stopping and return the trained model + return train_only_by_transformer_with_test_evaluation_early_stop(train_data, val_data, input_type) + + +def test_by_transformer_with_multimodel(detector, test_samples, input_type): + """ + Tests a trained transformer model with multimodal data. + + Args: + detector (object): The trained model to be evaluated. + test_samples (list): A list of test samples. + input_type (str): The type of input data (e.g., multimodal). 
+ + Returns: + None + """ + # Convert test samples to Hugging Face Dataset format + test_data = convert_to_huggingface_with_multimodel(test_samples) + + # Apply the appropriate preprocessing function based on the input type + if input_type == MULTIMODEL: + test_data = test_data.map(preprocess_function_multimodel, batched=True) + elif input_type == SINGLE_FROM_MULTIMODEL: + test_data = test_data.map(preprocess_function_single_from_multimodel, batched=True) + + print("Test data:", test_data) + # Evaluate the model on the test data + result = detector.evaluate(eval_dataset=test_data) + print("Test result:", result) + + # Extract and log the ROC AUC score + roc_auc = result['eval_roc_auc'] + write_to_file(OUTPUT_FILE, "roc_auc: %.1f%%" % (roc_auc * 100.0) + "\n") + + + +def extract_by_feature_kind(samples, feature_type): + """ + Extracts features from the given samples based on the specified feature type. + + Args: + samples (list): A list of samples where each sample is a dictionary with 'text' and 'label' keys. + feature_type (str): The type of feature to extract. + + Returns: + tuple: A tuple containing the extracted features and corresponding labels. + """ + text_1_list = [] + text_2_list = [] + labels = [] + + for sample in samples: + text_1_list.append(sample["text"][0]) + text_2_list.append(sample["text"][1]) + labels.append(sample["label"]) + + # Extract features in batch based on the feature type + features = extract_feature_in_batch(text_1_list, text_2_list, feature_type) + + return features, labels + + +def train_by_feature_kind(train_samples, feature_type): + """ + Trains a model using features extracted from the training samples based on the specified feature type. + + Args: + train_samples (list): A list of training samples where each sample is a dictionary with 'text' and 'label' keys. + feature_type (str): The type of feature to extract for training. + + Returns: + object: The trained model. + """ + # Extract features and labels from the training samples + features, labels = extract_by_feature_kind(train_samples, feature_type) + + # Convert features to a numpy array and reshape for training + features = np.array(features) + features = features.reshape(-1, 1) + + # Train the model using the extracted features and labels + model = abstract_train(features, labels) + + return model + + +def test_by_feature_kind(detector, samples, feature_type): + """ + Tests a detector using features extracted from the provided samples based on the specified feature type. + + Args: + detector (object): The detector model to be evaluated. + samples (list): A list of samples where each sample is a dictionary with 'text' and 'label' keys. + feature_type (str): The type of feature to extract for testing. + + Returns: + None + """ + # Extract features and labels from the samples + features, labels = extract_by_feature_kind(samples, feature_type) + + # Convert features to a numpy array and reshape for evaluation + features = np.array(features) + features = features.reshape(-1, 1) + + # Evaluate the detector model using the extracted features and labels + evaluate_model(detector, features, labels) + + +def general_process_multimodels_train_val_test(train_samples, val_samples, test_samples): + """ + General process for training, validating, and testing models using multi-model and feature kind approaches. + + Args: + train_samples (list): Training samples. + val_samples (list): Validation samples. + test_samples (list): Test samples. 
+ + Returns: + None + """ + # Multi-model approach + input_kind = MULTIMODEL + write_to_file(OUTPUT_FILE, f"\nInput kind = {input_kind} \n") + + # Train detector using multi-model with early stopping + detector = train_by_transformer_with_multimodel_and_early_stop(train_samples, val_samples, input_kind) + detector.save_model("./models/multi_model_detector") + + # Evaluate on train set + write_to_file(OUTPUT_FILE, f"EVALUATE ON TRAIN SET \n") + test_by_transformer_with_multimodel(detector, train_samples, input_kind) + + # Evaluate on validation set + write_to_file(OUTPUT_FILE, f"EVALUATE ON VALIDATION SET \n") + test_by_transformer_with_multimodel(detector, val_samples, input_kind) + + # Evaluate on test set + write_to_file(OUTPUT_FILE, f"EVALUATE ON TEST SET \n") + test_by_transformer_with_multimodel(detector, test_samples, input_kind) + + # Single from multi-model approach + input_kind = SINGLE_FROM_MULTIMODEL + write_to_file(OUTPUT_FILE, f"\nInput kind = {input_kind} \n") + + # Train detector using single from multi-model with early stopping + detector = train_by_transformer_with_multimodel_and_early_stop(train_samples, val_samples, input_kind) + detector.save_model("./models/single_model_detector_1") + + # Evaluate on train set + write_to_file(OUTPUT_FILE, f"EVALUATE ON TRAIN SET \n") + test_by_transformer_with_multimodel(detector, train_samples, input_kind) + + # Evaluate on validation set + write_to_file(OUTPUT_FILE, f"EVALUATE ON VALIDATION SET \n") + test_by_transformer_with_multimodel(detector, val_samples, input_kind) + + # Evaluate on test set + write_to_file(OUTPUT_FILE, f"EVALUATE ON TEST SET \n") + test_by_transformer_with_multimodel(detector, test_samples, input_kind) + + # Feature kind approach + sample_length = len(train_samples[0]["text"]) + if sample_length == 2: # Check if the sample length is 2, indicating BART feature kind + feature_kind = BART + write_to_file(OUTPUT_FILE, f"\nFeature kind = {feature_kind} \n") + + # Train detector using feature kind + detector = train_by_feature_kind(train_samples, feature_kind) + + # Evaluate on train set + write_to_file(OUTPUT_FILE, f"EVALUATE ON TRAIN SET \n") + test_by_feature_kind(detector, train_samples, feature_kind) + + # Evaluate on validation set + write_to_file(OUTPUT_FILE, f"EVALUATE ON VALIDATION SET \n") + test_by_feature_kind(detector, val_samples, feature_kind) + + # Evaluate on test set + write_to_file(OUTPUT_FILE, f"EVALUATE ON TEST SET \n") + test_by_feature_kind(detector, test_samples, feature_kind) + + +def process_multi_models_with_validation(multimodel_csv_file, train_indices, test_indices, num_samples): + """ + Processes multi-model data with validation, training, and testing. + + Args: + multimodel_csv_file (str): Path to the CSV file containing multi-model data. + train_indices (list): Indices for the training data. + test_indices (list): Indices for the testing data. + num_samples (int): Number of samples to process. 
+
+    Returns:
+        None
+    """
+    # Log the details of the process
+    write_to_file(OUTPUT_FILE, f"PROCESSING FILE={multimodel_csv_file} \n")
+    write_to_file(OUTPUT_FILE, f"EXPERIMENT WITH {MODEL_NAME} model \n")
+    write_to_file(OUTPUT_FILE, f"NUMBER OF MAX EPOCHS WITH EARLY STOPPING = {NUMBER_OF_MAX_EPOCH_WITH_EARLY_STOPPING} \n")
+    write_to_file(OUTPUT_FILE, f"PATIENCE = {PATIENCE} \n")
+    write_to_file(OUTPUT_FILE, f"OPTIMIZED METRIC = {OPTIMIZED_METRIC} \n")
+    write_to_file(OUTPUT_FILE, f"BATCH SIZE = {BATCH_SIZE} \n")
+    write_to_file(OUTPUT_FILE, f"Number of samples = {num_samples} \n")
+
+    # Read multi-model data from the CSV file
+    data = read_multimodel_data_from_csv(multimodel_csv_file)
+
+    # Limit data to the specified number of samples; a negative value keeps all
+    # samples (slicing with -1 would silently drop the last one)
+    if num_samples > 0:
+        data = data[:num_samples]
+
+    # Distribute data into training, validation, and testing sets
+    train_samples, val_samples, test_samples = distribute_data(data, train_indices, test_indices, TRAIN_RATIO, VAL_RATIO)
+
+    # Log the training and testing indices
+    write_to_file(OUTPUT_FILE, f"Multimodel training with train indices {train_indices}, test with test indices {test_indices} \n")
+
+    # Process the multi-models for training, validation, and testing
+    general_process_multimodels_train_val_test(train_samples, val_samples, test_samples)
+
+
+def split_train_val_test(data, train_ratio, val_ratio):
+    """
+    Splits the dataset into training, validation, and test sets based on specified ratios.
+
+    Args:
+        data (list): The dataset to be split.
+        train_ratio (float): The ratio of the dataset to be used for training.
+        val_ratio (float): The ratio of the dataset to be used for validation.
+
+    Returns:
+        tuple: A tuple containing three lists - (train_data, val_data, test_data).
+    """
+    # Calculate the number of samples for the training set
+    num_train_samples = int(len(data) * train_ratio)
+
+    # Calculate the number of samples for the validation set
+    num_val_samples = int(len(data) * val_ratio)
+
+    # Split the data into training, validation, and test sets
+    train_data = data[:num_train_samples]
+    val_data = data[num_train_samples:(num_train_samples + num_val_samples)]
+    test_data = data[(num_train_samples + num_val_samples):]
+
+    return train_data, val_data, test_data
+
+
+def main():
+    """
+    Main function to handle argument parsing and execute the sequence of operations
+    including data generation and processing with multiple models.
+    """
+    parser = argparse.ArgumentParser(description='SimLLM.')
+
+    # Argument for specifying the list of large language models
+    parser.add_argument('--LLMs', nargs="+", default=[CHATGPT],  # extendable, e.g. with "Yi", "OpenChat"
+                        help='List of large language models')
+
+    # Argument for specifying the list of training indexes
+    parser.add_argument('--train_indexes', type=int, default=[0, 1, 2], nargs="+",
+                        help='List of training indexes')
+
+    # Argument for specifying the list of testing indexes
+    parser.add_argument('--test_indexes', type=int, default=[0], nargs="+",
+                        help='List of testing indexes')
+
+    # Argument for specifying the number of samples
+    parser.add_argument('--num_samples', type=int, default=5000,
+                        help='Number of samples')
+
+    # Argument for the multi-model CSV file
+    parser.add_argument('--multimodel_csv_file', type=str, default="data/ChatGPT_Nous_Hermes_2_Yi_34B_openchat_3_5_1210_with_best_similarity.csv",
+                        help='Path to the multi-model CSV file')
+
+    # Parse the command-line arguments
+    args = parser.parse_args()
+
+    if args.multimodel_csv_file == "":
+        # Static dataset parameters
+        dataset_name = "xsum"
+        column_name = "document"
+        num_samples = args.num_samples
+        output_file = "data/test.csv"
+
+        # Generate human data with shuffle
+        # generate_human_with_shuffle(dataset_name, column_name, num_samples, output_file)
+
+        # Existing data parameters
+        existing_data_file = output_file
+        existing_kinds = []
+
+        # New kinds of models to generate data with
+        new_kinds = args.LLMs
+
+        # Generate new data with best similarity
+        generate_new_data_with_best_similarity(existing_data_file, existing_kinds, new_kinds)
+
+        # Generate a filename for the multimodel CSV file
+        multimodel_csv_file = generate_file_name(existing_data_file, existing_kinds, new_kinds)
+
+    else:
+        multimodel_csv_file = args.multimodel_csv_file
+
+    # Number of samples to process (-1 means process all samples)
+    num_samples_to_process = -1
+
+    # Training and testing indexes from arguments
+    training_indexes = args.train_indexes
+    testing_indexes = args.test_indexes
+
+    # Process multiple models with validation
+    process_multi_models_with_validation(multimodel_csv_file, training_indexes, testing_indexes, num_samples_to_process)
+
+if __name__ == "__main__":
+    main()
diff --git a/src/texts/SimLLM/bart_score.py b/src/texts/SimLLM/bart_score.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf83d97a2e3569d653906d5319030c127b513cb9
--- /dev/null
+++ b/src/texts/SimLLM/bart_score.py
@@ -0,0 +1,136 @@
+# %%
+import traceback
+from typing import List
+
+import numpy as np
+import torch
+import torch.nn as nn
+from transformers import (
+    BartForConditionalGeneration,
+    BartTokenizer,
+)
+
+
+class BARTScorer:
+    def __init__(
+        self,
+        device="cuda:0",
+        max_length=1024,
+        checkpoint="facebook/bart-large-cnn",
+    ):
+        # Set up model
+        self.device = device
+        self.max_length = max_length
+        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
+        self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
+        self.model.eval()
+        self.model.to(device)
+
+        # Set up loss
+        self.loss_fct = nn.NLLLoss(
+            reduction="none",
+            ignore_index=self.model.config.pad_token_id,
+        )
+        self.lsm = nn.LogSoftmax(dim=1)
+
+    def load(self, path=None):
+        """Load model from paraphrase finetuning"""
+        if path is None:
+            path = "./bart.pth"
+
+        self.model.load_state_dict(torch.load(path, map_location=self.device))
+
+    def score(self, srcs, tgts, batch_size=16):
+        """Score a batch of examples"""
+        score_list = []
+        for i in range(0, len(srcs), batch_size):
+            src_list = srcs[i : i + batch_size]
+            tgt_list = tgts[i : i + batch_size]
+            try:
+                with torch.no_grad():
+                    encoded_src = self.tokenizer(
+                        src_list,
+                        max_length=self.max_length,
+                        truncation=True,
+                        padding=True,
+                        return_tensors="pt",
+                    )
+                    encoded_tgt = self.tokenizer(
+                        tgt_list,
+                        max_length=self.max_length,
+                        truncation=True,
+                        padding=True,
+                        return_tensors="pt",
+                    )
+                    src_tokens = encoded_src["input_ids"].to(self.device)
+                    src_mask = encoded_src["attention_mask"].to(self.device)
+
+                    tgt_tokens = encoded_tgt["input_ids"].to(self.device)
+                    tgt_mask = encoded_tgt["attention_mask"]
+                    tgt_len = tgt_mask.sum(dim=1).to(self.device)
+
+                    output = self.model(
+                        input_ids=src_tokens,
+                        attention_mask=src_mask,
+                        labels=tgt_tokens,
+                    )
+                    logits = output.logits.view(
+                        -1,
+                        self.model.config.vocab_size,
+                    )
+                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
+                    loss = loss.view(tgt_tokens.shape[0], -1)
+                    loss = loss.sum(dim=1) / tgt_len
+                    curr_score_list = [-x.item() for x in loss]
+                    score_list += curr_score_list
+
+            except RuntimeError:
+                traceback.print_exc()
+                print(f"source: {src_list}")
+                print(f"target: {tgt_list}")
+                exit(0)
+        return score_list
+
+    def multi_ref_score(
+        self,
+        srcs,
+        tgts: List[List[str]],
+        agg="mean",
+        batch_size=4,
+    ):
+        # Assert we have the same number of references
+        ref_nums = [len(x) for x in tgts]
+        if len(set(ref_nums)) > 1:
+            raise Exception(
+                "You have different number of references per test sample.",
+            )
+
+        ref_num = len(tgts[0])
+        score_matrix = []
+        for i in range(ref_num):
+            curr_tgts = [x[i] for x in tgts]
+            scores = self.score(srcs, curr_tgts, batch_size)
+            score_matrix.append(scores)
+        if agg == "mean":
+            score_list = np.mean(score_matrix, axis=0)
+        elif agg == "max":
+            score_list = np.max(score_matrix, axis=0)
+        else:
+            raise NotImplementedError
+        return list(score_list)
+
+    def test(self, batch_size=3):
+        """Test"""
+        src_list = [
+            "This is a very good idea. Although simple, but very insightful.",
+            "Can I take a look?",
+            "Do not trust him, he is a liar.",
+        ]
+
+        tgt_list = [
+            "That's stupid.",
+            "What's the problem?",
+            "He is trustworthy.",
+        ]
+
+        print(self.score(src_list, tgt_list, batch_size))
diff --git a/src/texts/__init__.py b/src/texts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/texts/config.py b/src/texts/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..9057675663968965422ca0b556de6678ac70152a
--- /dev/null
+++ b/src/texts/config.py
@@ -0,0 +1,15 @@
+# MODEL
+#chatgpt-detector-roberta
+ROBERTA_MODEL = "Hello-SimpleAI/chatgpt-detector-roberta"
+
+# Default LLM kind, imported by main_text.py; the value "ChatGPT" is assumed
+# here from the readme's default model list
+CHATGPT = "ChatGPT"
+
+
+# Default labels
+HUMAN = "HUMAN"
+MACHINE = "MACHINE"
+
+UNKNOWN = "UNKNOWN"
+PARAPHASE = "PARAPHASE"
+NON_PARAPHASE = "NON_PARAPHASE"
\ No newline at end of file
diff --git a/src/texts/evaluation/__init__.py b/src/texts/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/texts/main_text.py b/src/texts/main_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca6f35829863ecc686a1d3466b4027123ab5201d
--- /dev/null
+++ b/src/texts/main_text.py
@@ -0,0 +1,106 @@
+import argparse
+
+from texts.config import CHATGPT
+from texts.models import process_multi_models_with_validation
+from texts.proofreading import generate_new_data_with_best_similarity
+from texts.utils import generate_file_name
+
+
+def main():
+    """
+    Main function to handle argument parsing and execute the sequence of
+    operations including data generation and processing with multiple
+    models.
+    """
+    parser = argparse.ArgumentParser(description="SimLLM.")
+
+    # Argument for specifying the list of large language models
+    parser.add_argument(
+        "--LLMs",
+        nargs="+",
+        default=[CHATGPT, "Yi", "OpenChat"],
+        help="List of large language models",
+    )
+
+    # Argument for specifying the list of training indexes
+    parser.add_argument(
+        "--train_indexes",
+        type=int,
+        default=[0, 1, 2],
+        nargs="+",
+        help="List of training indexes",
+    )
+
+    # Argument for specifying the list of testing indexes
+    parser.add_argument(
+        "--test_indexes",
+        type=int,
+        default=[0],
+        nargs="+",
+        help="List of testing indexes",
+    )
+
+    # Argument for specifying the number of samples
+    parser.add_argument(
+        "--num_samples",
+        type=int,
+        default=5000,
+        help="Number of samples",
+    )
+
+    # Parse the command-line arguments
+    args = parser.parse_args()
+
+    # Static dataset parameters
+    # dataset_name = "xsum"
+    # column_name = "document"
+    # num_samples = args.num_samples
+    output_file = "data/human.csv"
+
+    # Generate human data with shuffle
+    # generate_human_with_shuffle(
+    #     dataset_name,
+    #     column_name,
+    #     num_samples,
+    #     output_file,
+    # )
+
+    # Existing data parameters
+    existing_data_file = output_file
+    existing_kinds = []
+
+    # New kinds of models to generate data with
+    new_kinds = args.LLMs
+
+    # Generate new data with best similarity
+    generate_new_data_with_best_similarity(
+        existing_data_file,
+        existing_kinds,
+        new_kinds,
+    )
+
+    # Generate a filename for the multimodel CSV file
+    multimodel_csv_file = generate_file_name(
+        existing_data_file,
+        existing_kinds,
+        new_kinds,
+    )
+
+    # Number of samples to process (-1 means process all samples)
+    num_samples_to_process = -1
+
+    # Training and testing indexes from arguments
+    training_indexes = args.train_indexes
+    testing_indexes = args.test_indexes
+
+    # Process multiple models with validation
+    process_multi_models_with_validation(
+        multimodel_csv_file,
+        training_indexes,
+        testing_indexes,
+        num_samples_to_process,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/texts/readme.md b/src/texts/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..98e8de3b322ac618766fc05c9c11268a7a03cbcc
--- /dev/null
+++ b/src/texts/readme.md
@@ -0,0 +1,67 @@
+# [Text] SimLLM: Detecting Sentences Generated by Large Language Models Using Similarity between the Generation and its Re-Generation
+
+## **Getting Started**
+1. **Clone the repository:**
+   ```bash
+   git clone https://github.com/Tokyo-Techies/prj-nict-ai-content-detection
+   ```
+
+2. **Set up the environment:**
+   Using a virtual environment:
+   ```bash
+   python -m venv .venv
+   source .venv/bin/activate
+   ```
+
+3. **Install dependencies:**
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+
+4. **API Keys** (optional)
+   - Obtain API keys for the corresponding models and insert them into the `SimLLM.py` file:
+     - ChatGPT: [OpenAI API](https://openai.com/index/openai-api/)
+     - Gemini: [Google Gemini API](https://ai.google.dev/gemini-api/docs/api-key)
+     - Other LLMs: [Together API](https://api.together.ai/)
+
+
+5. **Run the project:**
+   ```bash
+   python SimLLM.py
+   ```
+
+### Parameters
+
+- `LLMs`: List of large language models to use. Available models include 'ChatGPT', 'Yi', 'OpenChat', 'Gemini', 'LLaMa', 'Phi', 'Mixtral', 'QWen', 'OLMO', 'WizardLM', and 'Vicuna'. Default is `['ChatGPT', 'Yi', 'OpenChat']`.
+- `train_indexes`: List of LLM indexes for training. Default is `[0, 1, 2]`.
+- `test_indexes`: List of LLM indexes for testing. Default is `[0]`.
+- `num_samples`: Number of samples. Default is `5000`.
+
+### Examples
+
+- Running with default parameters:
+  `python SimLLM.py`
+
+- Running with customized parameters:
+  `python SimLLM.py --LLMs ChatGPT --train_indexes 0 --test_indexes 0`
+
+## Dataset
+
+The `dataset.csv` file contains human-written texts alongside texts generated by 12 large language models:
+ChatGPT, GPT-4o, Yi, OpenChat, Gemini, LLaMa, Phi, Mixtral, QWen, OLMO, WizardLM, and Vicuna.
+
+## Citation
+
+```bibtex
+@inproceedings{nguyen2024SimLLM,
+  title={SimLLM: Detecting Sentences Generated by Large Language Models Using Similarity between the Generation and its Re-generation},
+  author={Nguyen-Son, Hoang-Quoc and Dao, Minh-Son and Zettsu, Koji},
+  booktitle={The Conference on Empirical Methods in Natural Language Processing},
+  year={2024}
+}
+```

+## Acknowledgements
+
+- BARTScore: [BARTScore GitHub Repository](https://github.com/neulab/BARTScore)
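+
+## Driving the pipeline from Python (sketch)
+
+The CLI in `SimLLM.py` is a thin wrapper around `process_multi_models_with_validation`. The snippet below is a minimal sketch rather than a shipped script: it assumes `SimLLM.py` is importable from the repository root and that the default multi-model CSV exists under `data/`. Evaluation metrics for each split are appended to the file configured via `OUTPUT_FILE` in `SimLLM.py`.
+
+```python
+from SimLLM import process_multi_models_with_validation
+
+# Train on the first three LLMs' texts and test on the first one,
+# mirroring the CLI defaults (--train_indexes 0 1 2 --test_indexes 0).
+process_multi_models_with_validation(
+    "data/ChatGPT_Nous_Hermes_2_Yi_34B_openchat_3_5_1210_with_best_similarity.csv",
+    train_indices=[0, 1, 2],
+    test_indices=[0],
+    num_samples=-1,  # a negative value processes every sample
+)
+```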
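+
+## Scoring similarity directly (sketch)
+
+SimLLM's similarity signal comes from the bundled `BARTScorer` (`bart_score.py`, adapted from the BARTScore repository above). A minimal sketch, assuming `bart_score.py` is on your `PYTHONPATH` and a CUDA device is available (pass `device="cpu"` otherwise):
+
+```python
+from bart_score import BARTScorer
+
+scorer = BARTScorer(device="cuda:0", checkpoint="facebook/bart-large-cnn")
+
+generations = ["The quick brown fox jumps over the lazy dog."]
+regenerations = ["A quick brown fox leaps over a lazy dog."]
+
+# score() returns one value per (source, target) pair: the average
+# per-token log-likelihood of the target given the source, so values
+# closer to zero indicate higher similarity.
+print(scorer.score(generations, regenerations, batch_size=4))
+```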