taesiri committed
Commit a167ff0 · 1 Parent(s): 8940d76
Files changed (2)
  1. app.py +29 -16
  2. extract_samples.py +137 -0
app.py CHANGED
@@ -34,7 +34,6 @@ dataset_post_ids = list(
 photoexp = pd.read_csv("./photoexp_filtered.csv")
 valid_post_ids = set(photoexp.post_id.tolist())
 
-# filter RESULTS_BACKUP_REPO to include only valid_post_ids using batched processing
 dataset = dataset.filter(
     lambda xs: [x in valid_post_ids for x in xs["post_id"]],
     batched=True,
@@ -51,47 +50,61 @@ def sync_with_hub():
     """
     print("Starting sync with hub...")
     data_dir = Path("./data")
-    if data_dir.exists():
-        # Backup existing data
-        backup_dir = Path("./data_backup")
-        if backup_dir.exists():
-            shutil.rmtree(backup_dir)
-        shutil.copytree(data_dir, backup_dir)
+    local_csv_path = data_dir / "evaluation_results_exp.csv"
+
+    # Read existing local data if it exists
+    local_data = None
+    if local_csv_path.exists():
+        local_data = pd.read_csv(local_csv_path)
+        print(f"Found local data with {len(local_data)} entries")
 
     # Clone/pull latest data from hub
-    # Use token in the URL for authentication following HF's new format
     token = os.environ["HF_TOKEN"]
-    username = "taesiri"  # Extract from DATASET_REPO
+    username = "taesiri"
     repo_url = (
         f"https://{username}:{token}@huggingface.co/datasets/{RESULTS_BACKUP_REPO}"
     )
     hub_data_dir = Path("hub_data")
 
     if hub_data_dir.exists():
-        # If repo exists, do a git pull
         print("Pulling latest changes...")
         repo = git.Repo(hub_data_dir)
         origin = repo.remotes.origin
-        # Set the new URL with token
         if "https://" in origin.url:
             origin.set_url(repo_url)
         origin.pull()
     else:
-        # Clone the repo with token
        print("Cloning repository...")
         git.Repo.clone_from(repo_url, hub_data_dir)
 
     # Merge hub data with local data
     hub_data_source = hub_data_dir / "data"
     if hub_data_source.exists():
-        # Create data dir if it doesn't exist
         data_dir.mkdir(exist_ok=True)
+        hub_csv_path = hub_data_source / "evaluation_results_exp.csv"
+
+        if hub_csv_path.exists():
+            hub_data = pd.read_csv(hub_csv_path)
+            print(f"Found hub data with {len(hub_data)} entries")
+
+            if local_data is not None:
+                # Merge data, keeping all entries and removing exact duplicates
+                merged_data = pd.concat([local_data, hub_data]).drop_duplicates()
+                print(f"Merged data has {len(merged_data)} entries")
+
+                # Save merged data
+                merged_data.to_csv(local_csv_path, index=False)
+            else:
+                # If no local data exists, just copy hub data
+                shutil.copy2(hub_csv_path, local_csv_path)
 
-        # Copy files from hub
+        # Copy any other files from hub
         for item in hub_data_source.glob("*"):
-            if item.is_dir():
+            if item.is_file() and item.name != "evaluation_results_exp.csv":
+                shutil.copy2(item, data_dir / item.name)
+            elif item.is_dir():
                 dest = data_dir / item.name
-                if not dest.exists():  # Only copy if doesn't exist locally
+                if not dest.exists():
                     shutil.copytree(item, dest)
 
     # Clean up cloned repo
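The new sync path replaces the old whole-directory backup with a row-level CSV merge: pandas concatenates the local and hub tables and drops exact duplicate rows, so results recorded on either side survive a sync. A minimal sketch of the same pattern, with hypothetical file names:

import pandas as pd

# Both CSVs share the same columns; rows identical in both are kept once.
local = pd.read_csv("local_results.csv")  # hypothetical path
hub = pd.read_csv("hub_results.csv")      # hypothetical path
merged = pd.concat([local, hub]).drop_duplicates()
merged.to_csv("local_results.csv", index=False)

Note that drop_duplicates() with no arguments only removes rows that match on every column; two evaluations of the same post with different values are both kept.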
extract_samples.py ADDED
@@ -0,0 +1,137 @@
+import random
+from datasets import load_dataset
+import pandas as pd
+import os
+from pathlib import Path
+import requests
+from PIL import Image
+from io import BytesIO
+
+# Load the experimental dataset
+dataset = load_dataset("taesiri/IERv2-BattleResults_exp", split="train")
+dataset_post_ids = list(
+    set(
+        load_dataset(
+            "taesiri/IERv2-BattleResults_exp", columns=["post_id"], split="train"
+        )
+        .to_pandas()
+        .post_id.tolist()
+    )
+)
+
+# Load and filter photoexp dataset
+photoexp = pd.read_csv("./photoexp_filtered.csv")
+valid_post_ids = set(photoexp.post_id.tolist())
+
+# Filter dataset to include only valid_post_ids
+dataset = dataset.filter(
+    lambda xs: [x in valid_post_ids for x in xs["post_id"]],
+    batched=True,
+    batch_size=256,
+)
+
+
+def download_and_save_image(url, save_path):
+    """Download image from URL and save it to disk"""
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        img = Image.open(BytesIO(response.content))
+        img.save(save_path)
+        return True
+    except Exception as e:
+        print(f"Error downloading image {url}: {e}")
+        return False
+
+
+def get_random_sample():
+    """Get a random sample by first selecting a post_id then picking random edits for that post."""
+    # First randomly select a post_id from valid posts
+    random_post_id = random.choice(list(valid_post_ids))
+
+    # Filter dataset for this post_id
+    post_edits = dataset.filter(
+        lambda xs: [x == random_post_id for x in xs["post_id"]],
+        batched=True,
+        batch_size=256,
+    )
+
+    # Get matching photoexp entries for this post_id
+    matching_photoexp_entries = photoexp[photoexp.post_id == random_post_id]
+
+    # Randomly select one edit from the dataset
+    idx = random.randint(0, len(post_edits) - 1)
+    sample = post_edits[idx]
+
+    # Randomly select one entry from the matching photoexp entries
+    if not matching_photoexp_entries.empty:
+        random_photoexp_entry = matching_photoexp_entries.sample(n=1).iloc[0]
+        additional_edited_image = random_photoexp_entry["edited_image"]
+        model_b = random_photoexp_entry.get("model")
+        if model_b is None:
+            model_b = f"REDDIT_{random_photoexp_entry['comment_id']}"
+    else:
+        return None
+
+    return {
+        "post_id": sample["post_id"],
+        "instruction": sample["instruction"],
+        "simplified_instruction": sample["simplified_instruction"],
+        "source_image": sample["source_image"],
+        "edit1_image": sample["edited_image"],
+        "edit1_model": sample["model"],
+        "edit2_image": additional_edited_image,
+        "edit2_model": model_b,
+    }
+
+
+def save_sample(sample, output_dir):
+    """Save a sample to disk with all its components"""
+    if sample is None:
+        return False
+
+    # Create directory structure
+    sample_dir = Path(output_dir) / str(sample["post_id"])
+    sample_dir.mkdir(parents=True, exist_ok=True)
+
+    # Save instruction and metadata
+    with open(sample_dir / "metadata.txt", "w") as f:
+        f.write(f"Post ID: {sample['post_id']}\n")
+        f.write(f"Original Instruction: {sample['instruction']}\n")
+        f.write(f"Simplified Instruction: {sample['simplified_instruction']}\n")
+        f.write(f"Edit 1 Model: {sample['edit1_model']}\n")
+        f.write(f"Edit 2 Model: {sample['edit2_model']}\n")
+
+    # Save images
+    success = True
+    success &= download_and_save_image(
+        sample["source_image"], sample_dir / "source.jpg"
+    )
+    success &= download_and_save_image(sample["edit1_image"], sample_dir / "edit1.jpg")
+    success &= download_and_save_image(sample["edit2_image"], sample_dir / "edit2.jpg")
+
+    return success
+
+
+def main():
+    output_dir = Path("extracted_samples")
+    output_dir.mkdir(exist_ok=True)
+
+    num_samples = 100  # Number of samples to extract
+    successful_samples = 0
+
+    print(f"Extracting {num_samples} samples...")
+
+    while successful_samples < num_samples:
+        sample = get_random_sample()
+        if sample and save_sample(sample, output_dir):
+            successful_samples += 1
+            print(f"Successfully saved sample {successful_samples}/{num_samples}")
+        else:
+            print("Failed to save sample, trying next...")
+
+    print(f"Successfully extracted {successful_samples} samples to {output_dir}")
+
+
+if __name__ == "__main__":
+    main()
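Both files lean on the batched form of datasets.Dataset.filter: the predicate receives a batch as a dict of column lists and must return one boolean per row. A self-contained sketch with a toy in-memory dataset (names are illustrative):

from datasets import Dataset

ds = Dataset.from_dict({"post_id": ["a", "b", "c"], "score": [1, 2, 3]})
keep = {"a", "c"}  # hypothetical allow-list of post ids

# With batched=True the lambda gets a batch dict (xs["post_id"] is a list of
# up to 256 ids here) and returns a list of booleans, one per row.
filtered = ds.filter(
    lambda xs: [x in keep for x in xs["post_id"]],
    batched=True,
    batch_size=256,
)
print(filtered["post_id"])  # ['a', 'c']

Batching the predicate avoids one Python call per row, which matters when the same filter runs repeatedly, as in get_random_sample().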