taesiri committed on
Commit
05b8870
·
1 Parent(s): df7d714
extract_samples.py DELETED
@@ -1,137 +0,0 @@
1
- import random
2
- from datasets import load_dataset
3
- import pandas as pd
4
- import os
5
- from pathlib import Path
6
- import requests
7
- from PIL import Image
8
- from io import BytesIO
9
-
10
# Load the experimental dataset (train split).
dataset = load_dataset("taesiri/IERv2-BattleResults_exp", split="train")

# Distinct post ids present in the dataset. Taken from the split that is
# already loaded rather than loading the dataset a second time.
dataset_post_ids = dataset.unique("post_id")

# Load the pre-filtered photoexp table.
photoexp = pd.read_csv("./photoexp_filtered.csv")

# Keep only post ids that occur in BOTH sources. Without the intersection a
# randomly chosen "valid" post could have no rows in the dataset at all,
# which leaves the sampler with nothing to pick from.
valid_post_ids = set(photoexp.post_id.tolist()) & set(dataset_post_ids)

# Restrict the dataset to rows whose post_id is valid.
dataset = dataset.filter(
    lambda xs: [x in valid_post_ids for x in xs["post_id"]],
    batched=True,
    batch_size=256,
)
32
-
33
-
34
def download_and_save_image(url, save_path, timeout=30):
    """Download an image from ``url`` and save it to ``save_path``.

    Args:
        url: Address of the image to fetch.
        save_path: Destination path; Pillow picks the output format from
            its suffix.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        True if the image was downloaded and written, False otherwise (the
        error is printed, never raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        with Image.open(BytesIO(response.content)) as img:
            # JPEG cannot store alpha or palette data; convert such images
            # instead of losing the whole sample to a save error.
            # Note: conversion to RGB discards any transparency.
            is_jpeg = str(save_path).lower().endswith((".jpg", ".jpeg"))
            if is_jpeg and img.mode not in ("1", "L", "RGB", "CMYK", "YCbCr"):
                img = img.convert("RGB")
            img.save(save_path)
        return True
    except Exception as e:
        # Deliberately broad: this is a best-effort boundary and the caller
        # only needs to know whether the file was written.
        print(f"Error downloading image {url}: {e}")
        return False
45
-
46
-
47
def get_random_sample():
    """Build one random comparison sample for a randomly chosen post.

    A post id is drawn from ``valid_post_ids``; one edit for that post is
    drawn from ``dataset`` and one entry for the same post is drawn from
    ``photoexp``.

    Returns:
        A dict with the keys ``post_id``, ``instruction``,
        ``simplified_instruction``, ``source_image``, ``edit1_image``,
        ``edit1_model``, ``edit2_image`` and ``edit2_model``; or None when
        the chosen post has no photoexp entry or no edit in the dataset.
    """
    # First randomly select a post_id from valid posts.
    random_post_id = random.choice(list(valid_post_ids))

    # Check the in-memory photoexp table first: it is cheap, and a miss
    # avoids a pointless full scan of the dataset below.
    matching_photoexp_entries = photoexp[photoexp.post_id == random_post_id]
    if matching_photoexp_entries.empty:
        return None

    # Collect every edit in the dataset that belongs to this post.
    post_edits = dataset.filter(
        lambda xs: [x == random_post_id for x in xs["post_id"]],
        batched=True,
        batch_size=256,
    )
    # A post with no edits cannot yield a sample; report it as a miss rather
    # than letting the random index selection raise on an empty range.
    if len(post_edits) == 0:
        return None

    # Randomly select one edit from the dataset.
    sample = post_edits[random.randrange(len(post_edits))]

    # Randomly select one entry from the matching photoexp entries.
    random_photoexp_entry = matching_photoexp_entries.sample(n=1).iloc[0]
    model_b = random_photoexp_entry.get("model")
    # pandas represents blank CSV cells as NaN, not None, so test with
    # pd.isna to make the fallback label cover both cases.
    if pd.isna(model_b):
        model_b = f"REDDIT_{random_photoexp_entry['comment_id']}"

    return {
        "post_id": sample["post_id"],
        "instruction": sample["instruction"],
        "simplified_instruction": sample["simplified_instruction"],
        "source_image": sample["source_image"],
        "edit1_image": sample["edited_image"],
        "edit1_model": sample["model"],
        "edit2_image": random_photoexp_entry["edited_image"],
        "edit2_model": model_b,
    }
86
-
87
-
88
def save_sample(sample, output_dir):
    """Write one sample (metadata plus three images) under ``output_dir``.

    Files are placed in ``<output_dir>/<post_id>/``: ``metadata.txt``,
    ``source.jpg``, ``edit1.jpg`` and ``edit2.jpg``.

    Args:
        sample: Dict with the keys ``post_id``, ``instruction``,
            ``simplified_instruction``, ``source_image``, ``edit1_image``,
            ``edit1_model``, ``edit2_image`` and ``edit2_model``; or None.
        output_dir: Root directory for extracted samples.

    Returns:
        True only if all three images were saved; False if ``sample`` is
        None or any download failed.
    """
    if sample is None:
        return False

    # Create directory structure.
    sample_dir = Path(output_dir) / str(sample["post_id"])
    sample_dir.mkdir(parents=True, exist_ok=True)

    # Save instruction and metadata. The encoding is explicit because the
    # instruction text is free-form and may contain non-ASCII characters
    # that the platform default encoding cannot represent.
    metadata_lines = [
        f"Post ID: {sample['post_id']}\n",
        f"Original Instruction: {sample['instruction']}\n",
        f"Simplified Instruction: {sample['simplified_instruction']}\n",
        f"Edit 1 Model: {sample['edit1_model']}\n",
        f"Edit 2 Model: {sample['edit2_model']}\n",
    ]
    with open(sample_dir / "metadata.txt", "w", encoding="utf-8") as f:
        f.writelines(metadata_lines)

    # Save images. Every download is attempted even after a failure, so the
    # directory holds whatever could be fetched.
    image_targets = (
        ("source_image", "source.jpg"),
        ("edit1_image", "edit1.jpg"),
        ("edit2_image", "edit2.jpg"),
    )
    success = True
    for key, filename in image_targets:
        success &= download_and_save_image(sample[key], sample_dir / filename)

    return success
114
-
115
-
116
def main():
    """Extract random samples into ``extracted_samples/``.

    Keeps drawing random samples until ``num_samples`` distinct posts have
    been saved or the attempt budget is exhausted, printing progress along
    the way.
    """
    output_dir = Path("extracted_samples")
    output_dir.mkdir(exist_ok=True)

    num_samples = 100  # Number of samples to extract
    # Upper bound on draws so persistent failures (network down, too few
    # usable posts) end the run instead of looping forever.
    max_attempts = num_samples * 20
    successful_samples = 0
    attempts = 0
    # Samples are stored in a directory named after the post id, so a second
    # sample for the same post would overwrite the first while still being
    # counted. Track saved ids to keep the count honest.
    saved_post_ids = set()

    print(f"Extracting {num_samples} samples...")

    while successful_samples < num_samples and attempts < max_attempts:
        attempts += 1
        sample = get_random_sample()
        if sample and sample["post_id"] in saved_post_ids:
            continue
        if sample and save_sample(sample, output_dir):
            saved_post_ids.add(sample["post_id"])
            successful_samples += 1
            print(f"Successfully saved sample {successful_samples}/{num_samples}")
        else:
            print("Failed to save sample, trying next...")

    if successful_samples < num_samples:
        print(f"Stopped after {attempts} attempts without reaching {num_samples} samples")

    print(f"Successfully extracted {successful_samples} samples to {output_dir}")


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
instructions/home.jpg ADDED
instructions/page2.jpg ADDED
instructions/page3.jpg ADDED