Deadmon committed on
Commit 3eb9780 · verified · 1 Parent(s): 9e2e26a

Update app.py

Files changed (1)
  1. app.py +27 -287
app.py CHANGED
@@ -7,276 +7,7 @@ from PIL import Image
  import json
  from datetime import date
  import random
- 
- # Configuration
- OUTPUT_DIR = "downloaded_images"
- IMAGES_DIR = os.path.join(OUTPUT_DIR, "images")
- ZIP_FILE = os.path.join(OUTPUT_DIR, "images.zip")
- TRACKING_FILE = os.path.join(OUTPUT_DIR, "used_pages.json")
- 
- # Ensure output directory exists
- os.makedirs(OUTPUT_DIR, exist_ok=True)
- 
- # Constants
- ITEMS_PER_PAGE = 40
- DAILY_IMAGE_LIMIT = 2000  # Adjusted for free tier limits
- MAX_PAGES = DAILY_IMAGE_LIMIT // ITEMS_PER_PAGE
- 
- # API Configurations
- API_CONFIGS = {
-     "pexels": {
-         "base_url": "https://api.pexels.com/v1/search",
-         "headers": {"Authorization": "klHADHclpse2e2xSP9h747AgfE1Rx0wioemGhXYtedjZzvJ1WBUKwz7g"},
-         "image_key": "src.medium",
-         "result_key": "photos"
-     },
-     "unsplash": {
-         "base_url": "https://api.unsplash.com/search/photos",
-         "headers": {"Authorization": "722961 na6HV6Ym7dCeK1cZM5GRkWpNmhWsV1ZwusOpkTaCL9U"},
-         "image_key": "urls.small",
-         "result_key": "results"
-     },
-     "pixabay": {
-         "base_url": "https://pixabay.com/api/?key=45122300-cd3621e1539e8e95430ee3efc&q={category}&per_page={ITEMS_PER_PAGE}&page={page}",
-         "headers": {},
-         "image_key": "webformatURL",
-         "result_key": "hits"
-     }
- }
- 
- def load_used_pages():
-     """Load or initialize the used pages tracking file."""
-     today = str(date.today())
-     if os.path.exists(TRACKING_FILE):
-         with open(TRACKING_FILE, "r") as f:
-             data = json.load(f)
-         if data.get("date") != today:
-             data = {"date": today, "used_pages": {}}
-     else:
-         data = {"date": today, "used_pages": {}}
-     return data
- 
- def save_used_pages(data):
-     """Save the used pages tracking file."""
-     with open(TRACKING_FILE, "w") as f:
-         json.dump(data, f)
- 
- def get_available_pages(num_pages_needed, api_name):
-     """Get a list of unused page numbers for the specified API."""
-     data = load_used_pages()
-     used_pages = set(data["used_pages"].get(api_name, []))
-     all_pages = set(range(1, MAX_PAGES + 1))
-     available_pages = list(all_pages - used_pages)
- 
-     if len(available_pages) < num_pages_needed:
-         return None
- 
-     selected_pages = random.sample(available_pages, num_pages_needed)
-     if api_name not in data["used_pages"]:
-         data["used_pages"][api_name] = []
-     data["used_pages"][api_name].extend(selected_pages)
-     save_used_pages(data)
-     return selected_pages
- 
- def fetch_image_urls(api_name, category, num_images):
-     """Fetch image URLs from the specified API based on category and desired number of images."""
-     config = API_CONFIGS[api_name]
-     num_pages_needed = (num_images + ITEMS_PER_PAGE - 1) // ITEMS_PER_PAGE
-     pages = get_available_pages(num_pages_needed, api_name)
- 
-     if not pages:
-         return []
- 
-     image_urls = []
-     for page in pages:
-         if api_name == "pixabay":
-             url = config["base_url"].format(category=category.lower(), page=page, ITEMS_PER_PAGE=ITEMS_PER_PAGE)
-         else:
-             url = f"{config['base_url']}?query={category}&per_page={ITEMS_PER_PAGE}&page={page}"
-         try:
-             response = requests.get(url, headers=config["headers"])
-             response.raise_for_status()
-             data = response.json()
- 
-             if config["result_key"] not in data or not data[config["result_key"]]:
-                 break
- 
-             for item in data[config["result_key"]]:
-                 if len(image_urls) >= num_images:
-                     break
-                 image_url = item.get(config["image_key"])
-                 if image_url:
-                     image_urls.append(image_url)
-         except requests.exceptions.RequestException as e:
-             print(f"Error fetching page {page} from {api_name}: {e}")
-             break
- 
-     return image_urls[:num_images]
- 
- def download_images(image_urls):
-     """Download images from the provided URLs and save to IMAGES_DIR."""
-     if os.path.exists(IMAGES_DIR):
-         shutil.rmtree(IMAGES_DIR)
-     os.makedirs(IMAGES_DIR, exist_ok=True)
- 
-     downloaded_count = 0
-     image_paths = []
-     for idx, url in enumerate(image_urls, 1):
-         try:
-             response = requests.get(url, stream=True)
-             response.raise_for_status()
-             image_path = os.path.join(IMAGES_DIR, f"img{idx}.jpg")
-             with open(image_path, "wb") as f:
-                 for chunk in response.iter_content(chunk_size=8192):
-                     if chunk:
-                         f.write(chunk)
-             Image.open(image_path).verify()
-             downloaded_count += 1
-             image_paths.append(image_path)
-             print(f"Downloaded {idx}/{len(image_urls)}: {url}")
-         except Exception as e:
-             print(f"Error downloading {url}: {e}")
- 
-     return downloaded_count, image_paths
- 
- def create_zip_file(selected_image_paths):
-     """Create a ZIP file of the selected images."""
-     if os.path.exists(ZIP_FILE):
-         os.remove(ZIP_FILE)
-     with zipfile.ZipFile(ZIP_FILE, 'w', zipfile.ZIP_DEFLATED) as zipf:
-         for image_path in selected_image_paths:
-             arcname = os.path.relpath(image_path, OUTPUT_DIR)
-             zipf.write(image_path, arcname)
-     return ZIP_FILE
- 
- def process_and_display(api_name, category, num_images):
-     """Fetch and download images, then prepare data for display."""
-     num_images = int(num_images)
-     if num_images > 24:
-         num_images = 24
- 
-     image_urls = fetch_image_urls(api_name, category, num_images)
-     if not image_urls:
-         return "No unique images available today or API limit reached.", None, None, [None] * 24, [False] * 24
- 
-     downloaded_count, image_paths = download_images(image_urls)
-     if downloaded_count == 0:
-         return "No images were successfully downloaded.", None, None, [None] * 24, [False] * 24
- 
-     status = f"Successfully downloaded {downloaded_count}/{num_images} images from {api_name}. Select images to include in ZIP below."
-     image_outputs = [image_paths[i] if i < len(image_paths) else None for i in range(24)]
-     checkbox_outputs = [True if i < len(image_paths) else False for i in range(24)]
- 
-     return status, None, image_paths, image_outputs, checkbox_outputs
- 
- def process_zip_submission(image_paths, *checkbox_states):
-     """Create a ZIP file based on the selected images."""
-     if not image_paths:
-         return "No images available to process.", None
- 
-     selected_image_paths = [image_paths[i] for i, state in enumerate(checkbox_states) if state]
-     if not selected_image_paths:
-         return "No images selected for ZIP.", None
- 
-     zip_path = create_zip_file(selected_image_paths)
-     return f"ZIP file created with {len(selected_image_paths)} images at {zip_path}", zip_path
- 
- # Gradio Interface
- with gr.Blocks(title="Stock Photo Downloader") as demo:
-     gr.Markdown("### Select Parameters to Download Stock Photos")
-     api_input = gr.Dropdown(
-         label="API Source",
-         choices=["pexels", "unsplash", "pixabay"],
-         value="pexels"
-     )
-     category_input = gr.Dropdown(
-         label="Category",
-         choices=["nature", "business", "people", "technology", "food", "travel", "animals", "fashion"],
-         value="nature",
-         allow_custom_value=True
-     )
-     num_images_input = gr.Dropdown(
-         label="Number of Images (Max 24)",
-         choices=["4", "8", "12", "16", "20", "24"],
-         value="4"
-     )
-     download_button = gr.Button("Fetch and Display Images")
- 
-     gr.Markdown("### Download Status")
-     status_output = gr.Textbox(label="Status", interactive=False)
- 
-     gr.Markdown("### Download Your Images")
-     zip_output = gr.File(label="Download ZIP", visible=False)
- 
-     gr.Markdown("### Image Gallery (Click Thumbnails to View Full Size)")
-     image_paths_state = gr.State()
- 
-     IMAGES_PER_ROW = 4
-     MAX_ROWS = 6
-     TOTAL_IMAGES = IMAGES_PER_ROW * MAX_ROWS
- 
-     image_outputs = []
-     checkbox_outputs = []
- 
-     for row in range(MAX_ROWS):
-         with gr.Row():
-             for col in range(IMAGES_PER_ROW):
-                 idx = row * IMAGES_PER_ROW + col
-                 with gr.Column(min_width=150):
-                     image_output = gr.Image(
-                         label=f"Image {idx+1}",
-                         visible=False,
-                         height=150,
-                         width=150
-                     )
-                     checkbox_output = gr.Checkbox(
-                         label=f"Include in ZIP",
-                         value=True,
-                         visible=False
-                     )
-                     image_outputs.append(image_output)
-                     checkbox_outputs.append(checkbox_output)
- 
-     gr.Markdown("### Submit Selections")
-     submit_button = gr.Button("Create ZIP of Selected Images")
- 
-     def on_download(api_name, category, num_images):
-         status, zip_path, image_paths, image_outs, checkbox_outs = process_and_display(api_name, category, num_images)
-         return (
-             status,
-             zip_path,
-             image_paths,
-             *[gr.Image(value=img, visible=img is not None, label=f"Image {i+1}", height=150, width=150) if img else gr.Image(value=None, visible=False) for i, img in enumerate(image_outs)],
-             *[gr.Checkbox(value=chk, visible=chk, label=f"Include in ZIP") if chk else gr.Checkbox(value=False, visible=False) for chk in checkbox_outs]
-         )
- 
-     def on_submit(image_paths, *checkbox_states):
-         status, zip_path = process_zip_submission(image_paths, *checkbox_states)
-         return status, gr.File(value=zip_path, visible=True) if zip_path else gr.File(visible=False)
- 
-     download_button.click(
-         fn=on_download,
-         inputs=[api_input, category_input, num_images_input],
-         outputs=[status_output, zip_output, image_paths_state] + image_outputs + checkbox_outputs
-     )
- 
-     submit_button.click(
-         fn=on_submit,
-         inputs=[image_paths_state] + checkbox_outputs,
-         outputs=[status_output, zip_output]
-     )
- 
- demo.launch()
- 
- import os
- import requests
- import zipfile
- import gradio as gr
- import shutil
- from PIL import Image
- import json
- from datetime import date
- import random
+ import time
  import logging
  
  # Configure logging
@@ -299,23 +30,26 @@ MAX_PAGES = DAILY_IMAGE_LIMIT // ITEMS_PER_PAGE
  
  # API Configurations (Replace with valid keys)
  API_CONFIGS = {
-     "pexels": {
+     "pexels": {
          "base_url": "https://api.pexels.com/v1/search",
          "headers": {"Authorization": "klHADHclpse2e2xSP9h747AgfE1Rx0wioemGhXYtedjZzvJ1WBUKwz7g"},
          "image_key": "src.medium",
-         "result_key": "photos"
+         "result_key": "photos",
+         "delay": 1  # Seconds between requests to avoid rate limits
      },
      "unsplash": {
          "base_url": "https://api.unsplash.com/search/photos",
-         "headers": {"Authorization": "722961 na6HV6Ym7dCeK1cZM5GRkWpNmhWsV1ZwusOpkTaCL9U"},
+         "headers": {"Authorization": "Client-ID YOUR_NEW_UNSPLASH_API_KEY"},  # Replace with new key
          "image_key": "urls.small",
-         "result_key": "results"
+         "result_key": "results",
+         "delay": 2  # Stricter limit (50 requests/hour)
      },
      "pixabay": {
-         "base_url": "https://pixabay.com/api/?key=45122300-cd3621e1539e8e95430ee3efc&q={category}&per_page={ITEMS_PER_PAGE}&page={page}",
+         "base_url": "https://pixabay.com/api/?key=YOUR_NEW_PIXABAY_API_KEY&q={category}&per_page={ITEMS_PER_PAGE}&page={page}",
          "headers": {},
          "image_key": "webformatURL",
-         "result_key": "hits"
+         "result_key": "hits",
+         "delay": 1
      }
  }
  
@@ -366,19 +100,21 @@ def fetch_image_urls(api_name, category, num_images):
          return []
  
      image_urls = []
+     data = load_used_pages()  # Load current data to modify used_pages
      for page in pages:
          if api_name == "pixabay":
              url = config["base_url"].format(category=category.lower(), page=page, ITEMS_PER_PAGE=ITEMS_PER_PAGE)
          else:
              url = f"{config['base_url']}?query={category}&per_page={ITEMS_PER_PAGE}&page={page}"
          try:
+             time.sleep(config.get("delay", 0))  # Add delay to respect rate limits
              response = requests.get(url, headers=config["headers"])
              response.raise_for_status()
              data = response.json()
  
              if config["result_key"] not in data or not data[config["result_key"]]:
                  logger.warning(f"No results for page {page} from {api_name}")
-                 break
+                 continue  # Skip to next page instead of breaking
  
              for item in data[config["result_key"]]:
                  if len(image_urls) >= num_images:
@@ -386,19 +122,23 @@ def fetch_image_urls(api_name, category, num_images):
                  image_url = item.get(config["image_key"])
                  if image_url:
                      image_urls.append(image_url)
-             # Only mark page as used if images are successfully fetched
-             if not image_urls:
-                 data["used_pages"][api_name].remove(page)
-                 save_used_pages(data)
-                 logger.info(f"Removed unused page {page} for {api_name}")
+             # Mark page as used only if images are fetched
+             if image_urls:
+                 if page not in data["used_pages"].get(api_name, []):
+                     data["used_pages"].setdefault(api_name, []).append(page)
+                     save_used_pages(data)
+                     logger.info(f"Marked page {page} as used for {api_name}")
          except requests.exceptions.RequestException as e:
              logger.error(f"Error fetching page {page} from {api_name}: {e}")
              if response.status_code == 401:
-                 logger.error(f"401 Unauthorized for {api_name}. Check API key.")
+                 logger.error(f"401 Unauthorized for {api_name}. Please regenerate your API key at https://unsplash.com/developers (Unsplash) or similar.")
              elif response.status_code == 400:
-                 logger.error(f"400 Bad Request for {api_name}. Check URL or API key.")
-             data["used_pages"][api_name].remove(page)
-             save_used_pages(data)
+                 logger.error(f"400 Bad Request for {api_name}. Check API key or ensure page {page} is valid.")
+             # Remove failed page from used_pages
+             if page in data["used_pages"].get(api_name, []):
+                 data["used_pages"][api_name].remove(page)
+                 save_used_pages(data)
+                 logger.info(f"Removed failed page {page} from {api_name}")
              break
  
      return image_urls[:num_images]
@@ -447,7 +187,7 @@ def process_and_display(api_name, category, num_images):
  
      image_urls = fetch_image_urls(api_name, category, num_images)
      if not image_urls:
-         return "No unique images available today or API limit reached.", None, None, [None] * 24, [False] * 24
+         return "No unique images available today or API limit reached. Check logs for details.", None, None, [None] * 24, [False] * 24
  
      downloaded_count, image_paths = download_images(image_urls)
      if downloaded_count == 0:
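
For reference, the page-tracking changes above read and write used_pages.json through load_used_pages() and save_used_pages(). A minimal sketch of that file's shape, with made-up values (the real file is created under downloaded_images/ at runtime):

    # Illustrative contents of downloaded_images/used_pages.json (example values only)
    example_tracking = {
        "date": "2025-05-01",      # load_used_pages() resets the file when the date changes
        "used_pages": {
            "pexels": [3, 17],     # page numbers already fetched today, tracked per API
            "pixabay": [8],
        },
    }

The keys flagged "Replace with valid keys" could also be read from the environment rather than hardcoded; a sketch under assumed variable names, not part of this commit:

    import os

    # Hypothetical environment-variable names
    UNSPLASH_KEY = os.environ.get("UNSPLASH_API_KEY", "YOUR_NEW_UNSPLASH_API_KEY")
    PIXABAY_KEY = os.environ.get("PIXABAY_API_KEY", "YOUR_NEW_PIXABAY_API_KEY")
    # e.g. API_CONFIGS["unsplash"]["headers"] = {"Authorization": f"Client-ID {UNSPLASH_KEY}"}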