CropGuard / src /data /download.py
mitraarka27's picture
πŸš€ Initial full clean push to Hugging Face
09823ea
import os
import shutil
from kaggle.api.kaggle_api_extended import KaggleApi
from src.utils.config import DATA_DIR, CLEAN_DIR, RAW_DIR, TARGET_CROPS # βœ… Import cleanly
# -------------------------------
# Functions
# -------------------------------
def download_and_extract_dataset():
"""
Download and extract the full PlantVillage dataset.
"""
os.makedirs(RAW_DIR, exist_ok=True)
api = KaggleApi()
api.authenticate()
print("[INFO] Downloading PlantVillage dataset...")
api.dataset_download_files('mohitsingh1804/plantvillage', path=RAW_DIR, unzip=True)
print("[INFO] Download complete and extracted.")
def clean_and_organize_dataset():
"""
Organize Potato, Tomato, Grape from train/ and val/ into clean/ directory.
"""
extracted_dir = os.path.join(RAW_DIR, "plantvillage")
train_dir = os.path.join(extracted_dir, "train")
val_dir = os.path.join(extracted_dir, "val")
if not os.path.exists(CLEAN_DIR):
os.makedirs(CLEAN_DIR)
print(f"[INFO] Created clean directory at: {CLEAN_DIR}")
for split_dir in [train_dir, val_dir]:
if not os.path.exists(split_dir):
raise FileNotFoundError(f"[ERROR] {split_dir} not found.")
for folder in os.listdir(split_dir):
full_folder_path = os.path.join(split_dir, folder)
if os.path.isdir(full_folder_path) and any(folder.startswith(crop) for crop in TARGET_CROPS):
crop_name = folder.split("___")[0].lower()
disease_folder = folder
destination_crop_dir = os.path.join(CLEAN_DIR, crop_name)
os.makedirs(destination_crop_dir, exist_ok=True)
destination_disease_dir = os.path.join(destination_crop_dir, disease_folder)
os.makedirs(destination_disease_dir, exist_ok=True)
for img_file in os.listdir(full_folder_path):
src_img = os.path.join(full_folder_path, img_file)
dst_img = os.path.join(destination_disease_dir, img_file)
shutil.copy(src_img, dst_img)
print("[INFO] Crops cleaned and organized into 'clean/' directory from train and val folders.")
# -------------------------------
# Remove plant_disease_raw after cleaning
# -------------------------------
if os.path.exists(RAW_DIR):
shutil.rmtree(RAW_DIR)
print(f"[INFO] Deleted raw data directory at {RAW_DIR} after cleaning.")
def check_data_integrity():
"""
Quick check that clean/ has the crops properly.
"""
if not os.path.exists(CLEAN_DIR):
raise FileNotFoundError(f"[ERROR] Clean folder {CLEAN_DIR} not found!")
for crop in os.listdir(CLEAN_DIR):
crop_dir = os.path.join(CLEAN_DIR, crop)
print(f"[INFO] {crop.capitalize()}: {len(os.listdir(crop_dir))} disease classes found.")