|
import json |
|
import os |
|
from pathlib import Path |
|
import requests |
|
from tqdm import tqdm |
|
|
|
|
|
def download_eurorad_figures(metadata_path: str, output_dir: str) -> None:
    """
    Download figures from Eurorad dataset and save them organized by case_id.

    Args:
        metadata_path: Path to the eurorad_metadata.json file. The file is
            read as UTF-8 and is expected to map keys to case objects that
            each carry "case_id" and "figures" entries.
        output_dir: Base directory where figures will be saved. It is created
            (together with any missing parent directories) if absent.

    The figures will be saved as:
        {output_dir}/{case_id}/{figure_number}.jpg
    where figure_number is the subfigure's "number" value with surrounding
    whitespace stripped, spaces replaced by underscores, and lowercased.

    Example:
        figures/189/figure_1a.jpg
    """
    output_path = Path(output_dir)
    # parents=True so a nested, not-yet-existing output_dir does not raise.
    output_path.mkdir(parents=True, exist_ok=True)

    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(metadata_path, encoding="utf-8") as f:
        metadata = json.load(f)

    for case in tqdm(metadata.values(), desc="Downloading cases", unit="case"):
        # The directory is named after the case's own "case_id" field,
        # not after the key it is stored under in the metadata mapping.
        case_dir = output_path / str(case["case_id"])
        case_dir.mkdir(exist_ok=True)

        for figure in case["figures"]:
            for subfig in figure["subfigures"]:
                subfig_name = f"{subfig['number'].strip().replace(' ', '_').lower()}.jpg"
                save_figure(
                    url=subfig["url"],
                    output_path=case_dir / subfig_name,
                )
|
|
|
|
|
def save_figure(url: str, output_path: Path) -> None:
    """
    Download and save a single figure.

    If output_path already exists the function returns without making a
    request. Any error during download or write is reported on stdout and
    swallowed, so one failed figure does not abort a whole run.

    Args:
        url: URL of the figure to download
        output_path: Path where the figure should be saved
    """
    if output_path.exists():
        return

    # Write to a sibling ".part" file and rename it into place only after the
    # write succeeds. Writing straight to output_path could leave a truncated
    # file behind, which the exists() check above would then skip forever.
    partial_path = output_path.with_name(output_path.name + ".part")

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        with open(partial_path, "wb") as f:
            f.write(response.content)
        partial_path.replace(output_path)
    except Exception as e:
        try:
            partial_path.unlink()
        except OSError:
            # Nothing to clean up (or cleanup itself failed); the original
            # error reported below is the one that matters.
            pass
        print(f"Error downloading {url}: {e}")
|
|
|
|
|
if __name__ == "__main__":
    # Resolve both the metadata file and the output folder relative to the
    # directory containing this script, independent of the working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    metadata_file = os.path.join(script_dir, "eurorad_metadata.json")
    figures_dir = os.path.join(script_dir, "figures")

    download_eurorad_figures(metadata_path=metadata_file, output_dir=figures_dir)
|
|