From 53e5d02a289ae9440dde019e90af88481120dea0 Mon Sep 17 00:00:00 2001 From: Ultralytics Assistant <135830346+UltralyticsAssistant@users.noreply.github.com> Date: Mon, 7 Oct 2024 16:51:17 +0200 Subject: [PATCH] New `create_synthetic_coco_dataset` function (#16742) Signed-off-by: UltralyticsAssistant --- docs/en/reference/data/converter.md | 4 ++ ultralytics/data/converter.py | 70 ++++++++++++++++++++++++++--- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/docs/en/reference/data/converter.md b/docs/en/reference/data/converter.md index d4ba3d58..073c760e 100644 --- a/docs/en/reference/data/converter.md +++ b/docs/en/reference/data/converter.md @@ -41,4 +41,8 @@ keywords: Ultralytics, data conversion, YOLO models, COCO, DOTA, YOLO bbox2segme ## ::: ultralytics.data.converter.yolo_bbox2segment +



+ +## ::: ultralytics.data.converter.create_synthetic_coco_dataset +

diff --git a/ultralytics/data/converter.py b/ultralytics/data/converter.py
index 03dbf0ad..6305c6de 100644
--- a/ultralytics/data/converter.py
+++ b/ultralytics/data/converter.py
@@ -1,13 +1,18 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
 import json
+import random
+import shutil
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 
 import cv2
 import numpy as np
+from PIL import Image
 
-from ultralytics.utils import LOGGER, TQDM
+from ultralytics.utils import DATASETS_DIR, LOGGER, NUM_THREADS, TQDM
+from ultralytics.utils.downloads import download
 from ultralytics.utils.files import increment_path
 
 
@@ -588,15 +593,13 @@ def yolo_bbox2segment(im_dir, save_dir=None, sam_model="sam_b.pt"):
 
             - im_dir
                 ├─ 001.jpg
-                ├─ ..
+                ├─ ...
                 └─ NNN.jpg
             - labels
                 ├─ 001.txt
-                ├─ ..
+                ├─ ...
                 └─ NNN.txt
     """
-    from tqdm import tqdm
-
     from ultralytics import SAM
     from ultralytics.data import YOLODataset
     from ultralytics.utils import LOGGER
@@ -610,7 +613,7 @@ def yolo_bbox2segment(im_dir, save_dir=None, sam_model="sam_b.pt"):
 
     LOGGER.info("Detection labels detected, generating segment labels by SAM model!")
     sam_model = SAM(sam_model)
-    for label in tqdm(dataset.labels, total=len(dataset.labels), desc="Generating segment labels"):
+    for label in TQDM(dataset.labels, total=len(dataset.labels), desc="Generating segment labels"):
         h, w = label["shape"]
         boxes = label["bboxes"]
         if len(boxes) == 0:  # skip empty labels
@@ -635,3 +638,65 @@ def yolo_bbox2segment(im_dir, save_dir=None, sam_model="sam_b.pt"):
         with open(txt_file, "a") as f:
             f.writelines(text + "\n" for text in texts)
     LOGGER.info(f"Generated segment labels saved in {save_dir}")
+
+
+def create_synthetic_coco_dataset():
+    """
+    Creates a synthetic COCO dataset with random images and existing labels.
+
+    This function downloads COCO labels, creates synthetic images for train2017 and val2017 subsets, and organizes
+    them in the COCO dataset structure. It uses multithreading to generate images efficiently.
+
+    Examples:
+        >>> create_synthetic_coco_dataset()
+
+    Raises:
+        Exception: Any error raised while generating or saving an image is re-raised from the worker thread.
+
+    Notes:
+        - Requires internet connection to download label files.
+        - Generates solid-color RGB images whose width and height are each drawn from 480 to 640 pixels.
+        - Images that already exist on disk are left untouched.
+        - Existing test2017 directory is removed as it's not needed.
+        - If label directories don't exist, image creation for that subset is skipped.
+    """
+
+    def create_synthetic_image(image_file: Path) -> None:
+        """Saves a solid-color RGB image of random size to `image_file` unless the file already exists."""
+        if not image_file.exists():
+            size = (random.randint(480, 640), random.randint(480, 640))
+            Image.new(
+                "RGB",
+                size=size,
+                color=(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
+            ).save(image_file)
+
+    # Download labels
+    coco_dir = DATASETS_DIR / "coco"  # named `coco_dir` so the `dir` builtin is not shadowed
+    url = "https://github.com/ultralytics/assets/releases/download/v0.0.0/"
+    label_zip = "coco2017labels-segments.zip"
+    download([url + label_zip], dir=coco_dir.parent)
+
+    # Create synthetic images
+    shutil.rmtree(coco_dir / "labels" / "test2017", ignore_errors=True)  # Remove test2017 directory as not needed
+    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
+        for subset in ["train2017", "val2017"]:
+            subset_dir = coco_dir / "images" / subset
+            subset_dir.mkdir(parents=True, exist_ok=True)
+
+            label_dir = coco_dir / "labels" / subset
+            if not label_dir.exists():
+                LOGGER.warning(
+                    f"Warning: Label directory {label_dir} does not exist. Skipping image creation for {subset}."
+                )
+                continue
+
+            # One image per label file, sharing the label file's stem
+            image_files = [subset_dir / f"{label_file.stem}.jpg" for label_file in label_dir.glob("*.txt")]
+
+            # Submit all tasks
+            futures = [executor.submit(create_synthetic_image, image_file) for image_file in image_files]
+            for future in TQDM(as_completed(futures), total=len(futures), desc=f"Generating images for {subset}"):
+                future.result()  # surface worker exceptions instead of silently discarding them
+
+    LOGGER.info("Synthetic COCO dataset created successfully.")