New create_synthetic_coco_dataset function (#16742)

Signed-off-by: UltralyticsAssistant <web@ultralytics.com>
This commit is contained in:
Ultralytics Assistant 2024-10-07 16:51:17 +02:00 committed by GitHub
parent d88e57f143
commit 53e5d02a28
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 68 additions and 6 deletions

View file

@ -41,4 +41,8 @@ keywords: Ultralytics, data conversion, YOLO models, COCO, DOTA, YOLO bbox2segme
## ::: ultralytics.data.converter.yolo_bbox2segment ## ::: ultralytics.data.converter.yolo_bbox2segment
<br><br><hr><br>
## ::: ultralytics.data.converter.create_synthetic_coco_dataset
<br><br> <br><br>

View file

@ -1,13 +1,18 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license # Ultralytics YOLO 🚀, AGPL-3.0 license
import json import json
import random
import shutil
from collections import defaultdict from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path from pathlib import Path
import cv2 import cv2
import numpy as np import numpy as np
from PIL import Image
from ultralytics.utils import LOGGER, TQDM from ultralytics.utils import DATASETS_DIR, LOGGER, NUM_THREADS, TQDM
from ultralytics.utils.downloads import download
from ultralytics.utils.files import increment_path from ultralytics.utils.files import increment_path
@ -588,15 +593,13 @@ def yolo_bbox2segment(im_dir, save_dir=None, sam_model="sam_b.pt"):
- im_dir - im_dir
001.jpg 001.jpg
.. ...
NNN.jpg NNN.jpg
- labels - labels
001.txt 001.txt
.. ...
NNN.txt NNN.txt
""" """
from tqdm import tqdm
from ultralytics import SAM from ultralytics import SAM
from ultralytics.data import YOLODataset from ultralytics.data import YOLODataset
from ultralytics.utils import LOGGER from ultralytics.utils import LOGGER
@ -610,7 +613,7 @@ def yolo_bbox2segment(im_dir, save_dir=None, sam_model="sam_b.pt"):
LOGGER.info("Detection labels detected, generating segment labels by SAM model!") LOGGER.info("Detection labels detected, generating segment labels by SAM model!")
sam_model = SAM(sam_model) sam_model = SAM(sam_model)
for label in tqdm(dataset.labels, total=len(dataset.labels), desc="Generating segment labels"): for label in TQDM(dataset.labels, total=len(dataset.labels), desc="Generating segment labels"):
h, w = label["shape"] h, w = label["shape"]
boxes = label["bboxes"] boxes = label["bboxes"]
if len(boxes) == 0: # skip empty labels if len(boxes) == 0: # skip empty labels
@ -635,3 +638,58 @@ def yolo_bbox2segment(im_dir, save_dir=None, sam_model="sam_b.pt"):
with open(txt_file, "a") as f: with open(txt_file, "a") as f:
f.writelines(text + "\n" for text in texts) f.writelines(text + "\n" for text in texts)
LOGGER.info(f"Generated segment labels saved in {save_dir}") LOGGER.info(f"Generated segment labels saved in {save_dir}")
def create_synthetic_coco_dataset():
    """
    Creates a synthetic COCO dataset with random images and existing labels.

    This function downloads the COCO label archive, then creates one solid-color synthetic image for every label file
    found in the train2017 and val2017 label directories, placing the images in the matching COCO `images/<subset>`
    directories. Image generation is fanned out over a thread pool.

    Examples:
        >>> create_synthetic_coco_dataset()

    Notes:
        - Requires internet connection to download label files.
        - Generates single-color RGB images whose width and height are each drawn from 480 to 640 pixels.
        - Images that already exist on disk are left untouched.
        - Existing test2017 label directory is removed as it's not needed.
        - If a label directory doesn't exist, image creation for that subset is skipped with a warning.
        - A failure to write an individual image is logged as a warning and does not abort the remaining images.
    """

    def create_synthetic_image(image_file: Path) -> None:
        """Writes a solid-color RGB image of random size to `image_file` unless that file already exists."""
        if image_file.exists():
            return
        size = (random.randint(480, 640), random.randint(480, 640))
        color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
        Image.new("RGB", size=size, color=color).save(image_file)

    # Download labels into the parent of the COCO directory
    coco_dir = DATASETS_DIR / "coco"  # named coco_dir to avoid shadowing the builtin `dir`
    url = "https://github.com/ultralytics/assets/releases/download/v0.0.0/"
    label_zip = "coco2017labels-segments.zip"
    download([url + label_zip], dir=coco_dir.parent)

    # Create synthetic images
    shutil.rmtree(coco_dir / "labels" / "test2017", ignore_errors=True)  # Remove test2017 directory as not needed
    failures = 0
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        for subset in ["train2017", "val2017"]:
            subset_dir = coco_dir / "images" / subset
            subset_dir.mkdir(parents=True, exist_ok=True)

            label_dir = coco_dir / "labels" / subset
            if not label_dir.exists():
                LOGGER.warning(f"Label directory {label_dir} does not exist. Skipping image creation for {subset}.")
                continue

            # One image per label file, sharing the label's stem
            image_files = [subset_dir / f"{label_file.stem}.jpg" for label_file in label_dir.glob("*.txt")]

            # Submit all tasks, remembering which file each future is responsible for
            futures = {executor.submit(create_synthetic_image, image_file): image_file for image_file in image_files}
            for future in TQDM(as_completed(futures), total=len(futures), desc=f"Generating images for {subset}"):
                # Retrieve the result so an exception raised in a worker is reported instead of silently dropped
                try:
                    future.result()
                except Exception as e:  # worker boundary: report and keep generating the remaining images
                    failures += 1
                    LOGGER.warning(f"Failed to create synthetic image {futures[future]}: {e}")

    if failures:
        LOGGER.warning(f"Synthetic COCO dataset created with {failures} image(s) that could not be generated.")
    else:
        LOGGER.info("Synthetic COCO dataset created successfully.")