new check_dataset functions (#43)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent d143ac666f
commit 1f3aad86c1
13 changed files with 336 additions and 62 deletions
@@ -1,11 +1,22 @@
import contextlib
import hashlib
import os
import subprocess
import time
from pathlib import Path
from tarfile import is_tarfile
from zipfile import is_zipfile

import cv2
import numpy as np
import torch
from PIL import ExifTags, Image, ImageOps

from ultralytics.yolo.utils import LOGGER, ROOT, colorstr
from ultralytics.yolo.utils.checks import check_file, check_font, is_ascii
from ultralytics.yolo.utils.downloads import download
from ultralytics.yolo.utils.files import unzip_file, yaml_load

from ..utils.ops import segments2boxes

HELP_URL = "See https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data"

@@ -176,3 +187,89 @@ def polygons2masks_overlap(img_size, segments, downsample_ratio=1):
        masks = masks + mask
        masks = np.clip(masks, a_min=0, a_max=i + 1)
    return masks, index


def check_dataset_yaml(data, autodownload=True):
    # Download, check and/or unzip dataset if not found locally
    data = check_file(data)
    DATASETS_DIR = Path.cwd() / "../datasets"
    # Download (optional)
    extract_dir = ''
    if isinstance(data, (str, Path)) and (is_zipfile(data) or is_tarfile(data)):
        download(data, dir=f'{DATASETS_DIR}/{Path(data).stem}', unzip=True, delete=False, curl=False, threads=1)
        data = next((DATASETS_DIR / Path(data).stem).rglob('*.yaml'))
        extract_dir, autodownload = data.parent, False
    # Read yaml (optional)
    if isinstance(data, (str, Path)):
        data = yaml_load(data)  # dictionary

    # Checks
    for k in 'train', 'val', 'names':
        assert k in data, f"data.yaml '{k}:' field missing ❌"
    if isinstance(data['names'], (list, tuple)):  # old array format
        data['names'] = dict(enumerate(data['names']))  # convert to dict
    data['nc'] = len(data['names'])

    # Resolve paths
    path = Path(extract_dir or data.get('path') or '')  # optional 'path' default to '.'
    if not path.is_absolute():
        path = (Path.cwd() / path).resolve()
    data['path'] = path  # download scripts
    for k in 'train', 'val', 'test':
        if data.get(k):  # prepend path
            if isinstance(data[k], str):
                x = (path / data[k]).resolve()
                if not x.exists() and data[k].startswith('../'):
                    x = (path / data[k][3:]).resolve()
                data[k] = str(x)
            else:
                data[k] = [str((path / x).resolve()) for x in data[k]]

    # Parse yaml
    train, val, test, s = (data.get(x) for x in ('train', 'val', 'test', 'download'))
    if val:
        val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path
        if not all(x.exists() for x in val):
            LOGGER.info('\nDataset not found ⚠️, missing paths %s' % [str(x) for x in val if not x.exists()])
            if not s or not autodownload:
                raise Exception('Dataset not found ❌')
            t = time.time()
            if s.startswith('http') and s.endswith('.zip'):  # URL
                f = Path(s).name  # filename
                LOGGER.info(f'Downloading {s} to {f}...')
                torch.hub.download_url_to_file(s, f)
                Path(DATASETS_DIR).mkdir(parents=True, exist_ok=True)  # create root
                unzip_file(f, path=DATASETS_DIR)  # unzip
                Path(f).unlink()  # remove zip
                r = None  # success
            elif s.startswith('bash '):  # bash script
                LOGGER.info(f'Running {s} ...')
                r = os.system(s)
            else:  # python script
                r = exec(s, {'yaml': data})  # return None
            dt = f'({round(time.time() - t, 1)}s)'
            s = f"success ✅ {dt}, saved to {colorstr('bold', DATASETS_DIR)}" if r in (0, None) else f"failure {dt} ❌"
            LOGGER.info(f"Dataset download {s}")
    check_font('Arial.ttf' if is_ascii(data['names']) else 'Arial.Unicode.ttf', progress=True)  # download fonts
    return data  # dictionary

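# Illustrative usage sketch of the YAML path above; 'coco128.yaml' is an assumed example
# dataset file, any detection/segmentation data YAML is handled the same way:
#   data = check_dataset_yaml('coco128.yaml', autodownload=True)
#   data['nc'], data['names']    # class count and id->name dict
#   data['train'], data['val']   # resolved absolute paths
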
def check_dataset(dataset: str):
    data = Path.cwd() / "datasets" / dataset
    data_dir = data if data.is_dir() else (Path.cwd() / data)
    if not data_dir.is_dir():
        LOGGER.info(f'\nDataset not found ⚠️, missing path {data_dir}, attempting download...')
        t = time.time()
        if dataset == 'imagenet':  # compare the dataset name, since data is already a resolved path
            subprocess.run(f"bash {ROOT / 'data/scripts/get_imagenet.sh'}", shell=True, check=True)
        else:
            url = f'https://github.com/ultralytics/yolov5/releases/download/v1.0/{dataset}.zip'
            download(url, dir=data_dir.parent)
        s = f"Dataset download success ✅ ({time.time() - t:.1f}s), saved to {colorstr('bold', data_dir)}\n"
        LOGGER.info(s)
    train_set = data_dir / "train"
    test_set = data_dir / 'test' if (data_dir / 'test').exists() else data_dir / 'val'  # data/test or data/val
    nc = len([x for x in (data_dir / 'train').glob('*') if x.is_dir()])  # number of classes
    names = [name for name in os.listdir(data_dir / 'train') if os.path.isdir(data_dir / 'train' / name)]
    data = {"train": train_set, "val": test_set, "nc": nc, "names": names}
    return data
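A minimal usage sketch of the classification path, assuming a folder-style dataset name such as 'imagenette160' (the name is illustrative; any dataset with class subfolders under train/ and test/ or val/ works):

    data = check_dataset('imagenette160')  # downloads into ./datasets/ if missing
    print(data['nc'], data['names'])       # number of classes and class-folder names
    print(data['train'], data['val'])      # train and test/val directory paths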