new check_dataset functions (#43)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent d143ac666f
commit 1f3aad86c1
13 changed files with 336 additions and 62 deletions
@@ -1,11 +1,22 @@
import contextlib
import hashlib
import os
import subprocess
import time
from pathlib import Path
from tarfile import is_tarfile
from zipfile import is_zipfile

import cv2
import numpy as np
import torch
from PIL import ExifTags, Image, ImageOps

from ultralytics.yolo.utils import LOGGER, ROOT, colorstr
from ultralytics.yolo.utils.checks import check_file, check_font, is_ascii
from ultralytics.yolo.utils.downloads import download
from ultralytics.yolo.utils.files import unzip_file, yaml_load

from ..utils.ops import segments2boxes

HELP_URL = "See https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data"

@@ -176,3 +187,89 @@ def polygons2masks_overlap(img_size, segments, downsample_ratio=1):
        masks = masks + mask
        masks = np.clip(masks, a_min=0, a_max=i + 1)
    return masks, index


def check_dataset_yaml(data, autodownload=True):
    # Download, check and/or unzip dataset if not found locally
    data = check_file(data)
    DATASETS_DIR = Path.cwd() / "../datasets"
    # Download (optional)
    extract_dir = ''
    if isinstance(data, (str, Path)) and (is_zipfile(data) or is_tarfile(data)):
        download(data, dir=f'{DATASETS_DIR}/{Path(data).stem}', unzip=True, delete=False, curl=False, threads=1)
        data = next((DATASETS_DIR / Path(data).stem).rglob('*.yaml'))
        extract_dir, autodownload = data.parent, False
    # Read yaml (optional)
    if isinstance(data, (str, Path)):
        data = yaml_load(data)  # dictionary

    # Checks
    for k in 'train', 'val', 'names':
        assert k in data, f"data.yaml '{k}:' field missing ❌"
    if isinstance(data['names'], (list, tuple)):  # old array format
        data['names'] = dict(enumerate(data['names']))  # convert to dict
    data['nc'] = len(data['names'])

    # Resolve paths
    path = Path(extract_dir or data.get('path') or '')  # optional 'path' default to '.'
    if not path.is_absolute():
        path = (Path.cwd() / path).resolve()
    data['path'] = path  # download scripts
    for k in 'train', 'val', 'test':
        if data.get(k):  # prepend path
            if isinstance(data[k], str):
                x = (path / data[k]).resolve()
                if not x.exists() and data[k].startswith('../'):
                    x = (path / data[k][3:]).resolve()
                data[k] = str(x)
            else:
                data[k] = [str((path / x).resolve()) for x in data[k]]

    # Parse yaml
    train, val, test, s = (data.get(x) for x in ('train', 'val', 'test', 'download'))
    if val:
        val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path
        if not all(x.exists() for x in val):
            LOGGER.info('\nDataset not found ⚠️, missing paths %s' % [str(x) for x in val if not x.exists()])
            if not s or not autodownload:
                raise Exception('Dataset not found ❌')
            t = time.time()
            if s.startswith('http') and s.endswith('.zip'):  # URL
                f = Path(s).name  # filename
                LOGGER.info(f'Downloading {s} to {f}...')
                torch.hub.download_url_to_file(s, f)
                Path(DATASETS_DIR).mkdir(parents=True, exist_ok=True)  # create root
                unzip_file(f, path=DATASETS_DIR)  # unzip
                Path(f).unlink()  # remove zip
                r = None  # success
            elif s.startswith('bash '):  # bash script
                LOGGER.info(f'Running {s} ...')
                r = os.system(s)
            else:  # python script
                r = exec(s, {'yaml': data})  # return None
            dt = f'({round(time.time() - t, 1)}s)'
            s = f"success ✅ {dt}, saved to {colorstr('bold', DATASETS_DIR)}" if r in (0, None) else f"failure {dt} ❌"
            LOGGER.info(f"Dataset download {s}")
    check_font('Arial.ttf' if is_ascii(data['names']) else 'Arial.Unicode.ttf', progress=True)  # download fonts
    return data  # dictionary

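# Illustrative usage sketch of the YAML path above; 'coco128.yaml' is an assumed example
# dataset file, any detection/segmentation data YAML is handled the same way:
#   data = check_dataset_yaml('coco128.yaml', autodownload=True)
#   data['nc'], data['names']    # class count and id->name dict
#   data['train'], data['val']   # resolved absolute paths
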
def check_dataset(dataset: str):
    data = Path.cwd() / "datasets" / dataset
    data_dir = data if data.is_dir() else (Path.cwd() / data)
    if not data_dir.is_dir():
        LOGGER.info(f'\nDataset not found ⚠️, missing path {data_dir}, attempting download...')
        t = time.time()
        if dataset == 'imagenet':  # compare the dataset name, since data is already a resolved path
            subprocess.run(f"bash {ROOT / 'data/scripts/get_imagenet.sh'}", shell=True, check=True)
        else:
            url = f'https://github.com/ultralytics/yolov5/releases/download/v1.0/{dataset}.zip'
            download(url, dir=data_dir.parent)
        s = f"Dataset download success ✅ ({time.time() - t:.1f}s), saved to {colorstr('bold', data_dir)}\n"
        LOGGER.info(s)
    train_set = data_dir / "train"
    test_set = data_dir / 'test' if (data_dir / 'test').exists() else data_dir / 'val'  # data/test or data/val
    nc = len([x for x in (data_dir / 'train').glob('*') if x.is_dir()])  # number of classes
    names = [name for name in os.listdir(data_dir / 'train') if os.path.isdir(data_dir / 'train' / name)]
    data = {"train": train_set, "val": test_set, "nc": nc, "names": names}
    return data
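A minimal usage sketch of the classification path, assuming a folder-style dataset name such as 'imagenette160' (the name is illustrative; any dataset with class subfolders under train/ and test/ or val/ works):

    data = check_dataset('imagenette160')  # downloads into ./datasets/ if missing
    print(data['nc'], data['names'])       # number of classes and class-folder names
    print(data['train'], data['val'])      # train and test/val directory paths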