Reformat Markdown code blocks (#12795)

Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
This commit is contained in:
Glenn Jocher 2024-05-18 18:58:06 +02:00 committed by GitHub
parent 2af71d15a6
commit fceea033ad
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
128 changed files with 1067 additions and 1018 deletions

View file

@@ -62,36 +62,36 @@ Without further ado, let's dive in!
```python
import datetime
import shutil
from pathlib import Path
from collections import Counter
from pathlib import Path
import yaml
import numpy as np
import pandas as pd
from ultralytics import YOLO
import yaml
from sklearn.model_selection import KFold
from ultralytics import YOLO
```
2. Proceed to retrieve all label files for your dataset.
```python
dataset_path = Path('./Fruit-detection') # replace with 'path/to/dataset' for your custom data
labels = sorted(dataset_path.rglob("*labels/*.txt")) # all data in 'labels'
dataset_path = Path("./Fruit-detection") # replace with 'path/to/dataset' for your custom data
labels = sorted(dataset_path.rglob("*labels/*.txt")) # all data in 'labels'
```
3. Now, read the contents of the dataset YAML file and extract the indices of the class labels.
```python
yaml_file = 'path/to/data.yaml' # your data YAML with data directories and names dictionary
with open(yaml_file, 'r', encoding="utf8") as y:
classes = yaml.safe_load(y)['names']
yaml_file = "path/to/data.yaml" # your data YAML with data directories and names dictionary
with open(yaml_file, "r", encoding="utf8") as y:
classes = yaml.safe_load(y)["names"]
cls_idx = sorted(classes.keys())
```
4. Initialize an empty `pandas` DataFrame.
```python
indx = [l.stem for l in labels] # uses base filename as ID (no extension)
indx = [l.stem for l in labels] # uses base filename as ID (no extension)
labels_df = pd.DataFrame([], columns=cls_idx, index=indx)
```
@@ -101,16 +101,16 @@ Without further ado, let's dive in!
for label in labels:
lbl_counter = Counter()
with open(label,'r') as lf:
with open(label, "r") as lf:
lines = lf.readlines()
for l in lines:
# classes for YOLO label uses integer at first position of each line
lbl_counter[int(l.split(' ')[0])] += 1
lbl_counter[int(l.split(" ")[0])] += 1
labels_df.loc[label.stem] = lbl_counter
labels_df = labels_df.fillna(0.0) # replace `nan` values with `0.0`
labels_df = labels_df.fillna(0.0) # replace `nan` values with `0.0`
```
6. The following is a sample view of the populated DataFrame:
@@ -142,7 +142,7 @@ The rows index the label files, each corresponding to an image in your dataset,
```python
ksplit = 5
kf = KFold(n_splits=ksplit, shuffle=True, random_state=20) # setting random_state for repeatable results
kf = KFold(n_splits=ksplit, shuffle=True, random_state=20) # setting random_state for repeatable results
kfolds = list(kf.split(labels_df))
```
@@ -150,12 +150,12 @@ The rows index the label files, each corresponding to an image in your dataset,
2. The dataset has now been split into `k` folds, each having a list of `train` and `val` indices. We will construct a DataFrame to display these results more clearly.
```python
folds = [f'split_{n}' for n in range(1, ksplit + 1)]
folds = [f"split_{n}" for n in range(1, ksplit + 1)]
folds_df = pd.DataFrame(index=indx, columns=folds)
for idx, (train, val) in enumerate(kfolds, start=1):
folds_df[f'split_{idx}'].loc[labels_df.iloc[train].index] = 'train'
folds_df[f'split_{idx}'].loc[labels_df.iloc[val].index] = 'val'
folds_df[f"split_{idx}"].loc[labels_df.iloc[train].index] = "train"
folds_df[f"split_{idx}"].loc[labels_df.iloc[val].index] = "val"
```
3. Now we will calculate the distribution of class labels for each fold as a ratio of the classes present in `val` to those present in `train`.
@@ -168,8 +168,8 @@ The rows index the label files, each corresponding to an image in your dataset,
val_totals = labels_df.iloc[val_indices].sum()
# To avoid division by zero, we add a small value (1E-7) to the denominator
ratio = val_totals / (train_totals + 1E-7)
fold_lbl_distrb.loc[f'split_{n}'] = ratio
ratio = val_totals / (train_totals + 1e-7)
fold_lbl_distrb.loc[f"split_{n}"] = ratio
```
The ideal scenario is for all class ratios to be reasonably similar for each split and across classes. This, however, will be subject to the specifics of your dataset.
@@ -177,17 +177,17 @@ The rows index the label files, each corresponding to an image in your dataset,
4. Next, we create the directories and dataset YAML files for each split.
```python
supported_extensions = ['.jpg', '.jpeg', '.png']
supported_extensions = [".jpg", ".jpeg", ".png"]
# Initialize an empty list to store image file paths
images = []
# Loop through supported extensions and gather image files
for ext in supported_extensions:
images.extend(sorted((dataset_path / 'images').rglob(f"*{ext}")))
images.extend(sorted((dataset_path / "images").rglob(f"*{ext}")))
# Create the necessary directories and dataset YAML files (unchanged)
save_path = Path(dataset_path / f'{datetime.date.today().isoformat()}_{ksplit}-Fold_Cross-val')
save_path = Path(dataset_path / f"{datetime.date.today().isoformat()}_{ksplit}-Fold_Cross-val")
save_path.mkdir(parents=True, exist_ok=True)
ds_yamls = []
@@ -195,22 +195,25 @@ The rows index the label files, each corresponding to an image in your dataset,
# Create directories
split_dir = save_path / split
split_dir.mkdir(parents=True, exist_ok=True)
(split_dir / 'train' / 'images').mkdir(parents=True, exist_ok=True)
(split_dir / 'train' / 'labels').mkdir(parents=True, exist_ok=True)
(split_dir / 'val' / 'images').mkdir(parents=True, exist_ok=True)
(split_dir / 'val' / 'labels').mkdir(parents=True, exist_ok=True)
(split_dir / "train" / "images").mkdir(parents=True, exist_ok=True)
(split_dir / "train" / "labels").mkdir(parents=True, exist_ok=True)
(split_dir / "val" / "images").mkdir(parents=True, exist_ok=True)
(split_dir / "val" / "labels").mkdir(parents=True, exist_ok=True)
# Create dataset YAML files
dataset_yaml = split_dir / f'{split}_dataset.yaml'
dataset_yaml = split_dir / f"{split}_dataset.yaml"
ds_yamls.append(dataset_yaml)
with open(dataset_yaml, 'w') as ds_y:
yaml.safe_dump({
'path': split_dir.as_posix(),
'train': 'train',
'val': 'val',
'names': classes
}, ds_y)
with open(dataset_yaml, "w") as ds_y:
yaml.safe_dump(
{
"path": split_dir.as_posix(),
"train": "train",
"val": "val",
"names": classes,
},
ds_y,
)
```
5. Lastly, copy images and labels into the respective directory ('train' or 'val') for each split.
@@ -221,8 +224,8 @@ The rows index the label files, each corresponding to an image in your dataset,
for image, label in zip(images, labels):
for split, k_split in folds_df.loc[image.stem].items():
# Destination directory
img_to_path = save_path / split / k_split / 'images'
lbl_to_path = save_path / split / k_split / 'labels'
img_to_path = save_path / split / k_split / "images"
lbl_to_path = save_path / split / k_split / "labels"
# Copy image and label files to new directory (SamefileError if file already exists)
shutil.copy(image, img_to_path / image.name)
@@ -243,8 +246,8 @@ fold_lbl_distrb.to_csv(save_path / "kfold_label_distribution.csv")
1. First, load the YOLO model.
```python
weights_path = 'path/to/weights.pt'
model = YOLO(weights_path, task='detect')
weights_path = "path/to/weights.pt"
model = YOLO(weights_path, task="detect")
```
2. Next, iterate over the dataset YAML files to run training. The results will be saved to a directory specified by the `project` and `name` arguments. By default, this directory is 'exp/runs#' where # is an integer index.
@@ -254,12 +257,12 @@ fold_lbl_distrb.to_csv(save_path / "kfold_label_distribution.csv")
# Define your additional arguments here
batch = 16
project = 'kfold_demo'
project = "kfold_demo"
epochs = 100
for k in range(ksplit):
dataset_yaml = ds_yamls[k]
model.train(data=dataset_yaml,epochs=epochs, batch=batch, project=project) # include any train arguments
model.train(data=dataset_yaml, epochs=epochs, batch=batch, project=project) # include any train arguments
results[k] = model.metrics # save output metrics for further analysis
```