Skip to content

Commit

Permalink
update ymir-exc-sdk from 1.3.0 to 2.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
yzbx committed Feb 15, 2023
1 parent 62caa9a commit 53e3671
Show file tree
Hide file tree
Showing 8 changed files with 240 additions and 129 deletions.
56 changes: 40 additions & 16 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,25 @@
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from ymir_exc import monitor
from ymir_exc.util import YmirStage, get_merged_config, get_ymir_process, write_ymir_training_result
from ymir_exc.util import (YmirStage, get_merged_config,
write_ymir_monitor_process,
write_ymir_training_result)

from models.experimental import attempt_load
from models.yolo import Model
from utils.autoanchor import check_anchors
from utils.datasets import create_dataloader
from utils.general import (check_dataset, check_file, check_git_status, check_img_size, check_requirements, colorstr,
fitness, get_latest_run, increment_path, init_seeds, labels_to_class_weights,
labels_to_image_weights, one_cycle, print_mutation, set_logging, strip_optimizer)
from utils.general import (check_dataset, check_file, check_git_status,
check_img_size, check_requirements, colorstr,
fitness, get_latest_run, increment_path, init_seeds,
labels_to_class_weights, labels_to_image_weights,
one_cycle, print_mutation, set_logging,
strip_optimizer)
from utils.google_utils import attempt_download
from utils.loss import ComputeLoss, ComputeLossOTA
from utils.plots import plot_evolution, plot_images, plot_labels, plot_results
from utils.torch_utils import ModelEMA, intersect_dicts, is_parallel, select_device, torch_distributed_zero_first
from utils.torch_utils import (ModelEMA, intersect_dicts, is_parallel,
select_device, torch_distributed_zero_first)
from utils.wandb_logging.wandb_utils import WandbLogger, check_wandb_resume

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -208,7 +213,8 @@ def train(hyp, opt, device, tb_writer=None):
# Optimizer
if ckpt['optimizer'] is not None:
optimizer.load_state_dict(ckpt['optimizer'])
best_fitness = ckpt['best_fitness']
if opt.resume:
best_fitness = ckpt['best_fitness']

# EMA
if ema and ckpt.get('ema'):
Expand All @@ -223,7 +229,10 @@ def train(hyp, opt, device, tb_writer=None):
start_epoch = ckpt['epoch'] + 1
if opt.resume:
assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
if epochs < start_epoch:
else:
start_epoch = 0

if epochs <= start_epoch:
logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
(weights, ckpt['epoch'], epochs))
epochs += ckpt['epoch'] # finetune additional epochs
Expand Down Expand Up @@ -287,7 +296,7 @@ def train(hyp, opt, device, tb_writer=None):
# cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency
# model._initialize_biases(cf.to(device))
if plots:
#plot_labels(labels, names, save_dir, loggers)
# plot_labels(labels, names, save_dir, loggers)
if tb_writer:
tb_writer.add_histogram('classes', c, 0)

Expand Down Expand Up @@ -337,8 +346,8 @@ def train(hyp, opt, device, tb_writer=None):
model.train()

if rank in [-1, 0] and epoch % monitor_gap == 0:
monitor.write_monitor_logger(
percent=get_ymir_process(stage=YmirStage.TASK, p=(epoch - start_epoch) / (epochs - start_epoch)))
p = (epoch - start_epoch) / (epochs - start_epoch)
write_ymir_monitor_process(ymir_cfg, task='training', naive_stage_percent=p, stage=YmirStage.TASK)
# Update image weights (optional)
if opt.image_weights:
# Generate indices
Expand Down Expand Up @@ -507,16 +516,31 @@ def train(hyp, opt, device, tb_writer=None):

# Save last, best and delete
torch.save(ckpt, last)
write_ymir_training_result(ymir_cfg,
evaluation_result=dict(mAP=float(results[2]),
mAR=float(results[1]),
mAP50_95=float(results[3]),
P=float(results[0])),
id='last',
files=[str(last)])
if best_fitness == fi:
torch.save(ckpt, best)
if (best_fitness == fi) and (epoch >= 200):
torch.save(ckpt, wdir / 'best_{:03d}.pt'.format(epoch))
write_ymir_training_result(ymir_cfg,
evaluation_result=dict(mAP=float(results[2]),
mAR=float(results[1]),
mAP50_95=float(results[3]),
P=float(results[0])),
id='best',
files=[str(wdir / 'best_{:03d}.pt')])
if (epoch + 1) % opt.save_period == 0:
epoch_weight_file = wdir / 'epoch_{:03d}.pt'.format(epoch)
torch.save(ckpt, epoch_weight_file)
write_ymir_training_result(ymir_cfg,
map50=float(results[2]),
id=str(epoch),
evaluation_result=dict(mAP=float(results[2]),
mAR=float(results[1]),
mAP50_95=float(results[3]),
P=float(results[0])),
id=f'epoch_{epoch}',
files=[str(epoch_weight_file)])
elif epoch >= (epochs - 5):
torch.save(ckpt, wdir / 'epoch_{:03d}.pt'.format(epoch))
Expand All @@ -538,7 +562,7 @@ def train(hyp, opt, device, tb_writer=None):
[wandb_logger.wandb.Image(str(save_dir / f), caption=f) for f in files if (save_dir / f).exists()]
})
# Test best.pt
logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
logger.info('%g epochs completed in %.3f hours.\n' % (epochs - start_epoch + 1, (time.time() - t0) / 3600))
if opt.data.endswith('coco.yaml') and nc == 80: # if COCO
for m in (last, best) if best.exists() else (last): # speed, mAP tests
results, _, _ = test.test(opt.data,
Expand Down
171 changes: 69 additions & 102 deletions ymir/README.md
Original file line number Diff line number Diff line change
@@ -1,112 +1,79 @@
# ymir-yolov5
# ymir-yolov7 镜像说明文档

```
docker pull youdaoyzbx/ymir-executor:ymir1.1.0-yolov7-cu111-tmi
docker build -t ymir/executor:yolov7-cu111-tmi --build-arg YMIR=1.1.0 -f ymir/docker/cuda111.dockerfile .
```


## change log
- add `ymir` folder
- modify `train.py` to write `monitor.txt` and `result.yaml`
- modify `utils/datasets.py` to support ymir dataset format
## 代码仓库

> 参考[WongKinYiu/yolov7](https://github.com/WongKinYiu/yolov7)
- [modelai/ymir-yolov7](https://github.com/modelai/ymir-yolov7)

## support ymir dataset format
- use `convert_ymir_to_yolov5` to generate `data.yaml`
## 镜像地址

- modify `utils/datasets.py` load `train.tsv`, `val.tsv`
```
# support ymir index file `train.tsv` and `val.tsv`
# class LoadImagesAndLabels(Dataset):
f = [] # image files
img2label_map = dict() # map image files to label files
for p in path if isinstance(path, list) else [path]:
p = Path(p) # os-agnostic
if p.is_file(): # file
with open(p, 'r') as t:
t = t.read().strip().splitlines()
for x in t:
# x = f'{image_path}\t{label_path}\n'
image_path, label_path = x.split()
f.append(image_path)
img2label_map[image_path] = label_path
else:
raise Exception(f'{prefix}{p} is not valid ymir index file')
# get label file path from image file path
def img2label_paths(img_paths, img2label_map=None):
    """Return the label-file path for every image path, in input order.

    Args:
        img_paths: iterable of image file paths.
        img2label_map: mapping from image path to label path. When omitted,
            an empty mapping is used.

    Returns:
        A list with the mapped label path for each entry of ``img_paths``.

    Raises:
        KeyError: if an image path has no entry in ``img2label_map``.
    """
    # Use a None sentinel instead of a ``{}`` default so that no dict object
    # is shared between calls.
    if img2label_map is None:
        img2label_map = {}
    return [img2label_map[img] for img in img_paths]
self.label_files = img2label_paths(self.img_files, img2label_map)
# support ymir label file
# convert ymir (xyxy) to yolov5 bbox format normalized (xc,yc,w,h)
# def cache_labels()
l_ymir = np.array(l, dtype=np.float32)
l = l_ymir.copy()
width, height = imagesize.get(im_file)
l[:,1:5:2] /= width # normalize x1,x2
l[:,2:5:2] /= height # normalize y1,y2
l[:,1] = (l_ymir[:,1] + l_ymir[:,3])/2/width
l[:,2] = (l_ymir[:,2] + l_ymir[:,4])/2/height
l[:,3] = (l_ymir[:,3] - l_ymir[:,1])/width
l[:,4] = (l_ymir[:,4] - l_ymir[:,2])/height
```

## write monitor and training results
youdaoyzbx/ymir-executor:ymir2.1.0-yolov7-cu111-tmi
modify `train.py`
- import functions
```
from ymir.ymir_yolov5 import get_merged_config, YmirStage, get_ymir_process
from ymir_exc import monitor
from ymir_exc import result_writer as rw
# 注意:此版本加载预训练权重时,设置的 epochs 必须大于预训练权重已训练的 epoch 数,否则会出错。
youdaoyzbx/ymir-executor:ymir2.0.0-yolov7-cu111-tmi
```

- write tensorboard results
## 性能表现

> 数据参考[WongKinYiu/yolov7](https://github.com/WongKinYiu/yolov7)
| Model | Test Size | AP<sup>test</sup> | AP<sub>50</sub><sup>test</sup> | AP<sub>75</sub><sup>test</sup> | batch 1 fps | batch 32 average time |
| :-- | :-: | :-: | :-: | :-: | :-: | :-: |
| [**YOLOv7**](https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt) | 640 | **51.4%** | **69.7%** | **55.9%** | 161 *fps* | 2.8 *ms* |
| [**YOLOv7-X**](https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7x.pt) | 640 | **53.1%** | **71.2%** | **57.8%** | 114 *fps* | 4.3 *ms* |
| | | | | | | |
| [**YOLOv7-W6**](https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-w6.pt) | 1280 | **54.9%** | **72.6%** | **60.1%** | 84 *fps* | 7.6 *ms* |
| [**YOLOv7-E6**](https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-e6.pt) | 1280 | **56.0%** | **73.5%** | **61.2%** | 56 *fps* | 12.3 *ms* |
| [**YOLOv7-D6**](https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-d6.pt) | 1280 | **56.6%** | **74.0%** | **61.8%** | 44 *fps* | 15.0 *ms* |
| [**YOLOv7-E6E**](https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-e6e.pt) | 1280 | **56.8%** | **74.4%** | **62.1%** | 36 *fps* | 18.7 *ms* |


## 训练参数

| 超参数 | 默认值 | 类型 | 说明 | 建议 |
| - | - | - | - | - |
| hyper-parameter | default value | type | note | advice |
| shm_size | 128G | 字符串| 受ymir后台处理,docker image 可用共享内存 | 建议大小:镜像占用GPU数 * 32G |
| export_format | ark:raw | 字符串| 受ymir后台处理,ymir数据集导出格式 | - |
| model | yolov5s | 字符串 | yolov5模型,可选yolov5n, yolov5s, yolov5m, yolov5l等 | 建议:速度快选yolov5n, 精度高选yolov5l, yolov5x, 平衡选yolov5s或yolov5m |
| batch_size_per_gpu | 16 | 整数 | 每张GPU一次处理的图片数量 | 建议大小:显存占用<50% 可增加2倍加快训练速度 |
| workers_per_gpu | 4 | 整数 | 每张GPU对应的数据读取进程数 | - |
| epochs | 100 | 整数 | 整个数据集的训练遍历次数 | 建议:必要时分析tensorboard确定是否有必要改变,一般采用默认值即可 |
| img_size | 640 | 整数 | 输入模型的图像分辨率 | - |
| args_options | '--exist-ok' | 字符串 | yolov5命令行参数 | 建议:专业用户可用yolov5所有命令行参数 |
| save_weight_file_num | 1 | 整数 | 保存最新模型的数量 | - |
| sync_bn | False | 布尔型 | 是否同步各gpu上的归一化层 | 建议:开启以提高训练稳定性及精度 |
| cfg_file | cfg/training/yolov7-tiny.yaml | 文件路径 | 模型文件路径, 对应 `--cfg` | 参考[cfg/training](https://github.com/modelai/ymir-yolov7/tree/ymir/cfg/training) |
| hyp_file | data/hyp.scratch.tiny.yaml | 文件路径 | 超参数文件路径,对应 `--hyp` | 参考[data](https://github.com/modelai/ymir-yolov7/tree/ymir/data) |
| cache_images | True | 布尔 | 是否缓存图像 | 设置为True可加快训练速度 |


## 推理参数

| 超参数 | 默认值 | 类型 | 说明 | 建议 |
| - | - | - | - | - |
| hyper-parameter | default value | type | note | advice |
| img_size | 640 | 整数 | 模型的输入图像大小 | 建议采用32的整数倍,且不小于224(即32*7) |
| conf_thres | 0.25 | 浮点数 | 置信度阈值 | 采用默认值 |
| iou_thres | 0.45 | 浮点数 | nms时的iou阈值 | 采用默认值 |

## 挖掘参数

| 超参数 | 默认值 | 类型 | 说明 | 建议 |
| - | - | - | - | - |
| hyper-parameter | default value | type | note | advice |
| shm_size | 128G | 字符串| 受ymir后台处理,docker image 可用共享内存 | 建议大小:镜像占用GPU数 * 32G |
| img_size | 640 | 整数 | 模型的输入图像大小 | 建议采用32的整数倍,且不小于224(即32*7) |
| conf_thres | 0.25 | 浮点数 | 置信度阈值 | 采用默认值 |
| iou_thres | 0.45 | 浮点数 | nms时的iou阈值 | 采用默认值 |

## 引用
```
# change tensorboard writer log_dir
ymir_cfg = get_merged_config()
tb_writer = SummaryWriter(ymir_cfg.ymir.output.tensorboard_dir) # Tensorboard
```

- monitor training process
@article{wang2022yolov7,
title={{YOLOv7}: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors},
author={Wang, Chien-Yao and Bochkovskiy, Alexey and Liao, Hong-Yuan Mark},
journal={arXiv preprint arXiv:2207.02696},
year={2022}
}
```
# for each epoch
monitor_gap = max(1, (epochs - start_epoch)//1000)
if rank in [-1, 0] and epoch % monitor_gap == 0:
monitor.write_monitor_logger(percent=get_ymir_process(stage=YmirStage.TASK, p=(epoch-start_epoch)/(epochs-start_epoch)))
```

- write `result.yaml` to save model weights and map50
- optional: modify `utils/metrics.py` fitness() to save best map50
```
def fitness(x):
    """Score each row of ``x`` as a weighted sum of its first four columns.

    The four columns are weighted as [P, R, mAP@0.5, mAP@0.5:0.95]. Only
    mAP@0.5 carries a non-zero weight here, so the score equals that column.
    """
    metric_weights = [0.0, 0.0, 1.0, 0.0]  # [P, R, mAP@0.5, mAP@0.5:0.95]
    leading_metrics = x[:, :4]
    return (leading_metrics * metric_weights).sum(1)
```
```
if (epoch + 1) % opt.save_period == 0:
epoch_weight_file = wdir / 'epoch_{:03d}.pt'.format(epoch)
torch.save(ckpt, epoch_weight_file)
write_ymir_training_result(ymir_cfg, map50=float(results[2]), id=str(epoch), files=[str(epoch_weight_file)])
```

- modify `start.py` save other output files
```
# save other files in output directory
write_ymir_training_result(cfg, map50=0, id='last', files=[])
# if task done, write 100% percent log
monitor.write_monitor_logger(percent=1.0)
```

## infer and mining
- see `_run_infer()` in `ymir/start.py` for inference
- see `_run_mining()` in `ymir/start.py` for mining

## TODO
- [ ] multi-gpu mining and infer
- [ ] batch mining and infer
Loading

0 comments on commit 53e3671

Please sign in to comment.