YOLO系列 --- YOLOV7算法(三):YOLO V7算法train.py代码解析

时间:2022-12-11 03:30:01

YOLO系列 — YOLOV7算法(3):YOLO V7算法train.py代码解析


    # Build the command-line interface for a training run and parse it into `opt`.
    # The trailing notes are translated from the article's per-option annotations.
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default='', help='initial weights path')  # initial weights file; point it at a pretrained model to load it
    parser.add_argument('--cfg', type=str, default=r'E:\work\People_Detect\yolov7-main\cfg\training\yolov7x.yaml', help='model.yaml path')  # network architecture config file
    parser.add_argument('--data', type=str, default='data/custom_data.yaml', help='data.yaml path')  # training dataset config file
    parser.add_argument('--hyp', type=str, default='data/hyp.scratch.p5.yaml', help='hyperparameters path')  # hyperparameter config file
    parser.add_argument('--epochs', type=int, default=20)  # number of training epochs
    parser.add_argument('--batch-size', type=int, default=2, help='total batch size for all GPUs')  # training batch size
    # nargs must be '+' (one or more ints); a blank nargs value is rejected by argparse.
    parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='[train, test] image sizes')  # training image size
    parser.add_argument('--rect', action='store_true', help='rectangular training')  # rectangular training, default False
    parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training')  # resume training; a bare flag (True) means "find the most recent run"
    parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')  # skip intermediate weight files, default False
    parser.add_argument('--notest', action='store_true', help='only test final epoch')  # skip per-epoch testing, default False
    parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')  # do not auto-adjust anchors, default False
    parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters')  # hyperparameter evolution, default False; lengthens training and is usually unnecessary
    parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')  # Google Cloud bucket, rarely used
    parser.add_argument('--cache-images', action='store_true', help='cache images for faster training')  # cache the training data up front, default False
    parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training')  # pick training images by image weight
    parser.add_argument('--device', default='cpu', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')  # device used for training
    parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')  # multi-scale training, default False
    parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class')  # treat the dataset as having a single class
    parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer')  # use the Adam optimizer
    parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')  # cross-GPU synchronized BN, DDP mode only
    parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')  # GPU index
    parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')  # maximum number of dataloader workers
    parser.add_argument('--project', default='runs/train', help='save to project/name')  # where training results are saved
    parser.add_argument('--entity', default=None, help='W&B entity')  # wandb-related, can usually be ignored
    parser.add_argument('--name', default='exp', help='save to project/name')  # name of the results folder
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')  # reuse the results folder if it already exists instead of creating a new one
    parser.add_argument('--quad', action='store_true', help='quad dataloader')  # original note: a compromise between speed and accuracy
    parser.add_argument('--linear-lr', action='store_true', help='linear LR')  # learning-rate schedule; original note: default False means cosine decay
    parser.add_argument('--label-smoothing', type=float, default=0.0, help='Label smoothing epsilon')  # label smoothing, to reduce overfitting
    parser.add_argument('--upload_dataset', action='store_true', help='Upload dataset as W&B artifact table')  # wandb-related
    parser.add_argument('--bbox_interval', type=int, default=-1, help='Set bounding-box image logging interval for W&B')  # wandb-related
    parser.add_argument('--save_period', type=int, default=-1, help='Log model after every "save_period" epoch')  # model logging interval in epochs
    parser.add_argument('--artifact_alias', type=str, default="latest", help='version of dataset artifact to be used')  # original note: planned but not yet implemented, can be ignored
    opt = parser.parse_args()


1.parser = argparse.ArgumentParser()
2.parser.add_argument(...)
3.opt = parser.parse_args()

    # Set DDP variables
    # Read the process count and this process's global rank from the environment;
    # when the variables are unset, fall back to 1 process and rank -1.
    opt.world_size = int(os.environ.get('WORLD_SIZE', 1))
    opt.global_rank = int(os.environ.get('RANK', -1))


    # Resume
    # Either restore the options of an interrupted run, or validate the options
    # of a fresh run and allocate its output directory.
    wandb_run = check_wandb_resume(opt)
    if opt.resume and not wandb_run:  # resume an interrupted run
        # --resume may carry an explicit checkpoint path; a bare flag means "most recent run".
        ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run()  # specified or most recent path
        assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
        apriori = opt.global_rank, opt.local_rank  # keep this process's ranks; opt is replaced below
        with open(Path(ckpt).parent.parent / 'opt.yaml') as f:
            opt = argparse.Namespace(**yaml.load(f, Loader=yaml.SafeLoader))  # replace
        opt.cfg, opt.weights, opt.resume, opt.batch_size, opt.global_rank, opt.local_rank = '', ckpt, True, opt.total_batch_size, *apriori  # reinstate
        logger.info('Resuming training from %s' % ckpt)
    else:
        # Fresh run. This branch is the only place opt.save_dir is assigned, so it
        # must run whenever the resume branch above does not.
        # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml')
        opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp)  # check files
        assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
        opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size)))  # extend to 2 sizes (train, test)
        opt.name = 'evolve' if opt.evolve else opt.name
        opt.save_dir = increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok | opt.evolve)  # increment run


    # DDP mode
    # Record the batch size requested for all processes before it is split up.
    opt.total_batch_size = opt.batch_size
    device = select_device(opt.device, batch_size=opt.batch_size)
    is_distributed = opt.local_rank != -1
    if is_distributed:
        assert torch.cuda.device_count() > opt.local_rank
        device = torch.device('cuda', opt.local_rank)
        dist.init_process_group(backend='nccl', init_method='env://')  # distributed backend
        assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
        # Each process handles an equal share of the total batch.
        opt.batch_size = opt.total_batch_size // opt.world_size


    # Hyperparameters
    # Parse the hyperparameter YAML named by --hyp with the safe loader.
    with open(opt.hyp) as hyp_file:
        hyp = yaml.safe_load(hyp_file)  # load hyps


    # Train
    # Plain (non-evolve) run: set up the TensorBoard writer, then start training.
    if not opt.evolve:
        tb_writer = None  # init loggers
        if opt.global_rank in [-1, 0]:  # only these ranks create the writer
            prefix = colorstr('tensorboard: ')
            logger.info(f"{prefix}Start with 'tensorboard --logdir {opt.project}', view at http://localhost:6006/")
            tb_writer = SummaryWriter(opt.save_dir)  # Tensorboard
        train(hyp, opt, device, tb_writer)


    # Log every hyperparameter as name=value, then unpack the run options that
    # the code below refers to by short local names.
    logger.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items()))
    save_dir, epochs, batch_size, total_batch_size, weights, rank = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank


    # Directories
    # Checkpoints go under <save_dir>/weights; create it (and parents) if needed.
    wdir = save_dir / 'weights'
    wdir.mkdir(parents=True, exist_ok=True)  # make dir
    last, best = wdir / 'last.pt', wdir / 'best.pt'
    results_file = save_dir / 'results.txt'


    # Save run settings
    # Dump the hyperparameters and the parsed options next to the run outputs,
    # keeping the keys in their existing order.
    for settings_name, settings in (('hyp.yaml', hyp), ('opt.yaml', vars(opt))):
        with open(save_dir / settings_name, 'w') as f:
            yaml.dump(settings, f, sort_keys=False)


    # Configure
    plots = not opt.evolve  # create plots
    cuda = not device.type == 'cpu'
    init_seeds(2 + rank)  # seed is offset by the process rank
    with open(opt.data) as data_file:
        data_dict = yaml.safe_load(data_file)  # data dict
    is_coco = opt.data.endswith('coco.yaml')


    # Logging: only ranks -1 and 0 create the W&B logger.
    loggers = {'wandb': None}  # loggers dict
    if rank in [-1, 0]:
        opt.hyp = hyp  # add hyperparameters
        # Reuse the W&B run id stored in an existing local checkpoint, if there is one.
        if weights.endswith('.pt') and os.path.isfile(weights):
            run_id = torch.load(weights).get('wandb_id')
        else:
            run_id = None
        wandb_logger = WandbLogger(opt, Path(opt.save_dir).stem, run_id, data_dict)
        loggers['wandb'] = wandb_logger.wandb
        data_dict = wandb_logger.data_dict
        if wandb_logger.wandb:
            weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp  # WandbLogger might update weights, epochs if resuming

    # Class count and class names; single-class mode forces nc to 1.
    if opt.single_cls:
        nc = 1
    else:
        nc = int(data_dict['nc'])  # number of classes
    names = data_dict['names']  # class names
    if opt.single_cls and len(names) != 1:
        names = ['item']  # collapse a multi-name dataset into one placeholder class
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data)  # check

这里主要是进行训练类别的计算。如果自定义数据集不止一个类别,但是又不小心将 single_cls 设置为 True 的话,其实代码是不会报错的,但是这样在测试的时候就不会正确地显示类别,所有的类别都会变成 "item"。

    # Model
    # Build the network: start from a checkpoint when `weights` is a .pt file,
    # otherwise construct it from the --cfg YAML alone.
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
        # Skip the checkpoint's anchor entries when anchors come from cfg/hyp on a non-resumed run.
        exclude = ['anchor'] if (opt.cfg or hyp.get('anchors')) and not opt.resume else []  # exclude keys
        state_dict = ckpt['model'].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        # No checkpoint: a fresh model. This must not run after the branch above,
        # or it would discard the weights that were just loaded.
        model = Model(opt.cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']


    # Freeze
    # A parameter is frozen when its name contains any entry of `freeze`;
    # every other parameter is trainable. The list is empty here.
    freeze = []  # parameter names to freeze (full or partial)
    for param_name, param in model.named_parameters():
        is_frozen = any(pattern in param_name for pattern in freeze)
        if is_frozen:
            print('freezing %s' % param_name)
        param.requires_grad = not is_frozen


    # Optimizer
    nbs = 64  # nominal batch size
    # Number of batches whose loss is accumulated before each optimizer step (at least 1).
    accumulate = max(1, round(nbs / total_batch_size))
    # Rescale weight decay by the ratio of the accumulated batch to the nominal batch.
    decay_scale = total_batch_size * accumulate / nbs
    hyp['weight_decay'] *= decay_scale  # scale weight_decay
    logger.info(f"Scaled weight_decay = {hyp['weight_decay']}")

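nbs 为名义(模拟)的 batch size,固定为 64。例如 total_batch_size 为 16 时,accumulate = round(64/16) = 4,也就是模型梯度累积 4 次之后再更新一次模型,变相地扩大了 batch size;按上文默认的 --batch-size 2 计算,则 accumulate = 32。同时 weight_decay 会按 total_batch_size * accumulate / nbs 的比例进行缩放。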

    # Sort the model's parameters into three optimizer groups:
    #   pg0 - BatchNorm2d weights (no decay)
    #   pg1 - all other weights (apply decay)
    #   pg2 - biases
    pg0, pg1, pg2 = [], [], []  # the groups must exist before the loop appends to them
    for k, v in model.named_modules():
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
            pg2.append(v.bias)  # biases
        if isinstance(v, nn.BatchNorm2d):
            pg0.append(v.weight)  # no decay
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
            pg1.append(v.weight)  # apply decay

先说 model.modules():它迭代遍历模型的所有子层;而 model.named_modules() 不但返回模型的所有子层,还会返回这些层的名字。还有一个 model.parameters(),它的作用是迭代地返回模型的所有参数。
然后,用 hasattr 函数判断遍历到的每个层对象是否拥有相应的属性,将所有参数分成三类:BN 层的 weight(pg0,不做权重衰减)、其余层的 weight(pg1,做权重衰减)和 bias(pg2)。

    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp 
