YOLOv26改进 | 添加注意力机制篇 | 添加DAttention (DAT)注意力机制二次创新C2PSA(附独家网络结构图)
开始讲解之前推荐一下我的专栏本专栏的内容支持(分类、检测、分割、追踪、关键点检测),专栏目前为限时折扣欢迎大家订阅本专栏本专栏每周更新5-7篇最新机制更有包含我所有改进的文件和交流群提供给大家本人定期在群内分享发表论文方法和经验。一、本文介绍本文给大家带来的是YOLOv26改进DAT(Vision Transformer with Deformable Attention)的教程CVPR上同时被评选为Best Paper由此可以证明其是一种十分有效的改进机制其主要的核心思想是引入可变形注意力机制和动态采样点(听着是不是和可变形动态卷积DCN挺相似)。文的讲解主要包含三方面DAT的网络结构思想、DAttention的代码复现如何添加DAttention到你的结构中实现涨点下面先来分享我测试的对比图。专栏链接YOLOv26有效涨点专栏包含Conv、注意力机制、主干/Backbone、损失函数、优化器、后处理等改进机制目录一、本文介绍二、DAT的网络结构思想2.1 DAT的主要思想和改进2.2 DAT的网络结构图2.3 DAT和其他机制的对比三、DAT即插即用的代码块四、添加DAT到你的网络中4.1 修改一4.2 修改二4.3 修改三4.4 修改四4.5 修改五4.6 修改六五、正式训练5.1 yaml文件5.1.1 yaml文件15.1.2 yaml文件25.2 训练代码5.3 训练过程截图五、本文总结二、DAT的网络结构思想论文地址DAT论文地址官方地址官方代码的地址2.1 DAT的主要思想和改进DATVision Transformer with Deformable Attention是一种引入了可变形注意力机制的视觉TransformerDAT的核心思想主要包括以下几个方面可变形注意力Deformable Attention传统的Transformer使用标准的自注意力机制这种机制会处理图像中的所有像素导致计算量很大。而DAT引入了可变形注意力机制它只关注图像中的一小部分关键区域。这种方法可以显著减少计算量同时保持良好的性能。动态采样点在可变形注意力机制中DAT动态地选择采样点而不是固定地处理整个图像。这种动态选择机制使得模型可以更加集中地关注于那些对当前任务最重要的区域。即插即用DAT的设计允许它适应不同的图像大小和内容使其在多种视觉任务中都能有效工作如图像分类、对象检测等。总结DAT通过引入可变形注意力机制改进了视觉Transformer的效率和性能使其在处理复杂的视觉任务时更加高效和准确。2.2 DAT的网络结构图(a) 展示了可变形注意力的信息流。左侧部分一组参考点均匀地放置在特征图上这些点的偏移量是由查询通过偏移网络学习得到的。然后如右侧所示根据变形点从采样特征中投影出变形的键和值。相对位置偏差也通过变形点计算增强了输出转换特征的多头注意力。为了清晰展示图中仅显示了4个参考点但在实际实现中实际上有更多的点。(b) 展示了偏移生成网络的详细结构每层输入和输出特征图的大小都有标注(这个Offset network在网络的代码中需要控制可添加可不添加)。通过上面的方式产生多种参考点分布在图像上从而提高检测的效率最终的效果图如下-2.3 DAT和其他机制的对比DAT与其他视觉Transformer模型和CNN模型中的DCN可变形卷积网络的对比图如下突出了它们处理查询的不同方法(图片展示的很直观不给大家描述过程了)三、DAT即插即用的代码块下面的代码是DAT的网络结构代码.import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import einops from timm.models.layers import trunc_normal_ __all__ [DAttentionBaseline, C2PSA_DAT] class LayerNormProxy(nn.Module): def __init__(self, dim): super().__init__() self.norm nn.LayerNorm(dim) def forward(self, x): x einops.rearrange(x, b c h w - b h w c) x self.norm(x) return einops.rearrange(x, b h w c - b c h w) class DAttentionBaseline(nn.Module): def __init__( self, q_size(224,224), kv_size(224,224), n_heads8, n_head_channels32, 
n_groups1, attn_drop0.0, proj_drop0.0, stride1, offset_range_factor-1, use_peTrue, dwc_peTrue, no_offFalse, fixed_peFalse, ksize9, log_cpbFalse ): super().__init__() n_head_channels int(q_size / 8) q_size (q_size, q_size) self.dwc_pe dwc_pe self.n_head_channels n_head_channels self.scale self.n_head_channels ** -0.5 self.n_heads n_heads self.q_h, self.q_w q_size # self.kv_h, self.kv_w kv_size self.kv_h, self.kv_w self.q_h // stride, self.q_w // stride self.nc n_head_channels * n_heads self.n_groups n_groups self.n_group_channels self.nc // self.n_groups self.n_group_heads self.n_heads // self.n_groups self.use_pe use_pe self.fixed_pe fixed_pe self.no_off no_off self.offset_range_factor offset_range_factor self.ksize ksize self.log_cpb log_cpb self.stride stride kk self.ksize pad_size kk // 2 if kk ! stride else 0 self.conv_offset nn.Sequential( nn.Conv2d(self.n_group_channels, self.n_group_channels, kk, stride, pad_size, groupsself.n_group_channels), LayerNormProxy(self.n_group_channels), nn.GELU(), nn.Conv2d(self.n_group_channels, 2, 1, 1, 0, biasFalse) ) if self.no_off: for m in self.conv_offset.parameters(): m.requires_grad_(False) self.proj_q nn.Conv2d( self.nc, self.nc, kernel_size1, stride1, padding0 ) self.proj_k nn.Conv2d( self.nc, self.nc, kernel_size1, stride1, padding0) self.proj_v nn.Conv2d( self.nc, self.nc, kernel_size1, stride1, padding0 ) self.proj_out nn.Conv2d( self.nc, self.nc, kernel_size1, stride1, padding0 ) self.proj_drop nn.Dropout(proj_drop, inplaceTrue) self.attn_drop nn.Dropout(attn_drop, inplaceTrue) if self.use_pe and not self.no_off: if self.dwc_pe: self.rpe_table nn.Conv2d( self.nc, self.nc, kernel_size3, stride1, padding1, groupsself.nc) elif self.fixed_pe: self.rpe_table nn.Parameter( torch.zeros(self.n_heads, self.q_h * self.q_w, self.kv_h * self.kv_w) ) trunc_normal_(self.rpe_table, std0.01) elif self.log_cpb: # Borrowed from Swin-V2 self.rpe_table nn.Sequential( nn.Linear(2, 32, biasTrue), nn.ReLU(inplaceTrue), nn.Linear(32, 
self.n_group_heads, biasFalse) ) else: self.rpe_table nn.Parameter( torch.zeros(self.n_heads, self.q_h * 2 - 1, self.q_w * 2 - 1) ) trunc_normal_(self.rpe_table, std0.01) else: self.rpe_table None torch.no_grad() def _get_ref_points(self, H_key, W_key, B, dtype, device): ref_y, ref_x torch.meshgrid( torch.linspace(0.5, H_key - 0.5, H_key, dtypedtype, devicedevice), torch.linspace(0.5, W_key - 0.5, W_key, dtypedtype, devicedevice), indexingij ) ref torch.stack((ref_y, ref_x), -1) ref[..., 1].div_(W_key - 1.0).mul_(2.0).sub_(1.0) ref[..., 0].div_(H_key - 1.0).mul_(2.0).sub_(1.0) ref ref[None, ...].expand(B * self.n_groups, -1, -1, -1) # B * g H W 2 return ref torch.no_grad() def _get_q_grid(self, H, W, B, dtype, device): ref_y, ref_x torch.meshgrid( torch.arange(0, H, dtypedtype, devicedevice), torch.arange(0, W, dtypedtype, devicedevice), indexingij ) ref torch.stack((ref_y, ref_x), -1) ref[..., 1].div_(W - 1.0).mul_(2.0).sub_(1.0) ref[..., 0].div_(H - 1.0).mul_(2.0).sub_(1.0) ref ref[None, ...].expand(B * self.n_groups, -1, -1, -1) # B * g H W 2 return ref def forward(self, x): x x B, C, H, W x.size() dtype, device x.dtype, x.device q self.proj_q(x) q_off einops.rearrange(q, b (g c) h w - (b g) c h w, gself.n_groups, cself.n_group_channels) offset self.conv_offset(q_off).contiguous() # B * g 2 Hg Wg Hk, Wk offset.size(2), offset.size(3) n_sample Hk * Wk if self.offset_range_factor 0 and not self.no_off: offset_range torch.tensor([1.0 / (Hk - 1.0), 1.0 / (Wk - 1.0)], devicedevice).reshape(1, 2, 1, 1) offset offset.tanh().mul(offset_range).mul(self.offset_range_factor) offset einops.rearrange(offset, b p h w - b h w p) reference self._get_ref_points(Hk, Wk, B, dtype, device) if self.no_off: offset offset.fill_(0.0) if self.offset_range_factor 0: pos offset reference else: pos (offset reference).clamp(-1., 1.) 
if self.no_off: x_sampled F.avg_pool2d(x, kernel_sizeself.stride, strideself.stride) assert x_sampled.size(2) Hk and x_sampled.size(3) Wk, fSize is {x_sampled.size()} else: x_sampled F.grid_sample( inputx.reshape(B * self.n_groups, self.n_group_channels, H, W), gridpos[..., (1, 0)], # y, x - x, y modebilinear, align_cornersTrue) # B * g, Cg, Hg, Wg x_sampled x_sampled.reshape(B, C, 1, n_sample) # self.proj_k.weight torch.nn.Parameter(self.proj_k.weight.float()) # self.proj_k.bias torch.nn.Parameter(self.proj_k.bias.float()) # self.proj_v.weight torch.nn.Parameter(self.proj_v.weight.float()) # self.proj_v.bias torch.nn.Parameter(self.proj_v.bias.float()) # 检查权重的数据类型 q q.reshape(B * self.n_heads, self.n_head_channels, H * W) k self.proj_k(x_sampled).reshape(B * self.n_heads, self.n_head_channels, n_sample) v self.proj_v(x_sampled).reshape(B * self.n_heads, self.n_head_channels, n_sample) attn torch.einsum(b c m, b c n - b m n, q, k) # B * h, HW, Ns attn attn.mul(self.scale) if self.use_pe and (not self.no_off): if self.dwc_pe: residual_lepe self.rpe_table(q.reshape(B, C, H, W)).reshape(B * self.n_heads, self.n_head_channels, H * W) elif self.fixed_pe: rpe_table self.rpe_table attn_bias rpe_table[None, ...].expand(B, -1, -1, -1) attn attn attn_bias.reshape(B * self.n_heads, H * W, n_sample) elif self.log_cpb: q_grid self._get_q_grid(H, W, B, dtype, device) displacement ( q_grid.reshape(B * self.n_groups, H * W, 2).unsqueeze(2) - pos.reshape(B * self.n_groups, n_sample, 2).unsqueeze(1)).mul( 4.0) # d_y, d_x [-8, 8] displacement torch.sign(displacement) * torch.log2(torch.abs(displacement) 1.0) / np.log2(8.0) attn_bias self.rpe_table(displacement) # B * g, H * W, n_sample, h_g attn attn einops.rearrange(attn_bias, b m n h - (b h) m n, hself.n_group_heads) else: rpe_table self.rpe_table rpe_bias rpe_table[None, ...].expand(B, -1, -1, -1) q_grid self._get_q_grid(H, W, B, dtype, device) displacement ( q_grid.reshape(B * self.n_groups, H * W, 2).unsqueeze(2) - pos.reshape(B 
* self.n_groups, n_sample, 2).unsqueeze(1)).mul( 0.5) attn_bias F.grid_sample( inputeinops.rearrange(rpe_bias, b (g c) h w - (b g) c h w, cself.n_group_heads, gself.n_groups), griddisplacement[..., (1, 0)], modebilinear, align_cornersTrue) # B * g, h_g, HW, Ns attn_bias attn_bias.reshape(B * self.n_heads, H * W, n_sample) attn attn attn_bias attn F.softmax(attn, dim2) attn self.attn_drop(attn) out torch.einsum(b m n, b c n - b c m, attn, v) if self.use_pe and self.dwc_pe: out out residual_lepe out out.reshape(B, C, H, W) y self.proj_drop(self.proj_out(out)) h, w pos.reshape(B, self.n_groups, Hk, Wk, 2), reference.reshape(B, self.n_groups, Hk, Wk, 2) return y def autopad(k, pNone, d1): # kernel, padding, dilation Pad to same shape outputs. if d 1: k d * (k - 1) 1 if isinstance(k, int) else [d * (x - 1) 1 for x in k] # actual kernel-size if p is None: p k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad return p class Conv(nn.Module): Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation). default_act nn.SiLU() # default activation def __init__(self, c1, c2, k1, s1, pNone, g1, d1, actTrue): Initialize Conv layer with given arguments including activation. super().__init__() self.conv nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groupsg, dilationd, biasFalse) self.bn nn.BatchNorm2d(c2) self.act self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() def forward(self, x): Apply convolution, batch normalization and activation to input tensor. return self.act(self.bn(self.conv(x))) def forward_fuse(self, x): Perform transposed convolution of 2D data. return self.act(self.conv(x)) class PSABlock(nn.Module): PSABlock class implementing a Position-Sensitive Attention block for neural networks. This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers with optional shortcut connections. 
Attributes: attn (Attention): Multi-head attention module. ffn (nn.Sequential): Feed-forward neural network module. add (bool): Flag indicating whether to add shortcut connections. Methods: forward: Performs a forward pass through the PSABlock, applying attention and feed-forward layers. Examples: Create a PSABlock and perform a forward pass psablock PSABlock(c128, attn_ratio0.5, num_heads4, shortcutTrue) input_tensor torch.randn(1, 128, 32, 32) output_tensor psablock(input_tensor) def __init__(self, c, attn_ratio0.5, num_heads4, shortcutTrue) - None: Initializes the PSABlock with attention and feed-forward layers for enhanced feature extraction. super().__init__() self.attn DAttentionBaseline(c) self.ffn nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1, actFalse)) self.add shortcut def forward(self, x): Executes a forward pass through PSABlock, applying attention and feed-forward layers to the input tensor. x x self.attn(x) if self.add else self.attn(x) x x self.ffn(x) if self.add else self.ffn(x) return x class C2PSA_DAT(nn.Module): C2PSA module with attention mechanism for enhanced feature extraction and processing. This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations. Attributes: c (int): Number of hidden channels. cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c. cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c. m (nn.Sequential): Sequential container of PSABlock modules for attention and feed-forward operations. Methods: forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations. Notes: This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules. 
Examples: c2psa C2PSA(c1256, c2256, n3, e0.5) input_tensor torch.randn(1, 256, 64, 64) output_tensor c2psa(input_tensor) def __init__(self, c1, c2, n1, e0.5): Initializes the C2PSA module with specified input/output channels, number of layers, and expansion ratio. super().__init__() assert c1 c2 self.c int(c1 * e) self.cv1 Conv(c1, 2 * self.c, 1, 1) self.cv2 Conv(2 * self.c, c1, 1) self.m nn.Sequential(*(PSABlock(self.c, attn_ratio0.5, num_headsself.c // 64) for _ in range(n))) def forward(self, x): Processes the input tensor x through a series of PSA blocks and returns the transformed tensor. a, b self.cv1(x).split((self.c, self.c), dim1) b self.m(b) return self.cv2(torch.cat((a, b), 1)) if __name__ __main__: # Generating Sample image image_size (1, 64, 224, 224) image torch.rand(*image_size) # Model model C2PSA_DAT(64, 64) out model(image) print(out.size())四、添加DAT到你的网络中下面的步骤如果你不会或者不想麻烦操作可以联系作者获得本专栏添加所有项目文件的源代码可直接训练.4.1 修改一第一还是建立文件我们找到如下ultralytics/nn文件夹下建立一个目录名字呢就是Addmodules文件夹4.2 修改二然后在Addmodules文件夹内建立一个新的py文件将本文章节三中的“核心代码复制粘贴进去。4.3 修改三第二步我们在该目录下创建一个新的py文件名字为__init__.py然后在其内部导入我们的文件如下图所示。4.4 修改四第三步我门中到如下文件ultralytics/nn/tasks.py进行导入和注册我们的模块(此处只需要添加一次即可如果你用我其它的改进机制这里的步骤只需要添加一次)4.5 修改五在ultralytics/nn/tasks.py文件内的parse_model方法函数内位置大概在1500行左右按照图示位置添加即可此处需要自己有一定的判别能力如果不会可联系作者获得视频教程。4.6 修改六在ultralytics/nn/tasks.py文件内的parse_model方法函数内位置大概在1550行左右按照图示位置添加即可此处一定要对应好位置和缩进否则很容易报错。elif m in {此处填写本章代码的名字.}: c2 ch[f] args [c2, *args]五、正式训练5.1 yaml文件5.1.1 yaml文件1训练信息YOLO26-C2PSA-DAT summary: 263 layers, 2,533,148 parameters, 2,533,148 gradients, 5.8 GFLOPs# Ultralytics AGPL-3.0 License - https://ultralytics.com/license # Ultralytics YOLO26 object detection model with P3/8 - P5/32 outputs # Model docs: https://docs.ultralytics.com/models/yolo26 # Task docs: https://docs.ultralytics.com/tasks/detect # Parameters nc: 80 # number of classes end2end: True # whether to use end-to-end mode reg_max: 1 # DFL bins scales: # model compound scaling constants, i.e. 
modelyolo26n.yaml will call yolo26.yaml with scale n # [depth, width, max_channels] n: [0.50, 0.25, 1024] # summary: 260 layers, 2,572,280 parameters, 2,572,280 gradients, 6.1 GFLOPs s: [0.50, 0.50, 1024] # summary: 260 layers, 10,009,784 parameters, 10,009,784 gradients, 22.8 GFLOPs m: [0.50, 1.00, 512] # summary: 280 layers, 21,896,248 parameters, 21,896,248 gradients, 75.4 GFLOPs l: [1.00, 1.00, 512] # summary: 392 layers, 26,299,704 parameters, 26,299,704 gradients, 93.8 GFLOPs x: [1.00, 1.50, 512] # summary: 392 layers, 58,993,368 parameters, 58,993,368 gradients, 209.5 GFLOPs # YOLO26n backbone backbone: # [from, repeats, module, args] - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2 - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4 - [-1, 2, C3k2, [256, False, 0.25]] - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8 - [-1, 2, C3k2, [512, False, 0.25]] - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16 - [-1, 2, C3k2, [512, True]] - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32 - [-1, 2, C3k2, [1024, True]] - [-1, 1, SPPF, [1024, 5, 3, True]] # 9 - [-1, 2, C2PSA_DAT, [1024]] # 10 # YOLO26n head head: - [-1, 1, nn.Upsample, [None, 2, nearest]] - [[-1, 6], 1, Concat, [1]] # cat backbone P4 - [-1, 2, C3k2, [512, True]] # 13 - [-1, 1, nn.Upsample, [None, 2, nearest]] - [[-1, 4], 1, Concat, [1]] # cat backbone P3 - [-1, 2, C3k2, [256, True]] # 16 (P3/8-small) - [-1, 1, Conv, [256, 3, 2]] - [[-1, 13], 1, Concat, [1]] # cat head P4 - [-1, 2, C3k2, [512, True]] # 19 (P4/16-medium) - [-1, 1, Conv, [512, 3, 2]] - [[-1, 10], 1, Concat, [1]] # cat head P5 - [-1, 1, C3k2, [1024, True, 0.5, True]] # 22 (P5/32-large) - [[16, 19, 22], 1, Detect, [nc]] # Detect(P3, P4, P5)5.1.2 yaml文件2训练信息YOLO26-Att-DAT summary: 271 layers, 2,528,924 parameters, 2,528,924 gradients, 6.1 GFLOPs# Ultralytics AGPL-3.0 License - https://ultralytics.com/license # Ultralytics YOLO26 object detection model with P3/8 - P5/32 outputs # Model docs: https://docs.ultralytics.com/models/yolo26 # Task docs: https://docs.ultralytics.com/tasks/detect # Parameters 
nc: 80 # number of classes end2end: True # whether to use end-to-end mode reg_max: 1 # DFL bins scales: # model compound scaling constants, i.e. modelyolo26n.yaml will call yolo26.yaml with scale n # [depth, width, max_channels] n: [0.50, 0.25, 1024] # summary: 260 layers, 2,572,280 parameters, 2,572,280 gradients, 6.1 GFLOPs s: [0.50, 0.50, 1024] # summary: 260 layers, 10,009,784 parameters, 10,009,784 gradients, 22.8 GFLOPs m: [0.50, 1.00, 512] # summary: 280 layers, 21,896,248 parameters, 21,896,248 gradients, 75.4 GFLOPs l: [1.00, 1.00, 512] # summary: 392 layers, 26,299,704 parameters, 26,299,704 gradients, 93.8 GFLOPs x: [1.00, 1.50, 512] # summary: 392 layers, 58,993,368 parameters, 58,993,368 gradients, 209.5 GFLOPs # YOLO26n backbone backbone: # [from, repeats, module, args] - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2 - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4 - [-1, 2, C3k2, [256, False, 0.25]] - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8 - [-1, 2, C3k2, [512, False, 0.25]] - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16 - [-1, 2, C3k2, [512, True]] - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32 - [-1, 2, C3k2, [1024, True]] - [-1, 1, SPPF, [1024, 5, 3, True]] # 9 - [-1, 2, C2PSA, [1024]] # 10 # YOLO26n head head: - [-1, 1, nn.Upsample, [None, 2, nearest]] - [[-1, 6], 1, Concat, [1]] # cat backbone P4 - [-1, 2, C3k2, [512, True]] # 13 - [-1, 1, nn.Upsample, [None, 2, nearest]] - [[-1, 4], 1, Concat, [1]] # cat backbone P3 - [-1, 2, C3k2, [256, True]] # 16 (P3/8-small) - [-1, 1, Conv, [256, 3, 2]] - [[-1, 13], 1, Concat, [1]] # cat head P4 - [-1, 2, C3k2, [512, True]] # 19 (P4/16-medium) - [-1, 1, Conv, [512, 3, 2]] - [[-1, 10], 1, Concat, [1]] # cat head P5 - [-1, 1, C3k2, [1024, True, 0.5, True]] # 22 (P5/32-large) - [16, 1, DAttentionBaseline, []] # 23 # - [19, 1, DAttentionBaseline, []] # 24 # - [22, 1, DAttentionBaseline, []] # 25 # 此处的使用说法注释: 其中上面的三个注意力机制目前仅使用了23层如果你想使用24层那么就取消掉代码注释 # 并将下面检测头中的19改为24,如果想使用第25层注意力机制同理将下面检测头中的22改为25即可。 # 此处用法比较复杂如过不会联系Snu77博主获取视频教程 - [[23, 19, 22], 1, 
Detect, [nc]] # Detect(P3, P4, P5)5.2 训练代码大家可以创建一个py文件,将我给的代码复制粘贴进去,配置好自己的文件路径即可运行。import warnings warnings.filterwarnings(ignore) from ultralytics import YOLO if __name__ __main__: model YOLO(模型配置文件地址,也就是5.1你保存到本地文件的地址) # 如何切换模型版本, 上面的yaml文件可以改为 yolo26s.yaml就是使用的26s, # 类似某个改进的yaml文件名称为yolo26-XXX.yaml那么如果想使用其它版本就把上面的名称改为yolo26l-XXX.yaml即可改的是上面YOLO中间的名字不是配置文件的 # model.load(yolo26n.pt) # 是否加载预训练权重,科研不建议大家加载否则很难提升精度 model.train( datar数据集文件地址, # 如果大家任务是其它的ultralytics/cfg/default.yaml找到这里修改task可以改成detect, segment, classify, pose cacheFalse, imgsz640, epochs20, single_clsFalse, # 是否是单类别检测 batch16, close_mosaic0, workers0, device0, optimizerMuSGD, # using SGD/MuSGD # resume, # 这里是填写last.pt地址 ampTrue, # 如果出现训练损失为Nan可以关闭amp projectruns/train, nameexp, )5.3 训练过程截图五、本文总结到此,本文的正式分享内容就结束了。在这里给大家推荐我的YOLOv26改进有效涨点专栏,本专栏目前为新开的,平均质量分98分,后期我会根据各种最新的前沿顶会进行论文复现,也会对一些老的改进机制进行补充。如果大家觉得本文帮助到你了,欢迎订阅本专栏,关注后续更多的更新~专栏链接:YOLOv26有效涨点专栏(包含Conv、注意力机制、主干/Backbone、损失函数、优化器、后处理等改进机制)