diff --git a/ultralytics/engine/validator.py b/ultralytics/engine/validator.py
index 42c0e329..0f46395e 100644
--- a/ultralytics/engine/validator.py
+++ b/ultralytics/engine/validator.py
@@ -158,7 +158,12 @@ class BaseValidator:
             self.dataloader = self.dataloader or self.get_dataloader(self.data.get(self.args.split), self.args.batch)
 
             model.eval()
-            model.warmup(imgsz=(1 if pt else self.args.batch, 3, imgsz, imgsz))  # warmup
+            # model.warmup(imgsz=(1 if pt else self.args.batch, 3, imgsz, imgsz))  # warmup
+            # 新增warmup，保证性能准确
+            print('start warm up')
+            model.warmup(imgsz=(self.args.batch, 3, imgsz, imgsz))  # warmup
+            model.warmup(imgsz=(self.args.batch, 3, 288, imgsz))  # warmup
+            print('end warm up')
 
         self.run_callbacks("on_val_start")
         dt = (
diff --git a/ultralytics/nn/autobackend.py b/ultralytics/nn/autobackend.py
index 7629fb7c..36ce75d6 100644
--- a/ultralytics/nn/autobackend.py
+++ b/ultralytics/nn/autobackend.py
@@ -12,6 +12,10 @@ import numpy as np
 import torch
 import torch.nn as nn
 from PIL import Image
+import torch_npu
+from torch_npu.contrib import transfer_to_npu
+import torchair as tng
+from torchair.configs.compiler_config import CompilerConfig
 
 from ultralytics.utils import ARM64, IS_JETSON, IS_RASPBERRYPI, LINUX, LOGGER, PYTHON_VERSION, ROOT, yaml_load
 from ultralytics.utils.checks import check_requirements, check_suffix, check_version, check_yaml, is_rockchip
@@ -156,6 +160,12 @@ class AutoBackend(nn.Module):
             names = model.module.names if hasattr(model, "module") else model.names  # get class names
             model.half() if fp16 else model.float()
             self.model = model  # explicitly assign for to(), cpu(), cuda(), half()
+            # 添加torchair适配代码
+            config = CompilerConfig()
+            config.experimental_config.frozen_parameter = True
+            npu_backbend = tng.get_npu_backend(compiler_config=config)
+            model = torch.compile(model, dynamic=True, fullgraph=True, backend=npu_backbend)
+            tng.use_internal_format_weight(model.model)
             pt = True
 
         # PyTorch
diff --git a/ultralytics/nn/modules/block.py b/ultralytics/nn/modules/block.py
index 567e1aaf..a7cf8dd5 100644
--- a/ultralytics/nn/modules/block.py
+++ b/ultralytics/nn/modules/block.py
@@ -185,7 +185,10 @@ class SPPF(nn.Module):
     def forward(self, x):
         """Forward pass through Ghost Convolution block."""
         y = [self.cv1(x)]
-        y.extend(self.m(y[-1]) for _ in range(3))
+        # y.extend(self.m(y[-1]) for _ in range(3))
+        for _ in range(3):
+            o1 = self.m(y[-1])
+            y.extend(o1.unsqueeze(0))
         return self.cv2(torch.cat(y, 1))
 
 
@@ -236,7 +239,11 @@ class C2f(nn.Module):
     def forward(self, x):
         """Forward pass through C2f layer."""
         y = list(self.cv1(x).chunk(2, 1))
-        y.extend(m(y[-1]) for m in self.m)
+        # y.extend(m(y[-1]) for m in self.m)
+        # 该条代码在torch和dynamo中存在逻辑分歧，改为外部循环表示
+        for m in self.m:
+            o1 = m(y[-1])
+            y.extend(o1.unsqueeze(0))
         return self.cv2(torch.cat(y, 1))
 
     def forward_split(self, x):
diff --git a/ultralytics/nn/modules/head.py b/ultralytics/nn/modules/head.py
index b9d05024..7906afa5 100644
--- a/ultralytics/nn/modules/head.py
+++ b/ultralytics/nn/modules/head.py
@@ -102,9 +102,9 @@ class Detect(nn.Module):
         # Inference path
         shape = x[0].shape  # BCHW
         x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
-        if self.format != "imx" and (self.dynamic or self.shape != shape):
-            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
-            self.shape = shape
+        anchors, strides = make_anchors(x, self.stride, 0.5)
+        anchors = anchors.transpose(0, 1)
+        strides = strides.transpose(0, 1)
 
         if self.export and self.format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}:  # avoid TF FlexSplitV ops
             box = x_cat[:, : self.reg_max * 4]
@@ -118,15 +118,15 @@ class Detect(nn.Module):
             grid_h = shape[2]
             grid_w = shape[3]
             grid_size = torch.tensor([grid_w, grid_h, grid_w, grid_h], device=box.device).reshape(1, 4, 1)
-            norm = self.strides / (self.stride[0] * grid_size)
-            dbox = self.decode_bboxes(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2])
+            norm = strides / (stride[0] * grid_size)
+            dbox = self.decode_bboxes(self.dfl(box) * norm, anchors.unsqueeze(0) * norm[:, :2])
         elif self.export and self.format == "imx":
             dbox = self.decode_bboxes(
-                self.dfl(box) * self.strides, self.anchors.unsqueeze(0) * self.strides, xywh=False
+                self.dfl(box) * strides, anchors.unsqueeze(0) * strides, xywh=False
             )
             return dbox.transpose(1, 2), cls.sigmoid().permute(0, 2, 1)
         else:
-            dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides
+            dbox = self.decode_bboxes(self.dfl(box), anchors.unsqueeze(0)) * strides
 
         return torch.cat((dbox, cls.sigmoid()), 1)
 
diff --git a/ultralytics/utils/tal.py b/ultralytics/utils/tal.py
index e4a40f5e..2ded54be 100644
--- a/ultralytics/utils/tal.py
+++ b/ultralytics/utils/tal.py
@@ -341,7 +341,8 @@ def make_anchors(feats, strides, grid_cell_offset=0.5):
         sy = torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset  # shift y
         sy, sx = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx)
         anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2))
-        stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device))
+        # stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device))
+        stride_tensor.append(torch.ones((h * w, 1), dtype=dtype, device=device)*stride)
     return torch.cat(anchor_points), torch.cat(stride_tensor)