Update autotune configuration to avoid crash on AMD devices
When running on an AMD device, autotuning with 32 warps crashes with `RuntimeError: Triton Error [HIP]: Code: 1, Messsage: invalid argument`. This change removes that configuration when the device name contains "AMD", which is the case for the MI250, MI300, and MI355. Tested on an MI300.
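For quick reference, the core of the change boils down to the device check below. This is only a condensed sketch of the patch (it filters the config list instead of popping the last entry, and assumes a single visible GPU); the actual code is in the diff that follows.

```python
import torch
import triton

# Candidate autotune configurations, from 1 to 32 warps.
autotune_configs = [triton.Config({}, num_warps=w) for w in (1, 2, 4, 8, 16, 32)]

if torch.cuda.is_available() and "AMD" in torch.cuda.get_device_name():
    # AMD devices support at most 16 warps, so drop the 32-warp config
    # to avoid the HIP "invalid argument" error during autotuning.
    autotune_configs = [c for c in autotune_configs if c.num_warps <= 16]
```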
torch-ext/triton_layer_norm/layer_norm.py
CHANGED
```diff
@@ -16,6 +16,22 @@ import triton
 import triton.language as tl
 
 
+autotune_configs = [
+    triton.Config({}, num_warps=1),
+    triton.Config({}, num_warps=2),
+    triton.Config({}, num_warps=4),
+    triton.Config({}, num_warps=8),
+    triton.Config({}, num_warps=16),
+    triton.Config({}, num_warps=32),
+]
+
+if torch.cuda.is_available():
+    is_amd_device = ("AMD" in torch.cuda.get_device_name())
+    # AMD devices have a maximum of 16 warps, so we remove the 32 warps autotune config
+    if is_amd_device and autotune_configs[-1].num_warps == 32:
+        autotune_configs.pop()
+
+
 def layer_norm_ref(
     x,
     weight,
@@ -128,14 +144,7 @@ def rms_norm_ref(
 
 
 @triton.autotune(
-    configs=[
-        triton.Config({}, num_warps=1),
-        triton.Config({}, num_warps=2),
-        triton.Config({}, num_warps=4),
-        triton.Config({}, num_warps=8),
-        triton.Config({}, num_warps=16),
-        triton.Config({}, num_warps=32),
-    ],
+    configs=autotune_configs[:],
     key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
 )
 # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
@@ -407,14 +416,7 @@ def _layer_norm_fwd(
 
 
 @triton.autotune(
-    configs=[
-        triton.Config({}, num_warps=1),
-        triton.Config({}, num_warps=2),
-        triton.Config({}, num_warps=4),
-        triton.Config({}, num_warps=8),
-        triton.Config({}, num_warps=16),
-        triton.Config({}, num_warps=32),
-    ],
+    configs=autotune_configs[:],
     key=[
         "N",
         "HAS_DRESIDUAL",
```
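To sanity-check the detection logic on a given machine, something like the following (a hypothetical snippet, not part of the patch) prints the device name that the `"AMD"` substring check matches against:

```python
import torch

# Hypothetical check, not part of the patch: shows what the "AMD" substring
# match sees on this machine and whether the 32-warp config would be dropped.
if torch.cuda.is_available():
    name = torch.cuda.get_device_name()
    dropped = "AMD" in name
    print(f"{name}: {'dropping' if dropped else 'keeping'} the num_warps=32 config")
else:
    print("No CUDA/ROCm device visible; autotune_configs is left unchanged.")
```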