Add Packing Support for Context Parallelism (Ring Attention) #2906
base: main
The first file in the diff extends `reorder_causal_load_balanced` to take a `reorder_strategy` argument and delegate to TransformerEngine's `reorder_causal_load_balancing`:

```diff
@@ -791,13 +791,50 @@ def reorder_sequence(tensor, cp_size: int, seq_dim: int = 1, to_contiguous: bool
   return reordered.reshape(ori_tensor_shape)


-@partial(jax.jit, static_argnums=1)
-def reorder_causal_load_balanced(batch, cp_size):
-  """Reorders the example batch sequences"""
+@partial(jax.jit, static_argnums=(1, 2))
+def reorder_causal_load_balanced(batch, cp_size, reorder_strategy):
+  """Reorders the example batch sequences.
+
+  Args:
+    batch: The batch to reorder.
+    cp_size: The size of the context parallelism.
+    reorder_strategy: The ReorderStrategy enum value (DUAL_CHUNK_SWAP or STRIPED).
+
+  Returns:
+    The reordered batch.
+
+  Reorder strategies:
+  - DUAL_CHUNK_SWAP: Splits each GPU's query into two chunks and mirror-swaps the second
+    chunks between GPUs. Currently used for non-THD load balancing; requires max_seqlen to be
+    a multiple of 2 * cp_size.
+    Example (4 GPUs, seqlen=16):
+    - Before reorder: GPU0: [0, 1, 2, 3]; GPU1: [4, 5, 6, 7]; GPU2: [8, 9, 10, 11]; GPU3: [12, 13, 14, 15]
+    - After reorder:  GPU0: [0, 1, 14, 15]; GPU1: [4, 5, 10, 11]; GPU2: [8, 9, 6, 7]; GPU3: [12, 13, 2, 3]
+
+  - STRIPED: Distributes tokens across GPUs in a striped (interleaved) manner along the
+    sequence. Currently used for THD (packed) load balancing.
+    Example (4 GPUs, seqlen=16):
+    - Before reorder: GPU0: [0, 1, 2, 3]; GPU1: [4, 5, 6, 7]; ...; GPU3: [12, 13, 14, 15]
+    - After reorder:  GPU0: [0, 4, 8, 12]; GPU1: [1, 5, 9, 13]; ...; GPU3: [3, 7, 11, 15]
+
+  See: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/jax/attention.py
+  """
+  # pylint: disable=import-outside-toplevel
+  from transformer_engine.jax.attention import ReorderStrategy as TE_ReorderStrategy
+  from transformer_engine.jax.attention import reorder_causal_load_balancing
+  from MaxText.common_types import ReorderStrategy
+
+  reorder_strategy_map = {
+      ReorderStrategy.DUAL_CHUNK_SWAP: TE_ReorderStrategy.DualChunkSwap,
+      ReorderStrategy.STRIPED: TE_ReorderStrategy.Striped,
+  }
+
   return {
-      key: reorder_sequence(
+      key: reorder_causal_load_balancing(
           value,  # Pass each key's value inside batch separately
+          reorder_strategy_map[reorder_strategy],
          cp_size=cp_size,
          seq_dim=1,
      )
      if key
      in ["inputs", "targets", "inputs_position", "targets_position", "inputs_segmentation", "targets_segmentation"]
```

**Collaborator** (on the new docstring): Great comment explaining the two of them with examples, thank you!
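To make the two permutations concrete, here is a minimal self-contained sketch in plain NumPy that reproduces the per-GPU shards from the docstring examples. It is illustrative only, not the TransformerEngine or MaxText implementation, and the helper names `dual_chunk_swap` and `striped` are hypothetical:

```python
import numpy as np

def dual_chunk_swap(tokens: np.ndarray, cp_size: int) -> np.ndarray:
  """Illustrative DUAL_CHUNK_SWAP: each GPU keeps its first chunk; second chunks are mirror-swapped."""
  assert tokens.size % (2 * cp_size) == 0, "seqlen must be a multiple of 2 * cp_size"
  chunks = tokens.reshape(cp_size, 2, -1)            # (gpu, half, chunk_len)
  first, second = chunks[:, 0], chunks[:, 1]
  return np.stack([first, second[::-1]], axis=1).reshape(-1)  # reverse pairs second halves across GPUs

def striped(tokens: np.ndarray, cp_size: int) -> np.ndarray:
  """Illustrative STRIPED: GPU g receives tokens g, g + cp_size, g + 2*cp_size, ..."""
  return tokens.reshape(-1, cp_size).T.reshape(-1)

tokens = np.arange(16)
for name, fn in [("DUAL_CHUNK_SWAP", dual_chunk_swap), ("STRIPED", striped)]:
  shards = np.split(fn(tokens, cp_size=4), 4)        # contiguous shard per GPU after the reorder
  print(name, [s.tolist() for s in shards])
# DUAL_CHUNK_SWAP -> [[0, 1, 14, 15], [4, 5, 10, 11], [8, 9, 6, 7], [12, 13, 2, 3]]
# STRIPED         -> [[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]]
```

Both reorderings serve the same purpose: with a plain contiguous split, GPUs holding later tokens do far more causal-attention work than GPUs holding earlier tokens, so tokens are permuted before sharding to even out the load across the ring.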
The second file wires the strategy into `setup_train_loop`: it tightens the packing validation (packing with context parallelism now requires the 'ring' strategy and a non-synthetic dataset), resolves the AUTO reorder strategy, passes it to `get_reorder_callable`, and creates the data loader after the reordering wrapper is applied:

```diff
@@ -30,6 +30,7 @@
 from MaxText.utils.goodput_utils import GoodputEvent
 from MaxText.utils.goodput_utils import maybe_record_goodput
 from MaxText import model_creation_utils
+from MaxText.common_types import ReorderStrategy


 def create_training_tools(config, model, mesh):
@@ -186,26 +187,43 @@ def setup_train_loop(config, recorder, devices=None):
   with maybe_record_goodput(recorder, GoodputEvent.TRAINING_PREPARATION):
     data_iterator, eval_data_iterator = create_data_iterator(config, mesh)
     rampup_manager = create_rampup_manager(config, checkpoint_manager)
-    data_loader = create_dataloader(config, mesh, data_iterator, recorder, rampup_manager)
     context_parallel_size = mesh.shape["context"]
-    # Check if context parallelism is being used with sequence packing
-    if context_parallel_size > 1 and config.packing and config.dataset_type != "synthetic":
-      raise ValueError(
-          "Context parallelism cannot be used with sequence packing. "
-          "Disable sequence packing (set packing=False). "
-          "Context parallelism with packing support will be added soon."
-      )
+    # Validate context parallelism with packing configuration
+    if context_parallel_size > 1 and config.packing:
+      if config.dataset_type == "synthetic":
+        raise ValueError(
+            "Context parallelism with sequence packing is not supported with synthetic data. "
+            "Please disable sequence packing (set packing=False)."
+        )
+      if config.context_parallel_strategy != "ring":
+        raise ValueError(
+            "Context parallelism with 'all_gather' strategy cannot be used with sequence packing. "
+            "Please use 'ring' strategy instead."
+        )

     # Apply reordering wrapper to data iterators if context parallelism is enabled
     with jax.set_mesh(mesh):
       if context_parallel_size > 1 and config.context_parallel_load_balance:
-        data_iterator = map(maxtext_utils.get_reorder_callable(context_parallel_size, config.shard_mode), data_iterator)
+        # Determine load balancing reorder strategy based on whether packing is enabled
+        if config.context_parallel_reorder_strategy == ReorderStrategy.AUTO:
+          reorder_strategy = ReorderStrategy.STRIPED if config.packing else ReorderStrategy.DUAL_CHUNK_SWAP
+        else:
+          reorder_strategy = config.context_parallel_reorder_strategy
+
+        data_iterator = map(
+            maxtext_utils.get_reorder_callable(context_parallel_size, config.shard_mode, reorder_strategy),
+            data_iterator,
+        )
         if eval_data_iterator:
           eval_data_iterator = map(
-              maxtext_utils.get_reorder_callable(context_parallel_size, config.shard_mode),
+              maxtext_utils.get_reorder_callable(context_parallel_size, config.shard_mode, reorder_strategy),
               eval_data_iterator,
           )

+    # Create data_loader AFTER reordering wrapper is applied
+    data_loader = create_dataloader(config, mesh, data_iterator, recorder, rampup_manager)

     state, _, state_mesh_shardings, data_iterator = maxtext_utils.setup_training_state(
         model, data_iterator, tx, config, init_rng, mesh, checkpoint_manager
     )
```

**Collaborator** (on the `ReorderStrategy.AUTO` branch): Wondering if AUTO usually gives the best result? Or whether there will be a compatibility issue if a user selects ReorderStrategy.STRIPED but without packing? Trying to understand whether providing just these two strategies is good enough.
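The AUTO branch above can be read as a small pure function. The sketch below restates that logic outside MaxText; the `ReorderStrategy` enum is re-declared locally only so the snippet runs standalone (the real definition lives in `MaxText.common_types`), and `resolve_reorder_strategy` is a hypothetical helper name:

```python
from enum import Enum, auto

class ReorderStrategy(Enum):
  # Assumed members, mirroring MaxText.common_types.ReorderStrategy as used in the diff.
  AUTO = auto()
  DUAL_CHUNK_SWAP = auto()
  STRIPED = auto()

def resolve_reorder_strategy(configured: ReorderStrategy, packing: bool) -> ReorderStrategy:
  """AUTO picks STRIPED for packed (THD) batches and DUAL_CHUNK_SWAP otherwise; explicit choices pass through."""
  if configured == ReorderStrategy.AUTO:
    return ReorderStrategy.STRIPED if packing else ReorderStrategy.DUAL_CHUNK_SWAP
  return configured

assert resolve_reorder_strategy(ReorderStrategy.AUTO, packing=True) == ReorderStrategy.STRIPED
assert resolve_reorder_strategy(ReorderStrategy.AUTO, packing=False) == ReorderStrategy.DUAL_CHUNK_SWAP
assert resolve_reorder_strategy(ReorderStrategy.STRIPED, packing=False) == ReorderStrategy.STRIPED
```

An explicitly configured strategy always wins; AUTO simply maps packed batches to STRIPED and unpacked batches to DUAL_CHUNK_SWAP, matching the docstring's note about which layout each strategy currently targets.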
**Review comment:** Thanks for adding this strategy! Could you add a short explanation here for each strategy, as in `reorder_causal_load_balanced`?