
    Vpf                    >   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'Z(e)e$cZ*Z)e+e%cZ,Z+dddZ-dddZ.dddZ/dddZ0d Z1d Z2d Z3d Z4d Z5d Z6d  Z7d! Z8ddd"Z9dd#d$d%Z:d& Z;d~d)Z<dd3Z=d4 Z>d5 Z?d6 Z@d7 ZAdd;ZBd< ZCd= ZDd> ZEd? ZF ejG        d@          ZHdAeH_I        eHJ                     eeCejK                             eHL                    eD            ejM        eH eeEejN        ejK                              ejO        eHeF            ee?eH          ejP        eH<    ee@eHdB           ejQ        eH<    ee=d'          ejR        eH<   eHjS        dC             ZT ejG        dD          ZUdAeU_I        eUJ                     eeCejV                             eUL                    eD            ejM        eU eeEejW        ejV                              ee?eU          ejP        eU<    ee@eUdE           ejQ        eU<    ee=d'          ejR        eU<    ejG        dF          ZXdAeX_I        eXJ                     eeCejY                             eXL                    eD            ejM        eX eeEejZ        ejY                              ee?eX          ejP        eX<    ee@eXdG           ejQ        eX<    ee=d'          ejR        eX<   dH Z[dI Z\dJ Z]dK Z^ ejG        dL          Z_e_`                    dM             ejO        e_e\            ejM        e_e[            ee^e_          ejP        e_<   e]ejQ        e_<    ee=dN          ejR        e_<   dO ZadP ZbdQ Zc ejG        dR          Zded`                    dS             ejO        edea            ejM        edec            ee^ed          ejP        ed<   ebejQ        ed<    ee=dN          ejR        ed<   dT ZedU ZfdV ZgdW ZhdX ZidY ZjdZ Zkd[ Zld\ Zm ejG        d]          ZnenL                    em            ejM        enei            ejO        enej           ekejP        en<   elejQ        en<    ee=dN          ejR        en<   ddd#d^d_Zod` Zpda ZqddbdcZrdd Zsde Ztdf Zudg Zv ejG        dh          ZwewL                    es           ewJ                    eq            ejM        ewer           diD ] Zx ejM        ew eerexb          exb           ! ejO        ewet           euejP        ew<   evejQ        ew<    ee=dN          ejR        ew<   dj Zydk Zzdl Z{dm Z|dn Z} ejG        do          Z~e~L                    ez            ejO        e~e{           e|ejP        e~<   e}ejQ        e~<    ejM        e~ eeyejN                              ee=dN          ejR        e~<   ddd#dpdqZdr Zds Zdt Z ej        du          Z ejM        ee           eL                    e            ee=dN          ejR        e<   dv ZeS                    e           dw Zeej        _        dx Zdy Zdz Zd{ Zd| Z ejG        d}          ZeJ                    e           e`                    e            ejM        ee           eejP        e<   eejQ        e<    ee=d'          ejR        e<   dS )z
Parallelization primitives.
    )annotations)Sequence)partialN)	tree_util)core)dtypes)sharding_impls)AxisNameShapedArrayraise_to_shaped)ad)batching)mlir)pxla)lax)slicing)ir)hlo)	lax_numpy)canonicalize_axismoveaxissafe_mapsafe_zipunzip2axis_index_groupsc               ~   t          |t          t          f          s|f}t          d |D                       r|t	          d          t          |           t          j        |           \  }}d |D             }t          |          }t          j
        |t          |          |d}t          j        ||          S )a,  Compute an all-reduce sum on ``x`` over the pmapped axis ``axis_name``.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  Inputs of boolean dtype are converted to integers before the reduction.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would perform psums over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once.

  Returns:
    Array(s) with the same shape as ``x`` representing the result of an
    all-reduce sum along the axis ``axis_name``.

  Examples:
    For example, with 4 XLA devices available:

    >>> x = np.arange(4)
    >>> y = jax.pmap(lambda x: jax.lax.psum(x, 'i'), axis_name='i')(x)
    >>> print(y)
    [6 6 6 6]
    >>> y = jax.pmap(lambda x: x / jax.lax.psum(x, 'i'), axis_name='i')(x)
    >>> print(y)
    [0.         0.16666667 0.33333334 0.5       ]

    Suppose we want to perform ``psum`` among two groups, one with ``device0`` and ``device1``, the other with ``device2`` and ``device3``,

    >>> y = jax.pmap(lambda x: jax.lax.psum(x, 'i', axis_index_groups=[[0, 1], [2, 3]]), axis_name='i')(x)
    >>> print(y)
    [1 1 5 5]

    An example using 2D-shaped x. Each row is data from one device.

    >>> x = np.arange(16).reshape(4, 4)
    >>> print(x)
    [[ 0  1  2  3]
     [ 4  5  6  7]
     [ 8  9 10 11]
     [12 13 14 15]]

    Full ``psum`` across all devices:

    >>> y = jax.pmap(lambda x: jax.lax.psum(x, 'i'), axis_name='i')(x)
    >>> print(y)
    [[24 28 32 36]
     [24 28 32 36]
     [24 28 32 36]
     [24 28 32 36]]

    Perform ``psum`` among two groups:

    >>> y = jax.pmap(lambda x: jax.lax.psum(x, 'i', axis_index_groups=[[0, 1], [2, 3]]), axis_name='i')(x)
    >>> print(y)
    [[ 4  6  8 10]
     [ 4  6  8 10]
     [20 22 24 26]
     [20 22 24 26]]
  c              3  @   K   | ]}t          |t                    V  d S N
isinstanceint.0axiss     U/var/www/html/nettyfy-visnx/env/lib/python3.11/site-packages/jax/_src/lax/parallel.py	<genexpr>zpsum.<locals>.<genexpr>t   ,      554D#		555555    N>axis_index_groups only supported for sums over just named axesc                    g | ]E}t          j        |          t          j        k    rt	          j        |t          j                  n|FS  )r   dtypenpbool_r   convert_element_typeint32)r$   ls     r&   
<listcomp>zpsum.<locals>.<listcomp>x   sW     C C C67v|A"(** $Q11101C C Cr)   axesr   )r!   tuplelistany
ValueError"_validate_reduce_axis_index_groupsr   tree_flatten_canonicalize_axis_index_groupspsum_pbindtree_unflattenx	axis_namer   leavestreedefout_flats         r&   psumrF   1   s    B 
It}	-	- I55955555 W:K:W
U
V
VV$%6777*1--/&'C C;AC C C&56GHH[E)$$8IK K K(		!'8	4	44r)   c               |    t          | ||          } t          d||          t          j        fd|           S )aY  Compute an all-reduce mean on ``x`` over the pmapped axis ``axis_name``.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would perform pmeans over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once, and on TPUs all groups must be the same size.

  Returns:
    Array(s) with the same shape as ``x`` representing the result of an
    all-reduce mean along the axis ``axis_name``.

  For example, with 4 XLA devices available:

  >>> x = np.arange(4)
  >>> y = jax.pmap(lambda x: jax.lax.pmean(x, 'i'), axis_name='i')(x)
  >>> print(y)
  [1.5 1.5 1.5 1.5]
  >>> y = jax.pmap(lambda x: x / jax.lax.pmean(x, 'i'), axis_name='i')(x)
  >>> print(y)
  [0.        0.6666667 1.3333334 2.       ]
  )rB   r      c                    | z  S r   r,   )vns    r&   <lambda>zpmean.<locals>.<lambda>   s    a!e r)   )rF   r   tree_map)rA   rB   r   rK   s      @r&   pmeanrN      sH    : 1	5FGGG!
1	5FGGG!		OOOOQ	/	//r)   c               L   t          |t          t          f          s|f}t          d |D                       r|t	          d          t          |           t          j        |           \  }}t          |          }t          j
        |||d}t          j        ||          S )a#  Compute an all-reduce max on ``x`` over the pmapped axis ``axis_name``.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would perform pmaxes over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once, and on TPUs all groups must be the same size.

  Returns:
    Array(s) with the same shape as ``x`` representing the result of an
    all-reduce max along the axis ``axis_name``.
  c              3  @   K   | ]}t          |t                    V  d S r   r    r#   s     r&   r'   zpmax.<locals>.<genexpr>   r(   r)   Nr*   r4   )r!   r6   r7   r8   r9   r:   r   r;   r<   pmax_pr>   r?   r@   s         r&   pmaxrR          & 
It}	-	- I55955555 W:K:W
U
V
VV$%6777*1--/&'56GHH[&y+<> > >(		!'8	4	44r)   c               L   t          |t          t          f          s|f}t          d |D                       r|t	          d          t          |           t          j        |           \  }}t          |          }t          j
        |||d}t          j        ||          S )a"  Compute an all-reduce min on ``x`` over the pmapped axis ``axis_name``.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would perform pmins over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once, and on TPUs all groups must be the same size.

  Returns:
    Array(s) with the same shape as ``x`` representing the result of an
    all-reduce min along the axis ``axis_name``.
  c              3  @   K   | ]}t          |t                    V  d S r   r    r#   s     r&   r'   zpmin.<locals>.<genexpr>   r(   r)   Nr*   r4   )r!   r6   r7   r8   r9   r:   r   r;   r<   pmin_pr>   r?   r@   s         r&   pminrW      rS   r)   c                    t          |t          t          f          rt          d|           t	          | t          | |          |          S Nz(pargmin only accepts a single axis, got )r!   r6   r7   	TypeError_axis_index_of_valrW   rA   rB   s     r&   pargminr]      L    	E4=)) L
JyJJ
K
KK	AtAy119	=	==r)   c                    t          |t          t          f          rt          d|           t	          | t          | |          |          S rY   )r!   r6   r7   rZ   r[   rR   r\   s     r&   pargmaxr`      r^   r)   c           	         t          |          }t          j        || k    |t          j        t          j        |                    j                  }t          ||          S r   )
axis_indexr   wherer   iinfor-   maxrW   )rA   valrB   idxvalidxs        r&   r[   r[      sM    9#?3!8S&,v|C7H7H*I*I*MNN&	fi	 	  r)   c                    | d S t          t          d | D                                 }d | D             t          |          k    rt          d          d S )Nc              3  4   K   | ]}t          |          V  d S r   len)r$   groups     r&   r'   z5_validate_reduce_axis_index_groups.<locals>.<genexpr>   s(      CCUCCCCCCr)   c                    h | ]	}|D ]}|
S r,   r,   )r$   gis      r&   	<setcomp>z5_validate_reduce_axis_index_groups.<locals>.<setcomp>   s%    ...AA..qa....r)   z5axis_index_groups must cover all indices exactly once)rangesumsetr9   )r   
axis_spaces     r&   r:   r:      sj    
FSCC1BCCCCCDD*.."...#j//AA
L
M
MM BAr)   c                N    | d S t          t          t           |                     S r   )r6   mapr   s    r&   r<   r<      s&    
F	s5+,,	-	--r)   c                `    t          j        t          t          j        ||          |           S )aM  Perform a collective broadcast and replicate from ``source``.

  This is equivalent to
  ```
  def pbroadcast(x, axis_name, source):
    masked = jnp.where(axis_index(axis_name) == source, x, zeros_like(x))
    return psum(masked, axis_name)
  ```
  but implemented in a hardware optimized way.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  This function is an analog of the CollectiveBroadcast HLO.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    source: int, representing which index into ``axis_name`` that should be copied.

  Returns:
    Array(s) with ``x`` being copied from the ``source`` index slice of ``axis_name``.
  )rB   source)r   rM   r   pbroadcast_pr>   )rA   rB   ry   s      r&   
pbroadcastr{      s3    2 
	l9VDDDa
I 
I Ir)   c                    t          j        t          t          j        |t          t          t
          |                              |           S )a|  Perform a collective permutation according to the permutation ``perm``.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  This function is an analog of the CollectivePermute HLO.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    perm: list of pairs of ints, representing
      ``(source_index, destination_index)``
      pairs that encode how the mapped axis named ``axis_name`` should be
      shuffled. The integer values are treated as indices into the mapped axis
      ``axis_name``. Any two pairs should not have the same source index or the
      same destination index. For each index of the axis ``axis_name`` that does
      not correspond to a destination index in ``perm``, the corresponding
      values in the result are filled with zeros of the appropriate type.

  Returns:
    Array(s) with the same shape as ``x`` with slices along the axis
    ``axis_name`` gathered from ``x`` according to the permutation ``perm``.
  rB   perm)r   rM   r   
ppermute_pr>   r6   rw   rA   rB   r~   s      r&   ppermuter     sI    2 
	joUD))**, , ,-.
0 
0 0r)   c                &   t          |          t          t          t          |                              k    rt          d|           t	          | |t          t          |t          t          |                                                  S )a0  Convenience wrapper of jax.lax.ppermute with alternate permutation encoding

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    perm: list of ints encoding sources for the permutation to be applied to
      the axis named ``axis_name``, so that the output at axis index i
      comes from the input at axis index perm[i]. Every integer in [0, N) should
      be included exactly once for axis size N.

  Returns:
    Array(s) with the same shape as ``x`` with slices along the axis
    ``axis_name`` gathered from ``x`` according to the permutation ``perm``.
  z)`perm` does not represent a permutation: )rt   rr   rl   r9   r   r7   zipr   s      r&   pshuffler   4  st    & 	YY#eCII&&''''
GGG
H
HH	!YSuSYY/?/?%@%@ A A	B	BBr)   c               *    t          | ||||          S )a  Swap the pmapped axis ``axis_name`` with the unmapped axis ``axis``.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  The group size of the mapped axis size must be equal to the size of the
  unmapped axis; that is, we must have
  ``lax.psum(1, axis_name, axis_index_groups=axis_index_groups) == x.shape[axis]``.
  By default, when ``axis_index_groups=None``, this encompasses all the devices.

  This function is a special case of ``all_to_all`` where the pmapped axis of
  the input is placed at the position ``axis`` in the output. That is, it is
  equivalent to ``all_to_all(x, axis_name, axis, axis)``.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    axis: int indicating the unmapped axis of ``x`` to map with the name
      ``axis_name``.
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would run pswapaxes over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once, and all groups must be the same size.

  Returns:
    Array(s) with the same shape as ``x``.
  r   
all_to_all)rA   rB   r%   r   s       r&   	pswapaxesr   L  s    : 
Ay$@Q	R	R	RRr)   F)r   tiledc               d    t                    ||ffd	}t          j        ||           S )a  Materialize the mapped axis and map a different axis.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  In the output, the input mapped axis ``axis_name`` is materialized at the
  logical axis position ``concat_axis``, and the input unmapped axis at position
  ``split_axis`` is mapped with the name ``axis_name``.

  The group size of the mapped axis size must be equal to the size of the
  unmapped axis; that is, we must have
  ``lax.psum(1, axis_name, axis_index_groups=axis_index_groups) == x.shape[axis]``.
  By default, when ``axis_index_groups=None``, this encompasses all the devices.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    split_axis: int indicating the unmapped axis of ``x`` to map with the name
      ``axis_name``.
    concat_axis: int indicating the position in the output to materialize the
      mapped axis of the input with the name ``axis_name``.
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would run all_to_all over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once, and all groups must be the same size.
    tiled: when True, all_to_all will divide split_axis into chunks and concatenate
      them along concat_axis. In particular, no dimensions are added or removed.
      False by default.

  Returns:
    When tiled is False, array(s) with shape given by the expression::

      np.insert(np.delete(x.shape, split_axis), concat_axis, axis_size)

    where ``axis_size`` is the size of the mapped axis named ``axis_name`` in
    the input ``x``, i.e. ``axis_size = lax.psum(1, axis_name)``.

    Otherwise array with shape similar to the input shape, except with split_axis
    divided by axis size and concat_axis multiplied by axis size.
  c           	     (   t          d          }r9| j        |         |z  dk    r$t          d| j        |          d d| d          n|| j        |         k    r0d}t          |                    || j        |                             ||k     r|dz  }t	          j        | |f          } n"||k    rnt	          j        | |f          } |dz  }t                              | ||	          }s||k    rt	          j        ||f          }|S )
NrH   r   r   z#The size of all_to_all split_axis (z4) has to be divisible by the size of the named axis z ()z|all_to_all requires the size of the mapped axis axis_name to equal x.shape[split_axis], but they are {} and {} respectively.)
split_axisconcat_axisrB   r   r   )	rF   shaper9   formatr   expand_dimsall_to_all_pr>   squeeze)	rA   r   r   
group_sizemsgresultr   rB   r   s	         r&   r>   zall_to_all.<locals>.bind  sm   a6GHHHJ 	
	z	)Q	.	. 7qwz?R 7 7%7 7)37 7 7 8 8 	8 
/
 
qwz*	*	*QJ
0CDDEEE	k	!	!qOA~..$$OA~..a
qZ[)21B%*  , ,F  2Z;..{6J=11fMr)   )r<   r   rM   )rA   rB   r   r   r   r   r>   s    `  `` r&   r   r   k  sY    T 66GHH#        8 
	D!	$	$$r)   c                8    t                               |           S )a	  Return the index along the mapped axis ``axis_name``.

  Args:
    axis_name: hashable Python object used to name the mapped axis.

  Returns:
    An integer representing the index.

  For example, with 8 XLA devices available:

  >>> from functools import partial
  >>> @partial(jax.pmap, axis_name='i')
  ... def f(_):
  ...   return lax.axis_index('i')
  ...
  >>> f(np.zeros(4))
  Array([0, 1, 2, 3], dtype=int32)
  >>> f(np.zeros(8))
  Array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int32)
  >>> @partial(jax.pmap, axis_name='i')
  ... @partial(jax.pmap, axis_name='j')
  ... def f(_):
  ...   return lax.axis_index('i'), lax.axis_index('j')
  ...
  >>> x, y = f(np.zeros((4, 2)))
  >>> print(x)
  [[0 0]
  [1 1]
  [2 2]
  [3 3]]
  >>> print(y)
  [[0 1]
  [0 1]
  [0 1]
  [0 1]]
  rB   )axis_index_pr>   r   s    r&   rb   rb     s    J 
		Y		/	//r)   r5   int | AxisNamec                    t          |t          t          f          s|f}t                              | |t          |                    S )z>Uses the last positional axis of idx to index into src's axes.r5   )r!   r6   r7   	pgather_pr>   )srcrg   r5   s      r&   pgatherr     s=    	D5$-	(	( 7D	SuT{{	3	33r)   pnamestrparamscore.ParamDictsubstcore.AxisSubsttraverseboolreturnc                    ||          }t          |t          t          f          s|f}t          |          }t	          fd|D             d          || <   |S )Nc              3  ^   K   | ]'}t          |t                    r|fn
 |          V  (d S r   r    )r$   namer   s     r&   r'   z,_subst_all_names_in_param.<locals>.<genexpr>  sT       . . #-T3"7"7HwwUU4[[ . . . . . .r)   r,   )r!   r6   r7   dictrs   )r   r   r   r   rB   r   s     `   r&   _subst_all_names_in_paramr     sw    Um)	It}	-	- I<<& . . . .#,. . . &- 
-r)   c                   |t          d          d t          ||          D             }g g fx\  }}}g g fx\  }	}
}t          t          ||                    D ]V\  }\  }}||t          j        u                              |           ||t          j        u                              |           Wd gt          |          z  }|r8 |d|          \  }} | j        ||d d}t          |
|          D ]
\  }}|||<   |r8 |d|          \  }} | j        ||d d}t          |	|          D ]
\  }}|||<   t          d |D                       sJ |S )NzSaxis_index_groups not supported in vmap collectives. Please open a feature request!c                b    g | ],\  }}|t           j        u s|d k    r|nt          |d |          -S r   )r   
not_mapped	_moveaxis)r$   rf   ds      r&   r3   z6_reduction_with_positional_batcher.<locals>.<listcomp>  sQ     2 2 2a ,,,QSSIaC<P<P 2 2 2r)   r   r4   c              3     K   | ]}|d uV  	d S r   r,   )r$   rJ   s     r&   r'   z5_reduction_with_positional_batcher.<locals>.<genexpr>  s&      --qQd]------r)   )	NotImplementedErrorr   	enumerater   r   appendrl   r>   all)primvals_indims_inr   transform_unmappedtransform_mappedmapped_vals_inunmapped_vals_inpartitioned_vals_inmapped_idxsunmapped_idxspartitioned_idxsrp   rf   r   vals_outunmapped_axesunmapped_vals_outmapped_axesmapped_vals_outs                       r&   "_reduction_with_positional_batcherr     s   "
 ? @ @ @2 2w002 2 2';=r6A"."%824b&8+}/s7G4455 9 9ka#qX00188===Q(--.55a8888Vc'll"( &8&8<L&M&M#M#!	#3-[_```m%677  3hqkk "2"21n"E"EKdikUYZZZOk?33  3hqkk	--H---	-	----	/r)   c          	         | j         sJ t          d D                       s | j        ||d|fS t          | |||fdfd          }|d |D             fS )Nc              3  @   K   | ]}t          |t                    V  d S r   r    r#   s     r&   r'   z%_reduction_batcher.<locals>.<genexpr>  ,      44tZc""444444r)   r4   c                    |fS r   r,   r   	d_vals_inr5   s     r&   rL   z$_reduction_batcher.<locals>.<lambda>  s    D), r)   c                @     t           fdD                       |fS )Nc              3  X   K   | ]$}t          |t                    r	||k    z   n|V  %d S r   r    )r$   r%   r   s     r&   r'   z7_reduction_batcher.<locals>.<lambda>.<locals>.<genexpr>  sV       "4 "4&* 9C48M8M"W$$!)"4"4SW "4 "4 "4 "4 "4 "4r)   r6   r   s   ` r&   rL   z$_reduction_batcher.<locals>.<lambda>  s@    E "4 "4 "4 "4.2"4 "4 "4 4 4%' r)   c                2    g | ]}|t           j        u r|nd S r   )r   r   )r$   r   s     r&   r3   z&_reduction_batcher.<locals>.<listcomp>  s)    JJJQh111AAqJJJr)   )multiple_resultsr8   r>   r   )r   r   r   r5   r   r   s      `  r&   _reduction_batcherr     s    			44t444	4	4 X49gD<MNNNPWWW/
GW/,,,,' ' ' '( (( 
JJ'JJJ	JJr)   c	           
         | j         sJ v sJ t          | |||fdfd          }	|	t          j        gt	          |	          z  fS )Nc                X    t          fdD                       fd|D             fS )Nc              3  (   K   | ]}|k    |V  d S r   r,   r$   r%   
frame_names     r&   r'   zB_batched_reduction_collective.<locals>.<lambda>.<locals>.<genexpr>%  s-      !N!N44:;M;M$;M;M;M;M!N!Nr)   c                (    g | ]} |          S r,   r,   )r$   rJ   	axis_sizeif_unmappeds     r&   r3   zC_batched_reduction_collective.<locals>.<lambda>.<locals>.<listcomp>&  s%    JJJ1[[I66JJJr)   r   )r   r   r5   r   r   r   s     r&   rL   z/_batched_reduction_collective.<locals>.<lambda>%  sD    E!N!N!N!N4!N!N!NNNJJJJJ	JJJL r)   c                B     t           fdD                       |fS )Nc              3  h   K   | ],}t          |t                    r	||k    z   n	|k    r|nV  -d S r   r    )r$   r%   r   r   s     r&   r'   zB_batched_reduction_collective.<locals>.<lambda>.<locals>.<genexpr>'  si       "4 "4 '+ 9C48M8M #$$$!)"4"4*.**<*<$$"#"4 "4 "4 "4 "4 "4r)   r   )r   r   r5   r   s   ` r&   rL   z/_batched_reduction_collective.<locals>.<lambda>'  sI    E "4 "4 "4 "4 "4 /3"4 "4 "4 4 4 &	' r)   )r   r   r   r   rl   )
r   r   r   r   _r   r   r5   r   r   s
    ```   `  r&   _batched_reduction_collectiver     s     
		t				 0
GW/L L L L L L L' ' ' ' '	( (( 
H'(3x==8	88r)   c                R    t          j        | |          }fd|D             }|S )Nc                8    g | ]D ]}fd |D             S )c                     g | ]
}|         S r,   r,   )r$   rp   
axis_groups     r&   r3   z._replica_groups.<locals>.<listcomp>.<listcomp>1  s    ???z!}???r)   r,   )r$   axis_index_groupr   r   s     @r&   r3   z#_replica_groups.<locals>.<listcomp>1  s^     A A A$.?A A* @???.>??? A A A Ar)   )r   axis_groups)axis_envrB   r   replica_groupss     ` r&   _replica_groupsr   .  sR    #Hi88."A A A A(6A A AN 
r)   r   Sequence[Sequence[int]]ir.DenseElementsAttrc           	         t          j        t          t          j        | ddi          t           j                  j        }t          j        	                    t          j
        |                    S )N	fillvaluer-   )r.   arrayr7   	itertoolszip_longestint64Tr   DenseIntElementsAttrgetascontiguousarray)r   groupss     r&   _replica_groups_hlor   6  sa     8D.M"MMNN($ $ $$% 			 	$	$R%9&%A%A	B	BBr)   c               b     |J t          d D                       sJ  fd|D             S )Nc              3  @   K   | ]}t          |t                    V  d S r   r    r#   s     r&   r'   z"_allreduce_impl.<locals>.<genexpr>?  r   r)   c                (    g | ]} |          S r,   r,   )r$   argr5   pos_reducers     r&   r3   z#_allreduce_impl.<locals>.<listcomp>@  s%    	1	1	1S++c4
 
 	1	1	1r)   )r   )r   r5   r   argss   ``  r&   _allreduce_implr  =  sP    		"	"	"	44t444	4	4444	1	1	1	1	1D	1	1	11r)   c                    t          d | D                       }t          d | D                       |%t                    dk    rt          d|            fd|D             }|d |D             fS )Nc              3  D   K   | ]}t          |t                    |V  d S r   r    r#   s     r&   r'   z5_allreduce_effectful_abstract_eval.<locals>.<genexpr>C  s1      HHd*T32G2GHTHHHHHHr)   c              3  D   K   | ]}t          |t                    |V  d S r   r    r#   s     r&   r'   z5_allreduce_effectful_abstract_eval.<locals>.<genexpr>D  s1      BBDJtS,A,AB4BBBBBBr)   r   zMaxis_index_groups can only be used with reductions over named axes, but got: c           	     |    g | ]8}t          t          j        t          |                     |j                  9S )r   )r   r   _reduce_op_shape_ruler   r-   )r$   r   pos_axess     r&   r3   z6_allreduce_effectful_abstract_eval.<locals>.<listcomp>I  sR     . . .!$ #+OC,@,@xPPP) . . .r)   c                6    h | ]}t          j        |          S r,   )r   NamedAxisEffectr#   s     r&   rq   z5_allreduce_effectful_abstract_eval.<locals>.<setcomp>L  s#    GGGDT)$//GGGr)   )r6   rl   r9   )r5   r   r   
named_axes	out_avalsr  s        @r&   "_allreduce_effectful_abstract_evalr  B  s    HHdHHHHH*BBDBBBBB("
8}} 6/36 6 7 7 7. . . .(,. . .) 
GGJGGG	GGr)   c                   |Mdj         j        v r?t          |d                   t          fd|D                       rt	          d          g g fx\  }}|D ]0}|t          |t                                                 |           1r3t          j	        |d          fd}	t          |	j        |          }|s|S t          t          j         j        ||                    j         j        }
t          |
t           j        t           j        f           fdfd	t'          j        |          D             S )
Ntpur   c              3  >   K   | ]}t          |          k    V  d S r   rk   )r$   ro   len_0s     r&   r'   z&_allreduce_lowering.<locals>.<genexpr>Q  s-      
6
6q3q66U?
6
6
6
6
6
6r)   z<axis_index_groups must all be the same size for TPU loweringFr   c                   |                      t          j        t          j        | j        t          j                                      }                    d | g|g          } ||t                              \  }|S )Nr   r   	primitiveavals_in	avals_outr   )updater.   deleter   r   r   replacer6   )avalr   aval_outreducer_ctxoutctxpositional_axesreducers        r&   _positional_reducez/_allreduce_lowering.<locals>._positional_reduceY  s    	"(4:RX>>>)+ +  , ,h KK$$H:KVVkW[#E/,B,BCCCdcjr)   c                   rpj                                         }t          t          j                            |t          j                  t          j	                            d                    }ni }t          j
        |j        |fdi|}t          j        d| j                  }t          j        |          }|j        d         j                            ||          }t          j        |          5  t          j        j        d          }                    d |gdz  |g	          }	 ||	g|j        R  }
t          j        t          j        |
                     d d d            n# 1 swxY w Y   |j        S )
NTchannel_handleuse_global_device_idsr   r,   r   Fr     r  )module_contextnew_channelr   r   ChannelHandler   r   DEVICE_TO_DEVICE_TYPEr   BoolAttrAllReduceOptyper   r   r-   aval_to_ir_typeregionsblocksr   InsertionPoint	lower_funr>   r  	argumentsreturn_flatten_ir_valuesr   )r  rA   channel
other_argsopscalar_avalscalar_typereducer_blocklower_reducerr  	out_nodesr  is_spmdr   r   s              r&   
all_reducez'_allreduce_lowering.<locals>.all_reducem  s    "..00g*..t13 3 " 5 57 7 7jj
 j		
@ 
@"0
@4>
@ 
@B"2tz22K&{33KJqM(//[IIM		=	)	) 5 5nTYGGGmKK$*5):{m   U Uk-Fm.EFFFi	k$(334445 5 5 5 5 5 5 5 5 5 5 5 5 5 5 9s   8A-E11E58E5c                .    g | ]\  }} ||          S r,   r,   )r$   r  rA   r@  s      r&   r3   z'_allreduce_lowering.<locals>.<listcomp>  s)    	E	E	E'$**T1

	E	E	Er)   )r(  	platformsrl   r8   r9   r!   r"   r   r   r3  rw   r  r   r   r   axis_contextr	   SPMDAxisContextShardingContextr   )r   pos_fnr  r5   r   r   r
  axes_partitionr%   r"  rC  r@  r?  r  r   r!  r   s   ` `        @@@@@@r&   _allreduce_loweringrH  N  s   "1C1M(M(M!!$%%E

6
6
6
6$5
6
6
666 WUVVV13R7*o 7 7d:dC(()006666 	7nVe<<<G       !3<66D	 K&c(1:') )* *. #0,%~'EF '
       , 
F	E	E	ESt-D-D	E	E	EEr)   c               B   g g fx\  }}|D ]0}|t          |t                                                 |           1rfd}t          || |          } t	          j        |           \  }}	t          j        |t          |          |d}
t	          j	        |	|
          S )Nc                    t          j        |          sJ t          |           t           j        u rt          j        |j                  S t          j        | |          d         S )Nr   r   )r   is_undefined_primalr.  Zeror  r   _reduce_sum_transpose_rule)ctr   r  s     r&   broadcast_positionalz2_psum_transpose_rule.<locals>.broadcast_positional  s[    #C(((((	bRW		RWSX%6%66+B(CCCAFFr)   r4   )
r!   r"   r   rw   r   r;   r=   r>   r6   r?   )ctsr5   r   r   r
  rG  r%   rO  nonzero_out_ctsrD   nonzero_in_ctsr  s              @r&   _psum_transpose_rulerS    s    *,b&0*h 7 7d:dC(()006666 /G G G G G "C
.
.C '3C88/7;eJ6G6G1BD D D.		!'>	:	::r)   rF   Tc                    || z  S r   r,   rJ   r   s     r&   rL   rL     s    iRSm r)   c                   t          d |D                       rg g fx\  }}| D ]0}|t          |t                                                 |           1fd|rJ t	          |d                   nt          j        d |D                       t          fd|D                       S t          j	        j
        t          g|R | |dS )Nc              3  L   K   | ]}t          |t          j                   V   d S r   )r!   r   Tracer)r$   rA   s     r&   r'   zpsum_bind.<locals>.<genexpr>  s1      66AZ4;''	'666666r)   c                P     s S t          j          fdD                       S )Nc           
     N    g | ]!}t          |t          d d                    "S )ndimr   )r   getattr)r$   r%   rA   s     r&   r3   z1psum_bind.<locals>.pos_reduce.<locals>.<listcomp>  s?     !7 !7 !7%) "34FA9N9N!O!O !7 !7 !7r)   )r   _reduce_sum)rA   r  s   `r&   
pos_reducezpsum_bind.<locals>.pos_reduce  sM     _Q !7 !7 !7 !7-5!7 !7 !7 8 8 8r)   r   c                @    g | ]}t          j        |          j        S r,   )r   
axis_framesize)r$   r   s     r&   r3   zpsum_bind.<locals>.<listcomp>  s%    JJJt--2JJJr)   c              3  Z   K   | ]%}t          j        |           |          z  V  &d S r   )r   _const)r$   rA   r^  ra  s     r&   r'   zpsum_bind.<locals>.<genexpr>  s<      CCAt$$zz!}}4CCCCCCr)   r4   )r   r!   r"   r   rl   mathprodr6   r   AxisPrimitiver>   r=   )	r5   r   r   r
  rG  r%   r  r^  ra  s	         @@@r&   	psum_bindrg    s2   6666666 D,.F2J> 9 9Zc**+22488888 8 8 8 8
 $"1%&&ddYJJzJJJKKdCCCCCdCCCCCC			 
E
E 
E2C
E 
E 
E Er)   rR   c                    | S r   r,   rU  s     r&   rL   rL         a r)   rW   c                    | S r   r,   rU  s     r&   rL   rL     ri  r)   c               H   t          | j        j        |d           }t          |d                   t	          fd|D                       \  }}t          |          t          t          |                    k    r-t          |          t          t          |                    k    s$d}t          |                    |                    t          j	        t          |          t          |          dft          j
                  }t          |          D ]H\  }	}
t          |
          }
t          |          D ]$\  }\  }}|
|         ||	|df<   |
|         ||	|df<   %I|                    d          }| j        j        }t          |t           j                  o|j        }|rR| j                                        }t)          t*          j                            |t0          j                            }ni }t+          j        |t1          j        |          fi |j        S )Nr   c              3  2   K   | ]\  }}|z  |z  fV  d S r   r,   )r$   r   dstr   s      r&   r'   z%_ppermute_lowering.<locals>.<genexpr>  s6      PPxsCsZ'z)9:PPPPPPr)   z9ppermute sources and destinations must be unique, got {}.r'  rH   )r   r'  r%  )r   r(  r   rl   r   rt   r9   r   r.   zerosr   r   sortedreshaperC  r!   r	   rD  manual_axesr)  r   r   r*  r   r   r+  CollectivePermuteOpdense_int_elementsresults)r  rA   rB   r~   r   srcsdstsr   	full_permrp   grpjr   rm  rC  	is_manualr7  r8  r   s                     @r&   _ppermute_loweringr|    s   "3#5#>	4PP.>!$%%*PPPP4PPPPP*$
d))s3t99~~
%
%#d))s3t99~~*E*E
EC
SZZ%%
&
&&hN++SYY:BHEE).)) $ $fa
++C"4 $ $:Cs8i1as8i1a$ (()#0,~=>> #

"    ,,..G(,,Wd6PQQS S SJJ J		 	 	+	+
; 
;/9
; 
;;BCr)   c                    t          |          \  }}t          t          ||                    }t          | ||          gS )Nr}   )r   r7   r   r   )trA   r~   rB   rv  rw  inverse_perms          r&   _ppermute_transpose_ruler    s>    d||*$c$oo&&,
1	
=
=
=	>>r)   c                   ||c\  }\  }t          |t          t          f          s|f}t          fd|D                       }	| dk    r!|	rt                              |||	          |fS |	rt          d          |d         k    s
J d            t          |          | k    s
J d            |t          j        u r||fS t          j
        | t                    }
|D ]
\  }}||
|<   t          j        ||
|          |fS )	Nc              3  (   K   | ]}|k    |V  d S r   r,   r   s     r&   r'   z$_ppermute_batcher.<locals>.<genexpr>  -      JJ$tz7I7I7I7I7I7IJJr)   rH   )r~   rB   z,ppermute batcher only supports a single axisr   z*ppermute batcher called with a wrong axis!z(Permutation doesn't match the axis size!r   )r!   r6   r7   r   r>   r   rl   r   r   r.   ro  r"   r   take)r   r   r   r   r   rB   r~   rJ   r   remaining_axesperm_indicesr   rm  s    `           r&   _ppermute_batcherr    s5   *$1	It}	-	- IJJJJ)JJJJJ.!^^^??14>?BBAEE N
L
M
MM	1	#	#	#%Q	#	#	#	Ti			!K			(
a4K)3///,  hc3L	<	+	+Q	..r)   c                >     | j         |i || j        r|n|d         fS Nr   )r>   r   )r   r   dimsr   s       r&   _collective_batcherr    s.    	D	#F	#	#T-B%OTTQ	OOr)   r   c                     t          |           S r   r   rA   r   s     r&   rL   rL     s    1C1C r)   rB   c                    t          |          |k    }t          | |          }t          j        ||t          j        |                     gS r   )rb   rF   r   rc   
zeros_like)r~  rA   ry   rB   	is_sourcetsums         r&   _pbroadcast_transpose_ruler    sE    ##v-)	a		$
/)T9+?+B+B
C
C	DDr)   c                   ||c\  }\  }t          |t          t          f          s|f}t          fd|D                       }	|	rt          d          |d         k    s
J d            |dk    r|| k     s
J d            | dk    r!|	rt                              |||	          |fS |t          j        u r||fS t          j	        ||g| z  |          |fS )Nc              3  (   K   | ]}|k    |V  d S r   r,   r   s     r&   r'   z&_pbroadcast_batcher.<locals>.<genexpr>  r  r)   z.pbroadcast batcher only supports a single axisr   z,pbroadcast batcher called with a wrong axis!z2collective broadcast doesn't fit in the axis size!rH   )ry   rB   )
r!   r6   r7   r   rz   r>   r   r   r   r  )
r   r   r   r   r   rB   ry   rJ   r   r  s
    `        r&   _pbroadcast_batcherr    s   *$1	It}	-	- IJJJJ)JJJJJ. P
N
O
OO	1	#	#	#%S	#	#	#	1)+++-a+++!^^^QvHH!KK(
a4K	F8i/	3	3Q	66r)   c                   t          | j        j        |d           }fdfd|D             }| j                                        }t	          j        |t          |                    j        S )Nc                |    |          gt          | d                    z   t          | dz   d                    z   S )NrH   )r7   )rm   ry   s    r&   source_to_frontz-_pbroadcast_lowering.<locals>.source_to_front,  s>    &M?T%.111Dvz{{9K4L4LLLr)   c                &    g | ]} |          S r,   r,   )r$   rm   r  s     r&   r3   z(_pbroadcast_lowering.<locals>.<listcomp>.  s#    GGGuOOE**GGGr)   )r   )r   r(  r   r)  r   CollectiveBroadcastOpr   ru  )r  rA   rB   ry   r   r7  r  s      `  @r&   _pbroadcast_loweringr  *  s    "3#5#>	4PP.M M M M MGGGGGGG.**,,'		"+N;;
= 
= 
==DEr)   r{   c                     t          |           S r   r  r  s     r&   rL   rL   4  s    ?13E3E r)   c                      fdt          |j                  D             }|                    |            t          j        ||          S )Nc                     g | ]
}|k    |S r,   r,   )r$   rp   r   s     r&   r3   z_moveaxis.<locals>.<listcomp>=  s    	/	/	/a3hh!hhhr)   )rr   r[  insertr   	transpose)r   rm  rA   r~   s   `   r&   r   r   <  sJ    	/	/	/	/U16]]	/	/	/$++c3	q$		r)   c                    t          |j                  }||          |z  dk    sJ ||          |f            |||          |z  g|| | dz   <   |                    |          S Nr   rH   r7   r   rq  )r%   factorrA   	new_shapes       r&   
_splitaxisr  A  sl    17mm)	46	!Q	&	&	&4&(A	&	&	&"IdOv$=>)DaK	
9		r)   c                    t          |j                  }|j        |          |j        | dz            z  g|| | dz   <   |                    |          S )NrH   r'  r  )r%   rA   r  s      r&   	_foldaxisr  G  sL    17mm)GDMAGD1H,==>)DaK	
9		r)   c                   t          |           }||S t          j        |                                          }|                                t          |d                   z  }t          j        t          j	        ||d          dg          S r  )
rb   r.   r   flattenargsortrl   r   r   r   dynamic_slice_in_dim)rB   r   cur_device_idflat_groupsdevice_id_to_idxs        r&   _index_in_groupr  L  s    Y''-*++3355+ ((**S1B11E-F-FF	"#3]AFF
M 
M Mr)   c          	        ~t          | j        j        ||          }t          |d                   dk    r|gS t          |d                   t	          fd|D                       st          d          t          | j        j        t          j	        t          j
        f          }|rT| j                                        }	t          j                            |	t          j                  }
t#          |
          }ni }t          j        |ft          j        |          t          j        |          t          j                  t)          |          d|j        S )Nr   rH   c              3  >   K   | ]}t          |          k    V  d S r   rk   )r$   ro   split_counts     r&   r'   z'_all_to_all_lowering.<locals>.<genexpr>a  s.      ;;q[CFF";;;;;;r)   z$Replica groups must be equally sizedrn  )split_dimensionconcat_dimensionr  r   )r   r(  r   rl   r   r9   r!   rC  r	   rD  rE  r)  r   r*  r   r   r+  r   
AllToAllOpi64_attrr   ru  )r  rA   r   r   rB   r   r   r   r?  r7  r%  r8  r  s               @r&   _all_to_all_loweringr  W  sc    "3#5#>	#46 6.	q  3JN1%&&+	;;;;N;;;	;	; =
;
<
<<	%%~'EF '    ,,..G&**7D4NOON^444JJJ	
mJ//}[11-,,(88
 
 	
 
 r)   c                .    t          | |||||          fS NrB   r   r   r   r   r   )rP  rA   rB   r   r   r   r   s          r&   _all_to_all_transpose_ruler  x  s2     	)   
 r)   c               v    | \  }|\  }t                               |||||k    z   |||k    z   ||          }	|	|fS r  )r   r>   )
r   r   rB   r   r   r   r   rA   r   r   s
             r&   _all_to_all_batcherr    s[    "!"!qJ/k!12)   & 
r)   c
           	        |t          d          |\  }
|\  }|t          j        u r t          j        |
| g|
j        R           }
d}t          |t          t          f          r-|	                    |          }|d |         ||dz   d          }}nd\  }}|s|s||k    rJ|||k    z   }|}t          || |
          }
|||k    z  }t          |t          |
||f||f                    |fS t          |t          |||
                    }t          || |          |fS t          j        t          |d|
          d          d}}
|dz  }|dz  }|r t                              |
||d||	          }
t          || |
          }
|}|||k    z  }|dz  }|r t                              |
||d||	          }
t          dt          d|
                    }
|dz  }|dz  }|dz  }t          |dz
  t          d|dz
  |
                    }
|dz  }|
|fS )	NPlease open a feature request!r   rH   )r,   r,   )r   r'     r  r'  )r   r   r   r   	broadcastr   r!   r7   r6   indexr  r  r   r   r   r   r>   )r   r   r   r   r   rB   r   r   r   r   rA   r   pos
major_axes
minor_axesr%   d_pre_splitx_concatnew_ds                      r&   _all_to_all_batched_collectiver    s    "
>
?
??"!"!(
 	a).ag..//A	A	D%=)) $
//*
%
%C&tt_ia.A
JJ#J
	 	EJ 	E[  1
?+dk
T9a
(
(aDAIatXa!TT1I>>??LL;	![!(D(DEEh
Ix88*DD 
1a++V	4	4aQ!/*;!#; '!z%/Q,= % 	 	' 	'A
 Y**!
%*+,+/* '!z%/Q,= % 	 	' 	'A 9Q??##!/*;!#;UaZUa1kAoq!A!ABB!1*%	
E/r)   c                   ~t          |t          t          f          s|f}t          |           }t          |j                  }|t          d|          nt          |d                   }||         |z  dk    sJ ||         |f            ||xx         |z  cc<   ||xx         |z  cc<   |                    t          |          d          }	h t          t          j
        |          }
|	|
fS )NrH   r   F)r   	weak_type)r!   r7   r6   r   r   rF   rl   r  rw   r   r	  )rA   rB   r   r   r   r   
input_avalr   r   out_avaleffectss              r&   #_all_to_all_effectful_abstract_evalr    s    	Ie}	-	- Iq!!*
z
 
 %$5$=d1i   3GXYZG[C\C\)	z	Y	&!	+	+	+eJ.?-K	+	+	+
	!	!U5\\UCC(3c$&	223'	7	r)   r   )r   r%   r   c                   t                    t          d          fd}t          j        ||           S )a  Gather values of x across all replicas.

  If ``x`` is a pytree then the result is equivalent to mapping this function to
  each leaf in the tree.

  This is equivalent to, but faster than, all_to_all(broadcast(x)).

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a pmapped axis (see the
      :func:`jax.pmap` documentation for more details).
    axis_index_groups: optional list of lists containing axis indices (e.g. for
      an axis of size 4, [[0, 1], [2, 3]] would run all gather over the first
      two and last two replicas). Groups must cover all axis indices exactly
      once, and all groups must be the same size.
    axis: a positional axis into which the chunks along ``axis_name`` will be
      concatenated.
    tiled: when ``False``, the chunks will be stacked into a fresh positional
      axis at index ``axis`` in the output. When ``True``, ``axis`` has to
      refer to an existing positional dimension and the chunks will be
      concatenated into that dimension.

  Returns:
    Array(s) representing the result of an all-gather along the axis
    ``axis_name``. Shapes are the same as ``x.shape``, but:

    - when ``tiled`` is ``False``, there is a new dimension equal to the
      size of axis ``axis_name`` in position ``axis``,
    - when ``tiled`` is ``True``, the size of dimension in position ``axis``
      is multiplied by the size of axis ``axis_name``.

  For example, with 4 XLA devices available:

  >>> x = np.arange(4)
  >>> y = jax.pmap(lambda x: jax.lax.all_gather(x, 'i'), axis_name='i')(x)
  >>> print(y)
  [[0 1 2 3]
   [0 1 2 3]
   [0 1 2 3]
   [0 1 2 3]]

  An example of using axis_index_groups, groups split by even & odd device ids:

  >>> x = np.arange(16).reshape(4, 4)
  >>> print(x)
    [[ 0  1  2  3]
     [ 4  5  6  7]
     [ 8  9 10 11]
     [12 13 14 15]]
  >>> def f(x):
  ...   return jax.lax.all_gather(
  ...       x, 'i', axis_index_groups=[[0, 2], [3, 1]])
  >>> y = jax.pmap(f, axis_name='i')(x)
  >>> print(y)
  [[[ 0  1  2  3]
    [ 8  9 10 11]]
   [[12 13 14 15]
    [ 4  5  6  7]]
   [[ 0  1  2  3]
    [ 8  9 10 11]]
   [[12 13 14 15]
    [ 4  5  6  7]]]
  rH   r   c           	         t                               | t          rt          j        |           nt          j        |           dz                       S NrH   )all_gather_dimensionrB   r   r   r   )all_gather_pr>   r   r.   r[  )leafr%   r   rB   r   r   s    r&   r>   zall_gather.<locals>.bind(  s`    .5?"'$---bgdmma.?A A/@5  * * *r)   )r<   rF   r   rM   )rA   rB   r   r%   r   r>   r   s    ```` @r&   
all_gatherr    sp    @ 66GHH1i3DEEE)* * * * * * * * * 
	D!	$	$$r)   c                   t          |j                  }|rZ||          }|| xx         |z  cc<   t          j        |t          j        |d                    }t          j        ||||z  |           S |                    | |           t          j        |t          j        |d                    }t          j        ||||           S r  )	r7   r   r   fullrc  r   dynamic_update_slice_in_dimr  dynamic_update_index_in_dim)dimra  r  r   rA   r   	tile_sizer  s           r&   _expandr  1  s    
qw--%
 Cc
I	#JJJ$JJJ
(5#*Q**
+
+C.sAuy7H#NNN	LLd
(5#*Q**
+
+C.sAucBBBr)   c                    t          d          )Nz#Unexpected call to _all_gather_impl)AssertionError)rA   r  rB   r   r   r   s         r&   _all_gather_implr  =  s    <===r)   )platformc                  | j         \  }| j        \  }	| j        j        }
t	          |
t
          j        t
          j        f          }|st          |j	                  }|
                    d           fdt          t          |                    D             }t          j        t          j        |                    |                    |t          j        |                    }t'          | j        j        ||          }|rp| j                                        }t-          t          j                            |t          j                  t4          j                            d                    }ni }t          j        t          j        |	          |ft          j                  t=          |          d|j        S )NrH   c                     g | ]
}|k    |S r,   r,   r$   rp   r  s     r&   r3   z(_all_gather_lowering.<locals>.<listcomp>M  s$    ZZZ!EY@Y@YA@Y@Y@Yr)   r  Tr$  )all_gather_dimr   ) r  r  r(  rC  r!   r	   rD  rE  r7   r   r  rr   rl   r   broadcast_in_dimr   r/  r  dense_int_arrayr   r   r)  r   r*  r   r+  r   r,  AllGatherOpr  r   ru  )r  rA   r  rB   r   r   r   r  x_avalr  rC  r?  r  broadcast_dimensionsr   r7  r8  s     `              r&   _all_gather_loweringr  @  s    L'&m)(#0,%~'EF ' 
 4V\""I)1---ZZZZuS^^'<'<ZZZV]]];;<<a122	4 	4A #3#5#>	$57 7. 
  ,,..G(,,T/1 1 kood335 5 5JJ
 J	
8$$
&:;;(88
 
 		
 
 	r)   c               <   t          |t          t          f          s|f}t          |           }t          |j                  }|r||xx         |z  cc<   n|                    ||           |                    |          h t          t          j	        |          fS )Nr  )
r!   r7   r6   r   r   r  r  rw   r   r	  )rA   r  rB   r   r   r   r  r  s           r&   #_all_gather_effectful_abstract_evalr  e  s     
Ie}	-	- I1&6<  )
 6"###y0####)9555	Y	'	')P3t/CY+O+O)P	PPr)   c               ,    t          | ||||          fS )N)rB   scatter_dimensionr   r   )psum_scatter)rP  rA   r  rB   r   r   r   s          r&   _all_gather_transpose_ruler  r  s+    
si)=):"$ $ $ 
& &r)   c                   | |c\  }\  }||k    r|dz  }n|s|dz  }t                               ||||||          }	|	|fS r  )r  r>   )
r   r   r  rB   r   r   r   rA   r   r   s
             r&   _all_gather_batcherr  z  st    *$1
A FA/)   & 
r)   c
                X   |t          d          || k    s
J d            t          |t                    s|f}t          |          dk    rt          d          ||fk    s
J d            ||c\  }
\  }|t          j        u rvt          t          j        |
                    }|	                    |           fdt          t          |                    D             }t          j        |
||          }nt          ||
          }|	rt          |          }|t          j        fS )N'axis_index_groups not supported in vmapaxis size doesn't matchrH   r  #batcher called with wrong axis namec                     g | ]
}|k    |S r,   r,   r  s     r&   r3   z2_all_gather_batched_collective.<locals>.<listcomp>  s$    TTTA!?S:S:Sa:S:S:Sr)   )r   r!   r6   rl   r   r   r7   r.   r   r  rr   r   r  r   r  )
frame_sizer   r   r   r   r  rB   r   r   r   rA   r   	out_shapebroadcast_dimsys        `         r&   _all_gather_batched_collectiver    sG    "
G
H
HH	j	 	 	 ";	 	 	 	Iu	%	% I^^a
>
?
??	zm	#	#	#%J	#	#	#*$1(
RXa[[!!I)9555TTTTs9~~!6!6TTTNQ	>::AA!)1--A
 +&**A	
H	r)   r  )cudarocmr  c                  |j         \  }|j        \  }	|                    d          }
t          |j        j        ||          }t          |j                  }||xx         |z  cc<   |j        j        }t          |t          j        t          j        f          }|rp|j                                        }t          t          j                            |t$          j                  t(          j                            d                    }ni }t          j        t%          j        |                    |                    |ft%          j        |          t3          |          d|}t%          j        |
          }|j        d         j                            ||          }t)          j        |          5  t%          j        | j        d          }|                     d |
gd	z  |
g
          } ||g|j!        R  }t          j"        t%          j#        |                     d d d            n# 1 swxY w Y   |r|j$        S t          j%        t%          j        |	          |j&                  gS )Nr,   r  Tr$  )r  r   r   Fr  r'  r  )'r  r  r  r   r(  r   r7   r   rC  r!   r	   rD  rE  r)  r   r   r*  r   r   r+  r   r,  ReduceScatterOpr/  r  r   r0  r1  r   r2  r3  r>   r  r4  r5  r6  ru  rq  r   )r   r  rA   r  rB   r   r   r   r  r  r:  r   scatter_out_shaperC  r?  r7  r8  r9  r;  r<  r=  r  r>  s                          r&   _reduce_scatter_loweringr    s    L'&m)(B''+"3#5#>	#46 6.6<((%&&&94&&&#0,%~'EF '  
  ,,..G(,,T/1 1 kood335 5 5JJ
 J

6==/@=AABB &788(88	 
 	 " $[11+*Q-&--k;GG-	'' 3 3N49uEEEM++(3}q'8)4  7 7K kDM,CDDDIK&y112223 3 3 3 3 3 3 3 3 3 3 3 3 3 3  D:K,X66	BBCCs   7A-H00H47H4c                  t          |t          t          f          s|f}t          j        |           }t          |j                  }|j        |         }|r'||z  dk    rt          d| d|           ||z  ||<   n||k    rt          d| d|           ||= |                    |          h t          t          j	        |          fS )Nr   z4tiled reduce_scatter operand scatter dimension size z" must be divisible by shard_count z.reduce_scatter operand scatter dimension size z must match shard count r  )
r!   r7   r6   r   r   r   r9   r  rw   r	  )	rA   rB   r  r   r   r   r  r  scatter_dim_input_sizes	            r&   '_reduce_scatter_effectful_abstract_evalr    s    
Ie}	-	- I""&6<  )!<(9:
 %	)Q.. 202 2&/2 2 3 3 3 $:Y#FI  ** &0& &#& & ' ' ' 	#$	Y	'	')P3t/CY+O+O)P	PPr)   c               ,    t          | ||||          fS )N)rB   r   r%   r   )r  )rP  rA   rB   r  r   r   r   s          r&   _reduce_scatter_transpose_ruler
    s*    
SI'8+5: : : 
< <r)   c                   | |c\  }\  }||k    r|dz  }n|s|dz  }t                               ||||||          }	|	|fS )NrH   )r  rB   r   r   r   )reduce_scatter_pr>   )
r   r   r  rB   r   r   r   rA   r   r   s
             r&   _reduce_scatter_batcherr    st    *$1
 FA  )) !  & 
r)   c
                   |t          d          || k    s
J d            t          |t                    s|f}t          |          dk    rt          d          ||fk    s
J d            ||c\  }
\  }|t          j        u r|
|z  |}}n$t          j        |
dt          j        |f          |}}|	rt          |||          }||fS )Nr  r  rH   r  r  g        )
r   r!   r6   rl   r   r   r   reduceaddr  )r  r   r   r   r   r  rB   r   r   r   rA   r   r  dys                 r&   _reduce_scatter_collectiver    s     "
G
H
HH	j	 	 	 ";	 	 	 	Iu	%	% I^^a
>
?
??	zm	#	#	#%J	#	#	#*$1(
	M,rAAJq"cgt,,.?rA
 %2y!$$A	
B,r)   reduce_scatter)r  r   r   c                   t          d||          }t          |          }t          t          j        |||||          }t          j        ||           S )a  
  Like ``psum(x, axis_name)`` but each device retains only part of the result.

  For example, ``psum_scatter(x, axis_name, scatter_dimension=0, tiled=False)``
  computes the same value as ``psum(x, axis_name)[axis_index(axis_name)]``, but
  it is more efficient. Thus the ``psum`` result is left scattered along the
  mapped axis.

  One efficient algorithm for computing ``psum(x, axis_name)`` is to perform a
  ``psum_scatter`` followed by an ``all_gather``, essentially evaluating
  ``all_gather(psum_scatter(x, axis_name))``. So we can think of
  ``psum_scatter`` as "the first half" of a ``psum``.

  Args:
    x: array(s) with a mapped axis named ``axis_name``.
    axis_name: hashable Python object used to name a mapped axis (see the
      :func:`jax.pmap` documentation for more details).
    scatter_dimension: a positional axis into which the all-reduce result along
      ``axis_name`` will be scattered.
    axis_index_groups: optional list of lists of integers containing axis
      indices. For example, for an axis of size 4,
      ``axis_index_groups=[[0, 1], [2, 3]]`` would run reduce-scatter over the
      first two and the last two axis indices. Groups must cover all axis
      indices exactly once, and all groups must be the same size.
    tiled: boolean representing whether to use rank-preserving 'tiled' behavior.
      When ``False`` (the default value), the size of dimension in
      ``scatter_dimension`` must match the size of axis ``axis_name`` (or the
      group size if ``axis_index_groups`` is given). After scattering the
      all-reduce result along ``scatter_dimension``, the output is squeezed by
      removing ``scatter_dimension``, so the result has lower rank than the
      input. When ``True``, the size of dimension in ``scatter_dimension`` must
      be divisible by the size of axis ``axis_name`` (or the group size if
      ``axis_index_groups`` is given), and the ``scatter_dimension`` axis is
      preserved (so the result has the same rank as the input).

  Returns:
    Array(s) with the similar shape as ``x``, except the size of dimension in
    position ``scatter_dimension`` is divided by the size of axis ``axis_name``
    (when ``tiled=True``), or the dimension in position ``scatter_dimension`` is
    eliminated (when ``tiled=False``).

  For example, with 4 XLA devices available:

  >>> x = np.arange(16).reshape(4, 4)
  >>> print(x)
  [[ 0  1  2  3]
   [ 4  5  6  7]
   [ 8  9 10 11]
   [12 13 14 15]]
  >>> y = jax.pmap(lambda x: jax.lax.psum_scatter(x, 'i'), axis_name='i')(x)
  >>> print(y)
  [24 28 32 36]

  if using tiled:

  >>> y = jax.pmap(lambda x: jax.lax.psum_scatter(x, 'i', tiled=True), axis_name='i')(x)
  >>> print(y)
  [[24]
   [28]
   [32]
   [36]]

  An example of using axis_index_groups:

  >>> def f(x):
  ...   return jax.lax.psum_scatter(
  ...       x, 'i', axis_index_groups=[[0, 2], [3, 1]], tiled=True)
  >>> y = jax.pmap(f, axis_name='i')(x)
  >>> print(y)
  [[ 8 10]
   [20 22]
   [12 14]
   [16 18]]
  rH   r   )rB   r  r   r   r   )rF   r<   r   r  r>   r   rM   )rA   rB   r  r   r   r   r>   s          r&   r  r  1  sg    X 1i3DEEE)56GHH	))
 
 
$ 
	D!	$	$$r)   c           
        t          |t                    r2|s
J d            t          |          dk    rt          d          |\  }t	          |j                                      |          }|j        t          j	        |j
                  z  }t          j        t          j        |t          j	        |j
        |dz   d                    z  t          j                            }t          j        t          j        |j
        |         t          j                            }| j        j        }t          |t$          j        t$          j        f          }|rt+          j                    }	nt+          j                    }	t+          j        t+          j        |	|          |          }
t+          j        t6          j                            g t6          j                            d                    |
          S )Nzempty axis namerH   zC`axis_index` translation rule does not support multiple axis names.r       ) r!   r6   rl   r   r7   namesr  nrepsrd  re  sizesr   ir_constantr.   r   uint32r(  rC  r	   rD  rE  r   partition_id
replica_id	remainderdivideconvertr   RankedTensorTyper   IntegerTypeget_signless)r  rB   r   axis_pos	nreplicasdivmodrC  r?  	device_idunsigned_indexs              r&   _build_axis_index_lowering_hlor*    s   	5!! ''''''
9~~
OQ Q QJI(.!!''	22(n	(. 9 99)h
dix!|~~ >??
?ry  	 	#
 	(.":")LLLMM##0,%~'EF '  ! ""II  I=Is!;!;SAA.	b"."="=b"A"ABB
 
 r)   c               :    t          | || j        j                  gS r   )r*  r(  r   )r  rB   s     r&   _axis_index_loweringr,    s(    $S)%(%7%@B B
 r)   c                    t          j        |           }t          dt          j                  t          j        |           hfS )Nr,   )r   r`  r   r.   r1   r	  )rB   frames     r&   #_axis_index_effectful_abstract_evalr/    s6    
/)
$
$%	R	"	"T%9)%D%D$E	EEr)   rb   c                    d }t          | t          t          f          s ||           S d}d}t          |           D ]&}| ||          |z  z  }|t	          d|          z  }'|S )Nc                @   t          j        |           }t           j        j        j        j        }|j        |j        |j        j        k    r&t           j        	                    t          |           S |j                                        }|                    |          S )Nr   )r   r`  thread_local_statetrace_statetrace_stackdynamic
main_tracelevel	Primitiver>   r   with_cur_sublevelprocess_axis_index)r   r.  r5  traces       r&   name_idxz"_axis_index_bind.<locals>.name_idx  s    OD!!E%1=EG GME4D4J$J$J^   >>>0022e%%e,,,r)   rH   r   )r!   r6   r7   reversedrF   )rB   r<  
inner_sizer  r   s        r&   _axis_index_bindr?    s    - - - 
It}	-	- 8IJE## " "xx~~
**eDDMM!jjLr)   c                    |j         J t          j        | t          j        t
          j        |j                   d          S r  )ra  r   BatchTracerr   iotar.   r1   )selfr.  s     r&   _vmap_process_axis_indexrD    s5    						dCHRXuz$B$BA	F	FFr)   c          	        t          d |D                       sJ t          | |t          t          |                              }|j        t          |          d          }|                    d|z             }d|z   }t          j        |d          }t          t          |j	        dz
  |j	        |j	        z   dz
                      }t          j        |dd          }t          j        |||t          |                    S )	Nc              3  @   K   | ]}t          |t                    V  d S r   r    r#   s     r&   r'   z _pgather_impl.<locals>.<genexpr>  r   r)   )r   )rH   rH   r'  r   )offset_dimscollapsed_slice_dimsstart_index_map)dimension_numbersslice_sizes)r   r   rr   rl   r   rq  r   r   r6   r[  r   GatherDimensionNumbersgather)	r   rg   r5   src_axes_frontnon_axes_shapesrc_one_axis_frontrK  rG  dnumss	            r&   _pgather_implrR    s	   	44t444	4	4444CuSYY'7'788.!'D		

3.%--en.DEE~%+U###eCHqL#(5G5L*Lq*PQQRR+

(  % 
*C5$)+$6$6
8 
8 
8 8r)   c                   t          | j                  }t          d |D             d          D ]}||= |j        t          |          z   }t	          || j                  S )Nc              3  D   K   | ]}t          |t                    |V  d S r   r    )r$   as     r&   r'   z)_pgather_abstract_eval.<locals>.<genexpr>  s1      <<AAs););<a<<<<<<r)   T)reverse)r7   r   rp  r6   r   r-   )r   rg   r5   r   r%   s        r&   _pgather_abstract_evalrW    si     sy//%<<<<<dKKK  dd
)eEll
"%	UCI	&	&&r)   c                   t          d |D                       rt          d           t          j        t          d          | |||          S )Nc              3  B   K   | ]}t          |t                     V  d S r   r    r#   s     r&   r'   z-_pgather_parallel_lowering.<locals>.<genexpr>  s/      44tZc""	"444444r)   zJpgather only supported in the SPMD lowering.Please open a feature request!Fr  r   )r8   r   r   r3  rR  )r  r   rg   r5   s       r&   _pgather_parallel_loweringrZ    sm    44t44444 @
 ? @ @ @	>	>	>	>	3$
  
  
   r)   c               Z   | \  }}|\  }}|t           j        ur|t           j        urt          d          |t           j        urt                              |||          |fS |t           j        ur:t          ||d          }t                              |||          }||j        dz
  fS J )Nr  r   r   rH   )r   r   r   r   r>   r   r[  )	r   r   r5   r   rg   dsrcdidxsrc_last_batchedr   s	            r&   _pgather_batcherr_    s    (#s*$	$$$X5H)H)H >
?
??8&&&>>#s>..448&&&T2..^^,c^==F6;?""Lr)   c                 
 |\  }}|\  
}
t           j        u rt          d          |t           j        urt          d          t	          
fd|D                       }	t          d |D                       rt          |||	          t           j        fS t                              |||	          t           j        fS )Nz;pgather axis {frame.name} is missing from the indexed valuer  c              3  h   K   | ],}|k    rnt          |t                    r	||k    z   n|V  -d S r   r    )r$   r%   r\  r   s     r&   r'   z._pgather_collective_batcher.<locals>.<genexpr>  sh       % %   :--44,6tS,A,A 444<((% % % % % %r)   c              3  @   K   | ]}t          |t                    V  d S r   r    r#   s     r&   r'   z._pgather_collective_batcher.<locals>.<genexpr>  s,      004D#		000000r)   r   )	r   r   r9   r   r6   r   rR  r   r>   )r   r   r   r   r   r5   r   rg   r]  new_axesr\  s    `        @r&   _pgather_collective_batcherrd    s    (#s*$	X   
R
S
SS	$$$
>
?
?? % % % % %  $% % % % %( 	00400000 H c11183FFF>>#s>22H4GGGr)   r   )r5   r   )
r   r   r   r   r   r   r   r   r   r   )r   r   r   r   )__doc__
__future__r   collections.abcr   	functoolsr   r   rd  jaxr   jax._srcr   r   r	   jax._src.corer
   r   r   jax._src.interpretersr   r   r   r   jax._src.laxr   r   jax._src.lib.mlirr   jax._src.lib.mlir.dialectsr   jax._src.numpyr   jax._src.utilr   r   r   r   r   numpyr.   rw   
unsafe_mapr   
unsafe_ziprF   rN   rR   rW   r]   r`   r[   r:   r<   r{   r   r   r   r   rb   r   r   r   r   r   r   r   r  r  rH  rS  rf  r=   r   def_implr]  def_effectful_abstract_evalregister_loweringadd_p
deflinear2primitive_batchersaxis_primitive_batchersaxis_substitution_rulesdef_custom_bindrg  rQ   _reduce_maxmax_prV   _reduce_minmin_pr|  r  r  r  r   def_abstract_evalr  r  r  rz   r   r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  pr  r  r
  r  r  r  r  r*  r,  r/  r8  r   r?  rD  
BatchTracer:  rR  rW  rZ  r_  rd  r   r,   r)   r&   <module>r     s    # " " " " " $ $ $ $ $ $                              # # # # # # @ @ @ @ @ @ @ @ @ @ $ $ $ $ $ $ * * * * * * & & & & & & & & & & & &                               * * * * * * $ $ $ $ $ $# # # # # # # # # # # # # #    x
Cx
C
 -1 L5 L5 L5 L5 L5\ .2 0 0 0 0 0B -1 5 5 5 5 5< -1 5 5 5 5 5>> > >> > >
! ! !
N N N. . .I I I:0 0 0:C C C0 8< S S S S S> LPW\ G% G% G% G% G%R%0 %0 %0P4 4 4 4	 	 	 	  4K K K9 9 9,  C C C C2 2 2

H 
H 
H5F 5F 5Fp; ; ;& 
	F	#	#  99 : : :  " "#E F F F  
GG'CODDF F F f* + + +&-g.@&&I&I F #	'
'1S1STT 	   ('.w/H&'Q'Q V $
 E E E( 
	F	#	#  99 : : :  " "#E F F F  
GG'CODDF F F&-g.@&&I&I F #	'
'1G1GHH 	   ('.w/H&'Q'Q V $ 
	F	#	#  99 : : :  " "#E F F F  
GG'CODDF F F&-g.@&&I&I F #	'
'1G1GHH 	   ('.w/H&'Q'Q V $C C C>? ? ?
/ / /$P P P  T
++
 
  CC D D D j2 3 3 3  z#5 6 6 6*1'2Ez*R*R J '/@   ,+273Lk+Z+Z Z (E E E
7 7 7E E E "t!,//   EE F F F l6 7 7 7  |%9 : : :,3G4G,V,V L )1D   .-4W5NP[-\-\ \ *     
    
M M M  B	 	 	  8 8 8v  " "t!,//  ( ()L M M M  |%9 : : : l6 7 7 7,? L )1O   .-4W5NP[-\-\ \ * 37Qe I% I% I% I% I%V
C 
C 
C> > >
 #'" " " " "JQ Q Q& & &       . "t!,//  ( ()L M M M   & ' ' '  |%9 : : :	  % %A$ !5BBB"#% % % % % l6 7 7 7,? L )1O   .-4W5NP[-\-\ \ *.D .D .DbQ Q Q.< < <     * &4%&677   , ,+    > ? ? ?0G , -5O  !1 2  'w7CCE E E G%{33  - . 56U% U% U% U% U%p  :  F F F t~l++  |%9 : : :  ( ()L M M M-4W5NP[-\-\ \ *  &   - . . .G G G *B  &8 8 8' ' '        H H H, Dy))	 	  = ! ! ! 	  2 3 3 3  y"< = = =)9 I &.I   +*1'2KV*T*T Y ' ' 'r)   