
    Vpfvu                        U d Z ddlmZmZ ddlZddlZddlZddlZddlZddl	m
Z
mZ ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlZdZeed<   dZdej        fdZ d Z!d Z"dee#z  fdZ$dddZ% ej&        d           G d d                      Z'd Z(ej)        d             Z*d  Z+dMd!Z,dNd"Z-dNd#Z. G d$ d%ej/                  Z0da1e0dz  ed&<   ej)        dNd'            Z2d( Z3dOd)ed*         dz  fd+Z4d,ej5        fd-Z6 ej&        d           G d. d/                      Z7e7Z8d0ej9        d1ej9        fd2Z: e;d          fd3ej        d4e;dz  fd5Z<d0ej9        d1ej9        fd6Z=d0ej9        d1ej9        fd7Z>d0ej9        d1ej9        fd8Z?d0ej9        d9ee         d1ej9        fd:Z@d;eAed<f         d1eAeBej9        ez           eBe         eBeC         f         fd=ZDd> ZE ej&        d           G d? d@                      ZF ej&        d           G dA dB                      ZG G dC dD          ZH G dE dF          ZIdG ZJdH ZKdOdIZLdJeAeeef         dKejM        fdLZNdS )PzUtilities for code generator.    )IteratorSequenceN)AnyLiteral)ir)arith)builtin)gpu)llvm)memref)nvgpu)nvvm)scf)vector   WARPGROUP_SIZEl         	memref_tyc                    t          |j                  dk    rt          t          j                            d          }t          |j                  }t          j                            d| d| d          }t          j	        |          }t          j
        || dg          }t          j
        || dg          }t          j
        |t          j        |t          j                            |d                    dg          }t          |j                  D ]N\  }}t          j
        |t          j        |t          j                            ||                    d|g          }Ot          t          |j                            D ]N\  }}t          j
        |t          j        |t          j                            ||                    d	|g          }Ot!          j        |g|g          S )
Nr   @   z#!llvm.struct<(ptr, ptr, i64, array< x i64>, array<	 x i64>)>            )lenshapeNotImplementedErrorr   IntegerTypeget_signlessTypeparser   UndefOpInsertValueOp
ConstantOpIntegerAttrget	enumerateget_contiguous_stridesr	   unrealized_conversion_cast)ptrr   i64rankdesc_tydesciss           a/var/www/html/nettyfy-visnx/env/lib/python3.11/site-packages/jax/experimental/mosaic/gpu/utils.pyptr_as_memrefr3   .   s   Q

##B''#	Y_		$GMMPDPPPPP ' 
g		$		D#s	+	+$		D#s	+	+$		
DOC!3!3C!;!;<<qc
 
$ 	((  dadoc2>#5#5c1#=#=>>A DD .y??@@  dadoc2>#5#5c1#=#=>>A DD 
	+YK$	@	@@    c                    | st          d          | d         j        }t          j                            d          }t          j                            d          }t          j        |t          t          |           |          |          }t          |           D ]3\  }}t          j        ||g |g|          }t          j        ||           4|S )NzEmpty arrayr   r   	!llvm.ptr)
ValueErrortyper   r   r    r!   r"   r   allocacr   r(   getelementptrstore)valueselem_tyr,   ptr_tyarr_ptrr0   velem_ptrs           r2   
pack_arrayrC   G   s    	 $
]
#
##1IN'
##B''#7==%%&K#f++s 3 3W=='  da!&'2sGDDHJq(	.r4   c                 p    g }d}| d d d         D ]}|                     |           ||z  }|d d d         S )Nr   )append)xsstrides_retstridexs       r2   r)   r)   T   sU    +&ddd8  av
aKFF	TTrT	r4   valc           	         t           j                            |          st           j                            |          r^t          | t          t
          j        f          st          t          |                     t           j	        
                    ||           }nt           j                            |          r!t           j        
                    ||           }nht           j                            |          r:t          j        |t!          | t          j        |          j                            S t%          |          t'          j        ||          S N)r   r   
isinstance	IndexTypeintnpinteger	TypeErrorr8   r&   r'   	FloatType	FloatAttr
VectorTyper   splatr:   element_typer   r   constant)rK   tyattrs      r2   r:   r:   ]   s	   ^r"" 	"bl&=&=b&A&A 	"cC,-- !d3ii   >b#&&DD	|r"" "<B$$DD	}## "<Ac2=#4#4#ABBCCC
b
!
!!	D	!	!!r4   T)uniformc                   g }g }|D ]d}d }t           j                            |j                  rd}t           j                            |j                  rXt          j        |j                  j        }d}|dk     r2t          j        t           j                            d          |          }t           j	                            |j                  rd}t           j
                            |j                  r3d}t          j        t           j	                                        |          }|t          |j                  |                    |           |                    |           f|rt          j        t"          d          nt$          j        } |            5  t)          j         | j        | dz   |           d d d            d S # 1 swxY w Y   d S )Nz%llur   z%fF	per_block
)r   rO   rN   r8   r   widthr   extuir    F32TypeF16Typeextfr'   r   rF   	functoolspartialsingle_thread
contextlibnullcontextr
   printfformat)	fmtr\   argstype_formatsnew_argsarg	ty_formatra   ctxs	            r2   debug_printrt   k   s   ,(  cI	|sx(( i	~  ** @nSX&&,ei	k".55b993??	zSX&& i	zSX&& .iJrz~~''--c)))	"""OOC 
"i7777! 
 suu ; ;Jzsz<(4/:::; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ;s    !GGG)frozenc                   T    e Zd ZU ej        ed<   eedf         ed<   ed             Z	dS )	ForResultop.resultsc                 Z    t          | j                  dk    rt          | j        d         S )Nr   r   )r   ry   r7   selfs    r2   resultzForResult.result   s)    
4<A<?r4   N)
__name__
__module____qualname__r   ForOp__annotations__tupler   propertyr}    r4   r2   rw   rw      sN         	i---c?  8  r4   rw   c                      dt          |t          t          f          s|g}dt          j                            |          \   fd}|S )NFTc                 $   t           j                                        }t          j        |t           j                            |d                    }t          j        |t           j                            |d                    }t          j        ||          }t          j        |j	                  5  |j
        }t          j                            |j                  }r|\  } | ||          }r|g}t          j                            |          \  }}	|	k    rt!          |	          t          j        |           d d d            n# 1 swxY w Y   |j        }
t'          |t          j                            |
                    S )Nr   r   )r   rO   r'   r   r%   r&   r   r   InsertionPointbodyinduction_variablejaxtree	unflatteninner_iter_argsflattenr7   YieldOpry   rw   )findexc0c1for_opr0   inner_carrys
new_carrysnew_flat_carrysnew_carry_treedeffinal_flat_carrysboundcarry_treedefflat_carrysunwraps              r2   wrapperzfori.<locals>.wrapper   s   LE		%!3!3E1!=!=	>	>B		%!3!3E1!=!=	>	>BYr5"k22F		6;	'	' # #

#aX''v7MNNl	 &%1Q%%j	 " \
+.8+;+;J+G+G(o(	m	+	+*M:::	k/"""# # # # # # # # # # # # # # # ""=2CDD  s   6BEEE)rN   listr   r   r   r   )r   carrysr   r   r   r   s   `  @@@r2   forir      sv    &	FT5M	*	* XFF"x//77+}       , 
.r4   c              #      K   t          j        t          j        |           j                  5  d V  t          j        g            d d d            d S # 1 swxY w Y   d S rM   )r   r   r   IfOp
then_blockyield_)conds    r2   whenr      s      	$233  	EEEJrNNN                 s   AAAc                  0   t           j                            d          fd}  | t          j        t          j        j                            } | t          j        t          j        j                            }t          j        j        t          j        j	        fD ]u}t          j        |t          j         | t          j        |                    |                    }t          j        | | t          j        |                              }v|S )N    c                 .    t          j        |           S rM   )r   
index_cast)rJ   i32s    r2   <lambda>zthread_idx.<locals>.<lambda>   s    U%c1-- r4   )r   r   r    r
   	thread_id	DimensionrJ   	block_dimyzr   addimuli)as_i32tidxrI   dimr   s       @r2   
thread_idxr      s    
##B''#----&	cmo..	/	/$6#-0011&mos}/ < <c:dEJvvcmC.@.@'A'A6JJKKDZs}S'9'9 : :;;FF	+r4   c           
          t           j                            d          }t          d|          }t	          j        | j        || t          ||          t          d|          t          j        j                  S )Nr          )	r   r   r    r:   r   	shfl_syncr8   ShflKindidx)rK   lane_idxr   masks       r2   _warp_bcastr      s`    
##B''#	
:s		$		hc1Xs++QtS\\4=;L
 
 r4   c                     t           j                            d          }t          j        t                      t          d|                    }| rt          |          n|S )Nr      r   r   r    r   shruir   r:   r   )syncr   warp_idxs      r2   r   r      sN    
##B''#[qCyy11( #'	4X			H4r4   c                     t           j                            d          }t          j        t                      t          d|                    }| rt          |          n|S )Nr      r   )r   r   wg_idxs      r2   warpgroup_idxr      sN    
##B''#;z||Qq#YY//& !%	0V			&0r4   c                   J    e Zd Z ej                    Z ej                    ZdS )ThreadSubsetN)r~   r   r   enumauto	WARPGROUPBLOCKr   r4   r2   r   r      s(        dikk)
$)++%%%r4   r   	_ONCE_PERc              #     K   | rt           j        nt           j        }t          t          |k    rdV  dS t	                      }| s(t          j        |t          d|j                            }t          j	        t
          j
        j        |t          d|j                            }t          j        t          j                            d                    }t          j        ||          }t%          j        |          }t          }|a	 t          j        |j                  5  dV  t%          j        g            ddd           n# 1 swxY w Y   |adS # |aw xY w)zRuns the context only from a single thread.

  Args:
    per_block: If True, only one thread per block will run the context.
      Otherwise, only one thread per warp group will run the context.
  Nr   r   r   )r   r   r   r   r   r   remuir:   r8   cmpiCmpIPredicateeqr   
elect_syncr   r   r    andir   r   r   r   r   )r_   scopewarp
first_warpelected
should_runif_op
prev_scopes           r2   rh   rh      ss      !*
E,

|/E%yE11	EEE
F	$	 .;tQq$)__--Dz%-0$!TYHH*OBN77::;;'z*g..*
(:

%*)		5+	,	,  eee	k"ooo               III
Is0   	E "E;E EE EE Ec                  t    t           j                            d          } t          j        | g dddd          S )Nr   zmov.u32  $0,%clock;=rr   Tasm_dialecthas_side_effectsr   r   r    r   
inline_asm)r   s    r2   clockr     s>    
##B''#		2$dD
 
 
 r4   kind)lowhighc                    | 9t           j                            d          }t          j        |g dddd          S t           j                            d          }t          j        |g d| d d	          d
ddd          S )Nr   zmov.u32  $0,%globaltimer;z=lr   Tr   r   zmov.u32  $0,%globaltimer_r   ;r   r   )r   r,   r   s      r2   globaltimerr     s    	\
.
%
%b
)
)C?R,!d    	##B''#		2648666
D
 
 
 r4   rZ   c                    t           j                            |           rt          j        |           j        dz  S t           j                            |           rt          j        |           j        dz  S t          |           )N   )r   r   rN   ra   rT   r   )rZ   s    r2   	bytewidthr     sp    ^r"" )>"#q((\R   '<!Q&&Br4   c                   4    e Zd ZU ej        ez  ed<   eed<   dS )DynamicSlicebaselengthN)r~   r   r   r   ValuerP   r   r   r4   r2   r   r   %  s+         
3
+++++r4   r   refreturnc                    t          j        | j                  }t          ||j                  \  }}}|                                \  }}|}t          ||          D ]A\  }	}
t          |	t                    r	||	|
z  z  }#t           j	        
                                } d t          ||          D             }d t          ||          D             }t           j                            ||          }t          j        | ||dgt          |j                  z  t           j                            ||j        ||j                            }|S )Nc                     g | ]	\  }}||
S r   r   .0r1   squeezes      r2   
<listcomp>z memref_slice.<locals>.<listcomp>:  s1       Aw'  r4   c                     g | ]	\  }}||
S r   r   r   s      r2   r   z memref_slice.<locals>.<listcomp>=  s!    PPPZQPqPPPr4   r   )result_type)r   
MemRefTyper8   parse_indicesr   get_strides_and_offsetziprN   rP   
ShapedTypeget_dynamic_stride_or_offsetStridedLayoutAttrr'   r   subviewr   rX   memory_space)r   r   ref_tybase_indicesslice_shapeis_squeezedmemref_stridesoffset
new_offsetr   rI   new_strides	new_shape
new_layout	ref_slices                  r2   memref_slicer  .  sW   =""&+8+M+M(,[!88::.&*~66  kc6#s C&L jj===??j nk::  + QP3{K#@#@PPP)#''
K@@*n	<qcC,=,=&=-##
V(*f6I   ) 
r4   r  	dim_slicec                 V   t           j                            | j                  sdS t          j        | j                  j        |         }| j        |         }t          t          ||          d d          }t          ||dd                    D ]\  \  }}\  }}||z  |k    r dS dS )NTc                     | d         S Nr   r   )rJ   s    r2   r   z,_is_contiguous_shape_slice.<locals>.<lambda>T  s
    1 r4   )keyreverser   F)r   r  rN   layoutstridesr   sortedr  )r  r  r  r   ssprev_stride_rI   s           r2   _is_contiguous_shape_slicer#  I  s     
		(	(	7	7 4 //7	B'
,y
!% c'5!!~~tDDD"+.r2abb6??  '{A~$$UU % 
r4   c           
      *   t          j        | j                  }t          |j                  }t          j        ||z                      g||z   <   t           j                            t           j	        
                    |j                            }t           j                            d          }|j        |k    s|j        |k    rHt           j                            t           j	        
                    |j        |z
  dz                       }nt          |t!          |z                       rO|                                \  }}	||z   dz
           g||z   <   t           j                            |	|          }n8t'          d|                                d          d|j        dd|          t           j                            ||j        ||j                  }
d t-                    D             }|                    fd	t-          |          D                        |                    d
 t-          |z   |j                  D                        t3          |          |
j        k    sJ t5          j        |
| |          S )Nzstrided<[1]>r   zstrides=r   z, ref_ty.shape=z, dim=z, fold_rank=c                     g | ]}|gS r   r   r   ds     r2   r   zmemref_fold.<locals>.<listcomp>t  s    
#
#
#1A3
#
#
#r4   c                     g | ]}|z   S r   r   )r   r0   r   s     r2   r   zmemref_fold.<locals>.<listcomp>u  s    222Aa222r4   c              3      K   | ]}|gV  d S rM   r   r&  s     r2   	<genexpr>zmemref_fold.<locals>.<genexpr>v  s$      @@qs@@@@@@r4   )r   r  r8   r   r   rQ   prodAffineMapAttrr'   	AffineMapget_identityr-   	Attributer"   r  r#  slicer  r  r   rX   r
  rangerF   extendr   r   collapse_shape)r   r   	fold_rankr  r  identitycontig_strided_1dr  r  r  new_tyassocs    `          r2   memref_foldr9  \  s   =""&6<  )&(gicIo8M.N&O&O%P)C#	/!"!!",";";FK"H"HII(l((88]h&-3D"D"D!%%
!!&+	"9A"=>> JJ "&%S9_*E*EFF  7799K*5cIo6I*J)KKcIo%&%))&+>>JJ
	!6002215 	! 	! 	! 	!	! 	!	! 	!  
 =$j&2E & $
#c


#
#
#%,,2222y!1!1222333,,@@E#	/6;??@@@@@@	Uv{	"	"	"	"		vsE	2	22r4   c                    t          j        | j                  }t          |j                  t          d D                       dk    rt          d          t          j        d D                                z  rt          d                   t          fdD                       dz   <   t           j
                            t           j                            |j                            }|j        |k    rUt           j
                            t           j                            |j        t!                    z   dz
                      }n|                                \  }}|         }g }	t%                    D ]}
|	                    |           ||
z  }t%          |	          |dz   <   t           j                            ||          }t           j                            |j        ||j                  }|j        k    red t/          |j                  D             }|d                             t/          |j        |j        t!                    z   dz
                       nd	 t/                    D             }|                    t          t/          t!                    z                                  |                    fd
t/          dz   |j                  D                        t!          |          |j        k    sJ t3          j        || |g |j                  S )zOUnfolds dim into two dimensions, the size of leading one given be major_factor.c              3      K   | ]}|d u V  	d S rM   r   r   r   s     r2   r*  z memref_unfold.<locals>.<genexpr>  s&      $$qd$$$$$$r4   r   zCan only infer one dimensionc                     g | ]}||S rM   r   r<  s     r2   r   z!memref_unfold.<locals>.<listcomp>  s    CCCQQ]q]]]r4   zNon-divisible unfold:c              3   6   K   | ]}|         z  n|V  d S rM   r   )r   r   r   known_factor_prodr  s     r2   r*  z memref_unfold.<locals>.<genexpr>  sG        BCQYin)))A     r4   c                     g | ]}|gS r   r   r&  s     r2   r   z!memref_unfold.<locals>.<listcomp>      ---QaS---r4   rE   c                     g | ]}|gS r   r   r&  s     r2   r   z!memref_unfold.<locals>.<listcomp>  s    %%%QaS%%%r4   c              3   D   K   | ]}|t                    z   d z
  gV  dS r   N)r   )r   r'  factorss     r2   r*  z memref_unfold.<locals>.<genexpr>  s6      MMA!c'll"Q&'MMMMMMr4   )r   r  r8   r   r   sumr7   rQ   r+  r   r,  r'   r-  r.  r-   r  r   r  reversedrF   r  rX   r
  r1  r2  r   expand_shape)r   r   rE  r  r5  r  r  r  r!  inserted_stridesr   r7  r8  r?  r  s    ``          @@r2   memref_unfoldrJ  {  s7   =""&6<  )$$G$$$$$q((
3
4
44gCC'CCCDDs^'' G
,ing
F
FF      GN    ' %)C#'M!!",";";FK"H"HII(]h!%%
!!&+G"<q"@AA JJ !7799Kc"Kg  k***Qkk!)*:!;!;KcAg%))&+>>J=$j&2E & 	FK--%,,---E	"IU6;c'll(BQ(FGGHHHH%%%**%%%E	LLeCs7||!34455666	LLMMMMsQw1L1LMMMMMM	Uv{	"	"	"	"		VS%V\	B	BBr4   c                    t          j        | j                  }||j        k    rt	          |j                  }|                    d           t           j                            t           j	        
                    |j                            }|j        |k    rEt           j                            t           j	        
                    |j        dz                       }nL|                                \  }}|                    d           t           j                            ||          }t           j                            ||j        ||j                  }d t!          |j                  D             }	|	d                             |j                   t#          j        || |	g |j                  S t'          | |d          S )zInserts a singleton dimension.r   c                     g | ]}|gS r   r   r&  s     r2   r   z$memref_unsqueeze.<locals>.<listcomp>  rA  r4   rE   rD  )r   r  r8   r-   r   r   rF   r,  r'   r-  r.  r  r  r  rX   r
  r1  r   rH  rJ  )
r   r   r  r  r5  r  r  r  r7  r8  s
             r2   memref_unsqueezerM    s{   =""&FKV\""IQ##BL$=$=fk$J$JKKH}  #''
,
#
#FK!O
4
4 jj #99;;k6'++FK@@j]6&
F4G F .-%,,---E	"IV[!!!vsE2v|DDDc9---r4   permutationc                    t          j        | j                                                  \  }fd|D             }fd|D             }t           j                            ||          }t           j                            |j        |j                  }t          j	        || t           j
                            |                    S )Nc                      g | ]
}|         S r   r   )r   pr  s     r2   r   z$memref_transpose.<locals>.<listcomp>  s    111111r4   c                 *    g | ]}j         |         S r   )r   )r   rQ  r  s     r2   r   z$memref_transpose.<locals>.<listcomp>  s    4441v|A444r4   )r   r  r8   r  r  r'   rX   r
  r   	transposer-  get_permutation)	r   rN  r  r  r  r  r7  r  r  s	          @@r2   memref_transposerU    s    =""&1133/'61111[111+4444444)#''<<*=$j&2E & 
	c2<//<<
 
 r4   r   .c                    t          | t                    s| f} t          |          t          |           z
  x}r| t          d           f|z  z  } g }g }g }t	          | |          D ]\  }}t          |t
          j        t
          j        f          r|j        }t          |t                    r@|
                    |           |
                    d           |
                    d           t          |t                    ro|j        t          d          |
                    |j        pd           |
                    |j        p||j        pdz
             |
                    d           t          |t                    rK|
                    |j                   |
                    |j                   |
                    d           lt          |t
          j                  rtt
          j                             |j                  st+          d          |
                    |           |
                    d           |
                    d           t          t)          |                    t          |          t          |          cxk    r't          |          cxk    rt          |          k    sn J |||fS )Nr   TzStrided slices not implementedr   FzExpected an index-typed index)rN   r   r   r0  r  r   	OperationOpViewr}   rP   rF   stepr   startstopr   r   r   r   rO   r8   r7   )r   r   trailing_dimsr  r  r  r   r   s           r2   r  r    s    
E5	!	! HE%jj3u::--] ,	eDkk^m++E,++u%% + +jc5#bi011 Jc#s +#	C		 +		!"BCCC#).q)))#(+e	Q?@@@	C	&	& +#(###$$$	C	"	" +\$$SX.. :8999#S		***	\		c+..	P	P	P	P#k2B2B	P	P	P	Pc%jj	P	P	P	P	P	P	{K	//r4   c                      t          j                     t          j        t          j        j        t          j        j                   d S )N)space)r
   barrierr   fence_proxy	ProxyKindasync_sharedSharedSpace
shared_ctar   r4   r2   commit_sharedre    sA    +---
n!)9)D     r4   c            
       8   e Zd ZU ej        ed<   ej        ed<   ej        ed<   eed<   eddej        dededd fd	            Zde	d          fd
Z
dej        ez  dd fdZddZddZdej        deej        ej        f         fdZd Zdeej        z  fdZd ZdS )
BarrierRefbase_addressr  phasesnum_barriersr   addressarrival_countr   c                    |dk    rt          d          t          j                            d          }t          j                            d          }t          j                            d          }t          j        t          j        	                    d|          g g           }t          j
        t          d|          |g            t          d          5  t          |          D ]<}t          j        t!          j        || g |g|          t          ||                     =	 d d d            n# 1 swxY w Y   t%          | t          d|          ||          S )	Nr   z*Only up to 32 barriers per group supportedr   !llvm.ptr<3>r   r   Tr^   )r   r   r   r    r!   r"   r   r9   r  r'   r<   r:   rh   r1  r   mbarrier_init_sharedr   r;   rg  )rk  rj  rl  r   r,   r+   ri  r0   s           r2   
initializezBarrierRef.initialize   sy   b LMMM
.
%
%b
)
)C
.
%
%b
)
)C
'--
'
'C]2=,,R55r2>>F
L1cFB'''		&	&	& 
 
\"" 
 
!!sGR!c::mS!!	
 	
 	
 	


 
 
 
 
 
 
 
 
 
 
 
 
 
 
 gqCyy&,???s   AD55D9<D9c              #   n   K   | j         dk    r| V  d S t          | j                   D ]}| |         V  d S )Nr   )rj  r1  r|   r  s     r2   __iter__zBarrierRef.__iter__  sX      Ajjjjj$+,,  &6l r4   c                    t           j                            d          }t          |t                    rt          ||          }nWt           j                            |j                  rt          j	        ||          }n|j        |k    rt          d|           t          | j        t          j        | j        |          | j        d          S )Nr   z,Expected a dynamic index or an integer, got r   )r   r   r    rN   rP   r:   rO   r8   r   index_castuir7   rg  rh  r   r  ri  )r|   r  r   s      r2   __getitem__zBarrierRef.__getitem__  s    
.
%
%b
)
)C&# P~~ff		 	 	-	- P!#v..ff			NfNNOOO
4;''		  r4   Fc                    t           j                            d          }t           j                            d          }t          d|          }|                                 }t          j        ||          }|rt          j        |||           t          j
        |||gddd          }t          j        |t          d|                    }t          j        |t          d|                    }t          j        t          j        |          j                  5  t          j        |||           t          j        g            d d d            d S # 1 swxY w Y   d S )	Nr   r   i z2mbarrier.test_wait.parity.shared.b64 $0, [$1], $2;z=b,l,rTr   r   )r   r   r    r:   get_ptrr   rb   r   mbarrier_try_wait_parity_sharedr   r   xoriintr_expectr   r   r   r   r   )	r|   parityexpect_waiti1r   ticksrk  barrier_readyshould_waits	            r2   wait_parityzBarrierRef.wait_parity'  sz   		$	$Q	'	'B
.
%
%b
)
)ChEllnnG[f%%F C
*7FEBBBO
	&<  M *]AaHH55K";!R99K		38K00;	<	<  
*7FEBBB	jnnn                 s   +EEEc                     t          j        | j        g           }|                     |          \  }}t          j        || j        g            |                     ||           d S )N)r~  )r   loadri  update_paritiesr<   r  )r|   r~  paritiesr}  new_paritiess        r2   waitzBarrierRef.wait<  sb    {4;++H//99FL
Lt{B///V55555r4   r  c                 J   t           j                            d          }t          j        t          d|          | j                  }t          j        t          j        j	        t          j
        ||          t          d|                    }|t          j        ||          fS Nr   r   r   )r   r   r    r   shlir:   r  r   r   ner   r{  )r|   r  r   bitmaskr}  s        r2   r  zBarrierRef.update_paritiesB  s}    
.
%
%b
)
)Cj1cDK00GZ
8W = =qCyy F 5:h0000r4   c                     t           j                            d          }t          j        ||                                            d S )Nr   )r   r   r    r   mbarrier_arrive_sharedry  )r|   r,   s     r2   arrivezBarrierRef.arriveJ  s8    
.
%
%b
)
)CT\\^^44444r4   bytesc                    t          |t                    r.t          |t          j                            d                    }nXt          j                             |j                  r4t          j                            d          }t          j	        ||          }t          j        |                                 |           d S )Nr   )rN   rP   r:   r   r   r    rO   r8   r   r   r   mbarrier_arrive_expect_txry  )r|   r  r   s      r2   arrive_expect_txzBarrierRef.arrive_expect_txN  s    % +r~2226677ee		 	 	,	, +N''++csE**e"4<<>>599999r4   c                     t           j                            d          }t           j                            d          }d}t          j        || j        | j        g|g|          S )Nrn  r   i   )	r   r!   r"   r   r    r   r;   rh  r  )r|   r+   r,   	DYNAMIC32s       r2   ry  zBarrierRef.get_ptrW  sX    
'--
'
'C
.
%
%b
)
)CIT	{C  r4   N)r   )F)r~   r   r   r   r   r   rP   staticmethodrp  r   rs  rv  r  r  r   r  r  r  ry  r   r4   r2   rg  rg    sx        
(
(@ @"( @# @c @R^ @ @ @ <@ .    3 <       *6 6 6 61bh 15289K3L 1 1 1 15 5 5:C"(N : : : :    r4   rg  c                       e Zd ZU eed<   ej        dz  ed<   edej        dede	e
j                 deeeef         dd f
d	            Zd
 Zd Zd Zd ZdS )CollectiveBarrierRefr_  Ncluster_maskrk  rj  dimscluster_shaper   c                    t           j                            d          }t          fd|D                       t	          |          z
  dz   }t          j        fd|D                       dk    rd }|dk    sJ nEt          d|          }|D ]2}|         dk    rt          j	        |t          |                    }3t                              | ||          }t          ||          S )Nr   c              3   (   K   | ]}|         V  d S rM   r   r   r'  r  s     r2   r*  z2CollectiveBarrierRef.initialize.<locals>.<genexpr>p  s(      77Qa(777777r4   r   c              3   (   K   | ]}|         V  d S rM   r   r  s     r2   r*  z2CollectiveBarrierRef.initialize.<locals>.<genexpr>q  s(      00aq!000000r4   r   )rl  )r   r   r    rF  r   mathr+  r:   r   oricluster_collective_maskrg  rp  r  )	rk  rj  r  r  r   rl  r  r'  r_  s	      `     r2   rp  zCollectiveBarrierRef.initializee  s    .
%
%b
)
)C 7777$77777#d))CaGMy0000400000A55laq#YYl 
 
!q   y1-CC
 
 ##G\#WWG666r4   c              #   L   K   | j         D ]}t          || j                  V  d S rM   )r_  r  r  )r|   bs     r2   rs  zCollectiveBarrierRef.__iter__  s=      \ 7 7 D$56666667 7r4   c                 B    t          | j        |         | j                  S rM   )r  r_  r  rr  s     r2   rv  z CollectiveBarrierRef.__getitem__  s    V 4d6GHHHr4   c                 .   | j         j        dk    rt          d          | j        Ct	          d          5  | j                                          ddd           n# 1 swxY w Y   dS t          j                            d          }t          j
        t                      t          t          |                    }t          j        |t          t          dz  |                    }t          j        t          j        j        t          j        | j        t          j        t          d|          |                    t          d|                    }t          j        t          j        j        t          j
        |t          t          dz  |                    t          d|                    }t          j        ||          }t+          j        t          j                            d	          || j                                         |gd
dd           dS )zArrives on a barrier in all blocks that share at least one of the coordinates along the collective dimensions.

    Note that unlike in arrive, each warpgroup arrives once.
    r   z#Can only arrive on a single barrierNFr^   r      r   z
!llvm.voidz
    {
        .reg .b32 mapped_addr;
        @$0 mapa.shared::cluster.u32 mapped_addr, $1, $2;
        @$0 mbarrier.arrive.shared::cluster.b64 _, [mapped_addr];
    }zb,r,rTrx  )r_  rj  r7   r  rh   r  r   r   r    r   r   r   r:   r   divuir   r   r  r   r  r   r   r   r!   r"   ry  )r|   r   thread_in_warpgroupsignaled_blockis_collective_blockis_signaling_threadshould_arrives          r2   r  zCollectiveBarrierRef.arrive  s   
 | A%%<=== 5)))                f
.
%
%b
)
)C+jllAnc4J4JKK[Q~3S99 N  *
4$ej1cN&K&KLL	!S		 
  *'>R+?)E)EFF	!S		 
 J24GHHMO
l##	,,..?		 	     s   AA!$A!c                 8    | j                                          d S rM   )r_  r  r{   s    r2   r  zCollectiveBarrierRef.wait  s    Lr4   )r~   r   r   rg  r   r   r   r  rP   r   r
   r   r   rp  rs  rv  r  r  r   r4   r2   r  r  `  s         47x77 S]#7 3S=)	7
 7 7 7 <767 7 7I I I& & &P    r4   r  c                      e Zd ZU eedf         ed<   eedf         ed<   eedz  df         ed<   eej        df         dz  ed<   dddddeedf         deedz  df         deej        df         dz  d	eedf         dz  d
eedf         dz  f
dZe	deedf         fd            Z
e	d             Zdej        ez  deej                 fdZdS )	Partition.source_boundstarget_boundsN	partitionbase_offsetr  
num_chunks
chunk_sizeelementsr  r  c                :   || _         || _        || _        t          | j                   t          | j                  k    rt          |d cxu r	|cxk    rn nt	          d          ||| _        nt          |          t          | j                   k    rt          g }t          ||          D ]3\  }}||z  rt	          d||          |                    ||z             4t          |          | _        t                      }	| j        D ]M}
|
d|
cxk    rt          | j                  k     s	n t          |
|	v rt          |	
                    |
           Nt          | j         | j                  D ]&\  }}
|
|| j        |
         z  rt	          d          'd S )N:Exactly one of num_chunks and chunk_size must be specifiedzNon-divisible partitionr   zNon-divisible partitioning)r  r  r  r   r7   r  r  rF   r   setadd)r|   r  r  r  r  r  r  elschunk	seen_dimsrQ  tbs               r2   __init__zPartition.__init__  s    "DDN"D
4#dn"5"555T////Z////////
F   %d	ZC 233	3	3mHj11 + +*#u; 	L4h
KK
KSE\**** //dI^  	
1....s4-......	
immAT'88 7 7A	
2 21 5556667 7r4   r   c                     | j         S rM   r  r{   s    r2   r  zPartition.num_chunks  s    r4   c                 j     t           fdt           j         j                  D                       S )Nc              3   F   K   | ]\  }}||n|j         |         z  V  d S rM   r  )r   r  rQ  r|   s      r2   r*  z/Partition.target_block_shape.<locals>.<genexpr>  sX       G GR ybD,>q,A&A G G G G G Gr4   )r   r  r  r  r{   s   `r2   target_block_shapezPartition.target_block_shape  sT     G G G G!$"4dnEEG G G G G Gr4   source_coordsc                    g }t           j                                        }t          t	          | j        | j                            D ]\  }\  }}|t          d|          }n)t          j	        t          ||          ||                   }| j
         t          j        | j
        |         |          }|                    |           |S r  )r   rO   r'   r(   r  r  r  r:   r   r   r  r   rF   )r|   r  coordsr   r0   tbsrQ  dim_bases           r2   get_basezPartition.get_base  s    FLE T%<dn!M!MNN  8C	
Q;;:aUmm]1-=>>			%:d.q18<<mmHMr4   )r~   r   r   r   rP   r   r   r   r  r   r  r  r   r  r   r4   r2   r  r    s        sCx   sCx   3:s?####RXs]#d**** 26+/+/)7 )7 )7c3h)7 sTz3'	)7
 3'$.)7 S/D()7 S/D()7 )7 )7 )7V %S/    8 G G 8GRX^ RX      r4   r  c            
           e Zd ZU eed<   dddddedej        dz  dedz  dedz  fdZe	d	efd
            Z
dej        d	ej        fdZdddddej        dz  dedz  dedz  fdZdS )Partition1Dr  Nr  r  r  r  r  c                    || _         |d cxu r	|cxk    rn nt          d          t          |fd          }||f|d<   |t          dd|fi|| _        d S t          dd|fi|| _        d S )Nr  r   )r  r  r  r  r  r   )r  r7   dictr  r  )r|   r  r  r  r  common_kwargss         r2   r  zPartition1D.__init__  s     #DT////Z////////
F   8+>>>M&1^mM" KKZMK]KKdnnn KKZMK]KKdnnnr4   r   c                 &    | j         j        d         S r  )r  r  r{   s    r2   r  zPartition1D.num_chunks  s    >'**r4   r  c                 B    | j                             |          d         S r  )r  r  )r|   r  s     r2   r  zPartition1D.get_base  s    >""=11!44r4   )r  r  r  r  c                v    t          | j        j        d         ||||                     |          nd           S )Nr   )r  r  r  )r  r  r  r  )r|   r  r  r  s       r2   refinezPartition1D.refine  sG     )!,,1,=DMM%(((4	   r4   )r~   r   r   r  r   rP   r   r   r  r   r  r  r  r   r4   r2   r  r    s+         &*##L L LL 8d?	L
 *L *L L L L* +# + + + 8+5BH 5 5 5 5 5  $##   X_ *	
 *     r4   r  c                 F   t          |          t          |           k    rt          |s| S t          |          }t          | | d          |          D ]\  }}||z  rt          d| |          g | d |          d t          | | d          |          D             |R S )NzNon-divisible tiling:c              3   &   K   | ]\  }}||z  V  d S rM   r   )r   r1   ts      r2   r*  ztile_shape.<locals>.<genexpr>5  s*      ==41aQ======r4   )r   r7   r  )r   tilingtiling_rankr1   r  s        r2   
tile_shaper  *  s    [[3u::
	 LF+%&// ? ?da1u ?.v>>>?
]{l]
==3uk\]]3V<<===
 
 
 r4   c                    d|z  dk    r|dk    sJ t           j                            d          }| }t          j        |          }|                                st          d| d          t          |          }t          |          D ]h}t          j
        |j        t          d|          |t          d|z  |          t          d|          t          j        j                  } |||          }i|S )z$Reduce a value across the warpgroup.r   r   z6Warp reduction group size should be a power of 2 (got )r   r   r   )r   r   r    rQ   log2
is_integerr7   rP   r1  r   r   r8   r:   r   bfly)valuerx   
group_sizer   r}   itersr0   other_results           r2   warp_tree_reducer  :  s    	jA		*"2"2"2"2
##B''#&
'*

%					 ]
[j[[[
\
\\
e**%<< 	& 	&a>	*c	!q&#	$ L R%%FF	-r4   c                 X   t           j                            d          }t          j        | j                  }t          |j                  dk    rt          t          |j	                  }t          |j                  }|dndt          |          z   dz   }t           j                            d|z             }t           j                            d| d| d	| d
| d	          }t          j        |g| g          }	t          j        ||	dg          }
t          j        ||	dg          }t          j        |t%          ||          t          j        j                  }t          j        |t          j        t          j        ||
          |t          j        j                            S )Nr   r    <>r6   z!llvm.struct<(z, z, i64, array<r   r   r   r   )overflow_flags)r   r   r    r  r8   r   r   r   r   rX   strr!   r"   r	   UnrealizedConversionCastOpr   extractvaluemulr:   IntegerOverflowFlagsnoneinttoptrr  ptrtoint)
memref_argr
  r,   r   elem_bytewidthr-   r^  r?   r.   r/   aligned_ptroffset_elemsoffset_bytess                r2   
memref_ptrr  Q  s   
##B''#mJO,,)Q
Y344.	Y_		$$""#L0A0A*AC*G%7==u,--&GMM v        d          ' 
	+WI
|	D	D$!&$44+"3qc22,.3  ,
 

h
-[
)
)
27  
 
 r4   r  
collectivec                    t           j                            d          }d}t          d|          }d }t          j        D ]}||k    rj| |         dk    r]t          j        |t	          j        |                    }t          j	        |t          j
        |t          ||                              }n|}|| |         z  }d}t          | |                   D ]}	|d|	|z  z  z  }t          j        t          ||          |          S r  )r   r   r    r:   r
   r   r   ru  cluster_block_idr   r   r1  r  )
r  r  r   rI   
mask_shiftcollective_stridecluster_dimdim_idxmask_unshiftedr0   s
             r2   r  r  q  s    	##B''#&Cyy*] 	) 	)kj  	{	#q	(	($S#*>{*K*KLLZ
7AfcNN;;
 

 !
mK((FF.z*++ 3 3aaA 1122NN	Anc**J	7	77r4   r  )TrM   )O__doc__collections.abcr   r   ri   dataclassesr   rf   r  typingr   r   r   jaxlib.mlirr   jaxlib.mlir.dialectsr   r	   r
   r   r   r   r   r   r   numpyrQ   r   rP   r   DYNAMICr  r3   rC   r)   floatr:   rt   	dataclassrw   r   contextmanagerr   r   r   r   r   IntEnumr   r   rh   r   r   r!   r   r   dsr   r  r0  r#  r9  rJ  rM  rU  r   r   boolr  re  rg  r  r  r  r  r  r  r   r  r   r4   r2   <module>r     sf   $ # # . . . . . . . .                       



       & & & & & & ( ( ( ( ( ( $ $ $ $ $ $ % % % % % % ' ' ' ' ' ' & & & & & & % % % % % % $ $ $ $ $ $ ' ' ' ' ' '        

A"- A A A A2
 
 
  "3; " " " " %) ; ; ; ; ;< d###       $#  @        5 5 5 51 1 1 1    4<    "&	<$ % % %    >   gm,t3     "'         d###       $#
 bh "(    8 6;U4[[ M&+dl   &3RX 3"( 3 3 3 3>&Crx &C"( &C &C &C &CR."( .BH . . . .2"( # 28    $0S/$0
43cDJ67$0 $0 $0 $0N   d###c c c c c c c $#cL d###P P P P P P P $#PfE E E E E E E EP+ + + + + + + +\     .   @8c3'858]8 8 8 8 8 8r4   