
    VpfVk                     ^   d Z ddlZddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddlm
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlZddlmZ ddlmZ ej        Zej        Z ej        d           G d d                      Z ej        d           G d d                      Z ej        d           G d d                      Z ej        d           G d d                      Zeez  ez  ez  Z e            Z e            Zej         j!         G d d                      Z"dS )zUtilities for code generator.    N)Callable)ir)arith)gpu)llvm)math)memref)nvvm)vector   )dsl)utilsT)frozenc                   >    e Zd ZU dZdZeedf         ed<   defdZ	dS )WGSplatFragLayouta>  A fragmented array where all the values are equal represented as a register per thread.

  FragmentedArrays in this layout can be are always the result of a
  splat, each thread in the warpgroup has a single copy of the value,
  while the FragmentedArray pretends it has whatever shape the user
  wants. This means we can trivially broadcast, reshape and do
  elementwise operations with all other layouts.

  Examples:

  To load a value in
  ```
  FragmentedArray.splat(memref.load(ref_1d, [1]), (10,20,2))
  ```

  A shape is always provided for sanity check reasons.

   .shapereturnc           
      ~    t          d t          | j        ddd         |ddd                   D                       S )zCheck that the shape can be broadcast.

    Only dimensions of size 1 can be broadcast. All other dimensions
    must be the same as the argument shape.
    c              3   4   K   | ]\  }}||k    p|d k    V  dS )r   Nr   ).0dim1dim2s      l/var/www/html/nettyfy-visnx/env/lib/python3.11/site-packages/jax/experimental/mosaic/gpu/fragmented_array.py	<genexpr>z5WGSplatFragLayout.can_broadcast_to.<locals>.<genexpr>E   s3      ^^ZT4tt|(tqy^^^^^^    N)allzipr   selfr   s     r   can_broadcast_toz"WGSplatFragLayout.can_broadcast_to?   sH     ^^3tz$$B$?OQVW[W[Y[W[Q\;];]^^^^^^r   N)
__name__
__module____qualname____doc__r   tupleint__annotations__boolr"   r   r   r   r   r   (   s\          & %sCx_t _ _ _ _ _ _r   r   c                       e Zd ZdZdS )WGMMAFragLayoutz*[m, n] matrix, where m % 64 == 0 == n % 8.Nr#   r$   r%   r&   r   r   r   r,   r,   H   s        2222r   r,   c                       e Zd ZdZdS )WGMMARowFragLayoutz[m] matrix, where m % 64 == 0.Nr-   r   r   r   r/   r/   M   s        &&&&r   r/   c                   j    e Zd ZU dZeedf         ed<   eed<   d Zede	j
        fd            Zd Zd	S )
WGStridedFragLayoutz6Convert the array to 1D and then shard across threads..r   vec_sizec                     t          j        | j                  | j        t          z  z  dk    rt          | t          f          d S Nr   )npprodr   r2   WARPGROUP_SIZE
ValueErrorr!   s    r   __post_init__z!WGStridedFragLayout.__post_init__Y   sA    	wtzdmn<=BBn-... CBr   	memref_tyc                 
   t           j                            |          st          |          t          j        |          }t	          j        |j                  }d|z  dk    r	d|z  dk    s
J |            t          j        |j	                  t          z  dk    rt          dt                     t          j        |j	                  t          z  } | t          |j	                  t          d|z  |                    S )N   r   z9Ref must have a number of elements that is a multiple of )r   r2   )r   
MemRefType
isinstance	TypeErrormgpu	bytewidthelement_typer5   r6   r   r7   r8   r'   min)clsr;   memref_typebwmax_vec_sizes        r   from_memref_typez$WGStridedFragLayout.from_memref_type]   s    =##I.. !i   -	**K	0	1	1Br6Q;;17a<<<<<<	w{ !!N2a77    7;,--?L3K%&&Q"Wl1K1K   r   c              #   R  K   t           j                                        }t          j        | j                  }|t          | j        z  z  dk    sJ |t          | j        z  z  }t          j	        t          j        t          j        j                  t          t          |                    }t          j        |t          | j        |j                            }t#          |          D ]=}t          j        |t          |t          z  | j        z  |j                            gV  >dS )zThe indexes to be used for vector load/store WGStridedFragLayout.

    Yields:
      The indices of the vector that correspond to the current thread.
    r   N)r   	IndexTypegetr5   r6   r   r7   r2   r   remuir   	thread_id	Dimensionxcmulityperangeaddi)r!   indexcardinalityreg_numtidxoffis          r   thread_vec_idxsz#WGStridedFragLayout.thread_vec_idxso   s       LE'$*%%K.4=89Q>>>>nt}<=G;s}S]_55q7O7OPPD
*T1T]DI66
7
7C7^^ P PZQq>1DMA49MMNNOOOOOP Pr   N)r#   r$   r%   r&   r'   r(   r)   r:   classmethodr   TyperI   r\   r   r   r   r1   r1   R   s         >>	sCx---/ / / rw    ;"P P P P Pr   r1   c                      e Zd ZU ej        ed<   eed<   dej        defdZd Ze	de
j        fd            Ze	d4d
            Zed             Zed             Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd5defdZd5defdZd5defdZd5defdZe de!e"e
j        ge
j        f         z  de"e
j        ge
j        f         fd            Z#d  Z$d!e
j%        fd"Z&d# Z'd$e
j%        fd%Z(de
j        fd&Z)d' Z*d( Z+d) Z,d* Z-de
j        fd+Z.de
j        fd,Z/de
j        fd-Z0d.e1d	z  fd/Z2e	d.e1d	z  fd0            Z3e d.e1d	z  fd1            Z4d2 Z5e	d3             Z6d	S )6FragmentedArray	registerslayout
_registers_layoutc                   || _         || _        | j        xt          d x<\    | j         j        dk    s| j         j        dd          dk    rt          d          d S  xt          d x:\    | j         j        dk    s| j         j        d         dk    rt          d          d S  xt          dx\  } t          j	        |j
        d         j                  j        \  }t          j        |          t          j        |j                  t          z  |z  k    r.t          |||j        t          f|j
        d         j                  d S  t          d x/\   |j        dk    r t          d	|j         d
|j         d          d S  	 t"          )Nr         )rg   r   zInvalid register array shaper   r   r   z,WGStridedFragLayout requires a single value z ())ra   rb   r,   ndimr   r8   r/   r1   r   
VectorTypeflatrS   r5   r6   r7   r   sizeNotImplementedError)r!   rc   rd   r   reg_sizes        r   __init__zFragmentedArray.__init__   s   DNDK
+ ?>!##t~';ABB'?6'I'I9::
: (J'I   >!##t~';B'?1'D'D9::
: (E'D  
 &%%%%%uumJOA$6$;<<B75>>RWZ%566G(RRRHeZ-=~NPZP_`aPbPghh
h SR & ?ap*JZpp^h^mpppqq
q    !!r   c                 (    d| j          d| j         dS )NzFragmentedArray(layout=z, shape=rh   )rb   r   r9   s    r   __repr__zFragmentedArray.__repr__   s!    D$+DDtzDDDr   refc                    t           j                            |j                  st	          |j                  t          j        |j                  }t          j        |dt          |j                            t          
                    |          }t           j                            |j        f|j                  fd|                                D             } | t!          j        |          |          S )Nr   c                 <    g | ]}t          j        |          S r   )r   load)r   vec_idxref_1dvec_tys     r   
<listcomp>z0FragmentedArray.load_strided.<locals>.<listcomp>   s'    YYYWFK00YYYr   rc   rd   )r   r>   r?   rS   r@   rA   memref_foldlenr   r1   rI   rj   rL   r2   rC   r\   r5   array)rE   rr   ref_tyrb   vecsrw   rx   s        @@r   load_stridedzFragmentedArray.load_strided   s    =##CH--  ch]38$$Fc1c&,&7&788F 11&99F]163FGGFYYYYY@V@V@X@XYYYD3"(4..&9999r   Nc                    |pt          |          }|xt          d x>\    t          |          dk    rt          |d         dz  rt          |d         dz  df}n> xt          d x\    t          |          dk    rt          |d         dz  s|d         dz  rt          |d         dz  |d         dz  ddf}t          j        t          j        	                    d|j
                  |          }n xt          d xm\  } ||j        k    sJ t          j        |          }|t          |z  z  f}t          j        t          j        	                    |f|j
                  |          }n. t           d x\   ||j        k    sJ d}n 	 t!          |           | t          j        ||t$          	          |
          S )Nr   r   r   @   rg   r=   rg   r2   dtyperz   )r   r/   r|   r8   r,   r   splatr   rj   rL   rS   r1   r   r5   r6   r7   rm   fullobject)rE   valuer   rb   	reg_shaper2   elemss          r   r   zFragmentedArray.splat   s   /(//F
u::??
8b= 	
1X^Q'		   ?u::??
8b= 	E!HqL 	
1X^U1X]Aq9	R]..tUZ@@%HH  211111$$$$~89;	R]..{EJGGOO	 2
 $$$$		  !&)))379e6:::   r   c                    | j         xt          d x$\    | j        j        d d         \  }}|dz  |dz  fS  xt          d x\    | j        j        d         }|dz  fS  xt
          dx\  } |S  t          d x\  }|S  d S )Nr   rg   r   r=   r   )r   )rb   r,   ra   r   r/   r1   r   )r!   	row_tiles	col_tilesr   s       r   r   zFragmentedArray.shape   s    
+?#~3BQB7	9B	A..   N(+	B     &%%%%%uu &)))))5 *))r   c                     | j         j        d         j        }| j        xxt          d x\   n xt
          d x\   n  n  t          j        |          j        S xt          d x\   n xt          d x\   n  d S  |S )Nr   r   )ra   rk   rS   rb   r,   r1   r   rj   rC   r/   r   )r!   reg_tys     r   
mlir_dtypezFragmentedArray.mlir_dtype   s    ^ #(F
+4?4244444444444}V$$11"5"3"5"5"5"5"5"5"5"5"5"5"5r   c                 f   g }|D ]E}t          |t                    sPt          |t          j                  st	          |          t                              || j        | j                  }t          |j        t                    rf|j        	                    | j                  st          d          t                              |j        j        d         | j        | j                  }nH| j        |j        k    rt          d          | j        j        |j        j        k    rt          d          |                    |           Gt          j        | j                  }t          j        | j                  D ]\  } ||gfd|D             R  |<   t          || j                  S )N)r   rb   zCan't broadcast shape.r   z$Incompatible FragmentedArray layoutsz#Incompatible FragmentedArray shapesc              3   2   K   | ]}|j                  V  d S N)ra   )r   oidxs     r   r   z-FragmentedArray._pointwise.<locals>.<genexpr>
  s*      EEQC 0EEEEEEr   rz   )r?   r`   r   Valuerm   r   r   rb   r   r"   r8   ra   rk   appendr5   
empty_likendenumerate)r!   opother
other_arrsr   new_regsregr   s          @r   
_pointwisezFragmentedArray._pointwise   s   J  ?++ K!RX&& 	'#A&&
&!!!4:dk!JJ	AH/	0	0 Bx((44 	5344
4!!!+"21"5TZPTP[!\\;!(""ABB
B>1;#444@AA
A}T^,,HN4>22 G GSbFEEEE*EEEFFFhsmmhDDDDr   c                 :   t           j                            | j                  r |                     t
          j        |          S t           j                            | j                  r |                     t
          j        |          S t          | j                  r   )
r   	FloatTyper?   r   r   r   addfIntegerTyperU   rm   r!   r   s     r   __add__zFragmentedArray.__add__  r    	|t// 1__UZ///		"	"4?	3	3 1__UZ///000r   c                     | |z   S r   r   r   s     r   __radd__zFragmentedArray.__radd__      %<r   c                 :   t           j                            | j                  r |                     t
          j        |          S t           j                            | j                  r |                     t
          j        |          S t          | j                  r   )
r   r   r?   r   r   r   mulfr   rR   rm   r   s     r   __mul__zFragmentedArray.__mul__  r   r   c                     | |z  S r   r   r   s     r   __rmul__zFragmentedArray.__rmul__   r   r   c                     t           j                            | j                  st          |                     t          j        |          S r   )r   r   r?   r   rm   r   r   subfr   s     r   __sub__zFragmentedArray.__sub__#  8    <""4?33  ??5:u---r   c                     t           j                            | j                  st          |                     d |          S )Nc                 ,    t          j        ||           S r   )r   r   sr   s     r   <lambda>z*FragmentedArray.__rsub__.<locals>.<lambda>+      
1a(8(8 r   r   r   r?   r   rm   r   r   s     r   __rsub__zFragmentedArray.__rsub__(  :    <""4?33  ??88%@@@r   c                     t           j                            | j                  st          |                     t          j        |          S r   )r   r   r?   r   rm   r   r   divfr   s     r   __truediv__zFragmentedArray.__truediv__-  r   r   c                     t           j                            | j                  st          |                     d |          S )Nc                 ,    t          j        ||           S r   )r   r   r   s     r   r   z.FragmentedArray.__rtruediv__.<locals>.<lambda>5  r   r   r   r   s     r   __rtruediv__zFragmentedArray.__rtruediv__2  r   r   c                     t           j                            | j                  st          |                     t          j        |          S r   )r   r   r?   r   rm   r   r   maximumfr   s     r   maxzFragmentedArray.max7  s8    <""4?33  ??5>5111r   Fapproxc                    t           j                            | j                  st          |rt           j                                        | j        k    rt          t          j        t           j	                            d                    fd}| 
                    |                     |                    S | 
                    t          j                  S )Ng+eG?c                 ^    t          j        |           }t          j        |gdd          S )Nzex2.approx.f32 $0, $1;=f,f)r   r   r   
inline_asm)rP   scaledf32log2es     r   fast_expz%FragmentedArray.exp.<locals>.fast_expD  s-    Au%%sVH.FOOOr   )r   r   r?   r   rm   F32TyperL   r   constant	FloatAttrr   _lift_fast_unary	mlir_mathexp)r!   r   r   r   r   s      @@r   r   zFragmentedArray.exp<  s    <""4?33   >JNNc	C		!!nS","2"238J"K"KLLeP P P P P P __T228<<===??9=)))r   c                 $   t           j                            | j                  st          |r.| j        t           j                                        k    rt          |                     |r|                     d          nt          j
                  S )Nzsin.approx.f32)r   r   r?   r   rm   r   rL   r   r   r   sinr!   r   s     r   r   zFragmentedArray.sinJ  |    <""4?33    $/RZ^^%5%555??39L.///y}  r   c                 $   t           j                            | j                  st          |r.| j        t           j                                        k    rt          |                     |r|                     d          nt          j
                  S )Nzcos.approx.f32)r   r   r?   r   rm   r   rL   r   r   r   cosr   s     r   r   zFragmentedArray.cosS  r   r   c                 $   t           j                            | j                  st          |r.| j        t           j                                        k    rt          |                     |r|                     d          nt          j
                  S )Nzrsqrt.approx.f32)r   r   r?   r   rm   r   rL   r   r   r   rsqrtr   s     r   r   zFragmentedArray.rsqrt\  s|    <""4?33    $/RZ^^%5%555??5;P0111  r   instrr   c                       fdS )Nc           	      z   t           j                                        }| j        |k    r;t	          t
                    rt          j        || gdz   d          S  |           S t           j                            | j                  rt           j	                                        }t          j
        | j                  }t          d          D ]V}t          j        | t          ||                    } |          }t          j        ||t          ||                    }W|S t!          | j                  )Nz $0, $1;r   rg   position)r   r   rL   rS   r?   strr   r   rj   rK   
mlir_undefrT   r   extractelementrQ   insertelementrm   )	rP   r   rV   resultr[   vvr
fast_instrr   s	          r   r   z4FragmentedArray._lift_fast_unary.<locals>.fast_instri  s   JNNc	
3eS!! 	qc5:+=vFF
Fq/=##AF++ 	*  ""((q 	J 	JA#A!U<<<!z!}}"'FQq%[[III&&!!&)))r   r   )r   r   s   `@r   r   z FragmentedArray._lift_fast_unarye  s*    * * * * * *" r   c                     t           j                            | j                  st	          d| j                   |                     t          j        |          S )Nz7Bitwise operations only defined for integer types, not )r   r   r?   r   r8   r   r   andir   s     r   __and__zFragmentedArray.__and__|  sZ    >$$T_55  o     
 ??5:u---r   eltc                    | j         j        d         j        }t          j                            |          r:t          j        |          j        }t          j                            ||          n||                     fd          S )Nr   c                 .    t          j        |           S r   )r   bitcast)rP   tys    r   r   z)FragmentedArray.bitcast.<locals>.<lambda>  s    U]2q%9%9 r   )	ra   rk   rS   r   rj   r?   r   rL   r   )r!   r   reg_typer   r   s       @r   r   zFragmentedArray.bitcast  sy    ~"1%*H	})) -))/i=Y,,bbb??9999:::r   c                 T   | j         t          k    rt          d          t          j        || j                  \  }}}t          |          rt          d          |d         dz  s!|d         dz  s|d         dz  s|d         dz  rt          d          |dxx         dz  cc<   |dxx         dz  cc<   |dxx         dz  cc<   |dxx         dz  cc<   | j        |d         |d         |d         z   |d         |d         |d         z   f         }t          || j                   S )	Nz"Only WGMMA layouts support slicingzOnly slicing implementedr   r   r   r=   z#Only tile aligned slicing supportedrz   )	rb   WGMMA_LAYOUTrm   r   parse_indicesr   anyra   r`   )r!   r   base_idxslice_shapeis_squeezedr   s         r   __getitem__zFragmentedArray.__getitem__  sf   {l"" DEEE).)<S$*)M)M&Hk;
; < :;;;bGq>BG A;?G q>A	G   EFFFQKKKBKKKNNNrNNNQKKKAKKKNNNqNNN~hqkKN22hqkKN22	4H hDDDDr   	new_dtypec                    | j         }||k    r| S t          j                            |          }t          j                            |          }t          j                            |          }t          j                            |          }|rP|rNt          j        |          j        t          j        |          j        k    rt          j        }nt          j        }ns|rP|rNt          j        |          j        t          j        |          j        k    rt          j	        }n.t          j
        }n!|r|rt          j        }n|r|rt          j        }t          j        | j                  }| j        xt"          d x$\    t          j                            d|          }	no xt(          d x&\  }
 t          j                            |
f|          }	n= xt*          d x\   n xt,          d x\   n  n |}	n	 t/          d| j                   t          j        | j                  D ]\  }} ||	|          ||<   t3          || j                  S )Nr   r   r   Unsupported layout rz   )r   r   r   r?   r   widthr   truncfextftrunciextsisitofpfptosir5   r   ra   rb   r,   rj   rL   r1   r/   r   rm   r   r`   )r!   r   	cur_dtype
from_floatto_floatfrom_integer
to_integerconvertnew_registers
new_reg_tyr2   r   r   s                r   astypezFragmentedArray.astype  sZ   IIk((33J|&&y11H>,,Y77L**955J h 	i	 	 	&i)@)@)F	F	F,*	 * 			"	"	(2>)+D+D+J	J	J,+	 ( gg	 
 gM$.11M
+?]&&tY77

 111111]&&{I>>

 2"5"3"5"5"5"5"5"5"5"5"5"5

!"E"E"EFFFN4>22 4 4S"7:s33mCmT[IIIIr   c           	         t           j                                        }t          | j        t
                    st          d| j                   t          d| j                  }| j	        D ]?}t          j        |t          j        | j        t          j        j        |                    }@t          j        |j                  }|j        | j        k    s|j        dgk    rt)          dd d| j         d| d          t           j                            | j                  rt          j        }nEt           j                            | j                  rt          j        }nt          | j                  t1          j        ||d	          }t          j        t7          j        t6          j        j                  t          d	|                    }t?          j         |||g           t1          j!                     t          d|          }	tE          j#                    5  t          j$        t           j%                            d| j                  ||	g          }
t          j        | j        t          j        j        |
          }t?          j         |||	g           d d d            n# 1 swxY w Y   t1          j!                     t?          j$        ||	g          S )
Nr   r   rf   zExpected shape=)rf   z, z (got rh       )&r   rK   rL   r?   rb   r1   rm   rQ   r   ra   r   r   r   	reductionCombiningKindADDr>   rS   rC   r   r8   r   r   rU   r   warp_tree_reducedivuir   rN   rO   rP   r	   storecommit_sharedrA   single_threadru   rj   )r!   scratchrV   r   r   
scratch_tyr   warp_resultwarp_id
zero_indexscratch_vecscratch_sums               r   
reduce_sumzFragmentedArray.reduce_sum  s   LEdk#677 E Cdk C CDDDq$/""F~  z


4?F,@,Dc
J
J ff w|,,J$/11Z5E!5L5LUUUUU
UUUVVV	|t// 1:bb		"	"4?	3	3 1:bb000(R88Kk#-88!B,,GGG
Lgy111	1eJ				 	7 	7K
-

D$/
2
2
, k
 $
/6/3[ k l;*666	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 
;w---s   A<J$$J(+J(c                    | j         t          k    rt          | j                   |dk    rt          t          j                                        }t          j                            d          }t          j	        | j
        j        d d d         t                    }| j
        j        d         dk    sJ t          j        |j                  D ]\  }}| j
        |d|df         }t          d| j
        j        d                   D ]}	 ||| j
        ||	|df                   } |t          j        |t#          d|                    t          j        |t#          d|                              }
|
}dD ]e}t%          j        |j        t#          d	|          |t#          ||          t#          d
|          t$          j        j                  } |||          }f||||f<   t/          |t0                    S )Nr   r  rg   r   r   r   r   )r   rg   l       rz   )rb   r   rm   r   rK   rL   r   get_signlessr5   emptyra   r   r   ndindexrT   r   r   rQ   r
   	shfl_syncrS   ShflKindbflyr`   WGMMA_ROW_LAYOUT)r!   r   axisrV   i32r   row_tilerow_subtilethread_result_vecn_tilethread_resultr   r[   other_results                 r   reducezFragmentedArray.reduce  s   {l"",,,qyyLE
.
%
%b
)
)Cx,SSqS1@@@H>#q((((!#HN!;!; / /+.1k1)DE!T^1!455 
 
&Bt~hQ.NO
 
 b

 1AaKK
H
H
H

 1AaKK
H
H
H m
 f 	* 	*!~Kj#aIIdCLLM
 
 FL))(.hx$%%h8HIIIIr   c                 &   t          | j        t                    st          | j                  | j        |k    r| S | j                            |          st          d| j         d|           t          | j        t          |                    S )NzCan't broadcast  to rz   )	r?   rb   r   rm   r   r"   r8   r`   ra   r    s     r   	broadcastzFragmentedArray.broadcast  s    dk#455 -,,,zUk;''.. CA$*AA%AABBBdn>OPU>V>VWWWWr   c                 P   | j         |k    r| S t          | j        t                    st	          | j                  t          j        |          t          j        | j                   k    rt          d| j          d|           t          | j	        t          |                    S )NzCan't reshape r4  rz   )
r   r?   rb   r   rm   r5   r6   r8   r`   ra   r    s     r   reshapezFragmentedArray.reshape!  s    zUkdk#455 -,,,	wu~~,,,,?
????@@@dn>OPU>V>VWWWWr   c                    | j         t          k    rt          | j        j        d         }t          |d          \  }}|rt          d          t          j        ||ddft                    }| j
        }t          j        | j                  D ]F\  \  }}}	t          j        t          j                            d|          |	          ||d d |d d f<   Gt#          |t$                    S )	Nr   r=   z(Number of columns must be divisible by 8rg   r   r   r   rz   )rb   r)  rm   ra   r   divmodr8   r5   r$  r   r   r   r   r   r   rj   rL   r`   r   )
r!   nnum_row_tilesnum_col_tilesremr   r   r,  r-  r   s
             r   broadcast_minorzFragmentedArray.broadcast_minor-  s    {&&&N(+M1M3
 CABBBxq!<FKKKHOE(*t~(F(F  $;.4l
-

D%
(
(#/ /hxK*++ hEEEEr   c                 2   t           j                            |j                  st	          |          | j        xt          d x\    |                     |           d S  t          d x\   | 	                    |           d S  	 t          | j                  )Nr   )r   r>   r?   rS   r8   rb   r,   _store_untiled_wgmmar1   _store_untiled_wg_stridedrm   )r!   rr   s     r   store_untiledzFragmentedArray.store_untiled<  s    =##CH-- sOO
+?!!#&&&&&      &&s+++++ !!$+...r   c                    t          j        |j                  }t          |j                  }|| j        k    rt          || j        f          t          j        |dt          |j                            }t          | j
                                        | j        j                  D ]\  }}t          j        |||           d S r4   )r   r>   rS   r'   r   r8   rA   r{   r|   r   rb   r\   ra   rk   r   r  )r!   rr   r~   	ref_shapesmem_1dr   r   s          r   rA  z)FragmentedArray._store_untiled_wg_stridedH  s    ]38$$Ffl##IDJ	4:.///sAs6<'8'899G3355t~7JKK & &Sl3%%%%& &r   c           	         | j         t          k    sJ t          j                                        | j        \  }}t          j        |j                  }|j        ||gk    rt          |j        ||f          fd}t          j
        t          j        t          j        j                   |t                              }t          j
        | |d                    }t          j        | |d                    }t          j        t          j        | |d                    t          j        | |d                              }	t          j        t          j
        | |d                     |d                    }
t'          j        | j                  }|D ]\  \  }}}}}~t          j        |	 ||dz  |dz  z                       }t-          d          D ]]}t/          j        | ||                    }t          j        |
 ||dz  |z                       }t3          j        ||||g           ^d	S )
z?Stores accumulator to a 2D memref. Not optimized at the moment.c                 j    t          j        t          j                            |                     S r   )r   
ConstantOpr   IntegerAttrrL   )rP   rV   s    r   rQ   z/FragmentedArray._store_untiled_wgmma.<locals>.cZ  s(    eR^%7%7q%A%ABBBr   r  rf      rg   r   r=   r   N)rb   r   r   rK   rL   r   r>   rS   r8   r   rM   r   rN   rO   rP   r7   r  rU   rR   r5   r   ra   rT   r   r   r	   r  )r!   rr   mr:  r~   rQ   rY   lane_idr  row_basecol_baseitr,  col_tilerow_idxcol_zeroelemrowcol_idxr   colrV   s                        @r   r@  z$FragmentedArray._store_untiled_wgmmaQ  s-   ;,&&&&LE:DAq]38$$F|1vsx!Q(((C C C C C ;s}S]_55qq7H7HIIDk$"&&Gk$"&&GzGQQqTT""EJw"$>$> H z%+gqqtt44aadd;;H		'	'B9; - -5/8Wh
Jx8b=7Q;#>!?!?@@c1XX - -'%dQQwZZ@@@j11X\G%;#<#<==UC#s,,,,-- -r   swizzlec                    | j         t          k    rt          | j        }t	          j        |          }| j        \  }}|dz  dk    sJ ||z  }|dz  ||z  d|g}t          j        |j	                  j        |k    rt          |j	        ||f          |                     | j        ||          D ]*\  }	}
}t          j         |	| j                  ||           +d S )Nr   r   )rb   r   rm   r   rA   rB   r   r   r>   rS   r8   transfer_tiledr   r  ra   )r!   rr   rW  r   rG   rK  r:  cols_per_tileexpected_shaperL   _idxss               r   store_tiledzFragmentedArray.store_tiledm  s    {l""OE			B:DAqr6Q;;;;rMM2gqM12}EN	}SX$66sx!Q(((++DJwGG 3 3Ql33t~&&T22223 3r   c           
      |   t          j        |j                  }|j        }t	          j        |          }|j        \  }}}}	|dk    s	|	||z  k    rt          ||z  ||	z  }}
|
dz  dk    sJ t          j	        ||dz  ddft          j        t           j                            d|          t          d|                    t                    }|                     |
|f||          D ]D\  }}} ||t          j        t           j                            d|          ||                     E | |t$                    S )	Nr   r   r=   rg   r   r   r   rz   )r   r>   rS   rC   rA   rB   r   r8   r5   r   r   r   rj   rL   rQ   r   rY  ru   r   )rE   rr   rW  r~   r   rG   m_tilesn_tilesm_tile_sizen_tile_sizerK  r:  ra   r\  updater]  s                   r   
load_tiledzFragmentedArray.load_tiled{  sH   ]38$$FE			B17.GWk;bKGrM::[ 'K"7qAr6Q;;;;	!q&!QR]&&tU33Qq%[[AA  I
 --q!feWEE P P64fYBM$5$5dE$B$BCNNOOOO3)\::::r   c              #     !K   t          j        |          }| \  }}||z  }||z  dk    rt          |dvrt          d          t          j        j        }t          j        t          j        t          j	        j
                   |t                              }t          j        | |d                    }	t          j        | |d                    }
t          j        |	 |d                    }|dk    r|xdk    rL t          j        t          j        j        t          j        | |d                     |d                    !nxdk    rh t          j        t          j        j        t          j        t          j        | |d                     |d                     |d                    !ndk    rt          	 t!          |          |dz  }nSt"          j                            d	          }t          j        |t"          j                            d
                    !d}t          j        |t          j        |
 |d                              }t          j        t          j        |	 |d                     |d                    }t          j        t          j        | |d|z                       |d|z                      }t3          |dz            D ]}t3          ||z            D ]}t3          d          D ]}t          j        | ||dz                      }t3          |dz            D ]}|}||z  }t          j        ! ||dz             ||dz                      }t          j        ||          }t          j        ||          }|||dz  z  z   }|||dz  z  z   }|||df}|||df} ||           ||          ||f}||f!fd	}||f!fd	} || |fV  d S )Nr   >   r  r      zOnly swizzled stores supportedr  rf   rg   rg  r   r   TrJ  r=   c                 P    | |         }| |         }t          j        ||          S r   r   select)regsleft_idx	right_idx
value_leftvalue_rightis_stagger_lefts        r   get_registerz4FragmentedArray.transfer_tiled.<locals>.get_register  s)    >j Ok\/:{KKKr   c                     t          j        || |                   | |<   t          j        | |         |          | |<   d S r   ri  )rk  newrl  rm  rp  s       r   update_registersz8FragmentedArray.transfer_tiled.<locals>.update_registers  s=    $|OS$x.QQd8n %_d9os S Sd9ooor   )rA   rB   rm   r   rH  create_indexrM   r   rN   rO   rP   r7   r  cmpiCmpIPredicateeqAssertionErrorr   r   r#  r   BoolAttrrL   rU   rR   rT   rj  xori)"r   r   rW  rG   rK  r:  rZ  rQ   rY   rL  r  sub_row_basestagger_amounti1rM  rN  col_swizzle_bits	row_group	col_group
row_subidxrT  
col_subidxcol_subidx_leftcol_subidx_rightcol_offrV  reg_idx_leftreg_idx_rightrl  rm  r   rq  rt  rp  s"                                    @r   rY  zFragmentedArray.transfer_tiled  s_      
		BDAqrMM=Am## @AAA%A;s}S]_55qq7H7HIIDk$"&&Gk$"&&G;w!--L	Avv SSSS!J!$ek,!&E&Eqqtt // RRRR!J!$k%+lAAaDD9911Q44@@add //
 RR $
#w''
'"}nn >&&q))br2;??4+@+@AAonz,
7AAbEE(B(BCCHz%+gqqtt44aadd;;HzL!!C7N"3"344aabkk  17^^ 6 6	Q-/00 6 6)(( 	6 	6J
8QQzA~%6%677#!-1"455 6 6j(O)N:l?Q#6!7!7;Ka;O9P9P G *Xw//C*S"233C*Y-1:L-MML,yMQ<N/OOM ,
A=H!=*a?I!I,,)c36C,4	 L L L L L L 6> T T T T T T  0#55555)6	666 6r   c                 Z    t          | j        j                  | j        | j        j        ffS r   )listra   rk   rb   r   r9   s    r   tree_flattenzFragmentedArray.tree_flatten  s&    #$$t{DN4H&IIIr   c                     |\  }}t          j        |t                                        |          } | ||          S )Nr   rz   )r5   asarrayr   r7  )rE   auxflat_registersrb   r   ra   s         r   tree_unflattenzFragmentedArray.tree_unflatten  sC    FI
>888@@KKI3)V4444r   r   )F)7r#   r$   r%   r5   ndarrayr)   FragmentedLayoutro   rq   r]   r   r   r   r   propertyr   r   r   r   r   r   r   r   r   r   r   r   r*   r   r   r   r   staticmethodr   r   r   r   r^   r   r   r  r   r2  r5  r7  r>  rB  rA  r@  r(   r^  re  rY  r  r  r   r   r   r`   r`      s        Z
"BJ "9I " " " ">  
 	:RX 	: 	: 	: ;	:    ;@   8   8E E E41 1 1  1 1 1  . . .
A A A
. . .
A A A
2 2 2
* * * * * *           $     8RXJ011
BH$%   <,. . .; ; ; ; ;E E E0"Jbg "J "J "J "JH%.28 %. %. %. %.N!J !J !JF
X 
X 
X
X 
X 
XF F F
/rx 
/ 
/ 
/ 
/&28 & & & &-bh - - - -83cDj 3 3 3 3 ;C$J ; ; ; ;;$ L6C$J L6 L6 L6 <L6\J J J 5 5 ;5 5 5r   r`   )#r&   dataclassestypingr   jaxjaxlib.mlirr   jaxlib.mlir.dialectsr   r   r   r   r   r	   r
   r   numpyr5    r   rA   r   r7   rQ   	dataclassr   r,   r/   r1   r  r   r)  	tree_utilregister_pytree_node_classr`   r   r   r   <module>r     s   $ #           



       & & & & & & $ $ $ $ $ $ % % % % % % 2 2 2 2 2 2 ' ' ' ' ' ' % % % % % % ' ' ' ' ' '                 %	G d###_ _ _ _ _ _ _ $#_> d###3 3 3 3 3 3 3 $#3 d###' ' ' ' ' ' ' $#' d###)P )P )P )P )P )P )P $#)PX %'::_LOaa    %%''  )]	5 ]	5 ]	5 ]	5 ]	5 ]	5 ]	5 *)]	5 ]	5 ]	5r   