
    VpfC                     P   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d dlmZ d dlmZ d dlZdd	lmZ dd
lmZ ej        Zej        Zej        j        e j         G d d                                  ZdefdZd Z	 d0dedededz  dedz  fdZd ZdefdZdej         dej!        dedz  dededz  dedededej"        fd Z# G d! d"ej$                  Z%d#de%j&        d$deded%e%dz  d&e%fd'Z'd(ej(        fd)Z)d*ej"        d+e*ed,f         fd-Z+d. Z,d/ Z-dS )1    N)ir)arith)builtin)llvm)nvvm)vector   )dsl)utilsc                       e Zd ZU dZej        ed<   dddej        defdZe	dd	            Z
e	d
             Zd Ze	d             ZdS )WGMMAAccumulatora  A FragmentedArray that has is synchronized with the async proxy.

  This implies that it requires no additional synchronization when passed in
  as a WGMMA accumulator. In particular, when created from a
  FragmentedArray, the necessary synchronization is inserted at construction.
  valueT)_sync_valuer   c                    |j         t          j        k    rt          d          || _        |rt          |          | _        d S d S )Nz0Only WGMMA layouts supported in WGMMAAccumulator)layoutmgpuWGMMA_LAYOUT
ValueErrorr   wgmma_fence)selfr   r   s      a/var/www/html/nettyfy-visnx/env/lib/python3.11/site-packages/jax/experimental/mosaic/gpu/wgmma.py__init__zWGMMAAccumulator.__init__2   sM    })))IJJJDJ 'v&&djjj' '    Nc                 <   |dz  s|dz  rt           t          j                                        }||}t	          j        |t          j                            |d                    } | t          j        	                    |||ft          j
                            S )N@      g        r   )r   r   F32Typegetr   constant	FloatAttrr   FragmentedArraysplatr   )clsmndtypef32zeros         r   r*   zWGMMAAccumulator.zero9   s    2v Q 
*..

C}e>%!1!1%!=!=>>D3#))$A8IJJ   r   c                      | |          S )Nr    )r%   	registerss     r   from_registerszWGMMAAccumulator.from_registersE   s    3i    r   c                     | j         fdfS )Nr,   )r   )r   s    r   tree_flattenzWGMMAAccumulator.tree_flattenI   s    J="r   c                 *    ~ | |d         d          S )Nr   Fr   r   r,   )r%   auxr   s      r   tree_unflattenzWGMMAAccumulator.tree_unflattenL   s    3eAhe,,,,r   N)__name__
__module____qualname____doc__r   r#   __annotations__boolr   classmethodr*   r.   r0   r4   r,   r   r   r   r   '   s           
	DH ' ' ' 4 'T ' ' ' ' 	 	 	 ;	 ! ! ;!   - - ;- - -r   r   xc                 F    | dz  dz	  }|dz  | k    rt          d          |S )N    z)Cannot encode value in a WGMMA descriptor)r   )r=   results     r   wgmma_encoderB   R   s3    KA&q[A
@
A
AA	-r   c                 N    t          j        | |t           j        j                  S )N)overflow_flags)r   addIntegerOverflowFlagsnone)r=   ys     r   llvm_addrI   Y   s    	!Qt'@'E	F	F	FFr   leading_byte_offsetstride_byte_offsetswizzlememory_spacec           	         t           j                            d          }t          j        |t          j        | |                    }|d}n*|dk    rd}n!|dk    rd}n|dk    rd}nt          |          t          j        t          j	        |t          d|                    t          d	|                    }t          |          d
z  t          |          dz  z  }	t          j        t          j        t          ||          t          d|                    t          |	|                    }
t          j        ||
          }
|
S )Nr   r      r	             r?   r@      >   )r   IntegerTypeget_signlessr   ptrtointr   
memref_ptrNotImplementedErrorLShrOpAndOpcrB   or_r   shli)
memref_argrJ   rK   rL   rM   i64ptr_valswizzle_encodingencoded_base_addr
desc_constdescs              r   create_descriptorrf   ]   sD    	##B''#M#u/
LIIJJ'_#~~"}}"}}
g
&
&&k
j!GS//**AaII 
 '((B.	(	)	)R	/1  
j#S))1R::66*c8J8J
 
$ 
#T	*	*$	+r   c           	          t           j                            d          }t          j        | t          j        t           j                            d|          |                    S )NrQ   r	   )r   rU   rV   r   bitcastr$   
VectorTyper    )vec_tyri32s      r   _unpack_i32rn      sN    
##B''#	fl2=,,T377;;
 
 r   returnc                 2   fdt           j                            |           r<t          fdt           j        t           j        t           j        fD                       S t           j                            |           r t           j                  S dS )Nc                 .    |                                S r5   )
isinstance)tyabtypes    r   <lambda>z(_supported_wgmma_types.<locals>.<lambda>   s    r}}V44 r   c              3   .   K   | ]} |          V  d S r5   r,   ).0rs   input_types_ares     r   	<genexpr>z)_supported_wgmma_types.<locals>.<genexpr>   s-      YYrr""YYYYYYr   F)r   r   rr   anyFloatTF32TypeBF16TypeF16Type)r(   rt   rx   s    `@r   _supported_wgmma_typesr~      s    4444/Z5!! YYYYb.>RZ-XYYYYYY	zU## ?2:&&&5r   accb_descriptora_transposeb_transpose
a_k_stride
b_k_strider'   element_typec
                    *+,-./ t          j         j        d         j                  j        }
t          |
|	          st          d|
|	f          t           j                            d          +t           j                            d          }t           j	        
                                ,|dz  rt          ||t          |	          z  z  rt          t          |	          dk    }|s|s|rt          d          t          |t          j                  x}r|j        t           j        
                                k    r>|j        t           j        
                                k    rt          d|j                   |j        t          j        k    s|j        d|dz  fk    rt          d	          ||t          d
          n||dz  rt          |t          t           j                            |
          r>|dz  }|
-,fd j        D             }t-          j        t0          |
 j                  }d}nwt           j                            |
          rE|dz  }+-d  j        D             }t          j         j        d         j                  / /fd}d}nt          d|
 d          |rdnd}|rdgdz  }|dz  }ndg}d| g|z  d t3          |          D             z   |z   dgdz  z   dgd|z   z  z   }d                    |          }t7          j                    ..fd}dd                     ||                    z   dz   } ||          D ]}|r2dd                     |t;          |                              z   dz   }n |d          \  } |d          \  }}d                     ||                    }t=          .          t;          |          k    sJ |	}dt          |	          z  }d| d| d |
 d | d | d!| d| d| d"| d#}d$| d%| d&} +fd'}! |!d          x}"x}#}$|"|#|$g}%|r6|4|% |!t?          |                     |!t?          |                    gz  }%n|r|% |!t?          |                    gz  }% j         dk    s$ j        d         dk    s j        dd          d(k    rt           j                  t           j!        "                    d)d                    -fd*|D                        d+          }&t3          |t          |	          z  |z            D ]7}'|r.|d d |'dz  |'dz   dz  f         }(d, |(j#        j        D             })nM|'dk    rDtI          |tK          j&        |t           j'        
                    ||dz	                                }|g})|'dk    rDtI          |tK          j&        |t           j'        
                    ||dz	                                }t;          |)          t;          |          k    sJ tK          j(        |&g ||)||%| |dd-.          **-fd/t3          t;          |                    D             }9 ||          S )0Nr   z.Usupported wgmma types (out_ty, element_type)=rQ   r   rS   rP   z"Only f16 WGMMA supports transposesz$Unsupported A register array dtype: z#Unsupported A register array layoutz.Unsupported WGMMA features with A in registersc           
      |    g | ]8}t          d           D ]&}t          j        |t          |                    '9S rP   positionranger   extractelementr\   rw   regposindexs      r   
<listcomp>zwgmma_m64.<locals>.<listcomp>   s^       88   	cAc5MM:::   r   )r(   shapefr@   c                 ,    g | ]}t          |          S r,   _as_i32_regrw   r   s     r   r   zwgmma_m64.<locals>.<listcomp>   s     555SC  555r   c                 t    t          j        fd| D                                           j                  S )Nc                 0    g | ]}t          |          S r,   rn   )rw   r   rk   s     r   r   z/wgmma_m64.<locals>.<lambda>.<locals>.<listcomp>   s#    -W-W-W3k&#.F.F-W-W-Wr   )nparrayreshaper   )regsr   rk   s    r   ru   zwgmma_m64.<locals>.<lambda>   s9    BH-W-W-W-WRV-W-W-W$X$X$`$`adaj$k$k r   rl   z5WGMMA instruciton only supports f32 and f16 out (got )r	   l=c                 ,    g | ]}t          |          S r,   strrw   is     r   r   zwgmma_m64.<locals>.<listcomp>   s    ---AQ---r   r'   ,c                 B    d t          j        |           D             S )Nc              3       K   | ]	}d | V  
dS )$Nr,   r   s     r   ry   z/wgmma_m64.<locals>.take_regs.<locals>.<genexpr>   s(      <<GGG<<<<<<r   )	itertoolsislice)r'   	reg_counts    r   	take_regszwgmma_m64.<locals>.take_regs   s$    <<Y-i;;<<<<r   {}z, z!wgmma.mma_async.sync.aligned.m64nk. z, p, ;z{ .reg .pred p; setp.ne.b32 p, z, 0; z }
c                 t    t          j        t          j                            |                     j        S r5   )r   
ConstantOpr   IntegerAttrr    rA   r=   rm   s    r   lczwgmma_m64.<locals>.lc   s*    ?3 2 23 : :;;BBr   )rP   r	   !llvm.struct<(c              3   6   K   | ]}t                    V  d S r5   r   )rw   _out_ty_fields     r   ry   zwgmma_m64.<locals>.<genexpr>  s+      DDaL 1 1DDDDDDr   )>c                 ,    g | ]}t          |          S r,   r   )rw   vs     r   r   zwgmma_m64.<locals>.<listcomp>  s    ???1A???r   Tasm_dialecthas_side_effectsc                 >    g | ]}t          j        |g          S r,   r   extractvalue)rw   r   
acc_structr   s     r   r   zwgmma_m64.<locals>.<listcomp>  s7       =>,
QC88  r   ))r   rj   flattyper   r~   r   rU   rV   	IndexTyper    	bytewidthrr   r   r#   
mlir_dtyper}   r|   r   r   r   r   	functoolspartial_as_fragmented_reg_ndarrayr   joinr   countlennextintndimTypeparser-   rI   r   r   r   
inline_asm)0r   ar   r   r   r   r   r'   rL   r   out_tyr`   supports_transpose	a_in_regsnum_acc_regsacc_regsto_acc_vec_regsacc_constraintnum_imm_regsa_reg_constraintsreg_constraints_listreg_constraintsr   acc_reg_vectorr   a_regs
b_desc_reguse_out_regimm_regsel_tyk_instrwgmma_instrptxr   use_outscale_ascale_bimmsacc_struct_typer   a_slicea_argsr   rm   r   r   r   rk   s0   `                                         @@@@@@r   	wgmma_m64r      s    =!)**7&		5	5 J
H/EHH
I
II
##B''#
##B''#
,



%"_ 
'Y|,,
,- 
 ..!3	 ; ; ;
9
:
::Q 4555Y |rz~~''''ALBKOO<M<M,M,MLalLLMMMx4$$$B13E(E(E<===!8GHHH "9 Z"_Z6"" X6LL   8  H
  '(B&X[XabbbONN	zV$$ X6LL55CH555H]38A;+,,FkkkkkONN
VVVVV
W
WW(/a, 	ALL >|+--|,,---. 
	 
\!"	#  HH122/o)= = = = = ))L"9"9:::S@.9\"" 	 	a 388IIc*;&<&<==>>>DFFillGF%IaLL*kYYyy..//(	iC 455	5	5	5	5
%)L)))'B! B Bg B B B B B B B B	B B!B B%/B B6>B B B  	P;OO[OOO#C C C C C !#1%'%Gg
7G	$$ #K3RRK  !!22c+&6&6#7#788DD #RRK  !!""DX]]cila''39QRR=F+B+B
SY

GMMHsxxDDDD8DDDDDHHH / 'Y|444@AA  a 	!!!a"f!a%2../g??(9(>???ff	
QOC!3!3Cq!I!IJJ
 
 sf1uu

/#r~11#zQGG
H
H l v;;#/0000001(1V1\1D1  J    BGHBVBV  HH 
	"	""r   c                   J    e Zd Z ej                    Z ej                    ZdS )WGMMALayoutN)r6   r7   r8   enumauto	ROW_MAJOR	COL_MAJORr,   r   r   r   r   %  s(        dikk)dikk)))r   r   rO   )rL   a_orderb_orderr   r   c                (  ( t          |t          j                  x}r|j        }|j        }n't          j        |j                  }	|	j        }|	j        }t          j        |j                  }
t
          j	        
                                t
          j        
                                t
          j        
                                h}||vrt          |          |
j        |vrt          |
j                  |x}|
j        k    rt          t          |          (|(z  }|
j        d d         \  }}|
j        dd          ||gk    rt          |
j                  |r|t
          j	        
                                k    r1|t
          j        
                                k    rt          |          |d         dz  s|d         |z  rt          |          |d         |z  |k    rt          |d         |z  |          |d         dz  }|t          d          nW|d         }|d         |k    rt          |d         |          |dd          d|gk    rt          |          |t          j        }|t          j        k    r|dk    rt$          t          j        }t          j        }||dz  z  }|dz  }t'          ||k    rdn|dz  |dz  |d	
          }t'          ||k    r|nddz  |dz  |d	
          }t'          ||k    ||k    ||k    rdnddz  ||k    r|nddz  ||z  |t
          j                             |          rt
          j        
                                n|          }|r
d x|d<   |d<   |rt+          |          }d x}x}}nut-          |fi |}t          j        |j                                                  \  }}(fd|D             }|d d         \  }}|dd          |(gk    rt          |          t-          |fi |}|
                                \  }}(fd|D             } | d         }!| dd          ||z  |(gk    rt          |           t
          j                            d          }"| j        j                                        }#t;          |          D ]}$t;          |          D ]}%|r!||$dz  |$dz   dz  |%|z  |%dz   |z  f         }&n4t=          |t?          tA          |$|z  |%|z  z             |"                    }&t=          |t?          tA          |%|!z            |"                    }'tC          |#|$|$dz            |&|'fi ||#|$|$dz   <   tE          t          j        |#t          j#                  d          S )NrP   r   r   r	   z8a_order can only be specified when A is in shared memoryrO   rQ   r@   rR   )rJ   rK   rL   rM   )r   r   r   r   r'   rL   r   r   r   c                     g | ]}|z  S r,   r,   rw   selement_bytewidths     r   r   zwgmma.<locals>.<listcomp>  s    ???a++???r   c                     g | ]}|z  S r,   r,   r  s     r   r   zwgmma.<locals>.<listcomp>  s    ===aA))===r   
_registers_layoutFr2   )$rr   r   r#   r   r   r   
MemRefTyper   r   r}   r    r|   r   r   r   r   r   r   rY   dictr{   r   rf   get_strides_and_offsetrU   rV   r   r-   copyr   rI   r\   rB   r   r   r   ))r   r   brL   r   r   r   a_element_typea_shapea_tyb_tysupported_typesr   kn_tilegroups_kgroups_ngroups_m	row_major	col_majortnsp_lbosboa_desc_fieldsb_desc_fieldswgmma_paramsa_m_byte_stridea_k_byte_stridea_desc_base	a_stridesr   a_byte_stridesb_desc_base	b_stridesb_byte_stridesb_k_byte_strider`   new_acc_regsmikia_mkb_kr  s)                                           @r   wgmmar+  ,  s[    Q 4555Y \NgGG=  D&NjG	qv		$Z^^%%r{'8'8"*..:J:JK/?**
^
$
$$	o--
T&
'
''$$l):::
--(('z"1"~(H	Z^)))
TZ
 
   &))))n@Q@Q.Q.Q~&&&qzB  '!*w.  wqzW((wqzW,h777qzRH
D   
 qzHqzXwqz8,,,qrr{r7m##w%g%%%'S..
#)#)2&(1# '9 4 411(qH	  - '.)';';88qH	  - Y&Y&)++!!:$	11''qQ>	G				|	,	,2#'')))
 
 
,  D?CCLm!< 	'AA6::O:o#A7777K=((??AALIq????Y???N'5bqb'9$O_abbg'8999~&&&!!55}55+,,..,)Q====9===."1%/ABBGg-w8IJJJ
^
$
$$
##B''#$))++,(OO  bHoo  	 
bBFb=("w,"q&G9K*KKLl2/"2FFGGMM
 
 [!Lo1E$F$F"L"LMMc"+
rBF{
#T3# #2># #l2Q; 
!!4+<   	
 
 
 r   r   c                 .  	
 t           j                            d          }t           j                                        | j        }t          j        | j        j        d         j	                  j
        dgk    sJ |t           j                                        k    rjfd| j        j        D             |dgt                    z  dgt                    z  z   }fdt          t                              D             }n|t           j                                        k    s"|t           j                                        k    rhd | j        j        D             |d	gt                    z  d
gt                    z  z   }fdt          t                              D             }nt!          |          d                    |          }|                    d           d                    |          dz   }t'                    
t           j                            dd                    
fdD                        d          }t-          j        |||dd          		fdt          t                              D             |t           j                                        k    r!t1          | j        | j        j
                  }n|t           j                                        k    s"|t           j                                        k    rGfdD             t3          j        t6                                        | j        j
                  }nt!          |          t;          j        || j                  S )aR  Fences the array construction from WGMMA instructions.

  This is a little workaround to force LLVM to initialize the PTX registers
  before the wgmma.fence.sync.aligned instruction. Otherwise, LLVM treats
  in-register computation as pure and can move it after the fence, which is
  explicitly disallowed by the PTX programming model.
  rQ   r   rP   c           
      |    g | ]8}t          d           D ]&}t          j        |t          |                    '9S r   r   r   s      r   r   zwgmma_fence.<locals>.<listcomp>  s^       88   	cAc5MM:::   r   z=fr   c                 @    g | ]}d | dt                    |z    S )z	mov.f32 $, $r   rw   r   r   s     r   r   zwgmma_fence.<locals>.<listcomp>  4    KKKQ0Q003t99Q;00KKKr   c                 ,    g | ]}t          |          S r,   r   r   s     r   r   zwgmma_fence.<locals>.<listcomp>  s     ===K===r   z=rrl   c                 @    g | ]}d | dt                    |z    S )z	mov.b32 $r/  r0  r1  s     r   r   zwgmma_fence.<locals>.<listcomp>  r2  r   r   zwgmma.fence.sync.alignedz;
r   c              3      K   | ]}V  d S r5   r,   )rw   r   	dtype_strs     r   ry   zwgmma_fence.<locals>.<genexpr>  s#      88a	888888r   r   Tr   c                 >    g | ]}t          j        |g          S r,   r   )rw   r   r   	reg_dtypes     r   r   zwgmma_fence.<locals>.<listcomp>  s7     
 
 
89d	:s33
 
 
r   c                 0    g | ]}t          |          S r,   r   )rw   rl   
src_vec_tys     r   r   zwgmma_fence.<locals>.<listcomp>  s#    5551K
A&&555r   r(   r  ) r   rU   rV   r   r    r   rj   r-   r   r   r   r   r   r   r}   r|   rY   r   appendr   r   r   r   r   r   r   asarrayobjectr   r   r#   r   )r   rm   r(   r   	ptx_linesr   r   	struct_tyr-   r   r6  r   r8  r   r:  s            @@@@@@r   r   r     sh    	##B''#
,



%

%}U_1!49::*		aS	 	 	 	 
bjnn   ?'  D
 I 6CII-D		0AAKKKK%D		:J:JKKKII
    ER[__->->$>$>==(<===DI 6CII-D		0AAKKKK%D		:J:JKKKII
e
$
$$HH122/-...

9%#)nn)gmm<sxx8888488888<<< ) sOd  *
 
 
 
 
=B3t99=M=M
 
 
$ bjnn*
 %/"7 II 
    ER[__->->$>$>5555555D
4v...66u7LMMII
e
$
$$		EL	I	I	IIr   r(   r   .c                    g }t          | d d d         | dd d                   D ]\  }}t          j        t          j                            d|                    }t          j        ||t          d                    }t          j        ||t          d                    }|                    |           t          j
        |t                                        |          S )NrP   r	   )rP   r   r   r;  )zipr   
mlir_undefr   rj   r    insertelement_lcr<  r   r=  r>  r   )	flat_regsr(   r   vec_regsfirstsecondvecs          r   r   r     s    (9SSqS>9QTT?;;  meV
/"-++D%88
9
9C

S%#a&&
9
9
9C

S&3q66
:
:
:COOC	HF	+	+	+	3	3E	:	::r   c                     t           j                            d          }t          j        t          j        t           j                            d|          |           t          d                    S )NrQ   rh   r   )
r   rU   rV   r   r   r   ri   rj   r    rE  )r   rm   s     r   r   r     sU    
##B''#		nR]&&tS11155s1vv
 
 r   c                     t           j                            d          }t          j        |t           j                            ||                     j        S )NrQ   )r   rU   rV   r   r   r   r    rA   r   s     r   rE  rE    s>    
##B''#	bn00a88	9	9	@@r   r5   ).dataclassesr   r   r   jaxjaxlib.mlirr   jaxlib.mlir.dialectsr   r   r   r   r   numpyr    r
   r   r   r\   r   	tree_utilregister_pytree_node_class	dataclassr   r   rB   rI   rf   rn   r;   r~   ndarrayValuer   r   Enumr   r   r+  r#   r   tupler   r   rE  r,   r   r   <module>rZ     si                 



       & & & & & & ( ( ( ( ( ( % % % % % % % % % % % % ' ' ' ' ' '                 	FN	 )&- &- &- &- &- &- &-  *)&-RC    G G G  $   4Z	
 *   D  T    R#	R# (R# 	R#
 R# d
R# R# R# R# 'R# R# R# R#j    $)    "&&0| | |	|
 | 4| | | | |~6Jt+ 6J 6J 6J 6Jr; ;sCx ; ; ; ;  A A A A Ar   