
    VpfW              	      $   d Z ddlmZ ddlmZ ddlZddlZddlZddlZddl	Z	ddl
mZmZ ddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlZ ej!        j"        Z"ej!        j#        Z#ej$        j%        Z%ej&        Z'ej$        Z$ee'ej(        f         Z)e*ej(        df         Z+eej(        e,f         Z-eeej.                 ef         Z/eee'         ef         Z0dZ1d Z2dUdZ3dVdZ4d Z5dWdZ6dXdZ7dYd%Z8d& Z9d' Z:d( Z; G d) d*ej<                  Z=ej>         ej?        d+,           G d- d.                                  Z@ ejA        ejB        jC        d/ 0          ZD G d1 d2          ZE eFd3 d4 d5 d6 d7 d8 d9 :          ZG eFd; d< d= d> d? d@ dA :          ZHdZdCZI eIeG          ZG eIeH          ZHdZdDZJdddEdFdGZK G dH dI          ZL eL            ZM eL            ZNd[dQZOdddEdddRd\dSZPdddEdFdTZQdS )]z>Module for emitting custom TPU pipelines within a Pallas call.    )annotations)SequenceN)UnionAny)lax)	tree_util)util)core)
primitives)pallas.)      c                R   t                      t          j        |          }g fd}	 t          j        || |d            n%# t          $ r t	          d|  d| d          dw xY wfdD             t                    |j        k    sJ t          j        |          S )	z/Broadcast a prefix pytree to a given full tree.c                h                         | gt          j        |          j        z             d S N)extendr   tree_structure
num_leaves)ixbroadcast_leavess     _/var/www/html/nettyfy-visnx/env/lib/python3.11/site-packages/jax/_src/pallas/mosaic/pipeline.py
add_leavesz(_broadcast_pytree_to.<locals>.add_leaves<   s=    	
i&q))446 6 6 6 6    c                
    | d u S r    r   s    r   <lambda>z&_broadcast_pytree_to.<locals>.<lambda>A   s
    d r   is_leafzCannot broadcast tree z to full tree structure .Nc                     g | ]
}|u rd n|S r   r   ).0aproxys     r   
<listcomp>z(_broadcast_pytree_to.<locals>.<listcomp>E   s%    JJJAa5jjddaJJJr   )objectr   r   tree_map
ValueErrorlenr   tree_unflatten)from_pytree	to_pytreetreedefr   r   r%   s       @@r   _broadcast_pytree_tor/   7   s   
((%$Y//'6 6 6 6 6Ez;	224 4 4 4 4	 E E E
 :k : :/6: : : ; ;@DEE KJJJ9IJJJ			'"4	4	4	4	4		!'+;	<	<<s   A "A)returnintc                     t          j                    d         j        } |                     d          r| d t	          d                    } | d d         dk    s
J |             t          | d                   S )Nr   z lite   zTPU v)jaxdevicesdevice_kindendswithr*   r1   )kinds    r   _get_tpu_generationr9   J   sp    	q		%$	]]7  #g,,D	bqbW			d				T!Wr   shapetuple[int, ...]dtypenp.dtypec                   t          |           dk     rt          d|           | d d         | dd          }}|\  }}d|j        z  }t          d         }dt	          t                      dk               z   |z  }|t          ||          k     r|dz  }|t          ||          k     g dt          |          z  |t          d         R S )N   z-Shape must have at least 2 dimensions: shape=   r      rB   )r*   r)   itemsize_TILINGr1   r9   min)	r:   r<   leading_dims
final_dimssecond_minor_packing
max_tilingsecond_minor_tilings	            r   _make_tilingrN   Q   s     	ZZ!^^
GuGG
H
HH"3B3Zrss
, /,'qz*S!4!6!6!:;;;wFc,
;;;;1 	c,
;;;;	E4#l###	E%8	E'!*	E	EEr   c                2    t          j        | |z   |          S )z>"Calculates a mod n for positive and negative a with |a| <= n.)r   rem)r$   ns     r   _modrR   c   s    	Q		r   smultiplec                .    | |z  dk    r| S | | |z  z
  |z   S Nr   r   )rS   rT   s     r   _round_up_to_nearest_multiplerW   h   s*    \QH	
Q\	H	$$r   idxjax.Array | intsizepl.Slicec                n    t          j        | |z  |          }t          |t           j                  sJ |S )z(Make a DMA slice with mosaic size hints.)plds
isinstanceSlice)rX   rZ   outs      r   _make_dsrb   o   s6     	cDj$#	C	"	""""	*r   block_index	jax.Array
block_sizetilingpl.Slice | slicec                T   ||z  dk    rt          | |          S ||z  dk    rt          d|d|          t          j        ||          }| |dz
  k    }t	          j        |t          ||z  |          |          }t          j        ||          }t          j        | |z  |          S )Nr   z*Block size must divide tiling: block_size=z	, tiling=rB   )	rb   r)   r]   cdivjnpwhererW   multiple_ofr^   )rc   re   rZ   rf   
num_blocksis_lastrounded_sizes          r   _make_block_slicerp   x   s     
J!K,,,&A
OzOOfOO
P
PPwtZ((*:>)'#D:$5v>> ,
 f55,	{Z'	6	66r   c                t    t           j                            d | |          }t          j        d |d          S )z+Dynamic index-tuple comparison calculation.c                    | |k    S r   r   r   ys     r   r   z _tuples_differ.<locals>.<lambda>   s
    !q& r   c                    | |z  S r   r   rs   s     r   r   z _tuples_differ.<locals>.<lambda>   s
    q1u r   F)r4   treemap	functoolsreduce)xsysdifferencess      r   _tuples_differr}      s6    00"b99+		,,k5	A	AAr   c                X    t          j        dt           j                  }| D ]}||z  }|S )zDynamic grid size calculation.rB   )rj   arrayint32)gridrZ   dims      r   
_grid_sizer      s4    	1ci	 	 $  cCKDD	+r   c           	     P    |dz   }t          t          j        |ddd         t          j                            ddd         }t           fdt          |dd         |dd                   D                       }t          d t          ||d	          D                       S )
z&Get indices for a given step and grid.rC   N)funcc              3  n   K   | ]/\  }}t          j        t          j        |          |          V  0d S r   )r   divrP   )r#   r$   bsteps      r   	<genexpr>z_get_indices.<locals>.<genexpr>   sS        
!Q 
gcgdA""     r   rB   c              3  &   K   | ]\  }}||z   V  d S r   r   )r#   r$   r   s      r   r   z_get_indices.<locals>.<genexpr>   s*      DDAq1uDDDDDDr   T)strict)tuple	itertools
accumulateoperatormulzip)r   r   offsetsextended_gridstridesindicess   `     r   _get_indicesr      s    +-=2.X\BBBD DDHDbDJ'    gcrclGABBK00    ' 
DDWgd!C!C!CDDD	D	DDr   c                      e Zd ZdZdZdZdZdS )
BufferTypez5Buffer type for the arguments to an emitted pipeline.rB   r?      N)__name__
__module____qualname____doc__INPUTOUTPUTACCUMULATORr   r   r   r   r      s#        ==
%&+++r   r   T)frozenc                     e Zd ZU dZded<   ded<   ded<   ded	<   ded
<   ded<   ded<   ded<   ded<   d Zed             Zed,d            Zed             Z	ed             Z
ed             Zed             Zed             Zed             Zed             Zed             Zed             Zed             Zd Zd  Zd! Zd" Zd# Zd$ Zd% Zd& Zd' Zd-d)Zd* Zd+S ).BufferedRefa  A helper class to automate VMEM double buffering in pallas pipelines.

  Attributes:
    spec: pallas blockspec.
    dtype: dtype for buffers.
    buffer_type: enum indicating whether this is an input, output, or in/out
      accumulator buffered reference.
    vmem_ref: a double-buffer to hold a working buffer and a dirty buffer used
      to copy into and out of.  In the case of a BufferedRef targeting a VMEM
      reference, this simply points to the existing ref.
    accum_ref: accumulating buffer used by accumulator BufferedRefs.
    current_slot: current slot index to the working buffer.
    next_slot: slot that will point to the working buffer in the next iteration.
    sem_recv: semaphore for input DMAs.
    sem_send: semaphore for output DMAs.

    block_shape: passthrough property for the BlockSpec's block_shape.
    compute_index: passthrough property for the BlockSpec's compute_index.
    memory_space: passthrough property for the BlockSpec's memory_space.
    current_ref: points to the current working slice of the double-buffer.
    is_input: whether this BufferedRef acts as a pipeline input.
    is_output: whether this BufferedRef acts as a pipeline output.
    is_accumulator: whether this BufferedRef is an accumulator.
  zpl.BlockSpecspecr   r<   r   buffer_typez
REF | Nonevmem_ref	accum_refzArrayRef | Nonecurrent_slot	next_slotzSemaphoreType | Nonesem_recvsem_sendc                v    | j         | j        | j        | j        | j        | j        f| j        | j        | j        ffS r   )	r   r   r   r   r   r   r   r<   r   selfs    r   tree_flattenzBufferedRef.tree_flatten   s=    ]DND,=^T]DM;Y
D$457 7r   c                     | g ||R  S r   r   )clsmetadatas      r   r+   zBufferedRef.tree_unflatten   s    3tr   r0   c                   t          d |j        D                       }|j        t          k    r2 | |||d|t          j        u rt          ||          nddddd	  	        S  | |||t          d|z   |          |t          j        u rt          ||          ndt          dt          j                  t          dt          j                  |t          j	        u rdnt          j        |t          j        u rdnt          j        	  	        S )a  Create a BufferedRef.

    Args:
      spec: pallas blockspec.
      dtype: dtype for buffers.
      buffer_type: enum indicating whether this is an input, output, or in/out
        accumulator buffered reference.

    Returns:
      Initialized BufferedRef
    c                    g | ]}|dn|	S NrB   r   r#   r   s     r   r&   z&BufferedRef.create.<locals>.<listcomp>        III1aiQIIIr   N)	r   r<   r   r   r   r   r   r   r   )r?   rC   )r   block_shapememory_spaceVMEMr   r   SMEMrj   r   r   SemaphoreTypeDMAr   )r   r   r<   r   r   s        r   createzBufferedRef.create   s-    II8HIIIJJKD   S5!#z'=== +u---CGtdTK K K K S5!{*E22#z'=== +u---CGD#),,sy))':+<<<DD&*':+;;;DD&*. . . .r   c                D    |                      ||t          j                  S r   )r   r   r   r   r   r<   s      r   inputzBufferedRef.input  s    ::dE:#3444r   c                D    |                      ||t          j                  S r   )r   r   r   r   s      r   outputzBufferedRef.output	  s    ::dE:#4555r   c                D    |                      ||t          j                  S r   )r   r   r   r   s      r   accumulatorzBufferedRef.accumulator  s    ::dE:#9:::r   c                    | j         j        S r   )r   r   r   s    r   r   zBufferedRef.block_shape  s    9  r   c                      fdS )Nc                 2    t          j        j        g| R  S r   )pallas_corecompute_indexr   )argsr   s    r   r   z+BufferedRef.compute_index.<locals>.<lambda>  s    249DtDDD r   r   r   s   `r   r   zBufferedRef.compute_index  s    DDDDDr   c                    | j         j        S r   )r   r   r   s    r   r   zBufferedRef.memory_space  s    9!!r   c                    t          d | j        D                       }| j        t          k    r| j        j        |         S | j        j        | j        d         g|R          S )Nc                4    g | ]}|dnt          d           S rV   )slicer   s     r   r&   z+BufferedRef.current_ref.<locals>.<listcomp>   s&    CCCQaiU4[[CCCr   r   )r   r   r   r   r   atr   )r   buffer_slices     r   current_refzBufferedRef.current_ref  sh    CC$2BCCCE ELD  ]l++]t03ClCCDDr   c                @    | j         t          j        t          j        fv S r   )r   r   r   r   r   s    r   is_inputzBufferedRef.is_input&  s    
 0*2HIIIr   c                @    | j         t          j        t          j        fv S r   )r   r   r   r   r   s    r   	is_outputzBufferedRef.is_output*  s    
 1:3IJJJr   c                ,    | j         t          j        k    S r   )r   r   r   r   s    r   is_accumulatorzBufferedRef.is_accumulator.  s    z555r   c                    | j         t          k    r4t          j        | |j        |                     |                             S | S )zDFor handling VMEM references, the pipeline aliases the existing ref.)r   )r   r   dataclassesreplacer   compute_slice)r   r   r   s      r   bind_existing_refzBufferedRef.bind_existing_ref2  sM    D   
T%7%7%@%@AC C C CKr   c                    t          d | j        D                       } | j        | }t          j                            t          ||          S )z$Compute DMA slice from grid indices.c                    g | ]}|dn|	S r   r   r   s     r   r&   z-BufferedRef.compute_slice.<locals>.<listcomp>;  r   r   )r   r   r   r4   rv   rw   rb   )r   grid_indicesr   r   s       r   r   zBufferedRef.compute_slice9  sH    II8HIIIJJK d ,/G8<<';777r   c                R    | j         t          k    rdS d| j        d<   d| j        d<   dS )zInitialize slot indices.Nr   )r   r   r   r   r   s    r   
init_slotszBufferedRef.init_slots?  s3    D  &&DaDN1r   c                T    | j         t          k    rdS | j        d         | j        d<   dS )zSwitch to the next slot.Nr   )r   r   r   r   r   s    r   
swap_slotszBufferedRef.swap_slotsE  s.    D  &&>!,Dar   c                   t          |          dk     rt          d          t          ||          }t          d | j        D                       } | j        | }t          j                            t          ||||          S )Nr?   zMust use >1D values.c              3  "   K   | ]
}|dn|V  d S r   r   )r#   r   s     r   r   z,BufferedRef.get_dma_slice.<locals>.<genexpr>}  s*      HH!QYAHHHHHHr   )
r*   NotImplementedErrorrN   r   r   r   r4   rv   rw   rp   )r   	src_shape	src_dtyper   rf   r   block_indicess          r   get_dma_slicezBufferedRef.get_dma_sliceJ  s    ^ 9~~ 6777)Y//FHHt7GHHHHHK&D&5M8<<=+y&  r   c                   | j         sJ | j        t          k    rdS t          j        | j        d         dz   d          }|| j        d<   |                     |j        |j	        |          }t          d |D                       }t          j        |j        |         | j        j        |         j        |         | j                                                   dS )z3Starts copy of HBM dma slice into the current slot.Nr   rB   r?   c              3  J   K   | ]}t          j        d |j                  V  dS r   Nr]   r^   rZ   r#   rS   s     r   r   z&BufferedRef.copy_in.<locals>.<genexpr>  0      ::1beAqv&&::::::r   )r   r   r   r   rP   r   r   r   r:   r<   r   tpu_primitivesmake_async_copyr   r   r   start)r   src_refr   r   	src_slice	dst_slices         r   copy_inzBufferedRef.copy_in  s    =D  &&)!,q0!44I!DN1""7='-NNI::	:::::I"
9#&y1  uwwwwwr   c                   | j         sJ | j        t          k    rdS | j        d         }t	          j        |dz   d          | j        d<   |                     |j        |j	        |          }t          d |D                       }t          j        | j        j        |         j        |         |j        |         | j                                                   dS )z3Starts copy of HBM dma slice from the current slot.Nr   rB   r?   c              3  J   K   | ]}t          j        d |j                  V  dS r   r   r   s     r   r   z'BufferedRef.copy_out.<locals>.<genexpr>  r   r   )r   r   r   r   r   rP   r   r   r:   r<   r   r   r   r   r   r   r   )r   dst_refr   slotr   r   s         r   copy_outzBufferedRef.copy_out  s    >D  &&QDq!,,DN1""7='-NNI::	:::::I"!),
9  uwwwwwr   c                l   | j         sJ | j        t          k    rdS |                     |j        |j        |          }t          d |D                       }t          j        |j	        |         | j
        j	        | j        d                  j	        |         | j                                                   dS )zWaits for input copy to finish.Nc              3  J   K   | ]}t          j        d |j                  V  dS r   r   r   s     r   r   z&BufferedRef.wait_in.<locals>.<genexpr>  r   r   r   )r   r   r   r   r:   r<   r   r   r   r   r   r   r   wait)r   r   r   r   r   s        r   wait_inzBufferedRef.wait_in  s    =D  &&""7='-NNI::	:::::I"
9*1-.1)<  tvvvvvr   c                   | j         sJ | j        t          k    rdS t          j        | j        d         dz   d          }|                     |j        |j        |          }t          d |D                       }t          j        | j        j        |         j        |         |j        |         | j                                                   dS )z Waits for output copy to finish.Nr   rB   r?   c              3  J   K   | ]}t          j        d |j                  V  dS r   r   r   s     r   r   z'BufferedRef.wait_out.<locals>.<genexpr>  r   r   )r   r   r   r   rP   r   r   r:   r<   r   r   r   r   r   r   r  )r   r   r   	prev_slotr   r   s         r   wait_outzBufferedRef.wait_out  s    >D  &&)!,q0!44I""7='-NNI::	:::::I"#&y1
9  tvvvvvr   Fc                l      j         sJ  j        " fd} fd}t          j        |||           dS dS )z-Set accumulator or zero it out to initialize.Nc                 V    t          j         j        d                    j        d<   d S N.)rj   
zeros_liker   r   s   r   _initz*BufferedRef.set_accumulator.<locals>._init  s&    !nT^C-@AAsr   c                 b     j         d                              j                   j        d<   d S r  )r   astyper   r   s   r   _setz)BufferedRef.set_accumulator.<locals>._set  s-    ".s3::4>JJsr   )r   r   r   cond)r   initr  r  s   `   r   set_accumulatorzBufferedRef.set_accumulator  sx    ~!B B B B BK K K K K	htUD!!!!! "!r   c                Z   | j         sJ | j        t          j        }| j        j        t          j        k    rt          j        }| j        d                             |          | j        d                             |          z                       | j        j                  | j        d<   dS dS )zAdd into the current slot.N.)	r   r   rj   float32r   r<   r   r   r  )r   accum_dtypes     r   r   zBufferedRef.accumulate  s    ~!Kk				)	)i 
3

&
&{
3
3
.

$
$[
1
12t}"## s "!r   N)r0   r   )F)r   r   r   r   __annotations__r   classmethodr+   r   r   r   r   propertyr   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r   r   r   r   r   r      s         0 ***        7 7 7
   ; $. $. $. ;$.L 5 5 ;5 6 6 ;6 ; ; ;; ! ! 8! E E 8E " " 8" E E 8E J J 8J K K 8K 6 6 86  8 8 8  - - -
7 7 7r    	 	 	
 
 
." " " "$ $ $ $ $r   r   c                ,    t          | t                    S r   )r_   r   r   s    r   r   r     s    jK00 r   r   c                  r    e Zd ZdZ	 	 	 dddZd	 Zd
 Zd Zd ZddZ	ddZ
ddZddZddZddZddZdS )	Schedulerz;Sequences input and output copies and waits for a pipeline.Nr   rd   r   tuple[int | jax.Array, ...]grid_offsetsc                   || _         || _        || _        || _        || _        t          |          | _        |dk    | _        || j        dz
  k    | _        || j        z  | _	        || j        z  | _
        t          |dz
  | j                  | _        t          |dz   | j                  | _        t          |||          | _        t          | j        ||          | _        t          | j        ||          | _        dS )a  Initializes scheduler.

      Args:
        step: inner step number.
        grid: pallas grid for BufferedRefs.
        grid_offsets: offsets for grid indices (used for megacore).
        first_cycle: whether this is the first invocation of the pipeline.
        last_cycle: whether this is the last invocation of the pipeline.
        init_accumulators: do we zero-initialize accumulator state for this
          invocation of the pipeline.
    r   rB   N)r   r   first_cycle
last_cycleinit_accumulatorsr   	num_steps
first_step	last_stepfirst_step_everlast_step_everrR   	prev_step	next_stepr   r   prev_indicesnext_indices)r   r   r   r  r   r!  r"  s          r   __init__zScheduler.__init__  s    & DIDI"D DO.D  %%DN aiDOT^a//DN '8D$t~5D $(DN33DN$(DN33DN  dL99DL$l D %l Dr   c           	         t          j        t          t          t           j        | j        | j                                      S r   )r   grid_envlistrw   GridAxisr   r   r   s    r   r.  zScheduler.grid_env  s8    S%t|TY??@@B B Br   c                ^     |j         | j         } |j         | j         }t          ||          S r   )r   r   r*  r}   )r   buffered_refr   r*  s       r   has_changedzScheduler.has_changed  5    (l($,7G-<-t/@AL'<000r   c                ^     |j         | j         } |j         | j         }t          ||          S r   )r   r   r+  r}   )r   r2  r   r+  s       r   will_changezScheduler.will_change  r4  r   c                8    |                     || j                  S r   )r   r   )r   r2  refs      r   alias_local_refszScheduler.alias_local_refs   s    ))#t|<<<r   c                P    |t           } |d                    }t          j        d          5  t          j         j                  fd            }t          j        |           fd            }                                 d d d            d S # 1 swxY w Y   d S )Nprologue_copy_inep_initializec                 0                                       d S r   )r   r2  s   r   _init_slotsz)Scheduler.initialize.<locals>._init_slots.  s    !!!!!r   c                 P     j         r                     j                   d S d S r   )r   r   r   r2  r   r   s   r   _startz$Scheduler.initialize.<locals>._start2  s6      	6


w
5
5
5
5
5	6 	6r   )_default_scheduler4   named_scoper]   whenr&  r   )r   r2  r   schedulepredr?  rB  s   ```    r   
initializezScheduler.initialize(  s   "h'8&'lGDDD		)	)    	wt#$$" " " " %$" 
wt}}6 6 6 6 6 6 }6                                    s   ABB"Bc                     |t           } |d                    }t          j        d           fd            }t          j        d           fd            }t          j        |||           d S )Nr  
ep_wait_inc                      j         r                     j                    j        r                     j                   d S d S r   )r   r  r   r   r  r"  rA  s   r   _waitz Scheduler.wait_in.<locals>._wait@  sZ    		 4Wdl333		$ = 	$$T%;<<<<<= =r   ep_set_accumc                     j         r=t          j        j                                      z            fd            } d S d S )Nc                 <                          j                   d S r   )r  r"  )r2  r   s   r   _set_accumulatorz=Scheduler.wait_in.<locals>._no_wait.<locals>._set_accumulatorL  s"     
&
&t'=
>
>
>
>
>r   )r   r]   rE  r$  r3  )rP  r2  r   s    r   _no_waitz#Scheduler.wait_in.<locals>._no_waitH  sk    		$ ?	4#3#3L#A#AA	B	B	? 	? 	? 	? 	? 
C	B	? 	? 	?? ?r   rC  r4   rD  r   r  )r   r2  r   rF  rG  rL  rQ  s   ```    r   r  zScheduler.wait_in;  s    "h8It\7;;D_\""= = = = = = #"= 	_^$$? ? ? ? ? %$? HT5(#####r   c                     |t           } |d                    }t          j        |          t          j        d           fd                        }d S )Nr   
ep_copy_inc                 h    j         r)t          j        j                   fd            } d S d S )Nc                 >                          j                   d S r   r   r+  rA  s   r   _copy_inz2Scheduler.copy_in.<locals>._send.<locals>._copy_in]  "    


w(9
:
:
:
:
:r   r   r]   rE  r%  )rX  r2  r   r   s    r   _sendz Scheduler.copy_in.<locals>._sendX  s`     
	 ;	$.	!	!	; 	; 	; 	; 	; 	; 
"	!	; 	; 	;; ;r   rC  r]   rE  r4   rD  r   r2  r   rF  rG  r[  s   ```   r   r   zScheduler.copy_inS  s|    "h8It\7;;DWT]]_\""; ; ; ; ; ; #" ]; ; ;r   c                     |t           } |d                    }t          j        |          t          j        d           fd                        }d S )Nprefetchep_prefetchc                 f    j         r(t          j        j                  fd            } d S d S )Nc                 >                          j                   d S r   rW  rA  s   r   _prefetch_inz7Scheduler.prefetch.<locals>._send.<locals>._prefetch_inn  rY  r   rZ  )rc  r2  r   r   s    r   r[  z!Scheduler.prefetch.<locals>._sendi  s^     
	 ;		 	 	; 	; 	; 	; 	; 	; 
!	 	; 	; 	;; ;r   r\  r]  s   ```   r   r_  zScheduler.prefetchd  s|    "h8JlG<<DWT]]_]##; ; ; ; ; ; $# ]; ; ;r   c                     |t           } |d                    }t          j        |          t          j        d           fd                        }d S )Nr  ep_wait_outc                 P     j         r                     j                   d S d S r   )r   r  r*  r2  r   r   s   r   rL  z!Scheduler.wait_out.<locals>._waitw  s9     
	 :gt'899999: :r   r\  )r   r2  r   rF  rG  rL  s   ```   r   r  zScheduler.wait_outr  s|    "h8JlG<<DWT]]_]##: : : : : : $# ]: : :r   c                     |t           } |d                    }t          j        d           fd            }t          j        d           fd            }t          j        |||           d S )Nr   ep_copy_outc                      j         r                                   j        r                     j                   d S d S r   )r   r   r   r   r   rg  s   r   _copy_out_and_accumulatez4Scheduler.copy_out.<locals>._copy_out_and_accumulate  sS    		$ "!!!		 5gt|444445 5r   ep_accumc                 b    j         r&t          j        j                  fd            } d S d S )Nc                 0                                       d S r   )r   r>  s   r   _accumulatezAScheduler.copy_out.<locals>._just_accumulate.<locals>._accumulate  s    

!
!
#
#
#
#
#r   )r   r]   rE  r%  )ro  r2  r   s    r   _just_accumulatez,Scheduler.copy_out.<locals>._just_accumulate  sT    		$ $ 
	 	 	$ 	$ 	$ 	$ 
!	 	$ 	$ 	$$ $r   rR  )r   r2  r   rF  rG  rk  rp  s   ```    r   r   zScheduler.copy_out  s    "h8JlG<<D_]##5 5 5 5 5 5 $#5
 	_Z  	$ 	$ 	$ 	$ 	$ ! 	$ HT+-=>>>>>r   c                     |t           } |d                    }t          j        |          t          j        d           fd                        }d S )Nepilogue_wait_outep_finalizec                 x     j         r1                                                       j                   d S d S r   )r   r   r  r   rg  s   r   _endz Scheduler.finalize.<locals>._end  sJ     
	 5!!!gt|444445 5r   r\  )r   r2  r   rF  rG  ru  s   ```   r   finalizezScheduler.finalize  s}    "h(8'(|WEEDWT]]_]##5 5 5 5 5 5 $# ]5 5 5r   )NNN)r   rd   r   r  r  r  r   )r   r   r   r   r,  r.  r3  r6  r9  rH  r  r   r_  r  r   rv  r   r   r   r  r    s
       CC  !%/ / / / /bB B B1 1 1
1 1 1
= = =       &$ $ $ $0; ; ; ;"; ; ; ;	: 	: 	: 	:? ? ? ?0
5 
5 
5 
5 
5 
5r   r  c                    | j         S r   r&  rS   brefrJ   s      r   r   r     	    (9 r   c                <    |                      |          | j        z  S r   )r3  r$  ry  s      r   r   r     s    q}}T22Q\A r   c                >    |                      |          | j         z  S r   r6  r'  ry  s      r   r   r         q}}T22a6F5FF r   c                N    |                      |          | j        z  | j         z  S r   )r6  r%  r'  ry  s      r   r   r     s(    	
t		q{	*q/?.?? r   c                N    |                      |          | j        z  | j         z  S r   )r3  r$  r&  ry  s      r   r   r     s(    	
t		q|	+0A/AA r   c                <    |                      |          | j        z  S r   )r6  r%  ry  s      r   r   r     s    d 3 3ak A r   c                    | j         S r   r'  ry  s      r   r   r     	    )9 r   )r;  r  r   r_  r  r   rr  c                    | j         S r   rx  ry  s      r   r   r     r{  r   c                <    |                      |          | j        z  S r   r3  r&  ry  s      r   r   r     s    q}}T22Q5FF r   c                >    |                      |          | j         z  S r   r~  ry  s      r   r   r     r  r   c                >    |                      |          | j         z  S r   r~  ry  s      r   r   r     s    d 3 3q7G6G G r   c                >    |                      |          | j         z  S r   r  ry  s      r   r   r     s    d 3 3q7H6H H r   c                <    |                      |          | j        z  S r   r~  ry  s      r   r   r     s    d 3 3a6F F r   c                    | j         S r   r  ry  s      r   r   r     r  r   r   c                Z    i | }dD ]#}d }t          j        || |                   ||<   $|S )z=Skip input copies in schedule when init_accumulators is True.)r;  r  r   c                L     | | }|d         j         r||d         j         z  }|S )NrB   r   )r   r"  )original_pred_fnr$   rG  s      r   new_predz:skip_input_copies_when_init_accumulators.<locals>.new_pred  s7    q!d	
1	 (1'''kr   )rx   partial)rF  new_schedulekr  s       r   (skip_input_copies_when_init_accumulatorsr    sU    H,5  a    ' LOO 
r   c                    t           t          d}t          | t                    r||                                          S | S )zGRetrieve a named pipeline schedule or pass through fully specified one.)defaultfixed)rC  _fixed_scheduler_   strcopy)rF  predefined_scheduless     r   get_pipeline_scheduler    sG     #  # 1)..000	/r   Fin_specs	out_specsshould_accumulate_outc                   t          |           }t          | t          t          f          s| f} t          |t          t          f          s|f}t          | t                    rt          |           } t          |t                    rt          |          }|d|         }||d         }d }t          j                            || |          }d }	t          j                            |	|||          }
g ||
R S )a"  Create BufferedRefs for the pipeline.

  This function creates buffered refs for an inner pipeline that can be
  created at the top-level of a pallas call such that they may be reused across
  multiple invocations of the inner pipeline.

  Args:
    in_specs: input pallas block specs
    out_specs: output pallas block specs
    should_accumulate_out: booleans to indicate which outputs should be treated
      as accumulators.

  Returns:
    A list of BufferedRefs, one corresponding to each ref specified in the
    in_specs and out_specs.
  Nc                B    t                               | |j                  S r   )r   r   r<   )in_specin_refs     r   make_input_brefz2make_pipeline_allocations.<locals>.make_input_bref  s    Wfl333r   c                    |r t                               | |j                  S t                               | |j                  S r   )r   r   r<   r   )out_specout_refr   s      r   make_output_brefz3make_pipeline_allocations.<locals>.make_output_bref   s:     >$$Xw}===h666r   )r*   r_   r/  r   r4   rv   rw   )r  r  r  refsnum_in_specsin_refsout_refsr  in_brefsr  	out_brefss              r   make_pipeline_allocationsr    s   . X,	HtUm	,	, {H	Ie}	-	- I$ XH	4   !i  I,',-- (4 4 4X\\/8W==(7 7 7 hll	8-BD D)	 8	 i	 	  r   c                      e Zd ZdS )GridDimensionSemanticsN)r   r   r   r   r   r   r  r  )  s        $r   r  r   r  	core_axis
int | Nonedimension_semantics)tuple[GridDimensionSemantics, ...] | None?tuple[tuple[int | jax.Array, ...], tuple[int | jax.Array, ...]]c                D    | dt                     z  fS t          j        |          t          t                    st          d|          dk    r dt                     z  fS |t          ft                     z  }t          |          t                     k    rt          d          d t          |          D             }|st          d|          t           fd|D                       rt          d            fd	|D             rfd
t          t          |                    D             ^}} |         z  }t          j        |          |z  }t          j         ||          }t          j        dt                     z  ||          }	nt           fd|D                       fdt                     D             ^}
}t           |
                   \  }}|dk    s
J |            t          j        |          }t!          j        ||k     |dz   |          }t          j         |
|          }t!          j        ||k     ||z  ||z  |z             }t          j        dt                     z  |
|          }	||	fS )Nr   z>Cannot partition grid over dynamic number of cores: core_axis=rB   z4dimension_semantics must be the same length as grid.c                .    h | ]\  }}|t           k    |S r   )PARALLEL)r#   r   ds      r   	<setcomp>z"_partition_grid.<locals>.<setcomp>G  s,     + + +tq!MM )MMr   zRCannot partition over cores without parallel grid dimensions: dimension_semantics=c              3  P   K   | ] }t          |         t                     V  !d S r   r_   r1   r#   r   r   s     r   r   z"_partition_grid.<locals>.<genexpr>P  s4      CC!ZQ%%	%CCCCCCr   z?Cannot partition cores over only dynamic grid dimensions: grid=c                h    h | ].}t          |         t                    r|         z  d k    ,|/S r  r  )r#   r   r   	num_coress     r   r  z"_partition_grid.<locals>.<setcomp>U  sM       	DGS	!	!&*1g	&9Q&>&> &>&>&>r   c              3  $   K   | ]
}|v |V  d S r   r   )r#   r   divisible_dimensionss     r   r   z"_partition_grid.<locals>.<genexpr>Z  s7       % %a;O6O6O6O6O6O6O% %r   c              3  ^   K   | ]'}t          |         t                    |         V  (d S r   r  r  s     r   r   z"_partition_grid.<locals>.<genexpr>j  sV       %B %B(247C(@(@%BT!W %B %B %B %B %B %Br   c              3  X   K   | ]$\  }}t          |t                    r
|k     |V  %d S r   r  )r#   r   r  largest_parallel_dimensions      r   r   z"_partition_grid.<locals>.<genexpr>l  sS        Aqa #$'A"A"A 	
"A"A"A"A r   r   )r*   r]   num_programsr_   r1   r   	ARBITRARYr)   	enumerateallrange
program_idjax_utiltuple_updatemaxdivmodrj   rk   )r   r  r  parallel_dimensionsfirst_divisible_dimensionrJ   partitioned_dim_sizepartitioned_dim_offsetnew_gridr   partition_dimensionbase_num_itersrP   
core_index	num_itersgrid_offsetr  r  r  s   `               @@@r   _partition_gridr  /  s   
 D		!!!oi(()	Is	#	# 
KyKK   !^^D		!!!  $,T2	T**
K
L
LL+ +y1D'E'E + + + 
 
	#	# 	#   	CCCC/BCCCCC 
LTLL      $    ,% % % %01122% % %!   9:iG]9558LL$')= H #s4yy35K GG "% %B %B %B %B6I %B %B %B "B "B   dOO  !
 !&9!:IFFNC777C777 y))J	*s*NQ,>(* *I$T+>	JJH
 )SY^#c) K
 #s4yy-{ G 
7	r   )r  r  r  r  r  c          
     >   	
 t          d D                       r+t          d D                       }t          d|           t          ||          \  t	                    	t          t          t          f          sft          t          t          f          sft          t                    rt                    t          t                    rt                    t                    dddddddddd 	
fd

S )a  Creates a function to emit a manual pallas pipeline.

  This has the same semantics as pallas_call but is meant to be called inside
  pallas_call for nesting grids. This is useful when you need to have separate
  windowing strategies for communication and computation.

  The new argument `should_accumulate_out` can be used to specify which outputs
  we should accumulate into automatically within and across pipeline
  invocations.

  Args:
    body: pallas kernel to set up pipeline for.
    grid: a pallas grid definition.
    in_specs: input pallas block specs
    out_specs: output pallas block specs
    should_accumulate_out: booleans to indicate which outputs should be treated
      as accumulators.
    core_axis: optional int, indicates whether or not to partition the grid
      along the core axis.
    dimension_semantics: optional tuple of GridDimensionSemantics (e.g. PARALLEL
      or ARBITRARY).
  c              3  Z   K   | ]&}t          |t          t          j        f           V  'd S r   )r_   r1   r4   Arrayr#   r  s     r   r   z emit_pipeline.<locals>.<genexpr>  s6      ;;ZC+,,	,;;;;;;r   c              3  4   K   | ]}t          |          V  d S r   )typer  s     r   r   z emit_pipeline.<locals>.<genexpr>  s(      --1tAww------r   z5Grid must consist of Python integers and JAX Arrays: NTF	scratchesallocationsr   r!  r"  r_  postyeetrF  r  r   r   CondValr!  r"  c                     d ,t          j         f	dt          d          S t          t                    rt                    t          d           t          t          t
          f          st          fd          t          t                    rt                    t          d           
 fd}	t          j        d	|	d           dS )
a3  
    Run the pipeline.

    Args:
      *ref_args: a list of pallas refs (or more generally a list of pytrees of
        pallas refs)
      scratches: scratch buffers for the inner kernel
      allocations: a list of BufferedRefs, one corresponding to each ref
      first_cycle: boolean indicating if this is the first invocation of the
        inner pipeline cycle.
      last_cycle: boolean indicating if this is the last invocation of the
        inner pipeline cycle.
      init_accumulators: whether to zero-init accumulators during this cycle.
      prefetch: callback called as fn(*brefs, scheduler) that is used to fetch
        the next cycle invocations first inputs.  Called during the inputs phase
        in the final inner step.
      postyeet: callback called as fn(*brefs, scheduler) that is used to finish
        any writes or transfers from the last output of the previous cycle.
        Called during the outputs phase in the first inner step.
      schedule: manually specified pipeline schedules for brefs, None indicates
        default schedule.
    Nr   c                "   	  	| dS )Nr  r   )
r  r   r"  r!  pipeliner  r_  r  rF  r  s
    r   r   z1emit_pipeline.<locals>.pipeline.<locals>.<lambda>  s3    hh!%%# 1
 
 
 r   r  c                    d S r   r   r   s    r   r   z1emit_pipeline.<locals>.pipeline.<locals>.<lambda>  s    T r   c                    S r   r   )r   rF  s    r   r   z1emit_pipeline.<locals>.pipeline.<locals>.<lambda>  s    X r   c                     t          |          S r   )r  )rJ   r   s     r   r   z1emit_pipeline.<locals>.pipeline.<locals>.<lambda>  s    *1-- r   c                   t          | 	
          t          j                  t          j                   t          j                   t          j                   t          j        d          5  #t          j	        | dz
  k    fdd            d d d            n# 1 swxY w Y   t          d           }t          j        d          5  
                                5   g |R   d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   t          j                   t          j        d          5   t          j	        | d	k    fd
d            d d d            n# 1 swxY w Y   t          j                   t          j                   dS )N)r  r   r!  r"  r`  rB   c                      g  R  S r   r   )brefsr_  	schedulers   r   r   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>      ((5E59555 r   c                     d S r   r   r   r   r   r   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>      $ r   c                    | j         S r   )r   r   s    r   r   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>  s     r   ep_run_kernelep_postyeetr   c                      g  R  S r   r   )r  r  r  s   r   r   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>#  r  r   c                     d S r   r   r   r   r   r   zDemit_pipeline.<locals>.pipeline.<locals>.loop_body.<locals>.<lambda>$  r  r   r   )r  	map_brefsr9  rH  r  r   r4   rD  r   r  r.  r  r   rv  )r   rJ   current_refsr  r  r  bodyr   r   r  r"  r!  r#  r  r_  r  rF  r  s      @@r   	loop_bodyz2emit_pipeline.<locals>.pipeline.<locals>.loop_body  s   

#!-/ / /i 	2KFFe 	$eT8<<<	!5$999	!5$999 ?=))    
(49q=(555555,                                   66>>l??++ * *!! 	* 	*
$
)
)y
)
)
)
)	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	** * * * * * * * * * * * * * *
 	"E4:::?=))    
(419555555,                                  
 	"E4:::	"E4:::RsZ   &B::B>B>*D-?D
D-D	D-D	D--D14D1##FFFr   )	r   
run_scopedr  r_   r/  r   r  r   	fori_loop)r  r  r   r!  r"  r_  r  rF  r  r  r  r   r  r  r#  r  r  r  s   ````````` r   r  zemit_pipeline.<locals>.pipeline  s   B i"
 
 
 
 
 
 
 
 
 
 
 
 $!$9	; ; ;  $ +t$$ '+&&k>>;77hhu.. <----{;;h(D!! !xh--{HF FH* * * * * * * * * * * * * * * * *Z M!Y	2.....r   )r  r   r   r  r!  r  r"  r  )anyr   r)   r  r   r_   r/  r/   )r  r   r  r  r  r  r  
grid_typesr  r#  r  s   `````   @@@r   emit_pipeliner    s   @ 	;;d;;;;; -------J
L
LL   'tY8KLL$)	HtUm	,	, {H	Ie}	-	- I$ XH	4   !i  I./DiPP !&p/ p/ p/ p/ p/ p/ p/ p/ p/ p/ p/ p/ p/ p/ p/d 
/r   c               l    t          j        t          |||          }t          | ||||          }||fS )a  Creates pallas pipeline and top-level allocation preparation functions.

  Args:
    body: pallas kernel to set up pipeline for.
    grid: a pallas grid definition.
    in_specs: input pallas block specs
    out_specs: output pallas block specs
    should_accumulate_out: booleans to indicate which outputs should be treated
      as accumulators.

  Returns:
    (emit_pipeline, make_allocations) function pair, where:
    emit_pipeline is the pallas pipeline function.
    make_allocations is a function to create buffered refs for the inner
      pipeline that can be created at the top-level of a pallas call to be
      reused across multiple invocations of the inner pipeline.

  r  )r   r  r  r  )rx   r  r  r  )r  r   r  r  r  make_allocationsr  s          r   emit_pipeline_with_allocationsr  0  s^    4 &'@%'*?A A A 
13 3 3( 
#	##r   )r0   r1   )r:   r;   r<   r=   r0   r;   )rS   r1   rT   r1   r0   r1   )rX   rY   rZ   rY   r0   r[   )
rc   rd   re   r1   rZ   r1   rf   r1   r0   rg   )r0   r   )r   r  r  r  r  r  r0   r  )r   r  r  r  r  r  )Rr   
__future__r   collections.abcr   r   enumrx   r   r   typingr   r   r4   r   r   jax._srcr	   r  jax._src.pallasr
   r   r   jax._src.pallas.mosaictpu_corer   jax.experimentalr   r]   	jax.numpynumpyrj   npTPUMemorySpacer   r   r   r   	MemoryRefREFr  ArrayRefr   GridIndicesboolr  	BlockSpecPipelineBlockSpecsPipelineRefsrE   r/   r9   rN   rR   rW   rb   rp   r}   r   r   Enumr   register_pytree_node_class	dataclassr   r  rv   rw   r  r  dictrC  r  r  r  r  r  r  r  r  r  r  r   r   r   <module>r      s(   E D " " " " " " $ $ $ $ $ $                       



             % % % % % % / / / / / / 4 4 4 4 4 4 3 3 3 3 3 3 ? ? ? ? ? ? ) ) ) ) ) )           ## &ci CIsN#
	4
 8K$9:C?@ Xc]C'( = = =&   F F F F$  
% % % %   7 7 7 7.B B B  	E 	E 	E        %d###c$ c$ c$ c$ c$ c$ c$ $# &%c$N	 IHL002 2 2	
D5 D5 D5 D5 D5 D5 D5 D5h D99AAFF   BA99   . $99FFFFGGHHFF99     $ =<=NOO ::?KK     	+! +! +! +! +!\       !!##""$$	W W W W|  EId d d d d dV %$ %$ %$ %$ %$ %$ %$r   