
    VpfH                      d   d Z ddlmZ ddlZddlmZ ddlZddlZddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ 	 ddlmZ n# e$ r dZY nw xY weZej        j        Zej        j        Zdededededef
dZ dededededef
dZ!dededededef
dZ"dededededef
dZ#dedededede$e         f
dZ%dededededef
dZ&dededededef
dZ'dededededede(e)eef         e)eef         e)eef         e)eef         f         fdZ*dej+        defdZ, eed          	 d>d ed!ed"eded#edededed$e-dedej+        de(eeef         fd%            Z. eej/        d&'          d ed!ed"ed(e)eef         d)e)eef         d*e)eef         d+e)eef         d#edededed$e-dede(eeef         fd,            Z0d#ede(e(eef         ef         fd-Z1d.ed#efd/Z2d0ed#edefd1Z3d ed!ed"ed2ed#edededed$e-dedej+        fd3Z4dededed$e-ded4efd5Z5d6 Z6 e
j7        d7          Z8d8e8_9        e8:                     eej;        e8                     e8<                    e5           er ej=        e8ej>        d9:           dededed$e-dedej+        fd;Z?dededed$e-ded4efd<Z@ e
j7        d=          ZAd8eA_9        eA:                     eej;        eA                     eA<                    e@           er ej=        eAejB        d9:           e.C                    e4e?           dS )?a
  `jax.experimental.rnn`: GPU accelerated RNN

----------------------------------------------

This module provides experimental support to CUDNN-backed LSTM.

Currently, the only supported RNN flavor is LSTM with double-bias. We use
notations and variable names similar to
https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM

and CUDNN_LSTM entry in
https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNMode_t.

Note that a bidirectional LSTM is treated as having twice the number of layers,
where a forward layer i is followed by a reverse layer i. Each direction has
its own associated weights. We use pseudo-layer to denote such layers
following CUDNN documentation
https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetRNNWeightParams.

CUDNN takes an opaque 1D weight array that densely packs all the weight arrays
in a sparsely documented layout. Through trial-and-error and testing, we believe
the layout is the following. Assume 2-layer bi-LSTM with double-bias, so 4
pseudo-layers in total (forward-0, reverse-0, forward-1, reverse-1).

There are 4 kinds of weights: W_ih, W_hh, b_ih and b_hh, where

W_ih = (W_ii, W_if, W_ig, W_io) concatenated on leading axis,
W_hh = (W_hi, W_hf, W_hg, W_ho) concatenated on leading axis,
b_ih = (b_ii, b_if, b_ig, b_io) concatenated on leading axis,
b_hh = (b_hi, b_hf, b_hg, b_ho) concatenated on leading axis.

Say W_ih^0 denotates W_ih from pseudo-layer 0. The linear weights are packed
together from all pseudo-layers followed by bias weights from all pseudo-layers.
In particular, for each layer, W_ih is followed by W_hh and b_ih by b_hh.

(W_ih^0, W_hh^0, W_ih^1, W_hh^1, W_ih^2, W_hh^2, W_ih^3, W_hh^3,
 b_ih^0, b_hh^0, b_ih^1, b_hh^1, b_ih^2, b_hh^2, b_ih^3, b_hh^3)

See `get_params_shapes_in_lstm`.

Example usage:
```
  x = jax.random.normal(
      k1, (batch_size, seq_len, input_size), dtype=jnp.float32)
  h_0 = jax.random.normal(
      k2, (num_directions * num_layers, batch_size, hidden_size),
      dtype=jnp.float32)
  c_0 = jax.random.normal(
      k3, (num_directions * num_layers, batch_size, hidden_size),
      dtype=jnp.float32)
  seq_lengths = jnp.ones((batch_size,), dtype=jnp.int32) * seq_len
  weights = rnn.init_lstm_weight(k4, input_size, hidden_size, num_layers,
                                 bidirectional)
  y, h_n, c_n = rnn.lstm(
      x,
      h_0,
      c_0,
      weights,
      seq_lengths=seq_lengths,
      input_size=input_size,
      hidden_size=hidden_size,
      num_layers=num_layers,
      dropout=False,
      bidirectional=bidirectional)
```

TODO:
  - Add support for input and weight dtypes other than float32.
  - Support ragged inputs.
  - Support RNNs other than LSTM.
    )partialN)Any)core)mlir)xla)
custom_vjp)ArrayShape)lax)gpu_rnnlayer_i
input_sizehidden_sizebidirectionalreturnc                 L    | dk    s| dk    r	|rd|z  |fS |rdnd}d|z  ||z  fS )zSShape of W_ii|W_if|W_ig|W_io.

  Note that layer_i is an index of pseudo-layers.
  r             )r   r   r   r   num_directionss        T/var/www/html/nettyfy-visnx/env/lib/python3.11/site-packages/jax/experimental/rnn.py_W_ih_lr   l   sJ     \\gll}lOZ(('.QQQNO^k9::    c                     d|z  |fS )zShape of W_hi|W_hf|W_hg|W_ho.r   r   r   r   r   r   s       r   _W_hh_lr   y   s     k/;	''r   c                     d|z  fS )zShape of b_ii|b_if|b_ig|b_io.r   r   r   s       r   _b_ih_lr           k/	r   c                     d|z  fS )zShape of b_hi|b_hf|b_hg|b_ho.r   r   r   s       r   _b_hh_lr"      r    r   
num_layersc                 @   g }|rdnd}||z  }t           t          g}t          |          D ]*}|D ]%}	 |	|| ||          }
|                    |
           &+t          t
          g}t          |          D ]*}|D ]%}	 |	|| ||          }
|                    |
           &+|S )z?Get flat param shapes in LSTM. See module docstring for layout.r   r   )r   r   rangeappendr   r"   )r   r   r#   r   layer_shapesr   num_pseudo_layerslinear_weightsiw_kindlayer_shapebias_weightss               r   _get_params_shapes_in_lstmr.      s     ,%,111. >1W%."## ' 'a  ' 'F1j+}EEk+&&&&' 7#,"## ' 'a ' 'F1j+}EEk+&&&&' 
r   c                 \    t          | |||          }t          d |D                       }|S )zGet param count in LSTM.c              3   >   K   | ]}t          j        |          V  d S N)mathprod).0shapes     r   	<genexpr>z)get_num_params_in_lstm.<locals>.<genexpr>   s,      ??DIe$$??????r   )r.   sum)r   r   r#   r   r'   param_counts         r   get_num_params_in_lstmr9      s>     ,JZ,9; ;,??,?????+	r   rngc                     t          ||||          }t          j        d|z            }t          j                            | |ft          j        | |          S )zDRandom initialize LSTM weights from U(-k, k), k=sqrt(1/hidden_size).g      ?)r5   dtypeminvalmaxval)r9   npsqrtjaxrandomuniformjnpfloat32)r:   r   r   r#   r   r8   ks          r   init_lstm_weightrG      sc     'z;
'46 6+gcK  !				+s{A2a 
 
I 
I Ir   weightsc                    t          ||||          }d}d}|rdnd}||z  }	i }
i }t          |	          D ]R}|
|fD ]K}||         }|dz  }t          j        |          }| |||z                                |          ||<   ||z  }LSi }i }t          |	          D ]R}||fD ]K}||         }|dz  }t          j        |          }| |||z                                |          ||<   ||z  }LS|
|||fS )a  Unpack cudnn LSTM weights into individual weights.

  CUDNN LSTM weight layout: (num_layers, num_directions, W_ih, W_hh, b_ih, b_hh)
  Returns W_ih, W_hh, b_ih, b_hh. e.g. W_ih[2][1] is the concat weights of
  4 weights (W_ii, W_if, W_ig, W_io), each of shape (hidden_size, input_size)
  at 2nd layer for the reverse direction. See notations from
  https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM.
  r   r   r   )r.   r%   r2   r3   reshape)rH   r   r   r#   r   flat_shapesflat_shapes_offset	w_offsetsr   r(   W_ihW_hhlr+   r5   	num_elemsb_ihb_hhs                     r   unpack_lstm_weightsrT      su    +:{J+8: :+)%,111. >1$$"##  a,  ,-eA)E""i)I	$99:BB5IIfQi9ii $$"##  a,  ,-eA)E""i)I	$99:BB5IIfQi9ii 
tT4	r   	precisionc                    t          j        |           } | dS | \  } }| t           j        j        k    rdS | t           j        j        k    rdS | t           j        j        k    rt          d          t          d|            )NTFz)bfloat16 support not implemented for LSTMz%Unexpected precision specifier value )r   canonicalize_precision	PrecisionHIGHESTHIGHDEFAULTNotImplementedError
ValueError)rU   _s     r   _lstm_cudnn_allow_tf32r_      s     (33)4,)Q#-'''5CM&&&4CM)))
I
J
JJ
HYHH
I
IIr   )            	   
   )nondiff_argnumsxh_0c_0seq_lengthsdropoutc                 N    t          | |||||||||	|
          \  \  }}}}|||fS )a`  LSTM via CuDNN or HIPDNN (not-yet-supported).

  Assume batch-first inputs.

  Arguments:
    x: (batch_size, max_seq_length, input_size)
    h_0: (num_directions * num_layers, batch_size, hidden_size)
    c_0: (num_directions * num_layers, batch_size, hidden_size)
    weights: (num_params,) where num_params = get_num_params_in_lstm(...)
    seq_lengths: (batch_size,)
  Returns: (y, h_n, c_n, reserve_space).
    y: (batch_size, max_seq_length, hidden_size * num_directions)
    h_n: (num_directions * num_layers, batch_size, hidden_size)
    c_n: (num_directions * num_layers, batch_size, hidden_size)
  )r   r   r#   rk   r   rU   )lstm_fwd)rg   rh   ri   rH   rj   r   r   r#   rk   r   rU   yh_nc_nr^   s                  r   lstmrq      sR    & 		!  -1c3 
Cr   )rc   rd   re         )static_argnumsrN   rO   rR   rS   c           	         |j         t          j         d          k    rt          d          |dk    rt          d          d }d }|                     ddd	          }|sg }g }t	          |
          D ]}t          |||         ||         ||         ||         
          }t          ||          }t          j                            |||         ||         f|          }t          ||          \  \  }}}|
                    |           |
                    |           t          j        |          }t          j        |          }|                    ddd	          ||fS g }g }t	          |
d	z            D ]5}t          |||         ||         ||         ||         
          }t          ||          }|d	z  dk    rFt          j                            |||         ||         f|          }t          ||          \  \  }}}n}t          ||          }t          j                            |||         ||         f|          }t          ||          \  \  }}}t          ||          }t          j        ||gd          }|
                    |           |
                    |           7t          j        |          }t          j        |          }|                    ddd	          ||fS )zReference implementation of LSTM.

  See https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#lstm
  https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnRNNMode_t
  int32 `seq_lengths` can only be int32.g        zWDropout not supported in LSTM reference because we cannot determine CUDNN dropout mask.c                   | \  }}t          j        |dd          \  }}	}
}t          j        |dd          \  }}}}t          j        |dd          \  }}}}t          j        |dd          \  }}}}t          ||j        z  |d          z   ||j        z  z   |d          z             }t          ||	j        z  |d          z   ||j        z  z   |d          z             }t	          ||
j        z  |d          z   ||j        z  z   |d          z             }t          ||j        z  |d          z   ||j        z  z   |d          z             }||z  ||z  z   }|t	          |          z  }||f|fS )Nr   r   axis)rD   splitsigmoidTtanh)carryrg   rN   rO   rR   rS   hcW_iiW_ifW_igW_ioW_hiW_hfW_hgW_hob_iib_ifb_igb_iob_hib_hfb_hgb_hor*   fgos                               r   	lstm_cellzlstm_ref.<locals>.lstm_cell0  su   DAq YtQQ777D$d YtQQ777D$d YtQQ777D$d YtQQ777D$dDF
T$Z'!df*4tDzABBADF
T$Z'!df*4tDzABBAQZ$t*$q46z1DJ>??ADF
T$Z'!df*4tDzABBA	AAA	DGGAq619r   c                 ,     | ||          \  }}|||ffS r1   r   )cellr   rg   rn   s       r   scan_fnzlstm_ref.<locals>.scan_fnA  s$    tE1~~HE15!*r   r   r   r   )rN   rO   rR   rS   ry   )r<   rD   r\   	transposer%   r   rA   r   scan_extract_outputr&   stack_flip_sequenceconcatenate)rg   rh   ri   rN   rO   rR   rS   rj   r   r   r#   rk   r   r   r   seq_first_yfinal_hfinal_crP   r   cell_fnouth_tc_tro   rp   seq_first_y_fwdseq_first_y_reversedseq_first_y_bwds                                r   lstm_refr     s%    #)G,,,,
@
A
AA^^
a    "   Aq!$$+	 4GG:  
$q'Qd1gDGM M Md&&gGLL3q63q6"2-8: :c /S A AjsC+nnSnnS
)G

C
)G

C  Aq))333 ''a    aQd1gDG$q'K K KDgt$$G1uzzGLL3q63q6"21<> >c$3K$E$E!jsC// ,KEEGLL
CFCF#%9; ;c$3K$E$E!jsC/&DDoO_o$FRPPPkNN3NN3	'#	'#			q!Q	'	'c	11r   c                    |\  }\  \  }}}t          ||           }t          ||           }| d          t          j        |j        d         t          j                  d d d f         k    }t          j        |d         |d          }||f|fS )Nr   )r<   ).N)_select_last_carryrD   aranger5   rv   where)	rj   r   r^   hscsr   r   r   masks	            r   r   r   q  s    "!hr22{++#2{++# 
T	SZ(9!(<CINNNqqqRVwW	W$	
9o	 	+ s[	  r   	carry_seqc                 V    | |dz
  t          j        | j        d                   f         S )Nr   )rD   r   r5   )r   rj   s     r   r   r     s'    	;?CJyq/A$B$BB	CCr   	sequencesc                     | j         d         }||z
  } t          j        t          t          j        d          dd          | |          d d d         S )Nr   ry   )r   r   r   )in_axesout_axesr   )r5   rA   vmapr   rD   roll)r   rj   	max_stepsroll_amountss       r   r   r     sh    oa )[(,
'#(+++V
 
 
\
+ 
++/4R4
1 1r   wc                     |j         t          j         d          k    rt          d          t          |
          }t                              | |||||||||	|          \  }}}}|||f| ||||||ffS )Nrv   rw   r   r   r#   rk   r   cudnn_allow_tf32)r<   rD   r\   r_   	rnn_fwd_pbind)rg   rh   ri   r   rj   r   r   r#   rk   r   rU   r   rn   ro   rp   reserve_spaces                   r   rm   rm     s     #)G,,,,
@
A
AA+I66(~~		!'  .  )  )!S#} S#CaaG	GGr   r   c           
         | j         d         | j         d         }}|	rdnd}||||z  f}t          j        || j                  }t	          j        |||||||	|
          \  }}t          j        |ft          j                  }||||fS )Nr   r   r   )r5   r   ShapedArrayr<   r   )compute_rnn_workspace_reserve_space_sizesrD   rE   )x_avalh_0_avalc_0_avalw_avalseq_lengths_avalr   r   r#   rk   r   r   
batch_sizemax_seq_lengthr   output_shapeoutput_avalr^   reserve_space_sizereserve_space_avals                      r   rnn_abstract_evalr     s      &|AQn*%,111.nn{.JK, v|<<+7
k:z>
="24 4 ! ');(=s{KK	h*<	<<r   c                    ~ | |i |S r1   r   )fnr   argskws       r   _gpu_lowering_strip_tf32r     s    	TRr   rnn_fwdTcuda)platformc                     t          |          }|\  }	}
}}}}}|\  }}}t                              ||||	|
|||||| |||||          \  }}}}||||t          j        |          fS )Nr   )r_   	rnn_bwd_pr   rD   
zeros_like)r   r   r#   rk   r   rU   	residuals	gradientsr   rg   rh   ri   r   rj   rn   r   dydh_ndc_ndxdh_0dc_0dws                          r   lstm_bwdr     s     ,I662;/!S#q+q-."dD ~~

		!'! & ) )"dD"" dD"cn[99	::r   c                     ||||fS r1   r   )dy_avaldhn_avaldcn_avalr   h0_avalc0_avalr   y_avalr   r   r   r   r#   rk   r   r   s                   r   rnn_bwd_abstract_evalr     s    
 
'6	))r   rnn_bwdr1   )D__doc__	functoolsr   r2   typingr   rA   numpyr?   jax._srcr   jax.interpretersr   r   jax._src.custom_derivativesr   jax._src.typingr	   r
   jax._src.laxr   	jax.numpyrD   jax._src.libr   ImportErrorPRNGKeyArraynnr|   r~   intboolr   r   r   r"   listr.   r9   rG   tupledictrT   PrecisionLiker_   floatrq   jitr   r   r   r   rm   r   r   	Primitiver   multiple_resultsdef_implapply_primitivedef_abstract_evalregister_loweringcudnn_rnn_loweringr   r   r   cudnn_rnn_bwd_loweringdefvjpr   r   r   <module>r     s  F FN              



           ! ! ! ! ! !             2 2 2 2 2 2 ( ( ( ( ( ( ( (            """""""   ''' 
&.
v{
;S 
;c 
; 
;
;$)
; 
; 
; 
;(S (c ( (($)( ( ( (S c  $)   S c  $)   3 S +..27;E{   *s  # *.36   I, IC Ic I!$I59I I I I' '  #' 25' CF' '  4U
T#u*-tCJ/?cFKGL BM M N'  '  '  ' TJc&7 JD J J J J0 	%8999 >B E  E E  '*8;FK),):FKESXZ_L_F`   :9B 	!3444W2 W2E W2 W2T#u*5E W2U
#W2+/U
+;W2U
#W227W2EHW2 W2 ,/W2 :?W2 !	W2 &+5%+>%?	W2 W2 W2 54W2r! !eE5L6I56P0Q ! ! ! !D% De D D D D1e 1% 1E 1 1 1 1H HE H H% He HH+.H<?HJOH H-0->H H H H*="%=47=EH=$=59= )-= = = =    DN9%%	!	  	  773.	:: ; ; ; 	  - . . .
 Q$G$>PPPP; ;3 ;C ;% ; ;-0->; ; ; ;4*8;*JM* '** 5:* KO* -1	* * * * DN9%%	!	  	  773.	:: ; ; ; 	  1 2 2 2
 B$/&B B B B Hh     s   A AA