
"""Shared neural network activations and other functions."""

from __future__ import annotations

from collections.abc import Sequence
from functools import partial
import operator
import warnings

import numpy as np
from typing import Any, Literal

import jax
import jax.numpy as jnp
from jax import custom_jvp
from jax import lax

from jax._src import config
from jax._src import core
from jax._src import dtypes
from jax._src import util
from jax._src.core import AxisName
from jax._src.cudnn.fused_attention_stablehlo import (
    dot_product_attention as cudnn_dot_product_attention,
    MaskType,
)
from jax._src.numpy import util as numpy_util
from jax._src.typing import Array, ArrayLike
from jax._src.ops.special import logsumexp as _logsumexp


class Unspecified:
  def __repr__(self):
    return "_UNSPECIFIED"
_UNSPECIFIED = Unspecified()


# activations

@custom_jvp
@jax.jit
def relu(x: ArrayLike) -> Array:
  r"""Rectified linear unit activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{relu}(x) = \max(x, 0)

  except under differentiation, we take:

  .. math::
    \nabla \mathrm{relu}(0) = 0

  For more information see
  `Numerical influence of ReLU’(0) on backpropagation
  <https://openreview.net/forum?id=urrcVI-_jRm>`_.

  Args:
    x : input array

  Returns:
    An array.

  Examples:
    >>> jax.nn.relu(jax.numpy.array([-2., -1., -0.5, 0, 0.5, 1., 2.]))
    Array([0. , 0. , 0. , 0. , 0.5, 1. , 2. ], dtype=float32)

  See also:
    :func:`relu6`

  """
  return jnp.maximum(x, 0)
# For the behavior at 0, see https://openreview.net/forum?id=urrcVI-_jRm
relu.defjvps(lambda g, ans, x: lax.select(x > 0, g, lax.full_like(g, 0)))

@jax.jit
def squareplus(x: ArrayLike, b: ArrayLike = 4) -> Array:
  r"""Squareplus activation function.

  Computes the element-wise function

  .. math::
    \mathrm{squareplus}(x) = \frac{x + \sqrt{x^2 + b}}{2}

  as described in https://arxiv.org/abs/2112.11687.

  Args:
    x : input array
    b : smoothness parameter
  """
  numpy_util.check_arraylike("squareplus", x)
  numpy_util.check_arraylike("squareplus", b)
  x = jnp.asarray(x)
  b = jnp.asarray(b)
  y = x + jnp.sqrt(jnp.square(x) + b)
  return y / 2

@jax.jit
def softplus(x: ArrayLike) -> Array:
  r"""Softplus activation function.

  Computes the element-wise function

  .. math::
    \mathrm{softplus}(x) = \log(1 + e^x)

  Args:
    x : input array
  """
  return jnp.logaddexp(x, 0)

@jax.jit
def sparse_plus(x: ArrayLike) -> Array:
  r"""Sparse plus function.

  Computes the function:

  .. math::

    \mathrm{sparse\_plus}(x) = \begin{cases}
      0, & x \leq -1\\
      \frac{1}{4}(x+1)^2, & -1 < x < 1 \\
      x, & 1 \leq x
    \end{cases}

  This is the twin function of the softplus activation ensuring a zero output
  for inputs less than -1 and a linear output for inputs greater than 1,
  while remaining smooth, convex, monotonic by an adequate definition between
  -1 and 1.

  Args:
    x: input (float)
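
  Examples:
    Illustrative values implied by the piecewise definition above:

    >>> [float(v) for v in jax.nn.sparse_plus(jnp.array([-2., -0.5, 0.5, 2.]))]
    [0.0, 0.0625, 0.5625, 2.0]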
  """
  numpy_util.check_arraylike("sparse_plus", x)
  x = jnp.asarray(x)
  return jnp.where(x <= -1.0, 0.0, jnp.where(x >= 1.0, x, (x + 1.0)**2 / 4))

@jax.jit
def soft_sign(x: ArrayLike) -> Array:
  r"""Soft-sign activation function.

  Computes the element-wise function

  .. math::
    \mathrm{soft\_sign}(x) = \frac{x}{|x| + 1}

  Args:
    x : input array
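
  Examples:
    An illustrative check of the saturating behaviour that follows from the
    formula above:

    >>> [round(float(v), 3) for v in jax.nn.soft_sign(jnp.array([-9., -1., 0., 1., 9.]))]
    [-0.9, -0.5, 0.0, 0.5, 0.9]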
  """
  numpy_util.check_arraylike("soft_sign", x)
  x_arr = jnp.asarray(x)
  return x_arr / (jnp.abs(x_arr) + 1)

@partial(jax.jit, inline=True)
def sigmoid(x: ArrayLike) -> Array:
  r"""Sigmoid activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{sigmoid}(x) = \frac{1}{1 + e^{-x}}

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`log_sigmoid`
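
  Examples:
    A minimal illustrative check of the symmetry
    :math:`\mathrm{sigmoid}(-x) = 1 - \mathrm{sigmoid}(x)`:

    >>> x = jnp.array([-2., 0., 2.])
    >>> bool(jnp.allclose(jax.nn.sigmoid(-x), 1 - jax.nn.sigmoid(x)))
    True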

  """
  return lax.logistic(x)

@jax.jit
def sparse_sigmoid(x: ArrayLike) -> Array:
  r"""Sparse sigmoid activation function.

  Computes the function:

  .. math::

    \mathrm{sparse\_sigmoid}(x) = \begin{cases}
      0, & x \leq -1\\
      \frac{1}{2}(x+1), & -1 < x < 1 \\
      1, & 1 \leq x
    \end{cases}

  This is the twin function of the ``sigmoid`` activation ensuring a zero output
  for inputs less than -1, a 1 output for inputs greater than 1, and a linear
  output for inputs between -1 and 1. It is the derivative of ``sparse_plus``.

  For more information, see `Learning with Fenchel-Young Losses (section 6.2)
  <https://arxiv.org/abs/1901.02324>`_.

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`sigmoid`
  """
  return 0.5 * jnp.clip(x + 1.0, 0.0, 1.0)

@jax.jit
def silu(x: ArrayLike) -> Array:
  r"""SiLU (aka swish) activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{silu}(x) = x \cdot \mathrm{sigmoid}(x) = \frac{x}{1 + e^{-x}}

  :func:`swish` and :func:`silu` are both aliases for the same function.

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`sigmoid`
  """
  numpy_util.check_arraylike("silu", x)
  x_arr = jnp.asarray(x)
  return x_arr * sigmoid(x_arr)

swish = silu

@jax.jit
def mish(x: ArrayLike) -> Array:
  r"""Mish activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{mish}(x) = x \cdot \mathrm{tanh}(\mathrm{softplus}(x))

  For more information, see
  `Mish: A Self Regularized Non-Monotonic Activation Function
  <https://arxiv.org/abs/1908.08681>`_.

  Args:
    x : input array

  Returns:
    An array.
  """
  numpy_util.check_arraylike("mish", x)
  x_arr = jnp.asarray(x)
  return x_arr * jnp.tanh(softplus(x_arr))

@jax.jit
def log_sigmoid(x: ArrayLike) -> Array:
  r"""Log-sigmoid activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{log\_sigmoid}(x) = \log(\mathrm{sigmoid}(x)) = -\log(1 + e^{-x})

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`sigmoid`
  """
  numpy_util.check_arraylike("log_sigmoid", x)
  x_arr = jnp.asarray(x)
  return -softplus(-x_arr)

@jax.jit
def elu(x: ArrayLike, alpha: ArrayLike = 1.0) -> Array:
  r"""Exponential linear unit activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{elu}(x) = \begin{cases}
      x, & x > 0\\
      \alpha \left(\exp(x) - 1\right), & x \le 0
    \end{cases}

  Args:
    x : input array
    alpha : scalar or array of alpha values (default: 1.0)

  Returns:
    An array.

  See also:
    :func:`selu`
  """
  numpy_util.check_arraylike("elu", x)
  x_arr = jnp.asarray(x)
  return jnp.where(x_arr > 0, x_arr,
                   alpha * jnp.expm1(jnp.where(x_arr > 0, 0., x_arr)))

@jax.jit
def leaky_relu(x: ArrayLike, negative_slope: ArrayLike = 1e-2) -> Array:
  r"""Leaky rectified linear unit activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{leaky\_relu}(x) = \begin{cases}
      x, & x \ge 0\\
      \alpha x, & x < 0
    \end{cases}

  where :math:`\alpha` = :code:`negative_slope`.

  Args:
    x : input array
    negative_slope : array or scalar specifying the negative slope (default: 0.01)

  Returns:
    An array.

  See also:
    :func:`relu`
  """
  numpy_util.check_arraylike("leaky_relu", x)
  x_arr = jnp.asarray(x)
  return jnp.where(x_arr >= 0, x_arr, negative_slope * x_arr)

@jax.jit
def hard_tanh(x: ArrayLike) -> Array:
  r"""Hard :math:`\mathrm{tanh}` activation function.

  Computes the element-wise function:

  .. math::
    \mathrm{hard\_tanh}(x) = \begin{cases}
      -1, & x < -1\\
      x, & -1 \le x \le 1\\
      1, & 1 < x
    \end{cases}

  Args:
    x : input array

  Returns:
    An array.
  """
  numpy_util.check_arraylike("hard_tanh", x)
  x_arr = jnp.asarray(x)
  return jnp.where(x_arr > 1, 1, jnp.where(x_arr < -1, -1, x_arr))

@jax.jit
def celu(x: ArrayLike, alpha: ArrayLike = 1.0) -> Array:
  r"""Continuously-differentiable exponential linear unit activation.

  Computes the element-wise function:

  .. math::
    \mathrm{celu}(x) = \begin{cases}
      x, & x > 0\\
      \alpha \left(\exp(\frac{x}{\alpha}) - 1\right), & x \le 0
    \end{cases}

  For more information, see
  `Continuously Differentiable Exponential Linear Units
  <https://arxiv.org/abs/1704.07483>`_.

  Args:
    x : input array
    alpha : array or scalar (default: 1.0)

  Returns:
    An array.
  """
  return jnp.maximum(x, 0.0) + alpha * jnp.expm1(jnp.minimum(x, 0.0) / alpha)

@jax.jit
def selu(x: ArrayLike) -> Array:
  r"""
  Scaled exponential linear unit activation.

  Computes the element-wise function:

  .. math::
    \mathrm{selu}(x) = \lambda \begin{cases}
      x, & x > 0\\
      \alpha e^x - \alpha, & x \le 0
    \end{cases}

  where :math:`\lambda = 1.0507009873554804934193349852946` and
  :math:`\alpha = 1.6732632423543772848170429916717`.

  For more information, see
  `Self-Normalizing Neural Networks
  <https://arxiv.org/abs/1706.02515>`_.

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`elu`
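
  Examples:
    For positive inputs the output is simply the input scaled by
    :math:`\lambda`, a direct consequence of the definition above:

    >>> x = jnp.array([1., 2.])
    >>> bool(jnp.allclose(jax.nn.selu(x),
    ...                   1.0507009873554804934193349852946 * x))
    True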
  """
  alpha = 1.6732632423543772848170429916717
  scale = 1.0507009873554804934193349852946
  return scale * elu(x, alpha)

@partial(jax.jit, static_argnames=("approximate",))
def gelu(x: ArrayLike, approximate: bool = True) -> Array:
  r"""Gaussian error linear unit activation function.

  If ``approximate=False``, computes the element-wise function:

  .. math::
    \mathrm{gelu}(x) = \frac{x}{2} \left(1 + \mathrm{erf} \left(
      \frac{x}{\sqrt{2}} \right) \right)

  If ``approximate=True``, uses the approximate formulation of GELU:

  .. math::
    \mathrm{gelu}(x) = \frac{x}{2} \left(1 + \mathrm{tanh} \left(
      \sqrt{\frac{2}{\pi}} \left(x + 0.044715 x^3 \right) \right) \right)

  For more information, see `Gaussian Error Linear Units (GELUs)
  <https://arxiv.org/abs/1606.08415>`_, section 2.

  Args:
    x : input array
    approximate: whether to use the approximate or exact formulation.
  """
  [x_arr] = numpy_util.promote_args_inexact("gelu", x)
  if approximate:
    sqrt_2_over_pi = np.sqrt(2 / np.pi).astype(x_arr.dtype)
    cdf = 0.5 * (1.0 + jnp.tanh(sqrt_2_over_pi * (x_arr + 0.044715 * (x_arr ** 3))))
    return x_arr * cdf
  else:
    sqrt_2 = np.sqrt(2).astype(x_arr.dtype)
    return jnp.array(x_arr * (lax.erf(x_arr / sqrt_2) + 1) / 2, dtype=x_arr.dtype)

@partial(jax.jit, static_argnames=("axis",))
def glu(x: ArrayLike, axis: int = -1) -> Array:
  r"""Gated linear unit activation function.

  Computes the function:

  .. math::
    \mathrm{glu}(x) =  x\left[\ldots, 0:\frac{n}{2}, \ldots\right] \cdot
      \mathrm{sigmoid} \left( x\left[\ldots, \frac{n}{2}:n, \ldots\right]
        \right)

  where the array is split into two along ``axis``. The size of the ``axis``
  dimension must be divisible by two.

  Args:
    x : input array
    axis: the axis along which the split should be computed (default: -1)

  Returns:
    An array.

  See also:
    :func:`sigmoid`
  """
  numpy_util.check_arraylike("glu", x)
  x_arr = jnp.asarray(x)
  size = x_arr.shape[axis]
  assert size % 2 == 0, "axis size must be divisible by 2"
  x1, x2 = jnp.split(x_arr, 2, axis)
  return x1 * sigmoid(x2)

logsumexp = _logsumexp

@partial(jax.jit, static_argnames=("axis",))
def log_softmax(x: ArrayLike,
                axis: int | tuple[int, ...] | None = -1,
                where: ArrayLike | None = None,
                initial: ArrayLike | None | Unspecified = _UNSPECIFIED) -> Array:
  r"""Log-Softmax function.

  Computes the logarithm of the :code:`softmax` function, which rescales
  elements to the range :math:`[-\infty, 0)`.

  .. math ::
    \mathrm{log\_softmax}(x)_i = \log \left( \frac{\exp(x_i)}{\sum_j \exp(x_j)}
    \right)

  Args:
    x : input array
    axis: the axis or axes along which the :code:`log_softmax` should be
      computed. Either an integer or a tuple of integers.
    where: Elements to include in the :code:`log_softmax`.

  Returns:
    An array.

  Note:
    If any input values are ``+inf``, the result will be all ``NaN``: this reflects the
    fact that ``inf / inf`` is not well-defined in the context of floating-point math.

  See also:
    :func:`softmax`
  """
  if initial is not _UNSPECIFIED:
    warnings.warn("The initial argument to log_softmax is deprecated, and no "
                  "longer has any effect.", DeprecationWarning, stacklevel=2)
  del initial
  numpy_util.check_arraylike("log_softmax", x)
  x_arr = jnp.asarray(x)
  x_max = jnp.max(x_arr, axis, where=where, initial=-jnp.inf, keepdims=True)
  x_safe = x_arr if where is None else jnp.where(where, x_arr, -jnp.inf)
  shifted = x_safe - lax.stop_gradient(x_max)
  shifted_logsumexp = jnp.log(
      jnp.sum(jnp.exp(shifted), axis, where=where, keepdims=True))
  result = shifted - shifted_logsumexp
  if where is not None:
    return jnp.where(where, result, -jnp.inf)
  return result

def softmax(x: ArrayLike,
            axis: int | tuple[int, ...] | None = -1,
            where: ArrayLike | None = None,
            initial: ArrayLike | None | Unspecified = _UNSPECIFIED) -> Array:
  r"""Softmax function.

  Computes the function which rescales elements to the range :math:`[0, 1]`
  such that the elements along :code:`axis` sum to :math:`1`.

  .. math ::
    \mathrm{softmax}(x) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}

  Args:
    x : input array
    axis: the axis or axes along which the softmax should be computed. The
      softmax output summed across these dimensions should sum to :math:`1`.
      Either an integer or a tuple of integers.
    where: Elements to include in the :code:`softmax`.

  Returns:
    An array.

  Note:
    If any input values are ``+inf``, the result will be all ``NaN``: this reflects the
    fact that ``inf / inf`` is not well-defined in the context of floating-point math.

  See also:
    :func:`log_softmax`
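
  Examples:
    An illustrative sketch of the basic behaviour: the outputs are positive
    and sum to 1 along ``axis``:

    >>> probs = jax.nn.softmax(jnp.array([[1., 2., 3.], [1., 1., 1.]]), axis=-1)
    >>> bool(jnp.allclose(probs.sum(axis=-1), 1.0))
    True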
  """
  if initial is not _UNSPECIFIED:
    warnings.warn("The initial argument to softmax is deprecated, and no "
                  "longer has any effect.", DeprecationWarning, stacklevel=2)
  del initial
  if config.softmax_custom_jvp.value:
    return _softmax(x, axis, where)
  else:
    return _softmax_deprecated(x, axis, where)

@partial(jax.custom_jvp, nondiff_argnums=(1,))
def _softmax(
    x: ArrayLike,
    axis: int | tuple[int, ...] | None = -1,
    where: ArrayLike | None = None,
    initial: ArrayLike | None = -jnp.inf) -> Array:
  x_max = jnp.max(x, axis, where=where, initial=initial, keepdims=True)
  x_safe = x if where is None else jnp.where(where, x, initial)
  unnormalized = jnp.exp(x_safe - x_max)
  result = unnormalized / jnp.sum(unnormalized, axis, where=where, keepdims=True)
  if where is not None:
    result = jnp.where(where, result, 0)
  return result

@_softmax.defjvp
def _softmax_jvp(axis, primals, tangents):
  (x, where, initial), (x_dot, _, _) = primals, tangents
  y = _softmax(x, axis, where, initial)
  return y, y * (x_dot - (y * x_dot).sum(axis, where=where, keepdims=True))

def _softmax_deprecated(
    x: ArrayLike,
    axis: int | tuple[int, ...] | None = -1,
    where: ArrayLike | None = None,
    initial: ArrayLike | None = -jnp.inf) -> Array:
  x_max = jnp.max(x, axis, where=where, initial=initial, keepdims=True)
  unnormalized = jnp.exp(x - lax.stop_gradient(x_max))
  result = unnormalized / jnp.sum(unnormalized, axis, where=where, keepdims=True)
  if where is not None:
    result = jnp.where(where, result, 0)
  return result

@partial(jax.jit, static_argnames=("axis",))
def standardize(x: ArrayLike,
                axis: int | tuple[int, ...] | None = -1,
                mean: ArrayLike | None = None,
                variance: ArrayLike | None = None,
                epsilon: ArrayLike = 1e-5,
                where: ArrayLike | None = None) -> Array:
  r"""Normalizes an array by subtracting ``mean`` and dividing by :math:`\sqrt{\mathrm{variance}}`."""
  numpy_util.check_arraylike("standardize", x)
  numpy_util.check_arraylike_or_none("standardize", mean, variance, where)
  if mean is None:
    mean = jnp.mean(x, axis, keepdims=True, where=where)
  if variance is None:
    # mean((x - mean(x))**2) is the textbook definition, but this form can be
    # cheaper and is adequate for typical activation statistics.
    variance = jnp.mean(
        jnp.square(x), axis, keepdims=True, where=where) - jnp.square(mean)
  return jnp.subtract(x, jnp.asarray(mean)) * lax.rsqrt(jnp.asarray(variance) + epsilon)

@partial(jax.jit, static_argnames=("num_classes", "dtype", "axis"))
def _one_hot(x: Any, num_classes: int, *,
             dtype: Any, axis: int | AxisName) -> Array:
  num_classes = core.concrete_dim_or_error(
      num_classes,
      "The error arose in jax.nn.one_hot argument `num_classes`.")
  dtype = dtypes.canonicalize_dtype(dtype)
  x_arr = jnp.asarray(x)
  try:
    output_pos_axis = util.canonicalize_axis(axis, x_arr.ndim + 1)
  except TypeError:
    axis_size = lax.psum(1, axis)
    if num_classes != axis_size:
      raise ValueError(f"Expected num_classes to match the size of axis {axis}, "
                       f"but {num_classes} != {axis_size}") from None
    axis_idx = lax.axis_index(axis)
    return jnp.asarray(x_arr == axis_idx, dtype=dtype)
  axis = operator.index(axis)
  lhs = jnp.expand_dims(x_arr, (axis,))
  rhs_shape = [1] * x_arr.ndim
  rhs_shape.insert(output_pos_axis, num_classes)
  rhs = lax.broadcasted_iota(x_arr.dtype, rhs_shape, output_pos_axis)
  return jnp.asarray(lhs == rhs, dtype=dtype)

def one_hot(x: Any, num_classes: int, *,
            dtype: Any = jnp.float_, axis: int | AxisName = -1) -> Array:
  """One-hot encodes the given indices.

  Each index in the input ``x`` is encoded as a vector of zeros of length
  ``num_classes`` with the element at ``index`` set to one::

    >>> jax.nn.one_hot(jnp.array([0, 1, 2]), 3)
    Array([[1., 0., 0.],
           [0., 1., 0.],
           [0., 0., 1.]], dtype=float32)

  Indices outside the range [0, num_classes) will be encoded as zeros::

    >>> jax.nn.one_hot(jnp.array([-1, 3]), 3)
    Array([[0., 0., 0.],
           [0., 0., 0.]], dtype=float32)

  Args:
    x: A tensor of indices.
    num_classes: Number of classes in the one-hot dimension.
    dtype: optional, a float dtype for the returned values (default :obj:`jnp.float_`).
    axis: the axis or axes along which the function should be
      computed.
  """
  num_classes = core.concrete_dim_or_error(
      num_classes,
      "The error arose in jax.nn.one_hot argument `num_classes`.")
  return _one_hot(x, num_classes, dtype=dtype, axis=axis)

@custom_jvp
@jax.jit
def relu6(x: ArrayLike) -> Array:
  r"""Rectified Linear Unit 6 activation function.

  Computes the element-wise function

  .. math::
    \mathrm{relu6}(x) = \min(\max(x, 0), 6)

  except under differentiation, we take:

  .. math::
    \nabla \mathrm{relu}(0) = 0

  and

  .. math::
    \nabla \mathrm{relu}(6) = 0

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`relu`
  """
  return jnp.minimum(jnp.maximum(x, 0), 6.)
relu6.defjvps(lambda g, ans, x:
              lax.select((x > 0) & (x < 6), g, lax.full_like(g, 0)))

@jax.jit
def hard_sigmoid(x: ArrayLike) -> Array:
  r"""Hard Sigmoid activation function.

  Computes the element-wise function

  .. math::
    \mathrm{hard\_sigmoid}(x) = \frac{\mathrm{relu6}(x + 3)}{6}

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`relu6`
  """
  return relu6(x + 3.) / 6.

@jax.jit
def hard_silu(x: ArrayLike) -> Array:
  r"""Hard SiLU (swish) activation function

  Computes the element-wise function

  .. math::
    \mathrm{hard\_silu}(x) = x \cdot \mathrm{hard\_sigmoid}(x)

  Both :func:`hard_silu` and :func:`hard_swish` are aliases for the same
  function.

  Args:
    x : input array

  Returns:
    An array.

  See also:
    :func:`hard_sigmoid`
  """
  numpy_util.check_arraylike("hard_silu", x)
  x_arr = jnp.asarray(x)
  return x_arr * hard_sigmoid(x_arr)

hard_swish = hard_silu

def _get_large_negative(dtype):
  dtype_max = jnp.finfo(dtype).max
  return jnp.asarray(-0.7 * dtype_max, dtype=dtype)

def _get_causal_mask(T, S, dtype):
  pred = jnp.tril(jnp.ones((T, S), dtype=jnp.bool_))
  mask = jnp.where(pred, jnp.asarray(0.0, dtype), _get_large_negative(dtype))
  return mask[jnp.newaxis, jnp.newaxis, :, :]

def _dot_product_attention_xla(
    query: Array,
    key: Array,
    value: Array,
    bias: Array | None,
    mask: Array | None,
    is_causal: bool,
    scale: float) -> Array:
  logits_dtype = jnp.promote_types(query.dtype, jnp.float32)
  logits = jnp.einsum('BTNH,BSNH->BNTS', query, key,
                      preferred_element_type=logits_dtype)
  logits *= jnp.array(scale, dtype=logits.dtype)

  if bias is not None:
    logits = (logits + bias).astype(logits.dtype)

  if mask is not None:
    assert mask.dtype == jnp.bool_
    large_negative_number = _get_large_negative(logits.dtype)
    padded_logits = jnp.where(mask, logits, large_negative_number)
  else:
    padded_logits = logits

  if is_causal:
    T, S = query.shape[1], key.shape[1]
    mask = _get_causal_mask(T, S, logits.dtype)
    padded_logits = padded_logits + mask

  # Softmax is computed in float32 for numerical stability.
  padded_logits = padded_logits.astype(jnp.float32)
  probs = jax.nn.softmax(padded_logits, axis=-1).astype(key.dtype)

  encoded = jnp.einsum('BNTS,BSNH->BTNH', probs, value)
  return encoded

def dot_product_attention(
    query: ArrayLike,
    key: ArrayLike,
    value: ArrayLike,
    bias: ArrayLike | None = None,
    mask: ArrayLike | None = None,
    *,
    scale: float | None = None,
    is_causal: bool = False,
    implementation: Literal['xla', 'cudnn'] | None = None) -> Array:
  r"""Scaled dot product attention function.

  Computes the attention function on Query, Key, and Value tensors:

  .. math::

    \mathrm{Attention}(Q, K, V)=\mathrm{softmax}(\frac{QK^T}{\sqrt{d_k}})V

  If we define :code:`logits` as the output of :math:`QK^T` and the
  :code:`probs` as the output of :math:`softmax`.

  Throughout this function, we utilize the following uppercase letters to
  represent the shape of array:

    B = batch size
    S = length of the key/value (source)
    T = length of the query (target)
    N = number of attention heads
    H = dimensions of each attention head
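
  For example, for self-attention over a batch of 2 sequences of length 128
  with 8 heads of size 64 (so ``B=2``, ``T=S=128``, ``N=8``, ``H=64``)::

    >>> q = k = v = jnp.zeros((2, 128, 8, 64))
    >>> jax.nn.dot_product_attention(q, k, v).shape
    (2, 128, 8, 64)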

  Args:
    query: query array; shape :code:`(BTNH)`
    key: key array; shape :code:`(BSNH)`
    value: value array; shape :code:`(BSNH)`
    bias: optional, bias array to be added to logits; shape broadcastable to
      :code:`(BNTS)`.
    mask: optional, mask array used to filter out logits. It is a boolean mask
      where `True` indicates the element should take part in attention. For an
      additive mask, users should pass it to `bias`. The shape is broadcastable
      to :code:`(BNTS)`.
    scale: scale for the logits. If None, the scale will be set to 1 divided by
      the square root of query's head dimension (i.e. H).
    is_causal: If true, causal attention will be applied. Note, some
      implementations like `xla` will generate a mask tensor and apply it to the
      logits to mask out the non-causal parts of the attention matrix, but other
      implementations like `cudnn` will avoid computing the non-causal regions,
      providing speedups.
    implementation: A string to control which implementation backend to use.
      Supported strings are `xla`, `cudnn` (cuDNN flash attention). It defaults
      to `None`, which will automatically select the best available backend.
      Note, `cudnn` supports only a subset of shapes/dtypes, and an exception
      will be thrown if its not supported.

  Returns:
    An array of the attention output with the same shape as :code:`query`.
  """
  def _check_has_shape(t: Array, shape: Sequence[int], name: str) -> None:
    if t.ndim != len(shape):
      raise ValueError(f"{name} ndim should be {len(shape)}, but got {t.ndim}")
    for i in range(t.ndim):
      if shape[i] != -1 and t.shape[i] != shape[i]:
        raise ValueError(f"{name} shape should be {shape}: but got {t.shape}")

  query = jnp.asarray(query)
  key = jnp.asarray(key)
  value = jnp.asarray(value)
  bias = bias if bias is None else jnp.asarray(bias)
  mask = mask if mask is None else jnp.asarray(mask)

  B, S, N, H = key.shape
  _check_has_shape(value, [B, S, N, H], 'value')
  _check_has_shape(query, [B, -1, N, H], 'query')
  scale_val = (1.0 / np.sqrt(H)) if scale is None else scale
  if not (query.dtype == key.dtype == value.dtype):
    raise ValueError(f"query/key/value should have the same dtype, but got "
                     f"{query.dtype} vs {key.dtype} vs {value.dtype}.")
  if mask is not None and mask.dtype != jnp.bool_:
    raise ValueError(f"Mask must be boolean dtype, but got {mask.dtype}.")

  match implementation:
    case 'xla':
      return _dot_product_attention_xla(
          query, key, value, bias, mask, is_causal=is_causal, scale=scale_val,
      )
    case 'cudnn':
      mask_type = MaskType.CAUSAL if is_causal else MaskType.NO_MASK
      return cudnn_dot_product_attention(
          query, key, value, bias, mask, scale=scale_val, mask_type=mask_type,
      )
    case None:
      # Default: fall back to the XLA implementation.
      return _dot_product_attention_xla(
          query, key, value, bias, mask, is_causal=is_causal, scale=scale_val,
      )
    case _:
      raise ValueError(f"Unsupported implementation option: {implementation}")