o
    ╘2·h<{ у                   @   s,  d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 dd	lm
Z dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ edГejdddДГГZ edgdНej			dАdd ДГГZ!edg dНej"ej			dАd!d"ДГГГZ#e#j e!_ ed#g dНej	dБd$d%ДГГZ$ed#gdНejedd&d'Г					dВd(d)ДГГГZ%ed*gdНejdБd+d,ДГГZ&ed-d.Гej'ejdГd0d1ДГГГZ(ed2ГejdДd4d5ДГГZ)ed6d7d8g d9вdНejedd:d;ГdЕd=d>ДГГГZ*ej+fd?d@ДZ,edAdBГejdБdCdDДГГZ-edEgdНej				dЖdFdGДГГZ.edEg dНej			dАdHdIДГГZ/edJgdНej				dЖdKdLДГГZ0edJg dНej			dАdMdNДГГZ1edOgdНej		dЖdPdQДГГZ2edOg dНejdЗdRdSДГГZ3edTГejdБdUdVДГГZ4edWgdНej				dЖdXdYДГГZ5edWg dНej			dЗdZd[ДГГZ6ed\gdНej		dАd]d^ДГГZ7ed\g dНejdd_d`ДГГZ8edaГej	dБdbdcДГГZ9eddgdНej			e	f	g		/dИdhdiДГГZ:edjgdНej											dЙdkdlДГГZ;edjg dНej	dБdmdnДГГZ<dodpД Z=	q		g		r		dКdsdtДZ>edug dНej	q			vdЛdwdxДГГZ?edugdНej	q			r	vdМdydvДГГZ@edzg dНej	q		g		{dНd|d}ДГГZAedzgdНej	q		g	r	{	dОd~d{ДГГZBdS )Пz,Implementation of Neural Net (NN) functions.щ    N)┌constant_op)┌dtypes)┌ops)┌	array_ops)┌array_ops_stack)┌candidate_sampling_ops)┌cond)┌ctc_ops)┌custom_gradient)┌embedding_ops)┌
gen_nn_ops)┌gen_sparse_ops)┌
linalg_ops)┌math_ops)┌nn_fused_batch_norm_grad)┌nn_ops)┌	variables)┌device_context)┌dispatch)┌deprecated_args)┌deprecated_argument_lookup)┌	tf_exportznn.log_poisson_lossFc              
   C   s@  t а|d|| gбПН}t j|ddН}t j| ddН} z| аб а|аб б W n ty9   td|аб Ы d| аб Ы dЭГВw tа|б||   }|rНtj	d| j
d	Н}tj	d
tj | j
d	Н}| tа| б |  |tа||  б  }tj| | j
d	Н}tj| | j
d	Н}	tа| |k| |	kб}
|tа|
||б7 }|W  d  Г S 1 sЩw   Y  dS )aИ  Computes log Poisson loss given `log_input`.

  Gives the log-likelihood loss between the prediction and the target under the
  assumption that the target has a Poisson distribution.
  Caveat: By default, this is not the exact loss, but the loss minus a
    constant term [log(z!)]. That has no effect for optimization, but
    does not play well with relative loss comparisons. To compute an
    approximation of the log factorial term, specify
    compute_full_loss=True to enable Stirling's Approximation.

  For brevity, let `c = log(x) = log_input`, `z = targets`.  The log Poisson
  loss is

        -log(exp(-x) * (x^z) / z!)
      = -log(exp(-x) * (x^z)) + log(z!)
      ~ -log(exp(-x)) - log(x^z) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
          [ Note the second term is the Stirling's Approximation for log(z!).
            It is invariant to x and does not affect optimization, though
            important for correct relative loss comparisons. It is only
            computed when compute_full_loss == True. ]
      = x - z * log(x) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
      = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)]

  Args:
    targets: A `Tensor` of the same type and shape as `log_input`.
    log_input: A `Tensor` of type `float32` or `float64`.
    compute_full_loss: whether to compute the full loss. If false, a constant
      term is dropped in favor of more efficient optimization.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `log_input` with the componentwise
    logistic losses.

  Raises:
    ValueError: If `log_input` and `targets` do not have the same shape.
  ┌log_poisson_loss┌	log_inputй┌name┌targetsz>`log_input` and `targets` must have the same shape, received (· vs ·).g      р?й┌dtypeщ   N)r   ┌
name_scope┌convert_to_tensor┌	get_shape┌assert_is_compatible_with┌
ValueErrorr   ┌expr   ┌constantr    ┌math┌pi┌logr   ┌
zeros_like┌	ones_like┌logical_and┌where)r   r   ┌compute_full_lossr   ┌result┌
point_five┌two_pi┌stirling_approx┌zeros┌onesr   й r7   ·X/var/www/html/chatgem/venv/lib/python3.10/site-packages/tensorflow/python/ops/nn_impl.pyr   +   s6   (     $щr   z$nn.sigmoid_cross_entropy_with_logits)┌v1c              
   C   s■   t аd| |б tа|d|| gбПe}tj|ddН}tj| ddН} z| аб а|аб б W n ty@   td|аб Ы d| аб Ы dЭГВw tj	||j
d	Н}||k}tа|||б}tа|| |б}tj|||   tаtа|бб|dНW  d
  Г S 1 sxw   Y  d
S )z)See sigmoid_cross_entropy_with_logits_v2.┌!sigmoid_cross_entropy_with_logits┌logistic_loss┌logitsr   ┌labels·:`logits` and `labels` must have the same shape, received (r   r   r   N)r   ┌_ensure_xent_argsr   r"   r#   r$   r%   r&   r   r,   r    r/   r   ┌add┌log1pr'   )r=   r<   r   r5   r   ┌relu_logits┌neg_abs_logitsr7   r7   r8   r:   m   s.    
■ 
¤$ъr:   c                 C   s   t || |dНS )aB  Computes sigmoid cross entropy given `logits`.

  Measures the probability error in tasks with two outcomes in which each
  outcome is independent and need not have a fully certain label. For instance,
  one could perform a regression where the probability of an event happening is
  known and used as a label. This loss may also be used for binary
  classification, where labels are either zero or one.

  For brevity, let `x = logits`, `z = labels`.  The logistic loss is

        z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
      = (1 - z) * x + log(1 + exp(-x))
      = x - x * z + log(1 + exp(-x))

  For x < 0, to avoid overflow in exp(-x), we reformulate the above

        x - x * z + log(1 + exp(-x))
      = log(exp(x)) - x * z + log(1 + exp(-x))
      = - x * z + log(1 + exp(x))

  Hence, to ensure stability and avoid overflow, the implementation uses this
  equivalent formulation

      max(x, 0) - x * z + log(1 + exp(-abs(x)))

  `logits` and `labels` must have the same type and shape.

  >>> logits = tf.constant([1., -1., 0., 1., -1., 0., 0.])
  >>> labels = tf.constant([0., 0., 0., 1., 1., 1., 0.5])
  >>> tf.nn.sigmoid_cross_entropy_with_logits(
  ...     labels=labels, logits=logits).numpy()
  array([1.3132617, 0.3132617, 0.6931472, 0.3132617, 1.3132617, 0.6931472,
         0.6931472], dtype=float32)

  Compared to the losses which handle multiple outcomes,
  `tf.nn.softmax_cross_entropy_with_logits` for general multi-class
  classification and `tf.nn.sparse_softmax_cross_entropy_with_logits` for more
  efficient multi-class classification with hard labels,
  `sigmoid_cross_entropy_with_logits` is a slight simplification for binary
  classification:

        sigmoid(x) = softmax([x, 0])[0]

  $$\frac{1}{1 + e^{-x}} = \frac{e^x}{e^x + e^0}$$

  While `sigmoid_cross_entropy_with_logits` works for soft binary labels
  (probabilities between 0 and 1), it can also be used for binary classification
  where the labels are hard. There is an equivalence between all three symbols
  in this case, with a probability 0 indicating the second class or 1 indicating
  the first class:

  >>> sigmoid_logits = tf.constant([1., -1., 0.])
  >>> softmax_logits = tf.stack([sigmoid_logits, tf.zeros_like(sigmoid_logits)],
  ...                           axis=-1)
  >>> soft_binary_labels = tf.constant([1., 1., 0.])
  >>> soft_multiclass_labels = tf.stack(
  ...     [soft_binary_labels, 1. - soft_binary_labels], axis=-1)
  >>> hard_labels = tf.constant([0, 0, 1])
  >>> tf.nn.sparse_softmax_cross_entropy_with_logits(
  ...     labels=hard_labels, logits=softmax_logits).numpy()
  array([0.31326166, 1.3132616 , 0.6931472 ], dtype=float32)
  >>> tf.nn.softmax_cross_entropy_with_logits(
  ...     labels=soft_multiclass_labels, logits=softmax_logits).numpy()
  array([0.31326166, 1.3132616, 0.6931472], dtype=float32)
  >>> tf.nn.sigmoid_cross_entropy_with_logits(
  ...     labels=soft_binary_labels, logits=sigmoid_logits).numpy()
  array([0.31326166, 1.3132616, 0.6931472], dtype=float32)

  Args:
    labels: A `Tensor` of the same type and shape as `logits`. Between 0 and 1,
      inclusive.
    logits: A `Tensor` of type `float32` or `float64`. Any real number.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    logistic losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  )r<   r=   r   )r:   йr=   r<   r   r7   r7   r8   ┌$sigmoid_cross_entropy_with_logits_v2Ц   s   [ rE   z%nn.weighted_cross_entropy_with_logitsc                 C   sт   t а|d|| gбП^}t j|ddН}t j| ddН} z| аб а|аб б W n ty9   td|аб Ы d| аб Ы dЭГВw d|d |   }tjd|  | |tаtа	tа
|б ббtа| б  |dНW  d	  Г S 1 sjw   Y  d	S )
aO
  Computes a weighted cross entropy.

  This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`,
  allows one to trade off recall and precision by up- or down-weighting the
  cost of a positive error relative to a negative error.

  The usual cross-entropy cost is defined as:

      labels * -log(sigmoid(logits)) +
          (1 - labels) * -log(1 - sigmoid(logits))

  A value `pos_weight > 1` decreases the false negative count, hence increasing
  the recall.
  Conversely setting `pos_weight < 1` decreases the false positive count and
  increases the precision.
  This can be seen from the fact that `pos_weight` is introduced as a
  multiplicative coefficient for the positive labels term
  in the loss expression:

      labels * -log(sigmoid(logits)) * pos_weight +
          (1 - labels) * -log(1 - sigmoid(logits))

  For brevity, let `x = logits`, `z = labels`, `q = pos_weight`.
  The loss is:

        qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
      = (1 - z) * x + (qz +  1 - z) * log(1 + exp(-x))
      = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))

  Setting `l = (1 + (q - 1) * z)`, to ensure stability and avoid overflow,
  the implementation uses

      (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))

  `logits` and `labels` must have the same type and shape.

  >>> labels = tf.constant([1., 0.5, 0.])
  >>> logits = tf.constant([1.5, -0.1, -10.])
  >>> tf.nn.weighted_cross_entropy_with_logits(
  ...     labels=labels, logits=logits, pos_weight=tf.constant(1.5)).numpy()
  array([3.0211994e-01, 8.8049585e-01, 4.5776367e-05], dtype=float32)
  >>> tf.nn.weighted_cross_entropy_with_logits(
  ...     labels=labels, logits=logits, pos_weight=tf.constant(0.5)).numpy()
  array([1.00706644e-01, 5.08297503e-01, 4.57763672e-05], dtype=float32)

  Args:
    labels: A `Tensor` of the same type and shape as `logits`, with values
      between 0 and 1 inclusive.
    logits: A `Tensor` of type `float32` or `float64`, any real numbers.
    pos_weight: A coefficient to use on the positive examples, typically a
      scalar but otherwise broadcastable to the shape of `logits`. Its value
      should be non-negative.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    weighted logistic losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  r;   r<   r   r=   r>   r   r   щ   N)r   r"   r#   r$   r%   r&   r   r@   rA   r'   ┌absr   ┌relu)r=   r<   ┌
pos_weightr   ┌
log_weightr7   r7   r8   ┌%weighted_cross_entropy_with_logits_v2∙   s*   D 
■ 

 №$яrK   z)targets is deprecated, use labels insteadr   c                 C   s   t d| d|Г} t| |||ГS )aс  Computes a weighted cross entropy.

  This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`,
  allows one to trade off recall and precision by up- or down-weighting the
  cost of a positive error relative to a negative error.

  The usual cross-entropy cost is defined as:

      labels * -log(sigmoid(logits)) +
          (1 - labels) * -log(1 - sigmoid(logits))

  A value `pos_weight > 1` decreases the false negative count, hence increasing
  the recall.
  Conversely setting `pos_weight < 1` decreases the false positive count and
  increases the precision.
  This can be seen from the fact that `pos_weight` is introduced as a
  multiplicative coefficient for the positive labels term
  in the loss expression:

      labels * -log(sigmoid(logits)) * pos_weight +
          (1 - labels) * -log(1 - sigmoid(logits))

  For brevity, let `x = logits`, `z = labels`, `q = pos_weight`.
  The loss is:

        qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
      = (1 - z) * x + (qz +  1 - z) * log(1 + exp(-x))
      = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))

  Setting `l = (1 + (q - 1) * z)`, to ensure stability and avoid overflow,
  the implementation uses

      (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))

  `logits` and `labels` must have the same type and shape.

  Args:
    labels: A `Tensor` of the same type and shape as `logits`.
    logits: A `Tensor` of type `float32` or `float64`.
    pos_weight: A coefficient to use on the positive examples.
    name: A name for the operation (optional).
    targets: Deprecated alias for labels.

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    weighted logistic losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  r=   r   )r   rK   )r=   r<   rI   r   r   r7   r7   r8   ┌"weighted_cross_entropy_with_logitsU  s   =rL   znn.relu_layerc                 C   sД   t а|d| ||gбП.}t j| ddН} t j|ddН}t j|ddН}tаtа| |б|б}tj||dНW  d  Г S 1 s;w   Y  dS )a╡  Computes Relu(x * weight + biases).

  Args:
    x: a 2D tensor.  Dimensions typically: batch, in_units
    weights: a 2D tensor.  Dimensions typically: in_units, out_units
    biases: a 1D tensor.  Dimensions: out_units
    name: A name for the operation (optional).  If not specified
      "nn_relu_layer" is used.

  Returns:
    A 2-D Tensor computing relu(matmul(x, weights) + biases).
    Dimensions typically: batch, out_units.
  ┌
relu_layer┌xr   ┌weights┌biasesN)r   r"   r#   r   ┌bias_addr   ┌matmulrH   )rN   rO   rP   r   ┌	xw_plus_br7   r7   r8   rM   Ц  s   $√rM   znn.siluznn.swishч      Ё?c                 C   sB   t j| ddН} t j|ddН}tа|| jб}tjddД Г}|| |ГS )a  Computes the SiLU or Swish activation function: `x * sigmoid(beta * x)`.

  beta : Hyperparameter for Swish activation function. Default value 1.0.

  The SiLU activation function was introduced in "Gaussian Error Linear Units
  (GELUs)" [Hendrycks et al. 2016](https://arxiv.org/abs/1606.08415) and
  "Sigmoid-Weighted Linear Units for Neural Network Function Approximation in
  Reinforcement Learning"
  [Elfwing et al. 2017](https://arxiv.org/abs/1702.03118) and was independently
  discovered (and called swish) in "Searching for Activation Functions"
  [Ramachandran et al. 2017](https://arxiv.org/abs/1710.05941)

  Args:
    features: A `Tensor` representing preactivation values.
    beta: A 'Tensor' representing value of beta hyperparameter.

  Returns:
    The activation value.
  ┌featuresr   ┌betac                    s$   З ЗfddД}Иt аИ И б |fS )Nc                    s~   t а| gбП tаИ И б}W d  Г n1 sw   Y  |dИ И d|    }tа| tаИб | d|  б}| | |fS )z+Gradient for the Swish activation function.NrT   )r   ┌control_dependenciesr   ┌sigmoid┌
reduce_sum┌square)┌dy┌sigmoid_features┌activation_grad┌	beta_gradйrV   rU   r7   r8   ┌grad╬  s    
    z'swish.<locals>.swish_impl.<locals>.grad)r   rX   )rU   rV   r`   r7   r_   r8   ┌
swish_impl╦  s   zswish.<locals>.swish_impl)r   r#   r   ┌castr    r
   )rU   rV   ra   r7   r7   r8   ┌swishо  s   

rc   zlinalg.normalize┌	euclideanc                 C   sn   t а|d| gбП%}t а| б} tj| ||ddН}tа|| jб}| | }||fW  d  Г S 1 s0w   Y  dS )aю  Normalizes `tensor` along dimension `axis` using specified norm.

  This uses `tf.linalg.norm` to compute the norm along `axis`.

  This function can compute several different vector norms (the 1-norm, the
  Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and
  matrix norms (Frobenius, 1-norm, 2-norm and inf-norm).

  Args:
    tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
    ord: Order of the norm. Supported values are `'fro'`, `'euclidean'`, `1`,
      `2`, `np.inf` and any positive real number yielding the corresponding
      p-norm. Default is `'euclidean'` which is equivalent to Frobenius norm if
      `tensor` is a matrix and equivalent to 2-norm for vectors.
      Some restrictions apply: a) The Frobenius norm `'fro'` is not defined for
        vectors, b) If axis is a 2-tuple (matrix norm), only `'euclidean'`,
        '`fro'`, `1`, `2`, `np.inf` are supported. See the description of `axis`
        on how to compute norms for a batch of vectors or matrices stored in a
        tensor.
    axis: If `axis` is `None` (the default), the input is considered a vector
      and a single vector norm is computed over the entire set of values in the
      tensor, i.e. `norm(tensor, ord=ord)` is equivalent to
      `norm(reshape(tensor, [-1]), ord=ord)`. If `axis` is a Python integer, the
      input is considered a batch of vectors, and `axis` determines the axis in
      `tensor` over which to compute vector norms. If `axis` is a 2-tuple of
      Python integers it is considered a batch of matrices and `axis` determines
      the axes in `tensor` over which to compute a matrix norm.
      Negative indices are supported. Example: If you are passing a tensor that
        can be either a matrix or a batch of matrices at runtime, pass
        `axis=[-2,-1]` instead of `axis=None` to make sure that matrix norms are
        computed.
    name: The name of the op.

  Returns:
    normalized: A normalized `Tensor` with the same shape as `tensor`.
    norm: The computed norms with the same shape and dtype `tensor` but the
      final axis is 1 instead. Same as running
      `tf.cast(tf.linalg.norm(tensor, ord, axis keepdims=True), tensor.dtype)`.

  Raises:
    ValueError: If `ord` or `axis` is invalid.
  ┌	normalizeTй┌keepdimsN)r   r"   r#   r   ┌normr   rb   r    )┌tensor┌ord┌axisr   rh   ┌
normalizedr7   r7   r8   re   ч  s   -
$√re   ·math.l2_normalize·linalg.l2_normalize·nn.l2_normalize)rm   rn   ro   z#dim is deprecated, use axis instead┌dimчъ-БЩЧq=c                 C   s  t d|d|Г}tа|d| gбПv}tj| ddН} | jjrbtаtа| бб}tаtа	| бб}tаtj
|| |ddНб}tаtа||бб}tаtа| б|б}	tаtа	| б|б}
tj|	|
|dНW  d  Г S tj
tа| б|ddН}tаtа||бб}tj| ||dНW  d  Г S 1 sИw   Y  dS )	a■  Normalizes along dimension `axis` using an L2 norm.

  For a 1-D tensor with `axis = 0`, computes

      output = x / sqrt(max(sum(x**2), epsilon))

  For `x` with more dimensions, independently normalizes each 1-D slice along
  dimension `axis`.

  1-D tensor example:
  >>> x = tf.constant([3.0, 4.0])
  >>> tf.math.l2_normalize(x).numpy()
  array([0.6, 0.8], dtype=float32)

  2-D tensor example:
  >>> x = tf.constant([[3.0], [4.0]])
  >>> tf.math.l2_normalize(x, 0).numpy()
  array([[0.6],
       [0.8]], dtype=float32)

  >>> x = tf.constant([[3.0], [4.0]])
  >>> tf.math.l2_normalize(x, 1).numpy()
  array([[1.],
       [1.]], dtype=float32)

  Args:
    x: A `Tensor`.
    axis: Dimension along which to normalize.  A scalar or a vector of
      integers.
    epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
      divisor if `norm < sqrt(epsilon)`.
    name: A name for this operation (optional).
    dim: Deprecated, do not use.

  Returns:
    A `Tensor` with the same shape as `x`.
  rk   rp   ┌l2_normalizerN   r   Trf   N)r   r   r"   r#   r    ┌
is_complexr   rZ   ┌real┌imagrY   ┌rsqrt┌maximum┌multiply┌complex)rN   rk   ┌epsilonr   rp   ┌square_real┌square_imag┌
square_sum┌
x_inv_norm┌	norm_real┌	norm_imagr7   r7   r8   rr     s$   * Ў$єrr   c                 C   sh   t jd| gdНП" tjg | jdН}tjtjtа| |б|dНddН}|W  d  Г S 1 s-w   Y  dS )zЁSame as math_ops.count_nonzero.

  The reduction is done in dtype, which can be faster for 32-bit dtypes.

  Args:
      input_tensor: numeric tensor
      dtype: reduction dtype

  Returns:
      number of nonzero values with type dtype
  ┌count_nonzero)┌valuesr   ┌nonzero_countr   N)	r   r"   r   r5   r    r   rY   rb   ┌	not_equal)┌input_tensorr    ┌zerorГ   r7   r7   r8   ┌_count_nonzeroW  s   
■¤$·rЗ   zmath.zero_fractionznn.zero_fractionc              	      sр   t а|dИ gбП^ t jИ ddНЙ tjИ tjdН}tj|tj	j
kЗ fddДЗ fddДdН}t аd	бП  || }tj|tjd
Н}tj|tjd
Н}|| }W d  Г n1 sTw   Y  tа|dбW  d  Г S 1 siw   Y  dS )a─  Returns the fraction of zeros in `value`.

  If `value` is empty, the result is `nan`.

  This is useful in summaries to measure and report sparsity.  For example,

  ```python
      z = tf.nn.relu(...)
      summ = tf.compat.v1.summary.scalar('sparsity', tf.nn.zero_fraction(z))
  ```

  Args:
    value: A tensor of numeric type.
    name: A name for the operation (optional).

  Returns:
    The fraction of zeros in `value`, with type `float32`.
  ┌zero_fraction┌valuer   )┌out_typec                      s   t jtИ tjdНtjdНS йNr   )r   rb   rЗ   r   ┌int32┌int64r7   йrЙ   r7   r8   ┌<lambda>И  s    ■zzero_fraction.<locals>.<lambda>c                      s   t И tjdНS rЛ   )rЗ   r   rН   r7   rО   r7   r8   rП   Л  s    )┌true_fn┌false_fn┌counts_to_fractionr   N┌fraction)r   r"   r#   r   ┌sizer   rН   ┌tf_condr   rМ   ┌maxr   rb   ┌float32┌identity)rЙ   r   rФ   ┌num_nonzero┌num_zero┌num_zero_float32┌size_float32┌zero_fraction_float32r7   rО   r8   rИ   l  s    


·
№
$юrИ   znn.depthwise_conv2dc           	   
      s  t d|d|Г}tаИd| ИgбПgЙtj| ddН} tjИddНЙ|du r'ddg}tаб durYИ d	kr<dd|d
 |d g}n
d|d
 |d dg}tj| ИИ|И |ИdНW  d  Г S З ЗЗЗfddД}tj| t	а
Иб||И |dНW  d  Г S 1 szw   Y  dS )ah  Depthwise 2-D convolution.

  Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
  and a filter tensor of shape
  `[filter_height, filter_width, in_channels, channel_multiplier]`
  containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d`
  applies a different filter to each input channel (expanding from 1 channel
  to `channel_multiplier` channels for each), then concatenates the results
  together.  The output has `in_channels * channel_multiplier` channels.

  In detail, with the default NHWC format,

      output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
           filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
                                           strides[2] * j + rate[1] * dj, k]

  Must have `strides[0] = strides[3] = 1`.  For the most common case of the
  same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `rate` is greater than 1, we perform atrous depthwise
  convolution, in which case all values in the `strides` tensor must be equal
  to 1.

  Usage Example:

  >>> x = np.array([
  ...     [1., 2.],
  ...     [3., 4.],
  ...     [5., 6.]
  ... ], dtype=np.float32).reshape((1, 3, 2, 1))
  >>> kernel = np.array([
  ...     [1., 2.],
  ...     [3., 4]
  ... ], dtype=np.float32).reshape((2, 1, 1, 2))
  >>> tf.compat.v1.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                                  padding='VALID').numpy()
    array([[[[10., 14.],
             [14., 20.]],
            [[18., 26.],
             [22., 32.]]]], dtype=float32)

  >>> tf.compat.v1.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                                  padding=[[0, 0], [1, 0], [1, 0], [0, 0]]
  ...                                 ).numpy()
    array([[[[ 0.,  0.],
             [ 3.,  4.],
             [ 6.,  8.]],
            [[ 0.,  0.],
             [10., 14.],
             [14., 20.]],
            [[ 0.,  0.],
             [18., 26.],
             [22., 32.]]]], dtype=float32)

  Args:
    input: 4-D with shape according to `data_format`.
    filter: 4-D with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
    strides: 1-D of size 4.  The stride of the sliding window for each
      dimension of `input`.
    padding: Controls how to pad the image before applying the convolution. Can
      be the string `"SAME"` or `"VALID"` indicating the type of padding
      algorithm to use, or a list indicating the explicit paddings at the start
      and end of each dimension. When explicit padding is used and data_format
      is `"NHWC"`, this should be in the form `[[0, 0], [pad_top, pad_bottom],
      [pad_left, pad_right], [0, 0]]`. When explicit padding used and
      data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
      [pad_top, pad_bottom], [pad_left, pad_right]]`.
    rate: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it is
      greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: Alias of rate.

  Returns:
    A 4-D `Tensor` with shape according to `data_format`.  E.g., for
    "NHWC" format, shape is
    `[batch, out_height, out_width, in_channels * channel_multiplier].`
  ┌	dilations┌rate┌	depthwise┌	tensor_inr   ┌	filter_inNrF   ┌NCHWr   й┌input┌filter┌strides┌padding┌data_formatrЮ   r   c                    s   t j| ИИ|И ИdНS )Nйrе   rж   rз   rи   rй   r   йr   ┌depthwise_conv2d_nativeй┌input_converted┌_rи   йrй   rж   r   rз   r7   r8   ┌op  є   ·zdepthwise_conv2d.<locals>.opйrе   ┌filter_shape┌dilation_raterи   rй   r▒   )r   r   r"   r#   r   ┌enclosing_tpu_contextr   rм   ┌with_space_to_batchr   ┌shape)	rе   rж   rз   rи   rЯ   r   rй   rЮ   r▒   r7   r░   r8   ┌depthwise_conv2dЧ  s<   Y∙Ї	·$тr╣   c              	   C   s   t | ||||||dНS )aМ  Depthwise 2-D convolution.

  Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
  and a filter tensor of shape
  `[filter_height, filter_width, in_channels, channel_multiplier]`
  containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d`
  applies a different filter to each input channel (expanding from 1 channel
  to `channel_multiplier` channels for each), then concatenates the results
  together.  The output has `in_channels * channel_multiplier` channels.

  In detail, with the default NHWC format,

      output[b, i, j, k * channel_multiplier + q] =
          sum_{di, dj} filter[di, dj, k, q] *
                       input[b, strides[1] * i + dilations[0] * di,
                                strides[2] * j + dilations[1] * dj, k]

  Must have `strides[0] = strides[3] = 1`.  For the most common case of the
  same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `dilations` is greater than 1, we perform atrous depthwise
  convolution, in which case all values in the `strides` tensor must be equal
  to 1.

  Usage Example:

  >>> x = np.array([
  ...     [1., 2.],
  ...     [3., 4.],
  ...     [5., 6.]
  ... ], dtype=np.float32).reshape((1, 3, 2, 1))
  >>> kernel = np.array([
  ...     [1., 2.],
  ...     [3., 4]
  ... ], dtype=np.float32).reshape((2, 1, 1, 2))
  >>> tf.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                        padding='VALID').numpy()
    array([[[[10., 14.],
             [14., 20.]],
            [[18., 26.],
             [22., 32.]]]], dtype=float32)

  >>> tf.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
  ...                        padding=[[0, 0], [1, 0], [1, 0], [0, 0]]).numpy()
    array([[[[ 0.,  0.],
             [ 3.,  4.],
             [ 6.,  8.]],
            [[ 0.,  0.],
             [10., 14.],
             [14., 20.]],
            [[ 0.,  0.],
             [18., 26.],
             [22., 32.]]]], dtype=float32)

  Args:
    input: 4-D with shape according to `data_format`.
    filter: 4-D with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
    strides: 1-D of size 4.  The stride of the sliding window for each
      dimension of `input`.
    padding: Controls how to pad the image before applying the convolution. Can
      be the string `"SAME"` or `"VALID"` indicating the type of padding
      algorithm to use, or a list indicating the explicit paddings at the start
      and end of each dimension. See
      [here](https://www.tensorflow.org/api_docs/python/tf/nn#notes_on_padding_2)
      for more information. When explicit padding is used and data_format
      is `"NHWC"`, this should be in the form `[[0, 0], [pad_top, pad_bottom],
      [pad_left, pad_right], [0, 0]]`. When explicit padding used and
      data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
      [pad_top, pad_bottom], [pad_left, pad_right]]`.
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it is
      greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).

  Returns:
    A 4-D `Tensor` with shape according to `data_format`.  E.g., for
    "NHWC" format, shape is
    `[batch, out_height, out_width, in_channels * channel_multiplier].`
  )rе   rж   rз   rи   rЯ   r   rй   )r╣   rд   r7   r7   r8   ┌depthwise_conv2d_v2  s   Y·r║   znn.separable_conv2dc	              	      sЇ   t d|d|Г}tа|d| И|gбП_}tj| ddН} tjИddНЙtj|ddН}|аб аdб}	|	jd	 аd
б |	jd
 аd
б |du rFd
d
g}З ЗЗfddД}
tj	| t
аИб||И |
dН}tj||g dвdИ |dНW  d  Г S 1 ssw   Y  dS )a░
  2-D convolution with separable filters.

  Performs a depthwise convolution that acts separately on channels followed by
  a pointwise convolution that mixes channels.  Note that this is separability
  between dimensions `[1, 2]` and `3`, not spatial separability between
  dimensions `1` and `2`.

  In detail, with the default NHWC format,

      output[b, i, j, k] = sum_{di, dj, q, r}
          input[b, strides[1] * i + di, strides[2] * j + dj, q] *
          depthwise_filter[di, dj, q, r] *
          pointwise_filter[0, 0, q * channel_multiplier + r, k]

  `strides` controls the strides for the depthwise convolution only, since
  the pointwise convolution has implicit strides of `[1, 1, 1, 1]`.  Must have
  `strides[0] = strides[3] = 1`.  For the most common case of the same
  horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `rate` is greater than 1, we perform atrous depthwise
  convolution, in which case all values in the `strides` tensor must be equal
  to 1.

  Args:
    input: 4-D `Tensor` with shape according to `data_format`.
    depthwise_filter: 4-D `Tensor` with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
      Contains `in_channels` convolutional filters of depth 1.
    pointwise_filter: 4-D `Tensor` with shape
      `[1, 1, channel_multiplier * in_channels, out_channels]`.  Pointwise
      filter to mix channels after `depthwise_filter` has convolved spatially.
    strides: 1-D of size 4.  The strides for the depthwise convolution for
      each dimension of `input`.
    padding: Controls how to pad the image before applying the depthwise
      convolution. Can be the string `"SAME"` or `"VALID"` indicating the type
      of padding algorithm to use, or a Python list indicating the explicit
      paddings at the start and end of each dimension. When explicit padding is
      used and data_format is `"NHWC"`, this should be in the form `[[0, 0],
      [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit
      padding used and data_format is `"NCHW"`, this should be in the form
      `[[0, 0], [0, 0], [pad_top, pad_bottom], [pad_left, pad_right]]`.
    rate: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it is
      greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: Alias of rate.

  Returns:
    A 4-D `Tensor` with shape according to 'data_format'. For
      example, with data_format="NHWC", shape is [batch, out_height,
      out_width, out_channels].
  rЮ   rЯ   ┌separable_conv2drб   r   ┌depthwise_filter┌pointwise_filterщ   r   rF   Nc                    s   t j| ИИ|И ddНS )Nrа   rк   rл   rн   йrй   r╝   rз   r7   r8   r▒   ╨  r▓   zseparable_conv2d.<locals>.opr│   )rF   rF   rF   rF   ┌VALID)rи   rй   r   )r   r   r"   r#   r$   ┌	with_rank┌dimsr%   r   r╖   r   r╕   ┌conv2d)rе   r╝   r╜   rз   rи   rЯ   r   rй   rЮ   ┌pointwise_filter_shaper▒   rа   r7   r┐   r8   r╗   }  sD   ?   	·√$▄r╗   c              
   C   s   t | |||||||dНS )aЧ
  2-D convolution with separable filters.

  Performs a depthwise convolution that acts separately on channels followed by
  a pointwise convolution that mixes channels.  Note that this is separability
  between dimensions `[1, 2]` and `3`, not spatial separability between
  dimensions `1` and `2`.

  In detail, with the default NHWC format,

      output[b, i, j, k] = sum_{di, dj, q, r}
          input[b, strides[1] * i + di, strides[2] * j + dj, q] *
          depthwise_filter[di, dj, q, r] *
          pointwise_filter[0, 0, q * channel_multiplier + r, k]

  `strides` controls the strides for the depthwise convolution only, since
  the pointwise convolution has implicit strides of `[1, 1, 1, 1]`.  Must have
  `strides[0] = strides[3] = 1`.  For the most common case of the same
  horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
  If any value in `rate` is greater than 1, we perform atrous depthwise
  convolution, in which case all values in the `strides` tensor must be equal
  to 1.

  Args:
    input: 4-D `Tensor` with shape according to `data_format`.
    depthwise_filter: 4-D `Tensor` with shape `[filter_height, filter_width,
      in_channels, channel_multiplier]`. Contains `in_channels` convolutional
      filters of depth 1.
    pointwise_filter: 4-D `Tensor` with shape `[1, 1, channel_multiplier *
      in_channels, out_channels]`.  Pointwise filter to mix channels after
      `depthwise_filter` has convolved spatially.
    strides: 1-D of size 4.  The strides for the depthwise convolution for each
      dimension of `input`.
    padding: Controls how to pad the image before applying the depthwise
      convolution. Can be the string `"SAME"` or `"VALID"` indicating the type
      of padding algorithm to use, or a Python list indicating the explicit
      paddings at the start and end of each dimension. When explicit padding is
      used and data_format is `"NHWC"`, this should be in the form `[[0, 0],
      [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit
      padding used and data_format is `"NCHW"`, this should be in the form
      `[[0, 0], [0, 0], [pad_top, pad_bottom], [pad_left, pad_right]]`.
    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
    dilations: 1-D of size 2. The dilation rate in which we sample input values
      across the `height` and `width` dimensions in atrous convolution. If it is
      greater than 1, then all values of strides must be 1.
    name: A name for this operation (optional).

  Returns:
    A 4-D `Tensor` with shape according to 'data_format'. For
      example, with data_format="NHWC", shape is [batch, out_height,
      out_width, out_channels].
  )rЯ   r   rй   )r╗   )rе   r╝   r╜   rз   rи   rй   rЮ   r   r7   r7   r8   ┌separable_conv2d_v2щ  s   ?°r┼   znn.sufficient_statisticsc                    sr  t t|ГГ}td|d|Г}|du rd}tа|d| |gбПП tj| ddН} | аб ЙИjdurPtЗfdd	Д|D ГГrPd
}|D ]
}|Иj	| j
9 }q<tj|| jdН}n#tа| бЙ З fddД|D Г}tаtаtа| б| jб|б}	tj|	ddН}|durЛtj|ddН}tа| |б}
tа| |б}n| }
tа| б}tj|
||ddН}
tj|||ddН}W d  Г n1 sоw   Y  ||
||fS )a8  Calculate the sufficient statistics for the mean and variance of `x`.

  These sufficient statistics are computed using the one pass algorithm on
  an input that's optionally shifted. See:
  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

  For example:
  >>> t = [[1, 2, 3], [4, 5, 6]]
  >>> sufficient_statistics(t, [1])
  (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([ 6, 15], dtype=int32)>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([14, 77], dtype=int32)>, None)
  >>> sufficient_statistics(t, [-1])
  (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([ 6, 15], dtype=int32)>, <tf.Tensor: shape=(2,),
  dtype=int32, numpy=array([14, 77], dtype=int32)>, None)

  Args:
    x: A `Tensor`.
    axes: Array of ints. Axes along which to compute mean and variance. As in
      Python, the axes can also be negative numbers. A negative axis is
      interpreted as counting from the end of the rank, i.e., axis +
      rank(values)-th dimension.
    shift: A `Tensor` containing the value by which to shift the data for
      numerical stability, or `None` if no shift is to be performed. A shift
      close to the true mean provides the most numerically stable results.
    keep_dims: produce statistics with the same dimensionality as the input.
    name: Name used to scope the operations that compute the sufficient stats.
    keepdims: Alias for keep_dims.

  Returns:
    Four `Tensor` objects of the same type as `x`:

    * the count (number of elements to average over).
    * the (possibly shifted) sum of the elements in the array.
    * the (possibly shifted) sum of squares of the elements in the array.
    * the shift by which the mean must be corrected or None if `shift` is None.
  rg   ┌	keep_dimsNF┌sufficient_statisticsrN   r   c                 3   s    Б | ]}И j | jd uV  qd S йN)r┬   rЙ   )┌.0┌d)┌x_shaper7   r8   ┌	<genexpr>g  s   А 
 z(sufficient_statistics.<locals>.<genexpr>rF   r   c                    s    g | ]}|d k r|И  n|СqS )r   r7   )r╔   rk   )┌rankr7   r8   ┌
<listcomp>p  s     z)sufficient_statistics.<locals>.<listcomp>┌count┌shift┌mean_ssйrg   r   ┌var_ss)┌list┌setr   r   r"   r#   r$   r═   ┌allr┬   rЙ   r   r(   r    r   ┌gatherr   rb   r╕   ┌reduce_prod┌subtract┌squared_differencerZ   rY   )rN   ┌axesr╨   r╞   r   rg   ┌countsr╩   ┌positive_axes┌x_dims┌m_ss┌v_ssr7   )r═   r╦   r8   r╟   5  s@   *  
 
шr╟   c                 C   s   t | ||||dНS )aJ  Calculate the sufficient statistics for the mean and variance of `x`.

  These sufficient statistics are computed using the one pass algorithm on
  an input that's optionally shifted. See:
  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

  Args:
    x: A `Tensor`.
    axes: Array of ints. Axes along which to compute mean and variance.
    shift: A `Tensor` containing the value by which to shift the data for
      numerical stability, or `None` if no shift is to be performed. A shift
      close to the true mean provides the most numerically stable results.
    keepdims: produce statistics with the same dimensionality as the input.
    name: Name used to scope the operations that compute the sufficient stats.

  Returns:
    Four `Tensor` objects of the same type as `x`:

    * the count (number of elements to average over).
    * the (possibly shifted) sum of the elements in the array.
    * the (possibly shifted) sum of squares of the elements in the array.
    * the shift by which the mean must be corrected or None if `shift` is None.
  )rN   r█   r╨   r╞   r   )r╟   йrN   r█   r╨   rg   r   r7   r7   r8   ┌sufficient_statistics_v2А  s   
 rт   znn.normalize_momentsc           	      C   sо   t а|d| |||gбП@ tj| ddН}|dur(tj||ddН}tj||ddН}n
tj||ddН}|}tjtа||бtа|бddН}W d  Г ||fS 1 sNw   Y  ||fS )a╧  Calculate the mean and variance of based on the sufficient statistics.

  Args:
    counts: A `Tensor` containing the total count of the data (one value).
    mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly
      shifted) sum of the elements to average over.
    variance_ss: A `Tensor` containing the variance sufficient statistics: the
      (possibly shifted) squared sum of the data to compute the variance over.
    shift: A `Tensor` containing the value by which the data is shifted for
      numerical stability, or `None` if no shift was performed.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  re   ┌divisorr   N┌shifted_mean┌mean┌variance)r   r"   r   ┌
reciprocalrx   r@   r┘   rZ   )	r▄   r╤   ┌variance_ssr╨   r   rу   rф   rх   rц   r7   r7   r8   ┌normalize_momentsЮ  s    
¤
°Їrщ   z
nn.momentsc           	      C   s·   t d|d|Г}|du rd}tа|d| |gбП] | jtjkr$tа| tjбn| }tj	||dddН}tj	tа
|tа|бб|dd	dН}|sMtа||б}tа||б}| jtjkrhtа|tjбtа|tjбfW  d  Г S ||fW  d  Г S 1 svw   Y  dS )
a▌  Calculate the mean and variance of `x`.

  The mean and variance are calculated by aggregating the contents of `x`
  across `axes`.  If `x` is 1-D and `axes = [0]` this is just the mean
  and variance of a vector.

  Note: shift is currently not used; the true mean is computed and used.

  When using these moments for batch normalization (see
  `tf.nn.batch_normalization`):

   * for so-called "global normalization", used with convolutional filters with
     shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
   * for simple batch normalization pass `axes=[0]` (batch only).

  Args:
    x: A `Tensor`.
    axes: Array of ints.  Axes along which to compute mean and
      variance.
    shift: Not used in the current implementation
    name: Name used to scope the operations that compute the moments.
    keep_dims: produce moments with the same dimensionality as the input.
    keepdims: Alias to keep_dims.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  rg   r╞   NF┌momentsTrх   r╥   rц   )r   r   r"   r    r   ┌float16r   rb   rЧ   ┌reduce_meanr┌   r   ┌stop_gradient┌squeeze)	rN   r█   r╨   r   r╞   rg   ┌yrх   rц   r7   r7   r8   rъ   ┐  s0   $ № ь$щrъ   c                 C   є   t | ||||dНS )a╝  Calculates the mean and variance of `x`.

  The mean and variance are calculated by aggregating the contents of `x`
  across `axes`.  If `x` is 1-D and `axes = [0]` this is just the mean
  and variance of a vector.

  Note: shift is currently not used; the true mean is computed and used.

  When using these moments for batch normalization (see
  `tf.nn.batch_normalization`):

   * for so-called "global normalization", used with convolutional filters with
     shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
   * for simple batch normalization pass `axes=[0]` (batch only).

  Args:
    x: A `Tensor`.
    axes: Array of ints.  Axes along which to compute mean and
      variance.
    shift: Not used in the current implementation.
    keepdims: produce moments with the same dimensionality as the input.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  )rN   r█   r╨   r   r╞   )rъ   rс   r7   r7   r8   ┌
moments_v2  s   "rё   znn.weighted_momentsc                 C   sZ  t d|d|Г}|du rd}tа|d| ||gбПМ tj| ddН} tj|ddН}| jtjk}|r5tа| tj	б} |j| jkrBtа|| jб}tj
||  |d	d
dН}|tа| б }tj
||dd
dН}	tа||	б}
tj
|tа| |
б |dd
dН}tа||	б}|sИtj|
|dН}
tj||dН}|rШtа|
tjб}
tа|tjб}|
|fW  d  Г S 1 sжw   Y  dS )aї  Returns the frequency-weighted mean and variance of `x`.

  Args:
    x: A tensor.
    axes: 1-d tensor of int32 values; these are the axes along which
      to compute mean and variance.
    frequency_weights: A tensor of positive weights which can be
      broadcast with x.
    name: Name used to scope the operation.
    keep_dims: Produce moments with the same dimensionality as the input.
    keepdims: Alias of keep_dims.

  Returns:
    Two tensors: `weighted_mean` and `weighted_variance`.
  rg   r╞   NF┌weighted_momentsrN   r   ┌frequency_weights┌weighted_input_sumT)r   rg   ┌sum_of_weights┌weighted_distsq)rk   )r   r   r"   r#   r    r   rы   r   rb   rЧ   rY   r   r,   ┌
div_no_nanr┌   rю   )rN   r█   rє   r   r╞   rg   ┌
needs_castrЇ   ┌broadcasted_weightsrї   ┌weighted_meanrЎ   ┌weighted_variancer7   r7   r8   rЄ   &  sP      	 № $═rЄ   c                 C   rЁ   )a╥  Returns the frequency-weighted mean and variance of `x`.

  Args:
    x: A tensor.
    axes: 1-d tensor of int32 values; these are the axes along which
      to compute mean and variance.
    frequency_weights: A tensor of positive weights which can be
      broadcast with x.
    keepdims: Produce moments with the same dimensionality as the input.
    name: Name used to scope the operation.

  Returns:
    Two tensors: `weighted_mean` and `weighted_variance`.
  )rN   r█   rє   r   r╞   )rЄ   )rN   r█   rє   rg   r   r7   r7   r8   ┌weighted_moments_v2s  s   √r№   znn.batch_normalizationc              	   C   sФ   t а|d| ||||gбП4 tа|| б}|dur||9 }| tа|| jб tа|dur0|||  n| | | jб W  d  Г S 1 sCw   Y  dS )a	  Batch normalization.

  Normalizes a tensor by `mean` and `variance`, and applies (optionally) a
  `scale` \\(\gamma\\) to it, as well as an `offset` \\(\beta\\):

  \\(\frac{\gamma(x-\mu)}{\sigma}+\beta\\)

  `mean`, `variance`, `offset` and `scale` are all expected to be of one of two
  shapes:

    * In all generality, they can have the same number of dimensions as the
      input `x`, with identical sizes as `x` for the dimensions that are not
      normalized over (the 'depth' dimension(s)), and dimension 1 for the
      others which are being normalized over.
      `mean` and `variance` in this case would typically be the outputs of
      `tf.nn.moments(..., keepdims=True)` during training, or running averages
      thereof during inference.
    * In the common case where the 'depth' dimension is the last dimension in
      the input tensor `x`, they may be one dimensional tensors of the same
      size as the 'depth' dimension.
      This is the case for example for the common `[batch, depth]` layout of
      fully-connected layers, and `[batch, height, width, depth]` for
      convolutions.
      `mean` and `variance` in this case would typically be the outputs of
      `tf.nn.moments(..., keepdims=False)` during training, or running averages
      thereof during inference.

  See equation 11 in Algorithm 2 of source:
  [Batch Normalization: Accelerating Deep Network Training by
  Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy]
  (http://arxiv.org/abs/1502.03167).

  Args:
    x: Input `Tensor` of arbitrary dimensionality.
    mean: A mean `Tensor`.
    variance: A variance `Tensor`.
    offset: An offset `Tensor`, often denoted \\(\beta\\) in equations, or
      None. If present, will be added to the normalized tensor.
    scale: A scale `Tensor`, often denoted \\(\gamma\\) in equations, or
      `None`. If present, the scale is applied to the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    name: A name for this operation (optional).

  Returns:
    the normalized, scaled, offset tensor.

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing
    Internal Covariate Shift:
      [Ioffe et al., 2015](http://arxiv.org/abs/1502.03167)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  ┌	batchnormN)r   r"   r   rv   rb   r    )rN   rх   rц   ┌offset┌scale┌variance_epsilonr   ┌invr7   r7   r8   ┌batch_normalizationМ  s   =  $·r  znn.fused_batch_normч№йё╥MbP?┌NHWCTc
                 C   s▓   |r|	dkr|du s|du rt d|Ыd|ЫЭГВtj| ddН} tj|ddН}tj|ddН}|du r6tаg б}|du r?tаg б}tj| ||||||	|||d	Н
\}
}}}}}|
||fS )
aп  Batch normalization.


  See Source: [Batch Normalization: Accelerating Deep Network Training by
  Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy]
  (http://arxiv.org/abs/1502.03167).

  Args:
    x: Input `Tensor` of 4 or 5 dimensions.
    scale: A `Tensor` of 1 dimension for scaling.
    offset: A `Tensor` of 1 dimension for bias.
    mean: A `Tensor` of 1 dimension for population mean. The shape and meaning
          of this argument depends on the value of is_training and
          exponential_avg_factor as follows:
          is_training==False (inference):
            Mean must be a `Tensor` of the same shape as scale containing the
            estimated population mean computed during training.
          is_training==True and exponential_avg_factor == 1.0:
            Mean must be None.
          is_training==True and exponential_avg_factor != 1.0:
            Mean must be a `Tensor` of the same shape as scale containing the
            exponential running mean.
    variance: A `Tensor` of 1 dimension for population variance. The shape and
          meaning of this argument depends on the value of is_training and
          exponential_avg_factor as follows:
          is_training==False (inference):
            Variance must be a `Tensor` of the same shape as scale containing
            the estimated population variance computed during training.
          is_training==True and exponential_avg_factor == 1.0:
            Variance must be None.
          is_training==True and exponential_avg_factor != 1.0:
            Variance must be a `Tensor` of the same shape as scale containing
            the exponential running variance.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for x. Support "NHWC" (default) or "NCHW" for
                 4D tenors and "NDHWC" or "NCDHW" for 5D tensors.
    is_training: A bool value to specify if the operation is used for
                 training or inference.
    name: A name for this operation (optional).
    exponential_avg_factor: A float number (usually between 0 and 1) used
                            for controlling the decay of the running
                            population average of mean and variance.
                            If set to 1.0, the current batch average is
                            returned.

  Returns:
    y: A 4D or 5D Tensor for the normalized, scaled, offsetted x.
    running_mean: A 1D Tensor for the exponential running mean of x.
                  The output value is (1 - exponential_avg_factor) * mean +
                  exponential_avg_factor * batch_mean), where batch_mean
                  is the mean of the current batch in x.
    running_var: A 1D Tensor for the exponential running variance
                 The output value is (1 - exponential_avg_factor) * variance +
                 exponential_avg_factor * batch_variance), where batch_variance
                 is the variance of the current batch in x.

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing
    Internal Covariate Shift:
      [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  rT   NzАBoth `mean` and `variance` must be a 1D tensor when `is_training` is False or `exponential_avg_factor` != 1.0. Received: `mean` z and `variance` rе   r   r    r■   )rz   ┌exponential_avg_factorrй   ┌is_trainingr   )r&   r   r#   r   r(   r   ┌fused_batch_norm_v3)rN   r    r■   rх   rц   rz   rй   r  r   r  rя   ┌running_mean┌running_varrп   r7   r7   r8   ┌fused_batch_norm╙  s6   K■¤

Ў
r
  z'nn.batch_norm_with_global_normalizationc                 C   sL   t d|d| Г} t d|	d|Г}t d|
d|Г}t| ||||r!|||ГS d||ГS )aG  Batch normalization.

  This op is deprecated. See `tf.nn.batch_normalization`.

  Args:
    t: A 4D input Tensor.
    m: A 1D mean Tensor with size matching the last dimension of t.
      This is the first output from tf.nn.moments,
      or a saved moving average thereof.
    v: A 1D variance Tensor with size matching the last dimension of t.
      This is the second output from tf.nn.moments,
      or a saved moving average thereof.
    beta: A 1D beta Tensor with size matching the last dimension of t.
      An offset to be added to the normalized tensor.
    gamma: A 1D gamma Tensor with size matching the last dimension of t.
      If "scale_after_normalization" is true, this tensor will be multiplied
      with the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    scale_after_normalization: A bool indicating whether the resulted tensor
      needs to be multiplied with gamma.
    name: A name for this operation (optional).
    input: Alias for t.
    mean: Alias for m.
    variance: Alias for v.

  Returns:
     A batch-normalized `t`.

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing
    Internal Covariate Shift:
      [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  rе   ┌trх   ┌mrц   ┌vN)r   r  )r  r  r  rV   ┌gammar   ┌scale_after_normalizationr   rе   rх   rц   r7   r7   r8   ┌$batch_norm_with_global_normalization:  s   /  r  c              
   C   s   t | |||||||dНS )a  Batch normalization.

  This op is deprecated. See `tf.nn.batch_normalization`.

  Args:
    input: A 4D input Tensor.
    mean: A 1D mean Tensor with size matching the last dimension of t.
      This is the first output from tf.nn.moments,
      or a saved moving average thereof.
    variance: A 1D variance Tensor with size matching the last dimension of t.
      This is the second output from tf.nn.moments,
      or a saved moving average thereof.
    beta: A 1D beta Tensor with size matching the last dimension of t.
      An offset to be added to the normalized tensor.
    gamma: A 1D gamma Tensor with size matching the last dimension of t.
      If "scale_after_normalization" is true, this tensor will be multiplied
      with the normalized tensor.
    variance_epsilon: A small float number to avoid dividing by 0.
    scale_after_normalization: A bool indicating whether the resulted tensor
      needs to be multiplied with gamma.
    name: A name for this operation (optional).

  Returns:
     A batch-normalized `t`.

  References:
    Batch Normalization - Accelerating Deep Network Training by Reducing Internal Covariate Shift:
      [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
      ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
  )r  r  r  rV   r  r   r  r   )r  )rе   rх   rц   rV   r  r   r  r   r7   r7   r8   ┌'batch_norm_with_global_normalization_v2q  s   (∙r  c                 C   s@   t а| бd }tа|dgб}t а|| jб}t аtа| |бdgбS )z5Returns a vector summing up each row of the matrix x.rF   щ    )	r   r╕   r   ┌stackr6   r    ┌reshaper   rR   )rN   ┌cols┌
ones_shaper6   r7   r7   r8   ┌	_sum_rowsе  s   r  rF   ┌modc           (   
   C   sX  t | tjГr
t| Г} t | tГs| g} tа|d| |||g бРПВ |jtjkr-t	а
|tjб}tа|dgб}|du rCtj|||d||dН}ddД |D Г\}}}t	а
|tjб}tа||gdб}tj| ||
d	Н}|j|jkrqt	а
||jб}tа|ddgtаtа|бd dgбб}tа|tаtа|бd dgбddgб}t	j||dd
Н}tj|||
d	Н}|j|jkr┤t	а
||jб}tа|dgtа|бб}tа|tа|бdgб}tа|бddЕ }tаd|g|gdб}t	аtа|dбtа||бб}tа|tаdg|gdбб}tаt|Гd|gб}tа|d|gб}||7 }||7 }|	Рrmtj|||dН}|\}} }!tа|ddgб}"tаt	а
| tjбddgб}#tа|"|#gddб}$tаtа|бddЕ tа|dбgdб}%|j|!jkРrat	а
|!|jб}!|tj|$|%|!dddН7 }|Рr~|t	а|б8 }|t	а|б8 }tа||gdб}&tаtа|б| tа |бgdб}'|&|'fW  d  Г S 1 Рsеw   Y  dS )a(
  Helper function for nce_loss and sampled_softmax_loss functions.

  Computes sampled output training logits and labels suitable for implementing
  e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see
  sampled_softmax_loss).

  Note: In the case where num_true > 1, we assign to each target class
  the target probability 1 / num_true so that the target probabilities
  sum to 1 per-example.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        `[num_classes, dim]`.  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The (possibly-partitioned)
        class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    subtract_log_q: A `bool`.  whether to subtract the log expected count of
        the labels in the sample to get the logits of the true labels.
        Default is True.  Turn off for Negative Sampling.
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        False.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).
    seed: random seed for candidate sampling. Default to None, which doesn't set
        the op-level random seed for candidate sampling.
  Returns:
    out_logits: `Tensor` object with shape
        `[batch_size, num_true + num_sampled]`, for passing to either
        `nn.sigmoid_cross_entropy_with_logits` (NCE) or
        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
    out_labels: A Tensor object with the same shape as `out_logits`.
  ┌compute_sampled_logitsr  NT)┌true_classes┌num_true┌num_sampled┌unique┌	range_max┌seedc                 s   s   Б | ]}t а|бV  qd S r╚   )r   rэ   )r╔   ┌sr7   r7   r8   r╠     s   А 

 z*_compute_sampled_logits.<locals>.<genexpr>r   )┌partition_strategy)┌transpose_brF   r!   )r  ┌sparse_indicesg        F)┌default_value┌validate_indices)!┌
isinstancer   ┌PartitionedVariabler╘   r   r"   r    r   rН   r   rb   r   r  r   ┌log_uniform_candidate_sampler┌concatr   ┌embedding_lookup┌slicer   r  r╕   rR   rx   ┌expand_dimsr  ┌compute_accidental_hitsrМ   r   ┌sparse_to_denser+   r-   r,   )(rO   rP   r=   ┌inputsr  ┌num_classesr  ┌sampled_values┌subtract_log_q┌remove_accidental_hitsr!  r   r  ┌labels_flat┌sampled┌true_expected_count┌sampled_expected_count┌all_ids┌all_w┌true_w┌	sampled_w┌sampled_logits┌all_b┌true_b┌	sampled_brp   ┌new_true_w_shape┌row_wise_dots┌dots_as_matrix┌true_logits┌acc_hits┌acc_indices┌acc_ids┌acc_weights┌acc_indices_2d┌acc_ids_2d_int32r#  ┌sampled_logits_shape┌
out_logits┌
out_labelsr7   r7   r8   ┌_compute_sampled_logits▓  s─   <
 ·	    ■ 

■  
  
 ■√■¤&КrM  znn.nce_loss┌nce_lossc
           
      C   s   t | ||||||||d|	dНS )aИ  Computes and returns the noise-contrastive estimation training loss.

  See [Noise-contrastive estimation: A new estimation principle for
  unnormalized statistical
  models](https://arxiv.org/abs/1806.03664).
  Also see our [Candidate Sampling Algorithms
  Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf)

  A common use case is to use this method for training, and calculate the full
  sigmoid loss for evaluation or inference as in the following example:

  ```python
  if mode == "train":
    loss = tf.nn.nce_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...)
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
    loss = tf.reduce_sum(loss, axis=1)
  ```

  Note: when doing embedding lookup on `weights` and `bias`, "div" partition
  strategy will be used. Support for other partition strategy will be added
  later.

  Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
  so your labels must be sorted in order of decreasing frequency to achieve
  good results.  For more details, see
  `tf.random.log_uniform_candidate_sampler`.

  Note: In the case where `num_true` > 1, we assign to each target class
  the target probability 1 / `num_true` so that the target probabilities
  sum to 1 per-example.

  Note: It would be useful to allow a variable number of target classes per
  example.  We hope to provide this functionality in a future release.
  For now, if you have a variable number of target classes, you can pad them
  out to a constant number by either repeating them or by padding
  with an otherwise unused class.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
      objects whose concatenation along dimension 0 has shape [num_classes,
      dim].  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
      target classes.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
      the input network.
    num_sampled: An `int`.  The number of negative classes to randomly sample
      per batch. This single sample of negative classes is evaluated for each
      element in the batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
      (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
      where a sampled class equals one of the target classes.  If set to `True`,
      this is a "Sampled Logistic" loss instead of NCE, and we are learning to
      generate log-odds instead of log probabilities.  See our [Candidate
      Sampling Algorithms Reference]
        (https://www.tensorflow.org/extras/candidate_sampling.pdf). Default is
          False.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example NCE losses.
  ┌div)r  r1  r3  r!  r   )rN  )
rO   rP   r=   r/  r  r0  r  r1  r3  r   r7   r7   r8   ┌nce_loss_v2l  s   [їrP  c                 C   s:   t | |||||||d||	|
dН\}}t||ddН}t|ГS )an  Computes and returns the noise-contrastive estimation training loss.

  A common use case is to use this method for training, and calculate the full
  sigmoid loss for evaluation or inference. In this case, you must set
  `partition_strategy="div"` for the two losses to be consistent, as in the
  following example:

  ```python
  if mode == "train":
    loss = tf.nn.nce_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...,
        partition_strategy="div")
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
    loss = tf.reduce_sum(loss, axis=1)
  ```

  Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
  so your labels must be sorted in order of decreasing frequency to achieve
  good results.  For more details, see
  `tf.random.log_uniform_candidate_sampler`.

  Note: In the case where `num_true` > 1, we assign to each target class
  the target probability 1 / `num_true` so that the target probabilities
  sum to 1 per-example.

  Note: It would be useful to allow a variable number of target classes per
  example.  We hope to provide this functionality in a future release.
  For now, if you have a variable number of target classes, you can pad them
  out to a constant number by either repeating them or by padding
  with an otherwise unused class.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        [num_classes, dim].  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of negative classes to randomly sample
        per batch. This single sample of negative classes is evaluated for each
        element in the batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  If set to
        `True`, this is a "Sampled Logistic" loss instead of NCE, and we are
        learning to generate log-odds instead of log probabilities. See
        our Candidate Sampling Algorithms Reference
        ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)).
        Default is False.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example NCE losses.

  References:
    Noise-contrastive estimation - A new estimation principle for unnormalized
    statistical models:
      [Gutmann et al., 2010](http://proceedings.mlr.press/v9/gutmann10a)
      ([pdf](http://proceedings.mlr.press/v9/gutmann10a/gutmann10a.pdf))
  T)rO   rP   r=   r/  r  r0  r  r1  r2  r3  r!  r   ┌sampled_lossesrD   )rM  r:   r  )rO   rP   r=   r/  r  r0  r  r1  r3  r!  r   r<   rQ  r7   r7   r8   rN  ╒  s$   \
Ї znn.sampled_softmax_loss┌sampled_softmax_lossc                 C   s    t | ||||||||d|
|	dНS )aк
  Computes and returns the sampled softmax training loss.

  This is a faster way to train a softmax classifier over a huge number of
  classes.

  This operation is for training only.  It is generally an underestimate of
  the full softmax loss.

  A common use case is to use this method for training, and calculate the full
  softmax loss for evaluation or inference as in the following example:

  ```python
  if mode == "train":
    loss = tf.nn.sampled_softmax_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...)
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
  ```

  See our [Candidate Sampling Algorithms Reference]
  (https://www.tensorflow.org/extras/candidate_sampling.pdf)

  Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
  ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.

  Note: when doing embedding lookup on `weights` and `bias`, "div" partition
  strategy will be used. Support for other partition strategy will be added
  later.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
      objects whose concatenation along dimension 0 has shape [num_classes,
      dim].  The (possibly-sharded) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
      target classes.  Note that this format differs from the `labels` argument
      of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
      the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
      (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
      where a sampled class equals one of the target classes.  Default is True.
    seed: random seed for candidate sampling. Default to None, which doesn't set
      the op-level random seed for candidate sampling.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example sampled softmax losses.

  rO  )r  r1  r3  r!  r   r  )rR  )rO   rP   r=   r/  r  r0  r  r1  r3  r  r   r7   r7   r8   ┌sampled_softmax_loss_v2E  s   MЇrS  c                 C   sF   t | |||||||d||	|
|dН\}}tj|ddН}tj||dН}|S )a2  Computes and returns the sampled softmax training loss.

  This is a faster way to train a softmax classifier over a huge number of
  classes.

  This operation is for training only.  It is generally an underestimate of
  the full softmax loss.

  A common use case is to use this method for training, and calculate the full
  softmax loss for evaluation or inference. In this case, you must set
  `partition_strategy="div"` for the two losses to be consistent, as in the
  following example:

  ```python
  if mode == "train":
    loss = tf.nn.sampled_softmax_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...,
        partition_strategy="div")
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
  ```

  See our Candidate Sampling Algorithms Reference
  ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)).
  Also see Section 3 of (Jean et al., 2014) for the math.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        [num_classes, dim].  The (possibly-sharded) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        True.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).
    seed: random seed for candidate sampling. Default to None, which doesn't set
        the op-level random seed for candidate sampling.

  Returns:
    A `batch_size` 1-D tensor of per-example sampled softmax losses.

  References:
    On Using Very Large Target Vocabulary for Neural Machine Translation:
      [Jean et al., 2014]
      (https://aclanthology.coli.uni-saarland.de/papers/P15-1001/p15-1001)
      ([pdf](http://aclweb.org/anthology/P15-1001))
  T)rO   rP   r=   r/  r  r0  r  r1  r2  r3  r!  r   r  ┌labels_stop_gradientr   )r=   r<   )rM  r   rэ   r   ┌$softmax_cross_entropy_with_logits_v2)rO   rP   r=   r/  r  r0  r  r1  r3  r!  r   r  r<   rQ  r7   r7   r8   rR  б  s(   T
є )FN)NNNr╚   )NNNNN)rT   )rd   NN)Nrq   NN)NNNN)NFN)NNr  r  TNrT   )NNNNNNNNNNN)rF   NTFr  NN)rF   NFrN  )rF   NFr  rN  )rF   NTNrR  )rF   NTr  rR  N)C┌__doc__r)   ┌tensorflow.python.frameworkr   r   r   ┌tensorflow.python.opsr   r   r   r   rХ   r	   r
   r   r   r   r   r   r   r   r   ┌tensorflow.python.platformr   ┌tensorflow.python.utilr   ┌"tensorflow.python.util.deprecationr   r   ┌ tensorflow.python.util.tf_exportr   ┌add_dispatch_supportr   r:   ┌register_binary_elementwise_apirE   rK   rL   rM   ┌register_unary_elementwise_apirc   re   rr   rН   rЗ   rИ   r╣   r║   r╗   r┼   r╟   rт   rщ   rъ   rё   rЄ   r№   r  r
  r  r  r  rM  rP  rN  rS  rR  r7   r7   r7   r8   ┌<module>   sЮ  
@¤
'¤] 
 
Z
№
>63 
7
)∙
·
c°
j°
J 
I
·
@√
# 
K·
EЎ
eЎ
5∙2
Ї 
;ў
gЎ
nЎ
Zї