from keras.src import backend
from keras.src import ops
from keras.src.api_export import keras_export
from keras.src.backend import KerasTensor
from keras.src.layers.layer import Layer


@keras_export("keras.layers.Attention")
class Attention(Layer):
    """Dot-product attention layer, a.k.a. Luong-style attention.

    Inputs are a list with 2 or 3 elements:
    1. A `query` tensor of shape `(batch_size, Tq, dim)`.
    2. A `value` tensor of shape `(batch_size, Tv, dim)`.
    3. An optional `key` tensor of shape `(batch_size, Tv, dim)`. If none is
        supplied, `value` will be used as the `key`.

    The calculation follows the steps:
    1. Calculate attention scores using `query` and `key` with shape
        `(batch_size, Tq, Tv)`.
    2. Use scores to calculate a softmax distribution with shape
        `(batch_size, Tq, Tv)`.
    3. Use the softmax distribution to create a linear combination of `value`
        with shape `(batch_size, Tq, dim)`.
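
    For illustration, the same three steps can be reproduced directly with
    `keras.ops` (a minimal sketch, assuming the default `"dot"` score mode
    with no scaling, masking, or dropout; shapes are made up for the example):

    ```python
    import numpy as np
    from keras import ops

    query = np.random.random((2, 4, 8)).astype("float32")  # (batch, Tq, dim)
    value = np.random.random((2, 6, 8)).astype("float32")  # (batch, Tv, dim)
    key = value  # no separate key supplied

    scores = ops.matmul(query, ops.transpose(key, axes=[0, 2, 1]))  # (2, 4, 6)
    distribution = ops.softmax(scores, axis=-1)                     # (2, 4, 6)
    outputs = ops.matmul(distribution, value)                       # (2, 4, 8)
    ```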

    Args:
        use_scale: If `True`, will create a scalar variable to scale the
            attention scores.
        dropout: Float between 0 and 1. Fraction of the units to drop for the
            attention scores. Defaults to `0.0`.
        seed: A Python integer to use as random seed in case of `dropout`.
        score_mode: Function to use to compute attention scores, one of
            `{"dot", "concat"}`. `"dot"` refers to the dot product between the
            query and key vectors. `"concat"` refers to the hyperbolic tangent
            of the concatenation of the `query` and `key` vectors.

    Call arguments:
        inputs: List of the following tensors:
            - `query`: Query tensor of shape `(batch_size, Tq, dim)`.
            - `value`: Value tensor of shape `(batch_size, Tv, dim)`.
            - `key`: Optional key tensor of shape `(batch_size, Tv, dim)`. If
                not given, will use `value` for both `key` and `value`, which is
                the most common case.
        mask: List of the following tensors:
            - `query_mask`: A boolean mask tensor of shape `(batch_size, Tq)`.
                If given, the output will be zero at the positions where
                `mask==False`.
            - `value_mask`: A boolean mask tensor of shape `(batch_size, Tv)`.
                If given, will apply the mask such that values at positions
                where `mask==False` do not contribute to the result.
        return_attention_scores: bool, if `True`, returns the attention scores
            (after masking and softmax) as an additional output argument.
        training: Python boolean indicating whether the layer should behave in
            training mode (adding dropout) or in inference mode (no dropout).
        use_causal_mask: Boolean. Set to `True` for decoder self-attention. Adds
            a mask such that position `i` cannot attend to positions `j > i`.
            This prevents the flow of information from the future towards the
            past. Defaults to `False`.
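
    For example, the call arguments can be combined as below (a sketch with
    made-up shapes; `use_causal_mask` is typically used for self-attention,
    i.e. the same tensor passed as both `query` and `value`):

    ```python
    import numpy as np
    from keras import layers

    tokens = np.random.random((2, 5, 16)).astype("float32")
    pad_mask = np.array([[True] * 5, [True, True, True, False, False]])

    attention = layers.Attention(dropout=0.1)
    outputs, scores = attention(
        [tokens, tokens],           # query and value (self-attention)
        mask=[pad_mask, pad_mask],  # query_mask and value_mask
        use_causal_mask=True,       # position i only attends to j <= i
        return_attention_scores=True,
    )
    # outputs.shape == (2, 5, 16), scores.shape == (2, 5, 5)
    ```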

    Output:
        Attention outputs of shape `(batch_size, Tq, dim)`.
        (Optional) Attention scores after masking and softmax with shape
            `(batch_size, Tq, Tv)`.
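
    Example (a minimal usage sketch; shapes are illustrative):

    ```python
    import numpy as np
    from keras import layers

    query = np.random.random((4, 8, 16)).astype("float32")   # (batch, Tq, dim)
    value = np.random.random((4, 10, 16)).astype("float32")  # (batch, Tv, dim)

    attention = layers.Attention(use_scale=True)
    outputs = attention([query, value])  # shape (4, 8, 16)
    ```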
    Fdot        Nc                    s`   t  jdi | || _|| _|| _| jdkrtjj|d| _| jdvr+t	d| d| _
d S )Nr   seed)r   concatz_Invalid value for argument score_mode. Expected one of {'dot', 'concat'}. Received: score_mode=F )super__init__	use_scale
score_modedropoutr   randomSeedGeneratorseed_generator
ValueError_return_attention_scores)selfr   r   r   r   kwargs	__class__r   _/var/www/html/chatgem/venv/lib/python3.10/site-packages/keras/src/layers/attention/attention.pyr   B   s   


zAttention.__init__c                 C   s^   |  | d | _d | _| jr| jddd| jdd| _| jdkr-| jddd| jdd| _d S d S )Nscaler   onesT)nameshapeinitializerdtype	trainabler   concat_score_weight)_validate_inputsr   r$   r   
add_weightr"   r   )r   input_shaper   r   r   buildZ   s(   

zAttention.buildc                 C   s   | j dkrt|tj|g dd}| jdur|| j9 }|S | j dkr[tj|dd}tj|dd}| jdurJ| jtjt| j||  d	d }|S | jtjt|| d	d }|S t	d
)a  Calculates attention scores as a query-key dot product.

        Args:
            query: Query tensor of shape `(batch_size, Tq, dim)`.
            key: Key tensor of shape `(batch_size, Tv, dim)`.

        Returns:
            Tensor of shape `(batch_size, Tq, Tv)`.
        """
        if self.score_mode == "dot":
            scores = ops.matmul(query, ops.transpose(key, axes=[0, 2, 1]))
            if self.scale is not None:
                scores *= self.scale
        elif self.score_mode == "concat":
            # Reshape into `(batch_size, Tq, 1, dim)` and
            # `(batch_size, 1, Tv, dim)` so the addition broadcasts.
            q_reshaped = ops.expand_dims(query, axis=-2)
            k_reshaped = ops.expand_dims(key, axis=-3)
            if self.scale is not None:
                scores = self.concat_score_weight * ops.sum(
                    ops.tanh(self.scale * (q_reshaped + k_reshaped)), axis=-1
                )
            else:
                scores = self.concat_score_weight * ops.sum(
                    ops.tanh(q_reshaped + k_reshaped), axis=-1
                )
        else:
            raise ValueError("scores not computed")
        return scores

    def _apply_scores(self, scores, value, scores_mask=None, training=False):
        """Applies attention scores to the given value tensor.

        To use this method in your attention layer, follow the steps:

        * Use `query` tensor of shape `(batch_size, Tq, dim)` and `key` tensor
            of shape `(batch_size, Tv, dim)` to calculate the attention
            `scores`.
        * Pass `scores` and `value` tensors to this method. The method applies
            `scores_mask`, calculates
            `attention_distribution = softmax(scores)`, then returns
            `matmul(attention_distribution, value)`.
        * Apply `query_mask` and return the result.

        Args:
            scores: Scores float tensor of shape `(batch_size, Tq, Tv)`.
            value: Value tensor of shape `(batch_size, Tv, dim)`.
            scores_mask: A boolean mask tensor of shape `(batch_size, 1, Tv)`
                or `(batch_size, Tq, Tv)`. If given, scores at positions where
                `scores_mask==False` do not contribute to the result. It must
                contain at least one `True` value in each line along the last
                dimension.
            training: Python boolean indicating whether the layer should behave
                in training mode (adding dropout) or in inference mode
                (no dropout).

        Returns:
            Tensor of shape `(batch_size, Tq, dim)`.
            Attention scores after masking and softmax with shape
                `(batch_size, Tq, Tv)`.
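
        A hypothetical subclassing sketch (the `CosineAttention` name and the
        cosine score are illustrative only, not part of Keras); the base class
        `call()` computes the scores via `_calculate_scores` and feeds them
        into this method:

        ```python
        from keras import ops
        from keras.layers import Attention

        class CosineAttention(Attention):
            # Override only the score function; masking, softmax, dropout,
            # and the final matmul with `value` are still handled by
            # `_apply_scores` through the base `call()`.
            def _calculate_scores(self, query, key):
                q = query / ops.sqrt(
                    ops.sum(ops.square(query), axis=-1, keepdims=True) + 1e-9
                )
                k = key / ops.sqrt(
                    ops.sum(ops.square(key), axis=-1, keepdims=True) + 1e-9
                )
                return ops.matmul(q, ops.transpose(k, axes=[0, 2, 1]))
        ```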
        Nfloat16g     @g    eAr"   r0   r-   r   r
   )
r   logical_notr"   castsoftmaxr   r   r   r   r1   )r   r8   valuescores_masktrainingpadding_mask	max_valueweightsr   r   r   _apply_scores   s   
zAttention._apply_scoresc           
      C   s   |r>t |}d|d |d f}t j|dd}t j|dd}t j|dd}t ||}	|d ur<t j|dd}t ||	S |	S |S )Nr*   r,   r0   int32)r    r"   r-   )r   r    r   cumsumgreater_equalr3   logical_and)
r   r8   v_maskuse_causal_maskscore_shape
mask_shape	ones_mask	row_index	col_indexcausal_maskr   r   r   _calculate_score_mask   s   
zAttention._calculate_score_maskc                 C   s   | j ||d || _|d }|d }t|dkr|d n|}|r$|d nd }	|r,|d nd }
| j||d}| ||
|}| j||||d\}}|	d ur\tj|	dd}	|tj|	|j	d	9 }|rb||fS |S )
Ninputsmaskr   r*   r)   )r6   r7   )r8   rA   rB   rC   r0   r-   r=   )
r%   r   lenr;   rT   rG   r   r3   r?   r"   )r   rV   rW   rC   return_attention_scoresrM   qvkq_maskrL   r8   rB   attention_outputattention_scoresr   r   r   call   s(   
zAttention.callc                 C   s4   | j ||d |d u s|d d u rd S t|d S )NrU   r   )r%   r   convert_to_tensor)r   rV   rW   r   r   r   compute_mask   s   zAttention.compute_maskc                 C   sX   |\}}}|d u r|}g |d d |d R }| j r*|d |d |d f}||fS |S )Nr0   r   r*   )r   )r   r'   query_shapevalue_shape	key_shapeoutput_shapescores_shaper   r   r   compute_output_shape   s   
zAttention.compute_output_shapec                 C   s   |  || |d }|d }t|dkr|d n|}| |j|j|jg}	t|	| jd}
| js1|rJ|jd |jd |jd f}t|| jd}|
|fS |
S )Nr   r*   r)   r=   )r%   rX   rh   r    r   compute_dtyper   )r   rV   rW   rY   rC   rM   r6   rA   r7   rf   output_specrg   attention_scores_specr   r   r   compute_output_spec  s$   	
zAttention.compute_output_specc                 C   s   | j j}t|tst| d| dt|dk st|dkr+t| dt| d|durWt|ts>t| d| dt|dk sJt|dkrYt| d| d	| ddS dS )
z'Validates arguments of the call method.zj layer must be called on a list of inputs, namely [query, value] or [query, value, key]. Received: inputs=.r)      zl layer accepts inputs list of length 2 or 3, namely [query, value] or [query, value, key]. Received length: NzL layer mask must be a list, namely [query_mask, value_mask]. Received: mask=z< layer accepts mask list of length 2 or 3. Received: inputs=z, mask=)r   __name__
isinstancelistr   rX   )r   rV   rW   
class_namer   r   r   r%   '  s<   

zAttention._validate_inputsc                    s(   t   }| j| j| jd}i ||S )N)r   r   r   )r   
get_configr   r   r   )r   base_configconfigr   r   r   rs   B  s   
zAttention.get_config)Fr   r	   N)NF)NFFF)N)NFNF)ro   
__module____qualname____doc__r   r(   r;   rG   rT   r`   rb   rh   rl   r%   rs   __classcell__r   r   r   r   r      s2    :
!.



"r   N)
	keras.srcr   r   keras.src.api_exportr   keras.src.backendr   keras.src.layers.layerr   r   r   r   r   r   <module>   s    