import re

from keras.src import ops
from keras.src.api_export import keras_export
from keras.src.optimizers import optimizer


@keras_export(["keras.optimizers.Muon"])
class Muon(optimizer.Optimizer):
    """Optimizer that implements the Muon algorithm.

    Note that this optimizer should not be used in the following layers:

    1. Embedding layer
    2. Final output fully connected layer
    3. Any {0,1}-D variables

    These should all be optimized using AdamW.

    The Muon optimizer can use both the Muon update step or the
    AdamW update step based on the following:

    - For any variable that isn't 2D, 3D or 4D, the AdamW step
        will be used. This is not configurable.
    - If the argument `exclude_embeddings` (defaults to `True`) is set
        to `True`, the AdamW step will be used for embedding layers.
    - For any variable with a name that matches an expression
        listed in the argument `exclude_layers` (a list), the
        AdamW step will be used.
    - Any other variable uses the Muon step.

    Typically, you only need to pass the name of your densely-connected
    output layer to `exclude_layers`, e.g.
    `exclude_layers=["output_dense"]`.
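
    For instance, to keep Muon off a final projection layer (the layer
    name here is illustrative, not an API requirement):

    ```python
    optimizer = keras.optimizers.Muon(
        learning_rate=1e-3,
        exclude_layers=["output_dense"],
    )
    ```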

    References:
        - [Original implementation](https://github.com/KellerJordan/Muon)
        - [Liu et al, 2025](https://arxiv.org/abs/2502.16982)

    Args:
        learning_rate: A float,
            `keras.optimizers.schedules.LearningRateSchedule` instance, or
            a callable that takes no arguments and returns the actual value to
            use. The learning rate. Defaults to `0.001`.
        adam_beta_1: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use.
            The exponential decay rate for the 1st moment estimates. Defaults to
            `0.9`.
        adam_beta_2: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use.
            The exponential decay rate for the 2nd moment estimates. Defaults to
            `0.999`.
        epsilon: A small constant for numerical stability. This is
            "epsilon hat" in the Kingma and Ba paper
            (in the formula just before Section 2.1),
            not the epsilon in Algorithm 1 of the paper.
            It is used in the AdamW update step. Defaults to `1e-7`.
        exclude_layers: List of strings, keywords of layer names to exclude.
            All layers whose path contains one of these keywords will use
            AdamW.
        exclude_embeddings: Boolean. If `True`, embedding layers will use
            AdamW.
        muon_a: Float, parameter a of the Muon algorithm.
            It is recommended to use the default value.
        muon_b: Float, parameter b of the Muon algorithm.
            It is recommended to use the default value.
        muon_c: Float, parameter c of the Muon algorithm.
            It is recommended to use the default value.
        adam_lr_ratio: Float, the ratio of the learning rate when
            using Adam to the main learning rate.
            It is recommended to set it to 0.1.
        momentum: Float, momentum used by internal SGD.
        ns_steps: Integer, number of Newton-Schulz iterations to run.
        nesterov: Boolean, whether to use Nesterov-style momentum.
        {{base_optimizer_keyword_args}}
    """

    def __init__(
        self,
        learning_rate=0.001,
        adam_beta_1=0.9,
        adam_beta_2=0.999,
        epsilon=1e-7,
        weight_decay=0.1,
        clipnorm=None,
        clipvalue=None,
        global_clipnorm=None,
        use_ema=False,
        ema_momentum=0.99,
        ema_overwrite_frequency=None,
        loss_scale_factor=None,
        gradient_accumulation_steps=None,
        name="muon",
        exclude_layers=None,
        exclude_embeddings=True,
        muon_a=3.4445,
        muon_b=-4.7750,
        muon_c=2.0315,
        adam_lr_ratio=0.1,
        momentum=0.95,
        ns_steps=6,
        nesterov=True,
        **kwargs,
    ):
        super().__init__(
            learning_rate=learning_rate,
            name=name,
            weight_decay=weight_decay,
            clipnorm=clipnorm,
            clipvalue=clipvalue,
            global_clipnorm=global_clipnorm,
            use_ema=use_ema,
            ema_momentum=ema_momentum,
            ema_overwrite_frequency=ema_overwrite_frequency,
            loss_scale_factor=loss_scale_factor,
            gradient_accumulation_steps=gradient_accumulation_steps,
            **kwargs,
        )
        self.adam_beta_1 = adam_beta_1
        self.adam_beta_2 = adam_beta_2
        self.epsilon = epsilon
        self.muon_a = muon_a
        self.muon_b = muon_b
        self.muon_c = muon_c
        self.adam_lr_ratio = adam_lr_ratio
        self.momentum = momentum
        self.ns_steps = ns_steps
        self.nesterov = nesterov
        self.exclude_embeddings = exclude_embeddings
        self.exclude_layers = exclude_layers or []

    def _should_use_adamw(self, variable):
        # Any {0,1}-D parameters should be optimized by AdamW.
        if not 1 < len(variable.shape) < 4:
            return True
        if self.exclude_embeddings and "embedding" in variable.path.lower():
            return True
        for keyword in self.exclude_layers:
            if re.search(keyword, variable.path):
                return True
        return False
zMuon._should_use_adamwc                    s|   | j rdS t | i | _i | _i | _i | _|D ]"}| |s;| j|dd| j|j	< | 
|r;| j|dd| j|j	< qdS )a  Initialize optimizer variables.

        Muon keeps two types of optimizer variables: momentums (shared by
        the Muon and AdamW paths) and velocities (only created for the
        variables that take the AdamW path).

        Args:
            var_list: list of model variables to build Adam variables on.
        """
        if self.built:
            return
        super().build(var_list)
        self.adam_momentums = {}
        self.adam_velocities = {}
        self.muon_momentums = {}
        self.muon_velocities = {}
        for var in var_list:
            if not self._overwrite_variable_with_gradient(var):
                self.adam_momentums[var.path] = (
                    self.add_variable_from_reference(
                        reference_variable=var, name="momentum"
                    )
                )
                if self._should_use_adamw(var):
                    self.adam_velocities[var.path] = (
                        self.add_variable_from_reference(
                            reference_variable=var, name="velocity"
                        )
                    )

    def update_step(self, gradient, variable, learning_rate):
        if self._should_use_adamw(variable):
            # The AdamW path runs at `learning_rate * adam_lr_ratio`.
            self._adamw_update_step(
                gradient, variable, learning_rate * self.adam_lr_ratio
            )
        else:
            self._muon_update_step(gradient, variable, learning_rate)
zMuon.update_stepc              	   C   s   | j |j }| |t||| jd   |j}| jr%t|| j| }n|}| ||| 	|| j
 td|d |d  d   d S )Nr1   r   g      ?)rA   r6   
assign_addr   addr'   r5   r)   
assign_subzeropower_via_newtonschulz5r(   max)r,   rK   r:   lrmr5   gr   r   r0   rJ      s   zMuon._muon_update_stepc                 C   s  t ||j}t ||j}t | jd |j}t t | j|j|}t t | j|j|}| j|j }| j	|j }	|t 
d|  d|  }
| |t t ||d| j  | |	t t t ||	d| j  | |t t ||
t t 
|	| j dS )z=Update step given gradient and the associated model variable.r1   N)r   castdtype
iterationspowerr    r!   rA   r6   rB   sqrtrM   multiplysubtractsquarerO   dividerN   r"   )r,   rK   r:   r   rR   
local_stepadam_beta_1_poweradam_beta_2_powerrS   valphar   r   r0   rI      s6   zMuon._adamw_update_stepc                 C   sF   t |}ttt|}|d |d< t|d |d< t ||}|S )N   )r   r5   listranger4   	transpose)r,   Xr5   
temp_orderr   r   r0   transpose_last_axis   s   
zMuon.transpose_last_axisstepsc           
      C   s   t |}t|dksJ | j| j| j}}}|d |d kr%| |}|t j|dddd  }t|D ]}|| | }|| || |  }	|| |	|  }q5|d |d kr^| |}|S )a  We apply the Newton-Schulz iteration to compute matrix G.

        We select a quintic iteration that maximizes the slope at zero. This
        approach helps minimize steps, even if the iteration doesn't fully
        converge across the interval. The result isn't exactly UV^T (from the
        SVD of G), but rather an approximation like US'V^T. Despite this
        approximation, model performance remains unaffected compared to using
        the exact UV^T from the SVD.
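
        Concretely, after rescaling `x` so its spectral norm is at most 1,
        each of the `steps` iterations applies the quintic map
        `x <- a*x + (b*A + c*(A @ A)) @ x` with `A = x @ x^T`, where
        `(a, b, c)` are `muon_a`, `muon_b` and `muon_c`.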
        """
        shape = ops.shape(x)
        assert len(shape) >= 2

        a, b, c = self.muon_a, self.muon_b, self.muon_c
        if shape[-2] > shape[-1]:
            x = self.transpose_last_axis(x)

        # Ensure the spectral norm is at most 1.
        x = x / (ops.norm(x, axis=(-2, -1), keepdims=True) + 1e-7)
        # Perform the Newton-Schulz iterations.
        for _ in range(steps):
            temp_a = x @ self.transpose_last_axis(x)
            temp_b = b * temp_a + c * temp_a @ temp_a
            x = a * x + temp_b @ x

        if shape[-2] > shape[-1]:
            x = self.transpose_last_axis(x)
        return x

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "adam_beta_1": self.adam_beta_1,
                "adam_beta_2": self.adam_beta_2,
                "epsilon": self.epsilon,
                "exclude_layers": self.exclude_layers,
                "muon_a": self.muon_a,
                "muon_b": self.muon_b,
                "muon_c": self.muon_c,
                "adam_lr_ratio": self.adam_lr_ratio,
                "momentum": self.momentum,
                "ns_steps": self.ns_steps,
                "nesterov": self.nesterov,
                "exclude_embeddings": self.exclude_embeddings,
            }
        )
        return config
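

# A minimal usage sketch (assumes a Keras 3 environment where this class is
# exported as `keras.optimizers.Muon`; the toy model and the `x_train` /
# `y_train` arrays below are hypothetical):
#
#     import keras
#
#     model = keras.Sequential(
#         [keras.layers.Dense(32, activation="relu"), keras.layers.Dense(1)]
#     )
#     model.compile(optimizer=keras.optimizers.Muon(), loss="mse")
#     model.fit(x_train, y_train)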