
import contextlib
import itertools
from contextlib import nullcontext
from functools import wraps
from typing import Any, Callable, Optional
from unittest.mock import patch

import torch
import torch._dynamo.logging
import torch.nn as nn
import torch.utils._pytree as pytree
import torch.utils.dlpack
from torch import Tensor
from torch._decomp.decompositions_for_rng import PhiloxStateTracker, rng_decompositions
from torch._dispatch.python import enable_python_dispatcher
from torch._dynamo import compiled_autograd
from torch._dynamo.utils import (
    CompileEventLogger,
    dynamo_timed,
    preserve_rng_state,
    set_feature_use,
)
from torch._guards import detect_fake_mode
from torch._inductor.cudagraph_utils import BoxedDeviceIndex
from torch._inductor.utils import BoxedBool
from torch._subclasses import FakeTensor, FakeTensorMode
from torch.export._tree_utils import reorder_kwargs
from torch.fx.experimental.proxy_tensor import make_fx
from torch.fx.experimental.symbolic_shapes import ShapeEnv

static_inputs_log = torch._logging.getArtifactLogger(
    __name__, "cudagraph_static_inputs"
)

from . import config
from ._aot_autograd.autograd_cache import (
    AOTAutogradCache,
    autograd_cache_key,
    should_use_local_autograd_cache,
    should_use_remote_autograd_cache,
)
from ._aot_autograd.collect_metadata_analysis import (
    run_functionalized_fw_and_collect_metadata,
)
from ._aot_autograd.descriptors import (
    AOTInput,
    BufferAOTInput,
    ParamAOTInput,
    PlainAOTInput,
)
from ._aot_autograd.frontend_utils import (
    _detect_attribute_assignment,
    _try_get_metadata_from_dynamo,
    construct_fake_mode,
    process_inputs,
)
from ._aot_autograd.functional_utils import (
    _check_if_mutation_can_be_in_graph,
    are_all_mutations_hidden_from_autograd,
    are_all_mutations_under_no_grad_or_inference_mode,
    assert_functional_graph,
    from_fun,
    gen_alias_from_base,
    has_data_mutation,
    has_metadata_mutation,
    is_fun,
    sync_functional_tensor,
    to_fun,
)
from ._aot_autograd.graph_capture_wrappers import (
    aot_dispatch_subclass,
    create_functional_call,
    create_functionalized_fn,
    create_functionalized_rng_ops_wrapper,
    create_joint,
    fn_input_mutations_to_outputs,
    fn_prepped_for_autograd,
)
from ._aot_autograd.graph_compile import (
    aot_stage1_graph_capture,
    aot_stage2_compile,
    aot_stage2_export,
)
from ._aot_autograd.input_output_analysis import (
    compute_overlapping_inputs,
    create_graph_signature,
    create_synthetic_base_metadata,
    remove_dupe_metadata,
)
from ._aot_autograd.logging_utils import (
    callback_set,
    describe_input,
    format_guard_bug_msg,
    get_aot_compilation_context,
    get_aot_graph_name,
    get_graph_being_compiled,
    graph_being_compiled,
    model_name,
    nth_graph,
    set_model_name,
    setup_stacktrace_preservation_hooks,
    track_graph_compiling,
)
from ._aot_autograd.runtime_wrappers import (
    AOTDedupeWrapper,
    AOTSyntheticBaseWrapper,
)
from ._aot_autograd.schemas import (
    AOTConfig,
    AOTDispatchCompiler,
    AOTGraphCapture,
    AOTState,
    BackwardSignature,
    FakifiedFlatArgs,
    FQN,
    GraphInputName,
    GraphOutputName,
    GraphSignature,
    InputAliasInfo,
    JointWithDescriptors,
    MutationType,
    OutputAliasInfo,
    OutputType,
    SerializableAOTDispatchCompiler,
    SubclassCreationMeta,
    SubclassMeta,
    TensorAlias,
    ViewAndMutationMeta,
)
from ._aot_autograd.subclass_utils import (
    requires_subclass_dispatch,
    unwrap_tensor_subclasses,
    unwrap_tensor_subclasses_with_indices_to_original,
    wrap_tensor_subclasses,
    wrap_tensor_subclasses_maybe_joint,
)
from ._aot_autograd.utils import (
    _get_autocast_states,
    _get_symint_hints,
    call_func_at_runtime_with_args,
    create_tree_flattened_fn,
    KNOWN_TYPES,
    make_boxed_compiler,
    make_boxed_func,
    maybe_to_fresh_input,
    normalize_as_list,
    partial_flatten_asdict,
    root_module_when_exporting_non_strict,
    simple_wraps,
    strict_zip,
)
from .partitioners import default_partition


zip = strict_zip

AOT_COUNTER = itertools.count()

aot_autograd_decompositions = {}


def create_aot_state(
    stack: contextlib.ExitStack,
    flat_fn,
    fake_flat_args: FakifiedFlatArgs,
    flat_args_descs: list[AOTInput],
    aot_config: AOTConfig,
    fake_mode: FakeTensorMode,
    shape_env: Optional[ShapeEnv],
) -> AOTState:
    """
    Traces the forward and backward graphs of :attr:`flat_fn` to generate a
    joint graph. The joint graph is an Fx graph with Aten ops. Please refer to
    the tracing mechanism to understand the graph capturing details.

    The joint graph is then passed through :attr:`partition_fn` to isolate the
    forward and backward portions, which are then respectively compiled via the
    provided :attr:`fw_compiler` and :attr:`bw_compiler`.

    The resulting compiled forward and backward graphs are then wrapped up in a
    ``torch.autograd.Function`` object.

    The calling convention here is that the first aot_config.num_params_buffers
    inputs in flat_args are parameters and buffers, and the rest are inputs.

    We use this to assume that parameters' and buffers' shapes don't change.
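
    A minimal calling sketch (illustrative only; ``flat_fn``, the fake
    arguments and the config are hypothetical stand-ins for values a caller
    would normally build via ``process_inputs`` and ``AOTConfig``):

        >>> # xdoctest: +SKIP
        >>> with contextlib.ExitStack() as stack:
        >>>     aot_state = create_aot_state(
        >>>         stack, flat_fn, fake_flat_args, flat_args_descs,
        >>>         aot_config, fake_mode, shape_env,
        >>>     )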
    """
    stack.enter_context(
        dynamo_timed("create_aot_dispatcher_function", log_pt2_compile_event=True)
    )

    if aot_config.decompositions is None:
        aot_config.decompositions = {}

    aot_config.decompositions = {
        **aot_autograd_decompositions,
        **aot_config.decompositions,
    }

    if config.functionalize_rng_ops:
        # Update the decompositions with functionalized random decompositions
        aot_config.decompositions = {
            **rng_decompositions,
            **aot_config.decompositions,
        }

    python_dispatcher_mode = (
        enable_python_dispatcher() if shape_env is not None else nullcontext()
    )

    stack.enter_context(torch.autograd.set_multithreading_enabled(False))
    stack.enter_context(preserve_rng_state())
    stack.enter_context(fake_mode)
    stack.enter_context(python_dispatcher_mode)
    stack.enter_context(
        torch._dynamo.utils._disable_saved_tensors_hooks_during_tracing()
    )

    from torch._library.fake_class_registry import FakeScriptObject, maybe_to_fake_obj

    def _dup_fake_script_obj(fake_flat_args):
        return [
            maybe_to_fake_obj(detect_fake_mode(fake_flat_args), arg.real_obj)
            if isinstance(arg, FakeScriptObject)
            else arg
            for arg in fake_flat_args
        ]

    needs_autograd = any(
        x.requires_grad for x in fake_flat_args if isinstance(x, Tensor)
    )

    with enable_python_dispatcher():
        # Patch set_rng_state, as set_rng_state with fake tensors is
        # nonsensical; this does not affect the collection of metadata.
        with patch("torch.cuda.set_rng_state", lambda *args: None):
            mod = root_module_when_exporting_non_strict(flat_fn)
            if mod is not None:
                ctx = _detect_attribute_assignment(mod)
            else:
                ctx = nullcontext()

            if config.fake_tensor_propagate_real_tensors:
                dynamo_timed_ctx = nullcontext()
            else:
                dynamo_timed_ctx = dynamo_timed(
                    "aot_collect_metadata", log_pt2_compile_event=True
                )

            with dynamo_timed_ctx, ctx:
                fw_metadata = run_functionalized_fw_and_collect_metadata(
                    flat_fn,
                    flat_args_descs,
                    static_input_indices=aot_config.static_input_indices,
                    keep_input_mutations=aot_config.keep_inference_input_mutations,
                    is_train=needs_autograd,
                    pre_dispatch=aot_config.pre_dispatch,
                    is_export=aot_config.is_export,
                )(*fake_flat_args)

            req_subclass_dispatch = requires_subclass_dispatch(
                fake_flat_args, fw_metadata
            )
            CompileEventLogger.try_add_pt2_compile(
                "backend_compile", requires_subclass_dispatch=req_subclass_dispatch
            )

            output_and_mutation_safe = not any(
                x.requires_grad
                and not (
                    x.output_type in (OutputType.alias_of_input, OutputType.is_input)
                    and fw_metadata.input_info[x.base_idx].requires_grad
                )
                for x in fw_metadata.output_info
            ) and not any(
                x.requires_grad
                and x.mutates_data
                and not x.mutations_under_no_grad_or_inference_mode
                and not x.mutations_hidden_from_autograd
                for x in fw_metadata.input_info
            )

            if needs_autograd and output_and_mutation_safe:
                # We realized that none of the outputs require grad, so we
                # actually have an inference graph; re-derive the metadata
                # accordingly.
                needs_autograd = False
                if req_subclass_dispatch:
                    fw_metadata = run_functionalized_fw_and_collect_metadata(
                        flat_fn,
                        flat_args_descs,
                        static_input_indices=aot_config.static_input_indices,
                        keep_input_mutations=aot_config.keep_inference_input_mutations,
                        is_train=False,
                        pre_dispatch=aot_config.pre_dispatch,
                    )(*fake_flat_args)
                else:
                    fw_metadata = ViewAndMutationMeta(
                        input_info=fw_metadata.input_info,
                        output_info=fw_metadata.output_info,
                        num_intermediate_bases=fw_metadata.num_intermediate_bases,
                        keep_input_mutations=aot_config.keep_inference_input_mutations,
                        traced_tangents=fw_metadata.traced_tangents,
                        traced_tangents_descs=fw_metadata.traced_tangents_descs,
                        subclass_inp_meta=fw_metadata.subclass_inp_meta,
                        subclass_fw_graph_out_meta=fw_metadata.subclass_fw_graph_out_meta,
                        subclass_tangent_meta=fw_metadata.subclass_tangent_meta,
                        is_train=False,
                        tokens=fw_metadata.tokens,
                        static_input_indices=fw_metadata.static_input_indices,
                    )

    if fw_metadata.num_intermediate_bases > 0:
        assert not req_subclass_dispatch, f"""\
torch.compile is currently being used with tensor subclass inputs.
We are attempting to a compile a graph with two graph outputs
that alias one another, specifically output indices:

    {[i for i, x in enumerate(fw_metadata.output_info) if x.output_type == OutputType.alias_of_intermediate]}

ANY output aliasing (even for regular tensors) is currently unsupported if
there are any subclass outputs. If you run into this, please file a github
issue"""

    if aot_config.is_export:
        if len([x for x in fw_metadata.input_info if x.mutates_metadata]) != 0:
            raise RuntimeError(f"""\
Found an input that received a metadata mutation, through e.g. a call to `.resize_()` or `.transpose_()`.
This is currently banned in the aot_export workflow. If you need this functionality, please file a github issue.

fw_metadata={str(fw_metadata)}""")
        if (
            len(
                [
                    x
                    for x in fw_metadata.input_info
                    if x.requires_grad and x.mutates_data
                ]
            )
            != 0
            and aot_config.export_trace_joint
        ):
            raise RuntimeError(f"""\
Found a graph input that requires gradients, and received a mutation.
This is currently banned in the aot_export workflow. If you need this functionality, please file a github issue.

fw_metadata={str(fw_metadata)}""")
        if req_subclass_dispatch:
            raise RuntimeError("""\
aot_export is not currently supported with traceable tensor subclass.
If you need this feature, please comment on <CREATE_ISSUE_LINK>""")
        if config.functionalize_rng_ops:
            raise RuntimeError("""\
Functionalized RNG is not currently supported in the aot_export workflow. Please file a github issue,
or otherwise set torch._functorch.config.functionalize_rng_ops = False.""")

    return AOTState(
        needs_autograd=needs_autograd,
        flat_args=_dup_fake_script_obj(fake_flat_args),
        flat_args_descs=flat_args_descs,
        aot_config=aot_config,
        fw_metadata=fw_metadata,
        fake_mode=fake_mode,
        shape_env=shape_env,
        stack=stack,
    )


def aot_function(
    fn: Callable,
    fw_compiler: Callable,
    bw_compiler: Optional[Callable] = None,
    partition_fn: Callable = default_partition,
    decompositions: Optional[dict] = None,
    num_params_buffers: int = 0,
    keep_inference_input_mutations: bool = False,
    inference_compiler: Optional[Callable] = None,
    *,
    dynamic: bool = False,
    enable_log: bool = True,
) -> Callable:
    """
    Traces the forward and backward graph of :attr:`fn` using torch dispatch
    mechanism, and then compiles the generated forward and backward graphs
    through :attr:`fw_compiler` and :attr:`bw_compiler`.

    :func:`aot_function` traces the forward and backward graph ahead of time,
    and generates a joint forward and backward graph.  :attr:`partition_fn` is
    then used to separate out forward and backward graphs. The partitioner
    function can be used to perform optimizations such as recomputation. One can
    set `decompositions` dictionary to decompose the operators into a sequence
    of core or simpler operators supported by the backend compilers.

    .. warning::
        This API is experimental and likely to change.

    Args:
        fn (Callable): A Python function that takes one or more arguments. Must
            return one or more Tensors.
        fw_compiler (Callable): A Python function that accepts an Fx graph with
            Aten ops and input args, and returns a Callable that semantically is
            equivalent to the input Fx graph.
        bw_compiler (Optional[Callable]): A Python function that accepts an
            Fx graph with Aten ops and input args, and returns a Callable that
            semantically is equivalent to the input Fx graph.  Default: None
            (when None, it defaults to the :attr:`fw_compiler`)
        partition_fn (Callable): A Python function that takes a joint forward
            and backward graph, and partitions it into separate forward and
            backward graphs.
        decompositions (Dict): A dictionary to define the decomposition of
            larger Aten ops into simpler or core Aten ops.
        inference_compiler (Optional[Callable]): A Python function that accepts an
            Fx graph with Aten ops and input args, and returns a Callable that
            semantically is equivalent to the input Fx graph. inference_compiler is invoked
            if no autograd is needed. Default: None
            (when None, it defaults to the :attr:`fw_compiler`)
    Returns:
        Returns a ``Callable`` that retains the eager behavior of the original
        :attr:`fn`, but with forward and backward graph compiled via
        :attr:`fw_compile` and :attr:`bw_compile`.

    A simple example usage of :func:`aot_function` is as follows. This example
    will print the forward and backward graphs of the function ``fn``

        >>> fn = lambda x: x.sin().cos()
        >>> def print_compile_fn(fx_module, args):
        >>>     print(fx_module)
        >>>     return fx_module
        >>> aot_fn = aot_function(fn, print_compile_fn)
        >>> x = torch.randn(4, 5, requires_grad=True)
        >>> aot_fn(x)
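
    A variation of the example above that also supplies a separate backward
    compiler and a decomposition table (``core_aten_decompositions`` is just
    one plausible choice here):

        >>> # xdoctest: +SKIP
        >>> from torch._decomp import core_aten_decompositions
        >>> aot_fn = aot_function(
        >>>     fn,
        >>>     fw_compiler=print_compile_fn,
        >>>     bw_compiler=print_compile_fn,
        >>>     decompositions=core_aten_decompositions(),
        >>> )
        >>> aot_fn(torch.randn(4, 5, requires_grad=True)).sum().backward()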
    """
    if bw_compiler is None:
        bw_compiler = fw_compiler
    if inference_compiler is None:
        inference_compiler = fw_compiler
    aot_config = AOTConfig(
        fw_compiler=fw_compiler,
        bw_compiler=bw_compiler,
        inference_compiler=inference_compiler,
        partition_fn=partition_fn,
        decompositions=decompositions,
        num_params_buffers=num_params_buffers,
        aot_id=next(AOT_COUNTER),
        keep_inference_input_mutations=keep_inference_input_mutations,
        dynamic_shapes=dynamic,
        aot_autograd_arg_pos_to_source=None,
        is_export=False,
        no_tangents=False,
        enable_log=enable_log,
    )
    cached_res = None

    @wraps(fn)
    def returned_function(*args, **kwargs):
        nonlocal cached_res
        flat_args = pytree.arg_tree_leaves(*args, **kwargs)

        # Compile the function and save it in the cache
        if cached_res is None:
            flat_fn, out_spec = create_tree_flattened_fn(fn, args, kwargs)
            fake_mode, shape_env = construct_fake_mode(flat_args, aot_config)
            fake_flat_args = process_inputs(
                flat_args, aot_config, fake_mode, shape_env
            )
            fake_flat_args_descs = [
                PlainAOTInput(i) for i in range(len(fake_flat_args))
            ]
            with contextlib.ExitStack() as stack:
                aot_state = create_aot_state(
                    stack,
                    flat_fn,
                    fake_flat_args,
                    fake_flat_args_descs,
                    aot_config,
                    fake_mode,
                    shape_env,
                )
                aot_graph_capture = aot_stage1_graph_capture(aot_state, flat_fn)
                compiled_fn, _ = aot_stage2_compile(aot_state, aot_graph_capture)
            cached_res = (compiled_fn, out_spec)

        cached_fn, out_spec = cached_res
        out = cached_fn(flat_args)
        return out_spec.unflatten(out)

    return returned_function


def aot_module(mod: nn.Module, *args, **kwargs) -> nn.Module:
    """
    Traces the forward and backward graph of :attr:`mod` using torch dispatch
    tracing mechanism. It is a wrapper function that, underneath, uses
    :func:`aot_function` to perform tracing and compilation.

    :func:`aot_module` lifts the parameters and buffers of ``nn.Module`` as inputs
    to a new callable which is then compiled through :func:`aot_function`.

    .. warning::
        This API is experimental and likely to change.

    Args:
        mod (Callable): A ``nn.Module`` module.
        args : args to be passed to :func:`aot_function`
        kwargs : kwargs to be passed to :func:`aot_function`

    Returns:
        Returns a ``nn.Module`` that retains the eager behavior of the original
        :attr:`mod`, but with forward and backward graph compiled.
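
    A usage sketch, mirroring the :func:`aot_function` example
    (``print_compile_fn`` is a hypothetical compiler callback):

        >>> # xdoctest: +SKIP
        >>> def print_compile_fn(fx_module, args):
        >>>     print(fx_module)
        >>>     return fx_module
        >>> aot_mod = aot_module(torch.nn.Linear(4, 4), fw_compiler=print_compile_fn)
        >>> aot_mod(torch.randn(2, 4)).sum().backward()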
    """
    torch._dynamo.utils.assert_no_fake_params_or_buffers(mod)

    def functional_call(named_params, named_buffers, *args, **kwargs):
        params_and_buffers = {**named_params, **named_buffers}
        return torch.func.functional_call(mod, params_and_buffers, args, kwargs)

    named_params = dict(mod.named_parameters(remove_duplicate=False))
    named_buffers = dict(mod.named_buffers(remove_duplicate=False))
    num_params_buffers = len(named_params) + len(named_buffers)
    compiled_f = aot_function(
        functional_call, *args, num_params_buffers=num_params_buffers, **kwargs
    )

    class AOTModule(nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.orig_module = mod

        def forward(self, *args, **kwargs):
            return compiled_f(
                dict(self.orig_module.named_parameters(remove_duplicate=False)),
                dict(self.orig_module.named_buffers(remove_duplicate=False)),
                *args,
                **kwargs,
            )

    return AOTModule()


def prepare_aot_module_simplified(
    mod: nn.Module,
    args,
    fw_compiler: Optional[Callable],
    bw_compiler: Optional[Callable],
    partition_fn: Callable,
    decompositions: Optional[dict],
    keep_inference_input_mutations: bool,
    inference_compiler: Optional[Callable],
    *,
    force_non_lazy_backward_lowering: bool = False,
    boxed_forward_device_index: Optional[BoxedDeviceIndex] = None,
    ignore_shape_env: bool = False,
    flatten: bool = False,
    kwargs: Optional[dict] = None,
):
    if flatten:
        assert kwargs is not None
    else:
        assert kwargs is None

    params = dict(mod.named_parameters(remove_duplicate=False))
    buffers = dict(mod.named_buffers(remove_duplicate=False))

    params_flat, params_spec = list(params.values()), list(params.keys())
    params_len = len(params_flat)
    buffers_flat, buffers_spec = list(buffers.values()), list(buffers.keys())
    buffers_len = len(buffers_flat)

    params_buffers = {**params, **buffers}
    params_buffers_flat = params_flat + buffers_flat
    num_params_buffers = params_len + buffers_len

    functional_call = create_functional_call(
        mod, params_spec, num_params_buffers, store_orig_mod=True
    )

    full_args = []
    full_args.extend(params_buffers_flat)
    full_args.extend(args)

    in_spec, out_spec = None, None
    if flatten:
        # The export path wants a fully pytree-flattened calling convention,
        # also recording the input/output TreeSpecs.
        functional_call, out_spec = create_tree_flattened_fn(
            functional_call, full_args, kwargs
        )
        _, in_spec = pytree.tree_flatten((args, kwargs))

    flat_args_descs: list[AOTInput] = []
    flat_args_descs.extend(ParamAOTInput(fqn) for fqn in params_spec)
    flat_args_descs.extend(BufferAOTInput(fqn) for fqn in buffers_spec)
    flat_args_descs.extend(PlainAOTInput(i) for i in range(len(args)))

    if (tracing_context := torch._guards.TracingContext.try_get()) is not None:
        tracing_context.params_flat = params_flat
        (
            tracing_context.params_flat_unwrap_subclasses,
            tracing_context.params_unwrapped_to_flat_index,
        ) = unwrap_tensor_subclasses_with_indices_to_original(params_flat)

    aot_autograd_arg_pos_to_source, static_input_indices = (
        _try_get_metadata_from_dynamo(mod, params_buffers.keys(), len(full_args))
    )

    dynamic_shapes = False
    for x in full_args:
        if isinstance(x, FakeTensor):
            dynamic_shapes = x.fake_mode.shape_env is not None
            break

    aot_config = AOTConfig(
        fw_compiler=fw_compiler,
        bw_compiler=bw_compiler,
        inference_compiler=inference_compiler,
        partition_fn=partition_fn,
        decompositions=decompositions,
        num_params_buffers=num_params_buffers,
        aot_id=next(AOT_COUNTER),
        keep_inference_input_mutations=keep_inference_input_mutations,
        dynamic_shapes=dynamic_shapes,
        aot_autograd_arg_pos_to_source=aot_autograd_arg_pos_to_source,
        static_input_indices=static_input_indices,
        is_export=False,
        no_tangents=False,
        cache_info=None,
        force_non_lazy_backward_lowering=force_non_lazy_backward_lowering,
        precompile_backend_id=getattr(mod, "_backend_id", None),
    )

    fake_mode, shape_env = construct_fake_mode(full_args, aot_config)
    fake_flat_args = process_inputs(
        full_args, aot_config, fake_mode, shape_env, ignore_shape_env=ignore_shape_env
    )

    return (
        functional_call,
        params_spec,
        buffers_spec,
        params_buffers_flat,
        fake_mode,
        shape_env,
        aot_config,
        fake_flat_args,
        flat_args_descs,
        in_spec,
        out_spec,
    )


def aot_module_simplified(
    mod: nn.Module,
    args,
    fw_compiler: Callable,
    bw_compiler: Optional[Callable] = None,
    partition_fn: Callable = default_partition,
    decompositions: Optional[dict] = None,
    keep_inference_input_mutations: bool = False,
    inference_compiler: Optional[Callable] = None,
    cudagraphs: Optional[BoxedBool] = None,
    boxed_forward_device_index: Optional[BoxedDeviceIndex] = None,
    ignore_shape_env: bool = False,
) -> Callable:
    """
    This is the simplified or low overhead version of aot_module. For frontends
    like TorchDynamo, the input functions/modules to AOT are static and have
    unpacked inputs/outputs. This gives us an opportunity to remove the
        (1) pytree overhead to parse inputs/outputs,
        (2) AOT Autograd cache,
        (3) Reading of params/buffers in every forward call

    :func:`aot_module_simplified` removes these overheads.
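
    A usage sketch; in practice TorchDynamo backends call this entry point
    with a ``GraphModule`` and example inputs, and ``my_compiler`` below is a
    hypothetical backend callback:

        >>> # xdoctest: +SKIP
        >>> def my_compiler(gm, example_inputs):
        >>>     return gm.forward  # a real backend would compile here
        >>> x = torch.randn(2, 4)
        >>> fwd = aot_module_simplified(torch.nn.Linear(4, 4), (x,), fw_compiler=my_compiler)
        >>> out = fwd(x)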
    """
    if cudagraphs is None:
        cudagraphs = BoxedBool(torch._inductor.config.triton.cudagraphs)
    if bw_compiler is None:
        bw_compiler = fw_compiler
    if inference_compiler is None:
        inference_compiler = fw_compiler

    with contextlib.ExitStack() as stack:
        (
            functional_call,
            _params_spec,
            _buffers_spec,
            params_buffers_flat,
            fake_mode,
            shape_env,
            aot_config,
            fake_flat_args,
            flat_args_descs,
            _in_spec,
            _out_spec,
        ) = prepare_aot_module_simplified(
            mod,
            args,
            fw_compiler,
            bw_compiler,
            partition_fn,
            decompositions,
            keep_inference_input_mutations,
            inference_compiler,
            boxed_forward_device_index=boxed_forward_device_index,
            ignore_shape_env=ignore_shape_env,
            flatten=False,
        )

        compiled_fn = None
        if isinstance(fw_compiler, SerializableAOTDispatchCompiler):
            local = should_use_local_autograd_cache()
            remote = should_use_remote_autograd_cache()
            if local or remote:
                set_feature_use("aot_autograd_remote_cache", remote)
                compiled_fn = AOTAutogradCache.try_load(
                    mod,
                    fake_flat_args,
                    aot_config,
                    cudagraphs,
                    boxed_forward_device_index,
                    local,
                    remote,
                )

        if compiled_fn is None:
            stack.enter_context(compiled_autograd._disable())
            aot_state = create_aot_state(
                stack,
                functional_call,
                fake_flat_args,
                flat_args_descs,
                aot_config,
                fake_mode,
                shape_env,
            )
            aot_graph_capture = aot_stage1_graph_capture(aot_state, functional_call)
            compiled_fn, _ = aot_stage2_compile(aot_state, aot_graph_capture)

    if isinstance(mod, torch._dynamo.utils.GmWrapper):
        # This branch is hit via the flatten_graph_inputs wrapper, which boxes
        # the runtime inputs so they can be freed before the call returns.
        def forward(runtime_args: list[Any]):
            flat_args = []
            flat_args.extend(params_buffers_flat)
            flat_args.extend(runtime_args)
            runtime_args.clear()
            return compiled_fn(flat_args)

    else:

        def forward(*runtime_args: Any):
            full_args = []
            full_args.extend(params_buffers_flat)
            full_args.extend(runtime_args)
            return compiled_fn(full_args)

    # Just for convenience
    forward.zero_grad = mod.zero_grad
    forward.named_parameters = mod.named_parameters
    forward.named_buffers = mod.named_buffers

    return forward


def boxed_nop_preserve_node_meta(fx_g, example_inputs):
    def run(args):
        with torch.fx.traceback.preserve_node_meta():
            return torch.fx.Interpreter(fx_g).boxed_run(args)

    run._boxed_call = True
    return run


def aot_export_joint_with_descriptors(
    stack: contextlib.ExitStack,
    mod: nn.Module,
    args,
    kwargs=None,
    *,
    fw_compiler: Optional[AOTDispatchCompiler] = None,
    bw_compiler: Optional[AOTDispatchCompiler] = None,
    partition_fn: Callable = default_partition,
    decompositions: Optional[dict] = None,
    ignore_shape_env: bool = False,
) -> JointWithDescriptors:
    """
    This API captures the joint graph for an nn.Module.  However, unlike
    aot_export_joint_simple or aot_export_module(trace_joint=True), the
    calling convention of the produced joint graph follows no fixed positional
    schema; for example, you cannot rely on the second argument of the traced
    joint graph to correspond to the second argument of the module you traced.
    However, the inputs and outputs of the traced graph are schematized
    with **descriptors**, annotated on meta['desc'] on the placeholder and
    return FX nodes, which you can use to determine the meaning of arguments.

    The major benefit of using this export rather than aot_export_joint_simple
    is that we have feature parity with all situations that torch.compile
    supports (via aot_module_simplified), including handling for more
    complicated cases such as multiple differentiable outputs, input mutations
    that must be handled outside of the graph, tensor subclasses, etc.

    What can you do with one of these joint graphs with descriptors?  The
    motivating use case (autoparallel) involves taking the joint graph, doing
    optimizations on it, and then turning it back into a callable so it can be
    torch.compile'd at a later point in time.  This cannot be done as a
    traditional torch.compile joint graph pass for two reasons:

        1. The sharding of parameters must be decided before parameter
           initialization / checkpoint load, far before torch.compile would
           ordinarily run.

        2. We need to change the meaning of parameters (e.g., we might replace
           a replicated parameter with a sharded version of it, changing its
           input size).  torch.compile is ordinarily semantics preserving, and
           not allowed to change the meaning of inputs.

    Some descriptors can be quite exotic, so we recommend thinking carefully
    if there is a safe fallback you can apply to descriptors you don't understand.
    For example, you should have some way to handle not finding a particular
    input exactly as is in the final FX graph inputs.

    Note: When using this API, you must create and enter an ExitStack context
    manager, which will be passed into this function.  This context manager
    must remain active if you call the compile function to finish compilation.
    (TODO: We may relax this requirement by having AOTAutograd keep track of
    how to reconstruct all the context managers at a later point in time.)

    NB: You're not obligated to do a /full/ compile in stage2; instead you can
    leave the forward/backward compilers unspecified in which case the
    partitioned FX graphs will directly run.  The overall autograd Function
    can be allowed in graph so you can reprocess it in the context of a
    (potentially larger) compiled region later.

    NB: These APIs do NOT hit cache, as we only ever cache the final compile results,
    not the intermediate export result.

    NB: If the passed nn.Module has parameters and buffers on it, we will
    generate extra implicit parameter/buffer arguments and assign ParamAOTInput
    and BufferAOTInput descriptors to them.  However, if you generate the input
    nn.Module from a mechanism like Dynamo, you will NOT get these descriptors
    (because Dynamo will already have taken care of lifting the parameters/buffers
    into arguments!)  In that case, it would be necessary to analyze the Sources
    of the inputs to determine if inputs are parameters and their FQNs.
    NT)r  r  )
_aot_state_aot_graph_capturer0  r3  r9  r   )	r<  rw   r   r   rG  r   r<   specr\   )rx   r   r   r   r   r   r  r   r   r   _params_buffers_flatr0  r3  ry   r:  r{   r|   r}   r9  r   r   r   s                         r   !aot_export_joint_with_descriptorsrc  }  s    h 	&& *.%	0 
)2245 I 1OL==$$$,! r   jdc                      t         j                   j                        \  }t              t        j
                  j                   fd              }|S )a  
    Companion function for aot_export_joint_with_descriptors which compiles the joint
    graph into a callable function that follows a standard calling convention.
    All of ``params_flat`` are passed as leading arguments.

    Note: We do NOT instantiate the module; this gives you the flexibility to subclass it and
    customize its behavior without having to worry about FQN rebinding.

    TODO: Consider if we should allow_in_graph the result by default.
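
    Sketch, continuing the export example above (``params_flat`` and
    ``user_args`` are placeholders for the flattened parameters/buffers and
    the user inputs):

        >>> # xdoctest: +SKIP
        >>> compiled = aot_compile_joint_with_descriptors(jd)
        >>> out = compiled(*params_flat, *user_args)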
    """
    compiled_fn, _ = aot_stage2_compile(jd._aot_state, jd._aot_graph_capture)

    @simple_wraps(compiled_fn)
    def unflattened_compiled_fn(*args, **kwargs):
        flat_args = pytree.tree_leaves((args, reorder_kwargs(kwargs, jd.in_spec)))
        flat_outs = compiled_fn(flat_args)
        return pytree.tree_unflatten(flat_outs, jd.out_spec)

    return unflattened_compiled_fn


def aot_export_module(
    mod: nn.Module,
    args,
    *,
    decompositions: Optional[dict] = None,
    # If true, we'll return a joint forward-backward graph.
    trace_joint: bool,
    # If trace_joint is True, we expect the module to return a scalar loss;
    # this is the index of that output.
    output_loss_index: Optional[int] = None,
    pre_dispatch: bool = False,
    dynamic_shapes: Optional[bool] = None,
    kwargs=None,
) -> tuple[torch.fx.GraphModule, GraphSignature]:
    """
    This function takes in a module, and returns:
    (1) an FX graph that can be exported
    (2) some metadata about the graph

    If `trace_joint=True` we will return a joint graph of the forward + backward.

    The traced FX graph will have the following properties compared to the original module:
    (1) Inputs and outputs to the module will be pytree-flattened
    (2) Parameters and buffers on the module will be lifted into graph inputs,
        graph_inputs = (*parameters, *buffers, *user_inputs)
    (3) The graph will be fully functionalized
    (4) Any input mutations will be converted into additional outputs in the graph,
        meaning whoever calls this graph is responsible for applying the mutations
        back to the original inputs.
    (5) If is_joint is provided the graph will return parameter gradients in addition to user outputs.
        The graph output will look like:
        graph_outputs = (*updated_inputs, *user_outputs, *param_gradients)

    There are also several restrictions on what modules can use this API. In particular:
    (1) If trace_joint is specified, we expect the loss function to be **fused**
        into the module forward. One of the outputs to the forward must be a scalar loss,
        which is specified with `output_loss_index`.
        All other outputs to the forward are presumed to not require gradients.
    (2) This API cannot capture optimizers (although in theory we could build an API for this).
    (3) Metadata mutations on params/buffers/inputs are banned.
    (4) Data mutations on anything that requires gradients are banned (parameters)
    (5) If an input is mutated, it is not allowed to alias any other inputs.
    (6) Parameters must not be duplicated.
    z7pre_dispatch is not supported when trace_joint is True.Fr   T)store_orig_modNc                      |  }t        d      t        |t        j                        r|f}t        |t        t
    """
    if pre_dispatch and trace_joint:
        raise RuntimeError("pre_dispatch is not supported when trace_joint is True.")
    named_parameters = dict(mod.named_parameters(remove_duplicate=False))
    named_buffers = dict(mod.named_buffers(remove_duplicate=False))

    params_and_buffers = {**named_parameters, **named_buffers}
    params_and_buffers_flat, params_spec = pytree.tree_flatten(params_and_buffers)
    params_and_buffers_flat = tuple(params_and_buffers_flat)
    params_len = len(params_and_buffers_flat)

    kwargs = kwargs or {}

    functional_call = create_functional_call(
        mod, params_spec, params_len, store_orig_mod=True
    )

    num_fw_outs = None

    if trace_joint:
        # This wrapper adds some extra asserts about what the backward will
        # look like: outputs must include a scalar loss that we compute
        # gradients with respect to, and nothing else may require gradients.
        def fn_to_trace(*args):
            nonlocal num_fw_outs
            out = functional_call(*args)
            if output_loss_index is None:
                raise RuntimeError("""\
If trace_joint=True, it is required that one of your forward outputs must be a scalar loss.
You must specify which (index) output is the loss with output_loss_index.""")
            if isinstance(out, torch.Tensor):
                out = (out,)
            if not isinstance(out, (tuple, list)):
                raise RuntimeError(
                    f"Expected forward output to be either a tensor or a list/tuple of tensors. found {type(out)}"
                )

            for i, o in enumerate(out):
                # We only create a backward graph w.r.t. the loss that the
                # user passed in; every other output must not require grad.
                if o.requires_grad and i != output_loss_index:
                    raise RuntimeError(f"""\
Found an output of the forward that requires gradients, that was not the scalar loss.
We require all outputs to the forward that are not the scalar loss to not require gradient,
because we will only compute a backward graph against the scalar loss.
You can fix this by calling .detach() on each of your forward outputs that is not the loss.
You specified that output index {output_loss_index} is the loss, but we found that
the output at index {i} requires gradients.""")
            out_loss = out[output_loss_index]
            num_fw_outs = len(out)
            if not out_loss.requires_grad:
                raise RuntimeError(f"""\
The output at index {output_loss_index} was marked as the loss, but it does not require gradients""")
            if out_loss.numel() != 1:
                raise RuntimeError(f"""\
We require the output marked as the loss (at index {output_loss_index}) to be a scalar, but it has shape {out_loss.shape}""")
            return out

        ctx = nullcontext
    else:
        # Run under no_grad, so our tracing machinery only traces an
        # inference graph.
        ctx = torch.no_grad
        fn_to_trace = functional_call

    full_args = []
    # First, the params and buffers
    full_args.extend(params_and_buffers_flat)
    # Next, the input args
    full_args.extend(args)

    with ctx():
        fx_g, metadata, in_spec, out_spec = _aot_export_function(
            fn_to_trace,
            full_args,
            decompositions=decompositions,
            num_params_buffers=params_len,
            no_tangents=True,
            pre_dispatch=pre_dispatch,
            dynamic_shapes=dynamic_shapes,
            kwargs=kwargs,
        )

    if trace_joint:

        def flattened_joint(*args):
            # The joint graph traced by AOTAutograd accepts (primals, tangents)
            # and returns (fw_outs, gradients). For a single exported joint
            # graph we remove the (always-None) tangent inputs, drop the
            # pytrees, and keep only gradients for grad-requiring inputs.
            fake_tangents = [
                None
                for _ in range(
                    metadata.num_outputs + metadata.num_mutated_inp_runtime_indices
                )
            ]
            fw_outs, gradients = fx_g(args, fake_tangents)
            assert len(gradients) == len(args)
            output_gradients = []
            for a, grad in zip(args, gradients):
                if isinstance(a, torch.Tensor) and a.requires_grad:
                    assert grad is not None, """\
Found a parameter that did not receive a gradient.
"This is most likely a bug, but if this needs to be supported please comment on this Github issue:
https://github.com/pytorch/pytorch/issues/101192
"""
                    output_gradients.append(grad)
                else:
                    assert grad is None
            return *fw_outs, *output_gradients

        fx_g = make_fx(flattened_joint, record_module_stack=True)(*full_args)

    user_args_flat = pytree.arg_tree_leaves(*args, **kwargs)
    return fx_g, create_graph_signature(
        fx_g,
        metadata,
        in_spec,
        out_spec,
        user_args_flat=user_args_flat,
        params_and_buffers_flat=params_and_buffers_flat,
        param_names=list(named_parameters.keys()),
        buffer_names=list(named_buffers.keys()),
        trace_joint=trace_joint,
        num_user_fw_outs=num_fw_outs,
        loss_index=output_loss_index,
    )


def aot_export_joint_simple(
    func: Callable,
    args,
    *,
    trace_joint: bool,
    num_params_buffers: int = 0,
    decompositions: Optional[dict] = None,
) -> torch.fx.GraphModule:
    """
    A simplified version of export. Used by higher order operators.

    This function makes a high-level "no calling convention changes" guarantee:
    - If no inputs require grad (so we export an inference graph),
      there are *no* calling convention change between the exported graph, and "func".
    - If at least one input requires grad (so we trace out and export a joint fw-bw graph),
      Then if you were to partition the graph into a separate forward and backward graph,
      The forward graph will have no calling convention changes compared to "func".

    The above also relies on some strong restrictions around which functions this API accepts:
    (1) `args` cannot contain any pytrees (they must have been pytree_flattened already)
    (2) `func` cannot mutate any inputs
    (3) The outputs of `func` cannot alias any inputs.

    Note: this function is only lightly tested today. It will probably be tested more heavily by higher order ops.
    )r   rn  Nr   z:aot_export_joint_simple does not support input mutations. zDaot_export_joint_simple does not support outputs that alias inputs. zKaot_export_joint_simple requires inputs to be a single list/tuple. in_spec=c              3   <   K   | ]  }|j                           y wr   is_leafr   childs     r   r   z*aot_export_joint_simple.<locals>.<genexpr>0  s     C5u}}C   zNaot_export_joint_simple requires individual inputs not to be pytrees. in_spec=zMaot_export_joint_simple requires outputs to be a single list/tuple. out_spec=c              3   <   K   | ]  }|j                           y wr   r  r  s     r   r   z*aot_export_joint_simple.<locals>.<genexpr>8  s     D5u}}Dr  zPaot_export_joint_simple requires individual outputs not to be pytrees. out_spec=)num_fwd_outputs)r   r   r  r  children_specsr   r   r   r   r   r   r   r   r_   	non_aliasr  allr   debug_assertr{   rw   r   output_infosr   r   )r   r   rn  r   r   r   rY  r  r9  r   _kw_in_specr   	fw_module
    """
    if trace_joint:
        ctx = nullcontext
    else:
        ctx = torch.no_grad

    with ctx():
        fx_g, metadata, in_spec, out_spec = _aot_export_function(
            func,
            args,
            num_params_buffers=num_params_buffers,
            decompositions=decompositions,
            trace_joint=trace_joint,
        )
        in_spec, _kw_in_spec = in_spec.children_specs

    # At this point we can directly return the (joint or inference) graph that
    # we traced. First though: a bunch of assertions to make sure that our
    # graph doesn't require any calling convention changes compared to the
    # original function.
    if (
        len([x for x in metadata.input_info if x.mutates_data or x.mutates_metadata])
        != 0
    ):
        raise RuntimeError(
            f"aot_export_joint_simple does not support input mutations. {str(metadata)}"
        )
    if (
        len([x for x in metadata.output_info if x.output_type != OutputType.non_alias])
        != 0
    ):
        raise RuntimeError(
            f"aot_export_joint_simple does not support outputs that alias inputs. {str(metadata)}"
        )
    if in_spec.is_leaf():
        raise RuntimeError(
            f"aot_export_joint_simple requires inputs to be a single list/tuple. in_spec={str(in_spec)}"
        )
    if not all(child.is_leaf() for child in in_spec.children_specs):
        raise RuntimeError(
            f"aot_export_joint_simple requires individual inputs not to be pytrees. in_spec={str(in_spec)}"
        )
    if out_spec.is_leaf():
        raise RuntimeError(
            f"aot_export_joint_simple requires outputs to be a single list/tuple. out_spec={str(out_spec)}"
        )
    if not all(child.is_leaf() for child in out_spec.children_specs):
        raise RuntimeError(
            f"aot_export_joint_simple requires individual outputs not to be pytrees. out_spec={str(out_spec)}"
        )

    if config.debug_assert:
        # Smoke test that the partitioned forward runs with the original
        # user inputs, i.e. with no calling convention changes.
        fw_module, _bw_module = default_partition(
            fx_g, args, num_fwd_outputs=len(metadata.output_info)
        )
        fake_mode = detect_fake_mode(args)
        if fake_mode is None:
            fake_mode = FakeTensorMode()
        with fake_mode:
            fw_module(*args)
    return fx_g


def _aot_export_function(
    func: Callable,
    args,
    *,
    num_params_buffers: int = 0,
    decompositions: Optional[dict] = None,
    # If we're exporting a joint graph and we don't want any tangent inputs in
    # the graph (because we are backpropping through a scalar loss instead of
    # a vector of tangents), set no_tangents=True.
    no_tangents: bool = False,
    pre_dispatch: bool = False,
    dynamic_shapes: Optional[bool] = None,
    kwargs=None,
    trace_joint: bool = False,
) -> tuple[torch.fx.GraphModule, ViewAndMutationMeta, pytree.TreeSpec, pytree.TreeSpec]:
    kwargs = kwargs or {}

    flat_fn, out_spec = create_tree_flattened_fn(func, args, kwargs)
    flat_args, in_spec = pytree.tree_flatten((args, kwargs))

    if dynamic_shapes is None:
        # Try to infer `dynamic_shapes` from the inputs or, if `func` wraps a
        # GraphModule produced by Dynamo, from example values recorded on the
        # graph nodes. The inferred result might be wrong.
        fake_mode = detect_fake_mode(flat_args)
        if fake_mode is None and hasattr(func, "_orig_mod"):
            if isinstance(func._orig_mod, torch.fx.GraphModule):
                vals = [
                    node.meta["val"]
                    for node in func._orig_mod.graph.nodes
                    if "val" in node.meta
                ]
                fake_mode = detect_fake_mode(vals)
        dynamic_shapes = fake_mode is not None and fake_mode.shape_env is not None

    aot_config = AOTConfig(
        fw_compiler=None,
        bw_compiler=None,
        inference_compiler=None,
        partition_fn=None,
        decompositions=decompositions,
        num_params_buffers=num_params_buffers,
        aot_id=next(AOT_COUNTER),
        keep_inference_input_mutations=False,
        dynamic_shapes=dynamic_shapes,
        aot_autograd_arg_pos_to_source=None,
        is_export=True,
        no_tangents=no_tangents,
        pre_dispatch=pre_dispatch,
        export_trace_joint=trace_joint,
    )

    fake_mode, shape_env = construct_fake_mode(flat_args, aot_config)
    fake_flat_args = process_inputs(flat_args, aot_config, fake_mode, shape_env)
    flat_args_descs = [PlainAOTInput(i) for i in range(len(fake_flat_args))]

    with contextlib.ExitStack() as stack:
        aot_state = create_aot_state(
            stack,
            flat_fn,
            fake_flat_args,
            flat_args_descs,
            aot_config,
            fake_mode,
            shape_env,
        )
        aot_graph_capture = aot_stage1_graph_capture(aot_state, flat_fn)
        fx_g, meta = aot_stage2_export(aot_state, aot_graph_capture)

    return fx_g, meta, in_spec, out_spec.spec


compiled_function = aot_function
compiled_module = aot_module