
    i                    &   S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKJrJr  S SKJrJrJr  S SKJr  S SKJrJr  S SKJr  S SKJr  S SKJrJrJr  S S	K J!r!  S S
K"J#r#  S SK$J%r%  S SK&r&S SK'J(r(J)r)  S SK*J+r+J,r,J-r-J.r.J/r/J0r0  SSK1J2r2J3r3J4r4J5r5J6r6  SSK7J8r8J9r9  SSK1J:r:  \8" 5       (       a  S SK;r;S SK&J<r<J=r=J>r>J?r?  SSK@JArAJBrB  \," 5       (       a  S SKCrC\0" 5       (       a  S SKDJErE  \." 5       =(       a#    \+" 5       =(       a    \/" 5       =(       a    \-" 5       rF\F(       a  S SKGrGS SKHJIrIJJrJ  S SKKJLrL  S SKMJNrNJOrO  S SKPJQrQ  S SKRJSrS  S SKTJUrU  S SKVJWrWJXrXJYrYJZrZJ[r[  S SK\J]r]  S SK^J_r_J`r`JaraJbrbJcrcJdrdJereJfrfJgrgJhrhJiriJjrjJkrkJlrlJmrm  S SKnJoro  S S KpJqrqJrrrJsrs   " S! S"\oS#S$9rt " S% S&\]S#S$9ru " S' S(\SS#S$9rv\r" \t5      rw\r" \u5      rx\r" \v5      ry1 S)krz1 S*kr{1 S+kr|\9R                  " \~5      rS,S-S.S/.0r\" \GR                  5       5      rS0r " S1 S2\GR
                  5      rS3\4S4 jrS5\S6S7S8S74S9 jr " S: S;5      r " S< S=5      r\ " S> S?5      5       r " S@ SA\:5      r\~SB:X  a  \" 5       r\GR                  5         gg)C    N)ArgumentParser	Namespace)AsyncGenerator	GeneratorIterable)asynccontextmanager)	dataclassfield)BytesIO)Thread)Optional	TypedDictUnion)
model_info)HF_HUB_OFFLINE)DecodeStream)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_availableis_vision_available   )
AutoConfigLogitsProcessorListPreTrainedTokenizerFastProcessorMixinTextIteratorStreamer)is_torch_availablelogging   )BaseTransformersCLICommand)AutoProcessorBitsAndBytesConfigGenerationConfigPreTrainedModel)ContinuousBatchingManagerRequestStatus)Image)FastAPIHTTPException)CORSMiddleware)JSONResponseStreamingResponse)Transcription)TranscriptionCreateParamsBase)ChatCompletionMessageParam)ChatCompletionChunkChoiceChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEventResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming)	BaseModelTypeAdapterValidationErrorc                   $    \ rS rSr% Sr\\S'   Srg))TransformersResponseCreateParamsStreaming{   zo
OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string).
generation_config N__name__
__module____qualname____firstlineno____doc__str__annotations____static_attributes__rQ       m/home/dmtnaga/Documents/work/airagagent/rag_env/lib/python3.13/site-packages/transformers/commands/serving.pyrN   rN   {       	 r[   rN   F)totalc                   $    \ rS rSr% Sr\\S'   Srg)+TransformersCompletionCreateParamsStreaming   z
OpenAI's CompletionCreateParamsStreaming with additional fields for the generation config (as a json string) and passing the request_id
rP   rQ   NrR   rQ   r[   r\   r`   r`      r]   r[   r`   c                   <    \ rS rSr% Sr\\S'   \\S'   Sr\	\S'   Sr
g)	%TransformersTranscriptionCreateParams   zo
OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string).
filerP   FstreamrQ   N)rS   rT   rU   rV   rW   bytesrY   rX   rf   boolrZ   rQ   r[   r\   rc   rc      s    	 r[   rc   >   textuserstorepromptinclude	reasoning
background
truncationtool_choiceservice_tiertop_logprobsmax_tool_callsprevious_response_id>   nstoprj   audiork   logprobsmetadata	functions
modalities
predictionrq   rr   rs   function_callstream_optionsresponse_formatpresence_penaltyreasoning_effortweb_search_optionsparallel_tool_callsmax_completion_tokens>   rl   rm   languager   chunking_strategytimestamp_granularitiesqwenz<tool_call>z</tool_call>)startendzx-request-idc                   $    \ rS rSrSrSrSrSrSrg)Modality   LLMVLMSTTTTSrQ   N)	rS   rT   rU   rV   r   r   r   r   rZ   rQ   r[   r\   r   r      s    
C
C
C
Cr[   r   argsc                     [        U 5      $ )zr
Factory function used to instantiate serving server from provided command line arguments.

Returns: ServeCommand
)ServeCommand)r   s    r\   serve_command_factoryr      s     r[   reqmodel_generation_configr'   returnc                 ~   U R                  S5      b#  [        S0 [        R                  " U S   5      D6nO[        R
                  " U5      nUR                  " S0 UD6nUR                  5        H  u  pVUc  M
  [        X5U5        M     U R                  S5      b  [        U S   5      Ul
        U R                  S5      b  [        U S   5      Ul
        U R                  S5      b  [        U S   5      Ul        U R                  S5      b
  U S   Ul        U R                  S5      b
  U S   Ul        U R                  S5      b,  [        U S   5      Ul        [        U S   5      S:X  a  S	Ul        U R                  S
5      b  [        U S
   5      Ul        U R                  S5      b  [$        R&                  " U S   5        U$ )ax  
Creates a generation config from the parameters of the request. If a generation config is passed in the request,
it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
Other parameters in the request will be applied on top of the baseline.

Args:
    req (`dict`):
        The request which may optionally contain generation parameters.
    model_generation_config (`GenerationConfig`):
        The model's default generation config.
    kwargs (`dict`):
        Additional parameters to set in the generation config.

Returns:
    The prepared `GenerationConfig` object.
rP   max_output_tokens
max_tokensfrequency_penalty
logit_biasrw   temperatureg        Ftop_pseedrQ   )getr'   jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatrepetition_penaltysequence_biasstop_stringsr   	do_sampler   torchmanual_seed)r   r   kwargsrP   non_standard_kwargskvs          r\   !create_generation_config_from_reqr      s   . ww"#/,Ttzz#>Q:R/ST MM*AB+22<V<#))+=%!, ,
 ww"#/+.s3F/G+H( ww|(+.s</@+A(
ww"#//4S9L5M/N,
ww|(*-l*;'
wwv"),V&
ww})(-c-.@(A%]#$+*/'
www#"'G"5
wwv"#f+&r[   c                   $    \ rS rSrSrS rS rSrg)	ToolStatei$  z7Lightweight class to keep track of the tool call state.c                 $    U R                  5         g N)resetselfs    r\   __init__ToolState.__init__'  s    

r[   c                 <    SU l         SU l        SU l        SU l        g)z>Reset the tool call state (assumes we're outside a tool call).Fr    N)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferr   s    r\   r   ToolState.reset*  s!     %%*"!"r[   )r   r   r   r   N)rS   rT   rU   rV   rW   r   r   rZ   rQ   r[   r\   r   r   $  s    Ar[   r   c            	       X    \ rS rSrSr SSSS\S\\S      4S	 jjrS
 r	S r
S rS rSrg)
TimedModeli2  z
A class that holds a PreTrainedModel instance and its associated processor.
Automatically deletes the instances after a specified timeout.
Nmodelr(   timeout_seconds	processor)r   r   c                     Xl         [        UR                  5      U l        X0l        X l        [        R                  " U R
                  U R                  5      U l	        U R                  R                  5         g r   )r   rX   name_or_path_name_or_pathr   r   	threadingTimertimeout_reached_timerr   )r   r   r   r   s       r\   r   TimedModel.__init__8  sU     
 !3!34".ood&:&:D<P<PQr[   c                     U R                   R                  5         [        R                  " U R                  U R
                  5      U l         U R                   R                  5         g)z2Reset the timer for the deletion of the instances.N)r   cancelr   r   r   r   r   r   s    r\   reset_timerTimedModel.reset_timerE  s@    ood&:&:D<P<PQr[   c                 H   [        U S5      (       a  U R                  b  U ?U ?SU l        SU l        [        R                  " 5         [
        R                  R                  5       (       a  [
        R                  R                  5         U R                  R                  5         ggg)z>Delete the wrapped model and processor and clean up resources.r   N)hasattrr   r   gccollectr   cudais_availableempty_cacher   r   r   s    r\   delete_modelTimedModel.delete_modelK  sx    4!!djj&<
DJ!DNJJL zz&&((

&&( KK  '=!r[   c                     U R                  5         [        R                  U R                   SU R                   S35        g )Nz was removed from memory after z seconds of inactivity)r   loggerinfor   r   r   s    r\   r   TimedModel.timeout_reached[  s7    t))**I$J^J^I__uvwr[   c                 N    [        U S5      (       + =(       d    U R                  SL $ )z)Check if the instances have been deleted.r   N)r   r   r   s    r\   
is_deletedTimedModel.is_deleted_  s     4))?TZZ4-??r[   )r   r   r   r   r   r   )rS   rT   rU   rV   rW   r   r   r   r   r   r   r   r   rZ   rQ   r[   r\   r   r   2  sP     SW	   E"MNO	! x@r[   r   c                   H   \ rS rSr% Sr\" SSS0S9r\\S'   \" SSS	0S9r	\
\S
'   \" SS/ SQS.S9r\\
   \S'   \" SS/ SQS.S9r\\
   \S'   \" SSS0S9r\\S'   \" SSS0S9r\\
   \S'   \" SSS0S9r\\S'   \" SSS0S9r\\S'   \" SSSS/S.S9r\
\S'   \" SSS0S9r\\S'   \" S SS!0S9r\
\S"'   \" S#SS$0S9r\\S%'   \" S&SS'0S9r\\S('   \" S)SS*0S9r\
\S+'   \" SSS,0S9r\\   \S-'   \" SSS.0S9r\\S/'   \" SSS00S9r\\S1'   \" SSS20S9r\\
   \S3'   S4 rS5rg)6ServeArgumentsid  z
Arguments for the serve CLI.

See the metadata arg for each argument's description -- the metadata will be printed with
`transformers serve --help`
Fhelpz8Whether to use continuous batching for chat completions.)defaultrz   continuous_batchingautozfDevice to use for inference; will default to `auto` andplace the model on an accelerator if available.deviceNzA`torch_dtype` is deprecated! Please use `dtype` argument instead.)r   bfloat16float16float32)r   choicestorch_dtypezOverride the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.dtypez2Whether to trust remote code when loading a model.trust_remote_codezWhich attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.attn_implementationzIWhether to use 8 bit precision for the base model - works only with LoRA.load_in_8bitzIWhether to use 4 bit precision for the base model - works only with LoRA.load_in_4bitnf4zQuantization type.fp4bnb_4bit_quant_typez#Whether to use nested quantization.use_bnb_nested_quant	localhostz$Interface the server will listen to.hosti@  zPort the server will listen to.porti,  z@Time in seconds after which a model will be removed from memory.model_timeoutr   z8Logging level as a string. Example: 'info' or 'warning'.	log_levelz1The default seed for torch, should be an integer.default_seedztWhether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled.enable_corsz+Whether to turn on strict input validation.input_validationzName of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request.force_modelc                     U R                   b`  U R                  c  U R                   U l        gU R                   U R                  :w  a&  [        SU R                    SU R                   S35      egg)z(Only used for BC `torch_dtype` argument.Nz`torch_dtype` z and `dtype` zn have different values. `torch_dtype` is deprecated and will be removed in 4.59.0, please set `dtype` instead.)r   r   
ValueErrorr   s    r\   __post_init__ServeArguments.__post_init__  su     'zz!!--
!!TZZ/ $T%5%5$6mDJJ< PM M  0 (r[   )r   )rS   rT   rU   rV   rW   r
   r   rh   rY   r   rX   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r	  rZ   rQ   r[   r\   r   r   d  sT    !&TU!   >
FC  "'WA
"K#  !PA
E8C=  $)] ^t  */ r
*#  efL$  efL$   %UFZhmotgu=vww!&uHm?n!o$o kV=c4deD#edf6W-XYD#Y\]M3  &*d!eIs  #(([\#L(3-  &
K  #B
d  "'2
"K# 
r[   r   c                   ~   \ rS rSr\S\4S j5       rS\4S jrS\	S\
SS	S
\4S jrS\	4S jrS\	4S jrS\	4S jr        S3S\S\\   S\\   S\\   S\\   S\\S      S\\   S\\   S\4S jjrSSS\4S jrS r\R4                  S\\	\\4      4S j5       rS \	S\S\\S4   4S! jr\SS"S\4S# j5       r \S$\4S% j5       r!S \	S\"\SS4   4S& jr#S \	S\"\SS4   4S' jr$S \	S\"\SS4   4S( jr%S \	S\&4S) jr'\S\S\S*   4S+ j5       r(S,\S\4S- jr)S.\4S/ jr*S.\S\+S"\4   4S0 jr,S.\S\+S"\-4   4S1 jr.S2r/g)4r   i  parserc                 X    [         4nU R                  SUS9nUR                  [        S9  g)z
Register this command to argparse so it's available for the transformer-cli

Args:
    parser: Root parser to register command-specific arguments
serve)dataclass_types)funcN)r   
add_parserset_defaultsr   )r  r  serve_parsers      r\   register_subcommand ServeCommand.register_subcommand  s3     *+((/(R!!'<!=r[   r   c           	      R   [         (       d  [        S5      eXl        U R                  R                  U l        U R                  (       a  [
        R                  " 5       nU R                  R                  c(  X R                  l        [        R                  SU 35        [
        R                  " 5       nU R                  R                  U;  a)  [        SU SU R                  R                   SU S35      eU R                  R                  U l        U R                  R                  b*  [        R                  " U R                  R                  5        [         R"                  " S5      nUR%                  [         R&                  U R                  R(                  R+                  5          5        [         R"                  " S5      nUR%                  [         R&                  U R                  R(                  R+                  5          5        0 U l        S U l        S U l        S U l        S U l        g )	NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`z-No attn_implementation passed, defaulting to z"Continuous batching only supports z as attn_implementation, got z#Try setting `--attn_implementation=`transformersz+transformers.generation.continuous_batching)serve_dependencies_availableImportErrorr   r   use_continuous_batchingr)    default_attention_implementationr   r   r   #supported_attention_implementationsr  r  r  r   r   r"   
get_loggersetLevel
log_levelsr  lowerloaded_models#running_continuous_batching_managerlast_messageslast_kv_cache
last_model)r   r   default_attn_implsupported_attn_impltransformers_logger	cb_loggers         r\   r   ServeCommand.__init__  s   ++s 
 	'+yy'D'D$'' 9 Z Z \yy,,40A		-KL]K^_`";"_"_"ayy,,4GG 89L8MMjyy4459:K9LAO 
  990099!!-dii445 &00@$$W%7%7		8K8K8Q8Q8S%TU&&'TU	7--dii.A.A.G.G.IJK 57X\0 "!r[   requestschema	validatorrK   unused_fieldsc                 :   [         R                  SU 35        [        UR                  5       5      nUR                  nXV-
  nU(       a%  [         R                  SU 35        [        SSU 3S9eU R                  R                  (       aC   UR                  U5        XT-  n	U	(       a%  [         R                  SU	 35        [        SSU	 3S9egg! [         aC  n[         R                  SUR                  5        35        [        SUR                  5       S9eSnAff = f)a!  
Validates the request against the schema, and checks for unexpected keys.

Args:
    request (`dict`):
        The request to validate.
    schema (`TypedDict`):
        The schema of the request to validate. It is a `TypedDict` definition.
    validator (`TypeAdapter`):
        The validator to use to validate the request. Built from `schema`.
    unused_fields (`set`):
        Fields accepted by `schema`, but not used in `transformers serve`.

Raises:
    HTTPException: If the request is invalid or contains unexpected or unused fields.
zValidating request: z Unexpected keys in the request: i  )status_codedetailzValidation error: NzUnused fields in the request: )r   debugsetkeys__mutable_keys__errorr-   r   r  validate_pythonrL   errors)
r   r,  r-  r.  r/  
input_keyspossible_keysunexpected_keyseunused_fields_in_requests
             r\   _validate_requestServeCommand._validate_request  s   . 	+G956 (
//$4LL;O;LMNC:Z[jZk8lmm99%%H))'2 (2'A$'=>V=WXY# #.LMeLf,g  ( & # H1!((*>?#AHHJGGHs   
C 
D>DDc                 @    U R                  U[        [        [        S9  g N)r,  r-  r.  r/  )r?  rN   response_validatorUNUSED_RESPONSE_FIELDSr   r,  s     r\   validate_response_request&ServeCommand.validate_response_requestA  s!    <(0	 	 	
r[   c                 @    U R                  U[        [        [        S9  g rB  )r?  r`   completion_validatorUNUSED_CHAT_COMPLETION_FIELDSrE  s     r\    validate_chat_completion_request-ServeCommand.validate_chat_completion_requestI  s!    >*7	 	 	
r[   c                 @    U R                  U[        [        [        S9  g rB  )r?  rc   transcription_validatorUNUSED_TRANSCRIPTION_FIELDSrE  s     r\   validate_transcription_request+ServeCommand.validate_transcription_requestQ  s!    8-5	 	 	
r[   N
request_idcontentr   rolefinish_reason
tool_callsr7   decode_stream	tokenizerr   c	                     Ub"  Ub  Ub  UR                  UR                  U5      n[        U[        [        R                  " 5       5      U[        [        UUUS9SUS9/SSS9n	SU	R                  SS	9 S
3$ )aO  
Builds a chunk of a streaming OpenAI Chat Completion response.

IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
like Cursor, assume that when the field exists, it has data.

Args:
    request_id (`str`):
        The request ID.
    content (`str`, *optional*):
        Content of the response from the model.
    model (`str`, *optional*):
        The model that generated the content.
    role (`str`, *optional*):
        The role of the next content, until a new role is defined.
    finish_reason (`str`, *optional*):
        The reason the generation by the model has finished.
    tool_calls (`list[ChoiceDeltaToolCall]`, *optional*):
        Data about the tool calls, when they are triggered.

Returns:
    `str`: The built chunk, a string containing a JSON string with the payload.
)rS  rT  rV  r   )deltaindexrU  r   zchat.completion.chunk)idcreatedr   r   system_fingerprintobjectdata: Texclude_none

)step
_tokenizerr4   r   timer5   r6   model_dump_json)
r   rR  rS  r   rT  rU  rV  rW  rX  chunks
             r\   build_chat_completion_chunk(ServeCommand.build_chat_completion_chunkY  s    D $)<AV#(()=)=wGG#		$% '!#-
 "/
  "*!
$ --4-@AFFr[   responserJ   c                 (    SUR                  SS9 S3$ )a  
Builds a event of a streaming OpenAI Response response.

IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
like Cursor, assume that when the field exists, it has data.

Args:
    response (`BaseModel`):
        The response to build an event from. One of the multiple OpenAI Response output types

Returns:
    `str`: The built chunk, a string containing a JSON string with the payload.
r`  Tra  rc  )rg  )r   rk  s     r\   build_response_event!ServeCommand.build_response_event  s"     00d0CDDIIr[   c                    ^  [         S[        4U 4S jj5       n[        US9nT R                  (       a/  UR                  [        S/SS/S/S9  [
        R                  S5        SS	KJn  UR                  S
5      SUS[        4U 4S jj5       nUR                  S5      S[        4U 4S jj5       nUR                  S5      SU4U 4S jj5       nUR                  S5      UR                  S5      U 4S j5       5       nUR                  S5      S 5       nUR                  S5      SU4S j5       n	[        R                  " UT R                   R"                  T R                   R$                  T R                   R&                  S9  g)a  
Setup and run the FastAPI server for transformers serve.

Models will be loaded and unloaded automatically based on usage and a timeout.

The server will expose the following endpoints:
- POST /v1/chat/completions: Generates chat completions.
- POST /v1/responses: Generates responses.
- POST /v1/audio/transcriptions: Generates transcriptions from audio.
- GET /v1/models: Lists available models for 3rd party tools.

Requires FastAPI and Uvicorn to be installed.
appc                   >#    S 7v   TR                   R                  5        H  nUR                  5         M     TR                  b  TR                  R	                  SSS9  g g 7f)NT   blocktimeout)r"  valuesr   r#  rw   )rp  r   r   s     r\   lifespan"ServeCommand.run.<locals>.lifespan  sZ     ++224""$ 577C88==DRS=T Ds   A A#)rw  *T)allow_originsallow_credentialsallow_methodsallow_headerszUCORS allow origin is set to `*`. This is not recommended for production environments.r   )Requestz/v1/chat/completionsr,  bodyc                    > TR                  US9  TR                  (       a&  TR                  XR                  R                  5      nOTR                  U5      n[        USS9$ Nr,  text/event-stream
media_type)rK  r  #continuous_batching_chat_completionstaterR  generate_chat_completionr0   )r,  r  outputr   s      r\   chat_completion)ServeCommand.run.<locals>.chat_completion  sU    11$1?++AA$H`H`a66t<$V8KLLr[   z/v1/responsesc                 X   > TR                  U S9  TR                  U 5      n[        USS9$ r  )rF  generate_responser0   )r,  r  r   s     r\   	responses#ServeCommand.run.<locals>.responses  s2    **7*;++G4F$V8KLLr[   z/v1/audio/transcriptionsc           
        >#    U R                  5        IS h  vN n[        US   R                  5       I S h  vN US   S9n[        R	                  SUS   R
                   SUS   R                   SUS   R                  S-  S S	35        S S S 5      IS h  vN   TR                  WS
9  TR                  U5      n[        USS9$  N N N2! , IS h  vN  (       d  f       NG= f7f)Nre   r   )re   r   zReceived file: z; MIME type: z; size:    z.2fz KiBr  r  r  )formrc   readr   r3  filenamecontent_typesizerP  generate_transcriptionr0   )r,  r  parsed_requestr  r   s       r\   audio_transcriptions.ServeCommand.run.<locals>.audio_transcriptions  s      ||~~!F#F|0022w-"
 %d6l&;&;%<M$v,JcJcId e!&\..5c:$@ &~ ///G00@F$V8KLL &2 &~~~sU   C'CC'CC	
ACC'C/C'	CC'C$CC$ C'z
/v1/modelsc                  <   > [        ST R                  5       S.5      $ )Nlist)r_  data)r/   get_gen_modelsr   s   r\   get_all_models(ServeCommand.run.<locals>.get_all_models  s      64;N;N;P QRRr[   z/healthc                      [        SS05      $ )Nstatusok)r/   rQ   r[   r\   healthcheck%ServeCommand.run.<locals>.healthcheck  s    4 011r[   httpc                    #    U R                   R                  [        5      =(       d    [        [        R
                  " 5       5      nX R                  l        U" U 5      I S h  vN nX#R                   [        '   U$  N7fr   )headersr   X_REQUEST_IDrX   uuiduuid4r  rR  )r,  	call_nextrR  rk  s       r\   get_or_set_request_id/ServeCommand.run.<locals>.get_or_set_request_id  sX      ,,\:Oc$**,>OJ'1MM$&w//H-7\*O 0s   A A="A;#A=)r   r   r  N)r   r,   r  add_middlewarer.   r   warning_oncefastapir~  postdictoptionsr   
middlewareuvicornrunr   r   r   r  )
r   rw  rp  r~  r  r  r  r  r  r  s
   `         r\   r  ServeCommand.run  s    
	U 	U 
	U x( "e"&"e"e   g 	$	(	)	MW 	MD 	M 
*	M 
/	"	Mt 	M 
#	M 
,	-	M 	M 
.	M" 
\	"			S 
 
#	S 
		2 
	2 
		 	 
 	 	Cdiinn499>>TYYM`M`ar[   c           	         / SQn[         (       aT  U Vs/ s HF  nUS[        R                  R                  5       R                  5       UR	                  S5      S   S.PMH     sn$ U Vs/ s H  n[        U5      PM     nnU Vs/ s H6  nUR                  SUR                  R                  5       UR                  S.PM8     sn$ s  snf s  snf s  snf )a  
This is by no means a limit to which models may be instantiated with `transformers serve`: any chat-based
model working with generate can work.

This is a limited list of models to ensure we have a discoverable /v1/models endpoint for third-party
integrations.
)zMenlo/Jan-nanozMenlo/Jan-nano-128kzQwen/Qwen2.5-0.5B-InstructzQwen/Qwen2.5-3B-InstructzQwen/Qwen2.5-7B-InstructzQwen/Qwen2.5-14B-Instructz meta-llama/Llama-3.1-8B-Instructz meta-llama/Llama-3.2-1B-Instructz!meta-llama/Llama-3.3-70B-InstructzHuggingFaceTB/SmolVLM-Instructz!ibm-granite/granite-vision-3.2-2bzQwen/Qwen2.5-VL-7B-Instructr   /r   )r\  r_  r]  owned_by)	r   datetimenow	timestampsplitr   r\  
created_atauthor)r   modelsr   model_infoss       r\   r  ServeCommand.get_gen_models   s    
 > $ $E  %'00446@@B %C 0 3	 $  ;AA&:e,&KA ) )E  ((%$//99; %	 )  Bs   AC(C=Cr   c           
        ^ ^^	^
^ T R                  US   5      m	T	T R                  :g  nT	T l        U(       a.  T R                  b!  T R                  R                  SSS9  ST l        T R	                  T	5      u  pE[        US5      (       a  UR                  OUm[        UUR                  TR                  TR                  SSSS	9mT R                  cH  UR                  TSS
9T l        [        5       T R                  l        T R                  R                  5         UR                  US   SSS9R!                  UR"                  5      nU	U U4S jm
UU U
4S jnU" US   U5      $ )z
Generates an OpenAI Chat Completion using continuous batching.

Args:
    req (`dict`): The request to generate an OpenAI Chat Completion for.

Returns:
    `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
r   NTr   rs  rX  Ffifo)r   eos_token_idpad_token_id	use_cacher   	scheduler)rP   	streamingmessagespt)return_tensorsadd_generation_promptc           	   3     >#     TR                  U STS9v   TR                  R                  U 5       HX  nUR                  [        R
                  :X  a  TR                  U STS9v     g TR                  U UR                  S   TUTS9v   MZ     g ! [         aT  n[        R                  [        U5      5        TR                  R                  U 5        S[        U5       S3v    S nAg S nAff = f7f)	N	assistantrT  r   rw   rU  r   )rR  rS  r   rW  rX  data: {"error": ""})ri  r#  request_id_iterr  r*   FINISHEDgenerated_tokens	Exceptionr   r7  rX   cancel_request)rR  rW  resultr=  model_id_and_revisionr   rX  s       r\   stream_chat_completionPServeCommand.continuous_batching_chat_completion.<locals>.stream_chat_completion^  s     7 66z[p6qq"FFVVWabF}}(>(>>">>&*0"7 ?  
 ">>'1$*$;$;B$?"7*7&/ ?   c"  7SV$88GG
S*3q6(#667s<   C1A#B (C1)&B C1
C.A
C)$C1)C..C1c                  >#     [        U R                  5       S5      nTR                  R                  XTR                  S9nT" X5       H&  nU7v   [
        R                  " S5      I S h  vN   M(     g  N	! [
        R                   a7    TR                  R                  U5        [        R                  SU S35         g f = f7f)NF)rR  r   r   zRequest z was cancelled.)r   tolistr#  add_requestr   asynciosleepCancelledErrorr  r   warning)_inputsrR  rW  rh  rP   r   r  s       r\   cancellation_wrapperNServeCommand.continuous_batching_chat_completion.<locals>.cancellation_wrapperz  s     G ,W^^-=u E!EEQQCTCcCc R 
 4JNEK!--*** O*)) G88GG
S*_EFGs<   CA'A8 ,A6-A8 5C6A8 8AC CCCr   )process_model_namer&  r#  rw   load_model_and_processorr   rX  r   rP   r  r  init_continuous_batchingr   logit_processorr   apply_chat_templatetor   )r   r   rR  must_discard_cacher   r   inputsr  rP   r  r  rX  s   `       @@@@r\   r  0ServeCommand.continuous_batching_chat_completion.  sb    !% 7 7G E2dooE/77C88==DRS=T;?8889NO+29k+J+JI''PY	=$)$;$;"//"//
 33;7<7U7U"3t 8V 8D4 H[G\D44D44::< ..s:tko.pssLL
	78	G $F1Iz::r[   r(   c                     U R                   R                  nU[        R                  " 5       ;   a  [        R
                  nU$ U[        R                  " 5       ;   a  [        R                  nU$ [        SU 35      e)NzUnknown modality: )		__class__rS   r   rv  r   r   r   r   r  )r   model_classnamemodalitys      r\   get_model_modalityServeCommand.get_model_modality  sm    //22HOOQQ||H   A H H JJ||H  1/1BCDDr[   r  c           	         / nU  GH  nUS   / S.nU[         R                  :X  a{  [        US   [        5      (       a  US   nOV[        US   [        5      (       a>  / nUS    H"  nUS   S:X  d  M  UR                  US   5        M$     SR                  U5      nWUS'   GO7U[         R                  :X  Ga"  [        US   [        5      (       a  US   R                  SUS   S.5        OUS    H  nUS   S:X  a  US   R                  U5        M"  US   S:X  d  M-  S	US   S
   ;   a  [        R                  " SSUS   S
   5      n[        R                  " [        [        R                  " U5      5      5      n[        R                   " SSS9n	U	R"                  n
UR%                  U	R"                  5        OUS   S
   n
US   R                  SU
S.5        M     UR                  U5        GM     U$ )NrT  rT  rS  rS  typeri    )r  ri   	image_urlbase64urlz^   z.pngF)suffixdeleteimage)r  r  )r   r   
isinstancerX   r  appendjoinr   resubr+   openr   r  	b64decodetempfileNamedTemporaryFilenamesave)r  r  processor_inputsmessageparsed_messageparsed_contentrS  
image_datar  re   r  s              r\   *get_processor_inputs_from_inbound_messages7ServeCommand.get_processor_inputs_from_inbound_messages  s   G&-fo"EN8<<' gi0#66%,Y%7N	 2D99%'N#*9#5"6?f4*11'&/B $6 &)XXn%=N,:y)X\\) gi0#66"9-44fgV_N`5ab#*9#5"6?f4*95<<WE$V_;'7;+?+FF-/VV4LbRYZeRfglRm-n
(-

76;K;KJ;W3X(Y'/'B'B&Y^'_&*ii %

499 5&-k&:5&A*95<<gVY=Z[ $6  ##N3O  P  r[   c           	        ^ ^^^^^ T R                   R                  b  T R                   R                  US'   US   nUS   S   S:X  a  gT R                  US   5      mTT R                  :g  nTT l        T R	                  T5      u  mnT R                  T5      nT R                  X%5      nSm[         H2  nUTR                  R                  S   R                  5       ;   d  M0  Um  O   UR                  USUR                  S	5      S
SSS9nUR                  TR                  5      nUR                  SS5      mSn	STR                  R                  S   R                  5       ;   a  Sn	[        UU	SS9n
[!        UTR"                  S9nSnT R%                  U5      (       aC  U(       d<  T R&                  R)                  5       nUS   R*                  S   U:  a  T R&                  n0 UEU
USUS.EmUUUUU U4S jnU" U
T5      $ )z
Generates an OpenAI Chat Completion using `generate`.

Args:
    req (`dict`): The request to generate an OpenAI Chat Completion for.

Returns:
    `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
Nr   r  r  rT  r  r   Ttoolsr  )r  r  r  return_dicttokenizerR  req_0gptossFskip_special_tokensskip_promptr   	input_ids)streamerrP   return_dict_in_generatepast_key_valuesc              3     >#    SnS nSTR                   R                  S   R                  5       ;   a  SnSnUU4S jn[        UTS9nSn UR	                  5         [        5       nTR                  TS	TS
9v   U  GH2  nSTR                   R                  S   R                  5       ;   a  UR                  S5      nXh-  nU(       a  X6;   a  SnMT  MV  TGb  UR                  5       [        T   S   :X  a	  SUl
        M  UR                  5       [        T   S   :X  a&  UR                  5         TR                  US STS9v   M  UR                  (       Ga@  U=R                  U-  sl        UR                  (       dV  [        R                  " SUR                  5      n	U	c  GM$  U	R!                  S5      n	SUl        [#        [%        U	S9SSUS-   S9n
OUS:X  a  GM\  SUR                  ;  a  GMo  U=R&                  UR)                  S5      -  sl        U=R&                  UR)                  S5      -  sl        UR&                  S:  a&  SR+                  UR-                  S5      S S 5      S-   n[#        [%        US9SSS9n
TR                  US U
/TS9v   GM  US:w  d  GM   TR                  XTS9v   GM5     TR                  USTS9v   UR+                  5         UR+                  5         g ! [.         a9  n[0        R3                  [5        U5      5        S [5        U5       S!3v    S nANOS nAff = f! UR+                  5         f = f7f)"NFr  r   T<|channel|>final<|message|>c                  L   > TR                   " S0 U D6nUR                  Tl        g NrQ   generater"  r%  r   generate_outputr   r   s     r\   generate_with_cachebServeCommand.generate_chat_completion.<locals>.stream_chat_completion.<locals>.generate_with_cache  "    "'..":6":%4%D%D"r[   targetr   r   r  r  
<|return|>r   r   rV  )rR  rT  rU  r   z\"name\": \"(.*?)\"r#   )r  function
_tool_call)r1  r[  r  r\  z"arguments": {{})	arguments)r1  r[  r  )rR  rT  rV  r   )rS  r   rw   r  r  r  )configarchitecturesr!  r   r   r   ri  removesuffixstrip_TOOL_CALL_TOKENSr   r   r   r   r  searchgroupr7   r8   r   countr  r  r  r   r7  rX   )r   _request_id
filter_cotcot_trace_endr+  threadresults
tool_stater  	tool_nametoolr=  generation_kwargsr   r  rR  r   tool_model_familys               r\   r  EServeCommand.generate_chat_completion.<locals>.stream_chat_completion  sC     J M5<<55a8>>@@!
 =E #6?PQFGg&[
 66z[p6qq&F5<<#=#=a#@#F#F#HH!'!4!4\!B%G "(3).J$$ )4!<<>->?P-QRY-ZZ:>J7$ "<<>->?P-QRW-XX&,,."&"B"B+6%).:&;	 #C #  %%666&--7- $.#C#C,.II6LjN_N_,`	#,#4$,090BICG
 @':-Hi-X*+)3'2\'A	(" $*R<$, $4:;L;L#L$, !+ < <S@Q Q < * < <S@Q Q <#-#?#?!#C-/WWV\\#5Fs5K-Ls-RF':-HSY-Z*+)3(" #'"B"B+6Ttf\q #C #  % |">>'?T ?  i 'n 66{RX`u6vv   7SV$*3q6(#667
 sC   AM I
K% :K% M %
L(//L#L+ #L((L+ +L==M )r   r  r  r&  r  r  r  _MODELS_WITH_TOOL_SUPPORTr7  r8  r!  r  r   r  r   r    r   rP   is_continuationr%  get_seq_lengthshape)r   r   r  r  r   r  r  supported_model_familiesr  r  generation_streamerrP   r%  seq_lenr  rG  r   r  rR  rH  s   `              @@@@@r\   r  %ServeCommand.generate_chat_completion  s"    99  ,9900CL9<Z B<;. $ 7 7G E2dooE/889NOy**51JJ8^ !(A$'5<<+E+Ea+H+N+N+PP$<! )B .."&'''" / 
 5<<(WW\73
 #u||11!4::<<"'2 3

 >c[`[r[rs$$-?((779Gk"((,w6 $ 2 2

+!2'+,
x	 x	t &&9:FFr[   c                 t  ^ ^^^^^ T R                  TS   5      mTT R                  :g  nTT l        T R                  T5      u  mn[        TS   [        5      (       a)  ST;   a	  STS   S./O/ nUR                  STS   S.5        O[        TS   [        5      (       a;  ST;   a/  TS   S   S   S:w  a  STS   S./TS   QnO`TS   nTS   US   S	'   OOTS   nOI[        TS   [        5      (       a&  ST;   a	  STS   S./O/ nUR                  TS   5        O[        S
5      eUR                  USSS9nUR                  TR                  5      nTR                  SS5      mSnSTR                  R                  S   R                  5       ;   a  Sn[!        UUSS9n[#        TTR$                  S9nSnT R'                  T5      (       aC  U(       d<  T R(                  R+                  5       n	US   R,                  S   U	:  a  T R(                  nU[.        R0                  " U5      UUSUS.mUUUUUU 4S jn
U
" UT5      $ )z
Generates an OpenAI Response using `generate`.

Args:
    req (`dict`): The request to generate an OpenAI Response for.

Returns:
    `Generator[str, None, None]`: A generator that yields the OpenAI Response events.
r   inputinstructionssystemr  rj   r   rT  rS  z%inputs should be a list, dict, or strTr  )r  r  ru   r  r  Fr  r  Nr  r  )r  attention_maskr   rP   r!  r"  c              3     >#    SnS nSTR                   R                  S   R                  5       ;   a  SnSnUU4S jn[        UTS9nSnSnSn UR	                  5         [
        R
                  " 5       n	[        SU[        S	T 3U	S
TTR                  S5      SSS00S/ / TR                  SS5      STR                  S5      S9S9n
US-  nTR                  U
5      v   [        SU[        S	T 3U	STTR                  S5      SSS00S/ / TR                  SS5      STR                  S5      S9S9nUS-  nTR                  U5      v   [        SUU[        ST 3SSS/ S9S9nUS-  nTR                  U5      v   [        SST 3UUU[        SS / S!9S"9nUS-  nTR                  U5      v   S nU  H  nSTR                   R                  S   R                  5       ;   a  UR                  S#5      nX-  nU(       a  X>;   a  SnS nMU  MW  [!        S$ST 3UUUUS S%S&./S'9nUS-  nTR                  U5      v   M     [#        S(ST 3UUSUS S%S&./S)9nUS-  nTR                  U5      v   [%        S*ST 3UUU[        SUR&                  / S!9S"9nUS-  nUS-  nTR                  U5      v   [)        S+UU[        ST 3SS,SUR*                  // S-9S9nUS-  nUS-  nTR                  U5      v   [-        S.U[        S	T 3U	S,TTR                  S5      SSS00UR.                  /S/ TR                  SS5      STR                  S5      S/9S9nUS-  nTR                  U5      v   UR1                  5         UR1                  5         g ! [2         a  n[4        R7                  S0[9        U5       35        [;        S1U[9        U5      S29nUS-  nTR                  U5      v   [=        S3U[        S	T 3W	S4TTR                  S5      SSS00/ S/ SSTR                  S5      [?        S5[9        U5      S69S79S9nUS-  nTR                  U5      v    S nANS nAff = f! UR1                  5         f = f7f)8NFr  r   Tr$  c                  L   > TR                   " S0 U D6nUR                  Tl        g r&  r'  r)  s     r\   r+  TServeCommand.generate_response.<locals>.stream_response.<locals>.generate_with_cache  r-  r[   r.  zresponse.createdresp_queuedrT  formatr  ri   rk  r   r   rz   )r\  r  r  r   rT  ri   r_  r  r  r   rq   rz   )r  sequence_numberrk  r#   zresponse.in_progressin_progresszresponse.output_item.addedmsg_r  r  )r\  r  r  rT  rS  )r  r]  output_indexitemzresponse.content_part.addedoutput_textr   )r  ri   annotations)r  item_idr]  r`  content_indexpartr0  zresponse.output_text.deltagX@)tokenlogprob)r  rd  r]  r`  re  rZ  ry   zresponse.output_text.done)r  rd  r]  r`  re  ri   ry   zresponse.content_part.donezresponse.output_item.done	completed)r\  r  r  rT  rS  rc  zresponse.completed)r\  r  r  r   rT  ri   r  r_  r  r   rq   rz   z"Exception in response generation: r7  )r  r]  r  zresponse.failedfailedserver_error)coder  )r\  r  r  r   rT  ri   r  r_  r  r   rq   rz   r7  ) r7  r8  r!  r   r   rf  r>   r:   r   rm  rB   rC   rE   r<   rF   r9  rG   rH   r=   ri   rD   rf  r;   ra  r  r  r   r7  rX   r@   rA   r?   )r   r?  r@  rA  r+  rB  r]  r`  re  r  response_createdresponse_in_progressresponse_output_item_addedresponse_content_part_addedrC  r  response_output_text_deltaresponse_output_text_doneresponse_content_part_doneresponse_output_item_doneresponse_completedr=  error_eventresponse_failedrG  r   r  r   rR  r   s                           r\   stream_response7ServeCommand.generate_response.<locals>.stream_response  s     J M5<<55a8>>@@!
 =E #6?PQFOLMM!YY[
 $8+$3%":,/#-'3%(WW^%<&(89) !,/GG4I5,Q$*!$!4$ $  1$//0@AA'>/$3%":,/#-,3%(WW^%<&(89) !,/GG4I5,Q$*!$!4($$  1$//0DEE .J5$3!-.!*.Y}[fpr	.*  1$//0JKK /L6":,/$3!-"/+RUWX/+  1$//0KLL &F5<<#=#=a#@#F#F#HH!'!4!4\!B%G "(3).J&(G$$1G9"&zl 3(7%1&3$,.4"@!A2. $q(O334NOO3 '8 -B4":,/$3!-"# (*t<=-)  1$//0IJJ .J5":,/$3!-"/+E^EcEcqst.*  1$"//0JKK -H4$3!-.!*.&*(!;!@!@ A$&	-)  1$!//0IJJ &<-$3%":,/#-*3%(WW^%<&(89 9 > >?) ,/GG4I5,Q$*!$!4&"$  1$//0BCCJ I  !AA#a&JK0 $3F
  1$//<<"5*$3%":,/#-'3%(WW^%<&(89!) ,1$*!$!4+!/$'F#,  1$//@@C!AH s>   AQ&L!N 0Q&
QB9Q	Q 	QQ Q##Q&)r  r&  r  r  rX   r  r  r  r  r  r  r   r   r7  r8  r!  r    r   rP   rK  r%  rL  rM  r   	ones_like)r   r   r  r   r  r  rO  rP   r%  rP  rx  rG  r   r  rR  s   ``         @@@@r\   r  ServeCommand.generate_response  si    !% 7 7G E2dooE/889NOyc'lC((M[_bMbxC4GHIhjFMM6c'lCDGd++$w<?6*h6'/C<OP`SVW^S_`F \F+.~+>F1Ii(WGd++M[_bMbxC4GHIhjFMM#g,'DEE..vTbf.g5<<(WW3W=
 #u||11!4::<<"'2 3

 >c[`[r[rs$$-?((779Gk"((,w6 $ 2 2 #oof5+!2'+,
`	 `	D 2J??r[   c                   ^
^^^ [        5       (       d  [        S5      eU R                  US   5      nU R                  U5      u  mm[	        TR
                  SSS9n[        UTR                  S9nTR                  R                  n[        R                  " US   5      n[        R                  " XeSS9u  pxT" XuSS	9R                  TR                  5      m
T
S
   R                  TR                   5      T
S
'   UUSS.mU
UUU4S jn	U	" 5       $ )z
Generates an OpenAI Transcription using the audio file.

Args:
    req (`dict`): The request containing the audio file and model information.

Returns:
    `Generator[str, None, None]`: A generator that yields the transcription result.
z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`r   Tr  r  re   )srmonor  )sampling_rater  input_features)r   rP   r!  c               3      >#    TR                   " S0 TDTD6n TR                  U R                  SS9S   n[        US9nUR	                  SS9 v   g 7f)NT)r  r   )ri   ra  rQ   )r(  batch_decode	sequencesr1   rg  )generated_idstranscription_texttranscriptionaudio_inputsaudio_modelaudio_processorrG  s      r\   _generate_transcriptionDServeCommand.generate_transcription.<locals>._generate_transcription  sd     '00U<UCTUM!0!=!=m>U>Uko!=!pqr!s)/ABM"222EFGs   AA)r   r  r  load_audio_model_and_processorr    rX  r   rP   feature_extractorr  ior   librosaloadr  r   r   )r   r   r  rO  rP   model_sampling_rateaudio_bytesaudio_array_r  r  r  r  rG  s             @@@@r\   r  #ServeCommand.generate_transcription  s#    $%%o  !% 7 7G E'+'J'JK`'a$_2%%4T
 >)F)F

 .??MMjjV- kPTU&{fjknn
 *66F)G)J)J;K\K\)]%& ,!2'+
	H 	H '((r[   c                 N   UR                  S5      =(       d    UR                  S5      nSnU R                  c  SnOc[        U R                  5      [        U5      :  a  SnO>[        [        U R                  5      5       H  nU R                  U   X$   :w  d  M  Sn  O   X l        U$ )a  
Determines whether the current request is a continuation of the last request. In other words, if it is the
same chat session.

Args:
    req (`dict`): The request to check.

Returns:
    `True` if the request is a continuation of the last request, `False` otherwise.
r  rS  TF)r   r$  lenrange)r   r   r  req_continues_last_messagesis        r\   rK  ServeCommand.is_continuation  s     77:&:#'''*:&*# %*/'##$H5*/' 3t1123%%a(HK727/ 4
 &**r[   r&   c                     U R                   (       a7  [        SU R                  U R                  U R                  U R                  S9nU$ U R
                  (       a  [        SS9nU$ SnU$ )z
Returns the quantization config for the given CLI arguments.

Args:
    args (`ServeArguments`): The serve arguments. May contain quantization settings, device, etc.

Returns:
    `Optional[BitsAndBytesConfig]`: The quantization config.
T)r   bnb_4bit_compute_dtyper   bnb_4bit_use_double_quantbnb_4bit_quant_storage)r   N)r   r&   r   r   r   r   )r   quantization_configs     r\   get_quantization_config$ServeCommand.get_quantization_config   sr     "4!'+zz$($<$<*.*C*C'+zz# #" "4!# #" #'""r[   model_idc                 v    U R                   R                  b  U R                   R                  nSU;   a  U$ U S3$ )a  
Applies the `force_model` CLI argument and canonicalizes the model name to the format "model_id@revision".
If the model_id DOESN'T contain an @, it defaults to "model_id@main".

Args:
    model_id (`str`): The model ID.

Returns:
    `str`: The canonicalized model name to be used
@z@main)r   r  )r   r  s     r\   r  ServeCommand.process_model_name  s<     99  ,yy,,H(?O5!!r[   r  c                    U R                   n[        R                  SU 35        SU;   a  UR                  SS5      u  p4OUSpC[        R
                  " UUUR                  S9nUR                  S;   a  UR                  O[        [        UR                  5      nU R                  U5      nUUR                  USUR                  S.nUb  XxS
'   [        R
                  " U40 UD6n	[        [        U	R                  S   5      n
U
R
                  " U40 UD6n[        USS	5      c  UR                  UR                   5      nUR"                  R$                  S	L =(       a    UR"                  R&                  S:H  nUR"                  R$                  S	L=(       a    UR"                  R$                  S:  nU(       d  U(       a  SUR"                  l        [        R                  SU 35        X4$ )a  
Generic method to load a model and a data processor from a model ID and revision, making use of the serve CLI
arguments.

Args:
    model_id_and_revision (`str`):
        The model ID and revision to load.
    model_cls (`type[PreTrainedModel]`):
        The model class to load.

Returns:
    `tuple[PreTrainedModel, Union[ProcessorMixin, PreTrainedTokenizerFast]]`: The loaded model and
    data processor (tokenizer, audio processor, etc.).
zLoading r  r#   main)revisionr   )r   Nr   )r  r   r   
device_mapr   Nr  r   hf_device_map   r  zLoaded model )r   r   r   r  r%   from_pretrainedr   r   getattrr   r  r   r   r  r8  r  r   rP   r   
max_length)r   r  r   r  r  data_processorr   r  model_kwargsr7  architecturer   has_default_max_lengthhas_short_max_new_tokenss                 r\   _load_model_and_data_processor+ServeCommand._load_model_and_data_processor.  s    yyh4567''!6!<!<S!!DHh!6h&66"44
 #jjN:

tzz@Z"::4@ !#'#;#; !%!7!7
 *2E./++HEE|V-A-A!-DE,,XFF5/408HHT[[)E ##22d:gu?V?V?a?aeg?g 	 ##22$>p5CZCZCiCilpCp 	! "%=59E##2m$9#:;<$$r[   c                    XR                   ;  d"  U R                   U   R                  5       (       aB  U R                  U5      u  p#[        UU R                  R
                  US9U R                   U'   X#4$ U R                   U   R                  5         U R                   U   R                  nU R                   U   R                  nX#4$ )a$  
Loads the text model and processor from the given model ID and revision into the ServeCommand instance.

Args:
    model_id_and_revision (`str`):
        The model ID and revision to load.

Returns:
    `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor.
r   r   	r"  r   r  r   r   r  r   r   r   )r   r  r   r   s       r\   r  %ServeCommand.load_model_and_processork  s     !(:(::d>P>PQf>g>r>r>t>t#BBCXYE8B $		 7 7#9D45 	 45AAC&&'<=CCE**+@AKKIr[   c                    XR                   ;  d"  U R                   U   R                  5       (       aB  U R                  U5      u  p#[        UU R                  R
                  US9U R                   U'   X#4$ U R                   U   R                  5         U R                   U   R                  nU R                   U   R                  nX#4$ )a  
Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.

Args:
    model_id_and_revision (`str`):
        The model ID and revision to load.

Returns:
    `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor.
r  r  )r   r  r  r  s       r\   r  +ServeCommand.load_audio_model_and_processor  s     !(:(::d>P>PQf>g>r>r>t>t+/+N+NOd+e(K8B $		 7 7)9D45 ++	 45AAC,,-BCIIK"001FGQQO++r[   )r   r  r%  r$  r&  r"  r#  r  )r   NNNNNNN)0rS   rT   rU   rV   staticmethodr   r  r   r   r  r   r4  r?  rF  rK  rP  rX   r   r   r  r   r   ri  rm  r  	functoolscacheanyr  r   r  r   r  r  r   r  r  r  rh   rK  r  r  r  tupler  r   r  rZ   rQ   r[   r\   r   r     s   	>N 	> 	>+^ +Z// / !	/
 /b
 

 

d 
 !%#"'+<@047;6G6G #6G }	6G
 sm6G  }6G T"7896G  -6G 346G 
6GpJ[ JS J ]b~ __+T#s(^ 4 + +ZZ;t Z; Z;Q_`cei`iQj Z;x 	"3 	 	 	 + x +  + ZGGD GGYsD$5O GGRc@T c@iT4.H c@J	.)$ .)9S$_3M .)`+4 +D +< #n #BV9W # #8"3 "3 "";%C ;%z %( 	 "99	: 6,C ,ERcesRsLt ,r[   r   __main__)r  r  r   r  enumr  r   r  r   r  r
  r   rf  r  argparser   r   collections.abcr   r   r   
contextlibr   dataclassesr	   r
   r   r   typingr   r   r   huggingface_hubr   huggingface_hub.constantsr   tokenizers.decodersr   r  &transformers.models.auto.modeling_autor   r   transformers.utils.import_utilsr   r   r   r   r   r   r   r   r   r   r   r    utilsr!   r"   r$   r   r%   r&   r'   r(   generation.continuous_batchingr)   r*   r  PILr+   r  r  r  r,   r-   fastapi.middleware.corsr.   fastapi.responsesr/   r0    openai.types.audio.transcriptionr1   .openai.types.audio.transcription_create_paramsr2   openai.types.chatr3   'openai.types.chat.chat_completion_chunkr4   r5   r6   r7   r8   *openai.types.chat.completion_create_paramsr9   openai.types.responsesr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   -openai.types.responses.response_create_paramsrI   pydanticrJ   rK   rL   rN   r`   rc   rC  rI  rN  rD  rJ  rO  r  rS   r   r;  r  r5  rJ  r  Enumr   r   r  r   r   r   r   r   r  r  rQ   r[   r\   <module>r     sy         	 	  	     . ? ? * (   - - & 4 ,    0 (   Z k 4 6k;O;QkViVk   .6A>\<  [    " \@@4QY^ 6U]b 0MUZ  %%NO&'RS)*OP %!.# 
		H	%
   !!2!7!7!9: tyy 	 8	8/8 	8v /@ /@d n n nbG,- G,T& zNE	IIK r[   