vllm.v1.engine.processor

Processor

Source code in vllm/v1/engine/processor.py
class Processor:

    def __init__(
        self,
        vllm_config: VllmConfig,
        tokenizer: TokenizerGroup,
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
    ):

        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
        self.lora_config = vllm_config.lora_config
        self.decoding_config = vllm_config.decoding_config
        self.tokenizer = tokenizer

        self.generation_config_fields = (
            self.model_config.try_get_generation_config())

        self.mm_registry = mm_registry
        self.mm_processor_cache = processor_cache_from_config(
            vllm_config, mm_registry)

        self.input_preprocessor = InputPreprocessor(
            self.model_config,
            self.tokenizer,
            mm_registry,
            mm_processor_cache=self.mm_processor_cache,
        )

    def _validate_logprobs(
        self,
        params: SamplingParams,
    ) -> None:
        max_logprobs = self.model_config.max_logprobs
        if max_logprobs == -1:
            return
        # Validate sample logprobs.
        if params.logprobs and (params.logprobs == -1
                                or params.logprobs > max_logprobs):
            raise ValueError(
                f"Requested sample logprobs of {params.logprobs}, "
                f"which is greater than max allowed: {max_logprobs}")

        # Validate prompt logprobs.
        if params.prompt_logprobs and params.prompt_logprobs > max_logprobs:
            raise ValueError(
                f"Requested prompt logprobs of {params.prompt_logprobs}, "
                f"which is greater than max allowed: {max_logprobs}")

    def _validate_sampling_params(
        self,
        params: SamplingParams,
        lora_request: Optional[LoRARequest],
    ) -> None:
        self._validate_structured_output(params)
        self._validate_logit_bias(params)

        if params.allowed_token_ids is None:
            return
        if not params.allowed_token_ids:
            raise ValueError("allowed_token_ids is not None and empty!")
        if self.tokenizer is None:
            # When skip_tokenizer_init=True, we can't validate token IDs
            # Skip validation and let the model handle invalid tokens
            return
        tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
        vocab_size = len(tokenizer)
        if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
            raise ValueError(
                "allowed_token_ids contains out-of-vocab token id!")

    def _validate_logit_bias(
        self,
        params: SamplingParams,
    ) -> None:
        """Validate logit_bias token IDs are within vocabulary range."""
        if not params.logit_bias:
            return

        vocab_size = self.model_config.get_vocab_size()
        invalid_token_ids = []

        for token_id in params.logit_bias:
            if token_id < 0 or token_id >= vocab_size:
                invalid_token_ids.append(token_id)

        if invalid_token_ids:
            raise ValueError(
                f"token_id(s) {invalid_token_ids} in logit_bias contain "
                f"out-of-vocab token ids. Vocabulary size: {vocab_size}")

    def _validate_supported_sampling_params(
        self,
        params: SamplingParams,
    ) -> None:
        # Best of not yet supported.
        if params.best_of is not None and params.best_of > 1:
            raise ValueError("vLLM V1 does not yet support best_of.")
        # Logits processors not supported.
        if params.logits_processors:
            raise ValueError("vLLM V1 does not support per request "
                             "user provided logits processors.")

    def _validate_params(
        self,
        params: Union[SamplingParams, PoolingParams],
        lora_request: Optional[LoRARequest],
    ):
        """
        Validate supported SamplingParam.
        Should raise ValueError if unsupported for API Server.
        """

        if isinstance(params, PoolingParams):
            return

        self._validate_logprobs(params)
        self._validate_sampling_params(params, lora_request)
        self._validate_supported_sampling_params(params)

    def _validate_multi_modal_uuids(self, prompt: PromptType) -> None:
        """
        Validate that user-provided multi_modal_uuids align with
        multi_modal_data in the incoming request prompt(s).
        Only checks lengths; `None` entries are allowed and will be 
        auto-hashed downstream.
        """

        def _validate_single_prompt(single_prompt: Union[dict, str]) -> None:
            if not isinstance(single_prompt, dict):
                return
            mm_data = single_prompt.get("multi_modal_data")
            mm_uuids = single_prompt.get("multi_modal_uuids")
            if not mm_data or not mm_uuids:
                return

            for modality, items in mm_data.items():
                if modality in mm_uuids:
                    data_len = len(items) if isinstance(items, list) else 1
                    uuid_len = len(mm_uuids[modality]) if isinstance(
                        mm_uuids[modality], list) else 1
                    if uuid_len != data_len:
                        raise ValueError(
                            f"multi_modal_uuids for modality '{modality}' "
                            "must have same length as data: got "
                            f"{uuid_len} uuids vs "
                            f"{data_len} items.")
                else:
                    raise ValueError(
                        f"multi_modal_uuids for modality '{modality}' must "
                        "be provided if multi_modal_data is provided.")

        # Handle explicit encoder/decoder prompts or singleton prompt
        if isinstance(prompt, dict) and "encoder_prompt" in prompt:
            enc = prompt.get("encoder_prompt")
            dec = prompt.get("decoder_prompt")
            if enc is not None:
                _validate_single_prompt(enc)
            if dec is not None:
                _validate_single_prompt(dec)
        else:
            _validate_single_prompt(prompt)  # type: ignore[arg-type]

    def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None:
        if lora_request is not None and not self.lora_config:
            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                             "not enabled!")

    def _validate_structured_output(self, params: SamplingParams) -> None:
        if not params.guided_decoding or not self.decoding_config:
            return

        if self.model_config.skip_tokenizer_init and params.guided_decoding:
            raise ValueError(
                "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'"  # noqa: E501
            )

        engine_level_backend = self.decoding_config.backend
        if params.guided_decoding.backend:
            # Request-level backend selection is not supported in V1.
            # The values may differ if `params` is reused and was set
            # to a specific backend based on `auto` behavior in a previous
            # request. We remember that it was set as a result of `auto`
            # using the `_auto` option set on the backend in the params.
            if (params.guided_decoding.backend != engine_level_backend
                    and not (engine_level_backend == "auto"
                             and params.guided_decoding.backend_was_auto)):
                raise ValueError(
                    "Request-level structured output backend selection is no "
                    "longer supported. The request specified "
                    f"'{params.guided_decoding.backend}', but vLLM was "
                    f"initialised with '{engine_level_backend}'. This error "
                    "can be resolved by removing backend selection from the "
                    "request.")
        else:
            params.guided_decoding.backend = engine_level_backend

        # Request content validation
        if (isinstance(params.guided_decoding.choice, list)
                and not params.guided_decoding.choice):
            # It is invalid for choice to be an empty list
            raise ValueError(f"Choice '{params.guided_decoding.choice}' "
                             "cannot be an empty list")

        if engine_level_backend.startswith("xgrammar"):
            # xgrammar with no fallback
            validate_xgrammar_grammar(params)
        elif engine_level_backend.startswith("guidance"):
            # TODO: ideally we would have the LLTokenizer here as Lark syntax
            # allows <|special_token|> and similar, see
            # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens
            # Without tokenizer these are disallowed in grammars.
            validate_guidance_grammar(params, tokenizer=None)
        elif engine_level_backend == "outlines":
            # outlines backend
            validate_structured_output_request_outlines(params)
        elif engine_level_backend == "lm-format-enforcer":
            # lm format enforcer backend
            validate_structured_output_request_lm_format_enforcer(params)
        else:
            # NOTE: engine_level_backend must be "auto" here, because we have
            # checked supported_backends above.
            # "auto" is an opt-in to opinionated behavior where we try to
            # choose a backend based on request contents. This is not the
            # default as it is less predictable and subject to change
            # between releases as feature support changes.
            try:
                validate_xgrammar_grammar(params)
                params.guided_decoding.backend = "xgrammar"
            except ValueError:
                # The request either failed validation
                # or includes some jsonschema feature(s) that
                # are not supported in xgrammar. Fall back to guidance.
                validate_guidance_grammar(params, tokenizer=None)
                params.guided_decoding.backend = "guidance"
            # Remember that this backend was set automatically
            params.guided_decoding.backend_was_auto = True

    def _maybe_build_mm_hash_overrides(
        self,
        request_id: str,
        prompt: PromptType,
    ) -> Optional[dict[str, list[str]]]:
        """Build per-item multimodal hash overrides when enabled. In this case,
        multimodal data items are identified by their request id, modality and
        index rather than their content.

        Returns a dictionary of modality -> list[str] of overrides, or None if
        disabled or no multimodal data is present.
        """

        def _extract_mm_data(p: PromptType):
            if isinstance(p, dict) and "encoder_prompt" in p:
                enc = p.get("encoder_prompt")
                if isinstance(enc, dict):
                    return enc.get("multi_modal_data")
                return None
            if isinstance(p, dict):
                return p.get("multi_modal_data")
            return None

        mm_data = _extract_mm_data(prompt)
        if not mm_data:
            return None

        overrides: dict[str, list[str]] = {}
        for modality, data in mm_data.items():
            n = len(data) if isinstance(data, list) else 1
            overrides[modality] = [
                f"{request_id}-{modality}-{i}" for i in range(n)
            ]
        return overrides

    def process_inputs(
        self,
        request_id: str,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        priority: int = 0,
        data_parallel_rank: Optional[int] = None,
    ) -> tuple[Optional[str], EngineCoreRequest]:

        # TODO(woosuk): Support pooling models.
        # TODO(woosuk): Support encoder-decoder models.
        self._validate_lora(lora_request)
        self._validate_params(params, lora_request)
        if trace_headers is not None:
            raise ValueError("V1 does not support tracing yet.")

        data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
        if data_parallel_rank is not None and not (0 <= data_parallel_rank <
                                                   data_parallel_size):
            raise ValueError(f"data_parallel_rank {data_parallel_rank} "
                             f"is out of range [0, {data_parallel_size}).")

        if arrival_time is None:
            arrival_time = time.time()

        # Optionally generate multimodal hash overrides to avoid hashing
        # multimodal data items by their content as their identifiers.

        # NOTE: when users explicitly turn off BOTH prefix caching and input
        # processing caching, no multimodal features or embeddings will be
        # reused across requests, therefore identifying multimodal data items
        # by their content is no longer necessary, and we create uuids with
        # request id-modality-index as multimodal hash overrides.
        if (self.model_config.multimodal_config and
                self.model_config.multimodal_config.mm_processor_cache_gb == 0
                and not self.cache_config.enable_prefix_caching):
            mm_hash_overrides = self._maybe_build_mm_hash_overrides(
                request_id, prompt)
        else:
            # Otherwise, use user-provided uuids as multimodal hash overrides
            # if provided.
            self._validate_multi_modal_uuids(prompt)
            if isinstance(prompt, dict):
                mm_hash_overrides = prompt.get("multi_modal_uuids")
            else:
                mm_hash_overrides = None

        # Process inputs, which includes:
        # 1. Tokenize text prompt, with LoRA request if one exists.
        # 2. For multimodal models with a merged preprocessor, preprocess
        #   multimodal data and expand prompt token ids accordingly.
        processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
            prompt,
            tokenization_kwargs=tokenization_kwargs,
            lora_request=lora_request,
            mm_hash_overrides=mm_hash_overrides,
        )
        from vllm.platforms import current_platform
        current_platform.validate_request(
            prompt=prompt,
            params=params,
            processed_inputs=processed_inputs,
        )

        eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)

        self._validate_model_inputs(processed_inputs, lora_request)

        encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)

        # TODO: Impl encoder-decoder
        if encoder_inputs is not None:
            raise NotImplementedError

        sampling_params = None
        pooling_params = None
        if isinstance(params, SamplingParams):
            # TODO: can we avoid cloning here in multiproc case?
            sampling_params = params.clone()
            # If unset max tokens, then generate up to the max_model_len.
            if sampling_params.max_tokens is None:
                sampling_params.max_tokens = (
                    self.model_config.max_model_len -
                    len(decoder_inputs["prompt_token_ids"]))
            sampling_params.update_from_generation_config(
                self.generation_config_fields, eos_token_id)
            if self.tokenizer is not None:
                sampling_params.update_from_tokenizer(
                    self.tokenizer.get_lora_tokenizer(lora_request))
        else:
            pooling_params = params.clone()

        # Multimodal related.
        mm_features: Optional[list[MultiModalFeatureSpec]] = None

        if decoder_inputs["type"] == "multimodal":
            decoder_mm_inputs = decoder_inputs["mm_kwargs"]
            decoder_mm_positions = decoder_inputs["mm_placeholders"]
            decoder_mm_hashes = decoder_inputs["mm_hashes"]

            # Merge and flatten multimodal placeholders, hashes and inputs
            # from dictionaries to lists, and sort them by each item's position
            # in the input sequence.
            sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions)

            mm_features = []
            for modality, idx in sorted_mm_idxs:
                mm_features.append(
                    MultiModalFeatureSpec(
                        data=decoder_mm_inputs[modality][idx],
                        modality=modality,
                        identifier=decoder_mm_hashes[modality][idx],
                        mm_position=decoder_mm_positions[modality][idx]))

        return decoder_inputs.get("prompt"), EngineCoreRequest(
            request_id=request_id,
            prompt_token_ids=decoder_inputs["prompt_token_ids"],
            mm_features=mm_features,
            sampling_params=sampling_params,
            pooling_params=pooling_params,
            eos_token_id=eos_token_id,
            arrival_time=arrival_time,
            lora_request=lora_request,
            cache_salt=decoder_inputs.get("cache_salt"),
            priority=priority,
            data_parallel_rank=data_parallel_rank,
        )

    def _validate_model_inputs(self,
                               inputs: ProcessorInputs,
                               lora_request: Optional[LoRARequest] = None):
        encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)

        if encoder_inputs is not None:
            self._validate_model_input(encoder_inputs,
                                       lora_request,
                                       prompt_type="encoder")

        self._validate_model_input(decoder_inputs,
                                   lora_request,
                                   prompt_type="decoder")

    def _validate_model_input(
        self,
        prompt_inputs: SingletonInputs,
        lora_request: Optional[LoRARequest],
        *,
        prompt_type: Literal["encoder", "decoder"],
    ):
        model_config = self.model_config

        prompt_ids = prompt_inputs["prompt_token_ids"]
        if not prompt_ids:
            if prompt_type == "encoder" and model_config.is_multimodal_model:
                pass  # Mllama may have empty encoder inputs for text-only data
            else:
                raise ValueError(f"The {prompt_type} prompt cannot be empty")

        if self.model_config.skip_tokenizer_init:
            tokenizer = None
        else:
            tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
            max_input_id = max(prompt_ids, default=0)

            # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
            # self.model_config.get_vocab_size() is the model’s vocab size.
            # For Qwen3 models, the language model has extra tokens that do
            # not exist in the tokenizer, and vice versa for multimodal
            # placeholder tokens in some multimodal models.
            # See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399 # noqa: E501
            # and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421 # noqa: E501

            # Here we take the max of the two to determine if a token id is
            # truly out-of-vocabulary.
            if max_input_id > max(tokenizer.max_token_id,
                                  self.model_config.get_vocab_size() - 1):
                raise ValueError(
                    f"Token id {max_input_id} is out of vocabulary")

        max_prompt_len = self.model_config.max_model_len
        if len(prompt_ids) > max_prompt_len:
            if prompt_type == "encoder" and model_config.is_multimodal_model:
                mm_registry = self.input_preprocessor.mm_registry
                mm_processor = mm_registry.create_processor(
                    model_config,
                    tokenizer=tokenizer,
                )
                assert isinstance(mm_processor, EncDecMultiModalProcessor)

                if mm_processor.pad_dummy_encoder_prompt:
                    return  # Skip encoder length check for Whisper and Donut

            if model_config.is_multimodal_model:
                suggestion = (
                    "Make sure that `max_model_len` is no smaller than the "
                    "number of text tokens plus multimodal tokens. For image "
                    "inputs, the number of image tokens depends on the number "
                    "of images, and possibly their aspect ratios as well.")
            else:
                suggestion = (
                    "Make sure that `max_model_len` is no smaller than the "
                    "number of text tokens.")

            raise ValueError(
                f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
                f"longer than the maximum model length of {max_prompt_len}. "
                f"{suggestion}")

            # TODO: Find out how many placeholder tokens are there so we can
            # check that chunked prefill does not truncate them
            # max_batch_len = self.scheduler_config.max_num_batched_tokens

    def clear_cache(self) -> None:
        self.input_preprocessor.clear_cache()

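A minimal construction sketch, not taken from this file: the V1 engine front-end builds a Processor from an existing VllmConfig and TokenizerGroup and then routes each incoming request through process_inputs. The helper calls below are assumptions whose exact signatures vary between vLLM versions.

# Hypothetical wiring; only Processor itself is defined in this module.
from vllm.engine.arg_utils import EngineArgs
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs

vllm_config = EngineArgs(model="facebook/opt-125m").create_engine_config()
tokenizer_group = init_tokenizer_from_configs(
    model_config=vllm_config.model_config,
    scheduler_config=vllm_config.scheduler_config,
    lora_config=vllm_config.lora_config,
)
processor = Processor(vllm_config=vllm_config, tokenizer=tokenizer_group)
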
cache_config instance-attribute

cache_config = cache_config

decoding_config instance-attribute

decoding_config = decoding_config

generation_config_fields instance-attribute

generation_config_fields = try_get_generation_config()

input_preprocessor instance-attribute

input_preprocessor = InputPreprocessor(
    model_config,
    tokenizer,
    mm_registry,
    mm_processor_cache=mm_processor_cache,
)

lora_config instance-attribute

lora_config = lora_config

mm_processor_cache instance-attribute

mm_processor_cache = processor_cache_from_config(
    vllm_config, mm_registry
)

mm_registry instance-attribute

mm_registry = mm_registry

model_config instance-attribute

model_config = model_config

tokenizer instance-attribute

tokenizer = tokenizer

vllm_config instance-attribute

vllm_config = vllm_config

__init__

__init__(
    vllm_config: VllmConfig,
    tokenizer: TokenizerGroup,
    mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
)
Source code in vllm/v1/engine/processor.py
def __init__(
    self,
    vllm_config: VllmConfig,
    tokenizer: TokenizerGroup,
    mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
):

    self.vllm_config = vllm_config
    self.model_config = vllm_config.model_config
    self.cache_config = vllm_config.cache_config
    self.lora_config = vllm_config.lora_config
    self.decoding_config = vllm_config.decoding_config
    self.tokenizer = tokenizer

    self.generation_config_fields = (
        self.model_config.try_get_generation_config())

    self.mm_registry = mm_registry
    self.mm_processor_cache = processor_cache_from_config(
        vllm_config, mm_registry)

    self.input_preprocessor = InputPreprocessor(
        self.model_config,
        self.tokenizer,
        mm_registry,
        mm_processor_cache=self.mm_processor_cache,
    )

_maybe_build_mm_hash_overrides

_maybe_build_mm_hash_overrides(
    request_id: str, prompt: PromptType
) -> Optional[dict[str, list[str]]]

Build per-item multimodal hash overrides when enabled. In this case, multimodal data items are identified by their request id, modality and index rather than their content.

Returns a dictionary of modality -> list[str] of overrides, or None if disabled or no multimodal data is present.
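
For illustration only (the request id and item counts are made up): a request "req-123" whose prompt carries two images and one audio clip would receive overrides of the form:

{
    "image": ["req-123-image-0", "req-123-image-1"],
    "audio": ["req-123-audio-0"],
}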

Source code in vllm/v1/engine/processor.py
def _maybe_build_mm_hash_overrides(
    self,
    request_id: str,
    prompt: PromptType,
) -> Optional[dict[str, list[str]]]:
    """Build per-item multimodal hash overrides when enabled. In this case,
    multimodal data items are identified by their request id, modality and
    index rather than their content.

    Returns a dictionary of modality -> list[str] of overrides, or None if
    disabled or no multimodal data is present.
    """

    def _extract_mm_data(p: PromptType):
        if isinstance(p, dict) and "encoder_prompt" in p:
            enc = p.get("encoder_prompt")
            if isinstance(enc, dict):
                return enc.get("multi_modal_data")
            return None
        if isinstance(p, dict):
            return p.get("multi_modal_data")
        return None

    mm_data = _extract_mm_data(prompt)
    if not mm_data:
        return None

    overrides: dict[str, list[str]] = {}
    for modality, data in mm_data.items():
        n = len(data) if isinstance(data, list) else 1
        overrides[modality] = [
            f"{request_id}-{modality}-{i}" for i in range(n)
        ]
    return overrides

_validate_logit_bias

_validate_logit_bias(params: SamplingParams) -> None

Validate logit_bias token IDs are within vocabulary range.
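
A hedged sketch of the behaviour (vocabulary size and token ids are made up; assumes SamplingParams accepts a token id -> bias mapping via logit_bias): with a 32000-token vocabulary, a bias on id 31999 passes, while a bias on id 32000 or on a negative id is reported in the ValueError.

# Sketch only; `processor` is assumed to be a Processor instance.
from vllm import SamplingParams

processor._validate_logit_bias(SamplingParams(logit_bias={31999: 2.0}))  # ok
processor._validate_logit_bias(SamplingParams(logit_bias={32000: -5.0}))
# ValueError: token_id(s) [32000] in logit_bias contain out-of-vocab token ids...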

Source code in vllm/v1/engine/processor.py
def _validate_logit_bias(
    self,
    params: SamplingParams,
) -> None:
    """Validate logit_bias token IDs are within vocabulary range."""
    if not params.logit_bias:
        return

    vocab_size = self.model_config.get_vocab_size()
    invalid_token_ids = []

    for token_id in params.logit_bias:
        if token_id < 0 or token_id >= vocab_size:
            invalid_token_ids.append(token_id)

    if invalid_token_ids:
        raise ValueError(
            f"token_id(s) {invalid_token_ids} in logit_bias contain "
            f"out-of-vocab token ids. Vocabulary size: {vocab_size}")

_validate_logprobs

_validate_logprobs(params: SamplingParams) -> None
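
A hedged illustration (the cap of 20 is made up): with model_config.max_logprobs set to 20, requests within the cap pass while larger requests (or logprobs=-1) raise a ValueError, and a cap of -1 disables the check entirely.

# Sketch only; assumes max_logprobs == 20 on the model config.
from vllm import SamplingParams

processor._validate_logprobs(SamplingParams(logprobs=5))    # ok
processor._validate_logprobs(SamplingParams(logprobs=50))   # ValueError
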
Source code in vllm/v1/engine/processor.py
def _validate_logprobs(
    self,
    params: SamplingParams,
) -> None:
    max_logprobs = self.model_config.max_logprobs
    if max_logprobs == -1:
        return
    # Validate sample logprobs.
    if params.logprobs and (params.logprobs == -1
                            or params.logprobs > max_logprobs):
        raise ValueError(
            f"Requested sample logprobs of {params.logprobs}, "
            f"which is greater than max allowed: {max_logprobs}")

    # Validate prompt logprobs.
    if params.prompt_logprobs and params.prompt_logprobs > max_logprobs:
        raise ValueError(
            f"Requested prompt logprobs of {params.prompt_logprobs}, "
            f"which is greater than max allowed: {max_logprobs}")

_validate_lora

_validate_lora(lora_request: Optional[LoRARequest]) -> None
Source code in vllm/v1/engine/processor.py
def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None:
    if lora_request is not None and not self.lora_config:
        raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                         "not enabled!")

_validate_model_input

_validate_model_input(
    prompt_inputs: SingletonInputs,
    lora_request: Optional[LoRARequest],
    *,
    prompt_type: Literal["encoder", "decoder"],
)
Source code in vllm/v1/engine/processor.py
def _validate_model_input(
    self,
    prompt_inputs: SingletonInputs,
    lora_request: Optional[LoRARequest],
    *,
    prompt_type: Literal["encoder", "decoder"],
):
    model_config = self.model_config

    prompt_ids = prompt_inputs["prompt_token_ids"]
    if not prompt_ids:
        if prompt_type == "encoder" and model_config.is_multimodal_model:
            pass  # Mllama may have empty encoder inputs for text-only data
        else:
            raise ValueError(f"The {prompt_type} prompt cannot be empty")

    if self.model_config.skip_tokenizer_init:
        tokenizer = None
    else:
        tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
        max_input_id = max(prompt_ids, default=0)

        # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
        # self.model_config.get_vocab_size() is the model’s vocab size.
        # For Qwen3 models, the language model has extra tokens that do
        # not exist in the tokenizer, and vice versa for multimodal
        # placeholder tokens in some multimodal models.
        # See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399 # noqa: E501
        # and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421 # noqa: E501

        # Here we take the max of the two to determine if a token id is
        # truly out-of-vocabulary.
        if max_input_id > max(tokenizer.max_token_id,
                              self.model_config.get_vocab_size() - 1):
            raise ValueError(
                f"Token id {max_input_id} is out of vocabulary")

    max_prompt_len = self.model_config.max_model_len
    if len(prompt_ids) > max_prompt_len:
        if prompt_type == "encoder" and model_config.is_multimodal_model:
            mm_registry = self.input_preprocessor.mm_registry
            mm_processor = mm_registry.create_processor(
                model_config,
                tokenizer=tokenizer,
            )
            assert isinstance(mm_processor, EncDecMultiModalProcessor)

            if mm_processor.pad_dummy_encoder_prompt:
                return  # Skip encoder length check for Whisper and Donut

        if model_config.is_multimodal_model:
            suggestion = (
                "Make sure that `max_model_len` is no smaller than the "
                "number of text tokens plus multimodal tokens. For image "
                "inputs, the number of image tokens depends on the number "
                "of images, and possibly their aspect ratios as well.")
        else:
            suggestion = (
                "Make sure that `max_model_len` is no smaller than the "
                "number of text tokens.")

        raise ValueError(
            f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
            f"longer than the maximum model length of {max_prompt_len}. "
            f"{suggestion}")

_validate_model_inputs

_validate_model_inputs(
    inputs: ProcessorInputs,
    lora_request: Optional[LoRARequest] = None,
)
Source code in vllm/v1/engine/processor.py
def _validate_model_inputs(self,
                           inputs: ProcessorInputs,
                           lora_request: Optional[LoRARequest] = None):
    encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)

    if encoder_inputs is not None:
        self._validate_model_input(encoder_inputs,
                                   lora_request,
                                   prompt_type="encoder")

    self._validate_model_input(decoder_inputs,
                               lora_request,
                               prompt_type="decoder")

_validate_multi_modal_uuids

_validate_multi_modal_uuids(prompt: PromptType) -> None

Validate that user-provided multi_modal_uuids align with multi_modal_data in the incoming request prompt(s). Only checks lengths; None entries are allowed and will be auto-hashed downstream.
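
An illustrative prompt (values are made up) that passes the check, since the modality that has data also has a matching number of uuids and a None entry is allowed:

prompt = {
    "prompt": "Describe these images.",
    "multi_modal_data": {"image": [img_a, img_b]},
    "multi_modal_uuids": {"image": ["uuid-a", None]},
}

Dropping the "image" key from multi_modal_uuids, or supplying a single uuid for the two images, raises a ValueError instead.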

Source code in vllm/v1/engine/processor.py
def _validate_multi_modal_uuids(self, prompt: PromptType) -> None:
    """
    Validate that user-provided multi_modal_uuids align with
    multi_modal_data in the incoming request prompt(s).
    Only checks lengths; `None` entries are allowed and will be 
    auto-hashed downstream.
    """

    def _validate_single_prompt(single_prompt: Union[dict, str]) -> None:
        if not isinstance(single_prompt, dict):
            return
        mm_data = single_prompt.get("multi_modal_data")
        mm_uuids = single_prompt.get("multi_modal_uuids")
        if not mm_data or not mm_uuids:
            return

        for modality, items in mm_data.items():
            if modality in mm_uuids:
                data_len = len(items) if isinstance(items, list) else 1
                uuid_len = len(mm_uuids[modality]) if isinstance(
                    mm_uuids[modality], list) else 1
                if uuid_len != data_len:
                    raise ValueError(
                        f"multi_modal_uuids for modality '{modality}' "
                        "must have same length as data: got "
                        f"{uuid_len} uuids vs "
                        f"{data_len} items.")
            else:
                raise ValueError(
                    f"multi_modal_uuids for modality '{modality}' must "
                    "be provided if multi_modal_data is provided.")

    # Handle explicit encoder/decoder prompts or singleton prompt
    if isinstance(prompt, dict) and "encoder_prompt" in prompt:
        enc = prompt.get("encoder_prompt")
        dec = prompt.get("decoder_prompt")
        if enc is not None:
            _validate_single_prompt(enc)
        if dec is not None:
            _validate_single_prompt(dec)
    else:
        _validate_single_prompt(prompt)  # type: ignore[arg-type]

_validate_params

_validate_params(
    params: Union[SamplingParams, PoolingParams],
    lora_request: Optional[LoRARequest],
)

Validate supported SamplingParam. Should raise ValueError if unsupported for API Server.

Source code in vllm/v1/engine/processor.py
def _validate_params(
    self,
    params: Union[SamplingParams, PoolingParams],
    lora_request: Optional[LoRARequest],
):
    """
    Validate supported SamplingParam.
    Should raise ValueError if unsupported for API Server.
    """

    if isinstance(params, PoolingParams):
        return

    self._validate_logprobs(params)
    self._validate_sampling_params(params, lora_request)
    self._validate_supported_sampling_params(params)

_validate_sampling_params

_validate_sampling_params(
    params: SamplingParams,
    lora_request: Optional[LoRARequest],
) -> None
Source code in vllm/v1/engine/processor.py
def _validate_sampling_params(
    self,
    params: SamplingParams,
    lora_request: Optional[LoRARequest],
) -> None:
    self._validate_structured_output(params)
    self._validate_logit_bias(params)

    if params.allowed_token_ids is None:
        return
    if not params.allowed_token_ids:
        raise ValueError("allowed_token_ids is not None and empty!")
    if self.tokenizer is None:
        # When skip_tokenizer_init=True, we can't validate token IDs
        # Skip validation and let the model handle invalid tokens
        return
    tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
    vocab_size = len(tokenizer)
    if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
        raise ValueError(
            "allowed_token_ids contains out-of-vocab token id!")

_validate_structured_output

_validate_structured_output(params: SamplingParams) -> None
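
A hedged sketch of the backend handling (assumes GuidedDecodingParams from vllm.sampling_params; the schema is made up): with the engine-level backend set to "auto", a JSON-schema request is validated against xgrammar first, falls back to guidance if that rejects it, and the chosen backend plus backend_was_auto=True are written back onto the params; explicitly requesting a backend that differs from the engine-level one raises a ValueError.

# Sketch only; assumes the engine was started with backend "auto".
from vllm.sampling_params import GuidedDecodingParams, SamplingParams

params = SamplingParams(
    guided_decoding=GuidedDecodingParams(json={"type": "object"}))
processor._validate_structured_output(params)
# params.guided_decoding.backend is now "xgrammar" (or "guidance" as a
# fallback) and params.guided_decoding.backend_was_auto is True.
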
Source code in vllm/v1/engine/processor.py
def _validate_structured_output(self, params: SamplingParams) -> None:
    if not params.guided_decoding or not self.decoding_config:
        return

    if self.model_config.skip_tokenizer_init and params.guided_decoding:
        raise ValueError(
            "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'"  # noqa: E501
        )

    engine_level_backend = self.decoding_config.backend
    if params.guided_decoding.backend:
        # Request-level backend selection is not supported in V1.
        # The values may differ if `params` is reused and was set
        # to a specific backend based on `auto` behavior in a previous
        # request. We remember that it was set as a result of `auto`
        # using the `_auto` option set on the backend in the params.
        if (params.guided_decoding.backend != engine_level_backend
                and not (engine_level_backend == "auto"
                         and params.guided_decoding.backend_was_auto)):
            raise ValueError(
                "Request-level structured output backend selection is no "
                "longer supported. The request specified "
                f"'{params.guided_decoding.backend}', but vLLM was "
                f"initialised with '{engine_level_backend}'. This error "
                "can be resolved by removing backend selection from the "
                "request.")
    else:
        params.guided_decoding.backend = engine_level_backend

    # Request content validation
    if (isinstance(params.guided_decoding.choice, list)
            and not params.guided_decoding.choice):
        # It is invalid for choice to be an empty list
        raise ValueError(f"Choice '{params.guided_decoding.choice}' "
                         "cannot be an empty list")

    if engine_level_backend.startswith("xgrammar"):
        # xgrammar with no fallback
        validate_xgrammar_grammar(params)
    elif engine_level_backend.startswith("guidance"):
        # TODO: ideally we would have the LLTokenizer here as Lark syntax
        # allows <|special_token|> and similar, see
        # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens
        # Without tokenizer these are disallowed in grammars.
        validate_guidance_grammar(params, tokenizer=None)
    elif engine_level_backend == "outlines":
        # outlines backend
        validate_structured_output_request_outlines(params)
    elif engine_level_backend == "lm-format-enforcer":
        # lm format enforcer backend
        validate_structured_output_request_lm_format_enforcer(params)
    else:
        # NOTE: engine_level_backend must be "auto" here, because we have
        # checked supported_backends above.
        # "auto" is an opt-in to opinionated behavior where we try to
        # choose a backend based on request contents. This is not the
        # default as it is less predictable and subject to change
        # between releases as feature support changes.
        try:
            validate_xgrammar_grammar(params)
            params.guided_decoding.backend = "xgrammar"
        except ValueError:
            # The request either failed validation
            # or includes some jsonschema feature(s) that
            # are not supported in xgrammar. Fall back to guidance.
            validate_guidance_grammar(params, tokenizer=None)
            params.guided_decoding.backend = "guidance"
        # Remember that this backend was set automatically
        params.guided_decoding.backend_was_auto = True

_validate_supported_sampling_params

_validate_supported_sampling_params(
    params: SamplingParams,
) -> None
Source code in vllm/v1/engine/processor.py
def _validate_supported_sampling_params(
    self,
    params: SamplingParams,
) -> None:
    # Best of not yet supported.
    if params.best_of is not None and params.best_of > 1:
        raise ValueError("vLLM V1 does not yet support best_of.")
    # Logits processors not supported.
    if params.logits_processors:
        raise ValueError("vLLM V1 does not support per request "
                         "user provided logits processors.")

clear_cache

clear_cache() -> None
Source code in vllm/v1/engine/processor.py
def clear_cache(self) -> None:
    self.input_preprocessor.clear_cache()

process_inputs

process_inputs(
    request_id: str,
    prompt: PromptType,
    params: Union[SamplingParams, PoolingParams],
    arrival_time: Optional[float] = None,
    lora_request: Optional[LoRARequest] = None,
    tokenization_kwargs: Optional[dict[str, Any]] = None,
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
    data_parallel_rank: Optional[int] = None,
) -> tuple[Optional[str], EngineCoreRequest]
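
A hedged usage sketch (assumes a Processor built as in __init__ above and a plain text-only prompt; the request id and parameters are illustrative):

from vllm import SamplingParams

prompt_text, core_request = processor.process_inputs(
    request_id="req-0",
    prompt="The capital of France is",
    params=SamplingParams(max_tokens=16),
)
# prompt_text is the decoder prompt string (possibly None) and core_request
# is the EngineCoreRequest that gets forwarded to the engine core.
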
Source code in vllm/v1/engine/processor.py
def process_inputs(
    self,
    request_id: str,
    prompt: PromptType,
    params: Union[SamplingParams, PoolingParams],
    arrival_time: Optional[float] = None,
    lora_request: Optional[LoRARequest] = None,
    tokenization_kwargs: Optional[dict[str, Any]] = None,
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
    data_parallel_rank: Optional[int] = None,
) -> tuple[Optional[str], EngineCoreRequest]:

    # TODO(woosuk): Support pooling models.
    # TODO(woosuk): Support encoder-decoder models.
    self._validate_lora(lora_request)
    self._validate_params(params, lora_request)
    if trace_headers is not None:
        raise ValueError("V1 does not support tracing yet.")

    data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
    if data_parallel_rank is not None and not (0 <= data_parallel_rank <
                                               data_parallel_size):
        raise ValueError(f"data_parallel_rank {data_parallel_rank} "
                         f"is out of range [0, {data_parallel_size}).")

    if arrival_time is None:
        arrival_time = time.time()

    # Optionally generate multimodal hash overrides to avoid hashing
    # multimodal data items by their content as their identifiers.

    # NOTE: when users explicitly turn off BOTH prefix caching and input
    # processing caching, no multimodal features or embeddings will be
    # reused across requests, therefore identifying multimodal data items
    # by their content is no longer necessary, and we create uuids with
    # request id-modality-index as multimodal hash overrides.
    if (self.model_config.multimodal_config and
            self.model_config.multimodal_config.mm_processor_cache_gb == 0
            and not self.cache_config.enable_prefix_caching):
        mm_hash_overrides = self._maybe_build_mm_hash_overrides(
            request_id, prompt)
    else:
        # Otherwise, use user-provided uuids as multimodal hash overrides
        # if provided.
        self._validate_multi_modal_uuids(prompt)
        if isinstance(prompt, dict):
            mm_hash_overrides = prompt.get("multi_modal_uuids")
        else:
            mm_hash_overrides = None

    # Process inputs, which includes:
    # 1. Tokenize text prompt, with LoRA request if one exists.
    # 2. For multimodal models with a merged preprocessor, preprocess
    #   multimodal data and expand prompt token ids accordingly.
    processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
        prompt,
        tokenization_kwargs=tokenization_kwargs,
        lora_request=lora_request,
        mm_hash_overrides=mm_hash_overrides,
    )
    from vllm.platforms import current_platform
    current_platform.validate_request(
        prompt=prompt,
        params=params,
        processed_inputs=processed_inputs,
    )

    eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)

    self._validate_model_inputs(processed_inputs, lora_request)

    encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)

    # TODO: Impl encoder-decoder
    if encoder_inputs is not None:
        raise NotImplementedError

    sampling_params = None
    pooling_params = None
    if isinstance(params, SamplingParams):
        # TODO: can we avoid cloning here in multiproc case?
        sampling_params = params.clone()
        # If unset max tokens, then generate up to the max_model_len.
        if sampling_params.max_tokens is None:
            sampling_params.max_tokens = (
                self.model_config.max_model_len -
                len(decoder_inputs["prompt_token_ids"]))
        sampling_params.update_from_generation_config(
            self.generation_config_fields, eos_token_id)
        if self.tokenizer is not None:
            sampling_params.update_from_tokenizer(
                self.tokenizer.get_lora_tokenizer(lora_request))
    else:
        pooling_params = params.clone()

    # Multimodal related.
    mm_features: Optional[list[MultiModalFeatureSpec]] = None

    if decoder_inputs["type"] == "multimodal":
        decoder_mm_inputs = decoder_inputs["mm_kwargs"]
        decoder_mm_positions = decoder_inputs["mm_placeholders"]
        decoder_mm_hashes = decoder_inputs["mm_hashes"]

        # Merge and flatten multimodal placeholders, hashes and inputs
        # from dictionaries to lists, and sort them by each item's position
        # in the input sequence.
        sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions)

        mm_features = []
        for modality, idx in sorted_mm_idxs:
            mm_features.append(
                MultiModalFeatureSpec(
                    data=decoder_mm_inputs[modality][idx],
                    modality=modality,
                    identifier=decoder_mm_hashes[modality][idx],
                    mm_position=decoder_mm_positions[modality][idx]))

    return decoder_inputs.get("prompt"), EngineCoreRequest(
        request_id=request_id,
        prompt_token_ids=decoder_inputs["prompt_token_ids"],
        mm_features=mm_features,
        sampling_params=sampling_params,
        pooling_params=pooling_params,
        eos_token_id=eos_token_id,
        arrival_time=arrival_time,
        lora_request=lora_request,
        cache_salt=decoder_inputs.get("cache_salt"),
        priority=priority,
        data_parallel_rank=data_parallel_rank,
    )