api/pkg/apis/inference/v1/model_config.proto

// Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2018, TensorFlow Authors. All rights reserved.
syntax = "proto3";
package inference.v1;
option go_package = "d7y.io/api/pkg/apis/inference/v1;inference";
//@@.. cpp:namespace:: inference
//@@
//@@.. cpp:enum:: DataType
//@@
//@@ Data types supported for input and output tensors.
//@@
enum DataType {
//@@ .. cpp:enumerator:: DataType::INVALID = 0
TYPE_INVALID = 0;
//@@ .. cpp:enumerator:: DataType::BOOL = 1
TYPE_BOOL = 1;
//@@ .. cpp:enumerator:: DataType::UINT8 = 2
TYPE_UINT8 = 2;
//@@ .. cpp:enumerator:: DataType::UINT16 = 3
TYPE_UINT16 = 3;
//@@ .. cpp:enumerator:: DataType::UINT32 = 4
TYPE_UINT32 = 4;
//@@ .. cpp:enumerator:: DataType::UINT64 = 5
TYPE_UINT64 = 5;
//@@ .. cpp:enumerator:: DataType::INT8 = 6
TYPE_INT8 = 6;
//@@ .. cpp:enumerator:: DataType::INT16 = 7
TYPE_INT16 = 7;
//@@ .. cpp:enumerator:: DataType::INT32 = 8
TYPE_INT32 = 8;
//@@ .. cpp:enumerator:: DataType::INT64 = 9
TYPE_INT64 = 9;
//@@ .. cpp:enumerator:: DataType::FP16 = 10
TYPE_FP16 = 10;
//@@ .. cpp:enumerator:: DataType::FP32 = 11
TYPE_FP32 = 11;
//@@ .. cpp:enumerator:: DataType::FP64 = 12
TYPE_FP64 = 12;
//@@ .. cpp:enumerator:: DataType::STRING = 13
TYPE_STRING = 13;
//@@ .. cpp:enumerator:: DataType::BF16 = 14
TYPE_BF16 = 14;
}
//@@
//@@ .. cpp:var:: message ModelRateLimiter
//@@
//@@ The specifications required by the rate limiter to properly
//@@ schedule the inference requests across the different models
//@@ and their instances.
//@@
message ModelRateLimiter
{
//@@ .. cpp:var:: message Resource
//@@
//@@ The resource property.
//@@
message Resource
{
//@@ .. cpp:var:: string name
//@@
//@@ The name associated with the resource.
//@@
string name = 1;
//@@ .. cpp:var:: bool global
//@@
//@@ Whether or not the resource is global. If true then the resource
//@@ is assumed to be shared among the devices otherwise specified
//@@ count of the resource is assumed for each device associated
//@@ with the instance.
//@@
bool global = 2;
//@@ .. cpp:var:: uint32 count
//@@
//@@ The number of resources required for the execution of the model
//@@ instance.
//@@
uint32 count = 3;
}
//@@ .. cpp:var:: Resource resources (repeated)
//@@
//@@ The resources required to execute the request on a model instance.
//@@ Resources are just names with a corresponding count. The execution
//@@ of the instance will be blocked until the specified resources are
//@@ available. By default an instance uses no rate-limiter resources.
//@@
repeated Resource resources = 1;
//@@ .. cpp:var:: uint32 priority
//@@
//@@ The optional weighting value to be used for prioritizing across
//@@ instances. An instance with priority 2 will be given 1/2 the
//@@ number of scheduling chances as an instance with priority
//@@ 1. The default priority is 1. The priority of value 0 will be
//@@ treated as priority 1.
//@@
uint32 priority = 2;
}
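// Illustrative sketch (not part of the schema): how a ModelRateLimiter might
// be written inside an instance_group in a config.pbtxt, assuming a
// hypothetical per-device resource named "R1" that each execution needs
// four of.
//
//   rate_limiter {
//     resources [ { name: "R1", count: 4 } ]
//     priority: 2
//   }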
//@@
//@@.. cpp:var:: message ModelInstanceGroup
//@@
//@@ A group of one or more instances of a model and resources made
//@@ available for those instances.
//@@
message ModelInstanceGroup
{
//@@
//@@ .. cpp:enum:: Kind
//@@
//@@ Kind of this instance group.
//@@
enum Kind {
//@@ .. cpp:enumerator:: Kind::KIND_AUTO = 0
//@@
//@@ This instance group represents instances that can run on either
//@@ CPU or GPU. If all GPUs listed in 'gpus' are available then
//@@ instances will be created on GPU(s), otherwise instances will
//@@ be created on CPU.
//@@
KIND_AUTO = 0;
//@@ .. cpp:enumerator:: Kind::KIND_GPU = 1
//@@
//@@ This instance group represents instances that must run on the
//@@ GPU.
//@@
KIND_GPU = 1;
//@@ .. cpp:enumerator:: Kind::KIND_CPU = 2
//@@
//@@ This instance group represents instances that must run on the
//@@ CPU.
//@@
KIND_CPU = 2;
//@@ .. cpp:enumerator:: Kind::KIND_MODEL = 3
//@@
//@@ This instance group represents instances that should run on the
//@@ CPU and/or GPU(s) as specified by the model or backend itself.
//@@ The inference server will not override the model/backend
//@@ settings.
//@@
KIND_MODEL = 3;
}
//@@
//@@ .. cpp:var:: message SecondaryDevice
//@@
//@@ A secondary device required for a model instance.
//@@
message SecondaryDevice
{
//@@
//@@ .. cpp:enum:: SecondaryDeviceKind
//@@
//@@ The kind of the secondary device.
//@@
enum SecondaryDeviceKind {
//@@ .. cpp:enumerator:: SecondaryDeviceKind::KIND_NVDLA = 0
//@@
//@@ An NVDLA core. http://nvdla.org
//@@ Currently KIND_NVDLA is only supported by the TensorRT backend.
//@@
KIND_NVDLA = 0;
}
//@@ .. cpp:var:: SecondaryDeviceKind kind
//@@
//@@ The secondary device kind.
//@@
SecondaryDeviceKind kind = 1;
//@@ .. cpp:var:: int64 device_id
//@@
//@@ Identifier for the secondary device.
//@@
int64 device_id = 2;
}
//@@ .. cpp:var:: string name
//@@
//@@ Optional name of this group of instances. If not specified the
//@@ name will be formed as <model name>_<group number>. The name of
//@@ individual instances will be further formed by a unique instance
//@@ number and GPU index.
//@@
string name = 1;
//@@ .. cpp:var:: Kind kind
//@@
//@@ The kind of this instance group. Default is KIND_AUTO. If
//@@ KIND_AUTO or KIND_GPU then both 'count' and 'gpus' are valid and
//@@ may be specified. If KIND_CPU or KIND_MODEL only 'count' is valid
//@@ and 'gpus' cannot be specified.
//@@
Kind kind = 4;
//@@ .. cpp:var:: int32 count
//@@
//@@ For a group assigned to GPU, the number of instances created for
//@@ each GPU listed in 'gpus'. For a group assigned to CPU the number
//@@ of instances created. Default is 1.
int32 count = 2;
//@@ .. cpp:var:: ModelRateLimiter rate_limiter
//@@
//@@ The rate limiter specific settings to be associated with this
//@@ instance group. Optional, if not specified no rate limiting
//@@ will be applied to this instance group.
//@@
ModelRateLimiter rate_limiter = 6;
//@@ .. cpp:var:: int32 gpus (repeated)
//@@
//@@ GPU(s) where instances should be available. For each GPU listed,
//@@ 'count' instances of the model will be available. Setting 'gpus'
//@@ to empty (or not specifying at all) is equivalent to listing all
//@@ available GPUs.
//@@
repeated int32 gpus = 3;
//@@ .. cpp:var:: SecondaryDevice secondary_devices (repeated)
//@@
//@@ Secondary devices that are required by instances specified by this
//@@ instance group. Optional.
//@@
repeated SecondaryDevice secondary_devices = 8;
//@@ .. cpp:var:: string profile (repeated)
//@@
//@@ For TensorRT models containing multiple optimization profiles, this
//@@ parameter specifies a set of optimization profiles available to this
//@@ instance group. The inference server will choose the optimal profile
//@@ based on the shapes of the input tensors. This field should lie
//@@ between 0 and <TotalNumberOfOptimizationProfilesInPlanModel> - 1
//@@ and be specified only for TensorRT backend, otherwise an error will
//@@ be generated. If not specified, the server will select the first
//@@ optimization profile by default.
//@@
repeated string profile = 5;
//@@ .. cpp:var:: bool passive
//@@
//@@ Whether the instances within this instance group will be accepting
//@@ inference requests from the scheduler. If true, the instances will
//@@ not be added to the scheduler. Default value is false.
//@@
bool passive = 7;
//@@ .. cpp:var:: string host_policy
//@@
//@@ The host policy name that the instance is to be associated with.
//@@ The default value is set to reflect the device kind of the instance,
//@@ for instance, KIND_CPU is "cpu", KIND_MODEL is "model" and
//@@ KIND_GPU is "gpu_<gpu_id>".
//@@
string host_policy = 9;
}
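// Illustrative sketch (not part of the schema): an instance_group entry in a
// config.pbtxt that creates two instances of the model on each of GPUs 0
// and 1.
//
//   instance_group [
//     {
//       count: 2
//       kind: KIND_GPU
//       gpus: [ 0, 1 ]
//     }
//   ]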
//@@
//@@.. cpp:var:: message ModelTensorReshape
//@@
//@@ Reshape specification for input and output tensors.
//@@
message ModelTensorReshape
{
//@@ .. cpp:var:: int64 shape (repeated)
//@@
//@@ The shape to use for reshaping.
//@@
repeated int64 shape = 1;
}
//@@
//@@.. cpp:var:: message ModelInput
//@@
//@@ An input required by the model.
//@@
message ModelInput
{
//@@
//@@ .. cpp:enum:: Format
//@@
//@@ The format for the input.
//@@
enum Format {
//@@ .. cpp:enumerator:: Format::FORMAT_NONE = 0
//@@
//@@ The input has no specific format. This is the default.
//@@
FORMAT_NONE = 0;
//@@ .. cpp:enumerator:: Format::FORMAT_NHWC = 1
//@@
//@@ HWC image format. Tensors with this format require 3 dimensions
//@@ if the model does not support batching (max_batch_size = 0) or 4
//@@ dimensions if the model does support batching (max_batch_size
//@@ >= 1). In either case the 'dims' below should only specify the
//@@ 3 non-batch dimensions (i.e. HWC or CHW).
//@@
FORMAT_NHWC = 1;
//@@ .. cpp:enumerator:: Format::FORMAT_NCHW = 2
//@@
//@@ CHW image format. Tensors with this format require 3 dimensions
//@@ if the model does not support batching (max_batch_size = 0) or 4
//@@ dimensions if the model does support batching (max_batch_size
//@@ >= 1). In either case the 'dims' below should only specify the
//@@ 3 non-batch dimensions (i.e. HWC or CHW).
//@@
FORMAT_NCHW = 2;
}
//@@ .. cpp:var:: string name
//@@
//@@ The name of the input.
//@@
string name = 1;
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The data-type of the input.
//@@
DataType data_type = 2;
//@@ .. cpp:var:: Format format
//@@
//@@ The format of the input. Optional.
//@@
Format format = 3;
//@@ .. cpp:var:: int64 dims (repeated)
//@@
//@@ The dimensions/shape of the input tensor that must be provided
//@@ when invoking the inference API for this model.
//@@
repeated int64 dims = 4;
//@@ .. cpp:var:: ModelTensorReshape reshape
//@@
//@@ The shape expected for this input by the backend. The input will
//@@ be reshaped to this before being presented to the backend. The
//@@ reshape must have the same number of elements as the input shape
//@@ specified by 'dims'. Optional.
//@@
ModelTensorReshape reshape = 5;
//@@ .. cpp:var:: bool is_shape_tensor
//@@
//@@ Whether or not the input is a shape tensor to the model. This field
//@@ is currently supported only for the TensorRT model. An error will be
//@@ generated if this specification does not comply with underlying
//@@ model.
//@@
bool is_shape_tensor = 6;
//@@ .. cpp:var:: bool allow_ragged_batch
//@@
//@@ Whether or not the input is allowed to be "ragged" in a dynamically
//@@ created batch. Default is false indicating that two requests will
//@@ only be batched if this tensor has the same shape in both requests.
//@@ True indicates that two requests can be batched even if this tensor
//@@ has a different shape in each request.
//@@
bool allow_ragged_batch = 7;
//@@ .. cpp:var:: bool optional
//@@
//@@ Whether or not the input is optional for the model execution.
//@@ If true, the input is not required in the inference request.
//@@ Default value is false.
//@@
bool optional = 8;
}
//@@
//@@.. cpp:var:: message ModelOutput
//@@
//@@ An output produced by the model.
//@@
message ModelOutput
{
//@@ .. cpp:var:: string name
//@@
//@@ The name of the output.
//@@
string name = 1;
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The data-type of the output.
//@@
DataType data_type = 2;
//@@ .. cpp:var:: int64 dims (repeated)
//@@
//@@ The dimensions/shape of the output tensor.
//@@
repeated int64 dims = 3;
//@@ .. cpp:var:: ModelTensorReshape reshape
//@@
//@@ The shape produced for this output by the backend. The output will
//@@ be reshaped from this to the shape specified in 'dims' before being
//@@ returned in the inference response. The reshape must have the same
//@@ number of elements as the output shape specified by 'dims'. Optional.
//@@
ModelTensorReshape reshape = 5;
//@@ .. cpp:var:: string label_filename
//@@
//@@ The label file associated with this output. Should be specified only
//@@ for outputs that represent classifications. Optional.
//@@
string label_filename = 4;
//@@ .. cpp:var:: bool is_shape_tensor
//@@
//@@ Whether or not the output is a shape tensor to the model. This field
//@@ is currently supported only for the TensorRT model. An error will be
//@@ generated if this specification does not comply with underlying
//@@ model.
//@@
bool is_shape_tensor = 6;
}
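// Illustrative sketch (not part of the schema): input and output entries in a
// config.pbtxt for a hypothetical image classifier; the tensor names and the
// labels file are assumptions. The output reshape maps the backend-produced
// shape [ 1, 1000 ] to the declared dims [ 1000 ].
//
//   input [
//     {
//       name: "INPUT0"
//       data_type: TYPE_FP32
//       format: FORMAT_NCHW
//       dims: [ 3, 224, 224 ]
//     }
//   ]
//   output [
//     {
//       name: "OUTPUT0"
//       data_type: TYPE_FP32
//       dims: [ 1000 ]
//       reshape: { shape: [ 1, 1000 ] }
//       label_filename: "labels.txt"
//     }
//   ]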
//@@ .. cpp:var:: message BatchInput
//@@
//@@ A batch input is an additional input that must be added by
//@@ the backend based on all the requests in a batch.
//@@
message BatchInput
{
//@@
//@@ .. cpp:enum:: Kind
//@@
//@@ The kind of the batch input.
//@@
enum Kind {
//@@ .. cpp:enumerator:: Kind::BATCH_ELEMENT_COUNT = 0
//@@
//@@ The element count of the 'source_input' will be added as
//@@ input with shape [1].
//@@
BATCH_ELEMENT_COUNT = 0;
//@@ .. cpp:enumerator:: Kind::BATCH_ACCUMULATED_ELEMENT_COUNT = 1
//@@
//@@ The accumulated element count of the 'source_input' will be
//@@ added as input with shape [1]. For example, if there is a
//@@ batch of two requests, each with 2 elements, an input of value
//@@ 2 will be added to the first request, and an input of value
//@@ 4 will be added to the second request.
//@@
BATCH_ACCUMULATED_ELEMENT_COUNT = 1;
//@@ .. cpp:enumerator::
//@@ Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2
//@@
//@@ The accumulated element count of the 'source_input' will be
//@@ added as input with shape [1], except for the first request
//@@ in the batch. For the first request in the batch, the input
//@@ will have shape [2] where the first element is value 0.
//@@
BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2;
//@@ .. cpp:enumerator:: Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3
//@@
//@@ Among the requests in the batch, the max element count of the
//@@ 'source_input' will be added as input with shape
//@@ [max_element_count] for the first request in the batch.
//@@ For other requests, the input will have shape [0].
//@@ The data of the tensor will be uninitialized.
//@@
BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3;
//@@ .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE = 4
//@@
//@@ Among the requests in the batch, the shape of the
//@@ 'source_input' will be added as input with shape
//@@ [batch_size, len(input_dim)]. For example, if one
//@@ batch-2 input with shape [3, 1] and batch-1 input
//@@ with shape [2, 2] are batched, the batch input will
//@@ have shape [3, 2] and value [ [3, 1], [3, 1], [2, 2]].
//@@
BATCH_ITEM_SHAPE = 4;
//@@ .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE_FLATTEN = 5
//@@
//@@ Among the requests in the batch, the shape of the
//@@ 'source_input' will be added as input with single dimensional
//@@ shape [batch_size * len(input_dim)]. For example, if one
//@@ batch-2 input with shape [3, 1] and batch-1 input
//@@ with shape [2, 2] are batched, the batch input will
//@@ have shape [6] and value [3, 1, 3, 1, 2, 2].
//@@
BATCH_ITEM_SHAPE_FLATTEN = 5;
}
//@@ .. cpp:var:: Kind kind
//@@
//@@ The kind of this batch input.
//@@
Kind kind = 1;
//@@ .. cpp:var:: string target_name (repeated)
//@@
//@@ The name of the model inputs that the backend will create
//@@ for this batch input.
//@@
repeated string target_name = 2;
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The input's datatype. The data type can be TYPE_INT32 or
//@@ TYPE_FP32.
//@@
DataType data_type = 3;
//@@ .. cpp:var:: string source_input (repeated)
//@@
//@@ The backend derives the value for each batch input from one or
//@@ more other inputs. 'source_input' gives the names of those
//@@ inputs.
//@@
repeated string source_input = 4;
}
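// Illustrative sketch (not part of the schema): a batch_input entry in a
// config.pbtxt that asks the backend to add the per-request element count of
// "INPUT0" as an extra tensor; the target and source tensor names are
// assumptions.
//
//   batch_input [
//     {
//       kind: BATCH_ELEMENT_COUNT
//       target_name: "ELEMENT_COUNT"
//       source_input: "INPUT0"
//       data_type: TYPE_INT32
//     }
//   ]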
//@@.. cpp:var:: message BatchOutput
//@@
//@@ A batch output is an output produced by the model that must be handled
//@@ differently by the backend based on all the requests in a batch.
//@@
message BatchOutput
{
//@@
//@@ .. cpp:enum:: Kind
//@@
//@@ The kind of the batch output.
//@@
enum Kind {
//@@ .. cpp:enumerator:: Kind::BATCH_SCATTER_WITH_INPUT_SHAPE = 0
//@@
//@@ The output should be scattered according to the shape of
//@@ 'source_input'. The dynamic dimension of the output will
//@@ be set to the value of the same dimension in the input.
//@@
BATCH_SCATTER_WITH_INPUT_SHAPE = 0;
}
//@@ .. cpp:var:: string target_name (repeated)
//@@
//@@ The name of the outputs to be produced by this batch output
//@@ specification.
//@@
repeated string target_name = 1;
//@@ .. cpp:var:: Kind kind
//@@
//@@ The kind of this batch output.
//@@
Kind kind = 2;
//@@ .. cpp:var:: string source_input (repeated)
//@@
//@@ The backend derives each batch output from one or more inputs.
//@@ 'source_input' gives the names of those inputs.
//@@
repeated string source_input = 3;
}
//@@
//@@.. cpp:var:: message ModelVersionPolicy
//@@
//@@ Policy indicating which versions of a model should be made
//@@ available by the inference server.
//@@
message ModelVersionPolicy
{
//@@ .. cpp:var:: message Latest
//@@
//@@ Serve only the latest version(s) of a model. This is
//@@ the default policy.
//@@
message Latest
{
//@@ .. cpp:var:: uint32 num_versions
//@@
//@@ Serve only the 'num_versions' highest-numbered versions.
//@@ The default value of 'num_versions' is 1, indicating that by
//@@ default only the single highest-numbered version of a
//@@ model will be served.
//@@
uint32 num_versions = 1;
}
//@@ .. cpp:var:: message All
//@@
//@@ Serve all versions of the model.
//@@
message All {}
//@@ .. cpp:var:: message Specific
//@@
//@@ Serve only specific versions of the model.
//@@
message Specific
{
//@@ .. cpp:var:: int64 versions (repeated)
//@@
//@@ The specific versions of the model that will be served.
//@@
repeated int64 versions = 1;
}
//@@ .. cpp:var:: oneof policy_choice
//@@
//@@ Each model must implement only a single version policy. The
//@@ default policy is 'Latest'.
//@@
oneof policy_choice
{
//@@ .. cpp:var:: Latest latest
//@@
//@@ Serve only latest version(s) of the model.
//@@
Latest latest = 1;
//@@ .. cpp:var:: All all
//@@
//@@ Serve all versions of the model.
//@@
All all = 2;
//@@ .. cpp:var:: Specific specific
//@@
//@@ Serve only specific version(s) of the model.
//@@
Specific specific = 3;
}
}
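// Illustrative sketch (not part of the schema): two alternative
// version_policy settings in a config.pbtxt (only one may be used at a time),
// serving either the two newest versions or only versions 1 and 3.
//
//   version_policy: { latest: { num_versions: 2 } }
//   version_policy: { specific: { versions: [ 1, 3 ] } }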
//@@
//@@.. cpp:var:: message ModelOptimizationPolicy
//@@
//@@ Optimization settings for a model. These settings control if/how a
//@@ model is optimized and prioritized by the backend framework when
//@@ it is loaded.
//@@
message ModelOptimizationPolicy
{
//@@
//@@ .. cpp:var:: message Graph
//@@
//@@ Enable generic graph optimization of the model. If not specified
//@@ the framework's default level of optimization is used. Supports
//@@ TensorFlow graphdef and savedmodel and Onnx models. For TensorFlow
//@@ causes XLA to be enabled/disabled for the model. For Onnx defaults
//@@ to enabling all optimizations, -1 enables only basic optimizations,
//@@ +1 enables only basic and extended optimizations.
//@@
message Graph
{
//@@ .. cpp:var:: int32 level
//@@
//@@ The optimization level. Defaults to 0 (zero) if not specified.
//@@
//@@ - -1: Disabled
//@@ - 0: Framework default
//@@ - 1+: Enable optimization level (greater values indicate
//@@ higher optimization levels)
//@@
int32 level = 1;
}
//@@
//@@ .. cpp:enum:: ModelPriority
//@@
//@@ Model priorities. A model will be given scheduling and execution
//@@ preference over models at lower priorities. Current model
//@@ priorities only work for TensorRT models.
//@@
enum ModelPriority {
//@@ .. cpp:enumerator:: ModelPriority::PRIORITY_DEFAULT = 0
//@@
//@@ The default model priority.
//@@
PRIORITY_DEFAULT = 0;
//@@ .. cpp:enumerator:: ModelPriority::PRIORITY_MAX = 1
//@@
//@@ The maximum model priority.
//@@
PRIORITY_MAX = 1;
//@@ .. cpp:enumerator:: ModelPriority::PRIORITY_MIN = 2
//@@
//@@ The minimum model priority.
//@@
PRIORITY_MIN = 2;
}
//@@
//@@ .. cpp:var:: message Cuda
//@@
//@@ CUDA-specific optimization settings.
//@@
message Cuda
{
//@@ .. cpp:var:: message GraphSpec
//@@
//@@ Specification of the CUDA graph to be captured.
//@@
message GraphSpec
{
//@@ .. cpp:var:: message Shape
//@@
//@@ Specification of tensor dimension.
//@@
message Shape
{
//@@ .. cpp:var:: int64 dim (repeated)
//@@
//@@ The dimension.
//@@
repeated int64 dim = 1;
}
//@@ .. cpp:var:: message LowerBound
//@@
//@@ Specification of the lower bound of the CUDA graph to be captured.
//@@
message LowerBound
{
//@@ .. cpp:var:: int32 batch_size
//@@
//@@ The batch size of the CUDA graph. If 'max_batch_size' is 0,
//@@ 'batch_size' must be set to 0. Otherwise, 'batch_size' must
//@@ be set to a value between 1 and 'max_batch_size'.
//@@
int32 batch_size = 1;
//@@ .. cpp:var:: map<string, Shape> input
//@@
//@@ The specification of the inputs. 'Shape' is the shape of
//@@ the input without batching dimension.
//@@
map<string, Shape> input = 2;
}
//@@ .. cpp:var:: int32 batch_size
//@@
//@@ The batch size of the CUDA graph. If 'max_batch_size' is 0,
//@@ 'batch_size' must be set to 0. Otherwise, 'batch_size' must
//@@ be set to a value between 1 and 'max_batch_size'.
//@@
int32 batch_size = 1;
//@@ .. cpp:var:: map<string, Shape> input
//@@
//@@ The specification of the inputs. 'Shape' is the shape of the
//@@ input without batching dimension.
//@@
map<string, Shape> input = 2;
//@@ .. cpp:var:: LowerBound graph_lower_bound
//@@
//@@ Specify the lower bound of the CUDA graph. Optional.
//@@ If specified, the graph can be used for input shapes and
//@@ batch sizes that are in closed interval between the lower
//@@ bound specification and graph specification. For dynamic
//@@ shape model, this allows CUDA graphs to be launched
//@@ frequently without capturing all possible shape combinations.
//@@ However, using graph for shape combinations different from
//@@ the one used for capturing introduces uninitialized data for
//@@ execution and it may distort the inference result if
//@@ the model is sensitive to uninitialized data.
//@@
LowerBound graph_lower_bound = 3;
}
//@@ .. cpp:var:: bool graphs
//@@
//@@ Use CUDA graphs API to capture model operations and execute
//@@ them more efficiently. Default value is false.
//@@ Currently only recognized by TensorRT backend.
//@@
bool graphs = 1;
//@@ .. cpp:var:: bool busy_wait_events
//@@
//@@ Use busy-waiting to synchronize CUDA events to achieve minimum
//@@ latency from event complete to host thread to be notified, with
//@@ the cost of high CPU load. Default value is false.
//@@ Currently only recognized by TensorRT backend.
//@@
bool busy_wait_events = 2;
//@@ .. cpp:var:: GraphSpec graph_spec (repeated)
//@@
//@@ Specification of the CUDA graph to be captured. If not specified
//@@ and 'graphs' is true, the default CUDA graphs will be captured
//@@ based on model settings.
//@@ Currently only recognized by TensorRT backend.
//@@
repeated GraphSpec graph_spec = 3;
//@@ .. cpp:var:: bool output_copy_stream
//@@
//@@ Uses a CUDA stream separate from the inference stream to copy the
//@@ output to host. However, be aware that setting this option to
//@@ true will lead to an increase in the memory consumption of the
//@@ model as Triton will allocate twice as much GPU memory for its
//@@ I/O tensor buffers. Default value is false.
//@@ Currently only recognized by TensorRT backend.
//@@
bool output_copy_stream = 4;
}
//@@
//@@ .. cpp:var:: message ExecutionAccelerators
//@@
//@@ Specify the preferred execution accelerators to be used to execute
//@@ the model. Currently only recognized by ONNX Runtime backend and
//@@ TensorFlow backend.
//@@
//@@ For ONNX Runtime backend, it will deploy the model with the execution
//@@ accelerators by priority, the priority is determined based on the
//@@ order that they are set, i.e. the provider at the front has highest
//@@ priority. Overall, the priority will be in the following order:
//@@ <gpu_execution_accelerator> (if instance is on GPU)
//@@ CUDA Execution Provider (if instance is on GPU)
//@@ <cpu_execution_accelerator>
//@@ Default CPU Execution Provider
//@@
message ExecutionAccelerators
{
//@@
//@@ .. cpp:var:: message Accelerator
//@@
//@@ Specify the accelerator to be used to execute the model.
//@@ Accelerator with the same name may accept different parameters
//@@ depending on the backends.
//@@
message Accelerator
{
//@@ .. cpp:var:: string name
//@@
//@@ The name of the execution accelerator.
//@@
string name = 1;
//@@ .. cpp:var:: map<string, string> parameters
//@@
//@@ Additional parameters used to configure the accelerator.
//@@
map<string, string> parameters = 2;
}
//@@ .. cpp:var:: Accelerator gpu_execution_accelerator (repeated)
//@@
//@@ The preferred execution provider to be used if the model instance
//@@ is deployed on GPU.
//@@
//@@ For ONNX Runtime backend, possible value is "tensorrt" as name,
//@@ and no parameters are required.
//@@
//@@ For TensorFlow backend, possible values are "tensorrt",
//@@ "auto_mixed_precision", "gpu_io".
//@@
//@@ For "tensorrt", the following parameters can be specified:
//@@ "precision_mode": The precision used for optimization.
//@@ Allowed values are "FP32" and "FP16". Default value is "FP32".
//@@
//@@ "max_cached_engines": The maximum number of cached TensorRT
//@@ engines in dynamic TensorRT ops. Default value is 100.
//@@
//@@ "minimum_segment_size": The smallest model subgraph that will
//@@ be considered for optimization by TensorRT. Default value is 3.
//@@
//@@ "max_workspace_size_bytes": The maximum GPU memory the model
//@@ can use temporarily during execution. Default value is 1GB.
//@@
//@@ For "auto_mixed_precision", no parameters are required. If set,
//@@ the model will try to use FP16 for better performance.
//@@ This optimization cannot be set with "tensorrt".
//@@
//@@ For "gpu_io", no parameters are required. If set, the model will
//@@ be executed using TensorFlow Callable API to set input and output
//@@ tensors in GPU memory if possible, which can reduce data transfer
//@@ overhead if the model is used in ensemble. However, the Callable
//@@ object will be created on model creation and it will request all
//@@ outputs for every model execution, which may impact the
//@@ performance if a request does not require all outputs. This
//@@ optimization will only take effect if the model instance is
//@@ created with KIND_GPU.
//@@
repeated Accelerator gpu_execution_accelerator = 1;
//@@ .. cpp:var:: Accelerator cpu_execution_accelerator (repeated)
//@@
//@@ The preferred execution provider to be used if the model instance
//@@ is deployed on CPU.
//@@
//@@ For ONNX Runtime backend, possible value is "openvino" as name,
//@@ and no parameters are required.
//@@
repeated Accelerator cpu_execution_accelerator = 2;
}
//@@
//@@ .. cpp:var:: message PinnedMemoryBuffer
//@@
//@@ Specify whether to use a pinned memory buffer when transferring data
//@@ between non-pinned system memory and GPU memory. Using a pinned
//@@ memory buffer for system from/to GPU transfers will typically provide
//@@ increased performance. For example, in the common use case where the
//@@ request provides inputs and delivers outputs via non-pinned system
//@@ memory, if the model instance accepts GPU IOs, the inputs will be
//@@ processed by two copies: from non-pinned system memory to pinned
//@@ memory, and from pinned memory to GPU memory. Similarly, pinned
//@@ memory will be used for delivering the outputs.
//@@
message PinnedMemoryBuffer
{
//@@ .. cpp:var:: bool enable
//@@
//@@ Use pinned memory buffer. Default is true.
//@@
bool enable = 1;
}
//@@ .. cpp:var:: Graph graph
//@@
//@@ The graph optimization setting for the model. Optional.
//@@
Graph graph = 1;
//@@ .. cpp:var:: ModelPriority priority
//@@
//@@ The priority setting for the model. Optional.
//@@
ModelPriority priority = 2;
//@@ .. cpp:var:: Cuda cuda
//@@
//@@ CUDA-specific optimization settings. Optional.
//@@
Cuda cuda = 3;
//@@ .. cpp:var:: ExecutionAccelerators execution_accelerators
//@@
//@@ The accelerators used for the model. Optional.
//@@
ExecutionAccelerators execution_accelerators = 4;
//@@ .. cpp:var:: PinnedMemoryBuffer input_pinned_memory
//@@
//@@ Use pinned memory buffer when the data transfer for inputs
//@@ is between GPU memory and non-pinned system memory.
//@@ Default is true.
//@@
PinnedMemoryBuffer input_pinned_memory = 5;
//@@ .. cpp:var:: PinnedMemoryBuffer output_pinned_memory
//@@
//@@ Use pinned memory buffer when the data transfer for outputs
//@@ is between GPU memory and non-pinned system memory.
//@@ Default is true.
//@@
PinnedMemoryBuffer output_pinned_memory = 6;
//@@ .. cpp:var:: uint32 gather_kernel_buffer_threshold
//@@
//@@ The backend may use a gather kernel to gather input data if the
//@@ device has direct access to the source buffer and the destination
//@@ buffer. In such case, the gather kernel will be used only if the
//@@ number of buffers to be gathered is greater or equal to
//@@ the specified value. If 0, the gather kernel will be disabled.
//@@ Default value is 0.
//@@ Currently only recognized by TensorRT backend.
//@@
uint32 gather_kernel_buffer_threshold = 7;
//@@ .. cpp:var:: bool eager_batching
//@@
//@@ Start preparing the next batch before the model instance is ready
//@@ for the next inference. This option can be used to overlap the
//@@ batch preparation with model execution, with the trade-off that
//@@ the next batch might be smaller than what it could have been.
//@@ Default value is false.
//@@ Currently only recognized by TensorRT backend.
//@@
bool eager_batching = 8;
}
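// Illustrative sketch (not part of the schema): an optimization entry in a
// config.pbtxt that requests the TensorRT execution accelerator with FP16
// precision for GPU instances, using the "tensorrt" name and
// "precision_mode" parameter described above.
//
//   optimization {
//     execution_accelerators {
//       gpu_execution_accelerator [
//         {
//           name: "tensorrt"
//           parameters { key: "precision_mode" value: "FP16" }
//         }
//       ]
//     }
//   }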
//@@
//@@.. cpp:var:: message ModelQueuePolicy
//@@
//@@ Queue policy for inference requests.
//@@
message ModelQueuePolicy
{
//@@
//@@ .. cpp:enum:: TimeoutAction
//@@
//@@ The action applied to timed-out requests.
//@@
enum TimeoutAction {
//@@ .. cpp:enumerator:: Action::REJECT = 0
//@@
//@@ Reject the request and return error message accordingly.
//@@
REJECT = 0;
//@@ .. cpp:enumerator:: Action::DELAY = 1
//@@
//@@ Delay the request until all other requests at the same
//@@ (or higher) priority levels that have not reached their timeouts
//@@ are processed. A delayed request will eventually be processed,
//@@ but may be delayed indefinitely due to newly arriving requests.
//@@
DELAY = 1;
}
//@@
//@@ .. cpp:var:: TimeoutAction timeout_action
//@@
//@@ The action applied to timed-out request.
//@@ The default action is REJECT.
//@@
TimeoutAction timeout_action = 1;
//@@
//@@ .. cpp:var:: uint64 default_timeout_microseconds
//@@
//@@ The default timeout for every request, in microseconds.
//@@ The default value is 0 which indicates that no timeout is set.
//@@
uint64 default_timeout_microseconds = 2;
//@@
//@@ .. cpp:var:: bool allow_timeout_override
//@@
//@@ Whether individual requests can override the default timeout value.
//@@ When true, individual requests can set a timeout that is less than
//@@ the default timeout value but may not increase the timeout.
//@@ The default value is false.
//@@
bool allow_timeout_override = 3;
//@@
//@@ .. cpp:var:: uint32 max_queue_size
//@@
//@@ The maximum queue size for holding requests. A request will be
//@@ rejected immediately if it can't be enqueued because the queue is
//@@ full. The default value is 0 which indicates that no maximum
//@@ queue size is enforced.
//@@
uint32 max_queue_size = 4;
}
//@@
//@@.. cpp:var:: message ModelDynamicBatching
//@@
//@@ Dynamic batching configuration. These settings control how dynamic
//@@ batching operates for the model.
//@@
message ModelDynamicBatching
{
//@@ .. cpp:var:: int32 preferred_batch_size (repeated)
//@@
//@@ Preferred batch sizes for dynamic batching. If a batch of one of
//@@ these sizes can be formed it will be executed immediately. If
//@@ not specified a preferred batch size will be chosen automatically
//@@ based on model and GPU characteristics.
//@@
repeated int32 preferred_batch_size = 1;
//@@ .. cpp:var:: uint64 max_queue_delay_microseconds
//@@
//@@ The maximum time, in microseconds, a request will be delayed in
//@@ the scheduling queue to wait for additional requests for
//@@ batching. Default is 0.
//@@
uint64 max_queue_delay_microseconds = 2;
//@@ .. cpp:var:: bool preserve_ordering
//@@
//@@ Should the dynamic batcher preserve the ordering of responses to
//@@ match the order of requests received by the scheduler. Default is
//@@ false. If true, the responses will be returned in the same order as
//@@ the order of requests sent to the scheduler. If false, the responses
//@@ may be returned in arbitrary order. This option is specifically
//@@ needed when a sequence of related inference requests (i.e. inference
//@@ requests with the same correlation ID) are sent to the dynamic
//@@ batcher to ensure that the sequence responses are in the correct
//@@ order.
//@@
bool preserve_ordering = 3;
//@@ .. cpp:var:: uint64 priority_levels
//@@
//@@ The number of priority levels to be enabled for the model,
//@@ the priority level starts from 1 and 1 is the highest priority.
//@@ Requests are handled in priority order with all priority 1 requests
//@@ processed before priority 2, all priority 2 requests processed before
//@@ priority 3, etc. Requests with the same priority level will be
//@@ handled in the order that they are received.
//@@
uint64 priority_levels = 4;
//@@ .. cpp:var:: uint64 default_priority_level
//@@
//@@ The priority level used for requests that don't specify their
//@@ priority. The value must be in the range [ 1, 'priority_levels' ].
//@@
uint64 default_priority_level = 5;
//@@ .. cpp:var:: ModelQueuePolicy default_queue_policy
//@@
//@@ The default queue policy used for requests that don't require
//@@ priority handling and requests that specify priority levels where
//@@ there is no specific policy given. If not specified, a policy with
//@@ default field values will be used.
//@@
ModelQueuePolicy default_queue_policy = 6;
//@@ .. cpp:var:: map<uint64, ModelQueuePolicy> priority_queue_policy
//@@
//@@ Specify the queue policy for the priority level. The default queue
//@@ policy will be used if a priority level doesn't specify a queue
//@@ policy.
//@@
map<uint64, ModelQueuePolicy> priority_queue_policy = 7;
}
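// Illustrative sketch (not part of the schema): a dynamic_batching entry in a
// config.pbtxt that prefers batches of 4 or 8, waits at most 100 microseconds
// to form them, and rejects requests once 16 are queued or after a 5 second
// timeout. The values are examples only.
//
//   dynamic_batching {
//     preferred_batch_size: [ 4, 8 ]
//     max_queue_delay_microseconds: 100
//     default_queue_policy {
//       timeout_action: REJECT
//       default_timeout_microseconds: 5000000
//       max_queue_size: 16
//     }
//   }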
//@@
//@@.. cpp:var:: message ModelSequenceBatching
//@@
//@@ Sequence batching configuration. These settings control how sequence
//@@ batching operates for the model.
//@@
message ModelSequenceBatching
{
//@@ .. cpp:var:: message Control
//@@
//@@ A control is a signal that the sequence batcher uses to
//@@ communicate with a backend.
//@@
message Control
{
//@@
//@@ .. cpp:enum:: Kind
//@@
//@@ The kind of the control.
//@@
enum Kind {
//@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_START = 0
//@@
//@@ A new sequence is/is-not starting. If true a sequence is
//@@ starting, if false a sequence is continuing. Must
//@@ specify either int32_false_true, fp32_false_true or
//@@ bool_false_true for this control. This control is optional.
//@@
CONTROL_SEQUENCE_START = 0;
//@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_READY = 1
//@@
//@@ A sequence is/is-not ready for inference. If true the
//@@ input tensor data is valid and should be used. If false
//@@ the input tensor data is invalid and inferencing should
//@@ be "skipped". Must specify either int32_false_true,
//@@ fp32_false_true or bool_false_true for this control. This
//@@ control is optional.
//@@
CONTROL_SEQUENCE_READY = 1;
//@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_END = 2
//@@
//@@ A sequence is/is-not ending. If true a sequence is
//@@ ending, if false a sequence is continuing. Must specify
//@@ either int32_false_true, fp32_false_true or bool_false_true
//@@ for this control. This control is optional.
//@@
CONTROL_SEQUENCE_END = 2;
//@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_CORRID = 3
//@@
//@@ The correlation ID of the sequence. The correlation ID
//@@ is a uint64_t value that is communicated in whole or
//@@ in part by the tensor. The tensor's datatype must be
//@@ specified by data_type and must be TYPE_UINT64, TYPE_INT64,
//@@ TYPE_UINT32 or TYPE_INT32. If a 32-bit datatype is specified
//@@ the correlation ID will be truncated to the low-order 32
//@@ bits. This control is optional.
//@@
CONTROL_SEQUENCE_CORRID = 3;
}
//@@ .. cpp:var:: Kind kind
//@@
//@@ The kind of this control.
//@@
Kind kind = 1;
//@@ .. cpp:var:: int32 int32_false_true (repeated)
//@@
//@@ The control's true and false setting is indicated by setting
//@@ a value in an int32 tensor. The tensor must be a
//@@ 1-dimensional tensor with size equal to the batch size of
//@@ the request. 'int32_false_true' must have two entries: the
//@@ first the false value and the second the true value.
//@@
repeated int32 int32_false_true = 2;
//@@ .. cpp:var:: float fp32_false_true (repeated)
//@@
//@@ The control's true and false setting is indicated by setting
//@@ a value in a fp32 tensor. The tensor must be a
//@@ 1-dimensional tensor with size equal to the batch size of
//@@ the request. 'fp32_false_true' must have two entries: the
//@@ first the false value and the second the true value.
//@@
repeated float fp32_false_true = 3;
//@@ .. cpp:var:: bool bool_false_true (repeated)
//@@
//@@ The control's true and false setting is indicated by setting
//@@ a value in a bool tensor. The tensor must be a
//@@ 1-dimensional tensor with size equal to the batch size of
//@@ the request. 'bool_false_true' must have two entries: the
//@@ first the false value and the second the true value.
//@@
repeated bool bool_false_true = 5;
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The control's datatype.
//@@
DataType data_type = 4;
}
//@@ .. cpp:var:: message ControlInput
//@@
//@@ The sequence control values to communicate by a model input.
//@@
message ControlInput
{
//@@ .. cpp:var:: string name
//@@
//@@ The name of the model input.
//@@
string name = 1;
//@@ .. cpp:var:: Control control (repeated)
//@@
//@@ The control value(s) that should be communicated to the
//@@ model using this model input.
//@@
repeated Control control = 2;
}
//@@
//@@ .. cpp:var:: message InitialState
//@@
//@@ Settings used to initialize data for implicit state.
//@@
message InitialState
{
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The data-type of the state.
//@@
DataType data_type = 1;
//@@ .. cpp:var:: int64 dims (repeated)
//@@
//@@ The shape of the state tensor, not including the batch
//@@ dimension.
//@@
repeated int64 dims = 2;
//@@ .. cpp:var:: oneof state_data
//@@
//@@ Specify how the initial state data is generated.
//@@
oneof state_data
{
//@@
//@@ .. cpp:var:: bool zero_data
//@@
//@@ The identifier for using zeros as initial state data.
//@@ Note that the value of 'zero_data' will not be checked,
//@@ instead, zero data will be used as long as the field is set.
//@@
bool zero_data = 3;
//@@ .. cpp:var:: string data_file
//@@
//@@ The file whose content will be used as the initial data for
//@@ the state in row-major order. The file must be provided in
//@@ sub-directory 'initial_state' under the model directory.
//@@
string data_file = 4;
}
//@@ .. cpp:var:: string name
//@@
//@@ The name of the state initialization.
//@@
string name = 5;
}
//@@ .. cpp:var:: message State
//@@
//@@ An input / output pair of tensors that carry state for the sequence.
//@@
message State
{
//@@ .. cpp:var:: string input_name
//@@
//@@ The name of the model state input.
//@@
string input_name = 1;
//@@ .. cpp:var:: string output_name
//@@
//@@ The name of the model state output.
//@@
string output_name = 2;
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The data-type of the state.
//@@
DataType data_type = 3;
//@@ .. cpp:var:: int64 dims (repeated)
//@@
//@@ The dimensions/shape of the state tensor.
//@@
repeated int64 dims = 4;
//@@ .. cpp:var:: InitialState initial_state (repeated)
//@@
//@@ The optional field to specify the initial state for the model.
//@@
repeated InitialState initial_state = 5;
}
//@@ .. cpp:var:: message StrategyDirect
//@@
//@@ The sequence batcher uses a specific, unique batch
//@@ slot for each sequence. All inference requests in a
//@@ sequence are directed to the same batch slot in the same
//@@ model instance over the lifetime of the sequence. This
//@@ is the default strategy.
//@@
message StrategyDirect
{
//@@ .. cpp:var:: uint64 max_queue_delay_microseconds
//@@
//@@ The maximum time, in microseconds, a candidate request
//@@ will be delayed in the sequence batch scheduling queue to
//@@ wait for additional requests for batching. Default is 0.
//@@
uint64 max_queue_delay_microseconds = 1;
//@@ .. cpp:var:: float minimum_slot_utilization
//@@
//@@ The minimum slot utilization that must be satisfied to
//@@ execute the batch before 'max_queue_delay_microseconds' expires.
//@@ For example, a value of 0.5 indicates that the batch should be
//@@ executed as soon as 50% or more of the slots are ready even if
//@@ the 'max_queue_delay_microseconds' timeout has not expired.
//@@ The default is 0.0, indicating that a batch will be executed
//@@ before 'max_queue_delay_microseconds' timeout expires if at least
//@@ one batch slot is ready. 'max_queue_delay_microseconds' will be
//@@ ignored unless minimum_slot_utilization is set to a non-zero
//@@ value.
//@@
float minimum_slot_utilization = 2;
}
//@@ .. cpp:var:: message StrategyOldest
//@@
//@@ The sequence batcher maintains up to 'max_candidate_sequences'
//@@ candidate sequences. 'max_candidate_sequences' can be greater
//@@ than the model's 'max_batch_size'. For inferencing the batcher
//@@ chooses from the candidate sequences up to 'max_batch_size'
//@@ inference requests. Requests are chosen in an oldest-first
//@@ manner across all candidate sequences. A given sequence is
//@@ not guaranteed to be assigned to the same batch slot for
//@@ all inference requests of that sequence.
//@@
message StrategyOldest
{
//@@ .. cpp:var:: int32 max_candidate_sequences
//@@
//@@ Maximum number of candidate sequences that the batcher
//@@ maintains. Excess sequences are kept in an ordered backlog
//@@ and become candidates when existing candidate sequences
//@@ complete.
//@@
int32 max_candidate_sequences = 1;
//@@ .. cpp:var:: int32 preferred_batch_size (repeated)
//@@
//@@ Preferred batch sizes for dynamic batching of candidate
//@@ sequences. If a batch of one of these sizes can be formed
//@@ it will be executed immediately. If not specified a
//@@ preferred batch size will be chosen automatically
//@@ based on model and GPU characteristics.
//@@
repeated int32 preferred_batch_size = 2;
//@@ .. cpp:var:: uint64 max_queue_delay_microseconds
//@@
//@@ The maximum time, in microseconds, a candidate request
//@@ will be delayed in the dynamic batch scheduling queue to
//@@ wait for additional requests for batching. Default is 0.
//@@
uint64 max_queue_delay_microseconds = 3;
}
//@@ .. cpp:var:: oneof strategy_choice
//@@
//@@ The strategy used by the sequence batcher. Default strategy
//@@ is 'direct'.
//@@
oneof strategy_choice
{
//@@ .. cpp:var:: StrategyDirect direct
//@@
//@@ StrategyDirect scheduling strategy.
//@@
StrategyDirect direct = 3;
//@@ .. cpp:var:: StrategyOldest oldest
//@@
//@@ StrategyOldest scheduling strategy.
//@@
StrategyOldest oldest = 4;
}
//@@ .. cpp:var:: uint64 max_sequence_idle_microseconds
//@@
//@@ The maximum time, in microseconds, that a sequence is allowed to
//@@ be idle before it is aborted. The inference server considers a
//@@ sequence idle when it does not have any inference request queued
//@@ for the sequence. If this limit is exceeded, the inference server
//@@ will free the sequence slot allocated by the sequence and make it
//@@ available for another sequence. If not specified (or specified as
//@@ zero) a default value of 1000000 (1 second) is used.
//@@
uint64 max_sequence_idle_microseconds = 1;
//@@ .. cpp:var:: ControlInput control_input (repeated)
//@@
//@@ The model input(s) that the server should use to communicate
//@@ sequence start, stop, ready and similar control values to the
//@@ model.
//@@
repeated ControlInput control_input = 2;
//@@ .. cpp:var:: State state (repeated)
//@@
//@@ The optional state that can be stored in Triton for performing
//@@ inference requests on a sequence. Each sequence holds an implicit
//@@ state local to itself. The output state tensor provided by the
//@@ model in 'output_name' field of the current inference request will
//@@ be transferred as an input tensor named 'input_name' in the next
//@@ request of the same sequence. The input state of the first request
//@@ in the sequence contains garbage data.
//@@
repeated State state = 5;
}
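// Illustrative sketch (not part of the schema): a sequence_batching entry in
// a config.pbtxt using the Oldest strategy and two control inputs; the model
// input names "START" and "READY" are assumptions about the model.
//
//   sequence_batching {
//     max_sequence_idle_microseconds: 5000000
//     oldest { max_candidate_sequences: 4 }
//     control_input [
//       {
//         name: "START"
//         control [ { kind: CONTROL_SEQUENCE_START, int32_false_true: [ 0, 1 ] } ]
//       },
//       {
//         name: "READY"
//         control [ { kind: CONTROL_SEQUENCE_READY, int32_false_true: [ 0, 1 ] } ]
//       }
//     ]
//   }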
//@@
//@@.. cpp:var:: message ModelEnsembling
//@@
//@@ Model ensembling configuration. These settings specify the models that
//@@ compose the ensemble and how data flows between the models.
//@@
message ModelEnsembling
{
//@@ .. cpp:var:: message Step
//@@
//@@ Each step specifies a model included in the ensemble,
//@@ maps ensemble tensor names to the model input tensors,
//@@ and maps model output tensors to ensemble tensor names
//@@
message Step
{
//@@ .. cpp:var:: string model_name
//@@
//@@ The name of the model to execute for this step of the ensemble.
//@@
string model_name = 1;
//@@ .. cpp:var:: int64 model_version
//@@
//@@ The version of the model to use for inference. If -1
//@@ the latest/most-recent version of the model is used.
//@@
int64 model_version = 2;
//@@ .. cpp:var:: map<string,string> input_map
//@@
//@@ Map from name of an input tensor on this step's model to ensemble
//@@ tensor name. The ensemble tensor must have the same data type and
//@@ shape as the model input. Each model input must be assigned to
//@@ one ensemble tensor, but the same ensemble tensor can be assigned
//@@ to multiple model inputs.
//@@
map<string, string> input_map = 3;
//@@ .. cpp:var:: map<string,string> output_map
//@@
//@@ Map from name of an output tensor on this step's model to ensemble
//@@ tensor name. The data type and shape of the ensemble tensor will
//@@ be inferred from the model output. It is optional to assign all
//@@ model outputs to ensemble tensors. One ensemble tensor name
//@@ can appear in an output map only once.
//@@
map<string, string> output_map = 4;
//@@ .. cpp:var:: string model_namespace
//@@
//@@ [RESERVED] currently this field is reserved for internal use, users
//@@ must not set any value to this field to avoid unexpected behavior.
//@@
string model_namespace = 5;
}
//@@ .. cpp:var:: Step step (repeated)
//@@
//@@ The models and the input / output mappings used within the ensemble.
//@@
repeated Step step = 1;
}
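// Illustrative sketch (not part of the schema): an ensemble_scheduling entry
// in a config.pbtxt chaining two hypothetical models, "preprocess" and
// "classifier"; all model and tensor names are assumptions. Each input_map
// key is a model input name and its value is an ensemble tensor name;
// output_map goes in the reverse direction.
//
//   ensemble_scheduling {
//     step [
//       {
//         model_name: "preprocess"
//         model_version: -1
//         input_map { key: "RAW" value: "IMAGE" }
//         output_map { key: "PREPROCESSED" value: "preprocessed_image" }
//       },
//       {
//         model_name: "classifier"
//         model_version: -1
//         input_map { key: "INPUT0" value: "preprocessed_image" }
//         output_map { key: "OUTPUT0" value: "CLASSIFICATION" }
//       }
//     ]
//   }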
//@@
//@@.. cpp:var:: message ModelParameter
//@@
//@@ A model parameter.
//@@
message ModelParameter
{
//@@ .. cpp:var:: string string_value
//@@
//@@ The string value of the parameter.
//@@
string string_value = 1;
}
//@@
//@@.. cpp:var:: message ModelWarmup
//@@
//@@ Settings used to construct the request sample for model warmup.
//@@
message ModelWarmup
{
//@@
//@@ .. cpp:var:: message Input
//@@
//@@ Meta data associated with an input.
//@@
message Input
{
//@@ .. cpp:var:: DataType data_type
//@@
//@@ The data-type of the input.
//@@
DataType data_type = 1;
//@@ .. cpp:var:: int64 dims (repeated)
//@@
//@@ The shape of the input tensor, not including the batch dimension.
//@@
repeated int64 dims = 2;
//@@ .. cpp:var:: oneof input_data_type
//@@
//@@ Specify how the input data is generated. If the input has STRING
//@@ data type and 'random_data' is set, the data generation will fall
//@@ back to 'zero_data'.
//@@
oneof input_data_type
{
//@@
//@@ .. cpp:var:: bool zero_data
//@@
//@@ The identifier for using zeros as input data. Note that the
//@@ value of 'zero_data' will not be checked, instead, zero data
//@@ will be used as long as the field is set.
//@@
bool zero_data = 3;
//@@
//@@ .. cpp:var:: bool random_data
//@@
//@@ The identifier for using random data as input data. Note that
//@@ the value of 'random_data' will not be checked, instead,
//@@ random data will be used as long as the field is set.
//@@
bool random_data = 4;
//@@ .. cpp:var:: string input_data_file
//@@
//@@ The file whose content will be used as raw input data in
//@@ row-major order. The file must be provided in a sub-directory
//@@ 'warmup' under the model directory. The file contents should be
//@@ in binary format. For TYPE_STRING data-type, an element is
//@@ represented by a 4-byte unsigned integer giving the length
//@@ followed by the actual bytes.
//@@
string input_data_file = 5;
}
}
//@@ .. cpp:var:: string name
//@@
//@@ The name of the request sample.
//@@
string name = 1;
//@@ .. cpp:var:: uint32 batch_size
//@@
//@@ The batch size of the inference request. This must be >= 1. For
//@@ models that don't support batching, batch_size must be 1. If
//@@ batch_size > 1, the 'inputs' specified below will be duplicated to
//@@ match the batch size requested.
//@@
uint32 batch_size = 2;
//@@ .. cpp:var:: map<string, Input> inputs
//@@
//@@ The warmup meta data associated with every model input, including
//@@ control tensors.
//@@
map<string, Input> inputs = 3;
//@@ .. cpp:var:: uint32 count
//@@
//@@ The number of iterations that this warmup sample will be executed.
//@@ For example, if this field is set to 2, 2 model executions using this
//@@ sample will be scheduled for warmup. Default value is 0 which
//@@ indicates that this sample will be used only once.
//@@ Note that for sequence models, 'count' may not work well
//@@ because the model often expects a valid sequence of requests which
//@@ should be represented by a series of warmup samples. 'count > 1'
//@@ essentially "resends" one of the samples, which may invalidate the
//@@ sequence and result in unexpected warmup failure.
//@@
uint32 count = 4;
}
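// Illustrative sketch (not part of the schema): a model_warmup entry in a
// config.pbtxt that runs two zero-data warmup executions against a
// hypothetical input named "INPUT0".
//
//   model_warmup [
//     {
//       name: "zero sample"
//       batch_size: 1
//       inputs {
//         key: "INPUT0"
//         value: { data_type: TYPE_FP32  dims: [ 3, 224, 224 ]  zero_data: true }
//       }
//       count: 2
//     }
//   ]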
//@@
//@@ .. cpp:var:: message ModelOperations
//@@
//@@ The metadata of libraries providing custom operations for this model.
//@@
message ModelOperations
{
//@@ .. cpp:var:: string op_library_filename (repeated)
//@@
//@@ Optional paths of the libraries providing custom operations for
//@@ this model. Valid only for ONNX models.
//@@
repeated string op_library_filename = 1;
}
//@@
//@@ .. cpp:var:: message ModelTransactionPolicy
//@@
//@@ The specification that describes the nature of transactions
//@@ to be expected from the model.
//@@
message ModelTransactionPolicy
{
//@@ .. cpp:var:: bool decoupled
//@@
//@@ Indicates whether responses generated by the model are decoupled with
//@@ the requests issued to it, which means the number of responses
//@@ generated by model may differ from number of requests issued, and
//@@ that the responses may be out of order relative to the order of
//@@ requests. The default is false, which means the model will generate
//@@ exactly one response for each request.
//@@
bool decoupled = 1;
}
//@@
//@@.. cpp:var:: message ModelRepositoryAgents
//@@
//@@ The repository agents for the model.
//@@
message ModelRepositoryAgents
{
//@@
//@@ .. cpp:var:: message Agent
//@@
//@@ A repository agent that should be invoked for the specified
//@@ repository actions for this model.
//@@
message Agent
{
//@@ .. cpp:var:: string name
//@@
//@@ The name of the agent.
//@@
string name = 1;
//@@ .. cpp:var:: map<string, string> parameters
//@@
//@@ The parameters for the agent.
//@@
map<string, string> parameters = 2;
}
//@@
//@@ .. cpp:var:: Agent agents (repeated)
//@@
//@@ The ordered list of agents for the model. These agents will be
//@@ invoked in order to respond to repository actions occurring for the
//@@ model.
//@@
repeated Agent agents = 1;
}
//@@
//@@.. cpp:var:: message ModelResponseCache
//@@
//@@ The response cache setting for the model.
//@@
message ModelResponseCache
{
//@@
//@@ .. cpp::var:: bool enable
//@@
//@@ Whether or not to use the response cache for the model. If true,
//@@ the responses from the model are cached and when an identical
//@@ request is encountered, instead of going through the model
//@@ execution, the response from the cache is used. By default, the
//@@ response cache is disabled for all models.
//@@
bool enable = 1;
}
//@@
//@@.. cpp:var:: message ModelConfig
//@@
//@@ A model configuration.
//@@
message ModelConfig
{
//@@ .. cpp:var:: string name
//@@
//@@ The name of the model.
//@@
string name = 1;
//@@ .. cpp:var:: string platform
//@@
//@@ Additional backend-specific configuration for the model.
//@@ Please refer to the backend documentation on whether this field
//@@ should be specified.
//@@
string platform = 2;
//@@ .. cpp:var:: string backend
//@@
//@@ The backend used by the model.
//@@
string backend = 17;
//@@ .. cpp:var:: ModelVersionPolicy version_policy
//@@
//@@ Policy indicating which version(s) of the model will be served.
//@@
ModelVersionPolicy version_policy = 3;
//@@ .. cpp:var:: int32 max_batch_size
//@@
//@@ Maximum batch size allowed for inference. This can only decrease
//@@ what is allowed by the model itself. A max_batch_size value of 0
//@@ indicates that batching is not allowed for the model and the
//@@ dimension/shape of the input and output tensors must exactly
//@@ match what is specified in the input and output configuration. A
//@@ max_batch_size value > 0 indicates that batching is allowed and
//@@ so the model expects the input tensors to have an additional
//@@ initial dimension for the batching that is not specified in the
//@@ input (for example, if the model supports batched inputs of
//@@ 2-dimensional tensors then the model configuration will specify
//@@ the input shape as [ X, Y ] but the model will expect the actual
//@@ input tensors to have shape [ N, X, Y ]). For max_batch_size > 0
//@@ returned outputs will also have an additional initial dimension
//@@ for the batch.
//@@
int32 max_batch_size = 4;
//@@ .. cpp:var:: ModelInput input (repeated)
//@@
//@@ The inputs required by the model.
//@@
repeated ModelInput input = 5;
//@@ .. cpp:var:: ModelOutput output (repeated)
//@@
//@@ The outputs produced by the model.
//@@
repeated ModelOutput output = 6;
//@@ .. cpp:var:: BatchInput batch_input (repeated)
//@@
//@@ The model input(s) that the server should use to communicate
//@@ batch related values to the model.
//@@
repeated BatchInput batch_input = 20;
//@@ .. cpp:var:: BatchOutput batch_output (repeated)
//@@
//@@ The outputs produced by the model that require special handling
//@@ by the model backend.
//@@
repeated BatchOutput batch_output = 21;
//@@ .. cpp:var:: ModelOptimizationPolicy optimization
//@@
//@@ Optimization configuration for the model. If not specified
//@@ then default optimization policy is used.
//@@
ModelOptimizationPolicy optimization = 12;
//@@ .. cpp:var:: oneof scheduling_choice
//@@
//@@ The scheduling policy for the model. If not specified the
//@@ default scheduling policy is used for the model. The default
//@@ policy is to execute each inference request independently.
//@@
oneof scheduling_choice
{
//@@ .. cpp:var:: ModelDynamicBatching dynamic_batching
//@@
//@@ If specified, enables the dynamic-batching scheduling
//@@ policy. With dynamic-batching the scheduler may group
//@@ together independent requests into a single batch to
//@@ improve inference throughput.
//@@
ModelDynamicBatching dynamic_batching = 11;
//@@ .. cpp:var:: ModelSequenceBatching sequence_batching
//@@
//@@ If specified, enables the sequence-batching scheduling
//@@ policy. With sequence-batching, inference requests
//@@ with the same correlation ID are routed to the same
//@@ model instance. Multiple sequences of inference requests
//@@ may be batched together into a single batch to
//@@ improve inference throughput.
//@@
ModelSequenceBatching sequence_batching = 13;
//@@ .. cpp:var:: ModelEnsembling ensemble_scheduling
//@@
//@@ If specified, enables the model-ensembling scheduling
//@@ policy. With model-ensembling, inference requests
//@@ will be processed according to the specification, such as an
//@@ execution sequence of models. The input specified in this model
//@@ config will be the input for the ensemble, and the output
//@@ specified will be the output of the ensemble.
//@@
ModelEnsembling ensemble_scheduling = 15;
}
//@@ .. cpp:var:: ModelInstanceGroup instance_group (repeated)
//@@
//@@ Instances of this model. If not specified, one instance
//@@ of the model will be instantiated on each available GPU.
//@@
repeated ModelInstanceGroup instance_group = 7;
//@@ .. cpp:var:: string default_model_filename
//@@
//@@ Optional filename of the model file to use if a
//@@ compute-capability specific model is not specified in
//@@ :cpp:var:`cc_model_filenames`. If not specified the default name
//@@ is 'model.graphdef', 'model.savedmodel', 'model.plan' or
//@@ 'model.pt' depending on the model type.
//@@
string default_model_filename = 8;
//@@ .. cpp:var:: map<string,string> cc_model_filenames
//@@
//@@ Optional map from CUDA compute capability to the filename of
//@@ the model that supports that compute capability. The filename
//@@ refers to a file within the model version directory.
//@@
map<string, string> cc_model_filenames = 9;
//@@ .. cpp:var:: map<string,string> metric_tags
//@@
//@@ Optional metric tags. User-specific key-value pairs for metrics
//@@ reported for this model. These tags are applied to the metrics
//@@ reported on the HTTP metrics port.
//@@
map<string, string> metric_tags = 10;
//@@ .. cpp:var:: map<string,ModelParameter> parameters
//@@
//@@ Optional model parameters. User-specified parameter values.
//@@
map<string, ModelParameter> parameters = 14;
//@@ .. cpp:var:: ModelWarmup model_warmup (repeated)
//@@
//@@ Warmup setting of this model. If specified, all instances
//@@ will be run with the request samples in sequence before
//@@ serving the model.
//@@ This field can only be specified if the model is not an ensemble
//@@ model.
//@@
repeated ModelWarmup model_warmup = 16;
//@@ .. cpp:var:: ModelOperations model_operations
//@@
//@@ Optional metadata of the libraries providing custom operations for
//@@ this model.
//@@
ModelOperations model_operations = 18;
//@@ .. cpp:var:: ModelTransactionPolicy model_transaction_policy
//@@
//@@ Optional specification that describes the nature of transactions
//@@ to be expected from the model.
//@@
ModelTransactionPolicy model_transaction_policy = 19;
//@@ .. cpp:var:: ModelRepositoryAgents model_repository_agents
//@@
//@@ Optional specification of the agent(s) that should be invoked
//@@ when repository actions are performed for this model.
//@@
ModelRepositoryAgents model_repository_agents = 23;
//@@ .. cpp:var:: ModelResponseCache response_cache
//@@
//@@ Optional setting for utilizing the response cache for this
//@@ model.
//@@
ModelResponseCache response_cache = 24;
}
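// Illustrative sketch (not part of the schema): a minimal config.pbtxt that a
// ModelConfig message of this form could represent. The model name, backend,
// tensor names and shapes are assumptions.
//
//   name: "densenet_onnx"
//   backend: "onnxruntime"
//   max_batch_size: 8
//   input [ { name: "data_0", data_type: TYPE_FP32, dims: [ 3, 224, 224 ] } ]
//   output [ { name: "fc6_1", data_type: TYPE_FP32, dims: [ 1000 ] } ]
//   instance_group [ { count: 1, kind: KIND_GPU } ]
//   dynamic_batching { max_queue_delay_microseconds: 100 }
//   response_cache { enable: true }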