mirror of https://github.com/dragonflyoss/api.git
// Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2018, TensorFlow Authors. All rights reserved.

syntax = "proto3";

package inference.v1;

option go_package = "d7y.io/api/pkg/apis/inference/v1;inference";

//@@.. cpp:namespace:: inference

//@@
//@@.. cpp:enum:: DataType
//@@
//@@   Data types supported for input and output tensors.
//@@
enum DataType {
  //@@  .. cpp:enumerator:: DataType::INVALID = 0
  TYPE_INVALID = 0;

  //@@  .. cpp:enumerator:: DataType::BOOL = 1
  TYPE_BOOL = 1;

  //@@  .. cpp:enumerator:: DataType::UINT8 = 2
  TYPE_UINT8 = 2;
  //@@  .. cpp:enumerator:: DataType::UINT16 = 3
  TYPE_UINT16 = 3;
  //@@  .. cpp:enumerator:: DataType::UINT32 = 4
  TYPE_UINT32 = 4;
  //@@  .. cpp:enumerator:: DataType::UINT64 = 5
  TYPE_UINT64 = 5;

  //@@  .. cpp:enumerator:: DataType::INT8 = 6
  TYPE_INT8 = 6;
  //@@  .. cpp:enumerator:: DataType::INT16 = 7
  TYPE_INT16 = 7;
  //@@  .. cpp:enumerator:: DataType::INT32 = 8
  TYPE_INT32 = 8;
  //@@  .. cpp:enumerator:: DataType::INT64 = 9
  TYPE_INT64 = 9;

  //@@  .. cpp:enumerator:: DataType::FP16 = 10
  TYPE_FP16 = 10;
  //@@  .. cpp:enumerator:: DataType::FP32 = 11
  TYPE_FP32 = 11;
  //@@  .. cpp:enumerator:: DataType::FP64 = 12
  TYPE_FP64 = 12;

  //@@  .. cpp:enumerator:: DataType::STRING = 13
  TYPE_STRING = 13;

  //@@  .. cpp:enumerator:: DataType::BF16 = 14
  TYPE_BF16 = 14;
}

//@@
//@@ .. cpp:var:: message ModelRateLimiter
//@@
//@@    The specifications required by the rate limiter to properly
//@@    schedule the inference requests across the different models
//@@    and their instances.
//@@
message ModelRateLimiter
{
  //@@  .. cpp:var:: message Resource
  //@@
  //@@     The resource property.
  //@@
  message Resource
  {
    //@@  .. cpp:var:: string name
    //@@
    //@@     The name associated with the resource.
    //@@
    string name = 1;

    //@@  .. cpp:var:: bool global
    //@@
    //@@     Whether or not the resource is global. If true, the resource
    //@@     is assumed to be shared among the devices; otherwise the
    //@@     specified count of the resource is assumed for each device
    //@@     associated with the instance.
    //@@
    bool global = 2;

    //@@  .. cpp:var:: uint32 count
    //@@
    //@@     The number of resources required for the execution of the model
    //@@     instance.
    //@@
    uint32 count = 3;
  }

  //@@  .. cpp:var:: Resource resources (repeated)
  //@@
  //@@     The resources required to execute the request on a model instance.
  //@@     Resources are just names with a corresponding count. The execution
  //@@     of the instance will be blocked until the specified resources are
  //@@     available. By default an instance uses no rate-limiter resources.
  //@@
  repeated Resource resources = 1;

  //@@  .. cpp:var:: uint32 priority
  //@@
  //@@     The optional weighting value to be used for prioritizing across
  //@@     instances. An instance with priority 2 will be given 1/2 the
  //@@     number of scheduling chances as an instance_group with priority
  //@@     1. The default priority is 1. A priority of 0 will be
  //@@     treated as priority 1.
  //@@
  uint32 priority = 2;
}

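// A minimal illustrative sketch (not part of the schema): in protobuf
// text format, a rate limiter spec attached to an instance group via the
// 'rate_limiter' field of ModelInstanceGroup (defined below) could look
// like the following; the resource names "R1" and "R2" are hypothetical.
//
//   rate_limiter {
//     resources [
//       { name: "R1" count: 4 },
//       { name: "R2" global: true count: 2 }
//     ]
//     priority: 2
//   }
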
//@@
//@@.. cpp:var:: message ModelInstanceGroup
//@@
//@@   A group of one or more instances of a model and resources made
//@@   available for those instances.
//@@
message ModelInstanceGroup
{
  //@@
  //@@  .. cpp:enum:: Kind
  //@@
  //@@     Kind of this instance group.
  //@@
  enum Kind {
    //@@  .. cpp:enumerator:: Kind::KIND_AUTO = 0
    //@@
    //@@     This instance group represents instances that can run on either
    //@@     CPU or GPU. If all GPUs listed in 'gpus' are available then
    //@@     instances will be created on GPU(s), otherwise instances will
    //@@     be created on CPU.
    //@@
    KIND_AUTO = 0;

    //@@  .. cpp:enumerator:: Kind::KIND_GPU = 1
    //@@
    //@@     This instance group represents instances that must run on the
    //@@     GPU.
    //@@
    KIND_GPU = 1;

    //@@  .. cpp:enumerator:: Kind::KIND_CPU = 2
    //@@
    //@@     This instance group represents instances that must run on the
    //@@     CPU.
    //@@
    KIND_CPU = 2;

    //@@  .. cpp:enumerator:: Kind::KIND_MODEL = 3
    //@@
    //@@     This instance group represents instances that should run on the
    //@@     CPU and/or GPU(s) as specified by the model or backend itself.
    //@@     The inference server will not override the model/backend
    //@@     settings.
    //@@
    KIND_MODEL = 3;
  }

  //@@
  //@@  .. cpp:var:: message SecondaryDevice
  //@@
  //@@     A secondary device required for a model instance.
  //@@
  message SecondaryDevice
  {
    //@@
    //@@  .. cpp:enum:: SecondaryDeviceKind
    //@@
    //@@     The kind of the secondary device.
    //@@
    enum SecondaryDeviceKind {
      //@@  .. cpp:enumerator:: SecondaryDeviceKind::KIND_NVDLA = 0
      //@@
      //@@     An NVDLA core. http://nvdla.org
      //@@     Currently KIND_NVDLA is only supported by the TensorRT backend.
      //@@
      KIND_NVDLA = 0;
    }

    //@@  .. cpp:var:: SecondaryDeviceKind kind
    //@@
    //@@     The secondary device kind.
    //@@
    SecondaryDeviceKind kind = 1;

    //@@  .. cpp:var:: int64 device_id
    //@@
    //@@     Identifier for the secondary device.
    //@@
    int64 device_id = 2;
  }

  //@@  .. cpp:var:: string name
  //@@
  //@@     Optional name of this group of instances. If not specified the
  //@@     name will be formed as <model name>_<group number>. The name of
  //@@     individual instances will be further formed by a unique instance
  //@@     number and GPU index:
  //@@
  string name = 1;

  //@@  .. cpp:var:: Kind kind
  //@@
  //@@     The kind of this instance group. Default is KIND_AUTO. If
  //@@     KIND_AUTO or KIND_GPU then both 'count' and 'gpus' are valid and
  //@@     may be specified. If KIND_CPU or KIND_MODEL only 'count' is valid
  //@@     and 'gpus' cannot be specified.
  //@@
  Kind kind = 4;

  //@@  .. cpp:var:: int32 count
  //@@
  //@@     For a group assigned to GPU, the number of instances created for
  //@@     each GPU listed in 'gpus'. For a group assigned to CPU, the number
  //@@     of instances created. Default is 1.
  int32 count = 2;

  //@@  .. cpp:var:: ModelRateLimiter rate_limiter
  //@@
  //@@     The rate limiter specific settings to be associated with this
  //@@     instance group. Optional; if not specified, no rate limiting
  //@@     will be applied to this instance group.
  //@@
  ModelRateLimiter rate_limiter = 6;

  //@@  .. cpp:var:: int32 gpus (repeated)
  //@@
  //@@     GPU(s) where instances should be available. For each GPU listed,
  //@@     'count' instances of the model will be available. Setting 'gpus'
  //@@     to empty (or not specifying at all) is equivalent to listing all
  //@@     available GPUs.
  //@@
  repeated int32 gpus = 3;

  //@@  .. cpp:var:: SecondaryDevice secondary_devices (repeated)
  //@@
  //@@     Secondary devices that are required by instances specified by this
  //@@     instance group. Optional.
  //@@
  repeated SecondaryDevice secondary_devices = 8;

  //@@  .. cpp:var:: string profile (repeated)
  //@@
  //@@     For TensorRT models containing multiple optimization profiles, this
  //@@     parameter specifies a set of optimization profiles available to
  //@@     this instance group. The inference server will choose the optimal
  //@@     profile based on the shapes of the input tensors. This field should
  //@@     lie between 0 and <TotalNumberOfOptimizationProfilesInPlanModel> - 1
  //@@     and be specified only for the TensorRT backend, otherwise an error
  //@@     will be generated. If not specified, the server will select the
  //@@     first optimization profile by default.
  //@@
  repeated string profile = 5;

  //@@  .. cpp:var:: bool passive
  //@@
  //@@     Whether the instances within this instance group will be accepting
  //@@     inference requests from the scheduler. If true, the instances will
  //@@     not be added to the scheduler. Default value is false.
  //@@
  bool passive = 7;

  //@@  .. cpp:var:: string host_policy
  //@@
  //@@     The host policy name that the instance is to be associated with.
  //@@     The default value is set to reflect the device kind of the
  //@@     instance, for instance, KIND_CPU is "cpu", KIND_MODEL is "model"
  //@@     and KIND_GPU is "gpu_<gpu_id>".
  //@@
  string host_policy = 9;
}

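// Illustrative sketch: assuming the full ModelConfig (truncated in this
// excerpt) exposes a repeated 'instance_group' field of this type, two
// model instances on each of GPUs 0 and 1 might be declared in protobuf
// text format as:
//
//   instance_group [
//     {
//       name: "group0"
//       kind: KIND_GPU
//       count: 2
//       gpus: [ 0, 1 ]
//     }
//   ]
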
//@@
//@@.. cpp:var:: message ModelTensorReshape
//@@
//@@   Reshape specification for input and output tensors.
//@@
message ModelTensorReshape
{
  //@@  .. cpp:var:: int64 shape (repeated)
  //@@
  //@@     The shape to use for reshaping.
  //@@
  repeated int64 shape = 1;
}

//@@
//@@.. cpp:var:: message ModelInput
//@@
//@@   An input required by the model.
//@@
message ModelInput
{
  //@@
  //@@  .. cpp:enum:: Format
  //@@
  //@@     The format for the input.
  //@@
  enum Format {
    //@@  .. cpp:enumerator:: Format::FORMAT_NONE = 0
    //@@
    //@@     The input has no specific format. This is the default.
    //@@
    FORMAT_NONE = 0;

    //@@  .. cpp:enumerator:: Format::FORMAT_NHWC = 1
    //@@
    //@@     HWC image format. Tensors with this format require 3 dimensions
    //@@     if the model does not support batching (max_batch_size = 0) or 4
    //@@     dimensions if the model does support batching (max_batch_size
    //@@     >= 1). In either case the 'dims' below should only specify the
    //@@     3 non-batch dimensions (i.e. HWC or CHW).
    //@@
    FORMAT_NHWC = 1;

    //@@  .. cpp:enumerator:: Format::FORMAT_NCHW = 2
    //@@
    //@@     CHW image format. Tensors with this format require 3 dimensions
    //@@     if the model does not support batching (max_batch_size = 0) or 4
    //@@     dimensions if the model does support batching (max_batch_size
    //@@     >= 1). In either case the 'dims' below should only specify the
    //@@     3 non-batch dimensions (i.e. HWC or CHW).
    //@@
    FORMAT_NCHW = 2;
  }

  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the input.
  //@@
  string name = 1;

  //@@  .. cpp:var:: DataType data_type
  //@@
  //@@     The data-type of the input.
  //@@
  DataType data_type = 2;

  //@@  .. cpp:var:: Format format
  //@@
  //@@     The format of the input. Optional.
  //@@
  Format format = 3;

  //@@  .. cpp:var:: int64 dims (repeated)
  //@@
  //@@     The dimensions/shape of the input tensor that must be provided
  //@@     when invoking the inference API for this model.
  //@@
  repeated int64 dims = 4;

  //@@  .. cpp:var:: ModelTensorReshape reshape
  //@@
  //@@     The shape expected for this input by the backend. The input will
  //@@     be reshaped to this before being presented to the backend. The
  //@@     reshape must have the same number of elements as the input shape
  //@@     specified by 'dims'. Optional.
  //@@
  ModelTensorReshape reshape = 5;

  //@@  .. cpp:var:: bool is_shape_tensor
  //@@
  //@@     Whether or not the input is a shape tensor to the model. This
  //@@     field is currently supported only for TensorRT models. An error
  //@@     will be generated if this specification does not comply with the
  //@@     underlying model.
  //@@
  bool is_shape_tensor = 6;

  //@@  .. cpp:var:: bool allow_ragged_batch
  //@@
  //@@     Whether or not the input is allowed to be "ragged" in a
  //@@     dynamically created batch. Default is false, indicating that two
  //@@     requests will only be batched if this tensor has the same shape
  //@@     in both requests. True indicates that two requests can be batched
  //@@     even if this tensor has a different shape in each request.
  //@@
  bool allow_ragged_batch = 7;

  //@@  .. cpp:var:: bool optional
  //@@
  //@@     Whether or not the input is optional for the model execution.
  //@@     If true, the input is not required in the inference request.
  //@@     Default value is false.
  //@@
  bool optional = 8;
}

//@@
//@@.. cpp:var:: message ModelOutput
//@@
//@@   An output produced by the model.
//@@
message ModelOutput
{
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the output.
  //@@
  string name = 1;

  //@@  .. cpp:var:: DataType data_type
  //@@
  //@@     The data-type of the output.
  //@@
  DataType data_type = 2;

  //@@  .. cpp:var:: int64 dims (repeated)
  //@@
  //@@     The dimensions/shape of the output tensor.
  //@@
  repeated int64 dims = 3;

  //@@  .. cpp:var:: ModelTensorReshape reshape
  //@@
  //@@     The shape produced for this output by the backend. The output
  //@@     will be reshaped from this to the shape specified in 'dims'
  //@@     before being returned in the inference response. The reshape
  //@@     must have the same number of elements as the output shape
  //@@     specified by 'dims'. Optional.
  //@@
  ModelTensorReshape reshape = 5;

  //@@  .. cpp:var:: string label_filename
  //@@
  //@@     The label file associated with this output. Should be specified
  //@@     only for outputs that represent classifications. Optional.
  //@@
  string label_filename = 4;

  //@@  .. cpp:var:: bool is_shape_tensor
  //@@
  //@@     Whether or not the output is a shape tensor to the model. This
  //@@     field is currently supported only for TensorRT models. An error
  //@@     will be generated if this specification does not comply with the
  //@@     underlying model.
  //@@
  bool is_shape_tensor = 6;
}

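// Illustrative sketch: assuming the full ModelConfig (truncated in this
// excerpt) carries repeated 'input' and 'output' fields of the two types
// above, a CHW image input reshaped for the backend and a classification
// output might be written in protobuf text format as (names and shapes
// are hypothetical):
//
//   input [
//     {
//       name: "image"
//       data_type: TYPE_FP32
//       format: FORMAT_NCHW
//       dims: [ 3, 224, 224 ]
//       reshape { shape: [ 1, 3, 224, 224 ] }
//     }
//   ]
//   output [
//     {
//       name: "scores"
//       data_type: TYPE_FP32
//       dims: [ 1000 ]
//       label_filename: "labels.txt"
//     }
//   ]
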
//@@ .. cpp:var:: message BatchInput
//@@
//@@    A batch input is an additional input that must be added by
//@@    the backend based on all the requests in a batch.
//@@
message BatchInput
{
  //@@
  //@@  .. cpp:enum:: Kind
  //@@
  //@@     The kind of the batch input.
  //@@
  enum Kind {
    //@@  .. cpp:enumerator:: Kind::BATCH_ELEMENT_COUNT = 0
    //@@
    //@@     The element count of the 'source_input' will be added as
    //@@     input with shape [1].
    //@@
    BATCH_ELEMENT_COUNT = 0;

    //@@  .. cpp:enumerator:: Kind::BATCH_ACCUMULATED_ELEMENT_COUNT = 1
    //@@
    //@@     The accumulated element count of the 'source_input' will be
    //@@     added as input with shape [1]. For example, if there is a
    //@@     batch of two requests, each with 2 elements, an input of value
    //@@     2 will be added to the first request, and an input of value
    //@@     4 will be added to the second request.
    //@@
    BATCH_ACCUMULATED_ELEMENT_COUNT = 1;

    //@@  .. cpp:enumerator::
    //@@     Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2
    //@@
    //@@     The accumulated element count of the 'source_input' will be
    //@@     added as input with shape [1], except for the first request
    //@@     in the batch. For the first request in the batch, the input
    //@@     will have shape [2] where the first element is value 0.
    //@@
    BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2;

    //@@  .. cpp:enumerator:: Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3
    //@@
    //@@     Among the requests in the batch, the max element count of the
    //@@     'source_input' will be added as input with shape
    //@@     [max_element_count] for the first request in the batch.
    //@@     For the other requests, this input will have shape [0].
    //@@     The data of the tensor will be uninitialized.
    //@@
    BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3;

    //@@  .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE = 4
    //@@
    //@@     Among the requests in the batch, the shape of the
    //@@     'source_input' will be added as input with shape
    //@@     [batch_size, len(input_dim)]. For example, if one
    //@@     batch-2 input with shape [3, 1] and one batch-1 input
    //@@     with shape [2, 2] are batched, the batch input will
    //@@     have shape [3, 2] and value [ [3, 1], [3, 1], [2, 2] ].
    //@@
    BATCH_ITEM_SHAPE = 4;

    //@@  .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE_FLATTEN = 5
    //@@
    //@@     Among the requests in the batch, the shape of the
    //@@     'source_input' will be added as input with single dimensional
    //@@     shape [batch_size * len(input_dim)]. For example, if one
    //@@     batch-2 input with shape [3, 1] and one batch-1 input
    //@@     with shape [2, 2] are batched, the batch input will
    //@@     have shape [6] and value [3, 1, 3, 1, 2, 2].
    //@@
    BATCH_ITEM_SHAPE_FLATTEN = 5;
  }

  //@@  .. cpp:var:: Kind kind
  //@@
  //@@     The kind of this batch input.
  //@@
  Kind kind = 1;

  //@@  .. cpp:var:: string target_name (repeated)
  //@@
  //@@     The name of the model inputs that the backend will create
  //@@     for this batch input.
  //@@
  repeated string target_name = 2;

  //@@  .. cpp:var:: DataType data_type
  //@@
  //@@     The input's datatype. The data type can be TYPE_INT32 or
  //@@     TYPE_FP32.
  //@@
  DataType data_type = 3;

  //@@  .. cpp:var:: string source_input (repeated)
  //@@
  //@@     The backend derives the value for each batch input from one or
  //@@     more other inputs. 'source_input' gives the names of those
  //@@     inputs.
  //@@
  repeated string source_input = 4;
}

//@@.. cpp:var:: message BatchOutput
//@@
//@@   A batch output is an output produced by the model that must be handled
//@@   differently by the backend based on all the requests in a batch.
//@@
message BatchOutput
{
  //@@
  //@@  .. cpp:enum:: Kind
  //@@
  //@@     The kind of the batch output.
  //@@
  enum Kind {
    //@@  .. cpp:enumerator:: Kind::BATCH_SCATTER_WITH_INPUT_SHAPE = 0
    //@@
    //@@     The output should be scattered according to the shape of
    //@@     'source_input'. The dynamic dimension of the output will
    //@@     be set to the value of the same dimension in the input.
    //@@
    BATCH_SCATTER_WITH_INPUT_SHAPE = 0;
  }

  //@@  .. cpp:var:: string target_name (repeated)
  //@@
  //@@     The name of the outputs to be produced by this batch output
  //@@     specification.
  //@@
  repeated string target_name = 1;

  //@@  .. cpp:var:: Kind kind
  //@@
  //@@     The kind of this batch output.
  //@@
  Kind kind = 2;

  //@@  .. cpp:var:: string source_input (repeated)
  //@@
  //@@     The backend derives each batch output from one or more inputs.
  //@@     'source_input' gives the names of those inputs.
  //@@
  repeated string source_input = 3;
}

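// Illustrative sketch: assuming the full ModelConfig (truncated in this
// excerpt) carries repeated 'batch_input' and 'batch_output' fields of
// the two types above, a ragged-batch descriptor input and a scattered
// output might be written in protobuf text format as (tensor names are
// hypothetical):
//
//   batch_input [
//     {
//       kind: BATCH_ACCUMULATED_ELEMENT_COUNT
//       target_name: "INDEX"
//       data_type: TYPE_FP32
//       source_input: "INPUT0"
//     }
//   ]
//   batch_output [
//     {
//       kind: BATCH_SCATTER_WITH_INPUT_SHAPE
//       target_name: "OUTPUT0"
//       source_input: "INPUT0"
//     }
//   ]
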
//@@
//@@.. cpp:var:: message ModelVersionPolicy
//@@
//@@   Policy indicating which versions of a model should be made
//@@   available by the inference server.
//@@
message ModelVersionPolicy
{
  //@@  .. cpp:var:: message Latest
  //@@
  //@@     Serve only the latest version(s) of a model. This is
  //@@     the default policy.
  //@@
  message Latest
  {
    //@@  .. cpp:var:: uint32 num_versions
    //@@
    //@@     Serve only the 'num_versions' highest-numbered versions.
    //@@     The default value of 'num_versions' is 1, indicating that by
    //@@     default only the single highest-numbered version of a
    //@@     model will be served.
    //@@
    uint32 num_versions = 1;
  }

  //@@  .. cpp:var:: message All
  //@@
  //@@     Serve all versions of the model.
  //@@
  message All {}

  //@@  .. cpp:var:: message Specific
  //@@
  //@@     Serve only specific versions of the model.
  //@@
  message Specific
  {
    //@@  .. cpp:var:: int64 versions (repeated)
    //@@
    //@@     The specific versions of the model that will be served.
    //@@
    repeated int64 versions = 1;
  }

  //@@  .. cpp:var:: oneof policy_choice
  //@@
  //@@     Each model must implement only a single version policy. The
  //@@     default policy is 'Latest'.
  //@@
  oneof policy_choice
  {
    //@@  .. cpp:var:: Latest latest
    //@@
    //@@     Serve only the latest version(s) of the model.
    //@@
    Latest latest = 1;

    //@@  .. cpp:var:: All all
    //@@
    //@@     Serve all versions of the model.
    //@@
    All all = 2;

    //@@  .. cpp:var:: Specific specific
    //@@
    //@@     Serve only specific version(s) of the model.
    //@@
    Specific specific = 3;
  }
}

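// Illustrative sketch: assuming the full ModelConfig (truncated in this
// excerpt) carries a 'version_policy' field of this type, the three
// mutually exclusive choices would be written in protobuf text format as
// one of:
//
//   version_policy { latest { num_versions: 2 } }
//   version_policy { all {} }
//   version_policy { specific { versions: [ 1, 3 ] } }
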
//@@
//@@.. cpp:var:: message ModelOptimizationPolicy
//@@
//@@   Optimization settings for a model. These settings control if/how a
//@@   model is optimized and prioritized by the backend framework when
//@@   it is loaded.
//@@
message ModelOptimizationPolicy
{
  //@@
  //@@  .. cpp:var:: message Graph
  //@@
  //@@     Enable generic graph optimization of the model. If not specified,
  //@@     the framework's default level of optimization is used. Supports
  //@@     TensorFlow graphdef and savedmodel models as well as ONNX models.
  //@@     For TensorFlow, causes XLA to be enabled/disabled for the model.
  //@@     For ONNX, defaults to enabling all optimizations; -1 enables only
  //@@     basic optimizations, +1 enables only basic and extended
  //@@     optimizations.
  //@@
  message Graph
  {
    //@@  .. cpp:var:: int32 level
    //@@
    //@@     The optimization level. Defaults to 0 (zero) if not specified.
    //@@
    //@@       - -1: Disabled
    //@@       -  0: Framework default
    //@@       - 1+: Enable optimization level (greater values indicate
    //@@             higher optimization levels)
    //@@
    int32 level = 1;
  }

  //@@
  //@@  .. cpp:enum:: ModelPriority
  //@@
  //@@     Model priorities. A model will be given scheduling and execution
  //@@     preference over models at lower priorities. Current model
  //@@     priorities only work for TensorRT models.
  //@@
  enum ModelPriority {
    //@@  .. cpp:enumerator:: ModelPriority::PRIORITY_DEFAULT = 0
    //@@
    //@@     The default model priority.
    //@@
    PRIORITY_DEFAULT = 0;

    //@@  .. cpp:enumerator:: ModelPriority::PRIORITY_MAX = 1
    //@@
    //@@     The maximum model priority.
    //@@
    PRIORITY_MAX = 1;

    //@@  .. cpp:enumerator:: ModelPriority::PRIORITY_MIN = 2
    //@@
    //@@     The minimum model priority.
    //@@
    PRIORITY_MIN = 2;
  }

  //@@
  //@@  .. cpp:var:: message Cuda
  //@@
  //@@     CUDA-specific optimization settings.
  //@@
  message Cuda
  {
    //@@  .. cpp:var:: message GraphSpec
    //@@
    //@@     Specification of the CUDA graph to be captured.
    //@@
    message GraphSpec
    {
      //@@  .. cpp:var:: message Shape
      //@@
      //@@     Specification of tensor dimension.
      //@@
      message Shape
      {
        //@@  .. cpp:var:: int64 dim (repeated)
        //@@
        //@@     The dimension.
        //@@
        repeated int64 dim = 1;
      }

      message LowerBound
      {
        //@@  .. cpp:var:: int32 batch_size
        //@@
        //@@     The batch size of the CUDA graph. If 'max_batch_size' is 0,
        //@@     'batch_size' must be set to 0. Otherwise, 'batch_size' must
        //@@     be set to a value between 1 and 'max_batch_size'.
        //@@
        int32 batch_size = 1;

        //@@  .. cpp:var:: map<string, Shape> input
        //@@
        //@@     The specification of the inputs. 'Shape' is the shape of
        //@@     the input without the batching dimension.
        //@@
        map<string, Shape> input = 2;
      }

      //@@  .. cpp:var:: int32 batch_size
      //@@
      //@@     The batch size of the CUDA graph. If 'max_batch_size' is 0,
      //@@     'batch_size' must be set to 0. Otherwise, 'batch_size' must
      //@@     be set to a value between 1 and 'max_batch_size'.
      //@@
      int32 batch_size = 1;

      //@@  .. cpp:var:: map<string, Shape> input
      //@@
      //@@     The specification of the inputs. 'Shape' is the shape of the
      //@@     input without the batching dimension.
      //@@
      map<string, Shape> input = 2;

      //@@  .. cpp:var:: LowerBound graph_lower_bound
      //@@
      //@@     Specify the lower bound of the CUDA graph. Optional.
      //@@     If specified, the graph can be used for input shapes and
      //@@     batch sizes that are in the closed interval between the lower
      //@@     bound specification and the graph specification. For dynamic
      //@@     shape models, this allows CUDA graphs to be launched
      //@@     frequently without capturing all possible shape combinations.
      //@@     However, using the graph for shape combinations different
      //@@     from the one used for capturing introduces uninitialized data
      //@@     for execution and it may distort the inference result if
      //@@     the model is sensitive to uninitialized data.
      //@@
      LowerBound graph_lower_bound = 3;
    }

    //@@  .. cpp:var:: bool graphs
    //@@
    //@@     Use the CUDA graphs API to capture model operations and execute
    //@@     them more efficiently. Default value is false.
    //@@     Currently only recognized by the TensorRT backend.
    //@@
    bool graphs = 1;

    //@@  .. cpp:var:: bool busy_wait_events
    //@@
    //@@     Use busy-waiting to synchronize CUDA events, achieving minimum
    //@@     latency from event completion to host-thread notification, at
    //@@     the cost of high CPU load. Default value is false.
    //@@     Currently only recognized by the TensorRT backend.
    //@@
    bool busy_wait_events = 2;

    //@@  .. cpp:var:: GraphSpec graph_spec (repeated)
    //@@
    //@@     Specification of the CUDA graph to be captured. If not
    //@@     specified and 'graphs' is true, the default CUDA graphs will be
    //@@     captured based on model settings.
    //@@     Currently only recognized by the TensorRT backend.
    //@@
    repeated GraphSpec graph_spec = 3;

    //@@  .. cpp:var:: bool output_copy_stream
    //@@
    //@@     Uses a CUDA stream separate from the inference stream to copy
    //@@     the output to host. However, be aware that setting this option
    //@@     to true will lead to an increase in the memory consumption of
    //@@     the model as Triton will allocate twice as much GPU memory for
    //@@     its I/O tensor buffers. Default value is false.
    //@@     Currently only recognized by the TensorRT backend.
    //@@
    bool output_copy_stream = 4;
  }

  //@@
  //@@  .. cpp:var:: message ExecutionAccelerators
  //@@
  //@@     Specify the preferred execution accelerators to be used to
  //@@     execute the model. Currently only recognized by the ONNX Runtime
  //@@     backend and the TensorFlow backend.
  //@@
  //@@     For the ONNX Runtime backend, it will deploy the model with the
  //@@     execution accelerators in priority order; the priority is
  //@@     determined by the order in which they are set, i.e. the provider
  //@@     at the front has the highest priority. Overall, the priority will
  //@@     be in the following order:
  //@@         <gpu_execution_accelerator> (if instance is on GPU)
  //@@         CUDA Execution Provider     (if instance is on GPU)
  //@@         <cpu_execution_accelerator>
  //@@         Default CPU Execution Provider
  //@@
  message ExecutionAccelerators
  {
    //@@
    //@@  .. cpp:var:: message Accelerator
    //@@
    //@@     Specify the accelerator to be used to execute the model.
    //@@     Accelerators with the same name may accept different parameters
    //@@     depending on the backend.
    //@@
    message Accelerator
    {
      //@@  .. cpp:var:: string name
      //@@
      //@@     The name of the execution accelerator.
      //@@
      string name = 1;

      //@@  .. cpp:var:: map<string, string> parameters
      //@@
      //@@     Additional parameters used to configure the accelerator.
      //@@
      map<string, string> parameters = 2;
    }

    //@@  .. cpp:var:: Accelerator gpu_execution_accelerator (repeated)
    //@@
    //@@     The preferred execution provider to be used if the model
    //@@     instance is deployed on GPU.
    //@@
    //@@     For the ONNX Runtime backend, the possible value for name is
    //@@     "tensorrt", and no parameters are required.
    //@@
    //@@     For the TensorFlow backend, possible values are "tensorrt",
    //@@     "auto_mixed_precision", "gpu_io".
    //@@
    //@@     For "tensorrt", the following parameters can be specified:
    //@@       "precision_mode": The precision used for optimization.
    //@@       Allowed values are "FP32" and "FP16". Default value is "FP32".
    //@@
    //@@       "max_cached_engines": The maximum number of cached TensorRT
    //@@       engines in dynamic TensorRT ops. Default value is 100.
    //@@
    //@@       "minimum_segment_size": The smallest model subgraph that will
    //@@       be considered for optimization by TensorRT. Default value is 3.
    //@@
    //@@       "max_workspace_size_bytes": The maximum GPU memory the model
    //@@       can use temporarily during execution. Default value is 1GB.
    //@@
    //@@     For "auto_mixed_precision", no parameters are required. If set,
    //@@     the model will try to use FP16 for better performance.
    //@@     This optimization cannot be set together with "tensorrt".
    //@@
    //@@     For "gpu_io", no parameters are required. If set, the model
    //@@     will be executed using the TensorFlow Callable API to set input
    //@@     and output tensors in GPU memory if possible, which can reduce
    //@@     data transfer overhead if the model is used in an ensemble.
    //@@     However, the Callable object will be created on model creation
    //@@     and it will request all outputs for every model execution,
    //@@     which may impact performance if a request does not require all
    //@@     outputs. This optimization will only take effect if the model
    //@@     instance is created with KIND_GPU.
    //@@
    repeated Accelerator gpu_execution_accelerator = 1;

    //@@  .. cpp:var:: Accelerator cpu_execution_accelerator (repeated)
    //@@
    //@@     The preferred execution provider to be used if the model
    //@@     instance is deployed on CPU.
    //@@
    //@@     For the ONNX Runtime backend, the possible value for name is
    //@@     "openvino", and no parameters are required.
    //@@
    repeated Accelerator cpu_execution_accelerator = 2;
  }

  //@@
  //@@  .. cpp:var:: message PinnedMemoryBuffer
  //@@
  //@@     Specify whether to use a pinned memory buffer when transferring
  //@@     data between non-pinned system memory and GPU memory. Using a
  //@@     pinned memory buffer for system-to/from-GPU transfers will
  //@@     typically provide increased performance. For example, in the
  //@@     common use case where the request provides inputs and delivers
  //@@     outputs via non-pinned system memory, if the model instance
  //@@     accepts GPU IOs, the inputs will be processed by two copies:
  //@@     from non-pinned system memory to pinned memory, and from pinned
  //@@     memory to GPU memory. Similarly, pinned memory will be used for
  //@@     delivering the outputs.
  //@@
  message PinnedMemoryBuffer
  {
    //@@  .. cpp:var:: bool enable
    //@@
    //@@     Use pinned memory buffer. Default is true.
    //@@
    bool enable = 1;
  }

  //@@  .. cpp:var:: Graph graph
  //@@
  //@@     The graph optimization setting for the model. Optional.
  //@@
  Graph graph = 1;

  //@@  .. cpp:var:: ModelPriority priority
  //@@
  //@@     The priority setting for the model. Optional.
  //@@
  ModelPriority priority = 2;

  //@@  .. cpp:var:: Cuda cuda
  //@@
  //@@     CUDA-specific optimization settings. Optional.
  //@@
  Cuda cuda = 3;

  //@@  .. cpp:var:: ExecutionAccelerators execution_accelerators
  //@@
  //@@     The accelerators used for the model. Optional.
  //@@
  ExecutionAccelerators execution_accelerators = 4;

  //@@  .. cpp:var:: PinnedMemoryBuffer input_pinned_memory
  //@@
  //@@     Use a pinned memory buffer when the data transfer for inputs
  //@@     is between GPU memory and non-pinned system memory.
  //@@     Default is true.
  //@@
  PinnedMemoryBuffer input_pinned_memory = 5;

  //@@  .. cpp:var:: PinnedMemoryBuffer output_pinned_memory
  //@@
  //@@     Use a pinned memory buffer when the data transfer for outputs
  //@@     is between GPU memory and non-pinned system memory.
  //@@     Default is true.
  //@@
  PinnedMemoryBuffer output_pinned_memory = 6;

  //@@  .. cpp:var:: uint32 gather_kernel_buffer_threshold
  //@@
  //@@     The backend may use a gather kernel to gather input data if the
  //@@     device has direct access to the source buffer and the destination
  //@@     buffer. In such cases, the gather kernel will be used only if the
  //@@     number of buffers to be gathered is greater than or equal to
  //@@     the specified value. If 0, the gather kernel will be disabled.
  //@@     Default value is 0.
  //@@     Currently only recognized by the TensorRT backend.
  //@@
  uint32 gather_kernel_buffer_threshold = 7;

  //@@  .. cpp:var:: bool eager_batching
  //@@
  //@@     Start preparing the next batch before the model instance is ready
  //@@     for the next inference. This option can be used to overlap the
  //@@     batch preparation with model execution, with the trade-off that
  //@@     the next batch might be smaller than what it could have been.
  //@@     Default value is false.
  //@@     Currently only recognized by the TensorRT backend.
  //@@
  bool eager_batching = 8;
}

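// Illustrative sketch: assuming the full ModelConfig (truncated in this
// excerpt) carries an 'optimization' field of this type, CUDA graph
// capture plus a TensorRT execution accelerator might be configured in
// protobuf text format as (tensor name and sizes are hypothetical):
//
//   optimization {
//     graph { level: 1 }
//     priority: PRIORITY_MAX
//     cuda {
//       graphs: true
//       graph_spec [
//         {
//           batch_size: 8
//           input { key: "INPUT0" value { dim: [ 16 ] } }
//         }
//       ]
//     }
//     execution_accelerators {
//       gpu_execution_accelerator [
//         {
//           name: "tensorrt"
//           parameters { key: "precision_mode" value: "FP16" }
//         }
//       ]
//     }
//   }
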
//@@
//@@.. cpp:var:: message ModelQueuePolicy
//@@
//@@   Queue policy for inference requests.
//@@
message ModelQueuePolicy
{
  //@@
  //@@  .. cpp:enum:: TimeoutAction
  //@@
  //@@     The action applied to timed-out requests.
  //@@
  enum TimeoutAction {
    //@@  .. cpp:enumerator:: TimeoutAction::REJECT = 0
    //@@
    //@@     Reject the request and return an error message accordingly.
    //@@
    REJECT = 0;

    //@@  .. cpp:enumerator:: TimeoutAction::DELAY = 1
    //@@
    //@@     Delay the request until all other requests at the same
    //@@     (or higher) priority levels that have not reached their timeouts
    //@@     are processed. A delayed request will eventually be processed,
    //@@     but may be delayed indefinitely due to newly arriving requests.
    //@@
    DELAY = 1;
  }

  //@@
  //@@  .. cpp:var:: TimeoutAction timeout_action
  //@@
  //@@     The action applied to timed-out requests.
  //@@     The default action is REJECT.
  //@@
  TimeoutAction timeout_action = 1;

  //@@
  //@@  .. cpp:var:: uint64 default_timeout_microseconds
  //@@
  //@@     The default timeout for every request, in microseconds.
  //@@     The default value is 0 which indicates that no timeout is set.
  //@@
  uint64 default_timeout_microseconds = 2;

  //@@
  //@@  .. cpp:var:: bool allow_timeout_override
  //@@
  //@@     Whether an individual request can override the default timeout
  //@@     value. When true, individual requests can set a timeout that is
  //@@     less than the default timeout value but may not increase the
  //@@     timeout. The default value is false.
  //@@
  bool allow_timeout_override = 3;

  //@@
  //@@  .. cpp:var:: uint32 max_queue_size
  //@@
  //@@     The maximum queue size for holding requests. A request will be
  //@@     rejected immediately if it can't be enqueued because the queue is
  //@@     full. The default value is 0 which indicates that no maximum
  //@@     queue size is enforced.
  //@@
  uint32 max_queue_size = 4;
}

//@@
//@@.. cpp:var:: message ModelDynamicBatching
//@@
//@@   Dynamic batching configuration. These settings control how dynamic
//@@   batching operates for the model.
//@@
message ModelDynamicBatching
{
  //@@  .. cpp:var:: int32 preferred_batch_size (repeated)
  //@@
  //@@     Preferred batch sizes for dynamic batching. If a batch of one of
  //@@     these sizes can be formed it will be executed immediately. If
  //@@     not specified, a preferred batch size will be chosen automatically
  //@@     based on model and GPU characteristics.
  //@@
  repeated int32 preferred_batch_size = 1;

  //@@  .. cpp:var:: uint64 max_queue_delay_microseconds
  //@@
  //@@     The maximum time, in microseconds, a request will be delayed in
  //@@     the scheduling queue to wait for additional requests for
  //@@     batching. Default is 0.
  //@@
  uint64 max_queue_delay_microseconds = 2;

  //@@  .. cpp:var:: bool preserve_ordering
  //@@
  //@@     Should the dynamic batcher preserve the ordering of responses to
  //@@     match the order of requests received by the scheduler. Default is
  //@@     false. If true, the responses will be returned in the same order
  //@@     as the order of requests sent to the scheduler. If false, the
  //@@     responses may be returned in arbitrary order. This option is
  //@@     specifically needed when a sequence of related inference requests
  //@@     (i.e. inference requests with the same correlation ID) are sent
  //@@     to the dynamic batcher to ensure that the sequence responses are
  //@@     in the correct order.
  //@@
  bool preserve_ordering = 3;

  //@@  .. cpp:var:: uint64 priority_levels
  //@@
  //@@     The number of priority levels to be enabled for the model.
  //@@     The priority level starts from 1 and 1 is the highest priority.
  //@@     Requests are handled in priority order with all priority 1
  //@@     requests processed before priority 2, all priority 2 requests
  //@@     processed before priority 3, etc. Requests with the same priority
  //@@     level will be handled in the order that they are received.
  //@@
  uint64 priority_levels = 4;

  //@@  .. cpp:var:: uint64 default_priority_level
  //@@
  //@@     The priority level used for requests that don't specify their
  //@@     priority. The value must be in the range [ 1, 'priority_levels' ].
  //@@
  uint64 default_priority_level = 5;

  //@@  .. cpp:var:: ModelQueuePolicy default_queue_policy
  //@@
  //@@     The default queue policy used for requests that don't require
  //@@     priority handling and requests that specify priority levels where
  //@@     there is no specific policy given. If not specified, a policy
  //@@     with default field values will be used.
  //@@
  ModelQueuePolicy default_queue_policy = 6;

  //@@  .. cpp:var:: map<uint64, ModelQueuePolicy> priority_queue_policy
  //@@
  //@@     Specify the queue policy for the priority level. The default
  //@@     queue policy will be used if a priority level doesn't specify a
  //@@     queue policy.
  //@@
  map<uint64, ModelQueuePolicy> priority_queue_policy = 7;
}

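// Illustrative sketch: assuming the full ModelConfig (truncated in this
// excerpt) carries a 'dynamic_batching' field of this type, a batcher
// that prefers batches of 4 or 8, waits up to 100 microseconds, and
// bounds its queue with the ModelQueuePolicy above might be configured
// in protobuf text format as:
//
//   dynamic_batching {
//     preferred_batch_size: [ 4, 8 ]
//     max_queue_delay_microseconds: 100
//     default_queue_policy {
//       timeout_action: REJECT
//       max_queue_size: 64
//     }
//   }
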
//@@
//@@.. cpp:var:: message ModelSequenceBatching
//@@
//@@   Sequence batching configuration. These settings control how sequence
//@@   batching operates for the model.
//@@
message ModelSequenceBatching
{
  //@@  .. cpp:var:: message Control
  //@@
  //@@     A control is a signal that the sequence batcher uses to
  //@@     communicate with a backend.
  //@@
  message Control
  {
    //@@
    //@@  .. cpp:enum:: Kind
    //@@
    //@@     The kind of the control.
    //@@
    enum Kind {
      //@@  .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_START = 0
      //@@
      //@@     A new sequence is/is-not starting. If true a sequence is
      //@@     starting, if false a sequence is continuing. Must
      //@@     specify either int32_false_true, fp32_false_true or
      //@@     bool_false_true for this control. This control is optional.
      //@@
      CONTROL_SEQUENCE_START = 0;

      //@@  .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_READY = 1
      //@@
      //@@     A sequence is/is-not ready for inference. If true the
      //@@     input tensor data is valid and should be used. If false
      //@@     the input tensor data is invalid and inferencing should
      //@@     be "skipped". Must specify either int32_false_true,
      //@@     fp32_false_true or bool_false_true for this control. This
      //@@     control is optional.
      //@@
      CONTROL_SEQUENCE_READY = 1;

      //@@  .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_END = 2
      //@@
      //@@     A sequence is/is-not ending. If true a sequence is
      //@@     ending, if false a sequence is continuing. Must specify
      //@@     either int32_false_true, fp32_false_true or bool_false_true
      //@@     for this control. This control is optional.
      //@@
      CONTROL_SEQUENCE_END = 2;

      //@@  .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_CORRID = 3
      //@@
      //@@     The correlation ID of the sequence. The correlation ID
      //@@     is a uint64_t value that is communicated in whole or
      //@@     in part by the tensor. The tensor's datatype must be
      //@@     specified by data_type and must be TYPE_UINT64, TYPE_INT64,
      //@@     TYPE_UINT32 or TYPE_INT32. If a 32-bit datatype is specified
      //@@     the correlation ID will be truncated to the low-order 32
      //@@     bits. This control is optional.
      //@@
      CONTROL_SEQUENCE_CORRID = 3;
    }

    //@@  .. cpp:var:: Kind kind
    //@@
    //@@     The kind of this control.
    //@@
    Kind kind = 1;

    //@@  .. cpp:var:: int32 int32_false_true (repeated)
    //@@
    //@@     The control's true and false setting is indicated by setting
    //@@     a value in an int32 tensor. The tensor must be a
    //@@     1-dimensional tensor with size equal to the batch size of
    //@@     the request. 'int32_false_true' must have two entries: the
    //@@     first the false value and the second the true value.
    //@@
    repeated int32 int32_false_true = 2;

    //@@  .. cpp:var:: float fp32_false_true (repeated)
    //@@
    //@@     The control's true and false setting is indicated by setting
    //@@     a value in a fp32 tensor. The tensor must be a
    //@@     1-dimensional tensor with size equal to the batch size of
    //@@     the request. 'fp32_false_true' must have two entries: the
    //@@     first the false value and the second the true value.
    //@@
    repeated float fp32_false_true = 3;

    //@@  .. cpp:var:: bool bool_false_true (repeated)
    //@@
    //@@     The control's true and false setting is indicated by setting
    //@@     a value in a bool tensor. The tensor must be a
    //@@     1-dimensional tensor with size equal to the batch size of
    //@@     the request. 'bool_false_true' must have two entries: the
    //@@     first the false value and the second the true value.
    //@@
    repeated bool bool_false_true = 5;

    //@@  .. cpp:var:: DataType data_type
    //@@
    //@@     The control's datatype.
    //@@
    DataType data_type = 4;
  }

  //@@  .. cpp:var:: message ControlInput
  //@@
  //@@     The sequence control values to communicate by a model input.
  //@@
  message ControlInput
  {
    //@@  .. cpp:var:: string name
    //@@
    //@@     The name of the model input.
    //@@
    string name = 1;

    //@@  .. cpp:var:: Control control (repeated)
    //@@
    //@@     The control value(s) that should be communicated to the
    //@@     model using this model input.
    //@@
    repeated Control control = 2;
  }

  //@@
  //@@  .. cpp:var:: message InitialState
  //@@
  //@@     Settings used to initialize data for implicit state.
  //@@
  message InitialState
  {
    //@@  .. cpp:var:: DataType data_type
    //@@
    //@@     The data-type of the state.
    //@@
    DataType data_type = 1;

    //@@  .. cpp:var:: int64 dims (repeated)
    //@@
    //@@     The shape of the state tensor, not including the batch
    //@@     dimension.
    //@@
    repeated int64 dims = 2;

    //@@  .. cpp:var:: oneof state_data
    //@@
    //@@     Specify how the initial state data is generated.
    //@@
    oneof state_data
    {
      //@@
      //@@  .. cpp:var:: bool zero_data
      //@@
      //@@     The identifier for using zeros as initial state data.
      //@@     Note that the value of 'zero_data' will not be checked;
      //@@     instead, zero data will be used as long as the field is set.
      //@@
      bool zero_data = 3;

      //@@  .. cpp:var:: string data_file
      //@@
      //@@     The file whose content will be used as the initial data for
      //@@     the state in row-major order. The file must be provided in
      //@@     sub-directory 'initial_state' under the model directory.
      //@@
      string data_file = 4;
    }

    //@@  .. cpp:var:: string name
    //@@
    //@@     The name of the state initialization.
    //@@
    string name = 5;
  }

  //@@  .. cpp:var:: message State
  //@@
  //@@     An input / output pair of tensors that carry state for the
  //@@     sequence.
  //@@
  message State
  {
    //@@  .. cpp:var:: string input_name
    //@@
    //@@     The name of the model state input.
    //@@
    string input_name = 1;

    //@@  .. cpp:var:: string output_name
    //@@
    //@@     The name of the model state output.
    //@@
    string output_name = 2;

    //@@  .. cpp:var:: DataType data_type
    //@@
    //@@     The data-type of the state.
    //@@
    DataType data_type = 3;

    //@@  .. cpp:var:: int64 dims (repeated)
    //@@
    //@@     The dimensions of the state tensor.
    //@@
    repeated int64 dims = 4;

    //@@  .. cpp:var:: InitialState initial_state (repeated)
    //@@
    //@@     The optional field to specify the initial state for the model.
    //@@
    repeated InitialState initial_state = 5;
  }

  //@@  .. cpp:var:: message StrategyDirect
  //@@
  //@@     The sequence batcher uses a specific, unique batch
  //@@     slot for each sequence. All inference requests in a
  //@@     sequence are directed to the same batch slot in the same
  //@@     model instance over the lifetime of the sequence. This
  //@@     is the default strategy.
  //@@
  message StrategyDirect
  {
    //@@  .. cpp:var:: uint64 max_queue_delay_microseconds
    //@@
    //@@     The maximum time, in microseconds, a candidate request
    //@@     will be delayed in the sequence batch scheduling queue to
    //@@     wait for additional requests for batching. Default is 0.
    //@@
    uint64 max_queue_delay_microseconds = 1;

    //@@  .. cpp:var:: float minimum_slot_utilization
    //@@
    //@@     The minimum slot utilization that must be satisfied to
    //@@     execute the batch before 'max_queue_delay_microseconds' expires.
    //@@     For example, a value of 0.5 indicates that the batch should be
    //@@     executed as soon as 50% or more of the slots are ready even if
    //@@     the 'max_queue_delay_microseconds' timeout has not expired.
    //@@     The default is 0.0, indicating that a batch will be executed
    //@@     before 'max_queue_delay_microseconds' timeout expires if at
    //@@     least one batch slot is ready. 'max_queue_delay_microseconds'
    //@@     will be ignored unless minimum_slot_utilization is set to a
    //@@     non-zero value.
    //@@
    float minimum_slot_utilization = 2;
  }

  //@@  .. cpp:var:: message StrategyOldest
  //@@
  //@@     The sequence batcher maintains up to 'max_candidate_sequences'
  //@@     candidate sequences. 'max_candidate_sequences' can be greater
  //@@     than the model's 'max_batch_size'. For inferencing the batcher
  //@@     chooses from the candidate sequences up to 'max_batch_size'
  //@@     inference requests. Requests are chosen in an oldest-first
  //@@     manner across all candidate sequences. A given sequence is
  //@@     not guaranteed to be assigned to the same batch slot for
  //@@     all inference requests of that sequence.
  //@@
  message StrategyOldest
  {
    //@@  .. cpp:var:: int32 max_candidate_sequences
    //@@
    //@@     Maximum number of candidate sequences that the batcher
    //@@     maintains. Excess sequences are kept in an ordered backlog
    //@@     and become candidates when existing candidate sequences
    //@@     complete.
    //@@
    int32 max_candidate_sequences = 1;

    //@@  .. cpp:var:: int32 preferred_batch_size (repeated)
    //@@
    //@@     Preferred batch sizes for dynamic batching of candidate
    //@@     sequences. If a batch of one of these sizes can be formed
    //@@     it will be executed immediately. If not specified, a
    //@@     preferred batch size will be chosen automatically
    //@@     based on model and GPU characteristics.
    //@@
    repeated int32 preferred_batch_size = 2;

    //@@  .. cpp:var:: uint64 max_queue_delay_microseconds
    //@@
    //@@     The maximum time, in microseconds, a candidate request
    //@@     will be delayed in the dynamic batch scheduling queue to
    //@@     wait for additional requests for batching. Default is 0.
    //@@
    uint64 max_queue_delay_microseconds = 3;
  }

  //@@  .. cpp:var:: oneof strategy_choice
  //@@
  //@@     The strategy used by the sequence batcher. Default strategy
  //@@     is 'direct'.
  //@@
  oneof strategy_choice
  {
    //@@  .. cpp:var:: StrategyDirect direct
    //@@
    //@@     StrategyDirect scheduling strategy.
    //@@
    StrategyDirect direct = 3;

    //@@  .. cpp:var:: StrategyOldest oldest
    //@@
    //@@     StrategyOldest scheduling strategy.
    //@@
    StrategyOldest oldest = 4;
  }

  //@@  .. cpp:var:: uint64 max_sequence_idle_microseconds
  //@@
  //@@     The maximum time, in microseconds, that a sequence is allowed to
  //@@     be idle before it is aborted. The inference server considers a
  //@@     sequence idle when it does not have any inference request queued
  //@@     for the sequence. If this limit is exceeded, the inference server
  //@@     will free the sequence slot allocated by the sequence and make it
  //@@     available for another sequence. If not specified (or specified as
  //@@     zero) a default value of 1000000 (1 second) is used.
  //@@
  uint64 max_sequence_idle_microseconds = 1;

  //@@  .. cpp:var:: ControlInput control_input (repeated)
  //@@
  //@@     The model input(s) that the server should use to communicate
  //@@     sequence start, stop, ready and similar control values to the
  //@@     model.
  //@@
  repeated ControlInput control_input = 2;

  //@@  .. cpp:var:: State state (repeated)
  //@@
  //@@     The optional state that can be stored in Triton for performing
  //@@     inference requests on a sequence. Each sequence holds an implicit
  //@@     state local to itself. The output state tensor provided by the
  //@@     model in the 'output_name' field of the current inference request
  //@@     will be transferred as an input tensor named 'input_name' in the
  //@@     next request of the same sequence. The input state of the first
  //@@     request in the sequence contains garbage data.
  //@@
  repeated State state = 5;
}

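// Illustrative sketch: assuming the full ModelConfig (truncated in this
// excerpt) carries a 'sequence_batching' field of this type, an Oldest
// strategy with START/READY control tensors might be configured in
// protobuf text format as (tensor names are hypothetical):
//
//   sequence_batching {
//     max_sequence_idle_microseconds: 5000000
//     oldest {
//       max_candidate_sequences: 4
//       preferred_batch_size: [ 2, 4 ]
//     }
//     control_input [
//       {
//         name: "START"
//         control [ { kind: CONTROL_SEQUENCE_START fp32_false_true: [ 0, 1 ] } ]
//       },
//       {
//         name: "READY"
//         control [ { kind: CONTROL_SEQUENCE_READY fp32_false_true: [ 0, 1 ] } ]
//       }
//     ]
//   }
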
//@@
//@@.. cpp:var:: message ModelEnsembling
//@@
//@@   Model ensembling configuration. These settings specify the models
//@@   that compose the ensemble and how data flows between the models.
//@@
message ModelEnsembling
{
  //@@  .. cpp:var:: message Step
  //@@
  //@@     Each step specifies a model included in the ensemble,
  //@@     maps ensemble tensor names to the model input tensors,
  //@@     and maps model output tensors to ensemble tensor names.
  //@@
  message Step
  {
    //@@  .. cpp:var:: string model_name
    //@@
    //@@     The name of the model to execute for this step of the ensemble.
    //@@
    string model_name = 1;

    //@@  .. cpp:var:: int64 model_version
    //@@
    //@@     The version of the model to use for inference. If -1
    //@@     the latest/most-recent version of the model is used.
    //@@
    int64 model_version = 2;

    //@@  .. cpp:var:: map<string,string> input_map
    //@@
    //@@     Map from name of an input tensor on this step's model to
    //@@     ensemble tensor name. The ensemble tensor must have the same
    //@@     data type and shape as the model input. Each model input must
    //@@     be assigned to one ensemble tensor, but the same ensemble
    //@@     tensor can be assigned to multiple model inputs.
    //@@
    map<string, string> input_map = 3;

    //@@  .. cpp:var:: map<string,string> output_map
    //@@
    //@@     Map from name of an output tensor on this step's model to
    //@@     ensemble tensor name. The data type and shape of the ensemble
    //@@     tensor will be inferred from the model output. It is optional
    //@@     to assign all model outputs to ensemble tensors. One ensemble
    //@@     tensor name can appear in an output map only once.
    //@@
    map<string, string> output_map = 4;

    //@@  .. cpp:var:: string model_namespace
    //@@
    //@@     [RESERVED] Currently this field is reserved for internal use;
    //@@     users must not set any value for this field to avoid unexpected
    //@@     behavior.
    //@@
    string model_namespace = 5;
  }

  //@@  .. cpp:var:: Step step (repeated)
  //@@
  //@@     The models and the input / output mappings used within the
  //@@     ensemble.
  //@@
  repeated Step step = 1;
}

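// Illustrative sketch: assuming the full ModelConfig (truncated in this
// excerpt) carries an 'ensemble_scheduling' field of this type, a
// two-step pipeline feeding a preprocessor's output into a classifier
// might be written in protobuf text format as (all model and tensor
// names are hypothetical):
//
//   ensemble_scheduling {
//     step [
//       {
//         model_name: "preprocess"
//         model_version: -1
//         input_map { key: "RAW" value: "ENSEMBLE_INPUT" }
//         output_map { key: "PREPROCESSED" value: "pre_out" }
//       },
//       {
//         model_name: "classifier"
//         model_version: -1
//         input_map { key: "INPUT" value: "pre_out" }
//         output_map { key: "SCORES" value: "ENSEMBLE_OUTPUT" }
//       }
//     ]
//   }
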
//@@
//@@.. cpp:var:: message ModelParameter
//@@
//@@   A model parameter.
//@@
message ModelParameter
{
  //@@  .. cpp:var:: string string_value
  //@@
  //@@     The string value of the parameter.
  //@@
  string string_value = 1;
}

//@@
|
|
//@@.. cpp:var:: message ModelWarmup
|
|
//@@
|
|
//@@ Settings used to construct the request sample for model warmup.
|
|
//@@
|
|
message ModelWarmup
|
|
{
|
|
//@@
|
|
//@@ .. cpp:var:: message Input
|
|
//@@
|
|
//@@ Meta data associated with an input.
|
|
//@@
|
|
message Input
|
|
{
|
|
//@@ .. cpp:var:: DataType data_type
|
|
//@@
|
|
//@@ The data-type of the input.
|
|
//@@
|
|
DataType data_type = 1;
|
|
|
|
//@@ .. cpp:var:: int64 dims (repeated)
|
|
//@@
|
|
//@@ The shape of the input tensor, not including the batch dimension.
|
|
//@@
|
|
repeated int64 dims = 2;
|
|
|
|
//@@ .. cpp:var:: oneof input_data_type
|
|
//@@
|
|
//@@ Specify how the input data is generated. If the input has STRING
|
|
//@@ data type and 'random_data' is set, the data generation will fall
|
|
//@@ back to 'zero_data'.
|
|
//@@
|
|
oneof input_data_type
|
|
{
|
|
//@@
|
|
//@@ .. cpp:var:: bool zero_data
|
|
//@@
|
|
//@@ The identifier for using zeros as input data. Note that the
|
|
//@@ value of 'zero_data' will not be checked, instead, zero data
|
|
//@@ will be used as long as the field is set.
|
|
//@@
|
|
bool zero_data = 3;
|
|
|
|
//@@
|
|
//@@ .. cpp:var:: bool random_data
|
|
//@@
|
|
//@@ The identifier for using random data as input data. Note that
|
|
//@@ the value of 'random_data' will not be checked, instead,
|
|
//@@ random data will be used as long as the field is set.
|
|
//@@
|
|
bool random_data = 4;
|
|
|
|
//@@ .. cpp:var:: string input_data_file
|
|
//@@
|
|
//@@ The file whose content will be used as raw input data in
|
|
//@@ row-major order. The file must be provided in a sub-directory
|
|
//@@ 'warmup' under the model directory. The file contents should be
|
|
//@@ in binary format. For TYPE_STRING data-type, an element is
|
|
//@@ represented by a 4-byte unsigned integer giving the length
|
|
//@@ followed by the actual bytes.
|
|
//@@
|
|
string input_data_file = 5;
|
|
}
|
|
}

  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the request sample.
  //@@
  string name = 1;

  //@@  .. cpp:var:: uint32 batch_size
  //@@
  //@@     The batch size of the inference request. This must be >= 1. For
  //@@     models that don't support batching, batch_size must be 1. If
  //@@     batch_size > 1, the 'inputs' specified below will be duplicated to
  //@@     match the batch size requested.
  //@@
  uint32 batch_size = 2;

  //@@  .. cpp:var:: map<string, Input> inputs
  //@@
  //@@     The warmup metadata associated with every model input, including
  //@@     control tensors.
  //@@
  map<string, Input> inputs = 3;

  //@@  .. cpp:var:: uint32 count
  //@@
  //@@     The number of times this warmup sample will be executed. For
  //@@     example, if this field is set to 2, 2 model executions using this
  //@@     sample will be scheduled for warmup. The default value is 0, which
  //@@     indicates that this sample will be used only once.
  //@@     Note that for sequence models, 'count' may not work well because
  //@@     the model often expects a valid sequence of requests, which should
  //@@     be represented by a series of warmup samples. 'count > 1'
  //@@     essentially "resends" one of the samples, which may invalidate the
  //@@     sequence and result in unexpected warmup failure.
  //@@
  uint32 count = 4;
}
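
// A minimal config.pbtxt sketch of a warmup sample built from this message.
// The input name "INPUT0" and the shape are hypothetical; zero_data asks
// the server to synthesize an all-zero input of the given type and shape.
//
//   model_warmup [
//     {
//       name: "zero_warmup"
//       batch_size: 1
//       inputs {
//         key: "INPUT0"
//         value: {
//           data_type: TYPE_FP32
//           dims: [ 16 ]
//           zero_data: true
//         }
//       }
//       count: 2
//     }
//   ]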

//@@
//@@ .. cpp:var:: message ModelOperations
//@@
//@@    The metadata of libraries providing custom operations for this model.
//@@
message ModelOperations
{
  //@@  .. cpp:var:: string op_library_filename (repeated)
  //@@
  //@@     Optional paths of the libraries providing custom operations for
  //@@     this model. Valid only for ONNX models.
  //@@
  repeated string op_library_filename = 1;
}
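
// A config.pbtxt sketch for an ONNX model that needs a custom-op library;
// the library path shown is hypothetical:
//
//   model_operations {
//     op_library_filename: [ "/opt/ops/libmyops.so" ]
//   }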

//@@
//@@ .. cpp:var:: message ModelTransactionPolicy
//@@
//@@    The specification that describes the nature of transactions
//@@    to be expected from the model.
//@@
message ModelTransactionPolicy
{
  //@@  .. cpp:var:: bool decoupled
  //@@
  //@@     Indicates whether responses generated by the model are decoupled
  //@@     from the requests issued to it, which means the number of responses
  //@@     generated by the model may differ from the number of requests
  //@@     issued, and that the responses may be out of order relative to the
  //@@     order of requests. The default is false, which means the model will
  //@@     generate exactly one response for each request.
  //@@
  bool decoupled = 1;
}
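
// A one-line config.pbtxt sketch enabling decoupled mode, e.g. for a model
// that streams a variable number of responses per request:
//
//   model_transaction_policy {
//     decoupled: true
//   }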

//@@
//@@.. cpp:var:: message ModelRepositoryAgents
//@@
//@@   The repository agents for the model.
//@@
message ModelRepositoryAgents
{
  //@@
  //@@  .. cpp:var:: message Agent
  //@@
  //@@     A repository agent that should be invoked for the specified
  //@@     repository actions for this model.
  //@@
  message Agent
  {
    //@@  .. cpp:var:: string name
    //@@
    //@@     The name of the agent.
    //@@
    string name = 1;

    //@@  .. cpp:var:: map<string, string> parameters
    //@@
    //@@     The parameters for the agent.
    //@@
    map<string, string> parameters = 2;
  }

  //@@
  //@@  .. cpp:var:: Agent agents (repeated)
  //@@
  //@@     The ordered list of agents for the model. These agents will be
  //@@     invoked in order to respond to repository actions occurring for
  //@@     the model.
  //@@
  repeated Agent agents = 1;
}
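
// A config.pbtxt sketch invoking a single repository agent. The agent name
// "checksum" and its parameter are hypothetical and depend on which agents
// are installed on the server:
//
//   model_repository_agents {
//     agents [
//       {
//         name: "checksum"
//         parameters { key: "MD5:model.plan" value: "expected-md5-here" }
//       }
//     ]
//   }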

//@@
//@@.. cpp:var:: message ModelResponseCache
//@@
//@@   The response cache setting for the model.
//@@
message ModelResponseCache
{
  //@@
  //@@  .. cpp:var:: bool enable
  //@@
  //@@     Whether or not to use the response cache for the model. If true,
  //@@     the responses from the model are cached, and when an identical
  //@@     request is encountered the response from the cache is used instead
  //@@     of going through the model execution. By default, the response
  //@@     cache is disabled for the models.
  //@@
  bool enable = 1;
}
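
// A config.pbtxt sketch enabling the response cache for this model
// (typically the server itself must also have a cache configured for this
// setting to take effect):
//
//   response_cache {
//     enable: true
//   }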

//@@
//@@.. cpp:var:: message ModelConfig
//@@
//@@   A model configuration.
//@@
message ModelConfig
{
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string platform
  //@@
  //@@     Additional backend-specific configuration for the model.
  //@@     Please refer to the backend documentation on whether this field
  //@@     should be specified.
  //@@
  string platform = 2;

  //@@  .. cpp:var:: string backend
  //@@
  //@@     The backend used by the model.
  //@@
  string backend = 17;

  //@@  .. cpp:var:: ModelVersionPolicy version_policy
  //@@
  //@@     Policy indicating which version(s) of the model will be served.
  //@@
  ModelVersionPolicy version_policy = 3;
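
  // A config.pbtxt sketch of a version policy that serves only the two most
  // recent model versions (ModelVersionPolicy is defined earlier in this
  // file):
  //
  //   version_policy: { latest { num_versions: 2 } }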

  //@@  .. cpp:var:: int32 max_batch_size
  //@@
  //@@     Maximum batch size allowed for inference. This can only decrease
  //@@     what is allowed by the model itself. A max_batch_size value of 0
  //@@     indicates that batching is not allowed for the model and the
  //@@     dimension/shape of the input and output tensors must exactly
  //@@     match what is specified in the input and output configuration. A
  //@@     max_batch_size value > 0 indicates that batching is allowed and
  //@@     so the model expects the input tensors to have an additional
  //@@     initial dimension for the batching that is not specified in the
  //@@     input (for example, if the model supports batched inputs of
  //@@     2-dimensional tensors then the model configuration will specify
  //@@     the input shape as [ X, Y ] but the model will expect the actual
  //@@     input tensors to have shape [ N, X, Y ]). For max_batch_size > 0
  //@@     returned outputs will also have an additional initial dimension
  //@@     for the batch.
  //@@
  int32 max_batch_size = 4;
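
  // Worked example of the shape convention above: with this hypothetical
  // input declaration, clients send tensors of shape [ N, 3, 224 ] where
  // N <= 8 is the batch dimension added by the server and not listed in
  // dims:
  //
  //   max_batch_size: 8
  //   input [
  //     {
  //       name: "INPUT0"
  //       data_type: TYPE_FP32
  //       dims: [ 3, 224 ]
  //     }
  //   ]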

  //@@  .. cpp:var:: ModelInput input (repeated)
  //@@
  //@@     The inputs requested by the model.
  //@@
  repeated ModelInput input = 5;

  //@@  .. cpp:var:: ModelOutput output (repeated)
  //@@
  //@@     The outputs produced by the model.
  //@@
  repeated ModelOutput output = 6;

  //@@  .. cpp:var:: BatchInput batch_input (repeated)
  //@@
  //@@     The model input(s) that the server should use to communicate
  //@@     batch related values to the model.
  //@@
  repeated BatchInput batch_input = 20;

  //@@  .. cpp:var:: BatchOutput batch_output (repeated)
  //@@
  //@@     The outputs produced by the model that require special handling
  //@@     by the model backend.
  //@@
  repeated BatchOutput batch_output = 21;

  //@@  .. cpp:var:: ModelOptimizationPolicy optimization
  //@@
  //@@     Optimization configuration for the model. If not specified
  //@@     then the default optimization policy is used.
  //@@
  ModelOptimizationPolicy optimization = 12;

  //@@  .. cpp:var:: oneof scheduling_choice
  //@@
  //@@     The scheduling policy for the model. If not specified the
  //@@     default scheduling policy is used for the model. The default
  //@@     policy is to execute each inference request independently.
  //@@
  oneof scheduling_choice
  {
    //@@  .. cpp:var:: ModelDynamicBatching dynamic_batching
    //@@
    //@@     If specified, enables the dynamic-batching scheduling
    //@@     policy. With dynamic-batching the scheduler may group
    //@@     together independent requests into a single batch to
    //@@     improve inference throughput.
    //@@
    ModelDynamicBatching dynamic_batching = 11;

    //@@  .. cpp:var:: ModelSequenceBatching sequence_batching
    //@@
    //@@     If specified, enables the sequence-batching scheduling
    //@@     policy. With sequence-batching, inference requests
    //@@     with the same correlation ID are routed to the same
    //@@     model instance. Multiple sequences of inference requests
    //@@     may be batched together into a single batch to
    //@@     improve inference throughput.
    //@@
    ModelSequenceBatching sequence_batching = 13;

    //@@  .. cpp:var:: ModelEnsembling ensemble_scheduling
    //@@
    //@@     If specified, enables the model-ensembling scheduling
    //@@     policy. With model-ensembling, inference requests
    //@@     will be processed according to the specification, such as an
    //@@     execution sequence of models. The input specified in this model
    //@@     config will be the input for the ensemble, and the output
    //@@     specified will be the output of the ensemble.
    //@@
    ModelEnsembling ensemble_scheduling = 15;
  }
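
  // A config.pbtxt sketch selecting the dynamic-batching policy from the
  // oneof above; the batch sizes and queue-delay value are hypothetical
  // (ModelDynamicBatching is defined earlier in this file):
  //
  //   dynamic_batching {
  //     preferred_batch_size: [ 4, 8 ]
  //     max_queue_delay_microseconds: 100
  //   }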

  //@@  .. cpp:var:: ModelInstanceGroup instance_group (repeated)
  //@@
  //@@     Instances of this model. If not specified, one instance
  //@@     of the model will be instantiated on each available GPU.
  //@@
  repeated ModelInstanceGroup instance_group = 7;
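
  // A config.pbtxt sketch pinning two instances of the model to GPU 0
  // (ModelInstanceGroup and its KIND_GPU enum are defined earlier in this
  // file):
  //
  //   instance_group [
  //     {
  //       count: 2
  //       kind: KIND_GPU
  //       gpus: [ 0 ]
  //     }
  //   ]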

  //@@  .. cpp:var:: string default_model_filename
  //@@
  //@@     Optional filename of the model file to use if a
  //@@     compute-capability specific model is not specified in
  //@@     :cpp:var:`cc_model_filenames`. If not specified the default name
  //@@     is 'model.graphdef', 'model.savedmodel', 'model.plan' or
  //@@     'model.pt' depending on the model type.
  //@@
  string default_model_filename = 8;

  //@@  .. cpp:var:: map<string,string> cc_model_filenames
  //@@
  //@@     Optional map from CUDA compute capability to the filename of
  //@@     the model that supports that compute capability. The filename
  //@@     refers to a file within the model version directory.
  //@@
  map<string, string> cc_model_filenames = 9;
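
  // A config.pbtxt sketch mapping CUDA compute capabilities to per-GPU
  // model files; the capability keys and filenames are hypothetical:
  //
  //   default_model_filename: "model.plan"
  //   cc_model_filenames { key: "7.5" value: "model_sm75.plan" }
  //   cc_model_filenames { key: "8.0" value: "model_sm80.plan" }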

  //@@  .. cpp:var:: map<string,string> metric_tags
  //@@
  //@@     Optional metric tags. User-specific key-value pairs for metrics
  //@@     reported for this model. These tags are applied to the metrics
  //@@     reported on the HTTP metrics port.
  //@@
  map<string, string> metric_tags = 10;

  //@@  .. cpp:var:: map<string,ModelParameter> parameters
  //@@
  //@@     Optional model parameters. User-specified parameter values.
  //@@
  map<string, ModelParameter> parameters = 14;
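
  // A config.pbtxt sketch of a user-defined parameter carried as a
  // ModelParameter string value; the key and value are hypothetical and
  // their interpretation is left to the backend:
  //
  //   parameters {
  //     key: "execution_mode"
  //     value: { string_value: "graph" }
  //   }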

  //@@  .. cpp:var:: ModelWarmup model_warmup (repeated)
  //@@
  //@@     Warmup setting of this model. If specified, all instances
  //@@     will be run with the request samples in sequence before
  //@@     serving the model.
  //@@     This field can only be specified if the model is not an ensemble
  //@@     model.
  //@@
  repeated ModelWarmup model_warmup = 16;

  //@@  .. cpp:var:: ModelOperations model_operations
  //@@
  //@@     Optional metadata of the libraries providing custom operations for
  //@@     this model.
  //@@
  ModelOperations model_operations = 18;

  //@@  .. cpp:var:: ModelTransactionPolicy model_transaction_policy
  //@@
  //@@     Optional specification that describes the nature of transactions
  //@@     to be expected from the model.
  //@@
  ModelTransactionPolicy model_transaction_policy = 19;

  //@@  .. cpp:var:: ModelRepositoryAgents model_repository_agents
  //@@
  //@@     Optional specification of the agent(s) that should be invoked
  //@@     when repository actions are performed for this model.
  //@@
  ModelRepositoryAgents model_repository_agents = 23;

  //@@  .. cpp:var:: ModelResponseCache response_cache
  //@@
  //@@     Optional setting for utilizing the response cache for this
  //@@     model.
  //@@
  ModelResponseCache response_cache = 24;
}