feat(components): Extend kserve component (#10136)
* add runtime version, resource requests and resource limits

* adjust kservedeployer

* Update components/kserve/src/kservedeployer.py

Co-authored-by: Tommy Li <Tommy.chaoping.li@ibm.com>

* Update components/kserve/src/kservedeployer.py

Co-authored-by: Tommy Li <Tommy.chaoping.li@ibm.com>

* Update components/kserve/src/kservedeployer.py

Co-authored-by: Tommy Li <Tommy.chaoping.li@ibm.com>

---------

Co-authored-by: Tommy Li <Tommy.chaoping.li@ibm.com>
parent 0cb2217934
commit 2054b7c45d
@@ -39,6 +39,9 @@ kserve_op = components.load_component_from_url('https://raw.githubusercontent.co
 | canary_traffic_percent | `100` | The traffic split percentage between the candidate model and the last ready model |
 | namespace | | Kubernetes namespace where the KServe service is deployed. If no namespace is provided, `anonymous` will be used unless a namespace is provided in the `inferenceservice_yaml` argument. |
 | framework | | Machine learning framework for model serving. Currently the supported frameworks are `tensorflow`, `pytorch`, `sklearn`, `xgboost`, `onnx`, `triton`, `pmml`, and `lightgbm`. |
+| runtime_version | `latest` | Runtime Version of Machine Learning Framework |
+| resource_requests | `{"cpu": "0.5", "memory": "512Mi"}` | CPU and Memory requests for Model Serving |
+| resource_limits | `{"cpu": "1", "memory": "1Gi"}` | CPU and Memory limits for Model Serving |
 | custom_model_spec | `{}` | Custom model runtime container spec in JSON. Sample spec: `{"image": "codait/max-object-detector", "port":5000, "name": "test-container"}` |
 | inferenceservice_yaml | `{}` | Raw InferenceService serialized YAML for deployment. Use this if you need additional configurations for your InferenceService. |
 | autoscaling_target | `0` | Autoscaling Target Number. If not 0, sets the following annotation on the InferenceService: `autoscaling.knative.dev/target` |
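A sketch of how the new arguments could be passed when the component is used in a pipeline (illustrative only: the model name, model URI, and resource values are placeholders, and `kserve_op` is assumed to have been loaded with `components.load_component_from_url` as in the snippet referenced by the hunk above):

```python
# Illustrative pipeline step: deploy a model with an explicit runtime version
# and resource requests/limits. All values here are placeholders.
kserve_op(
    action="apply",
    model_name="example-sklearn-model",
    model_uri="gs://your-bucket/path/to/model",
    framework="sklearn",
    runtime_version="latest",
    resource_requests='{"cpu": "0.5", "memory": "512Mi"}',
    resource_limits='{"cpu": "1", "memory": "1Gi"}',
)
```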
@@ -185,4 +188,3 @@ kserve_op(
     inferenceservice_yaml=isvc_yaml
 )
 ```
-
@@ -1,25 +1,28 @@
 name: Serve a model with KServe
 description: Serve Models using KServe
 inputs:
-  - {name: Action, type: String, default: 'create', description: 'Action to execute on KServe'}
-  - {name: Model Name, type: String, default: '', description: 'Name to give to the deployed model'}
-  - {name: Model URI, type: String, default: '', description: 'Path of the S3 or GCS compatible directory containing the model.'}
-  - {name: Canary Traffic Percent, type: String, default: '100', description: 'The traffic split percentage between the candidate model and the last ready model'}
-  - {name: Namespace, type: String, default: '', description: 'Kubernetes namespace where the KServe service is deployed.'}
-  - {name: Framework, type: String, default: '', description: 'Machine Learning Framework for Model Serving.'}
-  - {name: Custom Model Spec, type: String, default: '{}', description: 'Custom model runtime container spec in JSON'}
-  - {name: Autoscaling Target, type: String, default: '0', description: 'Autoscaling Target Number'}
-  - {name: Service Account, type: String, default: '', description: 'ServiceAccount to use to run the InferenceService pod'}
-  - {name: Enable Istio Sidecar, type: Bool, default: 'True', description: 'Whether to enable istio sidecar injection'}
-  - {name: InferenceService YAML, type: String, default: '{}', description: 'Raw InferenceService serialized YAML for deployment'}
-  - {name: Watch Timeout, type: String, default: '300', description: "Timeout seconds for watching until InferenceService becomes ready."}
-  - {name: Min Replicas, type: String, default: '-1', description: 'Minimum number of InferenceService replicas'}
-  - {name: Max Replicas, type: String, default: '-1', description: 'Maximum number of InferenceService replicas'}
-  - {name: Request Timeout, type: String, default: '60', description: "Specifies the number of seconds to wait before timing out a request to the component."}
-  - {name: Enable ISVC Status, type: Bool, default: 'True', description: "Specifies whether to store the inference service status as the output parameter"}
+  - {name: Action, type: String, default: 'create', description: 'Action to execute on KServe'}
+  - {name: Model Name, type: String, default: '', description: 'Name to give to the deployed model'}
+  - {name: Model URI, type: String, default: '', description: 'Path of the S3 or GCS compatible directory containing the model.'}
+  - {name: Canary Traffic Percent, type: String, default: '100', description: 'The traffic split percentage between the candidate model and the last ready model'}
+  - {name: Namespace, type: String, default: '', description: 'Kubernetes namespace where the KServe service is deployed.'}
+  - {name: Framework, type: String, default: '', description: 'Machine Learning Framework for Model Serving.'}
+  - {name: Runtime Version, type: String, default: 'latest', description: 'Runtime Version of Machine Learning Framework'}
+  - {name: Resource Requests, type: String, default: '{"cpu": "0.5", "memory": "512Mi"}', description: 'CPU and Memory requests for Model Serving'}
+  - {name: Resource Limits, type: String, default: '{"cpu": "1", "memory": "1Gi"}', description: 'CPU and Memory limits for Model Serving'}
+  - {name: Custom Model Spec, type: String, default: '{}', description: 'Custom model runtime container spec in JSON'}
+  - {name: Autoscaling Target, type: String, default: '0', description: 'Autoscaling Target Number'}
+  - {name: Service Account, type: String, default: '', description: 'ServiceAccount to use to run the InferenceService pod'}
+  - {name: Enable Istio Sidecar, type: Bool, default: 'True', description: 'Whether to enable istio sidecar injection'}
+  - {name: InferenceService YAML, type: String, default: '{}', description: 'Raw InferenceService serialized YAML for deployment'}
+  - {name: Watch Timeout, type: String, default: '300', description: "Timeout seconds for watching until InferenceService becomes ready."}
+  - {name: Min Replicas, type: String, default: '-1', description: 'Minimum number of InferenceService replicas'}
+  - {name: Max Replicas, type: String, default: '-1', description: 'Maximum number of InferenceService replicas'}
+  - {name: Request Timeout, type: String, default: '60', description: "Specifies the number of seconds to wait before timing out a request to the component."}
+  - {name: Enable ISVC Status, type: Bool, default: 'True', description: "Specifies whether to store the inference service status as the output parameter"}
 
 outputs:
-  - {name: InferenceService Status, type: String, description: 'Status JSON output of InferenceService'}
+  - {name: InferenceService Status, type: String, description: 'Status JSON output of InferenceService'}
 implementation:
   container:
     image: quay.io/aipipeline/kserve-component:v0.11.1
@@ -32,6 +35,9 @@ implementation:
       --canary-traffic-percent, {inputValue: Canary Traffic Percent},
       --namespace, {inputValue: Namespace},
       --framework, {inputValue: Framework},
+      --runtime-version, {inputValue: Runtime Version},
+      --resource-requests, {inputValue: Resource Requests},
+      --resource-limits, {inputValue: Resource Limits},
       --custom-model-spec, {inputValue: Custom Model Spec},
      --autoscaling-target, {inputValue: Autoscaling Target},
      --service-account, {inputValue: Service Account},
@@ -21,6 +21,7 @@ import time
 import yaml
 
 from kubernetes import client
+from kubernetes.client.models import V1ResourceRequirements
 
 from kserve import constants
 from kserve import KServeClient
@@ -50,8 +51,9 @@ AVAILABLE_FRAMEWORKS = {
 }
 
 
-def create_predictor_spec(framework, storage_uri, canary_traffic_percent,
-                          service_account, min_replicas, max_replicas, containers, request_timeout):
+def create_predictor_spec(framework, runtime_version, resource_requests, resource_limits,
+                          storage_uri, canary_traffic_percent, service_account, min_replicas,
+                          max_replicas, containers, request_timeout):
     """
     Create and return V1beta1PredictorSpec to be used in a V1beta1InferenceServiceSpec
     object.
@@ -81,7 +83,14 @@ def create_predictor_spec(framework, storage_uri, canary_traffic_percent,
     setattr(
         predictor_spec,
         framework,
-        AVAILABLE_FRAMEWORKS[framework](storage_uri=storage_uri)
+        AVAILABLE_FRAMEWORKS[framework](
+            storage_uri=storage_uri,
+            resources=V1ResourceRequirements(
+                requests=resource_requests,
+                limits=resource_limits
+            ),
+            runtime_version=runtime_version
+        )
     )
     return predictor_spec
 
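For context, the `resources` and `runtime_version` arguments above land on the framework-specific predictor spec. A minimal sketch of roughly what this builds for the `sklearn` framework, assuming `AVAILABLE_FRAMEWORKS['sklearn']` maps to the kserve SDK's `V1beta1SKLearnSpec` (resource values are the component defaults; the storage URI is a placeholder):

```python
from kserve import V1beta1PredictorSpec, V1beta1SKLearnSpec
from kubernetes.client.models import V1ResourceRequirements

# Roughly the object create_predictor_spec() assembles when framework="sklearn",
# using the component's default resource requests/limits.
predictor = V1beta1PredictorSpec(
    sklearn=V1beta1SKLearnSpec(
        storage_uri="gs://your-bucket/path/to/model",  # placeholder
        runtime_version="latest",
        resources=V1ResourceRequirements(
            requests={"cpu": "0.5", "memory": "512Mi"},
            limits={"cpu": "1", "memory": "1Gi"},
        ),
    )
)
```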
@@ -178,10 +187,10 @@ def submit_api_request(kserve_client, action, name, isvc, namespace=None,
     return outputs
 
 
-def perform_action(action, model_name, model_uri, canary_traffic_percent, namespace,
-                   framework, custom_model_spec, service_account, inferenceservice_yaml,
-                   request_timeout, autoscaling_target=0, enable_istio_sidecar=True,
-                   watch_timeout=300, min_replicas=0, max_replicas=0):
+def perform_action(action, model_name, model_uri, canary_traffic_percent, namespace, framework,
+                   runtime_version, resource_requests, resource_limits, custom_model_spec,
+                   service_account, inferenceservice_yaml, request_timeout, autoscaling_target=0,
+                   enable_istio_sidecar=True, watch_timeout=300, min_replicas=0, max_replicas=0):
     """
     Perform the specified action. If the action is not 'delete' and `inferenceService_yaml`
     was provided, the dict representation of the YAML will be sent directly to the
@@ -224,8 +233,9 @@ def perform_action(action, model_name, model_uri, canary_traffic_percent, namesp
 
         # Build the V1beta1PredictorSpec.
         predictor_spec = create_predictor_spec(
-            framework, model_uri, canary_traffic_percent, service_account,
-            min_replicas, max_replicas, containers, request_timeout
+            framework, runtime_version, resource_requests, resource_limits,
+            model_uri, canary_traffic_percent, service_account, min_replicas,
+            max_replicas, containers, request_timeout
         )
 
         isvc = create_inference_service(metadata, predictor_spec)
@@ -287,6 +297,24 @@ def main():
         str(list(AVAILABLE_FRAMEWORKS.keys())),
         default=""
     )
+    parser.add_argument(
+        "--runtime-version",
+        type=str,
+        help="Runtime Version of Machine Learning Framework",
+        default="latest"
+    )
+    parser.add_argument(
+        "--resource-requests",
+        type=json.loads,
+        help="CPU and Memory requests for Model Serving",
+        default='{"cpu": "0.5", "memory": "512Mi"}',
+    )
+    parser.add_argument(
+        "--resource-limits",
+        type=json.loads,
+        help="CPU and Memory limits for Model Serving",
+        default='{"cpu": "1", "memory": "1Gi"}',
+    )
     parser.add_argument(
         "--custom-model-spec",
         type=json.loads,
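Because `--resource-requests` and `--resource-limits` use `type=json.loads`, their values reach `main()` as dicts rather than strings (argparse also applies the type conversion to string defaults). A small sketch of that behavior with a stand-in parser that mirrors the flags above (the runtime version and override values are hypothetical):

```python
import argparse
import json

# Stand-in parser mirroring the new flags, showing how the JSON strings
# (and the string defaults) are converted to dicts by json.loads.
parser = argparse.ArgumentParser()
parser.add_argument("--runtime-version", type=str, default="latest")
parser.add_argument("--resource-requests", type=json.loads,
                    default='{"cpu": "0.5", "memory": "512Mi"}')
parser.add_argument("--resource-limits", type=json.loads,
                    default='{"cpu": "1", "memory": "1Gi"}')

args = parser.parse_args([
    "--runtime-version", "2.6.2",                          # hypothetical version
    "--resource-requests", '{"cpu": "1", "memory": "1Gi"}',
])

print(type(args.resource_requests))  # <class 'dict'>
print(args.resource_limits)          # {'cpu': '1', 'memory': '1Gi'} (parsed default)
```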
@@ -342,6 +370,9 @@ def main():
     canary_traffic_percent = int(args.canary_traffic_percent)
     namespace = args.namespace
     framework = args.framework.lower()
+    runtime_version = args.runtime_version.lower()
+    resource_requests = args.resource_requests
+    resource_limits = args.resource_limits
     output_path = args.output_path
     custom_model_spec = args.custom_model_spec
     autoscaling_target = int(args.autoscaling_target)
@@ -381,6 +412,9 @@ def main():
         canary_traffic_percent=canary_traffic_percent,
         namespace=namespace,
         framework=framework,
+        runtime_version=runtime_version,
+        resource_requests=resource_requests,
+        resource_limits=resource_limits,
         custom_model_spec=custom_model_spec,
         autoscaling_target=autoscaling_target,
         service_account=service_account,