File: api_op_DescribeTrainingJob.go

package info (click to toggle)
golang-github-aws-aws-sdk-go-v2 1.24.1-2~bpo12%2B1
links: PTS, VCS
area: main
in suites: bookworm-backports
size: 554,032 kB
sloc: java: 15,941; makefile: 419; sh: 175
file content (610 lines) | stat: -rw-r--r-- 23,101 bytes
// Code generated by smithy-go-codegen DO NOT EDIT.

package sagemaker

import (
	"context"
	"errors"
	"fmt"
	awsmiddleware "github.com/aws/aws-sdk-go-v2/aws/middleware"
	"github.com/aws/aws-sdk-go-v2/aws/signer/v4"
	"github.com/aws/aws-sdk-go-v2/service/sagemaker/types"
	smithy "github.com/aws/smithy-go"
	"github.com/aws/smithy-go/middleware"
	smithytime "github.com/aws/smithy-go/time"
	smithyhttp "github.com/aws/smithy-go/transport/http"
	smithywaiter "github.com/aws/smithy-go/waiter"
	"github.com/jmespath/go-jmespath"
	"time"
)

// DescribeTrainingJob returns information about a training job.
//
// Some of the output attributes only appear if the training job successfully
// starts. If the training job fails, TrainingJobStatus is Failed and, depending
// on the FailureReason, attributes like TrainingStartTime,
// TrainingTimeInSeconds, TrainingEndTime, and BillableTimeInSeconds may not be
// present in the response.
//
// A nil params is replaced by an empty DescribeTrainingJobInput before the
// operation is invoked. On success the returned output carries the operation's
// result metadata in its ResultMetadata field.
func (c *Client) DescribeTrainingJob(ctx context.Context, params *DescribeTrainingJobInput, optFns ...func(*Options)) (*DescribeTrainingJobOutput, error) {
	input := params
	if input == nil {
		input = new(DescribeTrainingJobInput)
	}

	raw, md, err := c.invokeOperation(ctx, "DescribeTrainingJob", input, optFns, c.addOperationDescribeTrainingJobMiddlewares)
	if err != nil {
		return nil, err
	}

	resp := raw.(*DescribeTrainingJobOutput)
	resp.ResultMetadata = md
	return resp, nil
}

// DescribeTrainingJobInput is the request parameter structure for the
// DescribeTrainingJob operation.
type DescribeTrainingJobInput struct {

	// The name of the training job.
	//
	// This member is required.
	TrainingJobName *string

	noSmithyDocumentSerde
}

// DescribeTrainingJobOutput is the result structure returned by the
// DescribeTrainingJob operation.
type DescribeTrainingJobOutput struct {

	// Information about the algorithm used for training, and algorithm metadata.
	//
	// This member is required.
	AlgorithmSpecification *types.AlgorithmSpecification

	// A timestamp that indicates when the training job was created.
	//
	// This member is required.
	CreationTime *time.Time

	// Information about the Amazon S3 location that is configured for storing model
	// artifacts.
	//
	// This member is required.
	ModelArtifacts *types.ModelArtifacts

	// Resources, including ML compute instances and ML storage volumes, that are
	// configured for model training.
	//
	// This member is required.
	ResourceConfig *types.ResourceConfig

	// Provides detailed information about the state of the training job. For detailed
	// information on the secondary status of the training job, see StatusMessage
	// under SecondaryStatusTransition (https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_SecondaryStatusTransition.html).
	// SageMaker provides primary statuses and secondary statuses that apply to each
	// of them.
	//
	// InProgress:
	//   - Starting - Starting the training job.
	//   - Downloading - An optional stage for algorithms that support File training
	//   input mode. It indicates that data is being downloaded to the ML storage
	//   volumes.
	//   - Training - Training is in progress.
	//   - Interrupted - The job stopped because the managed spot training instances
	//   were interrupted.
	//   - Uploading - Training is complete and the model artifacts are being uploaded
	//   to the S3 location.
	//
	// Completed:
	//   - Completed - The training job has completed.
	//
	// Failed:
	//   - Failed - The training job has failed. The reason for the failure is returned
	//   in the FailureReason field of DescribeTrainingJobResponse.
	//
	// Stopped:
	//   - MaxRuntimeExceeded - The job stopped because it exceeded the maximum allowed
	//   runtime.
	//   - MaxWaitTimeExceeded - The job stopped because it exceeded the maximum
	//   allowed wait time.
	//   - Stopped - The training job has stopped.
	//
	// Stopping:
	//   - Stopping - Stopping the training job.
	//
	// Valid values for SecondaryStatus are subject to change. We no longer support
	// the following secondary statuses:
	//   - LaunchingMLInstances
	//   - PreparingTraining
	//   - DownloadingTrainingImage
	//
	// This member is required.
	SecondaryStatus types.SecondaryStatus

	// Specifies a limit to how long a model training job can run. It also specifies
	// how long a managed Spot training job has to complete. When the job reaches the
	// time limit, SageMaker ends the training job. Use this API to cap model training
	// costs. To stop a job, SageMaker sends the algorithm the SIGTERM signal, which
	// delays job termination for 120 seconds. Algorithms can use this 120-second
	// window to save the model artifacts, so the results of training are not lost.
	//
	// This member is required.
	StoppingCondition *types.StoppingCondition

	// The Amazon Resource Name (ARN) of the training job.
	//
	// This member is required.
	TrainingJobArn *string

	// Name of the model training job.
	//
	// This member is required.
	TrainingJobName *string

	// The status of the training job. SageMaker provides the following training job
	// statuses:
	//   - InProgress - The training is in progress.
	//   - Completed - The training job has completed.
	//   - Failed - The training job has failed. To see the reason for the failure, see
	//   the FailureReason field in the response to a DescribeTrainingJobResponse call.
	//   - Stopping - The training job is stopping.
	//   - Stopped - The training job has stopped.
	//
	// For more detailed information, see SecondaryStatus.
	//
	// This member is required.
	TrainingJobStatus types.TrainingJobStatus

	// The Amazon Resource Name (ARN) of an AutoML job.
	AutoMLJobArn *string

	// The billable time in seconds. Billable time refers to the absolute wall-clock
	// time. Multiply BillableTimeInSeconds by the number of instances (InstanceCount)
	// in your training cluster to get the total compute time SageMaker bills you if
	// you run distributed training. The formula is as follows:
	// BillableTimeInSeconds * InstanceCount. You can calculate the savings from
	// using managed spot training using the formula
	// (1 - BillableTimeInSeconds / TrainingTimeInSeconds) * 100.
	// For example, if BillableTimeInSeconds is 100 and TrainingTimeInSeconds is 500,
	// the savings is 80%.
	BillableTimeInSeconds *int32

	// Contains information about the output location for managed spot training
	// checkpoint data.
	CheckpointConfig *types.CheckpointConfig

	// Configuration information for the Amazon SageMaker Debugger hook parameters,
	// metric and tensor collections, and storage paths. To learn more about how to
	// configure the DebugHookConfig parameter, see Use the SageMaker and Debugger
	// Configuration API Operations to Create, Update, and Debug Your Training Job (https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-createtrainingjob-api.html).
	DebugHookConfig *types.DebugHookConfig

	// Configuration information for Amazon SageMaker Debugger rules for debugging
	// output tensors.
	DebugRuleConfigurations []types.DebugRuleConfiguration

	// Evaluation status of Amazon SageMaker Debugger rules for debugging on a
	// training job.
	DebugRuleEvaluationStatuses []types.DebugRuleEvaluationStatus

	// To encrypt all communications between ML compute instances in distributed
	// training, choose True. Encryption provides greater security for distributed
	// training, but training might take longer. How long it takes depends on the
	// amount of communication between compute instances, especially if you use
	// deep learning algorithms in distributed training.
	EnableInterContainerTrafficEncryption *bool

	// A Boolean indicating whether managed spot training is enabled (True) or not
	// (False).
	EnableManagedSpotTraining *bool

	// If you want to allow inbound or outbound network calls, except for calls
	// between peers within a training cluster for distributed training, choose True.
	// If you enable network isolation for training jobs that are configured to use a
	// VPC, SageMaker downloads and uploads customer data and model artifacts through
	// the specified VPC, but the training container does not have network access.
	EnableNetworkIsolation *bool

	// The environment variables to set in the Docker container.
	Environment map[string]string

	// Associates a SageMaker job as a trial component with an experiment and trial.
	// Specified when you call the following APIs:
	//   - CreateProcessingJob (https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateProcessingJob.html)
	//   - CreateTrainingJob (https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateTrainingJob.html)
	//   - CreateTransformJob (https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateTransformJob.html)
	ExperimentConfig *types.ExperimentConfig

	// If the training job failed, the reason it failed.
	FailureReason *string

	// A collection of MetricData objects that specify the names, values, and dates
	// and times that the training algorithm emitted to Amazon CloudWatch.
	FinalMetricDataList []types.MetricData

	// Algorithm-specific parameters.
	HyperParameters map[string]string

	// Contains information about the infrastructure health check configuration for
	// the training job.
	InfraCheckConfig *types.InfraCheckConfig

	// An array of Channel objects that describes each data input channel.
	InputDataConfig []types.Channel

	// The Amazon Resource Name (ARN) of the SageMaker Ground Truth labeling job that
	// created the transform or training job.
	LabelingJobArn *string

	// A timestamp that indicates when the status of the training job was last
	// modified.
	LastModifiedTime *time.Time

	// The S3 path where model artifacts that you configured when creating the job are
	// stored. SageMaker creates subfolders for model artifacts.
	OutputDataConfig *types.OutputDataConfig

	// Configuration information for Amazon SageMaker Debugger system monitoring,
	// framework profiling, and storage paths.
	ProfilerConfig *types.ProfilerConfig

	// Configuration information for Amazon SageMaker Debugger rules for profiling
	// system and framework metrics.
	ProfilerRuleConfigurations []types.ProfilerRuleConfiguration

	// Evaluation status of Amazon SageMaker Debugger rules for profiling on a
	// training job.
	ProfilerRuleEvaluationStatuses []types.ProfilerRuleEvaluationStatus

	// Profiling status of a training job.
	ProfilingStatus types.ProfilingStatus

	// Configuration for remote debugging. To learn more about the remote debugging
	// functionality of SageMaker, see Access a training container through Amazon Web
	// Services Systems Manager (SSM) for remote debugging (https://docs.aws.amazon.com/sagemaker/latest/dg/train-remote-debugging.html).
	RemoteDebugConfig *types.RemoteDebugConfig

	// The number of times to retry the job when the job fails due to an
	// InternalServerError.
	RetryStrategy *types.RetryStrategy

	// The Amazon Web Services Identity and Access Management (IAM) role configured
	// for the training job.
	RoleArn *string

	// A history of all of the secondary statuses that the training job has
	// transitioned through.
	SecondaryStatusTransitions []types.SecondaryStatusTransition

	// Configuration of storage locations for the Amazon SageMaker Debugger
	// TensorBoard output data.
	TensorBoardOutputConfig *types.TensorBoardOutputConfig

	// Indicates the time when the training job ends on training instances. You are
	// billed for the time interval between the value of TrainingStartTime and this
	// time. For successful jobs and stopped jobs, this is the time after model
	// artifacts are uploaded. For failed jobs, this is the time when SageMaker detects
	// a job failure.
	TrainingEndTime *time.Time

	// Indicates the time when the training job starts on training instances. You are
	// billed for the time interval between this time and the value of
	// TrainingEndTime. The start time in CloudWatch Logs might be later than this
	// time. The difference is due to the time it takes to download the training data
	// and to the size of the training container.
	TrainingStartTime *time.Time

	// The training time in seconds.
	TrainingTimeInSeconds *int32

	// The Amazon Resource Name (ARN) of the associated hyperparameter tuning job if
	// the training job was launched by a hyperparameter tuning job.
	TuningJobArn *string

	// A VpcConfig (https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_VpcConfig.html)
	// object that specifies the VPC that this training job has access to. For more
	// information, see Protect Training Jobs by Using an Amazon Virtual Private Cloud (https://docs.aws.amazon.com/sagemaker/latest/dg/train-vpc.html).
	VpcConfig *types.VpcConfig

	// The status of the warm pool associated with the training job.
	WarmPoolStatus *types.WarmPoolStatus

	// Metadata pertaining to the operation's result.
	ResultMetadata middleware.Metadata

	noSmithyDocumentSerde
}

// addOperationDescribeTrainingJobMiddlewares registers on stack the middlewares
// used by the DescribeTrainingJob operation, configured from options.
//
// Registration stops at the first failure and that error is returned; nil is
// returned once every middleware has been added. A failure from the protocol
// finalizers is wrapped (with %w) so callers can still inspect the underlying
// cause with errors.Is / errors.As.
func (c *Client) addOperationDescribeTrainingJobMiddlewares(stack *middleware.Stack, options Options) (err error) {
	// Serialize / deserialize steps for the operation's wire protocol.
	if err := stack.Serialize.Add(&setOperationInputMiddleware{}, middleware.After); err != nil {
		return err
	}
	err = stack.Serialize.Add(&awsAwsjson11_serializeOpDescribeTrainingJob{}, middleware.After)
	if err != nil {
		return err
	}
	err = stack.Deserialize.Add(&awsAwsjson11_deserializeOpDescribeTrainingJob{}, middleware.After)
	if err != nil {
		return err
	}
	if err := addProtocolFinalizerMiddlewares(stack, options, "DescribeTrainingJob"); err != nil {
		// Wrap rather than flatten so the original error stays in the chain.
		return fmt.Errorf("add protocol finalizers: %w", err)
	}

	if err = addlegacyEndpointContextSetter(stack, options); err != nil {
		return err
	}
	if err = addSetLoggerMiddleware(stack, options); err != nil {
		return err
	}
	if err = awsmiddleware.AddClientRequestIDMiddleware(stack); err != nil {
		return err
	}
	if err = smithyhttp.AddComputeContentLengthMiddleware(stack); err != nil {
		return err
	}
	if err = addResolveEndpointMiddleware(stack, options); err != nil {
		return err
	}
	if err = v4.AddComputePayloadSHA256Middleware(stack); err != nil {
		return err
	}
	if err = addRetryMiddlewares(stack, options); err != nil {
		return err
	}
	if err = awsmiddleware.AddRawResponseToMetadata(stack); err != nil {
		return err
	}
	if err = awsmiddleware.AddRecordResponseTiming(stack); err != nil {
		return err
	}
	if err = addClientUserAgent(stack, options); err != nil {
		return err
	}
	if err = smithyhttp.AddErrorCloseResponseBodyMiddleware(stack); err != nil {
		return err
	}
	if err = smithyhttp.AddCloseResponseBodyMiddleware(stack); err != nil {
		return err
	}
	if err = addSetLegacyContextSigningOptionsMiddleware(stack); err != nil {
		return err
	}
	if err = addOpDescribeTrainingJobValidationMiddleware(stack); err != nil {
		return err
	}
	if err = stack.Initialize.Add(newServiceMetadataMiddleware_opDescribeTrainingJob(options.Region), middleware.Before); err != nil {
		return err
	}
	if err = awsmiddleware.AddRecursionDetection(stack); err != nil {
		return err
	}
	if err = addRequestIDRetrieverMiddleware(stack); err != nil {
		return err
	}
	if err = addResponseErrorMiddleware(stack); err != nil {
		return err
	}
	if err = addRequestResponseLogging(stack, options); err != nil {
		return err
	}
	if err = addDisableHTTPSMiddleware(stack, options); err != nil {
		return err
	}
	return nil
}

// DescribeTrainingJobAPIClient is a client that implements the
// DescribeTrainingJob operation. The TrainingJobCompletedOrStopped waiter
// accepts this interface rather than the concrete *Client, so any value
// providing a matching DescribeTrainingJob method can be used with it.
type DescribeTrainingJobAPIClient interface {
	DescribeTrainingJob(context.Context, *DescribeTrainingJobInput, ...func(*Options)) (*DescribeTrainingJobOutput, error)
}

// Compile-time assertion that *Client satisfies DescribeTrainingJobAPIClient.
var _ DescribeTrainingJobAPIClient = (*Client)(nil)

// TrainingJobCompletedOrStoppedWaiterOptions are waiter options for
// TrainingJobCompletedOrStoppedWaiter.
type TrainingJobCompletedOrStoppedWaiterOptions struct {

	// Set of options to modify how an operation is invoked. These apply to all
	// operations invoked for this client. Use functional options on operation call to
	// modify this list for per operation behavior.
	APIOptions []func(*middleware.Stack) error

	// MinDelay is the minimum amount of time to delay between retries. If unset,
	// TrainingJobCompletedOrStoppedWaiter will use a default minimum delay of 120
	// seconds. Note that MinDelay must resolve to a value less than or equal to the
	// MaxDelay.
	MinDelay time.Duration

	// MaxDelay is the maximum amount of time to delay between retries. If unset or
	// set to zero, TrainingJobCompletedOrStoppedWaiter will use a default max delay
	// of 120 seconds. Note that MaxDelay must resolve to a value greater than or
	// equal to the MinDelay.
	MaxDelay time.Duration

	// LogWaitAttempts is used to enable logging for waiter retry attempts.
	LogWaitAttempts bool

	// Retryable is a function that can be used to override the service-defined
	// waiter behavior based on operation output, or returned error. This function is
	// used by the waiter to decide if a state is retryable or a terminal state. By
	// default service-modeled logic will populate this option. This option can thus be
	// used to define a custom waiter state with fall-back to service-modeled waiter
	// state mutators. The function returns an error in case of a failure state. In
	// case of a retry state, this function returns a bool value of true and a nil
	// error, while in case of success it returns a bool value of false and a nil
	// error.
	Retryable func(context.Context, *DescribeTrainingJobInput, *DescribeTrainingJobOutput, error) (bool, error)
}

// TrainingJobCompletedOrStoppedWaiter defines the waiters for
// TrainingJobCompletedOrStopped. It repeatedly calls DescribeTrainingJob through
// its client until the configured Retryable function reports a terminal state
// or the maximum wait duration is exhausted.
type TrainingJobCompletedOrStoppedWaiter struct {
	// client issues the DescribeTrainingJob calls made while waiting.
	client DescribeTrainingJobAPIClient

	// options holds the waiter-level defaults; each wait call works on a copy
	// that per-call option functions may further modify.
	options TrainingJobCompletedOrStoppedWaiterOptions
}

// NewTrainingJobCompletedOrStoppedWaiter constructs a
// TrainingJobCompletedOrStoppedWaiter that polls through client.
//
// The waiter starts from a minimum and maximum delay of 120 seconds and the
// service-modeled retry logic; each function in optFns is then applied, in
// order, and may override any of those defaults.
func NewTrainingJobCompletedOrStoppedWaiter(client DescribeTrainingJobAPIClient, optFns ...func(*TrainingJobCompletedOrStoppedWaiterOptions)) *TrainingJobCompletedOrStoppedWaiter {
	const defaultDelay = 120 * time.Second

	opts := TrainingJobCompletedOrStoppedWaiterOptions{
		MinDelay:  defaultDelay,
		MaxDelay:  defaultDelay,
		Retryable: trainingJobCompletedOrStoppedStateRetryable,
	}
	for i := range optFns {
		optFns[i](&opts)
	}

	return &TrainingJobCompletedOrStoppedWaiter{client: client, options: opts}
}

// Wait runs the TrainingJobCompletedOrStopped waiter and discards the
// operation output, reporting only whether waiting succeeded. The maxWaitDur is
// the maximum duration the waiter will wait; it is required and must be greater
// than zero. It delegates to WaitForOutput.
func (w *TrainingJobCompletedOrStoppedWaiter) Wait(ctx context.Context, params *DescribeTrainingJobInput, maxWaitDur time.Duration, optFns ...func(*TrainingJobCompletedOrStoppedWaiterOptions)) error {
	if _, waitErr := w.WaitForOutput(ctx, params, maxWaitDur, optFns...); waitErr != nil {
		return waitErr
	}
	return nil
}

// WaitForOutput calls the waiter function for TrainingJobCompletedOrStopped
// waiter and returns the output of the successful operation. The maxWaitDur is the
// maximum wait duration the waiter will wait. The maxWaitDur is required and must
// be greater than zero.
//
// The waiter's options are copied and then modified by optFns for this call
// only. A non-positive MaxDelay is replaced by 120 seconds, and a nil Retryable
// is replaced by the service-modeled state function so that clearing the option
// cannot cause a nil-function panic.
//
// An error is returned when maxWaitDur is not positive, when MinDelay exceeds
// MaxDelay, when Retryable reports a failure state, when the delay cannot be
// computed, when the context ends while sleeping between attempts, or when the
// wait budget is exhausted before a terminal state is reached.
func (w *TrainingJobCompletedOrStoppedWaiter) WaitForOutput(ctx context.Context, params *DescribeTrainingJobInput, maxWaitDur time.Duration, optFns ...func(*TrainingJobCompletedOrStoppedWaiterOptions)) (*DescribeTrainingJobOutput, error) {
	if maxWaitDur <= 0 {
		return nil, fmt.Errorf("maximum wait time for waiter must be greater than zero")
	}

	// Work on a copy so per-call option functions never mutate the waiter.
	options := w.options
	for _, fn := range optFns {
		fn(&options)
	}

	if options.MaxDelay <= 0 {
		options.MaxDelay = 120 * time.Second
	}

	if options.MinDelay > options.MaxDelay {
		return nil, fmt.Errorf("minimum waiter delay %v must be lesser than or equal to maximum waiter delay of %v.", options.MinDelay, options.MaxDelay)
	}

	// An option function may have reset Retryable to nil; fall back to the
	// service-modeled state function instead of panicking on the call below.
	if options.Retryable == nil {
		options.Retryable = trainingJobCompletedOrStoppedStateRetryable
	}

	// Bound every attempt and every sleep by the overall wait budget.
	ctx, cancelFn := context.WithTimeout(ctx, maxWaitDur)
	defer cancelFn()

	logger := smithywaiter.Logger{}
	remainingTime := maxWaitDur

	var attempt int64
	for {

		attempt++
		apiOptions := options.APIOptions
		start := time.Now()

		if options.LogWaitAttempts {
			logger.Attempt = attempt
			// Copy before appending so the caller's APIOptions slice is not modified.
			apiOptions = append([]func(*middleware.Stack) error{}, options.APIOptions...)
			apiOptions = append(apiOptions, logger.AddLogger)
		}

		out, err := w.client.DescribeTrainingJob(ctx, params, func(o *Options) {
			o.APIOptions = append(o.APIOptions, apiOptions...)
		})

		// The state function sees both the output and the call error and decides
		// between failure (error), success (false) and retry (true).
		retryable, err := options.Retryable(ctx, params, out, err)
		if err != nil {
			return nil, err
		}
		if !retryable {
			return out, nil
		}

		// Charge the time spent on this attempt against the budget and stop when
		// there is not enough left for another minimum delay.
		remainingTime -= time.Since(start)
		if remainingTime < options.MinDelay || remainingTime <= 0 {
			break
		}

		// compute exponential backoff between waiter retries
		delay, err := smithywaiter.ComputeDelay(
			attempt, options.MinDelay, options.MaxDelay, remainingTime,
		)
		if err != nil {
			return nil, fmt.Errorf("error computing waiter delay, %w", err)
		}

		remainingTime -= delay
		// sleep for the delay amount before invoking a request
		if err := smithytime.SleepWithContext(ctx, delay); err != nil {
			return nil, fmt.Errorf("request cancelled while waiting, %w", err)
		}
	}
	return nil, fmt.Errorf("exceeded max wait time for TrainingJobCompletedOrStopped waiter")
}

// trainingJobCompletedOrStoppedStateRetryable is the default state function of
// the TrainingJobCompletedOrStopped waiter. Given the output and error of one
// DescribeTrainingJob call it reports whether the waiter should poll again.
//
// It returns:
//   - (false, nil) when the call succeeded and TrainingJobStatus is "Completed"
//     or "Stopped" (terminal success);
//   - (false, error) when TrainingJobStatus is "Failed", when the call failed
//     with a "ValidationException" API error, when err is not a
//     smithy.APIError, or when the status cannot be evaluated from output;
//   - (true, nil) in every other case, meaning the waiter should retry.
//
// The ctx and input parameters are unused; they are part of the Retryable
// function signature.
func trainingJobCompletedOrStoppedStateRetryable(ctx context.Context, input *DescribeTrainingJobInput, output *DescribeTrainingJobOutput, err error) (bool, error) {
	// Failed call: only a modeled API error may be retried, and a
	// ValidationException is a terminal failure.
	if err != nil {
		var apiErr smithy.APIError
		if !errors.As(err, &apiErr) {
			return false, fmt.Errorf("expected err to be of type smithy.APIError, got %w", err)
		}
		if apiErr.ErrorCode() == "ValidationException" {
			return false, fmt.Errorf("waiter state transitioned to Failure")
		}
		return true, nil
	}

	// Successful call: evaluate the status path once and match it against every
	// terminal state, instead of re-running the same search per state.
	pathValue, searchErr := jmespath.Search("TrainingJobStatus", output)
	if searchErr != nil {
		return false, fmt.Errorf("error evaluating waiter state: %w", searchErr)
	}

	status, ok := pathValue.(types.TrainingJobStatus)
	if !ok {
		return false, fmt.Errorf("waiter comparator expected types.TrainingJobStatus value, got %T", pathValue)
	}

	switch string(status) {
	case "Completed", "Stopped":
		return false, nil
	case "Failed":
		return false, fmt.Errorf("waiter state transitioned to Failure")
	}

	return true, nil
}

// newServiceMetadataMiddleware_opDescribeTrainingJob builds the service metadata
// middleware for the DescribeTrainingJob operation, recording the given region
// together with the package's ServiceID and the operation name.
func newServiceMetadataMiddleware_opDescribeTrainingJob(region string) *awsmiddleware.RegisterServiceMetadata {
	md := new(awsmiddleware.RegisterServiceMetadata)
	md.Region = region
	md.ServiceID = ServiceID
	md.OperationName = "DescribeTrainingJob"
	return md
}