Skip to content

Commit

Permalink
Merge pull request #111113 from mimowo/retriable-pod-failures-job-con…
Browse files Browse the repository at this point in the history
…troller

Support handling of pod failures with respect to the configured rules

Kubernetes-commit: eefcf6aa801c5db48b564d0464470d623b8bfb79
  • Loading branch information
k8s-publishing-bot committed Aug 5, 2022
2 parents 3be517c + fe83bea commit 2f9e588
Show file tree
Hide file tree
Showing 19 changed files with 1,633 additions and 122 deletions.
1,256 changes: 1,138 additions & 118 deletions batch/v1/generated.pb.go

Large diffs are not rendered by default.

99 changes: 99 additions & 0 deletions batch/v1/generated.proto

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

130 changes: 130 additions & 0 deletions batch/v1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,120 @@ const (
IndexedCompletion CompletionMode = "Indexed"
)

// PodFailurePolicyAction specifies how a Pod failure is handled.
// The values defined in this file are FailJob, Ignore and Count; see the
// PodFailurePolicyAction* constants for the meaning of each.
// +enum
type PodFailurePolicyAction string

const (
// PodFailurePolicyActionFailJob is an action which might be taken on a pod
// failure - mark the pod's job as Failed and terminate all running pods.
PodFailurePolicyActionFailJob PodFailurePolicyAction = "FailJob"

// PodFailurePolicyActionIgnore is an action which might be taken on a pod
// failure - the counter towards .backoffLimit, represented by the job's
// .status.failed field, is not incremented and a replacement pod is created.
PodFailurePolicyActionIgnore PodFailurePolicyAction = "Ignore"

// PodFailurePolicyActionCount is an action which might be taken on a pod
// failure - the pod failure is handled in the default way - the counter
// towards .backoffLimit, represented by the job's .status.failed field, is
// incremented.
PodFailurePolicyActionCount PodFailurePolicyAction = "Count"
)

// PodFailurePolicyOnExitCodesOperator names the relationship that is checked
// between container exit codes and the set of values listed in a
// PodFailurePolicyOnExitCodesRequirement.
// +enum
type PodFailurePolicyOnExitCodesOperator string

const (
// PodFailurePolicyOnExitCodesOpIn means the requirement is satisfied if at
// least one container exit code is in the set of specified values.
PodFailurePolicyOnExitCodesOpIn PodFailurePolicyOnExitCodesOperator = "In"
// PodFailurePolicyOnExitCodesOpNotIn means the requirement is satisfied if
// at least one container exit code is not in the set of specified values.
PodFailurePolicyOnExitCodesOpNotIn PodFailurePolicyOnExitCodesOperator = "NotIn"
)

// PodFailurePolicyOnExitCodesRequirement describes the requirement for handling
// a failed pod based on its container exit codes. In particular, it looks up the
// .state.terminated.exitCode for each app container and init container status,
// represented by the .status.containerStatuses and .status.initContainerStatuses
// fields in the Pod status, respectively. Containers completed with success
// (exit code 0) are excluded from the requirement check.
type PodFailurePolicyOnExitCodesRequirement struct {
// Restricts the check for exit codes to the container with the
// specified name. When null, the rule applies to all containers.
// When specified, it should match one of the container or initContainer
// names in the pod template.
// +optional
ContainerName *string `json:"containerName" protobuf:"bytes,1,opt,name=containerName"`

// Represents the relationship between the container exit code(s) and the
// specified values. Containers completed with success (exit code 0) are
// excluded from the requirement check. Possible values are:
// - In: the requirement is satisfied if at least one container exit code
// (might be multiple if there are multiple containers not restricted
// by the 'containerName' field) is in the set of specified values.
// - NotIn: the requirement is satisfied if at least one container exit code
// (might be multiple if there are multiple containers not restricted
// by the 'containerName' field) is not in the set of specified values.
// Additional values are considered to be added in the future. Clients should
// react to an unknown operator by assuming the requirement is not satisfied.
Operator PodFailurePolicyOnExitCodesOperator `json:"operator" protobuf:"bytes,2,req,name=operator"`

// Specifies the set of values. Each returned container exit code (might be
// multiple in case of multiple containers) is checked against this set of
// values with respect to the operator. The list of values must be ordered
// and must not contain duplicates. Value '0' cannot be used for the In operator.
// At least one element is required. At most 255 elements are allowed.
// +listType=set
Values []int32 `json:"values" protobuf:"varint,3,rep,name=values"`
}

// PodFailurePolicyOnPodConditionsPattern describes a pattern for matching
// an actual pod condition type.
type PodFailurePolicyOnPodConditionsPattern struct {
// Specifies the required Pod condition type. To match a pod condition
// it is required that the specified type equals the pod condition type.
Type corev1.PodConditionType `json:"type" protobuf:"bytes,1,req,name=type"`

// Specifies the required Pod condition status. To match a pod condition
// it is required that the specified status equals the pod condition status.
// Defaults to True.
Status corev1.ConditionStatus `json:"status" protobuf:"bytes,2,req,name=status"`
}

// PodFailurePolicyRule describes how a pod failure is handled when the requirements are met.
// One of onExitCodes and onPodConditions, but not both, can be used in each rule.
type PodFailurePolicyRule struct {
// Specifies the action taken on a pod failure when the requirements are satisfied.
// Possible values are:
// - FailJob: indicates that the pod's job is marked as Failed and all
// running pods are terminated.
// - Ignore: indicates that the counter towards the .backoffLimit is not
// incremented and a replacement pod is created.
// - Count: indicates that the pod is handled in the default way - the
// counter towards the .backoffLimit is incremented.
// Additional values are considered to be added in the future. Clients should
// react to an unknown action by skipping the rule.
Action PodFailurePolicyAction `json:"action" protobuf:"bytes,1,req,name=action"`

// Represents the requirement on the container exit codes.
// +optional
OnExitCodes *PodFailurePolicyOnExitCodesRequirement `json:"onExitCodes" protobuf:"bytes,2,opt,name=onExitCodes"`

// Represents the requirement on the pod conditions. The requirement is represented
// as a list of pod condition patterns. The requirement is satisfied if at
// least one pattern matches an actual pod condition. At most 20 elements are allowed.
// +listType=atomic
OnPodConditions []PodFailurePolicyOnPodConditionsPattern `json:"onPodConditions" protobuf:"bytes,3,opt,name=onPodConditions"`
}

// PodFailurePolicy describes how failed pods influence the backoffLimit.
type PodFailurePolicy struct {
// A list of pod failure policy rules. The rules are evaluated in order.
// Once a rule matches a Pod failure, the remaining rules are ignored.
// When no rule matches the Pod failure, the default handling applies - the
// counter of pod failures is incremented and it is checked against
// the backoffLimit. At most 20 elements are allowed.
// +listType=atomic
Rules []PodFailurePolicyRule `json:"rules" protobuf:"bytes,1,opt,name=rules"`
}

// JobSpec describes how the job execution will look like.
type JobSpec struct {

Expand Down Expand Up @@ -115,6 +229,19 @@ type JobSpec struct {
// +optional
ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty" protobuf:"varint,3,opt,name=activeDeadlineSeconds"`

// Specifies the policy of handling failed pods. In particular, it allows to
// specify the set of actions and conditions which need to be
// satisfied to take the associated action.
// If empty, the default behaviour applies - the counter of failed pods,
// represented by the job's .status.failed field, is incremented and it is
// checked against the backoffLimit. This field cannot be used in combination
// with restartPolicy=OnFailure.
//
// This field is alpha-level. To use this field, you must enable the
// `JobPodFailurePolicy` feature gate (disabled by default).
// +optional
PodFailurePolicy *PodFailurePolicy `json:"podFailurePolicy,omitempty" protobuf:"bytes,11,opt,name=podFailurePolicy"`

// Specifies the number of retries before marking this job failed.
// Defaults to 6
// +optional
Expand Down Expand Up @@ -297,6 +424,9 @@ const (
JobComplete JobConditionType = "Complete"
// JobFailed means the job has failed its execution.
JobFailed JobConditionType = "Failed"
// FailureTarget means the job is about to fail its execution.
// The constant is to be renamed once the name is accepted within the KEP-3329.
AlphaNoCompatGuaranteeJobFailureTarget JobConditionType = "FailureTarget"
)

// JobCondition describes current state of a job.
Expand Down
Loading

0 comments on commit 2f9e588

Please sign in to comment.