Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extended scheduled job status messages #603

Merged
merged 12 commits into from
Apr 2, 2024
4 changes: 2 additions & 2 deletions api/alerting/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ import (
alertModels "github.com/equinor/radix-api/api/alerting/models"
"github.com/equinor/radix-api/api/utils/labelselector"
"github.com/equinor/radix-api/models"
"github.com/equinor/radix-common/utils"
operatoralert "github.com/equinor/radix-operator/pkg/apis/alert"
"github.com/equinor/radix-operator/pkg/apis/kube"
radixv1 "github.com/equinor/radix-operator/pkg/apis/radix/v1"
crdutils "github.com/equinor/radix-operator/pkg/apis/utils"
"github.com/equinor/radix-operator/pkg/apis/utils/slice"
corev1 "k8s.io/api/core/v1"
kubeErrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -195,7 +195,7 @@ func (h *handler) validateUpdateAlertingConfig(config *alertModels.UpdateAlertin
return InvalidAlertReceiverError(alert.Alert, alert.Receiver)
}
// Verify alert name is valid
if !slice.ContainsString(h.validAlertNames, alert.Alert) {
if !utils.ContainsString(h.validAlertNames, alert.Alert) {
return InvalidAlertError(alert.Alert)
}
}
Expand Down
24 changes: 14 additions & 10 deletions api/deployments/component_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@ import (
"strings"

deploymentModels "github.com/equinor/radix-api/api/deployments/models"
"github.com/equinor/radix-api/api/kubequery"
"github.com/equinor/radix-api/api/utils"
"github.com/equinor/radix-api/api/utils/event"
"github.com/equinor/radix-api/api/utils/labelselector"
radixutils "github.com/equinor/radix-common/utils"
"github.com/equinor/radix-common/utils/slice"
"github.com/equinor/radix-operator/pkg/apis/defaults"
"github.com/equinor/radix-operator/pkg/apis/deployment"
"github.com/equinor/radix-operator/pkg/apis/kube"
Expand Down Expand Up @@ -169,7 +172,12 @@ func GetComponentStateFromSpec(
}
componentPodNames = getPodNames(componentPods)
environmentVariables = getRadixEnvironmentVariables(componentPods)
replicaSummaryList = getReplicaSummaryList(componentPods)
eventList, err := kubequery.GetEventsForEnvironment(ctx, kubeClient, appName, deployment.Environment)
if err != nil {
return nil, err
}
lastEventWarnings := event.ConvertToEventWarnings(eventList)
replicaSummaryList = getReplicaSummaryList(componentPods, lastEventWarnings)
Comment on lines +175 to +180
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure how useful it is to include the last warning event. There can be multiple warning events, and showing only the last one could hide other important events. Perhaps a better approach is to have an endpoint for the component and/or replica that returns the events for that particular object. We can then show them in the web console as an event list, similar to the event list on the environment page.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An alternative is to show events on the replica page, but that would not show the root cause of some replica issues after the replica has been deleted.

auxResource, err = getAuxiliaryResources(ctx, kubeClient, appName, component, envNs)
if err != nil {
return nil, err
Expand Down Expand Up @@ -305,14 +313,10 @@ func getRadixEnvironmentVariables(pods []corev1.Pod) map[string]string {
return radixEnvironmentVariables
}

func getReplicaSummaryList(pods []corev1.Pod) []deploymentModels.ReplicaSummary {
replicaSummaryList := make([]deploymentModels.ReplicaSummary, 0, len(pods))

for _, pod := range pods {
replicaSummaryList = append(replicaSummaryList, deploymentModels.GetReplicaSummary(pod))
}

return replicaSummaryList
func getReplicaSummaryList(pods []corev1.Pod, lastEventWarnings event.LastEventWarnings) []deploymentModels.ReplicaSummary {
return slice.Map(pods, func(pod corev1.Pod) deploymentModels.ReplicaSummary {
return deploymentModels.GetReplicaSummary(pod, lastEventWarnings[pod.GetName()])
})
}

func getAuxiliaryResources(ctx context.Context, kubeClient kubernetes.Interface, appName string, component v1.RadixCommonDeployComponent, envNamespace string) (auxResource deploymentModels.AuxiliaryResource, err error) {
Expand Down Expand Up @@ -357,7 +361,7 @@ func getAuxiliaryResourceDeployment(ctx context.Context, kubeClient kubernetes.I
if err != nil {
return nil, err
}
auxResourceDeployment.ReplicaList = getReplicaSummaryList(pods.Items)
auxResourceDeployment.ReplicaList = getReplicaSummaryList(pods.Items, nil)
auxResourceDeployment.Status = deploymentModels.ComponentStatusFromDeployment(&deployment).String()
return &auxResourceDeployment, nil
}
Expand Down
49 changes: 38 additions & 11 deletions api/deployments/models/component_deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"strings"

radixutils "github.com/equinor/radix-common/utils"
"github.com/equinor/radix-common/utils/pointers"
corev1 "k8s.io/api/core/v1"
)

Expand Down Expand Up @@ -280,7 +281,19 @@ type ReplicaSummary struct {
//
// required: false
// example: 2006-01-02T15:04:05Z
Created string `json:"created"`
Created string `json:"created,omitempty"`

// The time at which the batch job's pod startedAt
//
// required: false
// example: 2006-01-02T15:04:05Z
StartTime string `json:"startTime,omitempty"`

// The time at which the batch job's pod finishedAt.
//
// required: false
// example: 2006-01-02T15:04:05Z
EndTime string `json:"endTime,omitempty"`

// Container started timestamp
//
Expand All @@ -291,47 +304,58 @@ type ReplicaSummary struct {
// Status describes the component container status
//
// required: false
Status ReplicaStatus `json:"replicaStatus"`
Status ReplicaStatus `json:"replicaStatus,omitempty"`

// StatusMessage provides message describing the status of a component container inside a pod
//
// required: false
StatusMessage string `json:"statusMessage"`
StatusMessage string `json:"statusMessage,omitempty"`

// RestartCount count of restarts of a component container inside a pod
//
// required: false
RestartCount int32 `json:"restartCount"`
RestartCount int32 `json:"restartCount,omitempty"`

// The image the container is running.
//
// required: false
// example: radixdev.azurecr.io/app-server:cdgkg
Image string `json:"image"`
Image string `json:"image,omitempty"`

// ImageID of the container's image.
//
// required: false
// example: radixdev.azurecr.io/app-server@sha256:d40cda01916ef63da3607c03785efabc56eb2fc2e0dab0726b1a843e9ded093f
ImageId string `json:"imageId"`
ImageId string `json:"imageId,omitempty"`

// The index of the pod in the re-starts
PodIndex int `json:"podIndex,omitempty"`

// Exit status from the last termination of the container
ExitCode int32 `json:"exitCode"`

// A brief CamelCase message indicating details about why the job is in this phase
Reason string `json:"reason,omitempty"`

// Resources Resource requirements for the pod
//
// required: false
Resources ResourceRequirements `json:"resources,omitempty"`
Resources *ResourceRequirements `json:"resources,omitempty"`
}

// ReplicaStatus describes the status of a component container inside a pod
// swagger:model ReplicaStatus
type ReplicaStatus struct {
// Status of the container
// - Pending = Container in Waiting state and the reason is ContainerCreating
// - Failing = Container in Waiting state and the reason is anything else but ContainerCreating
// - Failed = Container is failed
// - Failing = Container is failed
// - Running = Container in Running state
// - Succeeded = Container in Succeeded state
// - Terminated = Container in Terminated state
//
// required: true
// enum: Pending,Failing,Running,Terminated,Starting
// enum: Pending,Succeeded,Failing,Failed,Running,Terminated,Starting
// example: Running
Status string `json:"status"`
}
Expand Down Expand Up @@ -396,7 +420,7 @@ type ResourceRequirements struct {
Requests Resources `json:"requests,omitempty"`
}

func GetReplicaSummary(pod corev1.Pod) ReplicaSummary {
func GetReplicaSummary(pod corev1.Pod, lastEventWarning string) ReplicaSummary {
replicaSummary := ReplicaSummary{}
replicaSummary.Name = pod.GetName()
creationTimestamp := pod.GetCreationTimestamp()
Expand Down Expand Up @@ -450,7 +474,10 @@ func GetReplicaSummary(pod corev1.Pod) ReplicaSummary {
replicaSummary.Image = containerStatus.Image
replicaSummary.ImageId = containerStatus.ImageID
if len(pod.Spec.Containers) > 0 {
replicaSummary.Resources = ConvertResourceRequirements(pod.Spec.Containers[0].Resources)
replicaSummary.Resources = pointers.Ptr(ConvertResourceRequirements(pod.Spec.Containers[0].Resources))
}
if len(replicaSummary.StatusMessage) == 0 && (replicaSummary.Status.Status == Failing.String() || replicaSummary.Status.Status == Pending.String()) {
replicaSummary.StatusMessage = lastEventWarning
Comment on lines +479 to +480
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure it's a good idea to set the last warning event in StatusMessage. We should instead discuss whether to show events per component and/or replica, as described in my other comment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same reason as above: to keep the failure reason in the radix-batch status when the replicas and events no longer exist.

}
return replicaSummary
}
Expand Down
5 changes: 2 additions & 3 deletions api/deployments/models/deployment_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@ import (
"errors"
"time"

"github.com/equinor/radix-common/utils/slice"
crdUtils "github.com/equinor/radix-operator/pkg/apis/utils"

radixutils "github.com/equinor/radix-common/utils"
"github.com/equinor/radix-common/utils/slice"
"github.com/equinor/radix-operator/pkg/apis/kube"
v1 "github.com/equinor/radix-operator/pkg/apis/radix/v1"
crdUtils "github.com/equinor/radix-operator/pkg/apis/utils"
)

// DeploymentBuilder Builds DTOs
Expand Down
2 changes: 1 addition & 1 deletion api/deployments/models/scheduled_batch.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ type ScheduledJobSummary struct {
// Status of the job
//
// required: true
// enum: Running,Succeeded,Failed,Waiting,Stopping,Stopped
// enum: Running,Active,Succeeded,Failed,Waiting,Stopping,Stopped
// example: Waiting
Status string `json:"status"`

Expand Down
8 changes: 7 additions & 1 deletion api/environments/environment_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1135,6 +1135,11 @@ func (c *environmentController) GetScheduledJobLog(accounts models.Accounts, w h
// description: Name of scheduled job
// type: string
// required: true
// - name: replicaName
// in: query
// description: Name of the job replica
// type: string
// required: false
// - name: sinceTime
// in: query
// description: Get log only from sinceTime (example 2020-03-18T07:20:41+00:00)
Expand Down Expand Up @@ -1173,6 +1178,7 @@ func (c *environmentController) GetScheduledJobLog(accounts models.Accounts, w h
appName := mux.Vars(r)["appName"]
envName := mux.Vars(r)["envName"]
scheduledJobName := mux.Vars(r)["scheduledJobName"]
replicaName := r.FormValue("replicaName")

since, asFile, logLines, err, _ := logs.GetLogParams(r)
if err != nil {
Expand All @@ -1181,7 +1187,7 @@ func (c *environmentController) GetScheduledJobLog(accounts models.Accounts, w h
}

eh := c.environmentHandlerFactory(accounts)
logs, err := eh.GetScheduledJobLogs(r.Context(), appName, envName, scheduledJobName, &since, logLines)
logs, err := eh.GetScheduledJobLogs(r.Context(), appName, envName, scheduledJobName, replicaName, &since, logLines)
if err != nil {
c.ErrorResponse(w, r, err)
return
Expand Down
Loading
Loading