Add local storage (scratch space) allocatable support #46456

Merged 2 commits on Jun 3, 2017
cmd/kubelet/app/options/options.go (1 addition, 1 deletion)
@@ -291,7 +291,7 @@ func (c *kubeletConfiguration) addFlags(fs *pflag.FlagSet) {

// Node Allocatable Flags
fs.Var(&c.SystemReserved, "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi) pairs that describe resources reserved for non-kubernetes components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
- fs.Var(&c.KubeReserved, "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi) pairs that describe resources reserved for kubernetes system components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
+ fs.Var(&c.KubeReserved, "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,storage=1Gi) pairs that describe resources reserved for kubernetes system components. Currently cpu, memory and local storage for the root file system are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
fs.StringSliceVar(&c.EnforceNodeAllocatable, "enforce-node-allocatable", c.EnforceNodeAllocatable, "A comma separated list of levels of node allocatable enforcement to be enforced by kubelet. Acceptable options are 'pods', 'system-reserved' & 'kube-reserved'. If the latter two options are specified, '--system-reserved-cgroup' & '--kube-reserved-cgroup' must also be set respectively. See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md for more details.")
fs.StringVar(&c.SystemReservedCgroup, "system-reserved-cgroup", c.SystemReservedCgroup, "Absolute name of the top level cgroup that is used to manage non-kubernetes components for which compute resources were reserved via '--system-reserved' flag. Ex. '/system-reserved'. [default='']")
fs.StringVar(&c.KubeReservedCgroup, "kube-reserved-cgroup", c.KubeReservedCgroup, "Absolute name of the top level cgroup that is used to manage kubernetes components for which compute resources were reserved via '--kube-reserved' flag. Ex. '/kube-reserved'. [default='']")
cmd/kubelet/app/server.go (2 additions, 2 deletions)
@@ -914,8 +914,8 @@ func parseResourceList(m componentconfig.ConfigurationMap) (v1.ResourceList, err
rl := make(v1.ResourceList)
for k, v := range m {
switch v1.ResourceName(k) {
- // Only CPU and memory resources are supported.
- case v1.ResourceCPU, v1.ResourceMemory:
+ // CPU, memory and local storage resources are supported.
+ case v1.ResourceCPU, v1.ResourceMemory, v1.ResourceStorage:
q, err := resource.ParseQuantity(v)
if err != nil {
return nil, err
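For orientation: with this hunk, a flag value like --kube-reserved=cpu=200m,memory=500Mi,storage=1Gi now parses, and other resource names are presumably still rejected by the default case below this hunk. A minimal standalone sketch of that behavior (plain Go; quantity validation omitted, and parseReserved is a simplified stand-in, not the real parseResourceList):

package main

import (
	"fmt"
	"strings"
)

// parseReserved mimics the extended parseResourceList: only cpu, memory and
// storage keys are accepted; anything else is an error.
func parseReserved(flag string) (map[string]string, error) {
	supported := map[string]bool{"cpu": true, "memory": true, "storage": true}
	out := map[string]string{}
	for _, pair := range strings.Split(flag, ",") {
		kv := strings.SplitN(pair, "=", 2)
		if len(kv) != 2 {
			return nil, fmt.Errorf("malformed pair %q", pair)
		}
		if !supported[kv[0]] {
			return nil, fmt.Errorf("cannot reserve %q resource", kv[0])
		}
		out[kv[0]] = kv[1]
	}
	return out, nil
}

func main() {
	rl, err := parseReserved("cpu=200m,memory=500Mi,storage=1Gi")
	fmt.Println(rl, err) // map[cpu:200m memory:500Mi storage:1Gi] <nil>
}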
pkg/apis/componentconfig/v1alpha1/types.go (1 addition, 1 deletion)
@@ -551,7 +551,7 @@ type KubeletConfiguration struct {
SystemReserved map[string]string `json:"systemReserved"`
// A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs
// that describe resources reserved for kubernetes system components.
- // Currently only cpu and memory are supported. [default=none]
+ // Currently cpu, memory and local storage for the root file system are supported. [default=none]
// See http://kubernetes.io/docs/user-guide/compute-resources for more detail.
KubeReserved map[string]string `json:"kubeReserved"`

pkg/kubelet/cadvisor/util.go (19 additions)
@@ -18,6 +18,7 @@ package cadvisor

import (
cadvisorapi "github.com/google/cadvisor/info/v1"
cadvisorapi2 "github.com/google/cadvisor/info/v2"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/kubernetes/pkg/api/v1"
)
@@ -33,3 +33,21 @@ func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList {
}
return c
}

func StorageScratchCapacityFromFsInfo(info cadvisorapi2.FsInfo) v1.ResourceList {
c := v1.ResourceList{
v1.ResourceStorage: *resource.NewQuantity(
int64(info.Capacity),
resource.BinarySI),
}
return c
}

func StorageOverlayCapacityFromFsInfo(info cadvisorapi2.FsInfo) v1.ResourceList {
c := v1.ResourceList{
v1.ResourceStorageOverlay: *resource.NewQuantity(
int64(info.Capacity),
resource.BinarySI),
}
return c
}
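Both helpers perform the same conversion: cadvisor's FsInfo.Capacity (bytes, a uint64) becomes a BinarySI quantity under the relevant resource key. A minimal sketch of just that conversion, using only the apimachinery resource package:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// A 100 GiB rootfs, in bytes, as cadvisor would report it.
	capacityBytes := int64(100 * 1024 * 1024 * 1024)
	q := resource.NewQuantity(capacityBytes, resource.BinarySI)
	fmt.Println(q.String()) // 100Gi
}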
pkg/kubelet/cm/node_container_manager.go (13 additions, 1 deletion)
@@ -29,6 +29,7 @@ import (
"k8s.io/apimachinery/pkg/types"
clientv1 "k8s.io/client-go/pkg/api/v1"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/events"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
@@ -180,9 +181,20 @@ func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {

}

- // GetNodeAllocatable returns amount of compute resource that have to be reserved on this node from scheduling.
+ // GetNodeAllocatableReservation returns the amount of compute and storage resources that have to be reserved on this node, away from scheduling.
func (cm *containerManagerImpl) GetNodeAllocatableReservation() v1.ResourceList {
evictionReservation := hardEvictionReservation(cm.HardEvictionThresholds, cm.capacity)
if _, ok := cm.capacity[v1.ResourceStorage]; !ok {
if cm.cadvisorInterface != nil {
if rootfs, err := cm.cadvisorInterface.RootFsInfo(); err == nil {
for rName, rCap := range cadvisor.StorageScratchCapacityFromFsInfo(rootfs) {
cm.capacity[rName] = rCap
}
} else {
glog.Warningf("Error getting rootfs info: %v", err)
}
}
}
result := make(v1.ResourceList)
for k := range cm.capacity {
value := resource.NewQuantity(0, resource.DecimalSI)
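The reservation assembled here is what gets subtracted from capacity when computing allocatable: per resource, kube-reserved plus system-reserved plus any hard-eviction threshold. A sketch of that arithmetic with int64 byte counts standing in for resource.Quantity (the 10Gi eviction threshold is a made-up figure):

package main

import "fmt"

func main() {
	capacity := int64(100 << 30)    // 100Gi rootfs, as discovered via RootFsInfo above
	kubeReserved := int64(1 << 30)  // --kube-reserved=storage=1Gi
	systemReserved := int64(0)      // no storage entry in --system-reserved
	evictionHard := int64(10 << 30) // hypothetical hard-eviction threshold

	reservation := kubeReserved + systemReserved + evictionHard
	fmt.Printf("reservation=%dGi allocatable=%dGi\n",
		reservation>>30, (capacity-reservation)>>30) // reservation=11Gi allocatable=89Gi
}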
pkg/kubelet/eviction/api/types.go (2 additions)
@@ -38,6 +38,8 @@ const (
SignalImageFsInodesFree Signal = "imagefs.inodesFree"
// SignalAllocatableMemoryAvailable is the amount of memory available for pod allocation (i.e. allocatable - workingSet (of pods)), in bytes.
SignalAllocatableMemoryAvailable Signal = "allocatableMemory.available"
// SignalAllocatableNodeFsAvailable is the amount of local storage available for pod allocation, in bytes.
SignalAllocatableNodeFsAvailable Signal = "allocatableNodeFs.available"
)

// ThresholdOperator is the operator used to express a Threshold.
pkg/kubelet/eviction/eviction_manager.go (17 additions, 9 deletions)
@@ -82,6 +82,8 @@ type managerImpl struct {
lastObservations signalObservations
// notifiersInitialized indicates if the threshold notifiers have been initialized (i.e. synchronize() has been called once)
notifiersInitialized bool
// dedicatedImageFs indicates if imagefs is on a separate device from the rootfs
dedicatedImageFs *bool
}

// ensure it implements the required interface
@@ -106,6 +108,7 @@ func NewManager(
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
dedicatedImageFs: nil,
}
return manager, manager
}
@@ -211,21 +214,22 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
}

glog.V(3).Infof("eviction manager: synchronize housekeeping")

// build the ranking functions (if not yet known)
// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
- if len(m.resourceToRankFunc) == 0 || len(m.resourceToNodeReclaimFuncs) == 0 {
- // this may error if cadvisor has yet to complete housekeeping, so we will just try again in next pass.
- hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
- if err != nil {
+ if m.dedicatedImageFs == nil {
+ hasImageFs, ok := diskInfoProvider.HasDedicatedImageFs()
+ if ok != nil {
return nil
}
- m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
- m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
+ m.dedicatedImageFs = &hasImageFs
+ m.resourceToRankFunc = buildResourceToRankFunc(hasImageFs)
+ m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasImageFs)
}

+ activePods := podFunc()
// make observations and get a function to derive pod usage stats relative to those observations.
- observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider)
+ observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider, activePods, *m.dedicatedImageFs)
if err != nil {
glog.Errorf("eviction manager: unexpected err: %v", err)
return nil
@@ -336,7 +340,11 @@
}

// the only candidates viable for eviction are those pods that had anything running.
- activePods := podFunc()
+ if len(activePods) == 0 {
+ glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
+ return nil
+ }

// rank the running pods for eviction for the specified resource
rank(activePods, statsFunc)
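The dedicatedImageFs field introduced in this hunk is a tri-state cache: nil means not probed yet, and a non-nil pointer carries the cached answer, so the cadvisor probe runs at most once after housekeeping has completed. A self-contained sketch of that pattern (manager and probe are stand-ins for managerImpl and diskInfoProvider.HasDedicatedImageFs):

package main

import (
	"errors"
	"fmt"
)

type manager struct {
	dedicatedImageFs *bool
	probes           int
}

// probe fails on the first call, simulating cadvisor housekeeping that has
// not finished yet.
func (m *manager) probe() (bool, error) {
	m.probes++
	if m.probes == 1 {
		return false, errors.New("housekeeping not finished")
	}
	return true, nil
}

func (m *manager) synchronize() {
	if m.dedicatedImageFs == nil {
		has, err := m.probe()
		if err != nil {
			return // try again on the next pass
		}
		m.dedicatedImageFs = &has
	}
	fmt.Println("dedicated imagefs:", *m.dedicatedImageFs)
}

func main() {
	m := &manager{}
	m.synchronize() // probe fails, nothing cached
	m.synchronize() // probed successfully, answer cached
	m.synchronize() // cached answer reused
	fmt.Println("probe calls:", m.probes) // 2
}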

pkg/kubelet/eviction/helpers.go (55 additions, 3 deletions)
@@ -54,6 +54,8 @@ const (
resourceNodeFs v1.ResourceName = "nodefs"
// nodefs inodes, number. internal to this module, used to account for local node root filesystem inodes.
resourceNodeFsInodes v1.ResourceName = "nodefsInodes"
// container overlay storage, in bytes. internal to this module, used to account for local disk usage for container overlay.
resourceOverlay v1.ResourceName = "overlay"
)

var (
@@ -74,19 +76,25 @@ func init() {
signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure
signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure
signalToNodeCondition[evictionapi.SignalNodeFsInodesFree] = v1.NodeDiskPressure
signalToNodeCondition[evictionapi.SignalAllocatableNodeFsAvailable] = v1.NodeDiskPressure

// map signals to resources (and vice-versa)
signalToResource = map[evictionapi.Signal]v1.ResourceName{}
signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory
signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory
signalToResource[evictionapi.SignalAllocatableNodeFsAvailable] = resourceNodeFs
signalToResource[evictionapi.SignalImageFsAvailable] = resourceImageFs
signalToResource[evictionapi.SignalImageFsInodesFree] = resourceImageFsInodes
signalToResource[evictionapi.SignalNodeFsAvailable] = resourceNodeFs
signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceNodeFsInodes

resourceToSignal = map[v1.ResourceName]evictionapi.Signal{}
for key, value := range signalToResource {
resourceToSignal[value] = key
}
// Hard-code resourceNodeFs to evictionapi.SignalNodeFsAvailable here, since the inversion loop above may have mapped it to SignalAllocatableNodeFsAvailable instead.
// TODO: make resourceToSignal a map from resource name to a list of signals
resourceToSignal[resourceNodeFs] = evictionapi.SignalNodeFsAvailable
}

// validSignal returns true if the signal is supported.
@@ -234,6 +242,16 @@ func getAllocatableThreshold(allocatableConfig []string) []evictionapi.Threshold
Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
},
},
{
Signal: evictionapi.SignalAllocatableNodeFsAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
},
MinReclaim: &evictionapi.ThresholdValue{
Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
},
},
}
}
}
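Both allocatable thresholds are pinned at a zero quantity with OpLessThan, so they fire only once the observed allocatable signal goes negative, i.e. pods collectively exceed the allocatable budget. A small sketch of the comparison semantics (int64 stand-ins for the quantities):

package main

import "fmt"

// thresholdMet applies evictionapi.OpLessThan: observed < threshold.
func thresholdMet(observedAvailable, thresholdValue int64) bool {
	return observedAvailable < thresholdValue
}

func main() {
	fmt.Println(thresholdMet(5<<30, 0))  // false: 5Gi still within budget
	fmt.Println(thresholdMet(-1<<20, 0)) // true: budget exceeded by 1Mi
}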
@@ -382,10 +400,12 @@ func localVolumeNames(pod *v1.Pod) []string {
func podDiskUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) {
disk := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.BinarySI}
overlay := resource.Quantity{Format: resource.BinarySI}
for _, container := range podStats.Containers {
if hasFsStatsType(statsToMeasure, fsStatsRoot) {
disk.Add(*diskUsage(container.Rootfs))
inodes.Add(*inodeUsage(container.Rootfs))
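// the container's rootfs (writable layer) usage is counted again as overlay usage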
overlay.Add(*diskUsage(container.Rootfs))
}
if hasFsStatsType(statsToMeasure, fsStatsLogs) {
disk.Add(*diskUsage(container.Logs))
@@ -405,8 +425,9 @@
}
}
return v1.ResourceList{
- resourceDisk: disk,
- resourceInodes: inodes,
+ resourceDisk:    disk,
+ resourceInodes:  inodes,
+ resourceOverlay: overlay,
}, nil
}

@@ -637,7 +658,7 @@ func (a byEvictionPriority) Less(i, j int) bool {
}

// makeSignalObservations derives observations using the specified summary provider.
- func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider NodeProvider) (signalObservations, statsFunc, error) {
+ func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider NodeProvider, pods []*v1.Pod, withImageFs bool) (signalObservations, statsFunc, error) {
summary, err := summaryProvider.Get()
if err != nil {
return nil, nil, err
@@ -706,6 +727,37 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider
capacity: memoryAllocatableCapacity.Copy(),
}
}

if storageScratchAllocatableCapacity, ok := node.Status.Allocatable[v1.ResourceStorage]; ok {
storageScratchAllocatable := storageScratchAllocatableCapacity.Copy()
for _, pod := range pods {
podStat, ok := statsFunc(pod)
if !ok {
continue
}

usage, err := podDiskUsage(podStat, pod, []fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource, fsStatsRoot})
if err != nil {
glog.Warningf("eviction manager: error getting pod disk usage %v", err)
continue
}
// If there is a separate imagefs set up for the container runtime, the scratch disk usage from nodefs should exclude the overlay usage
if withImageFs {
diskUsage := usage[resourceDisk]
nodeFsUsage := diskUsage.Copy()
nodeFsUsage.Sub(usage[resourceOverlay])
storageScratchAllocatable.Sub(*nodeFsUsage)
} else {
storageScratchAllocatable.Sub(usage[resourceDisk])
}
}
result[evictionapi.SignalAllocatableNodeFsAvailable] = signalObservation{
available: storageScratchAllocatable,
capacity: storageScratchAllocatableCapacity.Copy(),
}
}

return result, statsFunc, nil
}
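The new observation above subtracts every active pod's local disk usage from the node's allocatable storage; with a dedicated imagefs, writable-layer (overlay) bytes live on the other device and are excluded from nodefs accounting. A sketch of the arithmetic with int64 byte counts in place of resource.Quantity:

package main

import "fmt"

type podUsage struct{ disk, overlay int64 }

func allocatableNodeFsAvailable(allocatable int64, pods []podUsage, withImageFs bool) int64 {
	available := allocatable
	for _, u := range pods {
		if withImageFs {
			available -= u.disk - u.overlay // overlay lives on the imagefs
		} else {
			available -= u.disk // everything is on the rootfs
		}
	}
	return available
}

func main() {
	pods := []podUsage{{disk: 6 << 30, overlay: 4 << 30}, {disk: 2 << 30, overlay: 1 << 30}}
	fmt.Println(allocatableNodeFsAvailable(50<<30, pods, true) >> 30)  // 47
	fmt.Println(allocatableNodeFsAvailable(50<<30, pods, false) >> 30) // 42
}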

pkg/kubelet/eviction/helpers_test.go (11 additions, 2 deletions)
@@ -76,6 +76,16 @@ func TestParseThresholdConfig(t *testing.T) {
Quantity: quantityMustParse("0"),
},
},
{
Signal: evictionapi.SignalAllocatableNodeFsAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("0"),
},
MinReclaim: &evictionapi.ThresholdValue{
Quantity: quantityMustParse("0"),
},
},
{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
@@ -777,8 +787,7 @@ func TestMakeSignalObservations(t *testing.T) {
if res.CmpInt64(int64(allocatableMemoryCapacity)) != 0 {
t.Errorf("Expected Threshold %v to be equal to value %v", res.Value(), allocatableMemoryCapacity)
}
- actualObservations, statsFunc, err := makeSignalObservations(provider, nodeProvider)
+ actualObservations, statsFunc, err := makeSignalObservations(provider, nodeProvider, pods, false)
if err != nil {
t.Errorf("Unexpected err: %v", err)
}
pkg/kubelet/kubelet.go (4 additions)
@@ -37,6 +37,7 @@ import (
clientgoclientset "k8s.io/client-go/kubernetes"

cadvisorapi "github.com/google/cadvisor/info/v1"
cadvisorapiv2 "github.com/google/cadvisor/info/v2"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
@@ -927,6 +928,9 @@ type Kubelet struct {
// Cached MachineInfo returned by cadvisor.
machineInfo *cadvisorapi.MachineInfo

// Cached RootFsInfo returned by cadvisor.
rootfsInfo *cadvisorapiv2.FsInfo

// Handles certificate rotations.
serverCertificateManager certificate.Manager

pkg/kubelet/kubelet_cadvisor.go (12 additions)
@@ -100,3 +100,15 @@ func (kl *Kubelet) GetCachedMachineInfo() (*cadvisorapi.MachineInfo, error) {
}
return kl.machineInfo, nil
}

// GetCachedRootFsInfo assumes that the rootfs info can't change without a reboot
func (kl *Kubelet) GetCachedRootFsInfo() (cadvisorapiv2.FsInfo, error) {
if kl.rootfsInfo == nil {
info, err := kl.cadvisor.RootFsInfo()
if err != nil {
return cadvisorapiv2.FsInfo{}, err
}
kl.rootfsInfo = &info
}
return *kl.rootfsInfo, nil
}

Review thread on GetCachedRootFsInfo:

Contributor: My thinking right now is that it would make more sense to change cAdvisor to include the information we need in MachineInfo. IIRC, the only piece we are missing is the mountpoint for the disks, which would let us determine which filesystem is the RootFs. I am happy to look into this in the next couple of days to see if it is feasible. That would greatly simplify this PR.

Contributor (author): Thanks! I will check that too, but we might also need to add this after code freeze.

Contributor: Yep, that is fine.
pkg/kubelet/kubelet_node_status.go (20 additions)
@@ -551,6 +551,26 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
Expand Up @@ -551,6 +551,26 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
node.Status.NodeInfo.BootID = info.BootID
}

rootfs, err := kl.GetCachedRootFsInfo()
if err != nil {
node.Status.Capacity[v1.ResourceStorage] = resource.MustParse("0Gi")
} else {
for rName, rCap := range cadvisor.StorageScratchCapacityFromFsInfo(rootfs) {
node.Status.Capacity[rName] = rCap
}
}

if hasDedicatedImageFs, _ := kl.HasDedicatedImageFs(); hasDedicatedImageFs {
imagesfs, err := kl.ImagesFsInfo()
if err != nil {
node.Status.Capacity[v1.ResourceStorageOverlay] = resource.MustParse("0Gi")
} else {
for rName, rCap := range cadvisor.StorageOverlayCapacityFromFsInfo(imagesfs) {
node.Status.Capacity[rName] = rCap
}
}
}

// Set Allocatable.
if node.Status.Allocatable == nil {
node.Status.Allocatable = make(v1.ResourceList)
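After this hunk runs, node capacity carries storage keys next to cpu and memory. A sketch of the resulting shape for a node with a 100Gi rootfs and a dedicated 200Gi imagefs (sizes illustrative; "storageOverlay" is a stand-in key for whatever v1.ResourceStorageOverlay resolves to in this tree, and on a cadvisor error the code above pins the value to 0Gi instead):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	capacity := map[string]resource.Quantity{
		"cpu":            resource.MustParse("4"),
		"memory":         resource.MustParse("16Gi"),
		"storage":        resource.MustParse("100Gi"), // rootfs, via StorageScratchCapacityFromFsInfo
		"storageOverlay": resource.MustParse("200Gi"), // imagefs, only when dedicated
	}
	for name, q := range capacity {
		fmt.Printf("%-14s %s\n", name, q.String())
	}
}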