Node Slice Fast IPAM #458

Merged · 7 commits · Jul 23, 2024
2 changes: 2 additions & 0 deletions .gitignore
@@ -10,6 +10,8 @@

# Output of the go coverage tool, specifically when used with LiteIDE
*.out
.idea
kind/

bin/
/github.com/
1 change: 1 addition & 0 deletions Dockerfile
@@ -9,5 +9,6 @@ FROM alpine:latest
LABEL org.opencontainers.image.source https://github.com/k8snetworkplumbingwg/whereabouts
COPY --from=0 /go/src/github.com/k8snetworkplumbingwg/whereabouts/bin/whereabouts .
COPY --from=0 /go/src/github.com/k8snetworkplumbingwg/whereabouts/bin/ip-control-loop .
COPY --from=0 /go/src/github.com/k8snetworkplumbingwg/whereabouts/bin/node-slice-controller .
COPY script/install-cni.sh .
CMD ["/install-cni.sh"]
31 changes: 31 additions & 0 deletions README.md
@@ -194,6 +194,37 @@ Similar to above, `ipRanges` can be used for configuring DualStack
}
```

## Fast IPAM by Using Preallocated Node Slices [Experimental]

**Enhance IPAM performance in large-scale Kubernetes environments by reducing IP allocation contention through node-based IP slicing.**

### Fast IPAM Configuration

apiVersion: "k8s.cni.cncf.io/v1"
kind: NetworkAttachmentDefinition
metadata:
name: whereabouts-fast-ipam
spec:
config: '{
"cniVersion": "0.3.0",
"name": "whereaboutsexample",
"type": "macvlan",
"master": "eth0",
"mode": "bridge",
"ipam": {
"type": "whereabouts",
"range": "192.168.2.0/24",
"fast_ipam": true,
"node_slice size": "/22",
"namespace": "namespace of network attachment definitions and whereabouts deployment"
}
}'

This setup enables the fast IPAM feature: the node-slice controller preallocates a subnet (slice) of the range to each node, so pod IP allocation only contends within a node's own slice. This improves allocation performance in clusters with high pod density.
Note that the whereabouts node-slice controller must be running for this to work; a manifest is available at doc/crds/node-slice-controller.yaml.
The whereabouts daemonset and the node-slice controller must be deployed in the same namespace as your network-attachment-definitions.
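
Pods attach to the fast-IPAM network the same way as any other whereabouts-managed network, by referencing the NetworkAttachmentDefinition through the Multus network annotation. A minimal sketch (the pod name and image are illustrative):

```
apiVersion: v1
kind: Pod
metadata:
  name: fast-ipam-example
  annotations:
    k8s.v1.cni.cncf.io/networks: whereabouts-fast-ipam
spec:
  containers:
  - name: example
    image: alpine:latest
    command: ["sleep", "infinity"]
```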


## Core Parameters

**Required**
97 changes: 97 additions & 0 deletions cmd/nodeslicecontroller/node_slice_controller.go
@@ -0,0 +1,97 @@
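// The node-slice controller watches Nodes, NodeSlicePools, and NetworkAttachmentDefinitions
// and assigns each node a slice of the configured IP range for fast IPAM.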
package main

import (
"errors"
"flag"
"os"
"time"

nadclient "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/clientset/versioned"
nadinformers "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/informers/externalversions"
kubeinformers "k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2"

clientset "github.com/k8snetworkplumbingwg/whereabouts/pkg/client/clientset/versioned"
informers "github.com/k8snetworkplumbingwg/whereabouts/pkg/client/informers/externalversions"
node_controller "github.com/k8snetworkplumbingwg/whereabouts/pkg/node-controller"
"github.com/k8snetworkplumbingwg/whereabouts/pkg/node-controller/signals"
)

var (
masterURL string
kubeconfig string
)

// TODO: leader election
func main() {
klog.InitFlags(nil)
flag.Parse()

// set up signals so we handle the shutdown signal gracefully
ctx := signals.SetupSignalHandler()
logger := klog.FromContext(ctx)

cfg, err := clientcmd.BuildConfigFromFlags(masterURL, kubeconfig)
if err != nil {
logger.Error(err, "Error building kubeconfig")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
}

kubeClient, err := kubernetes.NewForConfig(cfg)
if err != nil {
logger.Error(err, "Error building kubernetes clientset")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
}

whereaboutsClient, err := clientset.NewForConfig(cfg)
if err != nil {
logger.Error(err, "Error building kubernetes clientset")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
}

nadClient, err := nadclient.NewForConfig(cfg)
if err != nil {
logger.Error(err, "Error building kubernetes clientset")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
}

whereaboutsNamespace := os.Getenv("WHEREABOUTS_NAMESPACE")
if whereaboutsNamespace == "" {
logger.Error(errors.New("env var for WHEREABOUTS_NAMESPACE not set"), "unable to discover namespace")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
}

kubeInformerFactory := kubeinformers.NewSharedInformerFactory(kubeClient, time.Second*30)
whereaboutsInformerFactory := informers.NewSharedInformerFactory(whereaboutsClient, time.Second*30)
nadInformerFactory := nadinformers.NewSharedInformerFactory(nadClient, time.Second*30)

controller := node_controller.NewController(
ctx,
kubeClient,
whereaboutsClient,
nadClient,
kubeInformerFactory.Core().V1().Nodes(),
whereaboutsInformerFactory.Whereabouts().V1alpha1().NodeSlicePools(),
nadInformerFactory.K8sCniCncfIo().V1().NetworkAttachmentDefinitions(),
false,
whereaboutsNamespace,
)

// Note that there is no need to run the Start methods in separate goroutines (i.e. go kubeInformerFactory.Start(ctx.Done())):
// Start is non-blocking and runs all registered informers in dedicated goroutines.
kubeInformerFactory.Start(ctx.Done())
whereaboutsInformerFactory.Start(ctx.Done())
nadInformerFactory.Start(ctx.Done())

if err = controller.Run(ctx, 1); err != nil {
logger.Error(err, "Error running controller")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
}
}

func init() {
flag.StringVar(&kubeconfig, "kubeconfig", "", "Path to a kubeconfig. Only required if out-of-cluster.")
flag.StringVar(&masterURL, "master", "", "The address of the Kubernetes API server. Overrides any value in kubeconfig. Only required if out-of-cluster.")
}
4 changes: 4 additions & 0 deletions doc/crds/daemonset-install.yaml
@@ -28,6 +28,7 @@ rules:
resources:
- ippools
- overlappingrangeipreservations
- nodeslicepools
verbs:
- get
- list
@@ -48,11 +49,14 @@ rules:
verbs:
- list
- watch
- get
- apiGroups: [""]
resources:
- nodes
verbs:
- get
- list
- watch
- apiGroups: ["k8s.cni.cncf.io"]
resources:
- network-attachment-definitions
92 changes: 92 additions & 0 deletions doc/crds/node-slice-controller.yaml
@@ -0,0 +1,92 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: whereabouts-controller
namespace: kube-system
spec:
replicas: 1
selector:
matchLabels:
app: whereabouts-controller
template:
metadata:
labels:
app: whereabouts-controller
spec:
containers:
- command:
- /node-slice-controller
env:
- name: NODENAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: spec.nodeName
- name: WHEREABOUTS_NAMESPACE
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
image: ghcr.io/k8snetworkplumbingwg/whereabouts:latest
name: whereabouts
resources:
limits:
cpu: 100m
memory: 200Mi
requests:
cpu: 100m
memory: 100Mi
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /host/opt/cni/bin
name: cnibin
- mountPath: /host/etc/cni/net.d
name: cni-net-dir
- mountPath: /cron-schedule
name: cron-scheduler-configmap
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
name: kube-api-access-6kd6k
readOnly: true
preemptionPolicy: PreemptLowerPriority
priority: 0
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
serviceAccount: whereabouts
serviceAccountName: whereabouts
terminationGracePeriodSeconds: 30
volumes:
- hostPath:
path: /opt/cni/bin
type: ""
name: cnibin
- hostPath:
path: /etc/cni/net.d
type: ""
name: cni-net-dir
- configMap:
defaultMode: 484
items:
- key: cron-expression
path: config
name: whereabouts-config
name: cron-scheduler-configmap
- name: kube-api-access-6kd6k
projected:
defaultMode: 420
sources:
- serviceAccountToken:
expirationSeconds: 3607
path: token
- configMap:
items:
- key: ca.crt
path: ca.crt
name: kube-root-ca.crt
- downwardAPI:
items:
- fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
path: namespace
79 changes: 79 additions & 0 deletions doc/crds/whereabouts.cni.cncf.io_nodeslicepools.yaml
@@ -0,0 +1,79 @@
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.14.0
name: nodeslicepools.whereabouts.cni.cncf.io
spec:
group: whereabouts.cni.cncf.io
names:
kind: NodeSlicePool
listKind: NodeSlicePoolList
plural: nodeslicepools
singular: nodeslicepool
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: NodeSlicePool is the Schema for the nodesliceippools API
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: NodeSlicePoolSpec defines the desired state of NodeSlicePool
properties:
range:
description: |-
Range is a RFC 4632/4291-style string that represents an IP address and prefix length in CIDR notation
this refers to the entire range where the node is allocated a subset
type: string
sliceSize:
description: SliceSize is the size of subnets or slices of the range
that each node will be assigned
type: string
required:
- range
- sliceSize
type: object
status:
description: NodeSlicePoolStatus defines the desired state of NodeSlicePool
properties:
allocations:
description: Allocations holds the allocations of nodes to slices
items:
properties:
nodeName:
description: NodeName is the name of the node assigned to this
slice, empty node name is an available slice for assignment
type: string
sliceRange:
description: SliceRange is the subnet of this slice
type: string
required:
- nodeName
- sliceRange
type: object
type: array
required:
- allocations
type: object
type: object
served: true
storage: true
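
For illustration, a NodeSlicePool produced by the node-slice controller from this schema might look like the sketch below; the object name, namespace, node names, and slice ranges are hypothetical and depend on the configured range and slice size:

```
apiVersion: whereabouts.cni.cncf.io/v1alpha1
kind: NodeSlicePool
metadata:
  name: whereaboutsexample
  namespace: kube-system
spec:
  range: 10.0.0.0/8
  sliceSize: /22
status:
  allocations:
  - nodeName: worker-1
    sliceRange: 10.0.0.0/22
  - nodeName: worker-2
    sliceRange: 10.0.4.0/22
  - nodeName: ""          # an empty node name marks a slice available for assignment
    sliceRange: 10.0.8.0/22
```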
31 changes: 31 additions & 0 deletions e2e/client/ippool.go
@@ -10,6 +10,7 @@ import (
"time"

kubeClient "github.com/k8snetworkplumbingwg/whereabouts/pkg/storage/kubernetes"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
)

@@ -31,8 +32,38 @@ func isIPPoolAllocationsEmpty(ctx context.Context, k8sIPAM *kubeClient.Kubernete
}
}

// isIPPoolAllocationsEmptyForNodeSlices returns a wait condition that is true once every
// node's slice-scoped IP pool has zero allocations, skipping pools not yet initialized.
func isIPPoolAllocationsEmptyForNodeSlices(ctx context.Context, k8sIPAM *kubeClient.KubernetesIPAM, ipPoolCIDR string, clientInfo *ClientInfo) wait.ConditionWithContextFunc {
return func(context.Context) (bool, error) {
nodes, err := clientInfo.Client.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
return false, err
}
for _, node := range nodes.Items {
ipPool, err := k8sIPAM.GetIPPool(ctx, kubeClient.PoolIdentifier{NodeName: node.Name, IpRange: ipPoolCIDR, NetworkName: k8sIPAM.Config.NetworkName})
if err != nil {
if err.Error() == "k8s pool initialized" {
continue
} else {
return false, err
}
}

if len(ipPool.Allocations()) != 0 {
return false, nil
}
}
return true, nil
}
}

// WaitForZeroIPPoolAllocations polls up to timeout seconds for IP pool allocations to be gone from the Kubernetes cluster.
// Returns an error if any IP pool allocations remain after time limit, or if GETing IP pools causes an error.
func WaitForZeroIPPoolAllocations(ctx context.Context, k8sIPAM *kubeClient.KubernetesIPAM, ipPoolCIDR string, timeout time.Duration) error {
return wait.PollUntilContextTimeout(ctx, time.Second, timeout, true, isIPPoolAllocationsEmpty(ctx, k8sIPAM, ipPoolCIDR))
}

// WaitForZeroIPPoolAllocationsAcrossNodeSlices polls up to timeout seconds for IP pool allocations to be gone from every node's slice in the Kubernetes cluster.
// Returns an error if any IP pool allocations remain after the time limit, or if getting the IP pools causes an error.
func WaitForZeroIPPoolAllocationsAcrossNodeSlices(ctx context.Context, k8sIPAM *kubeClient.KubernetesIPAM, ipPoolCIDR string, timeout time.Duration, clientInfo *ClientInfo) error {
return wait.PollUntilContextTimeout(ctx, time.Second, timeout, true, isIPPoolAllocationsEmptyForNodeSlices(ctx, k8sIPAM, ipPoolCIDR, clientInfo))
}