Merge pull request #458 from ivelichkovich/fast
Node Slice Fast IPAM
dougbtv authored Jul 23, 2024
2 parents 8c38117 + 1243648 commit f1a7e7a
Showing 39 changed files with 3,764 additions and 115 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -10,6 +10,8 @@

# Output of the go coverage tool, specifically when used with LiteIDE
*.out
.idea
kind/

bin/
/github.com/
1 change: 1 addition & 0 deletions Dockerfile
@@ -9,5 +9,6 @@ FROM alpine:latest
LABEL org.opencontainers.image.source https://github.com/k8snetworkplumbingwg/whereabouts
COPY --from=0 /go/src/github.com/k8snetworkplumbingwg/whereabouts/bin/whereabouts .
COPY --from=0 /go/src/github.com/k8snetworkplumbingwg/whereabouts/bin/ip-control-loop .
COPY --from=0 /go/src/github.com/k8snetworkplumbingwg/whereabouts/bin/node-slice-controller .
COPY script/install-cni.sh .
CMD ["/install-cni.sh"]
31 changes: 31 additions & 0 deletions README.md
@@ -194,6 +194,37 @@ Similar to above, `ipRanges` can be used for configuring DualStack
}
```

## Fast IPAM by Using Preallocated Node Slices [Experimental]

**Enhance IPAM performance in large-scale Kubernetes environments by reducing IP allocation contention through node-based IP slicing.**

### Fast IPAM Configuration

apiVersion: "k8s.cni.cncf.io/v1"
kind: NetworkAttachmentDefinition
metadata:
name: whereabouts-fast-ipam
spec:
config: '{
"cniVersion": "0.3.0",
"name": "whereaboutsexample",
"type": "macvlan",
"master": "eth0",
"mode": "bridge",
"ipam": {
"type": "whereabouts",
"range": "192.168.2.0/24",
"fast_ipam": true,
"node_slice size": "/22",
"namespace": "namespace of network attachment definitions and whereabouts deployment"
}
}'

This setup enables the fast IPAM feature: each node is preallocated its own slice of the range, so IP allocation for pods on that node no longer contends with allocations on other nodes, which improves performance in clusters with high pod density.
Note that the whereabouts node-slice controller must be running for this to work; a manifest is provided in doc/crds/node-slice-controller.yaml.
The whereabouts daemonset, the node-slice controller, and your network-attachment-definitions must all be in the same namespace.
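The controller records each node's assigned slice in a NodeSlicePool custom resource (the CRD is added in this change). The object below is an illustrative sketch only: the pool name, namespace, node names, and slice CIDRs are assumptions, and an empty `nodeName` marks a slice that has not yet been assigned to a node.

```
apiVersion: whereabouts.cni.cncf.io/v1alpha1
kind: NodeSlicePool
metadata:
  name: whereaboutsexample        # illustrative; assumed to follow the network name
  namespace: kube-system          # same namespace as the daemonset and controller
spec:
  range: 192.168.2.0/24           # the full range from the IPAM config above
  sliceSize: /26                  # the per-node slice size from the IPAM config above
status:
  allocations:
  - nodeName: worker-1            # illustrative node name
    sliceRange: 192.168.2.0/26
  - nodeName: worker-2
    sliceRange: 192.168.2.64/26
  - nodeName: ""                  # empty nodeName: slice still available for assignment
    sliceRange: 192.168.2.128/26
```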


## Core Parameters

**Required**
97 changes: 97 additions & 0 deletions cmd/nodeslicecontroller/node_slice_controller.go
@@ -0,0 +1,97 @@
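// Package main implements the whereabouts node-slice controller, which watches
// Nodes, NetworkAttachmentDefinitions and NodeSlicePools and maintains the
// per-node slice allocations used by the fast IPAM feature.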
package main

import (
	"errors"
	"flag"
	"os"
	"time"

	nadclient "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/clientset/versioned"
	nadinformers "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/informers/externalversions"
	kubeinformers "k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
	"k8s.io/klog/v2"

	clientset "github.com/k8snetworkplumbingwg/whereabouts/pkg/client/clientset/versioned"
	informers "github.com/k8snetworkplumbingwg/whereabouts/pkg/client/informers/externalversions"
	node_controller "github.com/k8snetworkplumbingwg/whereabouts/pkg/node-controller"
	"github.com/k8snetworkplumbingwg/whereabouts/pkg/node-controller/signals"
)

var (
	masterURL  string
	kubeconfig string
)

// TODO: leader election
func main() {
	klog.InitFlags(nil)
	flag.Parse()

	// set up signals so we handle the shutdown signal gracefully
	ctx := signals.SetupSignalHandler()
	logger := klog.FromContext(ctx)

	cfg, err := clientcmd.BuildConfigFromFlags(masterURL, kubeconfig)
	if err != nil {
		logger.Error(err, "Error building kubeconfig")
		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
	}

	kubeClient, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		logger.Error(err, "Error building kubernetes clientset")
		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
	}

	whereaboutsClient, err := clientset.NewForConfig(cfg)
	if err != nil {
		logger.Error(err, "Error building whereabouts clientset")
		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
	}

	nadClient, err := nadclient.NewForConfig(cfg)
	if err != nil {
		logger.Error(err, "Error building network-attachment-definition clientset")
		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
	}

	whereaboutsNamespace := os.Getenv("WHEREABOUTS_NAMESPACE")
	if whereaboutsNamespace == "" {
		logger.Error(errors.New("env var for WHEREABOUTS_NAMESPACE not set"), "unable to discover namespace")
		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
	}

	kubeInformerFactory := kubeinformers.NewSharedInformerFactory(kubeClient, time.Second*30)
	whereaboutsInformerFactory := informers.NewSharedInformerFactory(whereaboutsClient, time.Second*30)
	nadInformerFactory := nadinformers.NewSharedInformerFactory(nadClient, time.Second*30)

	controller := node_controller.NewController(
		ctx,
		kubeClient,
		whereaboutsClient,
		nadClient,
		kubeInformerFactory.Core().V1().Nodes(),
		whereaboutsInformerFactory.Whereabouts().V1alpha1().NodeSlicePools(),
		nadInformerFactory.K8sCniCncfIo().V1().NetworkAttachmentDefinitions(),
		false,
		whereaboutsNamespace,
	)

	// Note that there is no need to run the Start methods in a separate goroutine (i.e. go kubeInformerFactory.Start(ctx.Done())):
	// Start is non-blocking and runs all registered informers in a dedicated goroutine.
	kubeInformerFactory.Start(ctx.Done())
	whereaboutsInformerFactory.Start(ctx.Done())
	nadInformerFactory.Start(ctx.Done())

	if err = controller.Run(ctx, 1); err != nil {
		logger.Error(err, "Error running controller")
		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
	}
}

func init() {
	flag.StringVar(&kubeconfig, "kubeconfig", "", "Path to a kubeconfig. Only required if out-of-cluster.")
	flag.StringVar(&masterURL, "master", "", "The address of the Kubernetes API server. Overrides any value in kubeconfig. Only required if out-of-cluster.")
}
4 changes: 4 additions & 0 deletions doc/crds/daemonset-install.yaml
@@ -28,6 +28,7 @@ rules:
resources:
- ippools
- overlappingrangeipreservations
- nodeslicepools
verbs:
- get
- list
@@ -48,11 +49,14 @@ rules:
verbs:
- list
- watch
- get
- apiGroups: [""]
resources:
- nodes
verbs:
- get
- list
- watch
- apiGroups: ["k8s.cni.cncf.io"]
resources:
- network-attachment-definitions
92 changes: 92 additions & 0 deletions doc/crds/node-slice-controller.yaml
@@ -0,0 +1,92 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: whereabouts-controller
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: whereabouts-controller
  template:
    metadata:
      labels:
        app: whereabouts-controller
    spec:
      containers:
      - command:
        - /node-slice-controller
        env:
        - name: NODENAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: spec.nodeName
        - name: WHEREABOUTS_NAMESPACE
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.namespace
        image: ghcr.io/k8snetworkplumbingwg/whereabouts:latest
        name: whereabouts
        resources:
          limits:
            cpu: 100m
            memory: 200Mi
          requests:
            cpu: 100m
            memory: 100Mi
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /host/opt/cni/bin
          name: cnibin
        - mountPath: /host/etc/cni/net.d
          name: cni-net-dir
        - mountPath: /cron-schedule
          name: cron-scheduler-configmap
        - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
          name: kube-api-access-6kd6k
          readOnly: true
      preemptionPolicy: PreemptLowerPriority
      priority: 0
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      serviceAccount: whereabouts
      serviceAccountName: whereabouts
      terminationGracePeriodSeconds: 30
      volumes:
      - hostPath:
          path: /opt/cni/bin
          type: ""
        name: cnibin
      - hostPath:
          path: /etc/cni/net.d
          type: ""
        name: cni-net-dir
      - configMap:
          defaultMode: 484
          items:
          - key: cron-expression
            path: config
          name: whereabouts-config
        name: cron-scheduler-configmap
      - name: kube-api-access-6kd6k
        projected:
          defaultMode: 420
          sources:
          - serviceAccountToken:
              expirationSeconds: 3607
              path: token
          - configMap:
              items:
              - key: ca.crt
                path: ca.crt
              name: kube-root-ca.crt
          - downwardAPI:
              items:
              - fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.namespace
                path: namespace
79 changes: 79 additions & 0 deletions doc/crds/whereabouts.cni.cncf.io_nodeslicepools.yaml
@@ -0,0 +1,79 @@
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  annotations:
    controller-gen.kubebuilder.io/version: v0.14.0
  name: nodeslicepools.whereabouts.cni.cncf.io
spec:
  group: whereabouts.cni.cncf.io
  names:
    kind: NodeSlicePool
    listKind: NodeSlicePoolList
    plural: nodeslicepools
    singular: nodeslicepool
  scope: Namespaced
  versions:
  - name: v1alpha1
    schema:
      openAPIV3Schema:
        description: NodeSlicePool is the Schema for the nodeslicepools API
        properties:
          apiVersion:
            description: |-
              APIVersion defines the versioned schema of this representation of an object.
              Servers should convert recognized schemas to the latest internal value, and
              may reject unrecognized values.
              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
            type: string
          kind:
            description: |-
              Kind is a string value representing the REST resource this object represents.
              Servers may infer this from the endpoint the client submits requests to.
              Cannot be updated.
              In CamelCase.
              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
            type: string
          metadata:
            type: object
          spec:
            description: NodeSlicePoolSpec defines the desired state of NodeSlicePool
            properties:
              range:
                description: |-
                  Range is a RFC 4632/4291-style string that represents an IP address and prefix length in CIDR notation
                  this refers to the entire range where the node is allocated a subset
                type: string
              sliceSize:
                description: SliceSize is the size of subnets or slices of the range
                  that each node will be assigned
                type: string
            required:
            - range
            - sliceSize
            type: object
          status:
            description: NodeSlicePoolStatus defines the observed state of NodeSlicePool
            properties:
              allocations:
                description: Allocations holds the allocations of nodes to slices
                items:
                  properties:
                    nodeName:
                      description: NodeName is the name of the node assigned to this
                        slice, empty node name is an available slice for assignment
                      type: string
                    sliceRange:
                      description: SliceRange is the subnet of this slice
                      type: string
                  required:
                  - nodeName
                  - sliceRange
                  type: object
                type: array
            required:
            - allocations
            type: object
        type: object
    served: true
    storage: true
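A generated, typed clientset for this CRD ships with the change as well (it is the `clientset` package imported by the node-slice controller above). The sketch below reads NodeSlicePools through that clientset; the `WhereaboutsV1alpha1()` accessor and the `Spec.Range`/`Spec.SliceSize` field names are inferred from the informer path and the CRD field descriptions rather than checked against the generated code, so treat them as assumptions.

```
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/clientcmd"

	clientset "github.com/k8snetworkplumbingwg/whereabouts/pkg/client/clientset/versioned"
)

func main() {
	// Out-of-cluster sketch: build a client from the default kubeconfig location.
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	wbClient, err := clientset.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}

	// List the NodeSlicePools in the namespace the controller manages
	// (accessor name assumed from code-generator conventions; see note above).
	pools, err := wbClient.WhereaboutsV1alpha1().NodeSlicePools("kube-system").List(context.TODO(), metav1.ListOptions{})
	if err != nil {
		panic(err)
	}
	for _, pool := range pools.Items {
		fmt.Printf("%s: range=%s sliceSize=%s\n", pool.Name, pool.Spec.Range, pool.Spec.SliceSize)
	}
}
```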
31 changes: 31 additions & 0 deletions e2e/client/ippool.go
@@ -10,6 +10,7 @@ import (
	"time"

	kubeClient "github.com/k8snetworkplumbingwg/whereabouts/pkg/storage/kubernetes"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
)

@@ -31,8 +32,38 @@ func isIPPoolAllocationsEmpty(ctx context.Context, k8sIPAM *kubeClient.Kubernete
	}
}

func isIPPoolAllocationsEmptyForNodeSlices(ctx context.Context, k8sIPAM *kubeClient.KubernetesIPAM, ipPoolCIDR string, clientInfo *ClientInfo) wait.ConditionWithContextFunc {
	return func(context.Context) (bool, error) {
		nodes, err := clientInfo.Client.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return false, err
		}
		for _, node := range nodes.Items {
			ipPool, err := k8sIPAM.GetIPPool(ctx, kubeClient.PoolIdentifier{NodeName: node.Name, IpRange: ipPoolCIDR, NetworkName: k8sIPAM.Config.NetworkName})
			if err != nil {
				if err.Error() == "k8s pool initialized" {
					continue
				} else {
					return false, err
				}
			}

			if len(ipPool.Allocations()) != 0 {
				return false, nil
			}
		}
		return true, nil
	}
}

// WaitForZeroIPPoolAllocations polls, up to the given timeout, for IP pool allocations to be gone from the Kubernetes cluster.
// Returns an error if any IP pool allocations remain after the time limit, or if getting the IP pools causes an error.
func WaitForZeroIPPoolAllocations(ctx context.Context, k8sIPAM *kubeClient.KubernetesIPAM, ipPoolCIDR string, timeout time.Duration) error {
	return wait.PollUntilContextTimeout(ctx, time.Second, timeout, true, isIPPoolAllocationsEmpty(ctx, k8sIPAM, ipPoolCIDR))
}

// WaitForZeroIPPoolAllocationsAcrossNodeSlices polls, up to the given timeout, for IP pool allocations to be gone from every node's slice of the range.
// Returns an error if any IP pool allocations remain after the time limit, or if getting the IP pools causes an error.
func WaitForZeroIPPoolAllocationsAcrossNodeSlices(ctx context.Context, k8sIPAM *kubeClient.KubernetesIPAM, ipPoolCIDR string, timeout time.Duration, clientInfo *ClientInfo) error {
	return wait.PollUntilContextTimeout(ctx, time.Second, timeout, true, isIPPoolAllocationsEmptyForNodeSlices(ctx, k8sIPAM, ipPoolCIDR, clientInfo))
}