base frame

the executable codes for caelus
Tencent · Oct 13, 2021 · 79ea1bf · 79ea1bf
commit 79ea1bf
Show file tree

Hide file tree

Showing 279 changed files with 40,231 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+.idea/
+_output/
diff --git a/LICENSE b/LICENSE
diff --git a/Makefile b/Makefile
@@ -0,0 +1,19 @@
+.PHONY: build
+build: format
+	./hack/build.sh
+
+.PHONY: format
+format:
+	./hack/format.sh
+
+.PHONY: test
+test:
+	./hack/test.sh
+
+.PHONY: clean
+clean:
+	./hack/clean.sh
+
+.PHONY: image
+image: build
+	./hack/image.sh
diff --git a/README.md b/README.md
@@ -0,0 +1,73 @@
+## Caelus
+  [![GitHub license](https://img.shields.io/badge/license-Apache--2.0-brightgreen)](https://github.com/Tencent/caelus/blob/master/LICENSE)
+  [![Release](https://img.shields.io/github/v/release/Tencent/caelus.svg)](https://github.com/Tencent/caelus/releases)
+  [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/Tencent/caelus/pulls)
+
+  Caelus is a set of Kubernetes solutions for reusing idle resources of nodes by running extra batch jobs, these resources come from 
+  the underutilization of online jobs, especially during low traffic periods. To make batch jobs compatible with online jobs,
+  caelus dynamically manages multiple resource isolation mechanisms and also checks abnormalities of various metrics. 
+  Batch jobs will be throttled or even killed if interference detected.
+
+## Features
+
+* Collect various metrics, including node resources, cgroup resources and online jobs latency
+
+* Batch jobs could be running on YARN or Kubernetes
+
+* Predict total resource usages of the node, including online jobs and kernel modules, such as slab
+
+* Dynamically manage multiple resource isolation mechanisms, such as CPU, memory, and disk space
+
+* Dynamically check abnormalities of various metrics, such as CPU usage or online jobs latency
+
+* Throttle or even kill batch jobs when resource pressure or latency spike detected
+
+* Prometheus metrics supported
+
+* Alarm supported
+
+## Usage
+
+Find more usage at [Tutorial.md](doc/tutorial.md). The project also have two attached tools:
+
+#### nm_operator
+
+[nm_operator](doc/nm_operator.md) is used to execute YARN commands in the way of remote API.
+
+#### metric_adapter
+
+[metric_adapter](doc/metric_adapter.md) is used to collect more application metrics with adapter extension.
+
+
+## Getting started
+
+### build
+
+``` sh
+# binary build, which generates binary under _output/bin/
+$ make build
+
+# image build
+$ make image
+
+# run unit test
+$ make test
+```
+
+### Run
+
+```sh
+# running in script
+$ caelus --config=hack/config/caelus.json --hostname-override=xxx --v=2
+
+# running in image
+$ kubectl create -f hack/yaml/caelus.json
+$ kubectl label node colation=true
+$ kubectl -n kube-system get daemonset
+```
+
+## Contributing
+For more information about contributing issues or pull requests, see our [Contributing to Caelus](doc/contributing.md).
+
+## License
+Caelus is under the Apache License 2.0. See the [License](LICENSE) file for details.
diff --git a/cmd/caelus/app/api.go b/cmd/caelus/app/api.go
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2021 THL A29 Limited, a Tencent company.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package app
+
+import (
+	"flag"
+	"fmt"
+	"net"
+	"net/http"
+	"net/http/pprof"
+	"time"
+
+	"github.com/tencent/caelus/cmd/caelus/context"
+	"github.com/tencent/caelus/pkg/caelus/metrics"
+	"github.com/tencent/caelus/pkg/caelus/metrics/outer/serverrequest"
+	"github.com/tencent/caelus/pkg/caelus/metrics/outer/textfile"
+	"github.com/tencent/caelus/pkg/caelus/statestore"
+	"github.com/tencent/caelus/pkg/version/verflag"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+	"github.com/spf13/pflag"
+	"k8s.io/klog"
+)
+
+// ApiOption describe API related data
+type ApiOption struct {
+	// Profiling enable pprof debug
+	Profiling       bool
+	InsecureAddress string
+	InsecurePort    string
+	// statsMetric describe metrics which need to show in prometheus mode
+	statsMetric map[metrics.StatsMetric]interface{}
+	// prometheusRegistry used for register prometheus metrics
+	prometheusRegistry *prometheus.Registry
+}
+
+// AddFlags describe API related flags
+func (a *ApiOption) AddFlags(fs *pflag.FlagSet) {
+	fs.BoolVar(&a.Profiling, "profiling", true,
+		"Enable profiling via web interface host:port/debug/pprof/. Default is true")
+	fs.StringVar(&a.InsecureAddress, "insecure-bind-address", "127.0.0.1",
+		"The IP address on which to serve the --insecure-port. Defaults to localhost")
+	fs.StringVar(&a.InsecurePort, "insecure-port", "10030",
+		"The port on which to serve unsecured, unauthenticated access. Default 10030")
+}
+
+// Init function init metrics collecting manager
+func (a *ApiOption) Init(ctx *context.CaelusContext) error {
+	a.statsMetric = make(map[metrics.StatsMetric]interface{})
+	a.prometheusRegistry = prometheus.NewRegistry()
+	return nil
+}
+
+// Run main loop, now nothing to do
+func (a *ApiOption) Run(stopCh <-chan struct{}) error {
+	return nil
+}
+
+func (a *ApiOption) loadStatsMetric(name metrics.StatsMetric, stat interface{}) {
+	a.statsMetric[name] = stat
+}
+
+// RegisterServer register API route
+func (a *ApiOption) RegisterServer(node string) error {
+	// start server
+	mux := http.NewServeMux()
+	if a.Profiling {
+		mux.HandleFunc("/debug/pprof/", pprof.Index)
+		mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
+		mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
+	}
+
+	// register prometheus metrics
+	metrics.RegisterTotalMetrics(a.prometheusRegistry)
+	textfile.RegisterTextFileMetrics(a.prometheusRegistry)
+	stStore := a.statsMetric[metrics.StatsMetricStore].(statestore.StateStore)
+	serverrequest.RegisterRequestMetrics(a.prometheusRegistry, stStore)
+	a.prometheusRegistry.MustRegister(metrics.NewPrometheusCollector(node, a.statsMetric,
+		metrics.ResourceMetricsConfig()))
+	mux.Handle("/metrics", promhttp.HandlerFor(a.prometheusRegistry, promhttp.HandlerOpts{
+		ErrorHandling: promhttp.ContinueOnError}))
+
+	// register version api
+	mux.HandleFunc("/version", verflag.RequestVersion)
+
+	mux.HandleFunc("/klog", func(w http.ResponseWriter, r *http.Request) {
+		v := r.URL.Query().Get("v")
+		if v != "" {
+			flag.Lookup("v").Value.Set(v)
+			klog.Infof("set log level to %v", v)
+		}
+	})
+
+	handler := http.TimeoutHandler(mux, time.Duration(1*time.Minute), "time out")
+	httpServer := &http.Server{
+		Handler:        handler,
+		MaxHeaderBytes: 1 << 20,
+	}
+
+	insecureLocation := net.JoinHostPort(a.InsecureAddress, a.InsecurePort)
+	listener, err := net.Listen("tcp", insecureLocation)
+	if err != nil {
+		return fmt.Errorf("listen(%s) err: %v", insecureLocation, err)
+	}
+
+	go func() {
+		err = httpServer.Serve(listener)
+		if err != nil {
+			err = fmt.Errorf("start listen server err: %v", err)
+		}
+	}()
+
+	return err
+}