Skip to content

Commit

Permalink
base frame
Browse files Browse the repository at this point in the history
the executable codes for caelus
  • Loading branch information
ddongchen committed Oct 13, 2021
0 parents commit 79ea1bf
Show file tree
Hide file tree
Showing 279 changed files with 40,231 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.idea/
_output/
379 changes: 379 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
.PHONY: build
build: format
./hack/build.sh

.PHONY: format
format:
./hack/format.sh

.PHONY: test
test:
./hack/test.sh

.PHONY: clean
clean:
./hack/clean.sh

.PHONY: image
image: build
./hack/image.sh
73 changes: 73 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
## Caelus
[![GitHub license](https://img.shields.io/badge/license-Apache--2.0-brightgreen)](https://github.com/Tencent/caelus/blob/master/LICENSE)
[![Release](https://img.shields.io/github/v/release/Tencent/caelus.svg)](https://github.com/Tencent/caelus/releases)
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/Tencent/caelus/pulls)

Caelus is a set of Kubernetes solutions for reusing idle resources of nodes by running extra batch jobs, these resources come from
the underutilization of online jobs, especially during low traffic periods. To make batch jobs compatible with online jobs,
caelus dynamically manages multiple resource isolation mechanisms and also checks abnormalities of various metrics.
Batch jobs will be throttled or even killed if interference detected.

## Features

* Collect various metrics, including node resources, cgroup resources and online jobs latency

* Batch jobs could be running on YARN or Kubernetes

* Predict total resource usages of the node, including online jobs and kernel modules, such as slab

* Dynamically manage multiple resource isolation mechanisms, such as CPU, memory, and disk space

* Dynamically check abnormalities of various metrics, such as CPU usage or online jobs latency

* Throttle or even kill batch jobs when resource pressure or latency spike detected

* Prometheus metrics supported

* Alarm supported

## Usage

Find more usage at [Tutorial.md](doc/tutorial.md). The project also have two attached tools:

#### nm_operator

[nm_operator](doc/nm_operator.md) is used to execute YARN commands in the way of remote API.

#### metric_adapter

[metric_adapter](doc/metric_adapter.md) is used to collect more application metrics with adapter extension.


## Getting started

### build

``` sh
# binary build, which generates binary under _output/bin/
$ make build

# image build
$ make image

# run unit test
$ make test
```

### Run

```sh
# running in script
$ caelus --config=hack/config/caelus.json --hostname-override=xxx --v=2

# running in image
$ kubectl create -f hack/yaml/caelus.json
$ kubectl label node colation=true
$ kubectl -n kube-system get daemonset
```

## Contributing
For more information about contributing issues or pull requests, see our [Contributing to Caelus](doc/contributing.md).

## License
Caelus is under the Apache License 2.0. See the [License](LICENSE) file for details.
128 changes: 128 additions & 0 deletions cmd/caelus/app/api.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/*
* Copyright (c) 2021 THL A29 Limited, a Tencent company.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
*
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package app

import (
"flag"
"fmt"
"net"
"net/http"
"net/http/pprof"
"time"

"github.com/tencent/caelus/cmd/caelus/context"
"github.com/tencent/caelus/pkg/caelus/metrics"
"github.com/tencent/caelus/pkg/caelus/metrics/outer/serverrequest"
"github.com/tencent/caelus/pkg/caelus/metrics/outer/textfile"
"github.com/tencent/caelus/pkg/caelus/statestore"
"github.com/tencent/caelus/pkg/version/verflag"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/spf13/pflag"
"k8s.io/klog"
)

// ApiOption describe API related data
type ApiOption struct {
// Profiling enable pprof debug
Profiling bool
InsecureAddress string
InsecurePort string
// statsMetric describe metrics which need to show in prometheus mode
statsMetric map[metrics.StatsMetric]interface{}
// prometheusRegistry used for register prometheus metrics
prometheusRegistry *prometheus.Registry
}

// AddFlags describe API related flags
func (a *ApiOption) AddFlags(fs *pflag.FlagSet) {
fs.BoolVar(&a.Profiling, "profiling", true,
"Enable profiling via web interface host:port/debug/pprof/. Default is true")
fs.StringVar(&a.InsecureAddress, "insecure-bind-address", "127.0.0.1",
"The IP address on which to serve the --insecure-port. Defaults to localhost")
fs.StringVar(&a.InsecurePort, "insecure-port", "10030",
"The port on which to serve unsecured, unauthenticated access. Default 10030")
}

// Init function init metrics collecting manager
func (a *ApiOption) Init(ctx *context.CaelusContext) error {
a.statsMetric = make(map[metrics.StatsMetric]interface{})
a.prometheusRegistry = prometheus.NewRegistry()
return nil
}

// Run main loop, now nothing to do
func (a *ApiOption) Run(stopCh <-chan struct{}) error {
return nil
}

func (a *ApiOption) loadStatsMetric(name metrics.StatsMetric, stat interface{}) {
a.statsMetric[name] = stat
}

// RegisterServer register API route
func (a *ApiOption) RegisterServer(node string) error {
// start server
mux := http.NewServeMux()
if a.Profiling {
mux.HandleFunc("/debug/pprof/", pprof.Index)
mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
}

// register prometheus metrics
metrics.RegisterTotalMetrics(a.prometheusRegistry)
textfile.RegisterTextFileMetrics(a.prometheusRegistry)
stStore := a.statsMetric[metrics.StatsMetricStore].(statestore.StateStore)
serverrequest.RegisterRequestMetrics(a.prometheusRegistry, stStore)
a.prometheusRegistry.MustRegister(metrics.NewPrometheusCollector(node, a.statsMetric,
metrics.ResourceMetricsConfig()))
mux.Handle("/metrics", promhttp.HandlerFor(a.prometheusRegistry, promhttp.HandlerOpts{
ErrorHandling: promhttp.ContinueOnError}))

// register version api
mux.HandleFunc("/version", verflag.RequestVersion)

mux.HandleFunc("/klog", func(w http.ResponseWriter, r *http.Request) {
v := r.URL.Query().Get("v")
if v != "" {
flag.Lookup("v").Value.Set(v)
klog.Infof("set log level to %v", v)
}
})

handler := http.TimeoutHandler(mux, time.Duration(1*time.Minute), "time out")
httpServer := &http.Server{
Handler: handler,
MaxHeaderBytes: 1 << 20,
}

insecureLocation := net.JoinHostPort(a.InsecureAddress, a.InsecurePort)
listener, err := net.Listen("tcp", insecureLocation)
if err != nil {
return fmt.Errorf("listen(%s) err: %v", insecureLocation, err)
}

go func() {
err = httpServer.Serve(listener)
if err != nil {
err = fmt.Errorf("start listen server err: %v", err)
}
}()

return err
}
Loading

0 comments on commit 79ea1bf

Please sign in to comment.