From a89e0985f2d487825f692b145d0c6d02f4dcca32 Mon Sep 17 00:00:00 2001 From: "g.arczynski" Date: Thu, 7 Nov 2024 12:59:29 +0100 Subject: [PATCH 01/10] lxd watchdog --- Makefile | 8 +- cli.go | 26 ++ tools/lxd_watchdog/go.mod | 32 ++ tools/lxd_watchdog/go.sum | 108 +++++ tools/lxd_watchdog/lxd_watchdog.go | 636 +++++++++++++++++++++++++++++ 5 files changed, 809 insertions(+), 1 deletion(-) create mode 100644 tools/lxd_watchdog/go.mod create mode 100644 tools/lxd_watchdog/go.sum create mode 100644 tools/lxd_watchdog/lxd_watchdog.go diff --git a/Makefile b/Makefile index de8cde62..2bf455c2 100644 --- a/Makefile +++ b/Makefile @@ -72,9 +72,15 @@ coverage.coverprofile: $(COVERPROFILES) $(GO) tool cover -func=$@ .PHONY: build -build: .deps-fetched +build: .deps-fetched watchdog $(GO) install -tags netgo -ldflags "$(GOBUILD_LDFLAGS)" $(ALL_PACKAGES) +.PHONY: watchdog +watchdog: + pushd tools/lxd_watchdog && \ + $(GO) install . && \ + popd + .PHONY: crossbuild crossbuild: .deps-fetched $(CROSSBUILD_BINARIES) diff --git a/cli.go b/cli.go index fa587197..a09c8af6 100644 --- a/cli.go +++ b/cli.go @@ -11,6 +11,7 @@ import ( "os" "os/exec" "os/signal" + "strconv" "strings" "syscall" "time" @@ -143,6 +144,9 @@ func (i *CLI) Setup() (bool, error) { logger.WithField("cfg", fmt.Sprintf("%#v", i.Config)).Debug("read config") + i.checkIfCanRun() + i.writePid() + i.setupSentry() i.setupMetrics() @@ -265,6 +269,28 @@ func (i *CLI) Run() { } } +func (i *CLI) checkIfCanRun() { + file, err := os.Open("/tmp/worker.lock") + if err == nil { + + file.Close() + i.logger.WithField("err", err).Error("/tmp/worker.lock exists, not running!") + os.Exit(-11) + } + +} + +func (i *CLI) writePid() { + file, err := os.Create("/tmp/worker.pid") + if err != nil { + i.logger.WithField("err", err).Error("failed to write worker pid") + return + } + defer file.Close() + + file.WriteString(strconv.Itoa(os.Getpid())) +} + func (i *CLI) setupHeartbeat() { hbURL := i.c.String("heartbeat-url") if hbURL == "" { diff --git a/tools/lxd_watchdog/go.mod b/tools/lxd_watchdog/go.mod new file mode 100644 index 00000000..00b70775 --- /dev/null +++ b/tools/lxd_watchdog/go.mod @@ -0,0 +1,32 @@ +module github.com/travis-ci/worker/lxd_watchdog + +go 1.23.2 + +require ( + github.com/canonical/lxd v0.0.0-20241105131838-efae303214c5 // indirect + github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 // indirect + github.com/go-jose/go-jose/v4 v4.0.4 // indirect + github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/gorilla/securecookie v1.1.2 // indirect + github.com/gorilla/websocket v1.5.1 // indirect + github.com/kr/fs v0.1.0 // indirect + github.com/muhlemmer/gu v0.3.1 // indirect + github.com/pkg/sftp v1.13.7 // indirect + github.com/pkg/xattr v0.4.10 // indirect + github.com/sirupsen/logrus v1.9.3 // indirect + github.com/zitadel/logging v0.6.1 // indirect + github.com/zitadel/oidc/v3 v3.32.1 // indirect + github.com/zitadel/schema v1.3.0 // indirect + go.opentelemetry.io/otel v1.31.0 // indirect + go.opentelemetry.io/otel/metric v1.31.0 // indirect + go.opentelemetry.io/otel/trace v1.31.0 // indirect + golang.org/x/crypto v0.28.0 // indirect + golang.org/x/net v0.30.0 // indirect + golang.org/x/oauth2 v0.23.0 // indirect + golang.org/x/sys v0.26.0 // indirect + golang.org/x/term v0.25.0 // indirect + golang.org/x/text v0.19.0 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect +) diff --git a/tools/lxd_watchdog/go.sum b/tools/lxd_watchdog/go.sum new file mode 100644 index 00000000..303a7942 --- /dev/null +++ b/tools/lxd_watchdog/go.sum @@ -0,0 +1,108 @@ +github.com/canonical/lxd v0.0.0-20241105131838-efae303214c5 h1:86R20y0SKAljYK5h4LDubdwKnR+1BLsIjyn4lMoQU/A= +github.com/canonical/lxd v0.0.0-20241105131838-efae303214c5/go.mod h1:/lJQAYIjxVKguNpoVoiK8ppnNngrUpqj+xceYyuNt5g= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 h1:fmFk0Wt3bBxxwZnu48jqMdaOR/IZ4vdtJFuaFV8MpIE= +github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3/go.mod h1:bJWSKrZyQvfTnb2OudyUjurSG4/edverV7n82+K3JiM= +github.com/go-jose/go-jose/v4 v4.0.4 h1:VsjPI33J0SB9vQM6PLmNjoHqMQNGPiZ0rHL7Ni7Q6/E= +github.com/go-jose/go-jose/v4 v4.0.4/go.mod h1:NKb5HO1EZccyMpiZNbdUw/14tiXNyUJh188dfnMCAfc= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/securecookie v1.1.2 h1:YCIWL56dvtr73r6715mJs5ZvhtnY73hBvEF8kXD8ePA= +github.com/gorilla/securecookie v1.1.2/go.mod h1:NfCASbcHqRSY+3a8tlWJwsQap2VX5pwzwo4h3eOamfo= +github.com/gorilla/websocket v1.5.1 h1:gmztn0JnHVt9JZquRuzLw3g4wouNVzKL15iLr/zn/QY= +github.com/gorilla/websocket v1.5.1/go.mod h1:x3kM2JMyaluk02fnUJpQuwD2dCS5NDG2ZHL0uE0tcaY= +github.com/kr/fs v0.1.0 h1:Jskdu9ieNAYnjxsi0LbQp1ulIKZV1LAFgK1tWhpZgl8= +github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/muhlemmer/gu v0.3.1 h1:7EAqmFrW7n3hETvuAdmFmn4hS8W+z3LgKtrnow+YzNM= +github.com/muhlemmer/gu v0.3.1/go.mod h1:YHtHR+gxM+bKEIIs7Hmi9sPT3ZDUvTN/i88wQpZkrdM= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= +github.com/pkg/sftp v1.13.7 h1:uv+I3nNJvlKZIQGSr8JVQLNHFU9YhhNpvC14Y6KgmSM= +github.com/pkg/sftp v1.13.7/go.mod h1:KMKI0t3T6hfA+lTR/ssZdunHo+uwq7ghoN09/FSu3DY= +github.com/pkg/xattr v0.4.10 h1:Qe0mtiNFHQZ296vRgUjRCoPHPqH7VdTOrZx3g0T+pGA= +github.com/pkg/xattr v0.4.10/go.mod h1:di8WF84zAKk8jzR1UBTEWh9AUlIZZ7M/JNt8e9B6ktU= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/zitadel/logging v0.6.1 h1:Vyzk1rl9Kq9RCevcpX6ujUaTYFX43aa4LkvV1TvUk+Y= +github.com/zitadel/logging v0.6.1/go.mod h1:Y4CyAXHpl3Mig6JOszcV5Rqqsojj+3n7y2F591Mp/ow= +github.com/zitadel/oidc/v3 v3.32.1 h1:uE7IgQq4yJfQPXaIbvkOjOaIyb10OF1QtG1COUB/efE= +github.com/zitadel/oidc/v3 v3.32.1/go.mod h1:DyE/XClysRK/ozFaZSqlYamKVnTh4l6Ln25ihSNI03w= +github.com/zitadel/schema v1.3.0 h1:kQ9W9tvIwZICCKWcMvCEweXET1OcOyGEuFbHs4o5kg0= +github.com/zitadel/schema v1.3.0/go.mod h1:NptN6mkBDFvERUCvZHlvWmmME+gmZ44xzwRXwhzsbtc= +go.opentelemetry.io/otel v1.31.0 h1:NsJcKPIW0D0H3NgzPDHmo0WW6SptzPdqg/L1zsIm2hY= +go.opentelemetry.io/otel v1.31.0/go.mod h1:O0C14Yl9FgkjqcCZAsE053C13OaddMYr/hz6clDkEJE= +go.opentelemetry.io/otel/metric v1.31.0 h1:FSErL0ATQAmYHUIzSezZibnyVlft1ybhy4ozRPcF2fE= +go.opentelemetry.io/otel/metric v1.31.0/go.mod h1:C3dEloVbLuYoX41KpmAhOqNriGbA+qqH6PQ5E5mUfnY= +go.opentelemetry.io/otel/trace v1.31.0 h1:ffjsj1aRouKewfr85U2aGagJ46+MvodynlQ1HYdmJys= +go.opentelemetry.io/otel/trace v1.31.0/go.mod h1:TXZkRk7SM2ZQLtR6eoAWQFIHPvzQ06FJAsO1tJg480A= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= +golang.org/x/crypto v0.28.0 h1:GBDwsMXVQi34v5CCYUm2jkJvu4cbtru2U4TN2PSyQnw= +golang.org/x/crypto v0.28.0/go.mod h1:rmgy+3RHxRZMyY0jjAJShp2zgEdOqj2AO7U0pYmeQ7U= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= +golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= +golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs= +golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220408201424-a24fb2fb8a0f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= +golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= +golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/tools/lxd_watchdog/lxd_watchdog.go b/tools/lxd_watchdog/lxd_watchdog.go new file mode 100644 index 00000000..d911a131 --- /dev/null +++ b/tools/lxd_watchdog/lxd_watchdog.go @@ -0,0 +1,636 @@ +package main + +// worker watchdog - performs basic checks for worker lxd backend +// runs as a single check or a loop with '-l' or '--loop' parameters +// configuration: +// WATCHDOG_PING_URL: url to ping, default www.google.com +// WATCHDOG_INTERVAL: sleep in minutes before retry (when run with --loop param) +// DATADOG_URL: sends notification to Datadog - requires full url with key +// WATCHDOG_IMAGE: image to be used, default alpine:3.20 +// additionally following envs (equal to worker) are available: +// NETWORK_STATIC, NETWORK_DNS, NETWORK_DNS, HTTP_PROXY, HTTPS_PROXY, FTP_PROXY, NO_PROXY +// +// on connection error, watchdog kills the worker basing on it's pid from /tmp/worker.pid +// and creates a /tmp/worker.lock until connection is available again + +import ( + "bytes" + "encoding/json" + "errors" + "fmt" + "net" + "net/http" + "os" + "strconv" + "strings" + "sync" + "syscall" + "time" + + lxd "github.com/canonical/lxd/client" + lxdconfig "github.com/canonical/lxd/lxc/config" + lxdapi "github.com/canonical/lxd/shared/api" +) + +type lxdWatchdog struct { + client lxd.InstanceServer + url string + networkStatic bool + networkGateway string + networkSubnet *net.IPNet + networkMTU string + networkDNS []string + networkLeases map[string]string + networkLeasesLock sync.Mutex + + httpProxy, httpsProxy, ftpProxy, noProxy string +} + +func newLxdWatchdog() (*lxdWatchdog, error) { + client, err := lxd.ConnectLXDUnix("", nil) + if err != nil { + fmt.Printf("can't connect lxd: %v\n", err) + return nil, err + } + + networkStatic := false + networkMTU := "1500" + var networkGateway string + var networkSubnet *net.IPNet + var networkLeases map[string]string + + if os.Getenv("NETWORK_STATIC") != "" { + networkStatic = os.Getenv("NETWORK_STATIC") == "true" + + network, _, err := client.GetNetwork("lxdbr0") + if err != nil { + return nil, err + } + + if network.Managed { + // Get MTU + if network.Config["bridge.mtu"] != "" { + networkMTU = network.Config["bridge.mtu"] + } + + // Get subnet + if network.Config["ipv4.address"] == "" { + return nil, fmt.Errorf("no IPv4 subnet set on the network") + } + + gateway, subnet, err := net.ParseCIDR(network.Config["ipv4.address"]) + if err != nil { + return nil, err + } + + networkGateway = gateway.String() + networkSubnet = subnet + } else { + networkState, err := client.GetNetworkState("lxdbr0") + if err != nil { + return nil, err + } + + // Get MTU + networkMTU = fmt.Sprintf("%d", networkState.Mtu) + + // Get subnet + for _, address := range networkState.Addresses { + if address.Family != "inet" || address.Scope != "global" { + continue + } + + gateway, subnet, err := net.ParseCIDR(fmt.Sprintf("%s/%s", address.Address, address.Netmask)) + if err != nil { + return nil, err + } + + networkGateway = gateway.String() + networkSubnet = subnet + } + } + networkLeases = map[string]string{} + } + + networkDNS := []string{"1.1.1.1", "1.0.0.1"} + if os.Getenv("NETWORK_DNS") != "" { + networkDNS = strings.Split(os.Getenv("NETWORK_DNS"), ",") + } + + httpProxy := os.Getenv("HTTP_PROXY") + httpsProxy := os.Getenv("HTTPS_PROXY") + ftpProxy := os.Getenv("FTP_PROXY") + noProxy := os.Getenv("NO_PROXY") + url := "www.google.com" + + if os.Getenv("WATCHDOG_PING_URL") != "" { + url = os.Getenv("WATCHDOG_PING_URL") + } + + return &lxdWatchdog{ + client: client, + + url: url, + + networkSubnet: networkSubnet, + networkGateway: networkGateway, + networkStatic: networkStatic, + networkMTU: networkMTU, + networkDNS: networkDNS, + networkLeases: networkLeases, + + httpProxy: httpProxy, + httpsProxy: httpsProxy, + ftpProxy: ftpProxy, + noProxy: noProxy, + }, nil +} + +func (p *lxdWatchdog) allocateAddress(containerName string) (string, error) { + p.networkLeasesLock.Lock() + defer p.networkLeasesLock.Unlock() + + // Get all IPs + inc := func(ip net.IP) { + for j := len(ip) - 1; j >= 0; j-- { + ip[j]++ + if ip[j] > 0 { + break + } + } + } + + stringInSlice := func(key string, list []string) bool { + for _, entry := range list { + if entry == key { + return true + } + } + + return false + } + + var ips []string + ip := net.ParseIP(p.networkGateway) + for ip := ip.Mask(p.networkSubnet.Mask); p.networkSubnet.Contains(ip); inc(ip) { + ips = append(ips, ip.String()) + } + + usedIPs := []string{} + for _, usedIP := range p.networkLeases { + usedIPs = append(usedIPs, usedIP) + } + + // Find a free address + for _, ip := range ips { + // Skip used addresses + if ip == ips[0] { + continue + } + + if ip == p.networkGateway { + continue + } + + if ip == ips[len(ips)-1] { + continue + } + + if stringInSlice(ip, usedIPs) { + continue + } + + // Allocate the address + p.networkLeases[containerName] = ip + size, _ := p.networkSubnet.Mask.Size() + return fmt.Sprintf("%s/%d", ip, size), nil + } + + return "", fmt.Errorf("no free addresses found") +} + +func (p *lxdWatchdog) releaseAddress(containerName string) { + p.networkLeasesLock.Lock() + defer p.networkLeasesLock.Unlock() + + delete(p.networkLeases, containerName) +} + +func (p *lxdWatchdog) getImage(imageName string) (lxd.ImageServer, *lxdapi.Image, error) { + // Remote images + if strings.Contains(imageName, ":") { + defaultConfig := lxdconfig.NewConfig("", true) + + remote, fingerprint, err := defaultConfig.ParseRemote(imageName) + if err != nil { + return nil, nil, err + } + + imageServer, err := defaultConfig.GetImageServer(remote) + if err != nil { + return nil, nil, err + } + + if fingerprint == "" { + fingerprint = "default" + } + + alias, _, err := imageServer.GetImageAlias(fingerprint) + if err == nil { + fingerprint = alias.Target + } + + image, _, err := imageServer.GetImage(fingerprint) + if err != nil { + return nil, nil, err + } + + return imageServer, image, nil + } + + // Local images + fingerprint := imageName + alias, _, err := p.client.GetImageAlias(imageName) + if err == nil { + fingerprint = alias.Target + } + + image, _, err := p.client.GetImage(fingerprint) + if err != nil { + return nil, nil, err + } + + return p.client, image, nil +} + +func (p *lxdWatchdog) Start() error { + + var ( + err error + ) + + containerName := "watchdogContainer" + imageName := os.Getenv("WATCHDOG_IMAGE") + if imageName == "" { + imageName = "images:alpine/3.20" + } + + imageServer, image, err := p.getImage(imageName) + if err != nil { + fmt.Printf("Error getting image: %v\n", err) + return err + } + + existingContainer, _, err := p.client.GetInstance(containerName) + if err == nil { + if existingContainer.StatusCode != lxdapi.Stopped { + // Force stop the container + req := lxdapi.InstanceStatePut{ + Action: "stop", + Timeout: -1, + Force: true, + } + + op, err := p.client.UpdateInstanceState(containerName, req, "") + if err != nil { + return fmt.Errorf("couldn't stop preexisting container before create: %v", err) + } + + err = op.Wait() + if err != nil { + return fmt.Errorf("couldn't stop preexisting container before create: %v", err) + } + } + + op, err := p.client.DeleteInstance(containerName) + if err != nil { + return fmt.Errorf("couldn't remove preexisting container before create: %v", err) + } + + err = op.Wait() + if err != nil { + return fmt.Errorf("couldn't remove preexisting container before create: %v", err) + } + + if p.networkStatic { + p.releaseAddress(containerName) + } + + fmt.Printf("removed preexisting container before create\n") + } + + // Create the container + config := map[string]string{ + "security.devlxd": "false", + "security.idmap.isolated": "true", + "security.idmap.size": "100000", + "security.nesting": "true", + "security.privileged": "false", + "security.syscalls.intercept.mknod": "true", + "security.syscalls.intercept.setxattr": "true", + "limits.memory": "500MB", + "limits.processes": "1000", + "linux.kernel_modules": "overlay", + "limits.cpu": "1", + } + + req := lxdapi.InstancesPost{ + Name: containerName, + } + req.Config = config + + rop, err := p.client.CreateInstanceFromImage(imageServer, *image, req) + if err != nil { + return fmt.Errorf("couldn't create a new container: %v", err) + } + + err = rop.Wait() + if err != nil { + return fmt.Errorf("couldn't create a new container: %v", err) + } + + // Configure the container devices + container, etag, err := p.client.GetInstance(containerName) + if err != nil { + return fmt.Errorf("failed to get the container: %v", err) + } + + // Disk limits + container.Devices["root"] = container.ExpandedDevices["root"] + container.Devices["root"]["size"] = "1GB" + + // Network limits + container.Devices["eth0"] = container.ExpandedDevices["eth0"] + container.Devices["eth0"]["limits.max"] = "100Mbit" + container.Devices["eth0"]["security.mac_filtering"] = "true" + container.Devices["eth0"]["security.ipv4_filtering"] = "true" + container.Devices["eth0"]["security.ipv6_filtering"] = "false" + + // Static networking + if p.networkStatic { + address, err := p.allocateAddress(containerName) + if err != nil { + return err + } + + dns, err := json.Marshal(p.networkDNS) + if err != nil { + return err + } + + container.Devices["eth0"]["ipv4.address"] = strings.Split(address, "/")[0] + + var fileName, content string + fileName = "/etc/netplan/50-cloud-init.yaml" + content = fmt.Sprintf(`network: + version: 2 + ethernets: + eth0: + addresses: + - %s + gateway4: %s + nameservers: + addresses: %s + mtu: %s + `, address, p.networkGateway, dns, p.networkMTU) + + args := lxd.InstanceFileArgs{ + Type: "file", + Mode: 0644, + UID: 0, + GID: 0, + Content: strings.NewReader(string(content)), + } + + err = p.client.CreateInstanceFile(containerName, fileName, args) + if err != nil { + return fmt.Errorf("failed to upload netplan/interfaces to container: %v", err) + } + } + + // Save the changes + op, err := p.client.UpdateInstance(containerName, container.Writable(), etag) + if err != nil { + return fmt.Errorf("failed to update the container config: %v", err) + } + + err = op.Wait() + if err != nil { + return fmt.Errorf("failed to update the container config: %v", err) + } + + fmt.Printf("STARTING\n") + // Start the container + op, err = p.client.UpdateInstanceState(containerName, lxdapi.InstanceStatePut{Action: "start", Timeout: -1}, "") + if err != nil { + return fmt.Errorf("couldn't start new container: %v", err) + } + + err = op.Wait() + if err != nil { + return fmt.Errorf("couldn't start new container: %v", err) + } + + fmt.Printf("STARTED - check connection\n") + + // Wait for connectivity + connectivityCheck := func() error { + exec := lxdapi.InstanceExecPost{ + Command: []string{"ping", p.url, "-c", "1"}, + } + + // Spawn the command + op, err := p.client.ExecInstance(containerName, exec, nil) + if err != nil { + return err + } + + err = op.Wait() + if err != nil { + return err + } + opAPI := op.Get() + + retVal := int32(opAPI.Metadata["return"].(float64)) + if retVal != 0 { + return fmt.Errorf("ping exited with %d", retVal) + } + return nil + } + + // Wait 30s for network + time.Sleep(1 * time.Second) + for i := 0; i < 60; i++ { + err = connectivityCheck() + if err == nil { + break + } + //fmt.Printf("wait for connection\n") + + time.Sleep(500 * time.Millisecond) + } + + if err != nil { + fmt.Printf("container didn't have connectivity after 30s: %v\n", err) + err = p.killWorker() + if err != nil { + fmt.Printf("kill worker error: %v\n", err) + } + + p.datadogAlert("[TRAVIS][LXC] Watchdog error", "container didn't have connectivity after 30s") + } + fmt.Printf("STARTED - OK\n") + + p.setWorkerLock(false) + + // Get the container + container, _, err = p.client.GetInstance(containerName) + if err != nil { + return fmt.Errorf("failed to get the container: %v", err) + } + + if container.StatusCode != lxdapi.Stopped { + // Force stop the container + req := lxdapi.InstanceStatePut{ + Action: "stop", + Timeout: -1, + Force: true, + } + + op, err := p.client.UpdateInstanceState(container.Name, req, "") + if err != nil { + return fmt.Errorf("couldn't stop preexisting container before create: %v", err) + + } + + err = op.Wait() + if err != nil { + return fmt.Errorf("couldn't stop preexisting container before create: %v", err) + + } + } + + op, err = p.client.DeleteInstance(container.Name) + if err != nil { + return fmt.Errorf("couldn't remove preexisting container before create: %v", err) + } + + err = op.Wait() + if err != nil { + return fmt.Errorf("couldn't remove preexisting container before create: %v", err) + } + + if p.networkStatic { + p.releaseAddress(container.Name) + } + + fmt.Printf("CLEANUP DONE\n") + return nil +} +func (p *lxdWatchdog) setWorkerLock(value bool) error { + if value { + file, err := os.Create("/tmp/worker.lock") + if err != nil { + return fmt.Errorf("can't set the worker lock, can't access the worker.lock file: %v", err) + } + defer file.Close() + _, err = file.Write([]byte{'1'}) + if err != nil { + return fmt.Errorf("can't set the worker lock, can't write the worker.lock file: %v", err) + } + } else { + err := os.Remove("/tmp/worker.lock") + if err != nil && !errors.Is(err, os.ErrNotExist) { + + return fmt.Errorf("can't remove the worker lock!: %v", err) + } + if err != nil { + fmt.Printf("Skipping remove lock, doesn't exist\n") + } + } + return nil +} + +func (p *lxdWatchdog) killWorker() error { + file, err := os.Open("/tmp/worker.pid") + if err != nil { + return fmt.Errorf("can't kill the worker, can't access the worker.pid file: %v", err) + } + defer file.Close() + data := make([]byte, 64) + + var count int = 0 + count, err = file.Read(data) + if err != nil { + return fmt.Errorf("can't kill the worker, can't read the worker.pid file: %v", err) + } + pid := 0 + pid, err = strconv.Atoi(string(data[:count])) + if err != nil || pid == 0 { + return fmt.Errorf("can't kill the worker, can't read the worker.pid : %v", err) + } + p.setWorkerLock(true) + syscall.Kill(pid, syscall.SIGTERM) + fmt.Printf("Sent SIGTERM to worker process [%d]\n", pid) + return nil +} + +func (p *lxdWatchdog) datadogAlert(title string, text string) { + + priority := "high" + hostname, err := os.Hostname() + if err != nil { + hostname = "n/a" + } + content := fmt.Sprintf(`{"title": "%s", "text" : "%s", "priority": "%s", "host": "%s", "tags": ["TravisCI", "lxc_alerts"], "alert_type": "error"}`, title, text, priority, hostname) + url := os.Getenv("DATADOG_URL") + if url == "" { + return + } + r, err := http.NewRequest("POST", url, bytes.NewBufferString(content)) + if err != nil { + fmt.Printf("ERROR on creating request for Datadog: %v\n", err) + } + r.Header.Add("Content-Type", "application/json") + + client := &http.Client{} + _, err = client.Do(r) + if err != nil { + fmt.Printf("ERROR on sending request to Datadog: %v\n", err) + } +} + +func main() { + args := os.Args + loop := false + sleepTime := 60 * time.Minute + if len(args) > 1 && (args[1] == "-l" || args[1] == "--loop") { + loop = true + sleepStr := os.Getenv("WATCHDOG_INTERVAL") + if sleepStr != "" { + + t, err := strconv.Atoi(sleepStr) + if err == nil { + sleepTime = time.Duration(t) * time.Minute + } + } + } + fmt.Println("Starting LXD watchdog") + w, err := newLxdWatchdog() + for { + if err == nil { + err = w.Start() + if err != nil { + fmt.Printf("error on start: %v\n", err) + } + } else { + fmt.Printf("Starting LXD watchdog error: %v\n", err) + } + if !loop { + break + } + err = nil + + time.Sleep(sleepTime) + } +} From 39b61e0574b0bcd11015271f8afd699397a5938d Mon Sep 17 00:00:00 2001 From: "g.arczynski" Date: Tue, 12 Nov 2024 15:42:16 +0100 Subject: [PATCH 02/10] reserved ips for watchdog, checking used ips, network config update --- backend/lxd.go | 4 +- tools/lxd_watchdog/lxd_watchdog.go | 97 +++++++++++++++++++++++------- 2 files changed, 78 insertions(+), 23 deletions(-) diff --git a/backend/lxd.go b/backend/lxd.go index 3687521b..6f1885c8 100644 --- a/backend/lxd.go +++ b/backend/lxd.go @@ -388,7 +388,9 @@ func (p *lxdProvider) allocateAddress(containerName string) (string, error) { var ips []string ip := net.ParseIP(p.networkGateway) for ip := ip.Mask(p.networkSubnet.Mask); p.networkSubnet.Contains(ip); inc(ip) { - ips = append(ips, ip.String()) + if ip[3] < 230 { // reserving some IPs for watchdog + ips = append(ips, ip.String()) + } } usedIPs := []string{} diff --git a/tools/lxd_watchdog/lxd_watchdog.go b/tools/lxd_watchdog/lxd_watchdog.go index d911a131..4571ff18 100644 --- a/tools/lxd_watchdog/lxd_watchdog.go +++ b/tools/lxd_watchdog/lxd_watchdog.go @@ -15,7 +15,6 @@ package main import ( "bytes" - "encoding/json" "errors" "fmt" "net" @@ -146,11 +145,37 @@ func newLxdWatchdog() (*lxdWatchdog, error) { }, nil } +func (p *lxdWatchdog) getInstancesIps() []string { + result := []string{} + instances, err := p.client.GetInstances(lxdapi.InstanceTypeAny) + if err != nil { + fmt.Printf("Error on getting instances: %v\n", err) + return result + } + for _, i := range instances { + state, _, err := p.client.GetInstanceState(i.Name) + if err != nil { + fmt.Printf("Error on getting instance state: %v\n", err) + return result + } + net := state.Network["eth0"] + for _, addr := range net.Addresses { + if addr.Family == "inet" { + result = append(result, addr.Address) + } + + } + + } + return result +} + func (p *lxdWatchdog) allocateAddress(containerName string) (string, error) { p.networkLeasesLock.Lock() defer p.networkLeasesLock.Unlock() - // Get all IPs + reservedIps := p.getInstancesIps() + inc := func(ip net.IP) { for j := len(ip) - 1; j >= 0; j-- { ip[j]++ @@ -173,7 +198,9 @@ func (p *lxdWatchdog) allocateAddress(containerName string) (string, error) { var ips []string ip := net.ParseIP(p.networkGateway) for ip := ip.Mask(p.networkSubnet.Mask); p.networkSubnet.Contains(ip); inc(ip) { - ips = append(ips, ip.String()) + if ip[3] >= 230 { + ips = append(ips, ip.String()) + } } usedIPs := []string{} @@ -181,6 +208,10 @@ func (p *lxdWatchdog) allocateAddress(containerName string) (string, error) { usedIPs = append(usedIPs, usedIP) } + usedIPs = append(usedIPs, reservedIps...) + + fmt.Printf("usedIPs: %v\n", usedIPs) + // Find a free address for _, ip := range ips { // Skip used addresses @@ -200,6 +231,7 @@ func (p *lxdWatchdog) allocateAddress(containerName string) (string, error) { continue } + fmt.Printf("FREE ADDRESS: %v\n", ip) // Allocate the address p.networkLeases[containerName] = ip size, _ := p.networkSubnet.Mask.Size() @@ -373,26 +405,16 @@ func (p *lxdWatchdog) Start() error { return err } - dns, err := json.Marshal(p.networkDNS) - if err != nil { - return err - } - container.Devices["eth0"]["ipv4.address"] = strings.Split(address, "/")[0] var fileName, content string - fileName = "/etc/netplan/50-cloud-init.yaml" - content = fmt.Sprintf(`network: - version: 2 - ethernets: - eth0: - addresses: - - %s - gateway4: %s - nameservers: - addresses: %s - mtu: %s - `, address, p.networkGateway, dns, p.networkMTU) + fileName = "/etc/network/interfaces" + content = fmt.Sprintf(`auto eth0 +iface eth0 inet static + address %s + gateway: %s + netmask: 255.255.255.0 +`, strings.Split(address, "/")[0], p.networkGateway) args := lxd.InstanceFileArgs{ Type: "file", @@ -404,7 +426,26 @@ func (p *lxdWatchdog) Start() error { err = p.client.CreateInstanceFile(containerName, fileName, args) if err != nil { - return fmt.Errorf("failed to upload netplan/interfaces to container: %v", err) + fmt.Printf("failed to upload network/interfaces to container: %v\n", err) + } + + fileName = "/etc/resolv.conf" + content = fmt.Sprintf("search lxd\nnameserver %s\n", p.networkGateway) + for _, d := range p.networkDNS { + content = fmt.Sprintf("%snameserver %s\n", content, d) + } + + args = lxd.InstanceFileArgs{ + Type: "file", + Mode: 0644, + UID: 0, + GID: 0, + Content: strings.NewReader(string(content)), + } + + err = p.client.CreateInstanceFile(containerName, fileName, args) + if err != nil { + fmt.Printf("failed to upload resolv.conf to container: %v\n", err) } } @@ -433,6 +474,18 @@ func (p *lxdWatchdog) Start() error { fmt.Printf("STARTED - check connection\n") + if p.networkStatic { + exec := lxdapi.InstanceExecPost{ + Command: []string{"route", "add", "default", "gw", "_gateway.lxd"}, + } + + // Spawn the command + _, err = p.client.ExecInstance(containerName, exec, nil) + if err != nil { + fmt.Printf("couldn't add default gateway: %v\n", err) + } + } + // Wait for connectivity connectivityCheck := func() error { exec := lxdapi.InstanceExecPost{ @@ -465,7 +518,7 @@ func (p *lxdWatchdog) Start() error { if err == nil { break } - //fmt.Printf("wait for connection\n") + fmt.Printf("wait for connection\n") time.Sleep(500 * time.Millisecond) } From 8a281d6c4e33242a257e7f2f25ef2ad2930a7f6b Mon Sep 17 00:00:00 2001 From: "g.arczynski" Date: Mon, 18 Nov 2024 13:29:11 +0100 Subject: [PATCH 03/10] watchdog moved to worker --- .snapcraft.yaml | 2 +- cli.go | 37 ++-- config/config.go | 3 + tools/lxd_watchdog/go.mod | 32 ---- tools/lxd_watchdog/go.sum | 108 ----------- .../lxd_watchdog.go => watchdog.go | 181 ++++++++++-------- 6 files changed, 117 insertions(+), 246 deletions(-) delete mode 100644 tools/lxd_watchdog/go.mod delete mode 100644 tools/lxd_watchdog/go.sum rename tools/lxd_watchdog/lxd_watchdog.go => watchdog.go (82%) diff --git a/.snapcraft.yaml b/.snapcraft.yaml index dbee69bc..d3f7aa79 100644 --- a/.snapcraft.yaml +++ b/.snapcraft.yaml @@ -31,7 +31,7 @@ parts: go-packages: - github.com/travis-ci/worker/cmd/travis-worker go-importpath: github.com/travis-ci/worker - go-channel: 1.21/stable + go-channel: 1.23/stable prime: - bin/travis-worker override-build: |- diff --git a/cli.go b/cli.go index a09c8af6..479c67cb 100644 --- a/cli.go +++ b/cli.go @@ -11,7 +11,6 @@ import ( "os" "os/exec" "os/signal" - "strconv" "strings" "syscall" "time" @@ -144,11 +143,19 @@ func (i *CLI) Setup() (bool, error) { logger.WithField("cfg", fmt.Sprintf("%#v", i.Config)).Debug("read config") - i.checkIfCanRun() - i.writePid() - i.setupSentry() i.setupMetrics() + if i.Config.ProviderName == "lxd" { // run watchdog once to check if containers start and get network connection - exits if not + RunLXDWatchdog(false) + + if i.c.Bool("watchdog") { + os.Exit(0) // don't proceed if running with '-watchdog' param + } + } + + if i.Config.ProviderName == "lxd" { + RunLXDWatchdog(true) // start the ldx watchdog loop + } err := i.setupOpenCensus(ctx) if err != nil { @@ -269,28 +276,6 @@ func (i *CLI) Run() { } } -func (i *CLI) checkIfCanRun() { - file, err := os.Open("/tmp/worker.lock") - if err == nil { - - file.Close() - i.logger.WithField("err", err).Error("/tmp/worker.lock exists, not running!") - os.Exit(-11) - } - -} - -func (i *CLI) writePid() { - file, err := os.Create("/tmp/worker.pid") - if err != nil { - i.logger.WithField("err", err).Error("failed to write worker pid") - return - } - defer file.Close() - - file.WriteString(strconv.Itoa(os.Getpid())) -} - func (i *CLI) setupHeartbeat() { hbURL := i.c.String("heartbeat-url") if hbURL == "" { diff --git a/config/config.go b/config/config.go index fe6ef23d..0fb1258b 100644 --- a/config/config.go +++ b/config/config.go @@ -280,6 +280,9 @@ var ( Usage: "sample rate for trace as an inverse fraction - for sample rate n, every nth event will be sampled", Value: 1, }), + NewConfigDef("watchdog", &cli.BoolFlag{ + Usage: "execute LXD watchdog and exit", + }), } // Flags is the list of all CLI flags accepted by travis-worker diff --git a/tools/lxd_watchdog/go.mod b/tools/lxd_watchdog/go.mod deleted file mode 100644 index 00b70775..00000000 --- a/tools/lxd_watchdog/go.mod +++ /dev/null @@ -1,32 +0,0 @@ -module github.com/travis-ci/worker/lxd_watchdog - -go 1.23.2 - -require ( - github.com/canonical/lxd v0.0.0-20241105131838-efae303214c5 // indirect - github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 // indirect - github.com/go-jose/go-jose/v4 v4.0.4 // indirect - github.com/go-logr/logr v1.4.2 // indirect - github.com/go-logr/stdr v1.2.2 // indirect - github.com/google/uuid v1.6.0 // indirect - github.com/gorilla/securecookie v1.1.2 // indirect - github.com/gorilla/websocket v1.5.1 // indirect - github.com/kr/fs v0.1.0 // indirect - github.com/muhlemmer/gu v0.3.1 // indirect - github.com/pkg/sftp v1.13.7 // indirect - github.com/pkg/xattr v0.4.10 // indirect - github.com/sirupsen/logrus v1.9.3 // indirect - github.com/zitadel/logging v0.6.1 // indirect - github.com/zitadel/oidc/v3 v3.32.1 // indirect - github.com/zitadel/schema v1.3.0 // indirect - go.opentelemetry.io/otel v1.31.0 // indirect - go.opentelemetry.io/otel/metric v1.31.0 // indirect - go.opentelemetry.io/otel/trace v1.31.0 // indirect - golang.org/x/crypto v0.28.0 // indirect - golang.org/x/net v0.30.0 // indirect - golang.org/x/oauth2 v0.23.0 // indirect - golang.org/x/sys v0.26.0 // indirect - golang.org/x/term v0.25.0 // indirect - golang.org/x/text v0.19.0 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect -) diff --git a/tools/lxd_watchdog/go.sum b/tools/lxd_watchdog/go.sum deleted file mode 100644 index 303a7942..00000000 --- a/tools/lxd_watchdog/go.sum +++ /dev/null @@ -1,108 +0,0 @@ -github.com/canonical/lxd v0.0.0-20241105131838-efae303214c5 h1:86R20y0SKAljYK5h4LDubdwKnR+1BLsIjyn4lMoQU/A= -github.com/canonical/lxd v0.0.0-20241105131838-efae303214c5/go.mod h1:/lJQAYIjxVKguNpoVoiK8ppnNngrUpqj+xceYyuNt5g= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3 h1:fmFk0Wt3bBxxwZnu48jqMdaOR/IZ4vdtJFuaFV8MpIE= -github.com/flosch/pongo2 v0.0.0-20200913210552-0d938eb266f3/go.mod h1:bJWSKrZyQvfTnb2OudyUjurSG4/edverV7n82+K3JiM= -github.com/go-jose/go-jose/v4 v4.0.4 h1:VsjPI33J0SB9vQM6PLmNjoHqMQNGPiZ0rHL7Ni7Q6/E= -github.com/go-jose/go-jose/v4 v4.0.4/go.mod h1:NKb5HO1EZccyMpiZNbdUw/14tiXNyUJh188dfnMCAfc= -github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= -github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= -github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= -github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= -github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gorilla/securecookie v1.1.2 h1:YCIWL56dvtr73r6715mJs5ZvhtnY73hBvEF8kXD8ePA= -github.com/gorilla/securecookie v1.1.2/go.mod h1:NfCASbcHqRSY+3a8tlWJwsQap2VX5pwzwo4h3eOamfo= -github.com/gorilla/websocket v1.5.1 h1:gmztn0JnHVt9JZquRuzLw3g4wouNVzKL15iLr/zn/QY= -github.com/gorilla/websocket v1.5.1/go.mod h1:x3kM2JMyaluk02fnUJpQuwD2dCS5NDG2ZHL0uE0tcaY= -github.com/kr/fs v0.1.0 h1:Jskdu9ieNAYnjxsi0LbQp1ulIKZV1LAFgK1tWhpZgl8= -github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/muhlemmer/gu v0.3.1 h1:7EAqmFrW7n3hETvuAdmFmn4hS8W+z3LgKtrnow+YzNM= -github.com/muhlemmer/gu v0.3.1/go.mod h1:YHtHR+gxM+bKEIIs7Hmi9sPT3ZDUvTN/i88wQpZkrdM= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= -github.com/pkg/sftp v1.13.7 h1:uv+I3nNJvlKZIQGSr8JVQLNHFU9YhhNpvC14Y6KgmSM= -github.com/pkg/sftp v1.13.7/go.mod h1:KMKI0t3T6hfA+lTR/ssZdunHo+uwq7ghoN09/FSu3DY= -github.com/pkg/xattr v0.4.10 h1:Qe0mtiNFHQZ296vRgUjRCoPHPqH7VdTOrZx3g0T+pGA= -github.com/pkg/xattr v0.4.10/go.mod h1:di8WF84zAKk8jzR1UBTEWh9AUlIZZ7M/JNt8e9B6ktU= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -github.com/zitadel/logging v0.6.1 h1:Vyzk1rl9Kq9RCevcpX6ujUaTYFX43aa4LkvV1TvUk+Y= -github.com/zitadel/logging v0.6.1/go.mod h1:Y4CyAXHpl3Mig6JOszcV5Rqqsojj+3n7y2F591Mp/ow= -github.com/zitadel/oidc/v3 v3.32.1 h1:uE7IgQq4yJfQPXaIbvkOjOaIyb10OF1QtG1COUB/efE= -github.com/zitadel/oidc/v3 v3.32.1/go.mod h1:DyE/XClysRK/ozFaZSqlYamKVnTh4l6Ln25ihSNI03w= -github.com/zitadel/schema v1.3.0 h1:kQ9W9tvIwZICCKWcMvCEweXET1OcOyGEuFbHs4o5kg0= -github.com/zitadel/schema v1.3.0/go.mod h1:NptN6mkBDFvERUCvZHlvWmmME+gmZ44xzwRXwhzsbtc= -go.opentelemetry.io/otel v1.31.0 h1:NsJcKPIW0D0H3NgzPDHmo0WW6SptzPdqg/L1zsIm2hY= -go.opentelemetry.io/otel v1.31.0/go.mod h1:O0C14Yl9FgkjqcCZAsE053C13OaddMYr/hz6clDkEJE= -go.opentelemetry.io/otel/metric v1.31.0 h1:FSErL0ATQAmYHUIzSezZibnyVlft1ybhy4ozRPcF2fE= -go.opentelemetry.io/otel/metric v1.31.0/go.mod h1:C3dEloVbLuYoX41KpmAhOqNriGbA+qqH6PQ5E5mUfnY= -go.opentelemetry.io/otel/trace v1.31.0 h1:ffjsj1aRouKewfr85U2aGagJ46+MvodynlQ1HYdmJys= -go.opentelemetry.io/otel/trace v1.31.0/go.mod h1:TXZkRk7SM2ZQLtR6eoAWQFIHPvzQ06FJAsO1tJg480A= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= -golang.org/x/crypto v0.28.0 h1:GBDwsMXVQi34v5CCYUm2jkJvu4cbtru2U4TN2PSyQnw= -golang.org/x/crypto v0.28.0/go.mod h1:rmgy+3RHxRZMyY0jjAJShp2zgEdOqj2AO7U0pYmeQ7U= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= -golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= -golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs= -golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220408201424-a24fb2fb8a0f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= -golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= -golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= -golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= -golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= -golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= -gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/tools/lxd_watchdog/lxd_watchdog.go b/watchdog.go similarity index 82% rename from tools/lxd_watchdog/lxd_watchdog.go rename to watchdog.go index 4571ff18..b4941c11 100644 --- a/tools/lxd_watchdog/lxd_watchdog.go +++ b/watchdog.go @@ -1,4 +1,4 @@ -package main +package worker // worker watchdog - performs basic checks for worker lxd backend // runs as a single check or a loop with '-l' or '--loop' parameters @@ -15,7 +15,6 @@ package main import ( "bytes" - "errors" "fmt" "net" "net/http" @@ -43,12 +42,13 @@ type lxdWatchdog struct { networkLeasesLock sync.Mutex httpProxy, httpsProxy, ftpProxy, noProxy string + lastSleep int } func newLxdWatchdog() (*lxdWatchdog, error) { client, err := lxd.ConnectLXDUnix("", nil) if err != nil { - fmt.Printf("can't connect lxd: %v\n", err) + fmt.Printf("[LXDWATCHDOG] can't connect lxd: %v\n", err) return nil, err } @@ -142,6 +142,7 @@ func newLxdWatchdog() (*lxdWatchdog, error) { httpsProxy: httpsProxy, ftpProxy: ftpProxy, noProxy: noProxy, + lastSleep: 0, }, nil } @@ -149,13 +150,13 @@ func (p *lxdWatchdog) getInstancesIps() []string { result := []string{} instances, err := p.client.GetInstances(lxdapi.InstanceTypeAny) if err != nil { - fmt.Printf("Error on getting instances: %v\n", err) + fmt.Printf("[LXDWATCHDOG] Error on getting instances: %v\n", err) return result } for _, i := range instances { state, _, err := p.client.GetInstanceState(i.Name) if err != nil { - fmt.Printf("Error on getting instance state: %v\n", err) + fmt.Printf("[LXDWATCHDOG] Error on getting instance state: %v\n", err) return result } net := state.Network["eth0"] @@ -210,8 +211,6 @@ func (p *lxdWatchdog) allocateAddress(containerName string) (string, error) { usedIPs = append(usedIPs, reservedIps...) - fmt.Printf("usedIPs: %v\n", usedIPs) - // Find a free address for _, ip := range ips { // Skip used addresses @@ -231,7 +230,7 @@ func (p *lxdWatchdog) allocateAddress(containerName string) (string, error) { continue } - fmt.Printf("FREE ADDRESS: %v\n", ip) + fmt.Printf("[LXDWATCHDOG] FREE ADDRESS: %v\n", ip) // Allocate the address p.networkLeases[containerName] = ip size, _ := p.networkSubnet.Mask.Size() @@ -295,7 +294,7 @@ func (p *lxdWatchdog) getImage(imageName string) (lxd.ImageServer, *lxdapi.Image return p.client, image, nil } -func (p *lxdWatchdog) Start() error { +func (p *lxdWatchdog) Start(singleRun bool) error { var ( err error @@ -309,7 +308,7 @@ func (p *lxdWatchdog) Start() error { imageServer, image, err := p.getImage(imageName) if err != nil { - fmt.Printf("Error getting image: %v\n", err) + fmt.Printf("[LXDWATCHDOG] Error getting image: %v\n", err) return err } @@ -348,7 +347,7 @@ func (p *lxdWatchdog) Start() error { p.releaseAddress(containerName) } - fmt.Printf("removed preexisting container before create\n") + fmt.Printf("[LXDWATCHDOG] removed preexisting container before create\n") } // Create the container @@ -445,7 +444,7 @@ iface eth0 inet static err = p.client.CreateInstanceFile(containerName, fileName, args) if err != nil { - fmt.Printf("failed to upload resolv.conf to container: %v\n", err) + fmt.Printf("[LXDWATCHDOG] failed to upload resolv.conf to container: %v\n", err) } } @@ -482,7 +481,7 @@ iface eth0 inet static // Spawn the command _, err = p.client.ExecInstance(containerName, exec, nil) if err != nil { - fmt.Printf("couldn't add default gateway: %v\n", err) + fmt.Printf("[LXDWATCHDOG] couldn't add default gateway: %v\n", err) } } @@ -518,23 +517,21 @@ iface eth0 inet static if err == nil { break } - fmt.Printf("wait for connection\n") + fmt.Printf("[LXDWATCHDOG] wait for connection\n") time.Sleep(500 * time.Millisecond) } if err != nil { - fmt.Printf("container didn't have connectivity after 30s: %v\n", err) - err = p.killWorker() + fmt.Printf("[LXDWATCHDOG] container didn't have connectivity after 30s: %v\n", err) + err = p.killWorker(singleRun) if err != nil { fmt.Printf("kill worker error: %v\n", err) } p.datadogAlert("[TRAVIS][LXC] Watchdog error", "container didn't have connectivity after 30s") } - fmt.Printf("STARTED - OK\n") - - p.setWorkerLock(false) + fmt.Printf("[LXDWATCHDOG] STARTED - OK\n") // Get the container container, _, err = p.client.GetInstance(containerName) @@ -577,54 +574,25 @@ iface eth0 inet static p.releaseAddress(container.Name) } - fmt.Printf("CLEANUP DONE\n") + fmt.Printf("[LXDWATCHDOG] CLEANUP DONE\n") return nil } -func (p *lxdWatchdog) setWorkerLock(value bool) error { - if value { - file, err := os.Create("/tmp/worker.lock") - if err != nil { - return fmt.Errorf("can't set the worker lock, can't access the worker.lock file: %v", err) - } - defer file.Close() - _, err = file.Write([]byte{'1'}) - if err != nil { - return fmt.Errorf("can't set the worker lock, can't write the worker.lock file: %v", err) - } - } else { - err := os.Remove("/tmp/worker.lock") - if err != nil && !errors.Is(err, os.ErrNotExist) { - return fmt.Errorf("can't remove the worker lock!: %v", err) - } - if err != nil { - fmt.Printf("Skipping remove lock, doesn't exist\n") +func (p *lxdWatchdog) killWorker(singleRun bool) error { + if singleRun { + fmt.Printf("[LXDWATCHDOG] Can't reach network from LXD watchdog killing the worker") + if p.lastSleep == 0 { + p.lastSleep = 1 + } else { + p.lastSleep *= 2 } - } - return nil -} -func (p *lxdWatchdog) killWorker() error { - file, err := os.Open("/tmp/worker.pid") - if err != nil { - return fmt.Errorf("can't kill the worker, can't access the worker.pid file: %v", err) + p.setStartInterval(p.lastSleep) + os.Exit(-1) } - defer file.Close() - data := make([]byte, 64) - - var count int = 0 - count, err = file.Read(data) - if err != nil { - return fmt.Errorf("can't kill the worker, can't read the worker.pid file: %v", err) - } - pid := 0 - pid, err = strconv.Atoi(string(data[:count])) - if err != nil || pid == 0 { - return fmt.Errorf("can't kill the worker, can't read the worker.pid : %v", err) - } - p.setWorkerLock(true) - syscall.Kill(pid, syscall.SIGTERM) - fmt.Printf("Sent SIGTERM to worker process [%d]\n", pid) + pid := os.Getpid() + syscall.Kill(pid, syscall.SIGUSR2) + fmt.Printf("[LXDWATCHDOG] Sent SIGUSR2 to worker process [%d]\n", pid) return nil } @@ -642,42 +610,88 @@ func (p *lxdWatchdog) datadogAlert(title string, text string) { } r, err := http.NewRequest("POST", url, bytes.NewBufferString(content)) if err != nil { - fmt.Printf("ERROR on creating request for Datadog: %v\n", err) + fmt.Printf("[LXDWATCHDOG] ERROR on creating request for Datadog: %v\n", err) } r.Header.Add("Content-Type", "application/json") client := &http.Client{} _, err = client.Do(r) if err != nil { - fmt.Printf("ERROR on sending request to Datadog: %v\n", err) + fmt.Printf("[LXDWATCHDOG] ERROR on sending request to Datadog: %v\n", err) + } +} + +func (p *lxdWatchdog) setStartInterval(interval int) error { + file, err := os.Create("/tmp/ldx_watchdog.interval") + if err != nil { + return fmt.Errorf("can't set the worker lock, can't access the worker.lock file: %v", err) + } + defer file.Close() + _, err = file.WriteString(strconv.Itoa(interval)) + if err != nil { + return fmt.Errorf("can't set the worker lock, can't write the worker.lock file: %v", err) } + + return nil } -func main() { - args := os.Args - loop := false +func (p *lxdWatchdog) getStartInterval() int { + defaultInterval := 0 + file, err := os.Open("/tmp/ldx_watchdog.interval") + if err != nil { + return defaultInterval + } + defer file.Close() + data := make([]byte, 64) + + var count int = 0 + count, err = file.Read(data) + if err != nil { + return defaultInterval + } + interval := 0 + interval, err = strconv.Atoi(string(data[:count])) + if err != nil || interval == 0 { + return defaultInterval + } + return interval +} + +func (p *lxdWatchdog) handleSleep() { sleepTime := 60 * time.Minute - if len(args) > 1 && (args[1] == "-l" || args[1] == "--loop") { - loop = true - sleepStr := os.Getenv("WATCHDOG_INTERVAL") - if sleepStr != "" { - - t, err := strconv.Atoi(sleepStr) - if err == nil { - sleepTime = time.Duration(t) * time.Minute - } + + t := p.getStartInterval() + if t > 0 { + sleepTime = time.Duration(p.getStartInterval()) * time.Minute + p.lastSleep = t + fmt.Printf("[LXDWATCHDOG] last run was unsuccessful waiting for %d minutes before retry\n", t) + time.Sleep(sleepTime) + } +} + +func watchdogMain(loop bool) { + sleepTime := 60 * time.Minute + sleepStr := os.Getenv("WATCHDOG_INTERVAL") + if sleepStr != "" { + + t, err := strconv.Atoi(sleepStr) + if err == nil { + sleepTime = time.Duration(t) * time.Minute } } - fmt.Println("Starting LXD watchdog") + fmt.Println("[LXDWATCHDOG] Starting LXD watchdog") w, err := newLxdWatchdog() + if !loop { + w.handleSleep() + } for { if err == nil { - err = w.Start() + err = w.Start(!loop) if err != nil { - fmt.Printf("error on start: %v\n", err) + fmt.Printf("[LXDWATCHDOG] error on start: %v\n", err) } } else { - fmt.Printf("Starting LXD watchdog error: %v\n", err) + fmt.Printf("[LXDWATCHDOG] Starting LXD watchdog error: %v\n", err) } if !loop { break @@ -687,3 +701,12 @@ func main() { time.Sleep(sleepTime) } } + +func RunLXDWatchdog(loop bool) { + + if loop { + go watchdogMain(true) + } else { + watchdogMain(false) + } +} From 0ca31b256c82a7f15484cc940693c1b28f813a6b Mon Sep 17 00:00:00 2001 From: "g.arczynski" Date: Tue, 10 Dec 2024 10:32:40 +0100 Subject: [PATCH 04/10] pause/resume --- Makefile | 8 +------- cli.go | 19 ++++++++++++++++++- config/config.go | 5 +++++ processor.go | 7 +++++++ remote_controller.go | 39 +++++++++++++++++++++++++++++++++++---- step_start_instance.go | 3 +++ watchdog.go | 4 ++-- 7 files changed, 71 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 2bf455c2..de8cde62 100644 --- a/Makefile +++ b/Makefile @@ -72,15 +72,9 @@ coverage.coverprofile: $(COVERPROFILES) $(GO) tool cover -func=$@ .PHONY: build -build: .deps-fetched watchdog +build: .deps-fetched $(GO) install -tags netgo -ldflags "$(GOBUILD_LDFLAGS)" $(ALL_PACKAGES) -.PHONY: watchdog -watchdog: - pushd tools/lxd_watchdog && \ - $(GO) install . && \ - popd - .PHONY: crossbuild crossbuild: .deps-fetched $(CROSSBUILD_BINARIES) diff --git a/cli.go b/cli.go index 479c67cb..2f8a5be0 100644 --- a/cli.go +++ b/cli.go @@ -69,6 +69,8 @@ type CLI struct { heartbeatErrSleep time.Duration heartbeatSleep time.Duration + + lastPoolSize int } // NewCLI creates a new *CLI from a *cli.Context @@ -561,7 +563,7 @@ func (i *CLI) signalHandler() { signal.Notify(signalChan, syscall.SIGTERM, syscall.SIGINT, syscall.SIGUSR1, syscall.SIGTTIN, syscall.SIGTTOU, - syscall.SIGUSR2) + syscall.SIGUSR2, syscall.Signal(0x22), syscall.Signal(0x23)) for { select { @@ -580,8 +582,23 @@ func (i *CLI) signalHandler() { i.logger.Info("SIGTTOU received, removing processor from pool") i.ProcessorPool.Decr() case syscall.SIGUSR2: + i.lastPoolSize = i.ProcessorPool.Size() i.logger.Warn("SIGUSR2 received, toggling graceful shutdown and pause") i.ProcessorPool.GracefulShutdown(true) + case syscall.Signal(0x22): //SIGRTMIN + i.lastPoolSize = i.ProcessorPool.Size() + i.logger.Warn("SIGRTMIN received, pause processing") + i.ProcessorPool.SetSize(0) + case syscall.Signal(0x23): //SIGRTMIN + 1 + i.logProcessorInfo("received SIGRTMIN+1, resuming processor pool") + if i.lastPoolSize == 0 { + if i.ProcessorPool.Size() > 0 { + i.lastPoolSize = i.ProcessorPool.Size() + } else { + i.lastPoolSize = 1 + } + } + i.ProcessorPool.SetSize(i.lastPoolSize) case syscall.SIGUSR1: i.logProcessorInfo("received SIGUSR1") default: diff --git a/config/config.go b/config/config.go index 0fb1258b..fc377d63 100644 --- a/config/config.go +++ b/config/config.go @@ -283,6 +283,10 @@ var ( NewConfigDef("watchdog", &cli.BoolFlag{ Usage: "execute LXD watchdog and exit", }), + NewConfigDef("MaxRequeues", &cli.IntFlag{ + Usage: "Max requeue count after Worker pauses", + Value: 0, + }), } // Flags is the list of all CLI flags accepted by travis-worker @@ -446,6 +450,7 @@ type Config struct { StackdriverProjectID string `config:"stackdriver-project-id"` OpencensusTracingEnabled bool `config:"opencensus-tracing-enabled"` OpencensusSamplingRate int `config:"opencensus-sampling-rate"` + MaxRequeues int `config:"max-requeues"` ProviderConfig *ProviderConfig } diff --git a/processor.go b/processor.go index c2007fa5..e0e0495d 100644 --- a/processor.go +++ b/processor.go @@ -278,6 +278,13 @@ func (p *Processor) process(ctx gocontext.Context, buildJob Job) { } if buildJob.Requeued() { fields["requeued"] = 1 + + requeueCount, _ := state.Get("requeueCount").(int) + + if p.config.MaxRequeues > 0 && requeueCount > p.config.MaxRequeues { + logger.WithFields(fields).Info("too many requeues, shutting down") + p.GracefulShutdown() + } } logger.WithFields(fields).Info("finished job") diff --git a/remote_controller.go b/remote_controller.go index 6f7c678e..88ec84f0 100644 --- a/remote_controller.go +++ b/remote_controller.go @@ -15,10 +15,11 @@ import ( // RemoteController provides an HTTP API for controlling worker. type RemoteController struct { - pool *ProcessorPool - auth string - workerInfo func() workerInfo - cancel func() + pool *ProcessorPool + auth string + workerInfo func() workerInfo + cancel func() + lastPoolSize int } // Setup installs the HTTP routes that will handle requests to the HTTP API. @@ -37,6 +38,9 @@ func (api *RemoteController) Setup() { r.HandleFunc("/pool/increment", api.IncrementPool).Methods("POST") r.HandleFunc("/pool/decrement", api.DecrementPool).Methods("POST") + r.HandleFunc("/pause", api.Pause).Methods("POST") + r.HandleFunc("/resume", api.Resume).Methods("POST") + r.Use(api.SetContext) r.Use(api.CheckAuth) http.Handle("/", r) @@ -179,6 +183,33 @@ func (api *RemoteController) ShutdownWorker(w http.ResponseWriter, req *http.Req w.WriteHeader(http.StatusNoContent) } +// IncrementPool tells the worker to spin up another processor. +func (api *RemoteController) Pause(w http.ResponseWriter, req *http.Request) { + log := context.LoggerFromContext(req.Context()).WithField("method", "Pause") + + api.lastPoolSize = api.pool.Size() + api.pool.SetSize(0) + log.Info("pool size set to 0") + + w.WriteHeader(http.StatusNoContent) +} + +// IncrementPool tells the worker to spin up another processor. +func (api *RemoteController) Resume(w http.ResponseWriter, req *http.Request) { + log := context.LoggerFromContext(req.Context()).WithField("method", "Resume") + if api.lastPoolSize == 0 { + if api.pool.Size() > 0 { + api.lastPoolSize = api.pool.Size() + } else { + api.lastPoolSize = 1 + } + } + api.pool.SetSize(api.lastPoolSize) + log.Info("pool size set to " + fmt.Sprintf("%d", api.lastPoolSize)) + + w.WriteHeader(http.StatusNoContent) +} + // IncrementPool tells the worker to spin up another processor. func (api *RemoteController) IncrementPool(w http.ResponseWriter, req *http.Request) { log := context.LoggerFromContext(req.Context()).WithField("method", "IncrementPool") diff --git a/step_start_instance.go b/step_start_instance.go index 554b6546..5bbb6b03 100644 --- a/step_start_instance.go +++ b/step_start_instance.go @@ -95,6 +95,9 @@ func (s *stepStartInstance) Run(state multistep.StateBag) multistep.StepAction { }).Error("couldn't start instance, attempting requeue") context.CaptureError(ctx, err) + requeueCount,_ := state.Get("requeueCount").(int) + state.Put("requeueCount", requeueCount + 1) + err := buildJob.Requeue(preTimeoutCtx) if err != nil { logger.WithField("err", err).Error("couldn't requeue job") diff --git a/watchdog.go b/watchdog.go index b4941c11..f4d1f4de 100644 --- a/watchdog.go +++ b/watchdog.go @@ -591,8 +591,8 @@ func (p *lxdWatchdog) killWorker(singleRun bool) error { os.Exit(-1) } pid := os.Getpid() - syscall.Kill(pid, syscall.SIGUSR2) - fmt.Printf("[LXDWATCHDOG] Sent SIGUSR2 to worker process [%d]\n", pid) + syscall.Kill(pid, syscall.SIGINT) + fmt.Printf("[LXDWATCHDOG] Sent SIGINT to worker process [%d]\n", pid) return nil } From a15e640aebc734a45a82bcd7fbba9aa92b77a577 Mon Sep 17 00:00:00 2001 From: GbArc Date: Tue, 10 Dec 2024 11:02:51 +0100 Subject: [PATCH 05/10] lint --- step_start_instance.go | 4 ++-- watchdog.go | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/step_start_instance.go b/step_start_instance.go index 5bbb6b03..e52bf099 100644 --- a/step_start_instance.go +++ b/step_start_instance.go @@ -95,8 +95,8 @@ func (s *stepStartInstance) Run(state multistep.StateBag) multistep.StepAction { }).Error("couldn't start instance, attempting requeue") context.CaptureError(ctx, err) - requeueCount,_ := state.Get("requeueCount").(int) - state.Put("requeueCount", requeueCount + 1) + requeueCount, _ := state.Get("requeueCount").(int) + state.Put("requeueCount", requeueCount+1) err := buildJob.Requeue(preTimeoutCtx) if err != nil { diff --git a/watchdog.go b/watchdog.go index f4d1f4de..614ce20d 100644 --- a/watchdog.go +++ b/watchdog.go @@ -587,12 +587,15 @@ func (p *lxdWatchdog) killWorker(singleRun bool) error { p.lastSleep *= 2 } - p.setStartInterval(p.lastSleep) + _ = p.setStartInterval(p.lastSleep) os.Exit(-1) } pid := os.Getpid() - syscall.Kill(pid, syscall.SIGINT) - fmt.Printf("[LXDWATCHDOG] Sent SIGINT to worker process [%d]\n", pid) + if syscall.Kill(pid, syscall.SIGINT) != nil { + fmt.Printf("[LXDWATCHDOG] Couldn't send SIGINT to worker process [%d]\n", pid) + } else { + fmt.Printf("[LXDWATCHDOG] Sent SIGINT to worker process [%d]\n", pid) + } return nil } @@ -644,7 +647,7 @@ func (p *lxdWatchdog) getStartInterval() int { defer file.Close() data := make([]byte, 64) - var count int = 0 + var count int count, err = file.Read(data) if err != nil { return defaultInterval @@ -658,11 +661,9 @@ func (p *lxdWatchdog) getStartInterval() int { } func (p *lxdWatchdog) handleSleep() { - sleepTime := 60 * time.Minute - t := p.getStartInterval() if t > 0 { - sleepTime = time.Duration(p.getStartInterval()) * time.Minute + sleepTime := time.Duration(p.getStartInterval()) * time.Minute p.lastSleep = t fmt.Printf("[LXDWATCHDOG] last run was unsuccessful waiting for %d minutes before retry\n", t) time.Sleep(sleepTime) From 0c29a03f7f3c1aaca3cf986981995e62855ad2d5 Mon Sep 17 00:00:00 2001 From: GbArc Date: Thu, 12 Dec 2024 09:55:54 +0100 Subject: [PATCH 06/10] envs fix --- watchdog.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/watchdog.go b/watchdog.go index 614ce20d..19265a98 100644 --- a/watchdog.go +++ b/watchdog.go @@ -58,8 +58,8 @@ func newLxdWatchdog() (*lxdWatchdog, error) { var networkSubnet *net.IPNet var networkLeases map[string]string - if os.Getenv("NETWORK_STATIC") != "" { - networkStatic = os.Getenv("NETWORK_STATIC") == "true" + if os.Getenv("TRAVIS_WORKER_LXD_NETWORK_STATIC") != "" { + networkStatic = os.Getenv("TRAVIS_WORKER_LXD_NETWORK_STATIC") == "true" network, _, err := client.GetNetwork("lxdbr0") if err != nil { @@ -112,14 +112,14 @@ func newLxdWatchdog() (*lxdWatchdog, error) { } networkDNS := []string{"1.1.1.1", "1.0.0.1"} - if os.Getenv("NETWORK_DNS") != "" { - networkDNS = strings.Split(os.Getenv("NETWORK_DNS"), ",") + if os.Getenv("TRAVIS_WORKER_LXD_NETWORK_DNS") != "" { + networkDNS = strings.Split(os.Getenv("TRAVIS_WORKER_LXD_NETWORK_DNS"), ",") } - httpProxy := os.Getenv("HTTP_PROXY") - httpsProxy := os.Getenv("HTTPS_PROXY") - ftpProxy := os.Getenv("FTP_PROXY") - noProxy := os.Getenv("NO_PROXY") + httpProxy := os.Getenv("TRAVIS_WORKER_LXD_HTTP_PROXY") + httpsProxy := os.Getenv("TRAVIS_WORKER_LXD_HTTPS_PROXY") + ftpProxy := os.Getenv("TRAVIS_WORKER_LXD_FTP_PROXY") + noProxy := os.Getenv("TRAVIS_WORKER_LXD_NO_PROXY") url := "www.google.com" if os.Getenv("WATCHDOG_PING_URL") != "" { From 4e6e17563683d829e6ffadfb601a8651ce8aca9f Mon Sep 17 00:00:00 2001 From: GbArc Date: Thu, 12 Dec 2024 13:08:22 +0100 Subject: [PATCH 07/10] gateway --- watchdog.go | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/watchdog.go b/watchdog.go index 19265a98..f559091d 100644 --- a/watchdog.go +++ b/watchdog.go @@ -473,20 +473,17 @@ iface eth0 inet static fmt.Printf("STARTED - check connection\n") - if p.networkStatic { - exec := lxdapi.InstanceExecPost{ - Command: []string{"route", "add", "default", "gw", "_gateway.lxd"}, - } - - // Spawn the command - _, err = p.client.ExecInstance(containerName, exec, nil) - if err != nil { - fmt.Printf("[LXDWATCHDOG] couldn't add default gateway: %v\n", err) - } - } - // Wait for connectivity connectivityCheck := func() error { + if p.networkStatic { + exec := lxdapi.InstanceExecPost{ + Command: []string{"route", "add", "default", "gw", p.networkGateway}, + } + _, err = p.client.ExecInstance(containerName, exec, nil) + if err != nil { + fmt.Printf("[LXDWATCHDOG] couldn't add default gateway: %v\n", err) + } + } exec := lxdapi.InstanceExecPost{ Command: []string{"ping", p.url, "-c", "1"}, } From f1dec4ef43d66e9bd1a8aff64a583b65129ad589 Mon Sep 17 00:00:00 2001 From: GbArc Date: Thu, 12 Dec 2024 14:48:01 +0100 Subject: [PATCH 08/10] interval reset --- watchdog.go | 1 + 1 file changed, 1 insertion(+) diff --git a/watchdog.go b/watchdog.go index f559091d..cf85b2b0 100644 --- a/watchdog.go +++ b/watchdog.go @@ -571,6 +571,7 @@ iface eth0 inet static p.releaseAddress(container.Name) } + _ = p.setStartInterval(1) fmt.Printf("[LXDWATCHDOG] CLEANUP DONE\n") return nil } From f230d6c6a403af6154a90f129c5f7ca211bcabcf Mon Sep 17 00:00:00 2001 From: GbArc Date: Thu, 12 Dec 2024 16:01:11 +0100 Subject: [PATCH 09/10] additional timeout check --- watchdog.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/watchdog.go b/watchdog.go index cf85b2b0..1c6b964c 100644 --- a/watchdog.go +++ b/watchdog.go @@ -507,6 +507,7 @@ iface eth0 inet static return nil } + testStartTime := time.Now().Unix() // Wait 30s for network time.Sleep(1 * time.Second) for i := 0; i < 60; i++ { @@ -517,6 +518,13 @@ iface eth0 inet static fmt.Printf("[LXDWATCHDOG] wait for connection\n") time.Sleep(500 * time.Millisecond) + + testCurrentTime := time.Now().Unix() + if testCurrentTime - testStartTime > 30 { + fmt.Printf("[LXDWATCHDOG] timeout while waiting for connection\n") + err = fmt.Errorf("connection test timeout") + break + } } if err != nil { From b35e79ab188e53aca7e444e7784e8b6bc59e89df Mon Sep 17 00:00:00 2001 From: GbArc Date: Thu, 12 Dec 2024 16:32:33 +0100 Subject: [PATCH 10/10] ping limit to 5s --- watchdog.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/watchdog.go b/watchdog.go index 1c6b964c..3085eaa9 100644 --- a/watchdog.go +++ b/watchdog.go @@ -485,7 +485,7 @@ iface eth0 inet static } } exec := lxdapi.InstanceExecPost{ - Command: []string{"ping", p.url, "-c", "1"}, + Command: []string{"ping", p.url, "-c", "1", "-w", "5"}, } // Spawn the command @@ -525,6 +525,7 @@ iface eth0 inet static err = fmt.Errorf("connection test timeout") break } + fmt.Printf("[LXDWATCHDOG] test running for %ds\n", testCurrentTime - testStartTime) } if err != nil {