Commit 48059a6c authored by Miek Gieben's avatar Miek Gieben Committed by GitHub

Overloaded (#1364)

* plugin/health: add 'overloaded metrics'

Query our own health endpoint and record (and export as a metric) the
time it takes. The Get has a 5s timeout that, when reached, will set
the metric duration to 5s. The actual call on "am I overloaded?" is left
to an external entity.

* README

* golint and govet

* and the tests
parent cced1a4c
...@@ -25,6 +25,14 @@ supports health checks has a section "Health" in their README. ...@@ -25,6 +25,14 @@ supports health checks has a section "Health" in their README.
Any plugin that implements the Healther interface will be used to report health. Any plugin that implements the Healther interface will be used to report health.
## Metrics
If monitoring is enabled (via the *prometheus* directive) then the following metric is exported:
* `coredns_health_request_duration_seconds{}` - duration to process a /health query. As this should
be a local operation it should be fast. A (large) increase in this duration indicates the
CoreDNS process is having trouble keeping up.
## Examples ## Examples
Run another health endpoint on http://localhost:8091. Run another health endpoint on http://localhost:8091.
......
...@@ -21,9 +21,11 @@ type health struct { ...@@ -21,9 +21,11 @@ type health struct {
h []Healther h []Healther
sync.RWMutex sync.RWMutex
ok bool // ok is the global boolean indicating an all healthy plugin stack ok bool // ok is the global boolean indicating an all healthy plugin stack
stop chan bool
} }
func (h *health) Startup() error { func (h *health) OnStartup() error {
if h.Addr == "" { if h.Addr == "" {
h.Addr = defAddr h.Addr = defAddr
} }
...@@ -51,14 +53,20 @@ func (h *health) Startup() error { ...@@ -51,14 +53,20 @@ func (h *health) Startup() error {
go func() { go func() {
http.Serve(h.ln, h.mux) http.Serve(h.ln, h.mux)
}() }()
go func() {
h.overloaded()
}()
}) })
return nil return nil
} }
func (h *health) Shutdown() error { func (h *health) OnShutdown() error {
if h.ln != nil { if h.ln != nil {
return h.ln.Close() return h.ln.Close()
} }
h.stop <- true
return nil return nil
} }
......
...@@ -13,10 +13,10 @@ func TestHealth(t *testing.T) { ...@@ -13,10 +13,10 @@ func TestHealth(t *testing.T) {
h := health{Addr: ":0"} h := health{Addr: ":0"}
h.h = append(h.h, &erratic.Erratic{}) h.h = append(h.h, &erratic.Erratic{})
if err := h.Startup(); err != nil { if err := h.OnStartup(); err != nil {
t.Fatalf("Unable to startup the health server: %v", err) t.Fatalf("Unable to startup the health server: %v", err)
} }
defer h.Shutdown() defer h.OnShutdown()
// Reconstruct the http address based on the port allocated by operating system. // Reconstruct the http address based on the port allocated by operating system.
address := fmt.Sprintf("http://%s%s", h.ln.Addr().String(), path) address := fmt.Sprintf("http://%s%s", h.ln.Addr().String(), path)
......
package health
import (
"net/http"
"sync"
"time"
"github.com/coredns/coredns/plugin"
"github.com/prometheus/client_golang/prometheus"
)
// overloaded periodically queries our own health endpoint and records how long
// each query took in the HealthDuration metric.
//
// Once a second it issues an HTTP GET for the /health path on h.Addr, using a
// client with a 5 second timeout. When the GET returns an error (which includes
// hitting the timeout) the full timeout is recorded as the duration; otherwise
// the elapsed time of the request is recorded. The loop runs until a value is
// received on h.stop, so it is meant to be started in its own goroutine.
func (h *health) overloaded() {
	timeout := 5 * time.Second
	client := http.Client{
		Timeout: timeout,
	}

	// Ask for the health path explicitly: a bare "http://addr" requests "/",
	// which is not the endpoint this metric is documented to measure.
	// NOTE(review): the package appears to define this path as a constant
	// (the tests build their URL from `path`); prefer that constant here if
	// it is in scope — confirm.
	url := "http://" + h.Addr + "/health"

	tick := time.NewTicker(1 * time.Second)
	// Release the ticker on every exit path.
	defer tick.Stop()

	for {
		select {
		case <-tick.C:
			start := time.Now()
			resp, err := client.Get(url)
			if err != nil {
				// Any failure is reported as the worst case, i.e. the timeout.
				HealthDuration.Observe(timeout.Seconds())
				continue
			}
			// Only the round-trip time matters; the body is not inspected.
			resp.Body.Close()
			HealthDuration.Observe(time.Since(start).Seconds())
		case <-h.stop:
			return
		}
	}
}
// Package-level metric state for the health plugin.
var (
	// HealthDuration is the exported histogram that records, in seconds, how
	// long it took to retrieve the /health endpoint.
	HealthDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Help:      "Histogram of the time (in seconds) each request took.",
		Name:      "request_duration_seconds",
		Subsystem: "health",
		Namespace: plugin.Namespace,
		Buckets:   plugin.TimeBuckets,
	})

	// onceMetric ensures the one-time metric setup is executed at most once.
	onceMetric sync.Once
)
...@@ -6,6 +6,7 @@ import ( ...@@ -6,6 +6,7 @@ import (
"github.com/coredns/coredns/core/dnsserver" "github.com/coredns/coredns/core/dnsserver"
"github.com/coredns/coredns/plugin" "github.com/coredns/coredns/plugin"
"github.com/coredns/coredns/plugin/metrics"
"github.com/mholt/caddy" "github.com/mholt/caddy"
) )
...@@ -23,7 +24,7 @@ func setup(c *caddy.Controller) error { ...@@ -23,7 +24,7 @@ func setup(c *caddy.Controller) error {
return plugin.Error("health", err) return plugin.Error("health", err)
} }
h := &health{Addr: addr} h := &health{Addr: addr, stop: make(chan bool)}
c.OnStartup(func() error { c.OnStartup(func() error {
plugins := dnsserver.GetConfig(c).Handlers() plugins := dnsserver.GetConfig(c).Handlers()
...@@ -36,6 +37,7 @@ func setup(c *caddy.Controller) error { ...@@ -36,6 +37,7 @@ func setup(c *caddy.Controller) error {
}) })
c.OnStartup(func() error { c.OnStartup(func() error {
// Poll all middleware every second.
h.poll() h.poll()
go func() { go func() {
for { for {
...@@ -46,8 +48,21 @@ func setup(c *caddy.Controller) error { ...@@ -46,8 +48,21 @@ func setup(c *caddy.Controller) error {
return nil return nil
}) })
c.OnStartup(h.Startup) c.OnStartup(func() error {
c.OnFinalShutdown(h.Shutdown) onceMetric.Do(func() {
m := dnsserver.GetConfig(c).Handler("prometheus")
if m == nil {
return
}
if x, ok := m.(*metrics.Metrics); ok {
x.MustRegister(HealthDuration)
}
})
return nil
})
c.OnStartup(h.OnStartup)
c.OnFinalShutdown(h.OnShutdown)
// Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running. // Don't do AddPlugin, as health is not *really* a plugin just a separate webserver running.
return nil return nil
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment