7e71986493
Today each storagenode should have a port which is open to the internet and handles DRPC protocol calls. When we make an HTTP call on the DRPC endpoint, it hangs until a timeout. This patch changes the behavior: the main DRPC port of the storagenodes can accept HTTP requests and can be used to monitor the status of the node: * it returns HTTP 200 only if the storagenode is healthy (not suspended / disqualified + online score >= 0.9) * it CAN include information about the current status (per satellite). It's opt-in, you should configure it so. In this way it becomes extremely easy to monitor storagenodes with external uptime services. Note: this patch exposes some information which was not easily available before (especially the node status, and used satellites). I think it should be acceptable: * Until having more community satellites, all storagenodes are connected to the main Storj satellites. * With community satellites, it's a good thing to have more transparency (easy way to check who is connected to which satellites) The implementation is based on this line: ``` http.Serve(NewPrefixedListener([]byte("GET / HT"), publicMux.Route("GET / HT")), p.public.http) ``` This line answers the TCP requests that start with `GET / HT...` (an HTTP GET request to the root path), but puts back the removed prefix. Change-Id: I3700c7e24524850825ecdf75a4bcc3b4afcb3a74
89 lines
1.9 KiB
Go
89 lines
1.9 KiB
Go
// Copyright (C) 2022 Storj Labs, Inc.
|
|
// See LICENSE for copying information.
|
|
|
|
package healthcheck
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
|
|
"github.com/spacemonkeygo/monkit/v3"
|
|
"github.com/zeebo/errs"
|
|
|
|
"storj.io/common/storj"
|
|
"storj.io/storj/storagenode/reputation"
|
|
)
|
|
|
|
var (
|
|
// Err defines sno service error.
|
|
Err = errs.Class("healthcheck")
|
|
|
|
mon = monkit.Package()
|
|
)
|
|
|
|
// Service is handling storage node estimation payouts logic.
|
|
//
|
|
// architecture: Service
|
|
type Service struct {
|
|
reputationDB reputation.DB
|
|
serveDetails bool
|
|
}
|
|
|
|
// NewService returns new instance of Service.
|
|
func NewService(reputationDB reputation.DB, serveDetails bool) *Service {
|
|
return &Service{
|
|
reputationDB: reputationDB,
|
|
serveDetails: serveDetails,
|
|
}
|
|
}
|
|
|
|
// Health represents the current status of the Storage ndoe.
|
|
type Health struct {
|
|
Statuses []SatelliteHealthStatus
|
|
Help string
|
|
AllHealthy bool
|
|
}
|
|
|
|
// SatelliteHealthStatus is the health status reported by one satellite.
|
|
type SatelliteHealthStatus struct {
|
|
OnlineScore float64
|
|
SatelliteID storj.NodeID
|
|
DisqualifiedAt *time.Time
|
|
SuspendedAt *time.Time
|
|
}
|
|
|
|
// GetHealth retrieves current health status based on DB records.
|
|
func (s *Service) GetHealth(ctx context.Context) (h Health, err error) {
|
|
defer mon.Task()(&ctx)(&err)
|
|
stats, err := s.reputationDB.All(ctx)
|
|
|
|
h.AllHealthy = true
|
|
|
|
if err != nil {
|
|
return h, Err.Wrap(err)
|
|
}
|
|
for _, stat := range stats {
|
|
if stat.DisqualifiedAt != nil || stat.SuspendedAt != nil || stat.OnlineScore < 0.9 {
|
|
h.AllHealthy = false
|
|
}
|
|
|
|
if s.serveDetails {
|
|
h.Statuses = append(h.Statuses, SatelliteHealthStatus{
|
|
SatelliteID: stat.SatelliteID,
|
|
OnlineScore: stat.OnlineScore,
|
|
DisqualifiedAt: stat.DisqualifiedAt,
|
|
SuspendedAt: stat.SuspendedAt,
|
|
})
|
|
}
|
|
}
|
|
|
|
// sg is wrong if we didn't connect to any satellite
|
|
if len(stats) == 0 {
|
|
h.AllHealthy = false
|
|
}
|
|
|
|
h.Help = "To access Storagenode services, please use DRPC protocol!"
|
|
|
|
return h, nil
|
|
}
|