2019-01-24 20:15:10 +00:00
|
|
|
// Copyright (C) 2019 Storj Labs, Inc.
|
2018-10-10 19:25:46 +01:00
|
|
|
// See LICENSE for copying information.
|
|
|
|
|
|
|
|
package audit
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2018-10-16 21:02:18 +01:00
|
|
|
"time"
|
|
|
|
|
2019-05-17 19:48:32 +01:00
|
|
|
"github.com/zeebo/errs"
|
2018-10-16 21:02:18 +01:00
|
|
|
"go.uber.org/zap"
|
2018-10-10 19:25:46 +01:00
|
|
|
|
2019-03-19 17:37:26 +00:00
|
|
|
"storj.io/storj/internal/memory"
|
|
|
|
"storj.io/storj/internal/sync2"
|
2019-01-30 20:47:21 +00:00
|
|
|
"storj.io/storj/pkg/identity"
|
2019-05-27 12:13:47 +01:00
|
|
|
"storj.io/storj/pkg/storj"
|
2018-10-10 19:25:46 +01:00
|
|
|
"storj.io/storj/pkg/transport"
|
2019-04-25 09:46:32 +01:00
|
|
|
"storj.io/storj/satellite/metainfo"
|
2019-03-28 20:09:23 +00:00
|
|
|
"storj.io/storj/satellite/orders"
|
2019-07-28 06:55:36 +01:00
|
|
|
"storj.io/storj/satellite/overlay"
|
2018-10-10 19:25:46 +01:00
|
|
|
)
|
|
|
|
|
2019-05-17 19:48:32 +01:00
|
|
|
// Error is the default audit errs class
|
|
|
|
var Error = errs.Class("audit error")
|
|
|
|
|
2019-01-23 19:58:44 +00:00
|
|
|
// Config contains configurable values for audit service
|
|
|
|
type Config struct {
|
2019-06-03 10:17:09 +01:00
|
|
|
MaxRetriesStatDB int `help:"max number of times to attempt updating a statdb batch" default:"3"`
|
|
|
|
Interval time.Duration `help:"how frequently segments are audited" default:"30s"`
|
|
|
|
MinBytesPerSecond memory.Size `help:"the minimum acceptable bytes that storage nodes can transfer per second to the satellite" default:"128B"`
|
2019-06-14 12:53:49 +01:00
|
|
|
MinDownloadTimeout time.Duration `help:"the minimum duration for downloading a share from storage nodes before timing out" default:"25s"`
|
2019-06-03 10:17:09 +01:00
|
|
|
MaxReverifyCount int `help:"limit above which we consider an audit is failed" default:"3"`
|
2019-01-23 19:58:44 +00:00
|
|
|
}
|
|
|
|
|
2018-10-10 19:25:46 +01:00
|
|
|
// Service helps coordinate Cursor and Verifier to run the audit process continuously
|
|
|
|
type Service struct {
|
2019-02-01 17:28:40 +00:00
|
|
|
log *zap.Logger
|
|
|
|
|
2018-10-10 19:25:46 +01:00
|
|
|
Cursor *Cursor
|
|
|
|
Verifier *Verifier
|
2018-10-16 18:40:34 +01:00
|
|
|
Reporter reporter
|
2019-02-01 17:28:40 +00:00
|
|
|
|
2019-03-19 17:37:26 +00:00
|
|
|
Loop sync2.Cycle
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
|
2018-10-10 19:25:46 +01:00
|
|
|
// NewService instantiates a Service with access to a Cursor and Verifier
|
2019-04-25 09:46:32 +01:00
|
|
|
func NewService(log *zap.Logger, config Config, metainfo *metainfo.Service,
|
2019-08-06 17:35:59 +01:00
|
|
|
orders *orders.Service, transport transport.Client, overlay *overlay.Service,
|
2019-06-06 13:55:06 +01:00
|
|
|
containment Containment, identity *identity.FullIdentity) (*Service, error) {
|
2018-11-01 14:03:45 +00:00
|
|
|
return &Service{
|
2019-02-01 17:28:40 +00:00
|
|
|
log: log,
|
2019-03-18 10:55:06 +00:00
|
|
|
|
2019-04-25 09:46:32 +01:00
|
|
|
Cursor: NewCursor(metainfo),
|
2019-06-19 10:02:25 +01:00
|
|
|
Verifier: NewVerifier(log.Named("audit:verifier"), metainfo, transport, overlay, containment, orders, identity, config.MinBytesPerSecond, config.MinDownloadTimeout),
|
2019-06-07 13:38:41 +01:00
|
|
|
Reporter: NewReporter(log.Named("audit:reporter"), overlay, containment, config.MaxRetriesStatDB, int32(config.MaxReverifyCount)),
|
2019-02-01 17:28:40 +00:00
|
|
|
|
2019-03-19 17:37:26 +00:00
|
|
|
Loop: *sync2.NewCycle(config.Interval),
|
2018-10-16 21:02:18 +01:00
|
|
|
}, nil
|
2018-10-10 19:25:46 +01:00
|
|
|
}
|
|
|
|
|
2018-11-01 14:03:45 +00:00
|
|
|
// Run runs auditing service
|
|
|
|
func (service *Service) Run(ctx context.Context) (err error) {
|
2018-10-16 21:02:18 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-01-10 16:35:18 +00:00
|
|
|
service.log.Info("Audit cron is starting up")
|
2019-03-18 10:55:06 +00:00
|
|
|
|
2019-03-19 17:37:26 +00:00
|
|
|
return service.Loop.Run(ctx, func(ctx context.Context) error {
|
2018-11-01 14:03:45 +00:00
|
|
|
err := service.process(ctx)
|
|
|
|
if err != nil {
|
2019-01-10 16:35:18 +00:00
|
|
|
service.log.Error("process", zap.Error(err))
|
2018-11-01 14:03:45 +00:00
|
|
|
}
|
2019-04-01 10:16:17 +01:00
|
|
|
return nil
|
2019-03-19 17:37:26 +00:00
|
|
|
})
|
|
|
|
}
|
2018-11-01 14:03:45 +00:00
|
|
|
|
2019-03-19 17:37:26 +00:00
|
|
|
// Close halts the audit loop
|
|
|
|
func (service *Service) Close() error {
|
|
|
|
service.Loop.Close()
|
|
|
|
return nil
|
2018-11-01 14:03:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// process picks a random stripe and verifies correctness
|
2019-06-04 12:36:27 +01:00
|
|
|
func (service *Service) process(ctx context.Context) (err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-05-01 19:59:30 +01:00
|
|
|
var stripe *Stripe
|
|
|
|
for {
|
|
|
|
s, more, err := service.Cursor.NextStripe(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if s != nil {
|
|
|
|
stripe = s
|
|
|
|
break
|
|
|
|
}
|
|
|
|
if !more {
|
|
|
|
return nil
|
|
|
|
}
|
2018-11-01 14:03:45 +00:00
|
|
|
}
|
|
|
|
|
2019-05-27 12:13:47 +01:00
|
|
|
var errlist errs.Group
|
|
|
|
|
|
|
|
report, err := service.Verifier.Reverify(ctx, stripe)
|
|
|
|
if err != nil {
|
|
|
|
errlist.Add(err)
|
2018-11-01 14:03:45 +00:00
|
|
|
}
|
|
|
|
|
2018-12-19 18:44:03 +00:00
|
|
|
// TODO(moby) we need to decide if we want to do something with nodes that the reporter failed to update
|
2019-05-27 12:13:47 +01:00
|
|
|
_, err = service.Reporter.RecordAudits(ctx, report)
|
|
|
|
if err != nil {
|
|
|
|
errlist.Add(err)
|
2018-11-01 14:03:45 +00:00
|
|
|
}
|
2018-10-16 21:02:18 +01:00
|
|
|
|
2019-05-27 12:13:47 +01:00
|
|
|
// skip all reverified nodes in the next Verify step
|
|
|
|
skip := make(map[storj.NodeID]bool)
|
|
|
|
if report != nil {
|
|
|
|
for _, nodeID := range report.Successes {
|
|
|
|
skip[nodeID] = true
|
|
|
|
}
|
|
|
|
for _, nodeID := range report.Offlines {
|
|
|
|
skip[nodeID] = true
|
|
|
|
}
|
|
|
|
for _, nodeID := range report.Fails {
|
|
|
|
skip[nodeID] = true
|
|
|
|
}
|
|
|
|
for _, pending := range report.PendingAudits {
|
|
|
|
skip[pending.NodeID] = true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
report, err = service.Verifier.Verify(ctx, stripe, skip)
|
|
|
|
if err != nil {
|
|
|
|
errlist.Add(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO(moby) we need to decide if we want to do something with nodes that the reporter failed to update
|
|
|
|
_, err = service.Reporter.RecordAudits(ctx, report)
|
|
|
|
if err != nil {
|
|
|
|
errlist.Add(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return errlist.Err()
|
2018-10-10 19:25:46 +01:00
|
|
|
}
|