Run repairer and checker early (#565)
* Run repairers, checker, auditors first time they run to detect potential setup problems. * Fix error handling in audit.Service
This commit is contained in:
parent
1d3367bb09
commit
2a8b681c4d
@ -13,7 +13,6 @@ import (
|
|||||||
"storj.io/storj/pkg/pointerdb/pdbclient"
|
"storj.io/storj/pkg/pointerdb/pdbclient"
|
||||||
"storj.io/storj/pkg/provider"
|
"storj.io/storj/pkg/provider"
|
||||||
"storj.io/storj/pkg/transport"
|
"storj.io/storj/pkg/transport"
|
||||||
"storj.io/storj/pkg/utils"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// Service helps coordinate Cursor and Verifier to run the audit process continuously
|
// Service helps coordinate Cursor and Verifier to run the audit process continuously
|
||||||
@ -21,7 +20,7 @@ type Service struct {
|
|||||||
Cursor *Cursor
|
Cursor *Cursor
|
||||||
Verifier *Verifier
|
Verifier *Verifier
|
||||||
Reporter reporter
|
Reporter reporter
|
||||||
errs []error
|
ticker *time.Ticker
|
||||||
}
|
}
|
||||||
|
|
||||||
// Config contains configurable values for audit service
|
// Config contains configurable values for audit service
|
||||||
@ -37,15 +36,15 @@ type Config struct {
|
|||||||
|
|
||||||
// Run runs the repairer with the configured values
|
// Run runs the repairer with the configured values
|
||||||
func (c Config) Run(ctx context.Context, server *provider.Provider) (err error) {
|
func (c Config) Run(ctx context.Context, server *provider.Provider) (err error) {
|
||||||
service, err := NewService(ctx, c.StatDBPort, c.MaxRetriesStatDB, c.Pointers, c.Transport, c.Overlay, c.ID)
|
service, err := NewService(ctx, c.StatDBPort, c.Interval, c.MaxRetriesStatDB, c.Pointers, c.Transport, c.Overlay, c.ID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return service.Run(ctx, c.Interval)
|
return service.Run(ctx)
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewService instantiates a Service with access to a Cursor and Verifier
|
// NewService instantiates a Service with access to a Cursor and Verifier
|
||||||
func NewService(ctx context.Context, statDBPort string, maxRetries int, pointers pdbclient.Client, transport transport.Client, overlay overlay.Client,
|
func NewService(ctx context.Context, statDBPort string, interval time.Duration, maxRetries int, pointers pdbclient.Client, transport transport.Client, overlay overlay.Client,
|
||||||
id provider.FullIdentity) (service *Service, err error) {
|
id provider.FullIdentity) (service *Service, err error) {
|
||||||
cursor := NewCursor(pointers)
|
cursor := NewCursor(pointers)
|
||||||
verifier := NewVerifier(transport, overlay, id)
|
verifier := NewVerifier(transport, overlay, id)
|
||||||
@ -54,57 +53,56 @@ func NewService(ctx context.Context, statDBPort string, maxRetries int, pointers
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
return &Service{Cursor: cursor,
|
return &Service{
|
||||||
|
Cursor: cursor,
|
||||||
Verifier: verifier,
|
Verifier: verifier,
|
||||||
Reporter: reporter,
|
Reporter: reporter,
|
||||||
errs: []error{},
|
ticker: time.NewTicker(interval),
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run calls Cursor and Verifier to continuously request random pointers, then verify data correctness at
|
// Run runs auditing service
|
||||||
// a random stripe within a segment
|
func (service *Service) Run(ctx context.Context) (err error) {
|
||||||
func (service *Service) Run(ctx context.Context, interval time.Duration) (err error) {
|
|
||||||
defer mon.Task()(&ctx)(&err)
|
defer mon.Task()(&ctx)(&err)
|
||||||
|
|
||||||
zap.S().Info("Audit cron is starting up")
|
zap.S().Info("Audit cron is starting up")
|
||||||
ticker := time.NewTicker(interval)
|
|
||||||
defer ticker.Stop()
|
|
||||||
|
|
||||||
ctx, cancel := context.WithCancel(ctx)
|
for {
|
||||||
defer cancel()
|
err := service.process(ctx)
|
||||||
|
if err != nil {
|
||||||
go func() {
|
zap.L().Error("process", zap.Error(err))
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-ticker.C:
|
|
||||||
stripe, err := service.Cursor.NextStripe(ctx)
|
|
||||||
if err != nil {
|
|
||||||
service.errs = append(service.errs, err)
|
|
||||||
cancel()
|
|
||||||
}
|
|
||||||
|
|
||||||
authorization, err := service.Cursor.pointers.SignedMessage()
|
|
||||||
if err != nil {
|
|
||||||
service.errs = append(service.errs, err)
|
|
||||||
cancel()
|
|
||||||
}
|
|
||||||
|
|
||||||
verifiedNodes, err := service.Verifier.verify(ctx, stripe.Index, stripe.Segment, authorization)
|
|
||||||
if err != nil {
|
|
||||||
service.errs = append(service.errs, err)
|
|
||||||
cancel()
|
|
||||||
}
|
|
||||||
err = service.Reporter.RecordAudits(ctx, verifiedNodes)
|
|
||||||
// TODO: if Error.Has(err) then log the error because it means not all node stats updated
|
|
||||||
if err != nil {
|
|
||||||
service.errs = append(service.errs, err)
|
|
||||||
cancel()
|
|
||||||
}
|
|
||||||
case <-ctx.Done():
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}()
|
|
||||||
|
|
||||||
return utils.CombineErrors(service.errs...)
|
select {
|
||||||
|
case <-service.ticker.C:
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// process picks a random stripe and verifies correctness
|
||||||
|
func (service *Service) process(ctx context.Context) error {
|
||||||
|
stripe, err := service.Cursor.NextStripe(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
authorization, err := service.Cursor.pointers.SignedMessage()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
verifiedNodes, err := service.Verifier.verify(ctx, stripe.Index, stripe.Segment, authorization)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
err = service.Reporter.RecordAudits(ctx, verifiedNodes)
|
||||||
|
// TODO: if Error.Has(err) then log the error because it means not all node stats updated
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
@ -46,6 +46,24 @@ func newChecker(pointerdb *pointerdb.Server, repairQueue *queue.Queue, overlay p
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Run the checker loop
|
||||||
|
func (c *checker) Run(ctx context.Context) (err error) {
|
||||||
|
defer mon.Task()(&ctx)(&err)
|
||||||
|
|
||||||
|
for {
|
||||||
|
err = c.IdentifyInjuredSegments(ctx)
|
||||||
|
if err != nil {
|
||||||
|
zap.L().Error("Checker failed", zap.Error(err))
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-c.ticker.C: // wait for the next interval to happen
|
||||||
|
case <-ctx.Done(): // or the checker is canceled via context
|
||||||
|
return ctx.Err()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// IdentifyInjuredSegments checks for missing pieces off of the pointerdb and overlay cache
|
// IdentifyInjuredSegments checks for missing pieces off of the pointerdb and overlay cache
|
||||||
func (c *checker) IdentifyInjuredSegments(ctx context.Context) (err error) {
|
func (c *checker) IdentifyInjuredSegments(ctx context.Context) (err error) {
|
||||||
defer mon.Task()(&ctx)(&err)
|
defer mon.Task()(&ctx)(&err)
|
||||||
@ -122,21 +140,3 @@ func lookupResponsesToNodes(responses *pb.LookupResponses) []*pb.Node {
|
|||||||
}
|
}
|
||||||
return nodes
|
return nodes
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run the checker loop
|
|
||||||
func (c *checker) Run(ctx context.Context) (err error) {
|
|
||||||
defer mon.Task()(&ctx)(&err)
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-c.ticker.C: // wait for the next interval to happen
|
|
||||||
case <-ctx.Done(): // or the checker is canceled via context
|
|
||||||
return ctx.Err()
|
|
||||||
}
|
|
||||||
|
|
||||||
err = c.IdentifyInjuredSegments(ctx)
|
|
||||||
if err != nil {
|
|
||||||
zap.L().Error("Checker failed", zap.Error(err))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
@ -8,7 +8,8 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"go.uber.org/zap"
|
"go.uber.org/zap"
|
||||||
q "storj.io/storj/pkg/datarepair/queue"
|
|
||||||
|
"storj.io/storj/pkg/datarepair/queue"
|
||||||
"storj.io/storj/pkg/provider"
|
"storj.io/storj/pkg/provider"
|
||||||
"storj.io/storj/storage/redis"
|
"storj.io/storj/storage/redis"
|
||||||
)
|
)
|
||||||
@ -27,8 +28,7 @@ func (c Config) Run(ctx context.Context, server *provider.Provider) (err error)
|
|||||||
return Error.Wrap(err)
|
return Error.Wrap(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
queue := q.NewQueue(client)
|
queue := queue.NewQueue(client)
|
||||||
|
|
||||||
repairer := newRepairer(queue, c.Interval, c.MaxRepair)
|
repairer := newRepairer(queue, c.Interval, c.MaxRepair)
|
||||||
|
|
||||||
// TODO(coyle): we need to figure out how to propagate the error up to cancel the service
|
// TODO(coyle): we need to figure out how to propagate the error up to cancel the service
|
||||||
|
@ -10,7 +10,7 @@ import (
|
|||||||
"go.uber.org/zap"
|
"go.uber.org/zap"
|
||||||
|
|
||||||
"storj.io/storj/internal/sync2"
|
"storj.io/storj/internal/sync2"
|
||||||
q "storj.io/storj/pkg/datarepair/queue"
|
"storj.io/storj/pkg/datarepair/queue"
|
||||||
"storj.io/storj/pkg/pb"
|
"storj.io/storj/pkg/pb"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -22,12 +22,12 @@ type Repairer interface {
|
|||||||
|
|
||||||
// repairer holds important values for data repair
|
// repairer holds important values for data repair
|
||||||
type repairer struct {
|
type repairer struct {
|
||||||
queue q.RepairQueue
|
queue queue.RepairQueue
|
||||||
limiter *sync2.Limiter
|
limiter *sync2.Limiter
|
||||||
ticker *time.Ticker
|
ticker *time.Ticker
|
||||||
}
|
}
|
||||||
|
|
||||||
func newRepairer(queue q.RepairQueue, interval time.Duration, concurrency int) *repairer {
|
func newRepairer(queue queue.RepairQueue, interval time.Duration, concurrency int) *repairer {
|
||||||
return &repairer{
|
return &repairer{
|
||||||
queue: queue,
|
queue: queue,
|
||||||
limiter: sync2.NewLimiter(concurrency),
|
limiter: sync2.NewLimiter(concurrency),
|
||||||
@ -35,7 +35,7 @@ func newRepairer(queue q.RepairQueue, interval time.Duration, concurrency int) *
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run the repairer loop
|
// Run runs the repairer service
|
||||||
func (r *repairer) Run(ctx context.Context) (err error) {
|
func (r *repairer) Run(ctx context.Context) (err error) {
|
||||||
defer mon.Task()(&ctx)(&err)
|
defer mon.Task()(&ctx)(&err)
|
||||||
|
|
||||||
@ -43,28 +43,37 @@ func (r *repairer) Run(ctx context.Context) (err error) {
|
|||||||
defer r.limiter.Wait()
|
defer r.limiter.Wait()
|
||||||
|
|
||||||
for {
|
for {
|
||||||
|
err := r.process(ctx)
|
||||||
|
if err != nil {
|
||||||
|
zap.L().Error("process", zap.Error(err))
|
||||||
|
}
|
||||||
|
|
||||||
select {
|
select {
|
||||||
case <-r.ticker.C: // wait for the next interval to happen
|
case <-r.ticker.C: // wait for the next interval to happen
|
||||||
case <-ctx.Done(): // or the repairer is canceled via context
|
case <-ctx.Done(): // or the repairer is canceled via context
|
||||||
return ctx.Err()
|
return ctx.Err()
|
||||||
}
|
}
|
||||||
|
|
||||||
seg, err := r.queue.Dequeue()
|
|
||||||
if err != nil {
|
|
||||||
// TODO: only log when err != ErrQueueEmpty
|
|
||||||
zap.L().Error("dequeue", zap.Error(err))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
r.limiter.Go(ctx, func() {
|
|
||||||
err := r.Repair(ctx, &seg)
|
|
||||||
if err != nil {
|
|
||||||
zap.L().Error("Repair failed", zap.Error(err))
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// process picks an item from repair queue and spawns a repairer
|
||||||
|
func (r *repairer) process(ctx context.Context) error {
|
||||||
|
seg, err := r.queue.Dequeue()
|
||||||
|
if err != nil {
|
||||||
|
// TODO: only log when err != ErrQueueEmpty
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
r.limiter.Go(ctx, func() {
|
||||||
|
err := r.Repair(ctx, &seg)
|
||||||
|
if err != nil {
|
||||||
|
zap.L().Error("Repair failed", zap.Error(err))
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// Repair starts repair of the segment
|
// Repair starts repair of the segment
|
||||||
func (r *repairer) Repair(ctx context.Context, seg *pb.InjuredSegment) (err error) {
|
func (r *repairer) Repair(ctx context.Context, seg *pb.InjuredSegment) (err error) {
|
||||||
defer mon.Task()(&ctx)(&err)
|
defer mon.Task()(&ctx)(&err)
|
||||||
|
Loading…
Reference in New Issue
Block a user