storj/satellite/audit/worker.go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package audit

import (
	"context"
	"time"

	"github.com/zeebo/errs"
	"go.uber.org/zap"

	"storj.io/common/memory"
	"storj.io/common/storj"
	"storj.io/common/sync2"
)

// Error is the default audit errs class.
var Error = errs.Class("audit")

// Config contains configurable values for audit chore and workers.
type Config struct {
	MaxRetriesStatDB   int           `help:"max number of times to attempt updating a statdb batch" default:"3"`
	MinBytesPerSecond  memory.Size   `help:"the minimum acceptable bytes that storage nodes can transfer per second to the satellite" default:"128B" testDefault:"1.00 KB"`
	MinDownloadTimeout time.Duration `help:"the minimum duration for downloading a share from storage nodes before timing out" default:"5m0s" testDefault:"5s"`
	MaxReverifyCount   int           `help:"limit above which we consider an audit is failed" default:"3"`

	ChoreInterval     time.Duration `help:"how often to run the reservoir chore" releaseDefault:"24h" devDefault:"1m" testDefault:"$TESTINTERVAL"`
	QueueInterval     time.Duration `help:"how often to recheck an empty audit queue" releaseDefault:"1h" devDefault:"1m" testDefault:"$TESTINTERVAL"`
	Slots             int           `help:"number of reservoir slots allotted for nodes, currently capped at 3" default:"3"`
	WorkerConcurrency int           `help:"number of workers to run audits on segments" default:"2"`
}

// Worker contains information for populating audit queue and processing audits.
type Worker struct {
	log         *zap.Logger
	queues      *Queues
	verifier    *Verifier
	reporter    Reporter
	Loop        *sync2.Cycle
	concurrency int
}

// NewWorker instantiates Worker.
func NewWorker(log *zap.Logger, queues *Queues, verifier *Verifier, reporter Reporter, config Config) (*Worker, error) {
	return &Worker{
		log: log,

		queues:      queues,
		verifier:    verifier,
		reporter:    reporter,
		Loop:        sync2.NewCycle(config.QueueInterval),
		concurrency: config.WorkerConcurrency,
	}, nil
}

// Run runs audit service 2.0.
func (worker *Worker) Run(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	return worker.Loop.Run(ctx, func(ctx context.Context) (err error) {
		defer mon.Task()(&ctx)(&err)
		err = worker.process(ctx)
		if err != nil {
			worker.log.Error("process", zap.Error(Error.Wrap(err)))
		}
		return nil
	})
}

// Close halts the worker.
func (worker *Worker) Close() error {
	worker.Loop.Close()
	return nil
}

// process repeatedly removes an item from the queue and runs an audit.
func (worker *Worker) process(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	// get the current queue
	queue := worker.queues.Fetch()

	limiter := sync2.NewLimiter(worker.concurrency)
	defer limiter.Wait()

	for {
		segment, err := queue.Next()
		if err != nil {
			if ErrEmptyQueue.Has(err) {
				// get a new queue and return if empty; otherwise continue working.
				queue = worker.queues.Fetch()
				if queue.Size() == 0 {
					return nil
				}
				continue
			}
			return err
		}

		started := limiter.Go(ctx, func() {
			err := worker.work(ctx, segment)
			if err != nil {
				worker.log.Error("error(s) during audit",
					zap.String("Segment StreamID", segment.StreamID.String()),
					zap.Uint64("Segment Position", segment.Position.Encode()),
					zap.Error(err))
			}
		})
		if !started {
			return ctx.Err()
		}
	}
}

func (worker *Worker) work(ctx context.Context, segment Segment) (err error) {
	defer mon.Task()(&ctx)(&err)

	var errlist errs.Group

	// First, attempt to reverify nodes for this segment that are in containment mode.
	report, err := worker.verifier.Reverify(ctx, segment)
	if err != nil {
		errlist.Add(err)
	}

	// TODO(moby) we need to decide if we want to do something with nodes that the reporter failed to update
	_, err = worker.reporter.RecordAudits(ctx, report)
	if err != nil {
		errlist.Add(err)
	}

	// Skip all reverified nodes in the next Verify step.
	skip := make(map[storj.NodeID]bool)
	for _, nodeID := range report.Successes {
		skip[nodeID] = true
	}
	for _, nodeID := range report.Offlines {
		skip[nodeID] = true
	}
	for _, nodeID := range report.Fails {
		skip[nodeID] = true
	}
	for _, pending := range report.PendingAudits {
		skip[pending.NodeID] = true
	}
	for _, nodeID := range report.Unknown {
		skip[nodeID] = true
	}

	// Next, audit the the remaining nodes that are not in containment mode.
	report, err = worker.verifier.Verify(ctx, segment, skip)
	if err != nil {
		errlist.Add(err)
	}

	// TODO(moby) we need to decide if we want to do something with nodes that the reporter failed to update
	_, err = worker.reporter.RecordAudits(ctx, report)
	if err != nil {
		errlist.Add(err)
	}

	return errlist.Err()
}
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`// Copyright (C) 2019 Storj Labs, Inc.`
			`// See LICENSE for copying information.`

			`package audit`

			`import (`
			`"context"`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`"time"`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`"github.com/zeebo/errs"`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`"go.uber.org/zap"`

common: separate repository Change-Id: Ibb89c42060450e3839481a7e495bbe3ad940610a 2019-12-27 11:48:47 +00:00			`"storj.io/common/memory"`
			`"storj.io/common/storj"`
			`"storj.io/common/sync2"`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`)`

satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`// Error is the default audit errs class.`
all: fix error naming errs.Class should not contain "error" in the name, since that causes a lot of stutter in the error logs. As an example a log line could end up looking like: ERROR node stats service error: satellitedbs error: node stats database error: no rows Whereas something like: ERROR nodestats service: satellitedbs: nodestatsdb: no rows Would contain all the necessary information without the stutter. Change-Id: I7b7cb7e592ebab4bcfadc1eef11122584d2b20e0 2021-04-28 09:06:17 +01:00			`var Error = errs.Class("audit")`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00
			`// Config contains configurable values for audit chore and workers.`
			`type Config struct {`
			MaxRetriesStatDB int `help:"max number of times to attempt updating a statdb batch" default:"3"`
testplanet/satellite: reduce the number of places default values need to be configured Satellites set their configuration values to default values using cfgstruct, however, it turns out our tests don't test these values at all! Instead, they have a completely separate definition system that is easy to forget about. As is to be expected, these values have drifted, and it appears in a few cases test planet is testing unreasonable values that we won't see in production, or perhaps worse, features enabled in production were missed and weren't enabled in testplanet. This change makes it so all values are configured the same, systematic way, so it's easy to see when test values are different than dev values or release values, and it's less hard to forget to enable features in testplanet. In terms of reviewing, this change should be actually fairly easy to review, considering private/testplanet/satellite.go keeps the current config system and the new one and confirms that they result in identical configurations, so you can be certain that nothing was missed and the config is all correct. You can also check the config lock to see what actual config values changed. Change-Id: I6715d0794887f577e21742afcf56fd2b9d12170e 2021-05-31 22:15:00 +01:00			MinBytesPerSecond memory.Size `help:"the minimum acceptable bytes that storage nodes can transfer per second to the satellite" default:"128B" testDefault:"1.00 KB"`
			MinDownloadTimeout time.Duration `help:"the minimum duration for downloading a share from storage nodes before timing out" default:"5m0s" testDefault:"5s"`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			MaxReverifyCount int `help:"limit above which we consider an audit is failed" default:"3"`

testplanet/satellite: reduce the number of places default values need to be configured Satellites set their configuration values to default values using cfgstruct, however, it turns out our tests don't test these values at all! Instead, they have a completely separate definition system that is easy to forget about. As is to be expected, these values have drifted, and it appears in a few cases test planet is testing unreasonable values that we won't see in production, or perhaps worse, features enabled in production were missed and weren't enabled in testplanet. This change makes it so all values are configured the same, systematic way, so it's easy to see when test values are different than dev values or release values, and it's less hard to forget to enable features in testplanet. In terms of reviewing, this change should be actually fairly easy to review, considering private/testplanet/satellite.go keeps the current config system and the new one and confirms that they result in identical configurations, so you can be certain that nothing was missed and the config is all correct. You can also check the config lock to see what actual config values changed. Change-Id: I6715d0794887f577e21742afcf56fd2b9d12170e 2021-05-31 22:15:00 +01:00			ChoreInterval time.Duration `help:"how often to run the reservoir chore" releaseDefault:"24h" devDefault:"1m" testDefault:"$TESTINTERVAL"`
			QueueInterval time.Duration `help:"how often to recheck an empty audit queue" releaseDefault:"1h" devDefault:"1m" testDefault:"$TESTINTERVAL"`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			Slots int `help:"number of reservoir slots allotted for nodes, currently capped at 3" default:"3"`
satellite/audits: migrate to metabase Change-Id: I480c941820c5b0bd3af0539d92b548189211acb2 2020-12-14 12:54:22 +00:00			WorkerConcurrency int `help:"number of workers to run audits on segments" default:"2"`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`}`

satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`// Worker contains information for populating audit queue and processing audits.`
			`type Worker struct {`
satellite/audit,storagenode/gracefulexit: fixes to limiter Ensure we don't rely on limiter to wait multiple times. Change-Id: I75d48420236216d4c2fc6fa99293f51f80cd9c33 2022-08-01 13:00:23 +01:00			`log *zap.Logger`
			`queues *Queues`
			`verifier *Verifier`
			`reporter Reporter`
			`Loop *sync2.Cycle`
			`concurrency int`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`}`

			`// NewWorker instantiates Worker.`
satellite/{repair,audit}: simplify reputation reporter Also, make it an interface so that the upcoming write cache can be dropped in to the same place. Change-Id: I2c286743825e647c0cef5b6578245391851fa10c 2022-04-11 17:47:14 +01:00			`func NewWorker(log zap.Logger, queues Queues, verifier Verifier, reporter Reporter, config Config) (Worker, error) {`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`return &Worker{`
			`log: log,`

satellite/audit,storagenode/gracefulexit: fixes to limiter Ensure we don't rely on limiter to wait multiple times. Change-Id: I75d48420236216d4c2fc6fa99293f51f80cd9c33 2022-08-01 13:00:23 +01:00			`queues: queues,`
			`verifier: verifier,`
			`reporter: reporter,`
			`Loop: sync2.NewCycle(config.QueueInterval),`
			`concurrency: config.WorkerConcurrency,`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`}, nil`
			`}`

			`// Run runs audit service 2.0.`
			`func (worker *Worker) Run(ctx context.Context) (err error) {`
			`defer mon.Task()(&ctx)(&err)`

			`return worker.Loop.Run(ctx, func(ctx context.Context) (err error) {`
			`defer mon.Task()(&ctx)(&err)`
			`err = worker.process(ctx)`
			`if err != nil {`
			`worker.log.Error("process", zap.Error(Error.Wrap(err)))`
			`}`
			`return nil`
			`})`
			`}`

			`// Close halts the worker.`
			`func (worker *Worker) Close() error {`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`worker.Loop.Close()`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`return nil`
			`}`

			`// process repeatedly removes an item from the queue and runs an audit.`
			`func (worker *Worker) process(ctx context.Context) (err error) {`
			`defer mon.Task()(&ctx)(&err)`

satellite/audit/queue: Separate audit queue into two separate structs. * The audit worker wants to get items from the queue and process them. * The audit chore wants to create new queues and swap them in when the old queue has been processed. This change adds a "Queues" struct which handles the concurrency issues around the worker fetching a queue and the chore swapping a new queue in. It simplifies the logic of the "Queue" struct to its bare bones, so that it behaves like a normal queue with no need to understand the details of swapping and worker/chore interactions. Change-Id: Ic3689ede97a528e7590e98338cedddfa51794e1b 2020-08-20 14:29:02 +01:00			`// get the current queue`
			`queue := worker.queues.Fetch()`

satellite/audit,storagenode/gracefulexit: fixes to limiter Ensure we don't rely on limiter to wait multiple times. Change-Id: I75d48420236216d4c2fc6fa99293f51f80cd9c33 2022-08-01 13:00:23 +01:00			`limiter := sync2.NewLimiter(worker.concurrency)`
			`defer limiter.Wait()`

satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`for {`
satellite/audits: migrate to metabase Change-Id: I480c941820c5b0bd3af0539d92b548189211acb2 2020-12-14 12:54:22 +00:00			`segment, err := queue.Next()`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`if err != nil {`
			`if ErrEmptyQueue.Has(err) {`
satellite/audit/queue: Separate audit queue into two separate structs. * The audit worker wants to get items from the queue and process them. * The audit chore wants to create new queues and swap them in when the old queue has been processed. This change adds a "Queues" struct which handles the concurrency issues around the worker fetching a queue and the chore swapping a new queue in. It simplifies the logic of the "Queue" struct to its bare bones, so that it behaves like a normal queue with no need to understand the details of swapping and worker/chore interactions. Change-Id: Ic3689ede97a528e7590e98338cedddfa51794e1b 2020-08-20 14:29:02 +01:00			`// get a new queue and return if empty; otherwise continue working.`
			`queue = worker.queues.Fetch()`
			`if queue.Size() == 0 {`
			`return nil`
			`}`
			`continue`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`}`
			`return err`
			`}`

satellite/audit,storagenode/gracefulexit: fixes to limiter Ensure we don't rely on limiter to wait multiple times. Change-Id: I75d48420236216d4c2fc6fa99293f51f80cd9c33 2022-08-01 13:00:23 +01:00			`started := limiter.Go(ctx, func() {`
satellite/audits: migrate to metabase Change-Id: I480c941820c5b0bd3af0539d92b548189211acb2 2020-12-14 12:54:22 +00:00			`err := worker.work(ctx, segment)`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`if err != nil {`
satellite/audit: change wording of audit worker error log "audit failed" is already used when a node fails an audit. That makes searching for this higher level audit worker error more difficult. Additionally, the presence of errors from the audit worker doesn't necessarily mean the audit failed. Reword the error message to "error(s) during audit" Change-Id: I0aab12c73c18d4bd962c5d8ac8a17cabcec022e6 2021-08-16 20:42:56 +01:00			`worker.log.Error("error(s) during audit",`
satellite/audit: move to segmentloop Change-Id: I10e63a1e4b6b62f5cd3098f5922ad3de1ec5af51 2021-06-14 16:40:46 +01:00			`zap.String("Segment StreamID", segment.StreamID.String()),`
			`zap.Uint64("Segment Position", segment.Position.Encode()),`
			`zap.Error(err))`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`}`
			`})`
satellite/audit,storagenode/gracefulexit: fixes to limiter Ensure we don't rely on limiter to wait multiple times. Change-Id: I75d48420236216d4c2fc6fa99293f51f80cd9c33 2022-08-01 13:00:23 +01:00			`if !started {`
			`return ctx.Err()`
			`}`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`}`
			`}`

satellite/audits: migrate to metabase Change-Id: I480c941820c5b0bd3af0539d92b548189211acb2 2020-12-14 12:54:22 +00:00			`func (worker *Worker) work(ctx context.Context, segment Segment) (err error) {`
satellite/audit: monitor worker function Change-Id: I94d1161deffe4ea9782abee1afbb5735f18aab44 2019-11-22 23:40:00 +00:00			`defer mon.Task()(&ctx)(&err)`

satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`var errlist errs.Group`

			`// First, attempt to reverify nodes for this segment that are in containment mode.`
satellite/audits: migrate to metabase Change-Id: I480c941820c5b0bd3af0539d92b548189211acb2 2020-12-14 12:54:22 +00:00			`report, err := worker.verifier.Reverify(ctx, segment)`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`if err != nil {`
			`errlist.Add(err)`
			`}`

			`// TODO(moby) we need to decide if we want to do something with nodes that the reporter failed to update`
satellite/audits: migrate to metabase Change-Id: I480c941820c5b0bd3af0539d92b548189211acb2 2020-12-14 12:54:22 +00:00			`_, err = worker.reporter.RecordAudits(ctx, report)`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`if err != nil {`
			`errlist.Add(err)`
			`}`

			`// Skip all reverified nodes in the next Verify step.`
			`skip := make(map[storj.NodeID]bool)`
satellite/audit: fix audit panic (#3217) 2019-10-09 15:06:58 +01:00			`for _, nodeID := range report.Successes {`
			`skip[nodeID] = true`
			`}`
			`for _, nodeID := range report.Offlines {`
			`skip[nodeID] = true`
			`}`
			`for _, nodeID := range report.Fails {`
			`skip[nodeID] = true`
			`}`
			`for _, pending := range report.PendingAudits {`
			`skip[pending.NodeID] = true`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`}`
satellite/audit: do not contain nodes for unknown errors (#3592) * skip unknown errors (wip) * add tests to make sure nodes that time out are added to containment * add bad blobs store * call "Skipped" "Unknown" * add tests to ensure unknown errors do not trigger containment * add monkit stats to lockfile * typo * add periods to end of bad blobs comments 2019-11-19 16:30:28 +00:00			`for _, nodeID := range report.Unknown {`
			`skip[nodeID] = true`
			`}`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00
			`// Next, audit the the remaining nodes that are not in containment mode.`
satellite/audits: migrate to metabase Change-Id: I480c941820c5b0bd3af0539d92b548189211acb2 2020-12-14 12:54:22 +00:00			`report, err = worker.verifier.Verify(ctx, segment, skip)`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`if err != nil {`
			`errlist.Add(err)`
			`}`

			`// TODO(moby) we need to decide if we want to do something with nodes that the reporter failed to update`
satellite/audits: migrate to metabase Change-Id: I480c941820c5b0bd3af0539d92b548189211acb2 2020-12-14 12:54:22 +00:00			`_, err = worker.reporter.RecordAudits(ctx, report)`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`if err != nil {`
			`errlist.Add(err)`
			`}`

			`return errlist.Err()`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`}`