storj/satellite/audit/worker.go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package audit

import (
	"context"
	"time"

	"github.com/zeebo/errs"
	"go.uber.org/zap"

	"storj.io/common/memory"
	"storj.io/common/sync2"
	"storj.io/storj/satellite/metabase"
)

// Error is the default errs class for the audit package. In this file it is
// used to wrap errors returned by Worker.process before they are logged.
var Error = errs.Class("audit")

// Config contains configurable values for audit chore and workers.
//
// Each field's struct tag carries its help text and its default values.
// Within this file only QueueInterval and WorkerConcurrency are read (by
// NewWorker); the other fields are not referenced here.
type Config struct {
	// Retry, throughput, timeout and reverify-count limits.
	MaxRetriesStatDB   int           `help:"max number of times to attempt updating a statdb batch" default:"3"`
	MinBytesPerSecond  memory.Size   `help:"the minimum acceptable bytes that storage nodes can transfer per second to the satellite" default:"128B" testDefault:"1.00 KB"`
	MinDownloadTimeout time.Duration `help:"the minimum duration for downloading a share from storage nodes before timing out" default:"5m0s" testDefault:"5s"`
	MaxReverifyCount   int           `help:"limit above which we consider an audit is failed" default:"3"`

	// Scheduling and sizing of the chore, the verification queue and the
	// segment audit workers.
	ChoreInterval             time.Duration `help:"how often to run the reservoir chore" releaseDefault:"24h" devDefault:"1m" testDefault:"$TESTINTERVAL"`
	QueueInterval             time.Duration `help:"how often to recheck an empty audit queue" releaseDefault:"1h" devDefault:"1m" testDefault:"$TESTINTERVAL"`
	Slots                     int           `help:"number of reservoir slots allotted for nodes, currently capped at 3" default:"3"`
	VerificationPushBatchSize int           `help:"number of audit jobs to push at once to the verification queue" devDefault:"10" releaseDefault:"4096"`
	WorkerConcurrency         int           `help:"number of workers to run audits on segments" default:"2"`
	UseRangedLoop             bool          `help:"whether or not to use the ranged loop observer instead of the chore." default:"false" testDefault:"false"`

	// Settings for the reverify workers.
	ReverifyWorkerConcurrency   int           `help:"number of workers to run reverify audits on pieces" default:"2"`
	ReverificationRetryInterval time.Duration `help:"how long a single reverification job can take before it may be taken over by another worker" releaseDefault:"6h" devDefault:"10m"`
}

// Worker takes segments off the verification queue and audits them.
//
// Field overview:
//   - log receives errors raised while processing the queue or auditing a segment.
//   - queue is the source of segments to audit.
//   - verifier identifies contained nodes and performs the audit of a segment.
//   - reverifyQueue is stored by NewWorker but is not read anywhere in this
//     file. NOTE(review): confirm whether it is still needed here.
//   - reporter is handed the report produced by each audit.
//   - Loop is the cycle that drives Run; NewWorker creates it with
//     Config.QueueInterval.
//   - concurrency is the limit passed to the limiter that runs audits in
//     process; NewWorker sets it from Config.WorkerConcurrency.
type Worker struct {
	log           *zap.Logger
	queue         VerifyQueue
	verifier      *Verifier
	reverifyQueue ReverifyQueue
	reporter      Reporter
	Loop          *sync2.Cycle
	concurrency   int
}

// NewWorker builds a Worker from its collaborators. The worker's Loop is a
// new cycle using config.QueueInterval, and its concurrency limit is taken
// from config.WorkerConcurrency.
func NewWorker(log *zap.Logger, queue VerifyQueue, verifier *Verifier, reverifyQueue ReverifyQueue, reporter Reporter, config Config) *Worker {
	worker := new(Worker)

	// Collaborators handed in by the caller.
	worker.log = log
	worker.queue = queue
	worker.verifier = verifier
	worker.reverifyQueue = reverifyQueue
	worker.reporter = reporter

	// Settings derived from the configuration.
	worker.Loop = sync2.NewCycle(config.QueueInterval)
	worker.concurrency = config.WorkerConcurrency

	return worker
}

// Run drives the worker: on every tick of worker.Loop it processes the audit
// queue. It returns whatever worker.Loop.Run returns.
func (worker *Worker) Run(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	// iteration is a single pass over the queue. A failure is logged and
	// then dropped, so the callback always reports success to the loop.
	iteration := func(ctx context.Context) (err error) {
		defer mon.Task()(&ctx)(&err)

		if err = worker.process(ctx); err != nil {
			worker.log.Error("process", zap.Error(Error.Wrap(err)))
		}
		return nil
	}

	return worker.Loop.Run(ctx, iteration)
}

// Close stops the worker by closing its Loop. It always returns nil.
func (worker *Worker) Close() error {
	loop := worker.Loop
	loop.Close()
	return nil
}

// process drains the verification queue, auditing each segment it yields.
//
// Audits run through a limiter created with worker.concurrency, and process
// waits for that limiter before returning. It returns nil once the queue
// reports that it is empty, the queue's error for any other failure of Next,
// and ctx.Err() when the limiter declines to start a job.
func (worker *Worker) process(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	limiter := sync2.NewLimiter(worker.concurrency)
	defer limiter.Wait()

	for {
		segment, nextErr := worker.queue.Next(ctx)
		switch {
		case nextErr == nil:
			// got a segment; fall through to auditing it below.
		case ErrEmptyQueue.Has(nextErr):
			return nil
		default:
			return nextErr
		}

		// audit handles one segment; failures are logged, not propagated.
		audit := func() {
			if workErr := worker.work(ctx, segment); workErr != nil {
				worker.log.Error("error(s) during audit",
					zap.String("Segment StreamID", segment.StreamID.String()),
					zap.Uint64("Segment Position", segment.Position.Encode()),
					zap.Error(workErr))
			}
		}

		if !limiter.Go(ctx, audit) {
			return ctx.Err()
		}
	}
}

// work audits a single segment and passes the resulting report to the
// reporter.
//
// It first asks the verifier which nodes are contained for the segment, then
// runs Verify with those nodes skipped, and finally calls RecordAudits with
// the report. RecordAudits is called unconditionally, including when one of
// the earlier steps returned an error.
//
// Errors from both steps are collected and returned as one combined error,
// except metabase.ErrSegmentNotFound, which is never added to the result.
func (worker *Worker) work(ctx context.Context, segment Segment) (err error) {
	defer mon.Task()(&ctx)(&err)

	var errlist errs.Group

	// First, remove nodes that are contained. We do not (currently)
	// audit contained nodes for other pieces until we get an answer
	// for the contained audit. (I suspect this could change without
	// upsetting anything, but for now it's best to keep it the way
	// it was. -thepaul)
	//
	// A segment-not-found error is not recorded here: Verify() will
	// encounter it again and will handle the verification job as
	// appropriate.
	skip, err := worker.verifier.IdentifyContainedNodes(ctx, segment)
	if err != nil && !metabase.ErrSegmentNotFound.Has(err) {
		errlist.Add(err)
	}

	// Next, audit the remaining nodes that are not in containment mode.
	//
	// A segment-not-found error from Verify is likewise left out of the
	// returned error group.
	// NOTE(review): the previous comment here was copied from the branch
	// above and said Verify() would "encounter it again", which cannot
	// describe Verify's own result. Presumably the segment was removed
	// after it was queued and there is nothing left to audit; confirm.
	report, err := worker.verifier.Verify(ctx, segment, skip)
	if err != nil && !metabase.ErrSegmentNotFound.Has(err) {
		errlist.Add(err)
	}

	worker.reporter.RecordAudits(ctx, report)

	return errlist.Err()
}
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`// Copyright (C) 2019 Storj Labs, Inc.`
			`// See LICENSE for copying information.`

			`package audit`

			`import (`
			`"context"`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`"time"`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`"github.com/zeebo/errs"`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`"go.uber.org/zap"`

common: separate repository Change-Id: Ibb89c42060450e3839481a7e495bbe3ad940610a 2019-12-27 11:48:47 +00:00			`"storj.io/common/memory"`
			`"storj.io/common/sync2"`
satellite/audit: use db for auditor queue As part of the effort of splitting out the auditor workers to their own process, we are transitioning the communication between the auditor chore and the verification workers to a queue implemented in the database, rather than the sequence of in-memory queues we used to use. This logical database is safely partitionable from the rest of satelliteDB. Refs: https://github.com/storj/storj/issues/5251 Change-Id: I6cd31ac5265423271fbafe6127a86172c5cb53dc 2022-11-11 23:11:40 +00:00			`"storj.io/storj/satellite/metabase"`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`)`

satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`// Error is the default audit errs class.`
all: fix error naming errs.Class should not contain "error" in the name, since that causes a lot of stutter in the error logs. As an example a log line could end up looking like: ERROR node stats service error: satellitedbs error: node stats database error: no rows Whereas something like: ERROR nodestats service: satellitedbs: nodestatsdb: no rows Would contain all the necessary information without the stutter. Change-Id: I7b7cb7e592ebab4bcfadc1eef11122584d2b20e0 2021-04-28 09:06:17 +01:00			`var Error = errs.Class("audit")`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00
			`// Config contains configurable values for audit chore and workers.`
			`type Config struct {`
			MaxRetriesStatDB int `help:"max number of times to attempt updating a statdb batch" default:"3"`
testplanet/satellite: reduce the number of places default values need to be configured Satellites set their configuration values to default values using cfgstruct, however, it turns out our tests don't test these values at all! Instead, they have a completely separate definition system that is easy to forget about. As is to be expected, these values have drifted, and it appears in a few cases test planet is testing unreasonable values that we won't see in production, or perhaps worse, features enabled in production were missed and weren't enabled in testplanet. This change makes it so all values are configured the same, systematic way, so it's easy to see when test values are different than dev values or release values, and it's less hard to forget to enable features in testplanet. In terms of reviewing, this change should be actually fairly easy to review, considering private/testplanet/satellite.go keeps the current config system and the new one and confirms that they result in identical configurations, so you can be certain that nothing was missed and the config is all correct. You can also check the config lock to see what actual config values changed. Change-Id: I6715d0794887f577e21742afcf56fd2b9d12170e 2021-05-31 22:15:00 +01:00			MinBytesPerSecond memory.Size `help:"the minimum acceptable bytes that storage nodes can transfer per second to the satellite" default:"128B" testDefault:"1.00 KB"`
			MinDownloadTimeout time.Duration `help:"the minimum duration for downloading a share from storage nodes before timing out" default:"5m0s" testDefault:"5s"`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			MaxReverifyCount int `help:"limit above which we consider an audit is failed" default:"3"`

satellite/audit: help performance of pushing to audit queue The audit chore will be pushing a large number of segments to be audited, and the db might choke on that large insert when under load. This change divides the insert up into batches, which can be sized however is optimal for the backing database. It also arranges for segments to be inserted in the order of the primary key, which helps performance on some systems. Refs: https://github.com/storj/storj/issues/5228 Change-Id: I941f580f690d681b80c86faf4abca2995e37135d 2022-11-23 16:37:39 +00:00			ChoreInterval time.Duration `help:"how often to run the reservoir chore" releaseDefault:"24h" devDefault:"1m" testDefault:"$TESTINTERVAL"`
			QueueInterval time.Duration `help:"how often to recheck an empty audit queue" releaseDefault:"1h" devDefault:"1m" testDefault:"$TESTINTERVAL"`
			Slots int `help:"number of reservoir slots allotted for nodes, currently capped at 3" default:"3"`
			VerificationPushBatchSize int `help:"number of audit jobs to push at once to the verification queue" devDefault:"10" releaseDefault:"4096"`
			WorkerConcurrency int `help:"number of workers to run audits on segments" default:"2"`
satellite/audit: implement rangedloop observer This change implements the ranged loop observer to replace the audit chore that builds the audit queue. The strategy employed by this change is to use a collector for each segment range to build separate per-node segment reservoirs that are then merge them during the join step. In previous observer migrations, there were only a handful of tests so the strategy was to duplicate them. In this package, there are dozens of tests that utilize the chore. To reduce code churn and maintenance burden until the chore is removed, this change introduces a helper that runs tests under both the chore and observer, providing a pair of functions that can be used to pause or run the queueing function. https://github.com/storj/storj/issues/5232 Change-Id: I8bb4b4e55cf98b1aac9f26307e3a9a355cb3f506 2022-12-15 00:37:37 +00:00			UseRangedLoop bool `help:"whether or not to use the ranged loop observer instead of the chore." default:"false" testDefault:"false"`
satellite/audit: use db for auditor queue As part of the effort of splitting out the auditor workers to their own process, we are transitioning the communication between the auditor chore and the verification workers to a queue implemented in the database, rather than the sequence of in-memory queues we used to use. This logical database is safely partitionable from the rest of satelliteDB. Refs: https://github.com/storj/storj/issues/5251 Change-Id: I6cd31ac5265423271fbafe6127a86172c5cb53dc 2022-11-11 23:11:40 +00:00
			ReverifyWorkerConcurrency int `help:"number of workers to run reverify audits on pieces" default:"2"`
			ReverificationRetryInterval time.Duration `help:"how long a single reverification job can take before it may be taken over by another worker" releaseDefault:"6h" devDefault:"10m"`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`}`

satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`// Worker contains information for populating audit queue and processing audits.`
			`type Worker struct {`
satellite/audit: add audit.ReverifyWorker Here we add a worker class comparable to audit.Worker, which will be responsible for pulling items off of the reverification queue and calling reverifier.ReverifyPiece on them. Note that piecewise reverification audits (which this will control) are not yet being done. That is, nothing is being added to the reverification queue at this point. Refs: https://github.com/storj/storj/issues/5251 Change-Id: I94e28830e27caa49f2c8bd4a2336533e187ab69c 2022-11-22 22:20:44 +00:00			`log *zap.Logger`
			`queue VerifyQueue`
			`verifier *Verifier`
			`reverifyQueue ReverifyQueue`
			`reporter Reporter`
			`Loop *sync2.Cycle`
			`concurrency int`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`}`

			`// NewWorker instantiates Worker.`
satellite/audit: add audit.ReverifyWorker Here we add a worker class comparable to audit.Worker, which will be responsible for pulling items off of the reverification queue and calling reverifier.ReverifyPiece on them. Note that piecewise reverification audits (which this will control) are not yet being done. That is, nothing is being added to the reverification queue at this point. Refs: https://github.com/storj/storj/issues/5251 Change-Id: I94e28830e27caa49f2c8bd4a2336533e187ab69c 2022-11-22 22:20:44 +00:00			`func NewWorker(log zap.Logger, queue VerifyQueue, verifier Verifier, reverifyQueue ReverifyQueue, reporter Reporter, config Config) *Worker {`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`return &Worker{`
			`log: log,`

satellite/audit: add audit.ReverifyWorker Here we add a worker class comparable to audit.Worker, which will be responsible for pulling items off of the reverification queue and calling reverifier.ReverifyPiece on them. Note that piecewise reverification audits (which this will control) are not yet being done. That is, nothing is being added to the reverification queue at this point. Refs: https://github.com/storj/storj/issues/5251 Change-Id: I94e28830e27caa49f2c8bd4a2336533e187ab69c 2022-11-22 22:20:44 +00:00			`queue: queue,`
			`verifier: verifier,`
			`reverifyQueue: reverifyQueue,`
			`reporter: reporter,`
			`Loop: sync2.NewCycle(config.QueueInterval),`
			`concurrency: config.WorkerConcurrency,`
satellite/audit: use db for auditor queue As part of the effort of splitting out the auditor workers to their own process, we are transitioning the communication between the auditor chore and the verification workers to a queue implemented in the database, rather than the sequence of in-memory queues we used to use. This logical database is safely partitionable from the rest of satelliteDB. Refs: https://github.com/storj/storj/issues/5251 Change-Id: I6cd31ac5265423271fbafe6127a86172c5cb53dc 2022-11-11 23:11:40 +00:00			`}`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`}`

			`// Run runs audit service 2.0.`
			`func (worker *Worker) Run(ctx context.Context) (err error) {`
			`defer mon.Task()(&ctx)(&err)`

			`return worker.Loop.Run(ctx, func(ctx context.Context) (err error) {`
			`defer mon.Task()(&ctx)(&err)`
			`err = worker.process(ctx)`
			`if err != nil {`
			`worker.log.Error("process", zap.Error(Error.Wrap(err)))`
			`}`
			`return nil`
			`})`
			`}`

			`// Close halts the worker.`
			`func (worker *Worker) Close() error {`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`worker.Loop.Close()`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`return nil`
			`}`

			`// process repeatedly removes an item from the queue and runs an audit.`
			`func (worker *Worker) process(ctx context.Context) (err error) {`
			`defer mon.Task()(&ctx)(&err)`

satellite/audit,storagenode/gracefulexit: fixes to limiter Ensure we don't rely on limiter to wait multiple times. Change-Id: I75d48420236216d4c2fc6fa99293f51f80cd9c33 2022-08-01 13:00:23 +01:00			`limiter := sync2.NewLimiter(worker.concurrency)`
			`defer limiter.Wait()`

satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`for {`
satellite/audit: use db for auditor queue As part of the effort of splitting out the auditor workers to their own process, we are transitioning the communication between the auditor chore and the verification workers to a queue implemented in the database, rather than the sequence of in-memory queues we used to use. This logical database is safely partitionable from the rest of satelliteDB. Refs: https://github.com/storj/storj/issues/5251 Change-Id: I6cd31ac5265423271fbafe6127a86172c5cb53dc 2022-11-11 23:11:40 +00:00			`segment, err := worker.queue.Next(ctx)`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`if err != nil {`
			`if ErrEmptyQueue.Has(err) {`
satellite/audit: use db for auditor queue As part of the effort of splitting out the auditor workers to their own process, we are transitioning the communication between the auditor chore and the verification workers to a queue implemented in the database, rather than the sequence of in-memory queues we used to use. This logical database is safely partitionable from the rest of satelliteDB. Refs: https://github.com/storj/storj/issues/5251 Change-Id: I6cd31ac5265423271fbafe6127a86172c5cb53dc 2022-11-11 23:11:40 +00:00			`return nil`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`}`
			`return err`
			`}`

satellite/audit,storagenode/gracefulexit: fixes to limiter Ensure we don't rely on limiter to wait multiple times. Change-Id: I75d48420236216d4c2fc6fa99293f51f80cd9c33 2022-08-01 13:00:23 +01:00			`started := limiter.Go(ctx, func() {`
satellite/audits: migrate to metabase Change-Id: I480c941820c5b0bd3af0539d92b548189211acb2 2020-12-14 12:54:22 +00:00			`err := worker.work(ctx, segment)`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`if err != nil {`
satellite/audit: change wording of audit worker error log "audit failed" is already used when a node fails an audit. That makes searching for this higher level audit worker error more difficult. Additionally, the presence of errors from the audit worker doesn't necessarily mean the audit failed. Reword the error message to "error(s) during audit" Change-Id: I0aab12c73c18d4bd962c5d8ac8a17cabcec022e6 2021-08-16 20:42:56 +01:00			`worker.log.Error("error(s) during audit",`
satellite/audit: move to segmentloop Change-Id: I10e63a1e4b6b62f5cd3098f5922ad3de1ec5af51 2021-06-14 16:40:46 +01:00			`zap.String("Segment StreamID", segment.StreamID.String()),`
			`zap.Uint64("Segment Position", segment.Position.Encode()),`
			`zap.Error(err))`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`}`
			`})`
satellite/audit,storagenode/gracefulexit: fixes to limiter Ensure we don't rely on limiter to wait multiple times. Change-Id: I75d48420236216d4c2fc6fa99293f51f80cd9c33 2022-08-01 13:00:23 +01:00			`if !started {`
			`return ctx.Err()`
			`}`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`}`
			`}`

satellite/audits: migrate to metabase Change-Id: I480c941820c5b0bd3af0539d92b548189211acb2 2020-12-14 12:54:22 +00:00			`func (worker *Worker) work(ctx context.Context, segment Segment) (err error) {`
satellite/audit: monitor worker function Change-Id: I94d1161deffe4ea9782abee1afbb5735f18aab44 2019-11-22 23:40:00 +00:00			`defer mon.Task()(&ctx)(&err)`

satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`var errlist errs.Group`

satellite/audit: Begin using piecewise reverifications This commit pulls the big switch! We have been setting up piecewise reverifications (the workers for which can be scaled independently of the core) for several commits now, and this commit actually begins making use of them. The core of this commit is fairly small, but it requires changing the semantics in all the tests that relate to reverifications, so it ends up being a large change. The changes to the tests are mostly mechanical and repetitive, though, so reviewers needn't worry much. Refs: https://github.com/storj/storj/issues/5230 Change-Id: Ibb421cc021664fd6e0096ffdf5b402a69b2d6f18 2022-11-22 23:18:01 +00:00			`// First, remove nodes that are contained. We do not (currently)`
			`// audit contained nodes for other pieces until we get an answer`
			`// for the contained audit. (I suspect this could change without`
			`// upsetting anything, but for now it's best to keep it the way`
			`// it was. -thepaul)`
			`skip, err := worker.verifier.IdentifyContainedNodes(ctx, segment)`
satellite/audit: use db for auditor queue As part of the effort of splitting out the auditor workers to their own process, we are transitioning the communication between the auditor chore and the verification workers to a queue implemented in the database, rather than the sequence of in-memory queues we used to use. This logical database is safely partitionable from the rest of satelliteDB. Refs: https://github.com/storj/storj/issues/5251 Change-Id: I6cd31ac5265423271fbafe6127a86172c5cb53dc 2022-11-11 23:11:40 +00:00			`if err != nil {`
			`if metabase.ErrSegmentNotFound.Has(err) {`
			`// no need to add this error; Verify() will encounter it again`
			`// and will handle the verification job as appropriate.`
			`err = nil`
			`} else {`
			`errlist.Add(err)`
			`}`
			`}`

			`// Next, audit the remaining nodes that are not in containment mode.`
satellite/audit: Begin using piecewise reverifications This commit pulls the big switch! We have been setting up piecewise reverifications (the workers for which can be scaled independently of the core) for several commits now, and this commit actually begins making use of them. The core of this commit is fairly small, but it requires changing the semantics in all the tests that relate to reverifications, so it ends up being a large change. The changes to the tests are mostly mechanical and repetitive, though, so reviewers needn't worry much. Refs: https://github.com/storj/storj/issues/5230 Change-Id: Ibb421cc021664fd6e0096ffdf5b402a69b2d6f18 2022-11-22 23:18:01 +00:00			`report, err := worker.verifier.Verify(ctx, segment, skip)`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`if err != nil {`
satellite/audit: split out auditor process This change creates a new independent process, the 'auditor', comparable to the repairer, gc, and api processes. This will allow auditors to be scaled independently of the core. Refs: https://github.com/storj/storj/issues/5251 Change-Id: I8a29eeb0a6e35753dfa0eab5c1246048065d1e91 2022-10-12 21:33:31 +01:00			`if metabase.ErrSegmentNotFound.Has(err) {`
			`// no need to add this error; Verify() will encounter it again`
			`// and will handle the verification job as appropriate.`
			`err = nil`
			`} else {`
			`errlist.Add(err)`
			`}`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00			`}`

satellite/audit: teach Reporter about piecewise audits The Reporter is responsible for processing results from auditing operations, logging the results, disqualifying nodes that reached the maximum reverification count, and passing the results on to the reputation system. In this commit, we extend the Reporter so that it knows how to process the results of piecewise reverification audits. We also change most reporter-related tests so that reverifications happen as piecewise reverification audits, exercising the new code. Note that piecewise reverification audits are not yet being done outside of tests. In a later commit, we will switch from doing segmentwise reverifications to piecewise reverifications, as part of the audit-scaling effort. Refs: https://github.com/storj/storj/issues/5230 Change-Id: I9438164ce1ea4d9a1790d18d0e1046a8eb04d8e9 2022-11-22 21:55:19 +00:00			`worker.reporter.RecordAudits(ctx, report)`
satellite/audit: worker now verifies and reverifies (#2965) 2019-09-11 23:37:01 +01:00
			`return errlist.Err()`
satellite/audit: create the audit queue, chore, and worker (#2888) 2019-09-05 16:40:52 +01:00			`}`