storj/cmd/tools/segment-verify/process.go
paul cannon 6e1554652a cmd/tools/segment-verify: handle dq'd nodes
Previously, if any pieces are still on disqualified nodes, this tool
would treat those pieces as fine (if the disqualified node is still
online) or temporarily unavailable (if the disqualified node is
offline). Instead, we should treat such pieces as lost.

This also fixes a slight problem with the code that handles a broken
alias. This is not likely to happen, but if we do see an alias that is
not in the alias map, we return an error instead of nil.

Change-Id: Ib4e2e729ef0535dd7bd9ce2f621680d9f959891c
2023-01-04 17:54:03 +00:00

198 lines
4.8 KiB
Go

// Copyright (C) 2022 Storj Labs, Inc.
// See LICENSE for copying information.
package main
import (
"context"
"sync"
"go.uber.org/zap"
"storj.io/common/storj"
"storj.io/common/sync2"
"storj.io/storj/satellite/metabase"
)
// Verify runs verification over segments in up to two passes.
//
// Every segment first gets its Status.Retry counter seeded from
// config.Check, or from the number of alias pieces when config.Check is zero.
// The segments are then batched and verified. Segments whose Status.Retry is
// still positive afterwards are verified a second time, but only when
// config.Check is positive.
//
// Any error from batching or verifying is returned wrapped in Error.
func (service *Service) Verify(ctx context.Context, segments []*Segment) (err error) {
	defer mon.Task()(&ctx)(&err)

	// Seed the retry counters before the first pass.
	for _, seg := range segments {
		wanted := service.config.Check
		if wanted == 0 {
			wanted = len(seg.AliasPieces)
		}
		seg.Status.Retry = int32(wanted)
	}

	firstPass, err := service.CreateBatches(ctx, segments)
	if err != nil {
		return Error.Wrap(err)
	}
	if err = service.VerifyBatches(ctx, firstPass); err != nil {
		return Error.Wrap(err)
	}

	// Collect the segments that still have retries outstanding.
	pending := []*Segment{}
	for _, seg := range segments {
		if seg.Status.Retry > 0 {
			pending = append(pending, seg)
		}
	}
	if len(pending) == 0 || service.config.Check <= 0 {
		return nil
	}

	for _, seg := range pending {
		// Flip the piece order in place so that the second pass picks
		// different nodes than the first one did.
		pieces := seg.AliasPieces
		for i := 0; i < len(pieces)/2; i++ {
			k := len(pieces) - 1 - i
			pieces[i], pieces[k] = pieces[k], pieces[i]
		}
		// Priority nodes were already checked in the first pass, drop them.
		service.removePriorityPieces(seg)
	}

	secondPass, err := service.CreateBatches(ctx, pending)
	if err != nil {
		return Error.Wrap(err)
	}
	if err = service.VerifyBatches(ctx, secondPass); err != nil {
		return Error.Wrap(err)
	}
	return nil
}
// VerifyBatches verifies each batch against the node its alias refers to.
//
// Node information is looked up sequentially on the calling goroutine; the
// verification itself is handed to a limiter sized by config.Concurrency.
// A batch whose node lookup fails with ErrNoSuchNode is logged and skipped.
// Any other lookup error is returned immediately, wrapped in Error; note that
// this early return does not wait for verifications that were already started.
//
// Failures of individual verifications are logged and never returned. They
// only feed the offline bookkeeping (offlineNodes / offlineCount).
func (service *Service) VerifyBatches(ctx context.Context, batches []*Batch) error {
	defer mon.Task()(&ctx)(nil)

	// offlineMu guards the offline bookkeeping touched by the worker closures.
	var offlineMu sync.Mutex
	limiter := sync2.NewLimiter(service.config.Concurrency)

	for _, b := range batches {
		batch := b // per-iteration copy captured by the closure below

		info, err := service.GetNodeInfo(ctx, batch.Alias)
		switch {
		case err == nil:
			// proceed with verification
		case ErrNoSuchNode.Has(err):
			service.log.Error("will not verify batch; consider pieces lost",
				zap.Int("alias", int(batch.Alias)),
				zap.Error(err))
			continue
		default:
			return Error.Wrap(err)
		}

		skipThrottle := service.priorityNodes.Contains(batch.Alias)

		limiter.Go(ctx, func() {
			verified, err := service.verifier.Verify(ctx, batch.Alias, info.NodeURL, info.Version, batch.Items, skipThrottle)
			if err == nil {
				// A successful batch works off one earlier offline strike, if any.
				offlineMu.Lock()
				if service.offlineCount[batch.Alias] > 0 {
					service.offlineCount[batch.Alias]--
				}
				offlineMu.Unlock()
				return
			}

			if ErrNodeOffline.Has(err) {
				offlineMu.Lock()
				switch {
				case verified == 0:
					// Nothing could be verified at all: mark the node offline right away.
					service.offlineNodes.Add(batch.Alias)
				default:
					// Partial progress: count a strike and mark the node offline
					// once the configured limit (if any) is reached.
					service.offlineCount[batch.Alias]++
					if service.config.MaxOffline > 0 && service.offlineCount[batch.Alias] >= service.config.MaxOffline {
						service.offlineNodes.Add(batch.Alias)
					}
				}
				offlineMu.Unlock()
			}
			service.log.Error("verifying a batch failed", zap.Error(err))
		})
	}

	limiter.Wait()
	return nil
}
// convertAliasToNodeURL converts a node alias to a node URL, using a cache if
// needed.
//
// On a cache miss the alias is resolved to a node ID through service.aliasMap.
// If the alias is unknown there, the alias map is refreshed once from the
// metabase and the lookup is retried. The node is then fetched from the
// overlay, and both its URL and its version are cached
// (service.aliasToNodeURL and service.nodesVersionMap).
//
// Errors:
//   - ErrNoSuchNode when the alias is still unknown after the refresh, or when
//     the node is disqualified or has finished exiting.
//   - Error-wrapped failures from the metabase or the overlay.
func (service *Service) convertAliasToNodeURL(ctx context.Context, alias metabase.NodeAlias) (_ storj.NodeURL, err error) {
	if cached, ok := service.aliasToNodeURL[alias]; ok {
		return cached, nil
	}

	nodeID, ok := service.aliasMap.Node(alias)
	if !ok {
		latest, err := service.metabase.LatestNodesAliasMap(ctx)
		// Check the error of the refresh itself. The previous code tested
		// `!ok` here, which is always true in this branch, so the function
		// returned before the refreshed map could be stored or consulted.
		if err != nil {
			return storj.NodeURL{}, Error.Wrap(err)
		}
		service.aliasMap = latest

		nodeID, ok = service.aliasMap.Node(alias)
		if !ok {
			return storj.NodeURL{}, ErrNoSuchNode.New("no node has alias %d", alias)
		}
	}

	info, err := service.overlay.Get(ctx, nodeID)
	if err != nil {
		return storj.NodeURL{}, Error.Wrap(err)
	}
	// Pieces on disqualified or exited nodes are treated as if the node did
	// not exist.
	if info.Disqualified != nil || info.ExitStatus.ExitFinishedAt != nil {
		return storj.NodeURL{}, ErrNoSuchNode.New("node %s is no longer on the network", nodeID.String())
	}

	// TODO: single responsibility?
	service.nodesVersionMap[alias] = info.Version.Version

	nodeURL := storj.NodeURL{
		ID:      info.Id,
		Address: info.Address.Address,
	}
	service.aliasToNodeURL[alias] = nodeURL
	return nodeURL, nil
}
// NodeInfo contains node information: the node's reported version and the
// URL used to reach it. It is the result type of GetNodeInfo.
type NodeInfo struct {
// Version is the node's version string as reported by the overlay.
Version string
// NodeURL holds the node's ID and address.
NodeURL storj.NodeURL
}
// GetNodeInfo returns the URL and version of the node behind alias.
//
// The URL comes from convertAliasToNodeURL. The version is taken from
// service.nodesVersionMap when present; otherwise the node is fetched from
// the overlay and its version is stored in that map for later calls.
// All errors are returned wrapped in Error together with a zero NodeInfo.
func (service *Service) GetNodeInfo(ctx context.Context, alias metabase.NodeAlias) (NodeInfo, error) {
	url, err := service.convertAliasToNodeURL(ctx, alias)
	if err != nil {
		return NodeInfo{}, Error.Wrap(err)
	}

	// Fast path: the version is already cached.
	if cached, found := service.nodesVersionMap[alias]; found {
		return NodeInfo{Version: cached, NodeURL: url}, nil
	}

	node, err := service.overlay.Get(ctx, url.ID)
	if err != nil {
		return NodeInfo{}, Error.Wrap(err)
	}

	ver := node.Version.Version
	service.nodesVersionMap[alias] = ver
	return NodeInfo{Version: ver, NodeURL: url}, nil
}