storj/cmd/tools/segment-verify/batch.go
paul cannon 2feb49afc3 cmd/tools/segment-verify: don't cache offline status forever
Because it was originally intended to work on only a few pieces from
each segment at a time, and would frequently have reset its list of
online nodes, segment-verify has been taking nodes out of its
onlineNodes set and never putting them back. This means that over a long
run in Check=0 mode, we end up treating more and more nodes as offline,
permanently. This trend obfuscates the number of missing pieces that
each segment really has, because we don't check pieces on offline nodes.

This commit changes the onlineNodes set to an "offlineNodes" set, with
an expiration time on the offline-ness quality. So nodes are added to
the offlineNodes set when we see they are offline, and then we only
treat them as offline for the next 30 minutes (configurable). After that
point, we will try connecting to them again.

Change-Id: I14f0332de25cdc6ef655f923739bcb4df71e079e
2023-01-03 23:11:42 +00:00
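
The commit message above describes replacing the long-lived onlineNodes set with an offlineNodes set whose entries expire after a configurable interval (30 minutes in the description above). Below is a minimal sketch of that idea, written as if it sat alongside batch.go in the same package; the offlineSet type and its method names are illustrative assumptions, not the actual segment-verify implementation.

package main

import (
	"time"

	"storj.io/storj/satellite/metabase"
)

// offlineSet remembers when each node was last observed offline; membership
// expires after ttl, so the node will be dialed again afterwards.
// (Hypothetical type for illustration only.)
type offlineSet struct {
	ttl  time.Duration
	seen map[metabase.NodeAlias]time.Time
}

func newOfflineSet(ttl time.Duration) *offlineSet {
	return &offlineSet{ttl: ttl, seen: map[metabase.NodeAlias]time.Time{}}
}

// Add marks alias as offline as of now.
func (s *offlineSet) Add(alias metabase.NodeAlias) { s.seen[alias] = time.Now() }

// Contains reports whether alias was marked offline within the last ttl.
// Expired entries are dropped so the node is treated as online again.
func (s *offlineSet) Contains(alias metabase.NodeAlias) bool {
	at, ok := s.seen[alias]
	if !ok {
		return false
	}
	if time.Since(at) > s.ttl {
		delete(s.seen, alias)
		return false
	}
	return true
}

Checking expiry lazily inside Contains keeps Add cheap and avoids a background sweeper; the actual code may organize this differently.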


// Copyright (C) 2022 Storj Labs, Inc.
// See LICENSE for copying information.

package main

import (
	"context"
	"sort"

	"go.uber.org/zap"

	"storj.io/storj/satellite/metabase"
)

// CreateBatches creates load-balanced queues of segments to verify.
func (service *Service) CreateBatches(ctx context.Context, segments []*Segment) (_ []*Batch, err error) {
	defer mon.Task()(&ctx)(&err)

	if len(segments) == 0 {
		return nil, Error.New("no segments")
	}

	// Remove offline nodes and prioritize nodes.
	for _, segment := range segments {
		service.selectOnlinePieces(segment)
		service.sortPriorityToFirst(segment)
	}

	batches := map[metabase.NodeAlias]*Batch{}
	enqueue := func(alias metabase.NodeAlias, segment *Segment) {
		q, ok := batches[alias]
		if !ok {
			q = &Batch{Alias: alias}
			batches[alias] = q
		}
		q.Items = append(q.Items, segment)
	}

	// Distribute things randomly into batches.
	// We assume that segment.AliasPieces is randomly ordered in terms of nodes.
	for _, segment := range segments {
		if len(segment.AliasPieces) < int(segment.Status.Retry) {
			if service.config.Check == 0 {
				// some pieces were removed in selectOnlinePieces. adjust the expected count.
				segment.Status.Retry = int32(len(segment.AliasPieces))
			} else {
				service.log.Error("segment contains too few pieces. skipping segment",
					zap.Int("num-pieces", len(segment.AliasPieces)),
					zap.Int32("expected", segment.Status.Retry),
					zap.Stringer("stream-id", segment.StreamID),
					zap.Uint64("position", segment.Position.Encode()))
				continue
			}
		}

		for _, piece := range segment.AliasPieces[:segment.Status.Retry] {
			enqueue(piece.Alias, segment)
		}
	}

	allQueues := []*Batch{}
	for _, q := range batches {
		allQueues = append(allQueues, q)
	}

	// sort queues by queue length descending.
	sort.Slice(allQueues, func(i, k int) bool {
		return allQueues[i].Len() >= allQueues[k].Len()
	})

	// try to redistribute segments in different slices
	// queue with length above 65% will be redistributed
	// to queues with length below 40%, but no more than 50%
	highIndex := len(allQueues) * (100 - 65) / 100
	midIndex := len(allQueues) * (100 - 50) / 100
	lowIndex := len(allQueues) * (100 - 40) / 100

	midLen := allQueues[midIndex].Len()
	highLen := allQueues[highIndex].Len()
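
	// Worked example: with 100 queues sorted longest first,
	// highIndex = 100*(100-65)/100 = 35, midIndex = 50, lowIndex = 60.
	// Segments beyond the first highLen items of each of the 35 largest queues
	// (skipping priority nodes) get moved into one of the 40 smallest queues
	// (allQueues[60:]) whose node holds another piece of that segment; a small
	// queue stops accepting segments once it reaches midLen, the length of the
	// queue at the halfway point.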
	smallBatches := mapFromBatches(allQueues[lowIndex:])

	// iterate over all queues
	for _, large := range allQueues[:highIndex] {
		// don't redistribute priority nodes
		if service.priorityNodes.Contains(large.Alias) {
			continue
		}

		newItems := large.Items[:highLen]
	nextSegment:
		for _, segment := range large.Items[highLen:] {
			// try to find a piece that can be moved into a small batch.
			for _, piece := range segment.AliasPieces[segment.Status.Retry:] {
				if q, ok := smallBatches[piece.Alias]; ok {
					// move to the other queue
					q.Items = append(q.Items, segment)
					if q.Len() >= midLen {
						delete(smallBatches, piece.Alias)
					}
					continue nextSegment
				}
			}
			// keep the segment in the slice,
			// because we didn't find a good alternative
			newItems = append(newItems, segment)
		}
		large.Items = newItems

		if len(smallBatches) == 0 {
			break
		}
	}

	return allQueues, nil
}

// selectOnlinePieces modifies slice such that it only contains online pieces.
func (service *Service) selectOnlinePieces(segment *Segment) {
	for i, x := range segment.AliasPieces {
		if !service.offlineNodes.Contains(x.Alias) && !service.ignoreNodes.Contains(x.Alias) {
			continue
		}

		// found an offline node, start removing
		rs := segment.AliasPieces[:i]
		for _, x := range segment.AliasPieces[i+1:] {
			if !service.offlineNodes.Contains(x.Alias) && !service.ignoreNodes.Contains(x.Alias) {
				rs = append(rs, x)
			}
		}
		segment.AliasPieces = rs
		return
	}
}

// removePriorityPieces modifies slice such that it only contains non-priority pieces.
func (service *Service) removePriorityPieces(segment *Segment) {
	target := 0
	for _, x := range segment.AliasPieces {
		if service.priorityNodes.Contains(x.Alias) {
			continue
		}
		segment.AliasPieces[target] = x
		target++
	}
	segment.AliasPieces = segment.AliasPieces[:target]
}

// sortPriorityToFirst moves priority node pieces to the front of the list.
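// For example, with priority nodes {B, D}, pieces [A B C D] are reordered to
// [B D C A]: priority pieces keep their relative order, while non-priority
// pieces may be reordered by the swaps.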
func (service *Service) sortPriorityToFirst(segment *Segment) {
	xs := segment.AliasPieces
	target := 0
	for i, x := range xs {
		if service.priorityNodes.Contains(x.Alias) {
			xs[target], xs[i] = xs[i], xs[target]
			target++
		}
	}
}

// mapFromBatches creates a map from the specified batches.
func mapFromBatches(batches []*Batch) map[metabase.NodeAlias]*Batch {
	xs := make(map[metabase.NodeAlias]*Batch, len(batches))
	for _, b := range batches {
		xs[b.Alias] = b
	}
	return xs
}