Implement checker observer (#2620)

Integrate checker into metainfo loop
This commit is contained in:
Maximillian von Briesen 2019-08-01 14:44:32 -04:00 committed by GitHub
parent 51833d0650
commit 4547084f26
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 163 additions and 201 deletions

View File

@ -436,10 +436,12 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, config *Config, ve
log.Debug("Setting up datarepair")
// TODO: simplify argument list somehow
peer.Repair.Checker = checker.NewChecker(
peer.Metainfo.Service,
peer.Log.Named("checker"),
peer.DB.RepairQueue(),
peer.Overlay.Service, peer.DB.Irreparable(),
0, peer.Log.Named("checker"),
peer.DB.Irreparable(),
peer.Metainfo.Service,
peer.Metainfo.Loop,
peer.Overlay.Service,
config.Checker)
peer.Repair.Repairer = repairer.NewService(

View File

@ -7,12 +7,12 @@ import (
"context"
"time"
"github.com/gogo/protobuf/proto"
"github.com/zeebo/errs"
"go.uber.org/zap"
"golang.org/x/sync/errgroup"
monkit "gopkg.in/spacemonkeygo/monkit.v2"
"storj.io/storj/internal/errs2"
"storj.io/storj/internal/sync2"
"storj.io/storj/pkg/pb"
"storj.io/storj/pkg/storj"
@ -20,7 +20,6 @@ import (
"storj.io/storj/satellite/overlay"
"storj.io/storj/satellite/repair/irreparable"
"storj.io/storj/satellite/repair/queue"
"storj.io/storj/storage"
)
// Error is a standard error class for this package.
@ -48,30 +47,29 @@ type durabilityStats struct {
// Checker contains the information needed to do checks for missing pieces
type Checker struct {
metainfo *metainfo.Service
lastChecked string
repairQueue queue.RepairQueue
nodestate *ReliabilityCache
irrdb irreparable.DB
logger *zap.Logger
repairQueue queue.RepairQueue
irrdb irreparable.DB
metainfo *metainfo.Service
metaLoop *metainfo.Loop
nodestate *ReliabilityCache
Loop sync2.Cycle
IrreparableLoop sync2.Cycle
monStats durabilityStats
}
// NewChecker creates a new instance of checker
func NewChecker(metainfo *metainfo.Service, repairQueue queue.RepairQueue, overlay *overlay.Cache, irrdb irreparable.DB, limit int, logger *zap.Logger, config Config) *Checker {
// TODO: reorder arguments
func NewChecker(logger *zap.Logger, repairQueue queue.RepairQueue, irrdb irreparable.DB, metainfo *metainfo.Service, metaLoop *metainfo.Loop, overlay *overlay.Cache, config Config) *Checker {
return &Checker{
metainfo: metainfo,
lastChecked: "",
repairQueue: repairQueue,
nodestate: NewReliabilityCache(overlay, config.ReliabilityCacheStaleness),
irrdb: irrdb,
logger: logger,
logger: logger,
repairQueue: repairQueue,
irrdb: irrdb,
metainfo: metainfo,
metaLoop: metaLoop,
nodestate: NewReliabilityCache(overlay, config.ReliabilityCacheStaleness),
Loop: *sync2.NewCycle(config.Interval),
IrreparableLoop: *sync2.NewCycle(config.IrreparableInterval),
monStats: durabilityStats{},
}
}
@ -107,53 +105,27 @@ func (checker *Checker) Close() error {
func (checker *Checker) IdentifyInjuredSegments(ctx context.Context) (err error) {
defer mon.Task()(&ctx)(&err)
err = checker.metainfo.Iterate(ctx, "", checker.lastChecked, true, false,
func(ctx context.Context, it storage.Iterator) error {
var item storage.ListItem
defer func() {
var nextItem storage.ListItem
it.Next(ctx, &nextItem)
// start at the next item in the next call
checker.lastChecked = nextItem.Key.String()
// if we have finished iterating, send and reset durability stats
if checker.lastChecked == "" {
// send durability stats
mon.IntVal("remote_files_checked").Observe(checker.monStats.remoteFilesChecked)
mon.IntVal("remote_segments_checked").Observe(checker.monStats.remoteSegmentsChecked)
mon.IntVal("remote_segments_needing_repair").Observe(checker.monStats.remoteSegmentsNeedingRepair)
mon.IntVal("remote_segments_lost").Observe(checker.monStats.remoteSegmentsLost)
mon.IntVal("remote_files_lost").Observe(int64(len(checker.monStats.remoteSegmentInfo)))
// reset durability stats for next iteration
checker.monStats = durabilityStats{}
}
}()
for it.Next(ctx, &item) {
pointer := &pb.Pointer{}
err = proto.Unmarshal(item.Value, pointer)
if err != nil {
return Error.New("error unmarshalling pointer %s", err)
}
remote := pointer.GetRemote()
if remote == nil {
continue
}
err = checker.updateSegmentStatus(ctx, pointer, item.Key.String(), &checker.monStats)
if err != nil {
return err
}
}
return nil
},
)
observer := &checkerObserver{
repairQueue: checker.repairQueue,
irrdb: checker.irrdb,
nodestate: checker.nodestate,
monStats: durabilityStats{},
log: checker.logger,
}
err = checker.metaLoop.Join(ctx, observer)
if err != nil {
if !errs2.IsCanceled(err) {
checker.logger.Error("IdentifyInjuredSegments error", zap.Error(err))
}
return err
}
mon.IntVal("remote_files_checked").Observe(observer.monStats.remoteFilesChecked)
mon.IntVal("remote_segments_checked").Observe(observer.monStats.remoteSegmentsChecked)
mon.IntVal("remote_segments_needing_repair").Observe(observer.monStats.remoteSegmentsNeedingRepair)
mon.IntVal("remote_segments_lost").Observe(observer.monStats.remoteSegmentsLost)
mon.IntVal("remote_files_lost").Observe(int64(len(observer.monStats.remoteSegmentInfo)))
return nil
}
@ -167,7 +139,8 @@ func contains(a []string, x string) bool {
return false
}
func (checker *Checker) updateSegmentStatus(ctx context.Context, pointer *pb.Pointer, path string, monStats *durabilityStats) (err error) {
func (checker *Checker) updateIrreparableSegmentStatus(ctx context.Context, pointer *pb.Pointer, path string) (err error) {
// TODO figure out how to reduce duplicate code between here and checkerObs.RemoteSegment
defer mon.Task()(&ctx)(&err)
remote := pointer.GetRemote()
if remote == nil {
@ -182,35 +155,31 @@ func (checker *Checker) updateSegmentStatus(ctx context.Context, pointer *pb.Poi
missingPieces, err := checker.nodestate.MissingPieces(ctx, pointer.CreationDate, pieces)
if err != nil {
return Error.New("error getting missing pieces %s", err)
}
monStats.remoteSegmentsChecked++
pathElements := storj.SplitPath(path)
if len(pathElements) >= 2 && pathElements[1] == "l" {
monStats.remoteFilesChecked++
return errs.Combine(Error.New("error getting missing pieces"), err)
}
numHealthy := int32(len(pieces) - len(missingPieces))
redundancy := pointer.Remote.Redundancy
mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces)))
mon.IntVal("checker_segment_healthy_count").Observe(int64(numHealthy))
// we repair when the number of healthy pieces is less than or equal to the repair threshold
// except for the case when the repair and success thresholds are the same (a case usually seen during testing)
if numHealthy > redundancy.MinReq && numHealthy <= redundancy.RepairThreshold && numHealthy < redundancy.SuccessThreshold {
if len(missingPieces) == 0 {
checker.logger.Warn("Missing pieces is zero in checker, but this should be impossible -- bad redundancy scheme.")
checker.logger.Error("Missing pieces is zero in checker, but this should be impossible -- bad redundancy scheme:",
zap.String("path", path),
zap.Int32("min", redundancy.MinReq),
zap.Int32("repair", redundancy.RepairThreshold),
zap.Int32("success", redundancy.SuccessThreshold),
zap.Int32("total", redundancy.Total))
return nil
}
monStats.remoteSegmentsNeedingRepair++
err = checker.repairQueue.Insert(ctx, &pb.InjuredSegment{
Path: []byte(path),
LostPieces: missingPieces,
InsertedTime: time.Now().UTC(),
})
if err != nil {
return Error.New("error adding injured segment to queue %s", err)
return errs.Combine(Error.New("error adding injured segment to queue"), err)
}
// delete always returns nil when something was deleted and also when element didn't exists
@ -221,19 +190,8 @@ func (checker *Checker) updateSegmentStatus(ctx context.Context, pointer *pb.Poi
// we need one additional piece for error correction. If only the minimum is remaining the file can't be repaired and is lost.
// except for the case when minimum and repair thresholds are the same (a case usually seen during testing)
} else if numHealthy <= redundancy.MinReq && numHealthy < redundancy.RepairThreshold {
// check to make sure there are at least *4* path elements. the first three
// are project, segment, and bucket name, but we want to make sure we're talking
// about an actual object, and that there's an object name specified
if len(pathElements) >= 4 {
project, bucketName, segmentpath := pathElements[0], pathElements[2], pathElements[3]
lostSegInfo := storj.JoinPaths(project, bucketName, segmentpath)
if contains(monStats.remoteSegmentInfo, lostSegInfo) == false {
monStats.remoteSegmentInfo = append(monStats.remoteSegmentInfo, lostSegInfo)
}
}
monStats.remoteSegmentsLost++
// make an entry in to the irreparable table
// make an entry into the irreparable table
segmentInfo := &pb.IrreparableSegment{
Path: []byte(path),
SegmentDetail: pointer,
@ -245,12 +203,123 @@ func (checker *Checker) updateSegmentStatus(ctx context.Context, pointer *pb.Poi
// add the entry if new or update attempt count if already exists
err := checker.irrdb.IncrementRepairAttempts(ctx, segmentInfo)
if err != nil {
return Error.New("error handling irreparable segment to queue %s", err)
return errs.Combine(Error.New("error handling irreparable segment to queue"), err)
}
}
return nil
}
// checkerObserver implements the metainfo loop Observer interface
type checkerObserver struct {
repairQueue queue.RepairQueue
irrdb irreparable.DB
nodestate *ReliabilityCache
monStats durabilityStats
log *zap.Logger
}
func (obs *checkerObserver) RemoteSegment(ctx context.Context, path storj.Path, pointer *pb.Pointer) (err error) {
defer mon.Task()(&ctx)(&err)
obs.monStats.remoteSegmentsChecked++
remote := pointer.GetRemote()
pieces := remote.GetRemotePieces()
if pieces == nil {
obs.log.Debug("no pieces on remote segment")
return nil
}
missingPieces, err := obs.nodestate.MissingPieces(ctx, pointer.CreationDate, pieces)
if err != nil {
return errs.Combine(Error.New("error getting missing pieces"), err)
}
numHealthy := int32(len(pieces) - len(missingPieces))
mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces)))
mon.IntVal("checker_segment_healthy_count").Observe(int64(numHealthy))
redundancy := pointer.Remote.Redundancy
// we repair when the number of healthy pieces is less than or equal to the repair threshold
// except for the case when the repair and success thresholds are the same (a case usually seen during testing)
if numHealthy > redundancy.MinReq && numHealthy <= redundancy.RepairThreshold && numHealthy < redundancy.SuccessThreshold {
if len(missingPieces) == 0 {
obs.log.Error("Missing pieces is zero in checker, but this should be impossible -- bad redundancy scheme:",
zap.String("path", path),
zap.Int32("min", redundancy.MinReq),
zap.Int32("repair", redundancy.RepairThreshold),
zap.Int32("success", redundancy.SuccessThreshold),
zap.Int32("total", redundancy.Total))
return nil
}
obs.monStats.remoteSegmentsNeedingRepair++
err = obs.repairQueue.Insert(ctx, &pb.InjuredSegment{
Path: []byte(path),
LostPieces: missingPieces,
InsertedTime: time.Now().UTC(),
})
if err != nil {
obs.log.Error("error adding injured segment to queue", zap.Error(err))
return nil
}
// delete always returns nil when something was deleted and also when element didn't exists
err = obs.irrdb.Delete(ctx, []byte(path))
if err != nil {
obs.log.Error("error deleting entry from irreparable db", zap.Error(err))
return nil
}
// we need one additional piece for error correction. If only the minimum is remaining the file can't be repaired and is lost.
// except for the case when minimum and repair thresholds are the same (a case usually seen during testing)
} else if numHealthy <= redundancy.MinReq && numHealthy < redundancy.RepairThreshold {
pathElements := storj.SplitPath(path)
// check to make sure there are at least *4* path elements. the first three
// are project, segment, and bucket name, but we want to make sure we're talking
// about an actual object, and that there's an object name specified
if len(pathElements) >= 4 {
project, bucketName, segmentpath := pathElements[0], pathElements[2], pathElements[3]
lostSegInfo := storj.JoinPaths(project, bucketName, segmentpath)
if contains(obs.monStats.remoteSegmentInfo, lostSegInfo) == false {
obs.monStats.remoteSegmentInfo = append(obs.monStats.remoteSegmentInfo, lostSegInfo)
}
}
obs.monStats.remoteSegmentsLost++
// make an entry into the irreparable table
segmentInfo := &pb.IrreparableSegment{
Path: []byte(path),
SegmentDetail: pointer,
LostPieces: int32(len(missingPieces)),
LastRepairAttempt: time.Now().Unix(),
RepairAttemptCount: int64(1),
}
// add the entry if new or update attempt count if already exists
err := obs.irrdb.IncrementRepairAttempts(ctx, segmentInfo)
if err != nil {
obs.log.Error("error handling irreparable segment to queue", zap.Error(err))
return nil
}
}
return nil
}
func (obs *checkerObserver) RemoteObject(ctx context.Context, path storj.Path, pointer *pb.Pointer) (err error) {
defer mon.Task()(&ctx)(&err)
obs.monStats.remoteFilesChecked++
return nil
}
func (obs *checkerObserver) InlineSegment(ctx context.Context, path storj.Path, pointer *pb.Pointer) (err error) {
defer mon.Task()(&ctx)(&err)
return nil
}
// IrreparableProcess iterates over all items in the irreparabledb. If an item can
// now be repaired then it is added to a worker queue.
func (checker *Checker) IrreparableProcess(ctx context.Context) (err error) {
@ -261,7 +330,7 @@ func (checker *Checker) IrreparableProcess(ctx context.Context) (err error) {
for {
segments, err := checker.irrdb.GetLimited(ctx, limit, lastSeenSegmentPath)
if err != nil {
return Error.New("error reading segment from the queue %s", err)
return errs.Combine(Error.New("error reading segment from the queue"), err)
}
// zero segments returned with nil err
@ -272,7 +341,7 @@ func (checker *Checker) IrreparableProcess(ctx context.Context) (err error) {
lastSeenSegmentPath = segments[len(segments)-1].Path
for _, segment := range segments {
err = checker.updateSegmentStatus(ctx, segment.GetSegmentDetail(), string(segment.GetPath()), &durabilityStats{})
err = checker.updateIrreparableSegmentStatus(ctx, segment.GetSegmentDetail(), string(segment.GetPath()))
if err != nil {
checker.logger.Error("irrepair segment checker failed: ", zap.Error(err))
}

View File

@ -4,21 +4,18 @@
package checker_test
import (
"bytes"
"context"
"fmt"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/zeebo/errs"
"storj.io/storj/internal/testcontext"
"storj.io/storj/internal/testplanet"
"storj.io/storj/internal/teststorj"
"storj.io/storj/pkg/pb"
"storj.io/storj/pkg/storj"
"storj.io/storj/satellite/repair/checker"
"storj.io/storj/storage"
)
@ -67,6 +64,7 @@ func TestIdentifyIrreparableSegments(t *testing.T) {
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
checker := planet.Satellites[0].Repair.Checker
checker.Loop.Stop()
checker.IrreparableLoop.Stop()
const numberOfNodes = 10
pieces := make([]*pb.RemotePiece, 0, numberOfNodes)
@ -206,110 +204,3 @@ func makePointer(t *testing.T, planet *testplanet.Planet, pieceID string, create
err := pointerdb.Put(ctx, pieceID, pointer)
require.NoError(t, err)
}
func TestCheckerResume(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
repairQueue := &mockRepairQueue{}
irrepairQueue := planet.Satellites[0].DB.Irreparable()
config := checker.Config{
Interval: 30 * time.Second,
IrreparableInterval: 15 * time.Second,
ReliabilityCacheStaleness: 5 * time.Minute,
}
c := checker.NewChecker(planet.Satellites[0].Metainfo.Service, repairQueue, planet.Satellites[0].Overlay.Service, irrepairQueue, 0, nil, config)
// create pointer that needs repair
makePointer(t, planet, "a", true)
// create pointer that will cause an error
makePointer(t, planet, "b", true)
// create pointer that needs repair
makePointer(t, planet, "c", true)
// create pointer that will cause an error
makePointer(t, planet, "d", true)
err := c.IdentifyInjuredSegments(ctx)
require.Error(t, err)
// "a" should be the only segment in the repair queue
injuredSegment, err := repairQueue.Select(ctx)
require.NoError(t, err)
require.Equal(t, injuredSegment.Path, []byte("a"))
err = repairQueue.Delete(ctx, injuredSegment)
require.NoError(t, err)
injuredSegment, err = repairQueue.Select(ctx)
require.Error(t, err)
err = c.IdentifyInjuredSegments(ctx)
require.Error(t, err)
// "c" should be the only segment in the repair queue
injuredSegment, err = repairQueue.Select(ctx)
require.NoError(t, err)
require.Equal(t, injuredSegment.Path, []byte("c"))
err = repairQueue.Delete(ctx, injuredSegment)
require.NoError(t, err)
injuredSegment, err = repairQueue.Select(ctx)
require.Error(t, err)
err = c.IdentifyInjuredSegments(ctx)
require.Error(t, err)
// "a" should be the only segment in the repair queue
injuredSegment, err = repairQueue.Select(ctx)
require.NoError(t, err)
require.Equal(t, injuredSegment.Path, []byte("a"))
err = repairQueue.Delete(ctx, injuredSegment)
require.NoError(t, err)
injuredSegment, err = repairQueue.Select(ctx)
require.Error(t, err)
})
}
// mock repair queue used for TestCheckerResume
type mockRepairQueue struct {
injuredSegments []pb.InjuredSegment
}
func (mockRepairQueue *mockRepairQueue) Insert(ctx context.Context, s *pb.InjuredSegment) error {
if bytes.Equal(s.Path, []byte("b")) || bytes.Equal(s.Path, []byte("d")) {
return errs.New("mock Insert error")
}
mockRepairQueue.injuredSegments = append(mockRepairQueue.injuredSegments, *s)
return nil
}
func (mockRepairQueue *mockRepairQueue) Select(ctx context.Context) (*pb.InjuredSegment, error) {
if len(mockRepairQueue.injuredSegments) == 0 {
return &pb.InjuredSegment{}, errs.New("mock Select error")
}
s := mockRepairQueue.injuredSegments[0]
return &s, nil
}
func (mockRepairQueue *mockRepairQueue) Delete(ctx context.Context, s *pb.InjuredSegment) error {
var toDelete int
found := false
for i, seg := range mockRepairQueue.injuredSegments {
if bytes.Equal(seg.Path, s.Path) {
toDelete = i
found = true
break
}
}
if !found {
return errs.New("mock Delete error")
}
mockRepairQueue.injuredSegments = append(mockRepairQueue.injuredSegments[:toDelete], mockRepairQueue.injuredSegments[toDelete+1:]...)
return nil
}
func (mockRepairQueue *mockRepairQueue) SelectN(ctx context.Context, limit int) ([]pb.InjuredSegment, error) {
return []pb.InjuredSegment{}, errs.New("mock SelectN error")
}
func (mockRepairQueue *mockRepairQueue) Count(ctx context.Context) (int, error) {
return len(mockRepairQueue.injuredSegments), nil
}