diff --git a/satellite/peer.go b/satellite/peer.go
index 1c24a7859..081c777c9 100644
--- a/satellite/peer.go
+++ b/satellite/peer.go
@@ -436,10 +436,12 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, config *Config, ve
 		log.Debug("Setting up datarepair")
 		// TODO: simplify argument list somehow
 		peer.Repair.Checker = checker.NewChecker(
-			peer.Metainfo.Service,
+			peer.Log.Named("checker"),
 			peer.DB.RepairQueue(),
-			peer.Overlay.Service, peer.DB.Irreparable(),
-			0, peer.Log.Named("checker"),
+			peer.DB.Irreparable(),
+			peer.Metainfo.Service,
+			peer.Metainfo.Loop,
+			peer.Overlay.Service,
 			config.Checker)
 
 		peer.Repair.Repairer = repairer.NewService(
diff --git a/satellite/repair/checker/checker.go b/satellite/repair/checker/checker.go
index 28b0961f8..a04a86d46 100644
--- a/satellite/repair/checker/checker.go
+++ b/satellite/repair/checker/checker.go
@@ -7,12 +7,12 @@ import (
 	"context"
 	"time"
 
-	"github.com/gogo/protobuf/proto"
 	"github.com/zeebo/errs"
 	"go.uber.org/zap"
 	"golang.org/x/sync/errgroup"
 	monkit "gopkg.in/spacemonkeygo/monkit.v2"
 
+	"storj.io/storj/internal/errs2"
 	"storj.io/storj/internal/sync2"
 	"storj.io/storj/pkg/pb"
 	"storj.io/storj/pkg/storj"
@@ -20,7 +20,6 @@ import (
 	"storj.io/storj/satellite/overlay"
 	"storj.io/storj/satellite/repair/irreparable"
 	"storj.io/storj/satellite/repair/queue"
-	"storj.io/storj/storage"
 )
 
 // Error is a standard error class for this package.
@@ -48,30 +47,29 @@ type durabilityStats struct {
 }
 
 // Checker contains the information needed to do checks for missing pieces
 type Checker struct {
-	metainfo        *metainfo.Service
-	lastChecked     string
-	repairQueue     queue.RepairQueue
-	nodestate       *ReliabilityCache
-	irrdb           irreparable.DB
 	logger          *zap.Logger
+	repairQueue     queue.RepairQueue
+	irrdb           irreparable.DB
+	metainfo        *metainfo.Service
+	metaLoop        *metainfo.Loop
+	nodestate       *ReliabilityCache
 	Loop            sync2.Cycle
 	IrreparableLoop sync2.Cycle
-	monStats        durabilityStats
 }
 
 // NewChecker creates a new instance of checker
-func NewChecker(metainfo *metainfo.Service, repairQueue queue.RepairQueue, overlay *overlay.Cache, irrdb irreparable.DB, limit int, logger *zap.Logger, config Config) *Checker {
-	// TODO: reorder arguments
+func NewChecker(logger *zap.Logger, repairQueue queue.RepairQueue, irrdb irreparable.DB, metainfo *metainfo.Service, metaLoop *metainfo.Loop, overlay *overlay.Cache, config Config) *Checker {
 	return &Checker{
-		metainfo:    metainfo,
-		lastChecked: "",
-		repairQueue: repairQueue,
-		nodestate:   NewReliabilityCache(overlay, config.ReliabilityCacheStaleness),
-		irrdb:       irrdb,
-		logger:      logger,
+		logger: logger,
+
+		repairQueue: repairQueue,
+		irrdb:       irrdb,
+		metainfo:    metainfo,
+		metaLoop:    metaLoop,
+		nodestate:   NewReliabilityCache(overlay, config.ReliabilityCacheStaleness),
+
 		Loop:            *sync2.NewCycle(config.Interval),
 		IrreparableLoop: *sync2.NewCycle(config.IrreparableInterval),
-		monStats:        durabilityStats{},
 	}
 }
@@ -107,53 +105,27 @@ func (checker *Checker) Close() error {
 func (checker *Checker) IdentifyInjuredSegments(ctx context.Context) (err error) {
 	defer mon.Task()(&ctx)(&err)
 
-	err = checker.metainfo.Iterate(ctx, "", checker.lastChecked, true, false,
-		func(ctx context.Context, it storage.Iterator) error {
-			var item storage.ListItem
-
-			defer func() {
-				var nextItem storage.ListItem
-				it.Next(ctx, &nextItem)
-				// start at the next item in the next call
-				checker.lastChecked = nextItem.Key.String()
-				// if we have finished iterating, send and reset durability stats
-				if checker.lastChecked == "" {
-					// send durability stats
mon.IntVal("remote_files_checked").Observe(checker.monStats.remoteFilesChecked) - mon.IntVal("remote_segments_checked").Observe(checker.monStats.remoteSegmentsChecked) - mon.IntVal("remote_segments_needing_repair").Observe(checker.monStats.remoteSegmentsNeedingRepair) - mon.IntVal("remote_segments_lost").Observe(checker.monStats.remoteSegmentsLost) - mon.IntVal("remote_files_lost").Observe(int64(len(checker.monStats.remoteSegmentInfo))) - - // reset durability stats for next iteration - checker.monStats = durabilityStats{} - } - }() - - for it.Next(ctx, &item) { - pointer := &pb.Pointer{} - - err = proto.Unmarshal(item.Value, pointer) - if err != nil { - return Error.New("error unmarshalling pointer %s", err) - } - remote := pointer.GetRemote() - if remote == nil { - continue - } - - err = checker.updateSegmentStatus(ctx, pointer, item.Key.String(), &checker.monStats) - if err != nil { - return err - } - } - return nil - }, - ) + observer := &checkerObserver{ + repairQueue: checker.repairQueue, + irrdb: checker.irrdb, + nodestate: checker.nodestate, + monStats: durabilityStats{}, + log: checker.logger, + } + err = checker.metaLoop.Join(ctx, observer) if err != nil { + if !errs2.IsCanceled(err) { + checker.logger.Error("IdentifyInjuredSegments error", zap.Error(err)) + } return err } + mon.IntVal("remote_files_checked").Observe(observer.monStats.remoteFilesChecked) + mon.IntVal("remote_segments_checked").Observe(observer.monStats.remoteSegmentsChecked) + mon.IntVal("remote_segments_needing_repair").Observe(observer.monStats.remoteSegmentsNeedingRepair) + mon.IntVal("remote_segments_lost").Observe(observer.monStats.remoteSegmentsLost) + mon.IntVal("remote_files_lost").Observe(int64(len(observer.monStats.remoteSegmentInfo))) + return nil } @@ -167,7 +139,8 @@ func contains(a []string, x string) bool { return false } -func (checker *Checker) updateSegmentStatus(ctx context.Context, pointer *pb.Pointer, path string, monStats *durabilityStats) (err error) { +func (checker *Checker) updateIrreparableSegmentStatus(ctx context.Context, pointer *pb.Pointer, path string) (err error) { + // TODO figure out how to reduce duplicate code between here and checkerObs.RemoteSegment defer mon.Task()(&ctx)(&err) remote := pointer.GetRemote() if remote == nil { @@ -182,35 +155,31 @@ func (checker *Checker) updateSegmentStatus(ctx context.Context, pointer *pb.Poi missingPieces, err := checker.nodestate.MissingPieces(ctx, pointer.CreationDate, pieces) if err != nil { - return Error.New("error getting missing pieces %s", err) - } - - monStats.remoteSegmentsChecked++ - pathElements := storj.SplitPath(path) - if len(pathElements) >= 2 && pathElements[1] == "l" { - monStats.remoteFilesChecked++ + return errs.Combine(Error.New("error getting missing pieces"), err) } numHealthy := int32(len(pieces) - len(missingPieces)) redundancy := pointer.Remote.Redundancy - mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces))) - mon.IntVal("checker_segment_healthy_count").Observe(int64(numHealthy)) // we repair when the number of healthy pieces is less than or equal to the repair threshold // except for the case when the repair and success thresholds are the same (a case usually seen during testing) if numHealthy > redundancy.MinReq && numHealthy <= redundancy.RepairThreshold && numHealthy < redundancy.SuccessThreshold { if len(missingPieces) == 0 { - checker.logger.Warn("Missing pieces is zero in checker, but this should be impossible -- bad redundancy scheme.") + checker.logger.Error("Missing pieces is zero in 
+				zap.String("path", path),
+				zap.Int32("min", redundancy.MinReq),
+				zap.Int32("repair", redundancy.RepairThreshold),
+				zap.Int32("success", redundancy.SuccessThreshold),
+				zap.Int32("total", redundancy.Total))
 			return nil
 		}
-		monStats.remoteSegmentsNeedingRepair++
 		err = checker.repairQueue.Insert(ctx, &pb.InjuredSegment{
 			Path:         []byte(path),
 			LostPieces:   missingPieces,
 			InsertedTime: time.Now().UTC(),
 		})
 		if err != nil {
-			return Error.New("error adding injured segment to queue %s", err)
+			return errs.Combine(Error.New("error adding injured segment to queue"), err)
 		}
 
 		// delete always returns nil when something was deleted and also when element didn't exists
@@ -221,19 +190,8 @@ func (checker *Checker) updateSegmentStatus(ctx context.Context, pointer *pb.Poi
 	// we need one additional piece for error correction. If only the minimum is remaining the file can't be repaired and is lost.
 	// except for the case when minimum and repair thresholds are the same (a case usually seen during testing)
 	} else if numHealthy <= redundancy.MinReq && numHealthy < redundancy.RepairThreshold {
-		// check to make sure there are at least *4* path elements. the first three
-		// are project, segment, and bucket name, but we want to make sure we're talking
-		// about an actual object, and that there's an object name specified
-		if len(pathElements) >= 4 {
-			project, bucketName, segmentpath := pathElements[0], pathElements[2], pathElements[3]
-			lostSegInfo := storj.JoinPaths(project, bucketName, segmentpath)
-			if contains(monStats.remoteSegmentInfo, lostSegInfo) == false {
-				monStats.remoteSegmentInfo = append(monStats.remoteSegmentInfo, lostSegInfo)
-			}
-		}
-		monStats.remoteSegmentsLost++
-		// make an entry in to the irreparable table
+		// make an entry into the irreparable table
 		segmentInfo := &pb.IrreparableSegment{
 			Path:          []byte(path),
 			SegmentDetail: pointer,
@@ -245,12 +203,123 @@ func (checker *Checker) updateSegmentStatus(ctx context.Context, pointer *pb.Poi
 		// add the entry if new or update attempt count if already exists
 		err := checker.irrdb.IncrementRepairAttempts(ctx, segmentInfo)
 		if err != nil {
-			return Error.New("error handling irreparable segment to queue %s", err)
+			return errs.Combine(Error.New("error handling irreparable segment to queue"), err)
 		}
 	}
 
 	return nil
 }
 
+// checkerObserver implements the metainfo loop Observer interface
+type checkerObserver struct {
+	repairQueue queue.RepairQueue
+	irrdb       irreparable.DB
+	nodestate   *ReliabilityCache
+	monStats    durabilityStats
+	log         *zap.Logger
+}
+
+func (obs *checkerObserver) RemoteSegment(ctx context.Context, path storj.Path, pointer *pb.Pointer) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
+	obs.monStats.remoteSegmentsChecked++
+	remote := pointer.GetRemote()
+
+	pieces := remote.GetRemotePieces()
+	if pieces == nil {
+		obs.log.Debug("no pieces on remote segment")
+		return nil
+	}
+
+	missingPieces, err := obs.nodestate.MissingPieces(ctx, pointer.CreationDate, pieces)
+	if err != nil {
+		return errs.Combine(Error.New("error getting missing pieces"), err)
+	}
+
+	numHealthy := int32(len(pieces) - len(missingPieces))
+	mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces)))
+	mon.IntVal("checker_segment_healthy_count").Observe(int64(numHealthy))
+
+	redundancy := pointer.Remote.Redundancy
+
+	// we repair when the number of healthy pieces is less than or equal to the repair threshold
+	// except for the case when the repair and success thresholds are the same (a case usually seen during testing)
+	if numHealthy > redundancy.MinReq && numHealthy <= redundancy.RepairThreshold && numHealthy < redundancy.SuccessThreshold {
+		if len(missingPieces) == 0 {
+			obs.log.Error("Missing pieces is zero in checker, but this should be impossible -- bad redundancy scheme:",
+				zap.String("path", path),
+				zap.Int32("min", redundancy.MinReq),
+				zap.Int32("repair", redundancy.RepairThreshold),
+				zap.Int32("success", redundancy.SuccessThreshold),
+				zap.Int32("total", redundancy.Total))
+			return nil
+		}
+		obs.monStats.remoteSegmentsNeedingRepair++
+		err = obs.repairQueue.Insert(ctx, &pb.InjuredSegment{
+			Path:         []byte(path),
+			LostPieces:   missingPieces,
+			InsertedTime: time.Now().UTC(),
+		})
+		if err != nil {
+			obs.log.Error("error adding injured segment to queue", zap.Error(err))
+			return nil
+		}
+
+		// delete always returns nil when something was deleted and also when element didn't exist
+		err = obs.irrdb.Delete(ctx, []byte(path))
+		if err != nil {
+			obs.log.Error("error deleting entry from irreparable db", zap.Error(err))
+			return nil
+		}
+	// we need one additional piece for error correction. If only the minimum is remaining the file can't be repaired and is lost.
+	// except for the case when minimum and repair thresholds are the same (a case usually seen during testing)
+	} else if numHealthy <= redundancy.MinReq && numHealthy < redundancy.RepairThreshold {
+		pathElements := storj.SplitPath(path)
+
+		// check to make sure there are at least *4* path elements. the first three
+		// are project, segment, and bucket name, but we want to make sure we're talking
+		// about an actual object, and that there's an object name specified
+		if len(pathElements) >= 4 {
+			project, bucketName, segmentpath := pathElements[0], pathElements[2], pathElements[3]
+			lostSegInfo := storj.JoinPaths(project, bucketName, segmentpath)
+			if !contains(obs.monStats.remoteSegmentInfo, lostSegInfo) {
+				obs.monStats.remoteSegmentInfo = append(obs.monStats.remoteSegmentInfo, lostSegInfo)
+			}
+		}
+
+		obs.monStats.remoteSegmentsLost++
+		// make an entry into the irreparable table
+		segmentInfo := &pb.IrreparableSegment{
+			Path:               []byte(path),
+			SegmentDetail:      pointer,
+			LostPieces:         int32(len(missingPieces)),
+			LastRepairAttempt:  time.Now().Unix(),
+			RepairAttemptCount: int64(1),
+		}
+
+		// add the entry if new or update attempt count if already exists
+		err := obs.irrdb.IncrementRepairAttempts(ctx, segmentInfo)
+		if err != nil {
+			obs.log.Error("error handling irreparable segment to queue", zap.Error(err))
+			return nil
+		}
+	}
+
+	return nil
+}
+
+func (obs *checkerObserver) RemoteObject(ctx context.Context, path storj.Path, pointer *pb.Pointer) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
+	obs.monStats.remoteFilesChecked++
+
+	return nil
+}
+
+func (obs *checkerObserver) InlineSegment(ctx context.Context, path storj.Path, pointer *pb.Pointer) (err error) {
+	defer mon.Task()(&ctx)(&err)
+	return nil
+}
+
 // IrreparableProcess iterates over all items in the irreparabledb. If an item can
 // now be repaired then it is added to a worker queue.
 func (checker *Checker) IrreparableProcess(ctx context.Context) (err error) {
@@ -261,7 +330,7 @@ func (checker *Checker) IrreparableProcess(ctx context.Context) (err error) {
 	for {
 		segments, err := checker.irrdb.GetLimited(ctx, limit, lastSeenSegmentPath)
 		if err != nil {
-			return Error.New("error reading segment from the queue %s", err)
+			return errs.Combine(Error.New("error reading segment from the queue"), err)
 		}
 
 		// zero segments returned with nil err
@@ -272,7 +341,7 @@ func (checker *Checker) IrreparableProcess(ctx context.Context) (err error) {
 		lastSeenSegmentPath = segments[len(segments)-1].Path
 
 		for _, segment := range segments {
-			err = checker.updateSegmentStatus(ctx, segment.GetSegmentDetail(), string(segment.GetPath()), &durabilityStats{})
+			err = checker.updateIrreparableSegmentStatus(ctx, segment.GetSegmentDetail(), string(segment.GetPath()))
 			if err != nil {
 				checker.logger.Error("irrepair segment checker failed: ", zap.Error(err))
 			}
diff --git a/satellite/repair/checker/checker_test.go b/satellite/repair/checker/checker_test.go
index 533f2c1e1..6126a0860 100644
--- a/satellite/repair/checker/checker_test.go
+++ b/satellite/repair/checker/checker_test.go
@@ -4,21 +4,18 @@
 package checker_test
 
 import (
-	"bytes"
 	"context"
 	"fmt"
 	"testing"
 	"time"
 
 	"github.com/stretchr/testify/require"
-	"github.com/zeebo/errs"
 
 	"storj.io/storj/internal/testcontext"
 	"storj.io/storj/internal/testplanet"
 	"storj.io/storj/internal/teststorj"
 	"storj.io/storj/pkg/pb"
 	"storj.io/storj/pkg/storj"
-	"storj.io/storj/satellite/repair/checker"
 	"storj.io/storj/storage"
 )
 
@@ -67,6 +64,7 @@ func TestIdentifyIrreparableSegments(t *testing.T) {
 	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
 		checker := planet.Satellites[0].Repair.Checker
 		checker.Loop.Stop()
+		checker.IrreparableLoop.Stop()
 
 		const numberOfNodes = 10
 		pieces := make([]*pb.RemotePiece, 0, numberOfNodes)
@@ -206,110 +204,3 @@ func makePointer(t *testing.T, planet *testplanet.Planet, pieceID string, create
 	err := pointerdb.Put(ctx, pieceID, pointer)
 	require.NoError(t, err)
 }
-
-func TestCheckerResume(t *testing.T) {
-	testplanet.Run(t, testplanet.Config{
-		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 0,
-	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
-		repairQueue := &mockRepairQueue{}
-		irrepairQueue := planet.Satellites[0].DB.Irreparable()
-		config := checker.Config{
-			Interval:                  30 * time.Second,
-			IrreparableInterval:       15 * time.Second,
-			ReliabilityCacheStaleness: 5 * time.Minute,
-		}
-		c := checker.NewChecker(planet.Satellites[0].Metainfo.Service, repairQueue, planet.Satellites[0].Overlay.Service, irrepairQueue, 0, nil, config)
-
-		// create pointer that needs repair
-		makePointer(t, planet, "a", true)
-		// create pointer that will cause an error
-		makePointer(t, planet, "b", true)
-		// create pointer that needs repair
-		makePointer(t, planet, "c", true)
-		// create pointer that will cause an error
-		makePointer(t, planet, "d", true)
-
-		err := c.IdentifyInjuredSegments(ctx)
-		require.Error(t, err)
-
-		// "a" should be the only segment in the repair queue
-		injuredSegment, err := repairQueue.Select(ctx)
-		require.NoError(t, err)
-		require.Equal(t, injuredSegment.Path, []byte("a"))
-		err = repairQueue.Delete(ctx, injuredSegment)
-		require.NoError(t, err)
-		injuredSegment, err = repairQueue.Select(ctx)
-		require.Error(t, err)
-
-		err = c.IdentifyInjuredSegments(ctx)
-		require.Error(t, err)
-
-		// "c" should be the only segment in the repair queue
-		injuredSegment, err = repairQueue.Select(ctx)
-		require.NoError(t, err)
-		require.Equal(t, injuredSegment.Path, []byte("c"))
-		err = repairQueue.Delete(ctx, injuredSegment)
-		require.NoError(t, err)
-		injuredSegment, err = repairQueue.Select(ctx)
-		require.Error(t, err)
-
-		err = c.IdentifyInjuredSegments(ctx)
-		require.Error(t, err)
-
-		// "a" should be the only segment in the repair queue
-		injuredSegment, err = repairQueue.Select(ctx)
-		require.NoError(t, err)
-		require.Equal(t, injuredSegment.Path, []byte("a"))
-		err = repairQueue.Delete(ctx, injuredSegment)
-		require.NoError(t, err)
-		injuredSegment, err = repairQueue.Select(ctx)
-		require.Error(t, err)
-	})
-}
-
-// mock repair queue used for TestCheckerResume
-type mockRepairQueue struct {
-	injuredSegments []pb.InjuredSegment
-}
-
-func (mockRepairQueue *mockRepairQueue) Insert(ctx context.Context, s *pb.InjuredSegment) error {
-	if bytes.Equal(s.Path, []byte("b")) || bytes.Equal(s.Path, []byte("d")) {
-		return errs.New("mock Insert error")
-	}
-	mockRepairQueue.injuredSegments = append(mockRepairQueue.injuredSegments, *s)
-	return nil
-}
-
-func (mockRepairQueue *mockRepairQueue) Select(ctx context.Context) (*pb.InjuredSegment, error) {
-	if len(mockRepairQueue.injuredSegments) == 0 {
-		return &pb.InjuredSegment{}, errs.New("mock Select error")
-	}
-	s := mockRepairQueue.injuredSegments[0]
-	return &s, nil
-}
-
-func (mockRepairQueue *mockRepairQueue) Delete(ctx context.Context, s *pb.InjuredSegment) error {
-	var toDelete int
-	found := false
-	for i, seg := range mockRepairQueue.injuredSegments {
-		if bytes.Equal(seg.Path, s.Path) {
-			toDelete = i
-			found = true
-			break
-		}
-	}
-	if !found {
-		return errs.New("mock Delete error")
-	}
-
-	mockRepairQueue.injuredSegments = append(mockRepairQueue.injuredSegments[:toDelete], mockRepairQueue.injuredSegments[toDelete+1:]...)
-	return nil
-}
-
-func (mockRepairQueue *mockRepairQueue) SelectN(ctx context.Context, limit int) ([]pb.InjuredSegment, error) {
-	return []pb.InjuredSegment{}, errs.New("mock SelectN error")
-}
-
-func (mockRepairQueue *mockRepairQueue) Count(ctx context.Context) (int, error) {
-	return len(mockRepairQueue.injuredSegments), nil
-}
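Reviewer note, not part of the patch: the checker now scans pointers by joining the shared metainfo loop (checker.metaLoop.Join(ctx, observer)) instead of driving its own resumable metainfo.Iterate pass, which is also why TestCheckerResume and its mockRepairQueue are removed above. A minimal sketch of the observer contract this code appears to satisfy, inferred only from the methods checkerObserver implements; the authoritative interface lives in storj.io/storj/satellite/metainfo, so the shape and comments below are assumptions:

	// Observer (assumed shape) receives one callback per pointer while the
	// metainfo loop makes a full pass over pointerdb. Join presumably blocks
	// until that pass finishes, which is why IdentifyInjuredSegments can
	// report the durability counters exactly once after Join returns rather
	// than resetting checker state between resumable iterations.
	type Observer interface {
		// called for every segment whose pointer carries remote pieces
		RemoteSegment(ctx context.Context, path storj.Path, pointer *pb.Pointer) error
		// apparently called once per remote object; the checker uses it to
		// count remote files (the old code counted the "l" last-segment key)
		RemoteObject(ctx context.Context, path storj.Path, pointer *pb.Pointer) error
		// called for inline segments; the checker ignores these
		InlineSegment(ctx context.Context, path storj.Path, pointer *pb.Pointer) error
	}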