2019-01-24 20:15:10 +00:00
|
|
|
// Copyright (C) 2019 Storj Labs, Inc.
|
2018-10-02 20:46:29 +01:00
|
|
|
// See LICENSE for copying information.
|
|
|
|
|
|
|
|
package checker
|
2018-10-03 19:35:56 +01:00
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2018-10-30 19:16:40 +00:00
|
|
|
"time"
|
2018-10-04 22:40:34 +01:00
|
|
|
|
2018-11-20 18:29:07 +00:00
|
|
|
"github.com/gogo/protobuf/proto"
|
2019-01-23 19:58:44 +00:00
|
|
|
"github.com/zeebo/errs"
|
2018-10-04 22:40:34 +01:00
|
|
|
"go.uber.org/zap"
|
2019-01-23 19:58:44 +00:00
|
|
|
monkit "gopkg.in/spacemonkeygo/monkit.v2"
|
2018-10-04 22:40:34 +01:00
|
|
|
|
2019-02-11 21:06:39 +00:00
|
|
|
"storj.io/storj/internal/sync2"
|
2018-12-10 19:08:45 +00:00
|
|
|
"storj.io/storj/pkg/datarepair/irreparable"
|
2018-10-09 17:09:33 +01:00
|
|
|
"storj.io/storj/pkg/datarepair/queue"
|
2019-03-23 08:06:11 +00:00
|
|
|
"storj.io/storj/pkg/overlay"
|
2018-10-09 17:09:33 +01:00
|
|
|
"storj.io/storj/pkg/pb"
|
|
|
|
"storj.io/storj/pkg/pointerdb"
|
2018-11-29 18:39:27 +00:00
|
|
|
"storj.io/storj/pkg/storj"
|
2018-10-09 17:09:33 +01:00
|
|
|
"storj.io/storj/storage"
|
2018-10-03 19:35:56 +01:00
|
|
|
)
|
|
|
|
|
2019-01-23 19:58:44 +00:00
|
|
|
// Error is a standard error class for this package.
|
|
|
|
var (
|
|
|
|
Error = errs.Class("checker error")
|
|
|
|
mon = monkit.Package()
|
|
|
|
)
|
|
|
|
|
|
|
|
// Config holds the user-configurable settings of the checker.
type Config struct {
	// Interval is how frequently the checker should audit segments.
	Interval time.Duration `help:"how frequently checker should audit segments" default:"30s"`
}
|
|
|
|
|
2018-10-09 17:09:33 +01:00
|
|
|
// Checker contains the information needed to do checks for missing pieces
|
2019-02-11 21:06:39 +00:00
|
|
|
type Checker struct {
|
2019-01-19 18:58:53 +00:00
|
|
|
pointerdb *pointerdb.Service
|
2018-12-21 15:11:19 +00:00
|
|
|
repairQueue queue.RepairQueue
|
2019-03-23 08:06:11 +00:00
|
|
|
overlay *overlay.Cache
|
2018-12-10 19:08:45 +00:00
|
|
|
irrdb irreparable.DB
|
2018-10-09 17:09:33 +01:00
|
|
|
logger *zap.Logger
|
2019-02-14 12:33:41 +00:00
|
|
|
Loop sync2.Cycle
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
|
|
|
|
2019-01-18 13:54:08 +00:00
|
|
|
// NewChecker creates a new instance of checker
|
2019-03-25 22:25:09 +00:00
|
|
|
func NewChecker(pointerdb *pointerdb.Service, repairQueue queue.RepairQueue, overlay *overlay.Cache, irrdb irreparable.DB, limit int, logger *zap.Logger, interval time.Duration) *Checker {
|
2019-01-18 13:54:08 +00:00
|
|
|
// TODO: reorder arguments
|
2019-02-11 21:06:39 +00:00
|
|
|
checker := &Checker{
|
2018-10-09 17:09:33 +01:00
|
|
|
pointerdb: pointerdb,
|
|
|
|
repairQueue: repairQueue,
|
|
|
|
overlay: overlay,
|
2018-12-04 16:26:30 +00:00
|
|
|
irrdb: irrdb,
|
2018-10-09 17:09:33 +01:00
|
|
|
logger: logger,
|
2019-02-14 12:33:41 +00:00
|
|
|
Loop: *sync2.NewCycle(interval),
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
2019-02-11 21:06:39 +00:00
|
|
|
return checker
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
|
|
|
|
2018-11-01 14:03:45 +00:00
|
|
|
// Run the checker loop
|
2019-02-11 21:06:39 +00:00
|
|
|
func (checker *Checker) Run(ctx context.Context) (err error) {
|
2018-11-01 14:03:45 +00:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
2019-02-14 12:33:41 +00:00
|
|
|
return checker.Loop.Run(ctx, func(ctx context.Context) error {
|
2019-02-11 21:06:39 +00:00
|
|
|
err := checker.IdentifyInjuredSegments(ctx)
|
2018-11-01 14:03:45 +00:00
|
|
|
if err != nil {
|
2019-02-11 21:06:39 +00:00
|
|
|
checker.logger.Error("error with injured segments identification: ", zap.Error(err))
|
2018-11-01 14:03:45 +00:00
|
|
|
}
|
2019-04-01 10:16:17 +01:00
|
|
|
return nil
|
2019-02-11 21:06:39 +00:00
|
|
|
})
|
2018-11-01 14:03:45 +00:00
|
|
|
}
|
|
|
|
|
2019-02-14 12:33:41 +00:00
|
|
|
// Close halts the Checker loop
|
|
|
|
func (checker *Checker) Close() error {
|
|
|
|
checker.Loop.Close()
|
|
|
|
return nil
|
|
|
|
}
|
2019-01-18 13:54:08 +00:00
|
|
|
|
|
|
|
// IdentifyInjuredSegments checks for missing pieces off of the pointerdb and overlay cache
|
2019-02-11 21:06:39 +00:00
|
|
|
func (checker *Checker) IdentifyInjuredSegments(ctx context.Context) (err error) {
|
2018-10-09 17:09:33 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
2019-02-26 15:17:51 +00:00
|
|
|
var remoteSegmentsChecked int64
|
|
|
|
var remoteSegmentsNeedingRepair int64
|
|
|
|
var remoteSegmentsLost int64
|
|
|
|
var remoteSegmentInfo []string
|
|
|
|
|
2019-02-11 21:06:39 +00:00
|
|
|
err = checker.pointerdb.Iterate("", "", true, false,
|
2018-10-09 17:09:33 +01:00
|
|
|
func(it storage.Iterator) error {
|
|
|
|
var item storage.ListItem
|
2019-02-14 12:33:41 +00:00
|
|
|
for it.Next(&item) {
|
2018-10-09 17:09:33 +01:00
|
|
|
pointer := &pb.Pointer{}
|
2018-12-12 15:39:16 +00:00
|
|
|
|
2018-10-09 17:09:33 +01:00
|
|
|
err = proto.Unmarshal(item.Value, pointer)
|
|
|
|
if err != nil {
|
|
|
|
return Error.New("error unmarshalling pointer %s", err)
|
|
|
|
}
|
2018-12-12 15:39:16 +00:00
|
|
|
|
2018-11-20 15:54:22 +00:00
|
|
|
remote := pointer.GetRemote()
|
|
|
|
if remote == nil {
|
|
|
|
continue
|
|
|
|
}
|
2018-12-12 15:39:16 +00:00
|
|
|
|
2018-11-20 15:54:22 +00:00
|
|
|
pieces := remote.GetRemotePieces()
|
|
|
|
if pieces == nil {
|
2019-02-11 21:06:39 +00:00
|
|
|
checker.logger.Debug("no pieces on remote segment")
|
2018-11-20 15:54:22 +00:00
|
|
|
continue
|
|
|
|
}
|
2018-12-12 15:39:16 +00:00
|
|
|
|
2018-11-29 18:39:27 +00:00
|
|
|
var nodeIDs storj.NodeIDList
|
2018-10-09 17:09:33 +01:00
|
|
|
for _, p := range pieces {
|
2018-11-29 18:39:27 +00:00
|
|
|
nodeIDs = append(nodeIDs, p.NodeId)
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
2018-12-06 18:51:23 +00:00
|
|
|
|
|
|
|
// Find all offline nodes
|
2019-03-23 08:06:11 +00:00
|
|
|
offlineNodes, err := checker.overlay.OfflineNodes(ctx, nodeIDs)
|
2018-10-09 17:09:33 +01:00
|
|
|
if err != nil {
|
2018-11-08 16:18:28 +00:00
|
|
|
return Error.New("error getting offline nodes %s", err)
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
2018-12-06 18:51:23 +00:00
|
|
|
|
2019-02-11 21:06:39 +00:00
|
|
|
invalidNodes, err := checker.invalidNodes(ctx, nodeIDs)
|
2018-12-06 18:51:23 +00:00
|
|
|
if err != nil {
|
|
|
|
return Error.New("error getting invalid nodes %s", err)
|
|
|
|
}
|
|
|
|
|
2019-04-08 20:46:23 +01:00
|
|
|
missingIndices := combineOfflineWithInvalid(offlineNodes, invalidNodes)
|
|
|
|
var missingPieces []int32
|
|
|
|
for _, i := range missingIndices {
|
|
|
|
missingPieces = append(missingPieces, pieces[i].GetPieceNum())
|
|
|
|
}
|
2018-12-06 18:51:23 +00:00
|
|
|
|
2019-02-26 15:17:51 +00:00
|
|
|
remoteSegmentsChecked++
|
2018-10-09 17:09:33 +01:00
|
|
|
numHealthy := len(nodeIDs) - len(missingPieces)
|
2019-04-08 20:46:23 +01:00
|
|
|
if (int32(numHealthy) >= pointer.Remote.Redundancy.MinReq) && (int32(numHealthy) <= pointer.Remote.Redundancy.RepairThreshold) {
|
2019-02-26 15:17:51 +00:00
|
|
|
remoteSegmentsNeedingRepair++
|
2019-04-16 19:14:09 +01:00
|
|
|
err = checker.repairQueue.Insert(ctx, &pb.InjuredSegment{
|
2018-10-09 17:09:33 +01:00
|
|
|
Path: string(item.Key),
|
|
|
|
LostPieces: missingPieces,
|
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
return Error.New("error adding injured segment to queue %s", err)
|
|
|
|
}
|
2018-12-04 16:26:30 +00:00
|
|
|
} else if int32(numHealthy) < pointer.Remote.Redundancy.MinReq {
|
2019-02-26 15:17:51 +00:00
|
|
|
pathElements := storj.SplitPath(storj.Path(item.Key))
|
|
|
|
// check to make sure there are at least *4* path elements. the first three
|
|
|
|
// are project, segment, and bucket name, but we want to make sure we're talking
|
|
|
|
// about an actual object, and that there's an object name specified
|
|
|
|
if len(pathElements) >= 4 {
|
|
|
|
project, bucketName, segmentpath := pathElements[0], pathElements[2], pathElements[3]
|
|
|
|
lostSegInfo := storj.JoinPaths(project, bucketName, segmentpath)
|
|
|
|
if contains(remoteSegmentInfo, lostSegInfo) == false {
|
|
|
|
remoteSegmentInfo = append(remoteSegmentInfo, lostSegInfo)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-03-23 08:06:11 +00:00
|
|
|
// TODO: irreparable segment should be using storj.NodeID or something, since at the point of repair
|
|
|
|
// it may have been already repaired once.
|
|
|
|
|
2019-02-26 15:17:51 +00:00
|
|
|
remoteSegmentsLost++
|
2018-12-04 16:26:30 +00:00
|
|
|
// make an entry in to the irreparable table
|
2019-03-15 20:21:52 +00:00
|
|
|
segmentInfo := &pb.IrreparableSegment{
|
|
|
|
Path: item.Key,
|
|
|
|
SegmentDetail: pointer,
|
|
|
|
LostPieces: int32(len(missingPieces)),
|
|
|
|
LastRepairAttempt: time.Now().Unix(),
|
|
|
|
RepairAttemptCount: int64(1),
|
2018-12-04 16:26:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
//add the entry if new or update attempt count if already exists
|
2019-02-11 21:06:39 +00:00
|
|
|
err := checker.irrdb.IncrementRepairAttempts(ctx, segmentInfo)
|
2018-12-04 16:26:30 +00:00
|
|
|
if err != nil {
|
|
|
|
return Error.New("error handling irreparable segment to queue %s", err)
|
|
|
|
}
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
},
|
|
|
|
)
|
2019-02-26 15:17:51 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
mon.IntVal("remote_segments_checked").Observe(remoteSegmentsChecked)
|
|
|
|
mon.IntVal("remote_segments_needing_repair").Observe(remoteSegmentsNeedingRepair)
|
|
|
|
mon.IntVal("remote_segments_lost").Observe(remoteSegmentsLost)
|
|
|
|
mon.IntVal("remote_files_lost").Observe(int64(len(remoteSegmentInfo)))
|
|
|
|
|
|
|
|
return nil
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
|
|
|
|
2019-03-25 22:25:09 +00:00
|
|
|
// Find invalidNodes by checking the audit results that are place in overlay
|
2019-03-23 08:06:11 +00:00
|
|
|
func (checker *Checker) invalidNodes(ctx context.Context, nodeIDs storj.NodeIDList) (invalidNodes []int, err error) {
|
2018-12-06 18:51:23 +00:00
|
|
|
// filter if nodeIDs have invalid pieces from auditing results
|
2019-03-25 22:25:09 +00:00
|
|
|
maxStats := &overlay.NodeStats{
|
|
|
|
AuditSuccessRatio: 0, // TODO: update when we have stats added to overlay
|
|
|
|
UptimeRatio: 0, // TODO: update when we have stats added to overlay
|
2018-12-06 18:51:23 +00:00
|
|
|
}
|
|
|
|
|
2019-03-25 22:25:09 +00:00
|
|
|
invalidIDs, err := checker.overlay.FindInvalidNodes(ctx, nodeIDs, maxStats)
|
2018-12-06 18:51:23 +00:00
|
|
|
if err != nil {
|
2019-03-25 22:25:09 +00:00
|
|
|
return nil, Error.New("error getting valid nodes from overlay %s", err)
|
2018-12-06 18:51:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
invalidNodesMap := make(map[storj.NodeID]bool)
|
2018-12-19 18:44:03 +00:00
|
|
|
for _, invalidID := range invalidIDs {
|
2018-12-06 18:51:23 +00:00
|
|
|
invalidNodesMap[invalidID] = true
|
|
|
|
}
|
|
|
|
|
|
|
|
for i, nID := range nodeIDs {
|
|
|
|
if invalidNodesMap[nID] {
|
2019-03-23 08:06:11 +00:00
|
|
|
invalidNodes = append(invalidNodes, i)
|
2018-12-06 18:51:23 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return invalidNodes, nil
|
|
|
|
}
|
|
|
|
|
2019-03-25 22:25:09 +00:00
|
|
|
// combineOfflineWithInvalid merges the indexes of offline nodes with the
// indexes of nodes marked invalid by the overlay. Every offline index comes
// first, in the order given, followed by each invalid index that is not also
// an offline index. The result is nil when nothing is appended.
func combineOfflineWithInvalid(offlineNodes []int, invalidNodes []int) (missingPieces []int32) {
	seenOffline := make(map[int]struct{}, len(offlineNodes))
	for _, idx := range offlineNodes {
		seenOffline[idx] = struct{}{}
		missingPieces = append(missingPieces, int32(idx))
	}

	for _, idx := range invalidNodes {
		if _, isOffline := seenOffline[idx]; isOffline {
			continue
		}
		missingPieces = append(missingPieces, int32(idx))
	}

	return missingPieces
}
|
2019-02-26 15:17:51 +00:00
|
|
|
|
|
|
|
// contains reports whether x is equal to at least one element of a.
func contains(a []string, x string) bool {
	for i := 0; i < len(a); i++ {
		if a[i] == x {
			return true
		}
	}
	return false
}
|