// Copyright (C) 2018 Storj Labs, Inc.
// See LICENSE for copying information.
|
|
|
|
|
|
|
|
package checker
|
2018-10-03 19:35:56 +01:00
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2018-10-30 19:16:40 +00:00
|
|
|
"time"
|
2018-10-04 22:40:34 +01:00
|
|
|
|
2018-11-20 18:29:07 +00:00
|
|
|
"github.com/gogo/protobuf/proto"
|
2018-10-04 22:40:34 +01:00
|
|
|
"go.uber.org/zap"
|
|
|
|
|
2018-12-10 19:08:45 +00:00
|
|
|
"storj.io/storj/pkg/datarepair/irreparable"
|
2018-10-09 17:09:33 +01:00
|
|
|
"storj.io/storj/pkg/datarepair/queue"
|
|
|
|
"storj.io/storj/pkg/pb"
|
|
|
|
"storj.io/storj/pkg/pointerdb"
|
2018-12-06 18:51:23 +00:00
|
|
|
"storj.io/storj/pkg/statdb"
|
2018-11-29 18:39:27 +00:00
|
|
|
"storj.io/storj/pkg/storj"
|
2018-10-09 17:09:33 +01:00
|
|
|
"storj.io/storj/storage"
|
2018-10-03 19:35:56 +01:00
|
|
|
)
|
|
|
|
|
2018-11-08 16:18:28 +00:00
|
|
|
// Checker is the interface implemented by the data repair checker.
type Checker interface {
	// Run executes the checker under ctx and reports the resulting error.
	Run(ctx context.Context) error
}
|
2018-10-09 17:09:33 +01:00
|
|
|
|
|
|
|
// Checker contains the information needed to do checks for missing pieces
|
2018-10-12 18:49:49 +01:00
|
|
|
type checker struct {
|
2018-12-14 20:17:30 +00:00
|
|
|
statdb statdb.DB
|
2018-10-09 17:09:33 +01:00
|
|
|
pointerdb *pointerdb.Server
|
|
|
|
repairQueue *queue.Queue
|
|
|
|
overlay pb.OverlayServer
|
2018-12-10 19:08:45 +00:00
|
|
|
irrdb irreparable.DB
|
2018-10-09 17:09:33 +01:00
|
|
|
limit int
|
|
|
|
logger *zap.Logger
|
2018-10-30 19:16:40 +00:00
|
|
|
ticker *time.Ticker
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
|
|
|
|
2018-12-04 16:26:30 +00:00
|
|
|
// newChecker creates a new instance of checker
|
2018-12-14 20:17:30 +00:00
|
|
|
func newChecker(pointerdb *pointerdb.Server, sdb statdb.DB, repairQueue *queue.Queue, overlay pb.OverlayServer, irrdb irreparable.DB, limit int, logger *zap.Logger, interval time.Duration) *checker {
|
2018-10-12 18:49:49 +01:00
|
|
|
return &checker{
|
2018-12-06 18:51:23 +00:00
|
|
|
statdb: sdb,
|
2018-10-09 17:09:33 +01:00
|
|
|
pointerdb: pointerdb,
|
|
|
|
repairQueue: repairQueue,
|
|
|
|
overlay: overlay,
|
2018-12-04 16:26:30 +00:00
|
|
|
irrdb: irrdb,
|
2018-10-09 17:09:33 +01:00
|
|
|
limit: limit,
|
|
|
|
logger: logger,
|
2018-10-30 19:16:40 +00:00
|
|
|
ticker: time.NewTicker(interval),
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-11-01 14:03:45 +00:00
|
|
|
// Run the checker loop
|
|
|
|
func (c *checker) Run(ctx context.Context) (err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
|
|
|
for {
|
2018-11-08 16:18:28 +00:00
|
|
|
err = c.identifyInjuredSegments(ctx)
|
2018-11-01 14:03:45 +00:00
|
|
|
if err != nil {
|
2018-12-12 21:24:08 +00:00
|
|
|
c.logger.Error("Checker failed", zap.Error(err))
|
2018-11-01 14:03:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
select {
|
|
|
|
case <-c.ticker.C: // wait for the next interval to happen
|
|
|
|
case <-ctx.Done(): // or the checker is canceled via context
|
|
|
|
return ctx.Err()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-11-08 16:18:28 +00:00
|
|
|
// identifyInjuredSegments checks for missing pieces off of the pointerdb and overlay cache
|
|
|
|
func (c *checker) identifyInjuredSegments(ctx context.Context) (err error) {
|
2018-10-09 17:09:33 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
|
|
|
err = c.pointerdb.Iterate(ctx, &pb.IterateRequest{Recurse: true},
|
|
|
|
func(it storage.Iterator) error {
|
|
|
|
var item storage.ListItem
|
2018-10-30 19:16:40 +00:00
|
|
|
lim := c.limit
|
|
|
|
if lim <= 0 || lim > storage.LookupLimit {
|
|
|
|
lim = storage.LookupLimit
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
2018-10-30 19:16:40 +00:00
|
|
|
for ; lim > 0 && it.Next(&item); lim-- {
|
2018-10-09 17:09:33 +01:00
|
|
|
pointer := &pb.Pointer{}
|
2018-12-12 15:39:16 +00:00
|
|
|
|
2018-10-09 17:09:33 +01:00
|
|
|
err = proto.Unmarshal(item.Value, pointer)
|
|
|
|
if err != nil {
|
|
|
|
return Error.New("error unmarshalling pointer %s", err)
|
|
|
|
}
|
2018-12-12 15:39:16 +00:00
|
|
|
|
2018-11-20 15:54:22 +00:00
|
|
|
remote := pointer.GetRemote()
|
|
|
|
if remote == nil {
|
|
|
|
continue
|
|
|
|
}
|
2018-12-12 15:39:16 +00:00
|
|
|
|
2018-11-20 15:54:22 +00:00
|
|
|
pieces := remote.GetRemotePieces()
|
|
|
|
if pieces == nil {
|
|
|
|
c.logger.Debug("no pieces on remote segment")
|
|
|
|
continue
|
|
|
|
}
|
2018-12-12 15:39:16 +00:00
|
|
|
|
2018-11-29 18:39:27 +00:00
|
|
|
var nodeIDs storj.NodeIDList
|
2018-10-09 17:09:33 +01:00
|
|
|
for _, p := range pieces {
|
2018-11-29 18:39:27 +00:00
|
|
|
nodeIDs = append(nodeIDs, p.NodeId)
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
2018-12-06 18:51:23 +00:00
|
|
|
|
|
|
|
// Find all offline nodes
|
|
|
|
offlineNodes, err := c.offlineNodes(ctx, nodeIDs)
|
2018-10-09 17:09:33 +01:00
|
|
|
if err != nil {
|
2018-11-08 16:18:28 +00:00
|
|
|
return Error.New("error getting offline nodes %s", err)
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
2018-12-06 18:51:23 +00:00
|
|
|
|
|
|
|
invalidNodes, err := c.invalidNodes(ctx, nodeIDs)
|
|
|
|
if err != nil {
|
|
|
|
return Error.New("error getting invalid nodes %s", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
missingPieces := combineOfflineWithInvalid(offlineNodes, invalidNodes)
|
|
|
|
|
2018-10-09 17:09:33 +01:00
|
|
|
numHealthy := len(nodeIDs) - len(missingPieces)
|
2018-12-04 16:26:30 +00:00
|
|
|
if (int32(numHealthy) >= pointer.Remote.Redundancy.MinReq) && (int32(numHealthy) < pointer.Remote.Redundancy.RepairThreshold) {
|
2018-10-09 17:09:33 +01:00
|
|
|
err = c.repairQueue.Enqueue(&pb.InjuredSegment{
|
|
|
|
Path: string(item.Key),
|
|
|
|
LostPieces: missingPieces,
|
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
return Error.New("error adding injured segment to queue %s", err)
|
|
|
|
}
|
2018-12-04 16:26:30 +00:00
|
|
|
} else if int32(numHealthy) < pointer.Remote.Redundancy.MinReq {
|
|
|
|
// make an entry in to the irreparable table
|
2018-12-10 19:08:45 +00:00
|
|
|
segmentInfo := &irreparable.RemoteSegmentInfo{
|
2018-12-04 16:26:30 +00:00
|
|
|
EncryptedSegmentPath: item.Key,
|
|
|
|
EncryptedSegmentDetail: item.Value,
|
|
|
|
LostPiecesCount: int64(len(missingPieces)),
|
|
|
|
RepairUnixSec: time.Now().Unix(),
|
|
|
|
RepairAttemptCount: int64(1),
|
|
|
|
}
|
|
|
|
|
|
|
|
//add the entry if new or update attempt count if already exists
|
|
|
|
err := c.irrdb.IncrementRepairAttempts(ctx, segmentInfo)
|
|
|
|
if err != nil {
|
|
|
|
return Error.New("error handling irreparable segment to queue %s", err)
|
|
|
|
}
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
},
|
|
|
|
)
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2018-11-08 16:18:28 +00:00
|
|
|
// returns the indices of offline nodes
|
2018-11-29 18:39:27 +00:00
|
|
|
func (c *checker) offlineNodes(ctx context.Context, nodeIDs storj.NodeIDList) (offline []int32, err error) {
|
|
|
|
responses, err := c.overlay.BulkLookup(ctx, pb.NodeIDsToLookupRequests(nodeIDs))
|
2018-10-09 17:09:33 +01:00
|
|
|
if err != nil {
|
|
|
|
return []int32{}, err
|
|
|
|
}
|
2018-11-29 18:39:27 +00:00
|
|
|
nodes := pb.LookupResponsesToNodes(responses)
|
2018-10-09 17:09:33 +01:00
|
|
|
for i, n := range nodes {
|
|
|
|
if n == nil {
|
|
|
|
offline = append(offline, int32(i))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return offline, nil
|
|
|
|
}
|
2018-12-06 18:51:23 +00:00
|
|
|
|
|
|
|
// Find invalidNodes by checking the audit results that are place in statdb
|
|
|
|
func (c *checker) invalidNodes(ctx context.Context, nodeIDs storj.NodeIDList) (invalidNodes []int32, err error) {
|
|
|
|
// filter if nodeIDs have invalid pieces from auditing results
|
2018-12-14 20:17:30 +00:00
|
|
|
findInvalidNodesReq := &statdb.FindInvalidNodesRequest{
|
2018-12-06 18:51:23 +00:00
|
|
|
NodeIds: nodeIDs,
|
|
|
|
MaxStats: &pb.NodeStats{
|
|
|
|
AuditSuccessRatio: 0, // TODO: update when we have stats added to statdb
|
|
|
|
UptimeRatio: 0, // TODO: update when we have stats added to statdb
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
resp, err := c.statdb.FindInvalidNodes(ctx, findInvalidNodesReq)
|
|
|
|
if err != nil {
|
|
|
|
return nil, Error.New("error getting valid nodes from statdb %s", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
invalidNodesMap := make(map[storj.NodeID]bool)
|
|
|
|
for _, invalidID := range resp.InvalidIds {
|
|
|
|
invalidNodesMap[invalidID] = true
|
|
|
|
}
|
|
|
|
|
|
|
|
for i, nID := range nodeIDs {
|
|
|
|
if invalidNodesMap[nID] {
|
|
|
|
invalidNodes = append(invalidNodes, int32(i))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return invalidNodes, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// combineOfflineWithInvalid combines the offline nodes with the nodes marked
// invalid by statdb.
//
// The result is the union of both index lists: every index appears exactly
// once, offline indices first and then the invalid indices not already
// present, each group in its input order. Neither input is modified. A nil
// slice is returned when both inputs are empty.
func combineOfflineWithInvalid(offlineNodes []int32, invalidNodes []int32) (missingPieces []int32) {
	total := len(offlineNodes) + len(invalidNodes)
	if total == 0 {
		return nil
	}

	missingPieces = make([]int32, 0, total)
	seen := make(map[int32]struct{}, total)

	// Emitting each index only once keeps len(missingPieces) equal to the
	// number of distinct missing pieces, which callers use for counting.
	for _, list := range [][]int32{offlineNodes, invalidNodes} {
		for _, i := range list {
			if _, dup := seen[i]; dup {
				continue
			}
			seen[i] = struct{}{}
			missingPieces = append(missingPieces, i)
		}
	}

	return missingPieces
}
|