storj/pkg/datarepair/checker/checker.go

253 lines
7.4 KiB
Go
Raw Normal View History

2019-01-24 20:15:10 +00:00
// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package checker
import (
"context"
"time"
"github.com/gogo/protobuf/proto"
2019-01-23 19:58:44 +00:00
"github.com/zeebo/errs"
"go.uber.org/zap"
2019-01-23 19:58:44 +00:00
monkit "gopkg.in/spacemonkeygo/monkit.v2"
"storj.io/storj/internal/sync2"
"storj.io/storj/pkg/datarepair/irreparable"
"storj.io/storj/pkg/datarepair/queue"
"storj.io/storj/pkg/pb"
"storj.io/storj/pkg/pointerdb"
"storj.io/storj/pkg/statdb"
"storj.io/storj/pkg/storj"
"storj.io/storj/storage"
)
2019-01-23 19:58:44 +00:00
var (
// Error is the error class used for errors created by this package.
Error = errs.Class("checker error")
// mon is the package-level monkit scope used in this file for task
// tracing (mon.Task) and for the checker's counters (mon.IntVal).
mon = monkit.Package()
)
// Config contains configurable values for checker
type Config struct {
// Interval is a duration setting; per its struct tags it describes how
// frequently the checker should audit segments and defaults to 30s.
Interval time.Duration `help:"how frequently checker should audit segments" default:"30s"`
}
// Checker contains the information needed to do checks for missing pieces
type Checker struct {
// statdb is asked (via FindInvalidNodes) which of a segment's nodes are invalid.
statdb statdb.DB
// pointerdb is the pointer store that IdentifyInjuredSegments iterates over.
pointerdb *pointerdb.Service
// repairQueue receives the injured segments that still qualify for repair.
repairQueue queue.RepairQueue
// overlay is used for the bulk node lookup performed by OfflineNodes.
overlay pb.OverlayServer
// irrdb records segments that have fewer healthy pieces than MinReq.
irrdb irreparable.DB
// logger receives the checker's debug and error messages.
logger *zap.Logger
// Loop is the sync2.Cycle that Run executes and Close closes.
Loop sync2.Cycle
}
Satellite Peer (#1034) * add satellite peer * Add overlay * reorganize kademlia * add RunRefresh * add refresh to storagenode.Peer * add discovery * add agreements and metainfo * rename * add datarepair checker * add repair * add todo notes for audit * add testing interface * add into testplanet * fixes * fix compilation errors * fix compilation errors * make testplanet run * remove audit refrences * ensure that audit tests run * dev * checker tests compilable * fix discovery * fix compilation * fix * fix * dev * fix * disable auth * fixes * revert go.mod/sum * fix linter errors * fix * fix copyright * Add address param for SN dashboard (#1076) * Rename storj-sdk to storj-sim (#1078) * Storagenode logs and config improvements (#1075) * Add more info to SN logs * remove config-dir from user config * add output where config was stored * add message for successful connection * fix linter * remove storage.path from user config * resolve config path * move success message to info * log improvements * Remove captplanet (#1070) * pkg/server: include production cert (#1082) Change-Id: Ie8e6fe78550be83c3bd797db7a1e58d37c684792 * Generate Payments Report (#1079) * memory.Size: autoformat sizes based on value entropy (#1081) * Jj/bytes (#1085) * run tally and rollup * sets dev default tally and rollup intervals * nonessential storj-sim edits (#1086) * Closing context doesn't stop storage node (#1084) * Print when cancelled * Close properly * Don't log nil * Don't print error when closing dashboard * Fix panic in inspector if ping fails (#1088) * Consolidate identity management to identity cli commands (#1083) * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. * fixes * linters * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. 
* fixes * sava backups before saving signed certs * add "-prebuilt-test-cmds" test flag * linters * prepare cli tests for travis * linter fixes * more fixes * linter gods * sp/sdk/sim * remove ca.difficulty * remove unused difficulty * return setup to its rightful place * wip travis * Revert "wip travis" This reverts commit 56834849dcf066d3cc0a4f139033fc3f6d7188ca. * typo in travis.yaml * remove tests * remove more * make it only create one identity at a time for consistency * add config-dir for consitency * add identity creation to storj-sim * add flags * simplify * fix nolint and compile * prevent overwrite and pass difficulty, concurrency, and parent creds * goimports
2019-01-18 13:54:08 +00:00
// NewChecker creates a new instance of checker
func NewChecker(pointerdb *pointerdb.Service, sdb statdb.DB, repairQueue queue.RepairQueue, overlay pb.OverlayServer, irrdb irreparable.DB, limit int, logger *zap.Logger, interval time.Duration) *Checker {
Satellite Peer (#1034) * add satellite peer * Add overlay * reorganize kademlia * add RunRefresh * add refresh to storagenode.Peer * add discovery * add agreements and metainfo * rename * add datarepair checker * add repair * add todo notes for audit * add testing interface * add into testplanet * fixes * fix compilation errors * fix compilation errors * make testplanet run * remove audit refrences * ensure that audit tests run * dev * checker tests compilable * fix discovery * fix compilation * fix * fix * dev * fix * disable auth * fixes * revert go.mod/sum * fix linter errors * fix * fix copyright * Add address param for SN dashboard (#1076) * Rename storj-sdk to storj-sim (#1078) * Storagenode logs and config improvements (#1075) * Add more info to SN logs * remove config-dir from user config * add output where config was stored * add message for successful connection * fix linter * remove storage.path from user config * resolve config path * move success message to info * log improvements * Remove captplanet (#1070) * pkg/server: include production cert (#1082) Change-Id: Ie8e6fe78550be83c3bd797db7a1e58d37c684792 * Generate Payments Report (#1079) * memory.Size: autoformat sizes based on value entropy (#1081) * Jj/bytes (#1085) * run tally and rollup * sets dev default tally and rollup intervals * nonessential storj-sim edits (#1086) * Closing context doesn't stop storage node (#1084) * Print when cancelled * Close properly * Don't log nil * Don't print error when closing dashboard * Fix panic in inspector if ping fails (#1088) * Consolidate identity management to identity cli commands (#1083) * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. * fixes * linters * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. 
* fixes * sava backups before saving signed certs * add "-prebuilt-test-cmds" test flag * linters * prepare cli tests for travis * linter fixes * more fixes * linter gods * sp/sdk/sim * remove ca.difficulty * remove unused difficulty * return setup to its rightful place * wip travis * Revert "wip travis" This reverts commit 56834849dcf066d3cc0a4f139033fc3f6d7188ca. * typo in travis.yaml * remove tests * remove more * make it only create one identity at a time for consistency * add config-dir for consitency * add identity creation to storj-sim * add flags * simplify * fix nolint and compile * prevent overwrite and pass difficulty, concurrency, and parent creds * goimports
2019-01-18 13:54:08 +00:00
// TODO: reorder arguments
checker := &Checker{
statdb: sdb,
pointerdb: pointerdb,
repairQueue: repairQueue,
overlay: overlay,
irrdb: irrdb,
logger: logger,
Loop: *sync2.NewCycle(interval),
}
return checker
}
// Run hands a callback to checker.Loop that performs one
// IdentifyInjuredSegments pass. When a pass fails, the error is logged and
// then returned to the loop unchanged.
func (checker *Checker) Run(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	onePass := func(ctx context.Context) error {
		if passErr := checker.IdentifyInjuredSegments(ctx); passErr != nil {
			checker.logger.Error("error with injured segments identification: ", zap.Error(passErr))
			return passErr
		}
		return nil
	}
	return checker.Loop.Run(ctx, onePass)
}
// Close closes the Checker's Loop. It always returns nil; the error result
// is present in the signature but no failure is ever reported here.
func (checker *Checker) Close() error {
checker.Loop.Close()
return nil
}
Satellite Peer (#1034) * add satellite peer * Add overlay * reorganize kademlia * add RunRefresh * add refresh to storagenode.Peer * add discovery * add agreements and metainfo * rename * add datarepair checker * add repair * add todo notes for audit * add testing interface * add into testplanet * fixes * fix compilation errors * fix compilation errors * make testplanet run * remove audit refrences * ensure that audit tests run * dev * checker tests compilable * fix discovery * fix compilation * fix * fix * dev * fix * disable auth * fixes * revert go.mod/sum * fix linter errors * fix * fix copyright * Add address param for SN dashboard (#1076) * Rename storj-sdk to storj-sim (#1078) * Storagenode logs and config improvements (#1075) * Add more info to SN logs * remove config-dir from user config * add output where config was stored * add message for successful connection * fix linter * remove storage.path from user config * resolve config path * move success message to info * log improvements * Remove captplanet (#1070) * pkg/server: include production cert (#1082) Change-Id: Ie8e6fe78550be83c3bd797db7a1e58d37c684792 * Generate Payments Report (#1079) * memory.Size: autoformat sizes based on value entropy (#1081) * Jj/bytes (#1085) * run tally and rollup * sets dev default tally and rollup intervals * nonessential storj-sim edits (#1086) * Closing context doesn't stop storage node (#1084) * Print when cancelled * Close properly * Don't log nil * Don't print error when closing dashboard * Fix panic in inspector if ping fails (#1088) * Consolidate identity management to identity cli commands (#1083) * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. * fixes * linters * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. 
* fixes * sava backups before saving signed certs * add "-prebuilt-test-cmds" test flag * linters * prepare cli tests for travis * linter fixes * more fixes * linter gods * sp/sdk/sim * remove ca.difficulty * remove unused difficulty * return setup to its rightful place * wip travis * Revert "wip travis" This reverts commit 56834849dcf066d3cc0a4f139033fc3f6d7188ca. * typo in travis.yaml * remove tests * remove more * make it only create one identity at a time for consistency * add config-dir for consitency * add identity creation to storj-sim * add flags * simplify * fix nolint and compile * prevent overwrite and pass difficulty, concurrency, and parent creds * goimports
2019-01-18 13:54:08 +00:00
// IdentifyInjuredSegments iterates over every pointer in pointerdb and
// classifies each remote segment by the number of pieces that are still on
// usable nodes.
//
// A piece is treated as missing when its node is reported by OfflineNodes or
// by invalidNodes. With numHealthy = pieces - missing:
//   - MinReq <= numHealthy < RepairThreshold: the segment is enqueued on the
//     repair queue together with the indices of its missing pieces;
//   - numHealthy < MinReq: the segment is recorded in the irreparable database
//     and its object is counted as lost.
//
// Pointers without a remote segment, remote segments without pieces and
// remote segments without a redundancy scheme are skipped.
//
// The first error (unmarshalling, node lookups, enqueueing, irreparable
// bookkeeping) is returned from the iteration callback and then from this
// method. On success the counters collected during the pass are observed on
// the monkit values remote_segments_checked, remote_segments_needing_repair,
// remote_segments_lost and remote_files_lost.
func (checker *Checker) IdentifyInjuredSegments(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	var (
		remoteSegmentsChecked       int64
		remoteSegmentsNeedingRepair int64
		remoteSegmentsLost          int64
		// remoteSegmentInfo holds one de-duplicated entry per lost object.
		remoteSegmentInfo []string
	)

	err = checker.pointerdb.Iterate("", "", true, false,
		func(it storage.Iterator) error {
			var item storage.ListItem
			for it.Next(&item) {
				pointer := &pb.Pointer{}
				if err := proto.Unmarshal(item.Value, pointer); err != nil {
					return Error.New("error unmarshalling pointer %s", err)
				}

				remote := pointer.GetRemote()
				if remote == nil {
					continue
				}

				pieces := remote.GetRemotePieces()
				if pieces == nil {
					checker.logger.Debug("no pieces on remote segment")
					continue
				}

				// The thresholds below come from the redundancy scheme. It is a
				// pointer field, so guard against it being unset instead of
				// panicking on the dereference.
				redundancy := remote.Redundancy
				if redundancy == nil {
					checker.logger.Debug("no redundancy scheme on remote segment")
					continue
				}

				var nodeIDs storj.NodeIDList
				for _, p := range pieces {
					nodeIDs = append(nodeIDs, p.NodeId)
				}

				// Find all offline nodes
				offlineNodes, err := checker.OfflineNodes(ctx, nodeIDs)
				if err != nil {
					return Error.New("error getting offline nodes %s", err)
				}

				invalidNodes, err := checker.invalidNodes(ctx, nodeIDs)
				if err != nil {
					return Error.New("error getting invalid nodes %s", err)
				}

				missingPieces := combineOfflineWithInvalid(offlineNodes, invalidNodes)

				remoteSegmentsChecked++
				numHealthy := int32(len(nodeIDs) - len(missingPieces))

				switch {
				case numHealthy >= redundancy.MinReq && numHealthy < redundancy.RepairThreshold:
					// Still recoverable, but degraded enough to warrant repair.
					remoteSegmentsNeedingRepair++
					err = checker.repairQueue.Enqueue(ctx, &pb.InjuredSegment{
						Path:       string(item.Key),
						LostPieces: missingPieces,
					})
					if err != nil {
						return Error.New("error adding injured segment to queue %s", err)
					}

				case numHealthy < redundancy.MinReq:
					pathElements := storj.SplitPath(storj.Path(item.Key))
					// check to make sure there are at least *4* path elements. the first three
					// are project, segment, and bucket name, but we want to make sure we're talking
					// about an actual object, and that there's an object name specified
					// NOTE(review): only pathElements[3] is used to identify the object, so
					// keys that differ only in later elements are counted once — confirm
					// this is intended.
					if len(pathElements) >= 4 {
						project, bucketName, segmentpath := pathElements[0], pathElements[2], pathElements[3]
						lostSegInfo := storj.JoinPaths(project, bucketName, segmentpath)
						if !contains(remoteSegmentInfo, lostSegInfo) {
							remoteSegmentInfo = append(remoteSegmentInfo, lostSegInfo)
						}
					}

					remoteSegmentsLost++
					// make an entry in to the irreparable table
					segmentInfo := &pb.IrreparableSegment{
						Path:               item.Key,
						SegmentDetail:      pointer,
						LostPieces:         int32(len(missingPieces)),
						LastRepairAttempt:  time.Now().Unix(),
						RepairAttemptCount: int64(1),
					}

					// add the entry if new or update attempt count if already exists
					if err := checker.irrdb.IncrementRepairAttempts(ctx, segmentInfo); err != nil {
						return Error.New("error handling irreparable segment to queue %s", err)
					}
				}
			}
			return nil
		},
	)
	if err != nil {
		return err
	}

	mon.IntVal("remote_segments_checked").Observe(remoteSegmentsChecked)
	mon.IntVal("remote_segments_needing_repair").Observe(remoteSegmentsNeedingRepair)
	mon.IntVal("remote_segments_lost").Observe(remoteSegmentsLost)
	mon.IntVal("remote_files_lost").Observe(int64(len(remoteSegmentInfo)))

	return nil
}
Satellite Peer (#1034) * add satellite peer * Add overlay * reorganize kademlia * add RunRefresh * add refresh to storagenode.Peer * add discovery * add agreements and metainfo * rename * add datarepair checker * add repair * add todo notes for audit * add testing interface * add into testplanet * fixes * fix compilation errors * fix compilation errors * make testplanet run * remove audit refrences * ensure that audit tests run * dev * checker tests compilable * fix discovery * fix compilation * fix * fix * dev * fix * disable auth * fixes * revert go.mod/sum * fix linter errors * fix * fix copyright * Add address param for SN dashboard (#1076) * Rename storj-sdk to storj-sim (#1078) * Storagenode logs and config improvements (#1075) * Add more info to SN logs * remove config-dir from user config * add output where config was stored * add message for successful connection * fix linter * remove storage.path from user config * resolve config path * move success message to info * log improvements * Remove captplanet (#1070) * pkg/server: include production cert (#1082) Change-Id: Ie8e6fe78550be83c3bd797db7a1e58d37c684792 * Generate Payments Report (#1079) * memory.Size: autoformat sizes based on value entropy (#1081) * Jj/bytes (#1085) * run tally and rollup * sets dev default tally and rollup intervals * nonessential storj-sim edits (#1086) * Closing context doesn't stop storage node (#1084) * Print when cancelled * Close properly * Don't log nil * Don't print error when closing dashboard * Fix panic in inspector if ping fails (#1088) * Consolidate identity management to identity cli commands (#1083) * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. * fixes * linters * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. 
* fixes * sava backups before saving signed certs * add "-prebuilt-test-cmds" test flag * linters * prepare cli tests for travis * linter fixes * more fixes * linter gods * sp/sdk/sim * remove ca.difficulty * remove unused difficulty * return setup to its rightful place * wip travis * Revert "wip travis" This reverts commit 56834849dcf066d3cc0a4f139033fc3f6d7188ca. * typo in travis.yaml * remove tests * remove more * make it only create one identity at a time for consistency * add config-dir for consitency * add identity creation to storj-sim * add flags * simplify * fix nolint and compile * prevent overwrite and pass difficulty, concurrency, and parent creds * goimports
2019-01-18 13:54:08 +00:00
// OfflineNodes performs one bulk overlay lookup for nodeIDs and returns the
// indices, within the converted response list, of the entries for which no
// node came back (presumably parallel to nodeIDs — confirm against the pb
// helpers). If the lookup itself fails, an empty non-nil slice is returned
// together with the error.
func (checker *Checker) OfflineNodes(ctx context.Context, nodeIDs storj.NodeIDList) (offline []int32, err error) {
	requests := pb.NodeIDsToLookupRequests(nodeIDs)
	responses, err := checker.overlay.BulkLookup(ctx, requests)
	if err != nil {
		return []int32{}, err
	}

	for idx, node := range pb.LookupResponsesToNodes(responses) {
		if node != nil {
			continue
		}
		offline = append(offline, int32(idx))
	}
	return offline, nil
}
// invalidNodes asks statdb which of nodeIDs are invalid and returns the
// positions of those IDs within nodeIDs. The thresholds passed to statdb are
// currently all zero (see the TODOs below).
func (checker *Checker) invalidNodes(ctx context.Context, nodeIDs storj.NodeIDList) (invalidNodes []int32, err error) {
	// filter if nodeIDs have invalid pieces from auditing results
	thresholds := &statdb.NodeStats{
		AuditSuccessRatio: 0, // TODO: update when we have stats added to statdb
		UptimeRatio:       0, // TODO: update when we have stats added to statdb
	}

	flagged, err := checker.statdb.FindInvalidNodes(ctx, nodeIDs, thresholds)
	if err != nil {
		return nil, Error.New("error getting valid nodes from statdb %s", err)
	}

	// Build a set of the flagged IDs so each node ID is checked with one lookup.
	flaggedSet := make(map[storj.NodeID]struct{}, len(flagged))
	for _, id := range flagged {
		flaggedSet[id] = struct{}{}
	}

	for idx, id := range nodeIDs {
		if _, isFlagged := flaggedSet[id]; isFlagged {
			invalidNodes = append(invalidNodes, int32(idx))
		}
	}
	return invalidNodes, nil
}
// combineOfflineWithInvalid merges the two index lists: every offline index
// comes first, in its original order, followed by each invalid index that is
// not also listed as offline. The result is nil when there is nothing to
// report.
func combineOfflineWithInvalid(offlineNodes []int32, invalidNodes []int32) (missingPieces []int32) {
	alreadyOffline := make(map[int32]struct{}, len(offlineNodes))
	for _, idx := range offlineNodes {
		alreadyOffline[idx] = struct{}{}
		missingPieces = append(missingPieces, idx)
	}

	for _, idx := range invalidNodes {
		if _, dup := alreadyOffline[idx]; dup {
			continue
		}
		missingPieces = append(missingPieces, idx)
	}
	return missingPieces
}
// contains reports whether x is equal to at least one element of a.
func contains(a []string, x string) bool {
	for i := 0; i < len(a); i++ {
		if a[i] == x {
			return true
		}
	}
	return false
}