storj/pkg/datarepair/checker/checker.go
Egon Elbre 78dc02b758 Satellite Peer (#1034)

// Copyright (C) 2018 Storj Labs, Inc.
// See LICENSE for copying information.

package checker

import (
	"context"
	"time"

	"github.com/gogo/protobuf/proto"
	"go.uber.org/zap"

	"storj.io/storj/pkg/datarepair/irreparable"
	"storj.io/storj/pkg/datarepair/queue"
	"storj.io/storj/pkg/pb"
	"storj.io/storj/pkg/pointerdb"
	"storj.io/storj/pkg/statdb"
	"storj.io/storj/pkg/storj"
	"storj.io/storj/storage"
)
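
// Note: Error and mon are used throughout this file but are defined in a
// sibling file of this package. Minimal stand-ins (an assumption, using
// github.com/zeebo/errs and spacemonkeygo's monkit) would look like:
//
//	var (
//		Error = errs.Class("checker error")
//		mon   = monkit.Package()
//	)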

// Checker is the interface for the data repair checker
type Checker interface {
	Run(ctx context.Context) error
	IdentifyInjuredSegments(ctx context.Context) (err error)
	OfflineNodes(ctx context.Context, nodeIDs storj.NodeIDList) (offline []int32, err error)
	Close() error
}
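
// A minimal usage sketch (the pdb, sdb, repairQueue, overlay, and irrdb values
// are hypothetical here; in the satellite they are wired up by the peer):
//
//	checker := NewChecker(pdb, sdb, repairQueue, overlay, irrdb, 0, zap.L(), 30*time.Second)
//	go func() { _ = checker.Run(ctx) }() // loops until ctx is canceled
//	defer func() { _ = checker.Close() }()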

// checker contains the information needed to do checks for missing pieces
type checker struct {
	statdb      statdb.DB
	pointerdb   *pointerdb.Server
	repairQueue queue.RepairQueue
	overlay     pb.OverlayServer
	irrdb       irreparable.DB
	limit       int
	logger      *zap.Logger
	ticker      *time.Ticker
}

// NewChecker creates a new instance of checker
func NewChecker(pointerdb *pointerdb.Server, sdb statdb.DB, repairQueue queue.RepairQueue, overlay pb.OverlayServer, irrdb irreparable.DB, limit int, logger *zap.Logger, interval time.Duration) Checker {
	// TODO: reorder arguments
	return newChecker(pointerdb, sdb, repairQueue, overlay, irrdb, limit, logger, interval)
}

// newChecker creates a new instance of checker
func newChecker(pointerdb *pointerdb.Server, sdb statdb.DB, repairQueue queue.RepairQueue, overlay pb.OverlayServer, irrdb irreparable.DB, limit int, logger *zap.Logger, interval time.Duration) *checker {
	return &checker{
		statdb:      sdb,
		pointerdb:   pointerdb,
		repairQueue: repairQueue,
		overlay:     overlay,
		irrdb:       irrdb,
		limit:       limit,
		logger:      logger,
		ticker:      time.NewTicker(interval),
	}
}

// Run runs the checker loop until the context is canceled
func (c *checker) Run(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)
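
	// check once immediately, then again after every ticker interval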
	for {
		err = c.IdentifyInjuredSegments(ctx)
		if err != nil {
			c.logger.Error("Checker failed", zap.Error(err))
		}

		select {
		case <-c.ticker.C: // wait for the next interval to happen
		case <-ctx.Done(): // or the checker is canceled via context
			return ctx.Err()
		}
	}
}

// Close closes resources
func (c *checker) Close() error {
	// stop the ticker so it does not leak; Run exits via context cancellation
	c.ticker.Stop()
	return nil
}

// IdentifyInjuredSegments checks for missing pieces in remote segments from
// pointerdb, using the overlay cache and statdb, and queues injured segments for repair
func (c *checker) IdentifyInjuredSegments(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	err = c.pointerdb.Iterate(ctx, &pb.IterateRequest{Recurse: true},
		func(it storage.Iterator) error {
			var item storage.ListItem
			lim := c.limit
			if lim <= 0 || lim > storage.LookupLimit {
				lim = storage.LookupLimit
			}
			for ; lim > 0 && it.Next(&item); lim-- {
				pointer := &pb.Pointer{}
				err = proto.Unmarshal(item.Value, pointer)
				if err != nil {
					return Error.New("error unmarshalling pointer %s", err)
				}
				remote := pointer.GetRemote()
				if remote == nil {
					continue
				}
				pieces := remote.GetRemotePieces()
				if pieces == nil {
					c.logger.Debug("no pieces on remote segment")
					continue
				}
				var nodeIDs storj.NodeIDList
				for _, p := range pieces {
					nodeIDs = append(nodeIDs, p.NodeId)
				}

				// find all offline nodes
				offlineNodes, err := c.OfflineNodes(ctx, nodeIDs)
				if err != nil {
					return Error.New("error getting offline nodes %s", err)
				}
				invalidNodes, err := c.invalidNodes(ctx, nodeIDs)
				if err != nil {
					return Error.New("error getting invalid nodes %s", err)
				}
				missingPieces := combineOfflineWithInvalid(offlineNodes, invalidNodes)
				numHealthy := len(nodeIDs) - len(missingPieces)
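				// Example with hypothetical numbers: for an 8-piece segment
				// with MinReq=4 and RepairThreshold=6, 3 missing pieces leaves
				// numHealthy=5, so the segment is queued for repair below;
				// 5 missing pieces leaves numHealthy=3 < MinReq, so the
				// segment is recorded as irreparable instead.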
				if (int32(numHealthy) >= pointer.Remote.Redundancy.MinReq) && (int32(numHealthy) < pointer.Remote.Redundancy.RepairThreshold) {
					err = c.repairQueue.Enqueue(ctx, &pb.InjuredSegment{
						Path:       string(item.Key),
						LostPieces: missingPieces,
					})
					if err != nil {
						return Error.New("error adding injured segment to queue %s", err)
					}
				} else if int32(numHealthy) < pointer.Remote.Redundancy.MinReq {
					// make an entry into the irreparable table
					segmentInfo := &irreparable.RemoteSegmentInfo{
						EncryptedSegmentPath:   item.Key,
						EncryptedSegmentDetail: item.Value,
						LostPiecesCount:        int64(len(missingPieces)),
						RepairUnixSec:          time.Now().Unix(),
						RepairAttemptCount:     int64(1),
					}

					// add the entry if new, or update the attempt count if it already exists
					err := c.irrdb.IncrementRepairAttempts(ctx, segmentInfo)
					if err != nil {
						return Error.New("error handling irreparable segment %s", err)
					}
				}
			}
			return nil
		},
	)
	return err
}

// OfflineNodes returns the indices of offline nodes
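// (positions in nodeIDs whose overlay lookup returned no node)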
func (c *checker) OfflineNodes(ctx context.Context, nodeIDs storj.NodeIDList) (offline []int32, err error) {
	responses, err := c.overlay.BulkLookup(ctx, pb.NodeIDsToLookupRequests(nodeIDs))
	if err != nil {
		return []int32{}, err
	}
	nodes := pb.LookupResponsesToNodes(responses)
	for i, n := range nodes {
		if n == nil {
			offline = append(offline, int32(i))
		}
	}
	return offline, nil
}

// invalidNodes returns the indices of nodes with failing audit results, as recorded in statdb
func (c *checker) invalidNodes(ctx context.Context, nodeIDs storj.NodeIDList) (invalidNodes []int32, err error) {
	// filter if nodeIDs have invalid pieces from auditing results
	maxStats := &statdb.NodeStats{
		AuditSuccessRatio: 0, // TODO: update when we have stats added to statdb
		UptimeRatio:       0, // TODO: update when we have stats added to statdb
	}

	invalidIDs, err := c.statdb.FindInvalidNodes(ctx, nodeIDs, maxStats)
	if err != nil {
		return nil, Error.New("error getting invalid nodes from statdb %s", err)
	}

	invalidNodesMap := make(map[storj.NodeID]bool)
	for _, invalidID := range invalidIDs {
		invalidNodesMap[invalidID] = true
	}

	for i, nID := range nodeIDs {
		if invalidNodesMap[nID] {
			invalidNodes = append(invalidNodes, int32(i))
		}
	}

	return invalidNodes, nil
}

// combineOfflineWithInvalid merges offline node indices with those marked invalid by statdb, without duplicates.
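// For example (hypothetical values), offline [1 3] and invalid [3 4] yield
// missing pieces [1 3 4], with offline indices listed first.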
func combineOfflineWithInvalid(offlineNodes []int32, invalidNodes []int32) (missingPieces []int32) {
	missingPieces = append(missingPieces, offlineNodes...)

	offlineMap := make(map[int32]bool)
	for _, i := range offlineNodes {
		offlineMap[i] = true
	}
	for _, i := range invalidNodes {
		if !offlineMap[i] {
			missingPieces = append(missingPieces, i)
		}
	}

	return missingPieces
}