storj/satellite/overlay/service.go

866 lines
35 KiB
Go
Raw Normal View History

2019-01-24 20:15:10 +00:00
// Copyright (C) 2019 Storj Labs, Inc.
2018-04-18 17:55:28 +01:00
// See LICENSE for copying information.
package overlay
2018-04-18 16:34:15 +01:00
import (
"context"
satellite/overlay: configurable meaning of last_net Up to now, we have been implementing the DistinctIP preference with code in two places: 1. On check-in, the last_net is determined by taking the /24 or /64 (in ResolveIPAndNetwork()) and we store it with the node record. 2. On node selection, a preference parameter defines whether to return results that are distinct on last_net. It can be observed that we have never yet had the need to switch from DistinctIP to !DistinctIP, or from !DistinctIP to DistinctIP, on the same satellite, and we will probably never need to do so in an automated way. It can also be observed that this arrangement makes tests more complicated, because we often have to arrange for test nodes to have IP addresses in different /24 networks (a particular pain on macOS). Those two considerations, plus some pending work on the repair framework that will make repair take last_net into consideration, motivate this change. With this change, in the #2 place, we will _always_ return results that are distinct on last_net. We implement the DistinctIP preference, then, by making the #1 place (ResolveIPAndNetwork()) more flexible. When DistinctIP is enabled, last_net will be calculated as it was before. But when DistinctIP is _off_, last_net can be the same as address (IP and port). That will effectively implement !DistinctIP because every record will have a distinct last_net already. As a side effect, this flexibility will allow us to change the rules about last_net construction arbitrarily. We can do tests where last_net is set to the source IP, or to a /30 prefix, or a /16 prefix, etc., and be able to exercise the production logic without requiring a virtual network bridge. This change should be safe to make without any migration code, because all known production satellite deployments use DistinctIP, and the associated last_net values will not change for them. They will only change for satellites with !DistinctIP, which are mostly test deployments that can be recreated trivially. For those satellites which are both permanent and !DistinctIP, node selection will suddenly start acting as though DistinctIP is enabled, until the operator runs a single SQL update "UPDATE nodes SET last_net = last_ip_port". That can be done either before or after deploying software with this change. I also assert that this will not hurt performance for production deployments. It's true that adding the distinct requirement to node selection makes things a little slower, but the distinct requirement is already present for all production deployments, and they will see no change. Refs: https://github.com/storj/storj/issues/5391 Change-Id: I0e7e92498c3da768df5b4d5fb213dcd2d4862924
2023-02-28 22:57:39 +00:00
"fmt"
"net"
"time"
2018-04-18 16:34:15 +01:00
"github.com/zeebo/errs"
"go.uber.org/zap"
"golang.org/x/exp/maps"
2018-11-16 16:31:14 +00:00
"storj.io/common/pb"
"storj.io/common/storj"
"storj.io/common/storj/location"
"storj.io/common/sync2"
"storj.io/private/version"
"storj.io/storj/satellite/geoip"
"storj.io/storj/satellite/metabase"
"storj.io/storj/satellite/nodeevents"
"storj.io/storj/satellite/nodeselection"
2018-04-18 16:34:15 +01:00
)
// ErrEmptyNode is returned when the nodeID is empty.
2018-12-17 18:47:26 +00:00
var ErrEmptyNode = errs.New("empty node ID")
// ErrNodeNotFound is returned if a node does not exist in database.
var ErrNodeNotFound = errs.Class("node not found")
// ErrNodeOffline is returned if a nodes is offline.
var ErrNodeOffline = errs.Class("node is offline")
// ErrNodeDisqualified is returned if a nodes is disqualified.
var ErrNodeDisqualified = errs.Class("node is disqualified")
// ErrNodeFinishedGE is returned if a node has finished graceful exit.
var ErrNodeFinishedGE = errs.Class("node finished graceful exit")
// ErrNotEnoughNodes is when selecting nodes failed with the given parameters.
var ErrNotEnoughNodes = errs.Class("not enough nodes")
// ErrLowDifficulty is when the node id's difficulty is too low.
var ErrLowDifficulty = errs.Class("node id difficulty too low")
// DB implements the database for overlay.Service.
2019-09-10 14:24:16 +01:00
//
// architecture: Database
type DB interface {
// GetOnlineNodesForAuditRepair returns a map of nodes for the supplied nodeIDs.
// The return value contains necessary information to create orders as well as nodes'
// current reputation status.
GetOnlineNodesForAuditRepair(ctx context.Context, nodeIDs []storj.NodeID, onlineWindow time.Duration) (map[storj.NodeID]*NodeReputation, error)
// SelectStorageNodes looks up nodes based on criteria
SelectStorageNodes(ctx context.Context, totalNeededNodes, newNodeCount int, criteria *NodeCriteria) ([]*nodeselection.SelectedNode, error)
// SelectAllStorageNodesUpload returns all nodes that qualify to store data, organized as reputable nodes and new nodes
SelectAllStorageNodesUpload(ctx context.Context, selectionCfg NodeSelectionConfig) (reputable, new []*nodeselection.SelectedNode, err error)
// SelectAllStorageNodesDownload returns a nodes that are ready for downloading
SelectAllStorageNodesDownload(ctx context.Context, onlineWindow time.Duration, asOf AsOfSystemTimeConfig) ([]*nodeselection.SelectedNode, error)
// Get looks up the node by nodeID
Get(ctx context.Context, nodeID storj.NodeID) (*NodeDossier, error)
// KnownReliable filters a set of nodes to reliable (online and qualified) nodes.
KnownReliable(ctx context.Context, nodeIDs storj.NodeIDList, onlineWindow, asOfSystemInterval time.Duration) (online []nodeselection.SelectedNode, offline []nodeselection.SelectedNode, err error)
// Reliable returns all nodes that are reliable (separated by whether they are currently online or offline).
Reliable(ctx context.Context, onlineWindow, asOfSystemInterval time.Duration) (online []nodeselection.SelectedNode, offline []nodeselection.SelectedNode, err error)
// UpdateReputation updates the DB columns for all reputation fields in ReputationStatus.
UpdateReputation(ctx context.Context, id storj.NodeID, request ReputationUpdate) error
Add Version Information into KAD Network and SatelliteDB & Change Selection Process (#1648) * Initial Webserver Draft for Version Controlling * Rename type to avoid confusion * Move Function Calls into Version Package * Fix Linting and Language Typos * Fix Linting and Spelling Mistakes * Include Copyright * Include Copyright * Adjust Version-Control Server to return list of Versions * Linting * Improve Request Handling and Readability * Add Configuration File Option Add Systemd Service file * Add Logging to File * Smaller Changes * Add Semantic Versioning and refuses outdated Software from Startup (#1612) * implements internal Semantic Version library * adds version logging + reporting to process * Advance SemVer struct for easier handling * Add Accepted Version Store * Fix Function * Restructure * Type Conversion * Handle Version String properly * Add Note about array index * Set temporary Default Version * Add Copyright * Adding Version to Dashboard * Adding Version Info Log * Renaming and adding CheckerProcess * Iteration Sync * Iteration V2 * linting * made LogAndReportVersion a go routine * Refactor to Go Routine * Add Context to Go Routine and allow Operation if Lookup to Control Server fails * Handle Unmarshal properly * Linting * Relocate Version Checks * Relocating Version Check and specified default Version for now * Linting Error Prevention * Refuse Startup on outdated Version * Add Startup Check Function * Straighten Logging * Dont force Shutdown if --dev flag is set * Create full Service/Peer Structure for ControlServer * Linting * Straighting Naming * Finish VersionControl Service Layout * Improve Error Handling * Change Listening Address * Move Checker Function * Remove VersionControl Peer * Linting * Linting * Create VersionClient Service * Renaming * Add Version Client to Peer Definitions * Linting and Renaming * Linting * Remove Transport Checks for now * Move to Client Side Flag * Remove check * Linting * Transport Client Version Intro * Adding Version Client to Transport Client * Add missing parameter * Adding Version Check, to set Allowed = true * Set Default to true, testing * Restructuring Code * Uplink Changes * Add more proper Defaults * Renaming of Version struct * Dont pass Service use Pointer * Set Defaults for Versioning Checks * Put HTTP Server in go routine * Add Versioncontrol to Storj-Sim * Testplanet Fixes * Linting * Add Error Handling and new Server Struct * Move Lock slightly * Reduce Race Potentials * Remove unnecessary files * Linting * Add Proper Transport Handling * small fixes * add fence for allowed check * Add Startup Version Check and Service Naming * make errormessage private * Add Comments about VersionedClient * Linting * Remove Checks that refuse outgoing connections * Remove release cmd * Add Release Script * Linting * Update to use correct Values * Change Timestamp handling * Adding Protobuf changes back in * Adding SatelliteDB Changes and adding Storj Node Version to PB * Add Migration Table * Add Default Stats for Creation * Move to BigInt * Proper SQL Migration * Ensure minimum Version is passed to the node selection * Linting... * Remove VersionedClient and adjust smaller changes from prior merge * Linting * Fix PB Message Handling and Query for Node Selection * some future-proofing type changes Change-Id: I3cb5018dcccdbc9739fe004d859065992720caaf * fix a compiler error Change-Id: If66bb92d8b98e31cd618ecec9c6448ab9b037fa5 * Comment on Constant for Overlay * Remove NOT NULL and add epoch call as function * add versions to bootstrap and satellites Change-Id: I436944589ea5f21600cdd997742a84fe0b16e47b * Change Update Migration * Fix DB Migration * Increase Timeout temporarily, to see whats going on * Remove unnecessary const and vars Cleanup Function calls from deprecated NodeVersion struct * Updated Protopuf, removed depcreated Code from Inspector * Implement NodeVersion into InfoResponse * Regenerated locked.go * Linting * Fix Tests * Remove unnecessary constant * Update Function and Flag Description * Remove Empty Stat Creation * return properly with error * Remove unnecessary struct * simplify migration step * Update Inspector to return Version Info * Update local Endpoint Version Handling * Reset Travis Timeout * Add Default for CommitHash * single quotes
2019-04-10 07:04:24 +01:00
// UpdateNodeInfo updates node dossier with info requested from the node itself like node type, email, wallet, capacity, and version.
UpdateNodeInfo(ctx context.Context, node storj.NodeID, nodeInfo *InfoResponse) (stats *NodeDossier, err error)
// UpdateCheckIn updates a single storagenode's check-in stats.
UpdateCheckIn(ctx context.Context, node NodeCheckInInfo, timestamp time.Time, config NodeSelectionConfig) (err error)
// SetNodeContained updates the contained field for the node record.
SetNodeContained(ctx context.Context, node storj.NodeID, contained bool) (err error)
// SetAllContainedNodes updates the contained field for all nodes, as necessary.
SetAllContainedNodes(ctx context.Context, containedNodes []storj.NodeID) (err error)
// AllPieceCounts returns a map of node IDs to piece counts from the db.
AllPieceCounts(ctx context.Context) (pieceCounts map[storj.NodeID]int64, err error)
// UpdatePieceCounts sets the piece count field for the given node IDs.
UpdatePieceCounts(ctx context.Context, pieceCounts map[storj.NodeID]int64) (err error)
// UpdateExitStatus is used to update a node's graceful exit status.
UpdateExitStatus(ctx context.Context, request *ExitStatusRequest) (_ *NodeDossier, err error)
// GetExitingNodes returns nodes who have initiated a graceful exit, but have not completed it.
GetExitingNodes(ctx context.Context) (exitingNodes []*ExitStatus, err error)
// GetGracefulExitCompletedByTimeFrame returns nodes who have completed graceful exit within a time window (time window is around graceful exit completion).
GetGracefulExitCompletedByTimeFrame(ctx context.Context, begin, end time.Time) (exitedNodes storj.NodeIDList, err error)
// GetGracefulExitIncompleteByTimeFrame returns nodes who have initiated, but not completed graceful exit within a time window (time window is around graceful exit initiation).
GetGracefulExitIncompleteByTimeFrame(ctx context.Context, begin, end time.Time) (exitingNodes storj.NodeIDList, err error)
// GetExitStatus returns a node's graceful exit status.
GetExitStatus(ctx context.Context, nodeID storj.NodeID) (exitStatus *ExitStatus, err error)
// GetNodesNetwork returns the last_net subnet for each storage node, order is not guaranteed.
GetNodesNetwork(ctx context.Context, nodeIDs []storj.NodeID) (nodeNets []string, err error)
// GetNodesNetworkInOrder returns the last_net subnet for each storage node in order of the requested nodeIDs.
GetNodesNetworkInOrder(ctx context.Context, nodeIDs []storj.NodeID) (nodeNets []string, err error)
// DisqualifyNode disqualifies a storage node.
DisqualifyNode(ctx context.Context, nodeID storj.NodeID, disqualifiedAt time.Time, reason DisqualificationReason) (email string, err error)
// GetOfflineNodesForEmail gets offline nodes in need of an email.
GetOfflineNodesForEmail(ctx context.Context, offlineWindow time.Duration, cutoff time.Duration, cooldown time.Duration, limit int) (nodes map[storj.NodeID]string, err error)
// UpdateLastOfflineEmail updates last_offline_email for a list of nodes.
UpdateLastOfflineEmail(ctx context.Context, nodeIDs storj.NodeIDList, timestamp time.Time) (err error)
// DQNodesLastSeenBefore disqualifies a limited number of nodes where last_contact_success < cutoff except those already disqualified
// or gracefully exited or where last_contact_success = '0001-01-01 00:00:00+00'.
DQNodesLastSeenBefore(ctx context.Context, cutoff time.Time, limit int) (nodeEmails map[storj.NodeID]string, count int, err error)
// TestSuspendNodeUnknownAudit suspends a storage node for unknown audits.
TestSuspendNodeUnknownAudit(ctx context.Context, nodeID storj.NodeID, suspendedAt time.Time) (err error)
// TestUnsuspendNodeUnknownAudit unsuspends a storage node for unknown audits.
TestUnsuspendNodeUnknownAudit(ctx context.Context, nodeID storj.NodeID) (err error)
// TestVetNode directly sets a node's vetted_at timestamp to make testing easier
TestVetNode(ctx context.Context, nodeID storj.NodeID) (vettedTime *time.Time, err error)
// TestUnvetNode directly sets a node's vetted_at timestamp to null to make testing easier
TestUnvetNode(ctx context.Context, nodeID storj.NodeID) (err error)
// TestSuspendNodeOffline directly sets a node's offline_suspended timestamp to make testing easier
TestSuspendNodeOffline(ctx context.Context, nodeID storj.NodeID, suspendedAt time.Time) (err error)
// TestNodeCountryCode sets node country code.
TestNodeCountryCode(ctx context.Context, nodeID storj.NodeID, countryCode string) (err error)
// TestUpdateCheckInDirectUpdate tries to update a node info directly. Returns true if it succeeded, false if there were no node with the provided (used for testing).
TestUpdateCheckInDirectUpdate(ctx context.Context, node NodeCheckInInfo, timestamp time.Time, semVer version.SemVer, walletFeatures string) (updated bool, err error)
// OneTimeFixLastNets updates the last_net values for all node records to be equal to their
// last_ip_port values.
OneTimeFixLastNets(ctx context.Context) error
// IterateAllContactedNodes will call cb on all known nodes (used in restore trash contexts).
IterateAllContactedNodes(context.Context, func(context.Context, *nodeselection.SelectedNode) error) error
// IterateAllNodeDossiers will call cb on all known nodes (used for invoice generation).
IterateAllNodeDossiers(context.Context, func(context.Context, *NodeDossier) error) error
// UpdateNodeTags insert (or refresh) node tags.
UpdateNodeTags(ctx context.Context, tags nodeselection.NodeTags) error
// GetNodeTags returns all nodes for a specific node.
GetNodeTags(ctx context.Context, id storj.NodeID) (nodeselection.NodeTags, error)
}
// DisqualificationReason is disqualification reason enum type.
type DisqualificationReason int
const (
// DisqualificationReasonUnknown denotes undetermined disqualification reason.
DisqualificationReasonUnknown DisqualificationReason = 0
// DisqualificationReasonAuditFailure denotes disqualification due to audit score falling below threshold.
DisqualificationReasonAuditFailure DisqualificationReason = 1
// DisqualificationReasonSuspension denotes disqualification due to unknown audit failure after grace period for unknown audits
// has elapsed.
DisqualificationReasonSuspension DisqualificationReason = 2
// DisqualificationReasonNodeOffline denotes disqualification due to node's online score falling below threshold after tracking
// period has elapsed.
DisqualificationReasonNodeOffline DisqualificationReason = 3
)
// NodeCheckInInfo contains all the info that will be updated when a node checkins.
type NodeCheckInInfo struct {
NodeID storj.NodeID
Address *pb.NodeAddress
LastNet string
LastIPPort string
IsUp bool
Operator *pb.NodeOperator
Capacity *pb.NodeCapacity
Version *pb.NodeVersion
CountryCode location.CountryCode
SoftwareUpdateEmailSent bool
VersionBelowMin bool
}
// InfoResponse contains node dossier info requested from the storage node.
type InfoResponse struct {
Type pb.NodeType
Operator *pb.NodeOperator
Capacity *pb.NodeCapacity
Version *pb.NodeVersion
}
2019-03-23 08:06:11 +00:00
// FindStorageNodesRequest defines easy request parameters.
type FindStorageNodesRequest struct {
RequestedCount int
ExcludedIDs []storj.NodeID
MinimumVersion string // semver or empty
AsOfSystemInterval time.Duration // only used for CRDB queries
Placement storj.PlacementConstraint
2019-03-23 08:06:11 +00:00
}
// NodeCriteria are the requirements for selecting nodes.
2019-03-23 08:06:11 +00:00
type NodeCriteria struct {
FreeDisk int64
ExcludedIDs []storj.NodeID
ExcludedNetworks []string // the /24 subnet IPv4 or /64 subnet IPv6 for nodes
MinimumVersion string // semver or empty
OnlineWindow time.Duration
AsOfSystemInterval time.Duration // only used for CRDB queries
2019-03-23 08:06:11 +00:00
}
// ReputationStatus indicates current reputation status for a node.
type ReputationStatus struct {
Email string
Disqualified *time.Time
DisqualificationReason *DisqualificationReason
UnknownAuditSuspended *time.Time
OfflineSuspended *time.Time
VettedAt *time.Time
}
// ReputationUpdate contains reputation update data for a node.
type ReputationUpdate struct {
Disqualified *time.Time
DisqualificationReason DisqualificationReason
UnknownAuditSuspended *time.Time
OfflineSuspended *time.Time
VettedAt *time.Time
}
// ExitStatus is used for reading graceful exit status.
type ExitStatus struct {
NodeID storj.NodeID
ExitInitiatedAt *time.Time
ExitLoopCompletedAt *time.Time
ExitFinishedAt *time.Time
ExitSuccess bool
}
// ExitStatusRequest is used to update a node's graceful exit status.
type ExitStatusRequest struct {
NodeID storj.NodeID
ExitInitiatedAt time.Time
ExitLoopCompletedAt time.Time
ExitFinishedAt time.Time
ExitSuccess bool
}
// NodeDossier is the complete info that the satellite tracks for a storage node.
type NodeDossier struct {
pb.Node
Type pb.NodeType
Operator pb.NodeOperator
Capacity pb.NodeCapacity
Reputation NodeStats
Version pb.NodeVersion
Contained bool
Disqualified *time.Time
DisqualificationReason *DisqualificationReason
UnknownAuditSuspended *time.Time
OfflineSuspended *time.Time
OfflineUnderReview *time.Time
PieceCount int64
ExitStatus ExitStatus
CreatedAt time.Time
LastNet string
LastIPPort string
LastOfflineEmail *time.Time
LastSoftwareUpdateEmail *time.Time
CountryCode location.CountryCode
}
// NodeStats contains statistics about a node.
type NodeStats struct {
Latency90 int64
LastContactSuccess time.Time
LastContactFailure time.Time
OfflineUnderReview *time.Time
Status ReputationStatus
}
// NodeLastContact contains the ID, address, and timestamp.
type NodeLastContact struct {
URL storj.NodeURL
LastIPPort string
LastContactSuccess time.Time
LastContactFailure time.Time
}
// NodeReputation is used as a result for creating orders limits for audits.
type NodeReputation struct {
ID storj.NodeID
Address *pb.NodeAddress
LastNet string
LastIPPort string
Reputation ReputationStatus
}
// Service is used to store and handle node information.
2019-09-10 14:24:16 +01:00
//
// architecture: Service
type Service struct {
log *zap.Logger
db DB
nodeEvents nodeevents.DB
satelliteName string
satelliteAddress string
config Config
GeoIP geoip.IPToCountry
UploadSelectionCache *UploadSelectionCache
DownloadSelectionCache *DownloadSelectionCache
satellite/overlay: configurable meaning of last_net Up to now, we have been implementing the DistinctIP preference with code in two places: 1. On check-in, the last_net is determined by taking the /24 or /64 (in ResolveIPAndNetwork()) and we store it with the node record. 2. On node selection, a preference parameter defines whether to return results that are distinct on last_net. It can be observed that we have never yet had the need to switch from DistinctIP to !DistinctIP, or from !DistinctIP to DistinctIP, on the same satellite, and we will probably never need to do so in an automated way. It can also be observed that this arrangement makes tests more complicated, because we often have to arrange for test nodes to have IP addresses in different /24 networks (a particular pain on macOS). Those two considerations, plus some pending work on the repair framework that will make repair take last_net into consideration, motivate this change. With this change, in the #2 place, we will _always_ return results that are distinct on last_net. We implement the DistinctIP preference, then, by making the #1 place (ResolveIPAndNetwork()) more flexible. When DistinctIP is enabled, last_net will be calculated as it was before. But when DistinctIP is _off_, last_net can be the same as address (IP and port). That will effectively implement !DistinctIP because every record will have a distinct last_net already. As a side effect, this flexibility will allow us to change the rules about last_net construction arbitrarily. We can do tests where last_net is set to the source IP, or to a /30 prefix, or a /16 prefix, etc., and be able to exercise the production logic without requiring a virtual network bridge. This change should be safe to make without any migration code, because all known production satellite deployments use DistinctIP, and the associated last_net values will not change for them. They will only change for satellites with !DistinctIP, which are mostly test deployments that can be recreated trivially. For those satellites which are both permanent and !DistinctIP, node selection will suddenly start acting as though DistinctIP is enabled, until the operator runs a single SQL update "UPDATE nodes SET last_net = last_ip_port". That can be done either before or after deploying software with this change. I also assert that this will not hurt performance for production deployments. It's true that adding the distinct requirement to node selection makes things a little slower, but the distinct requirement is already present for all production deployments, and they will see no change. Refs: https://github.com/storj/storj/issues/5391 Change-Id: I0e7e92498c3da768df5b4d5fb213dcd2d4862924
2023-02-28 22:57:39 +00:00
LastNetFunc LastNetFunc
placementRules PlacementRules
2018-04-18 16:34:15 +01:00
}
satellite/overlay: configurable meaning of last_net Up to now, we have been implementing the DistinctIP preference with code in two places: 1. On check-in, the last_net is determined by taking the /24 or /64 (in ResolveIPAndNetwork()) and we store it with the node record. 2. On node selection, a preference parameter defines whether to return results that are distinct on last_net. It can be observed that we have never yet had the need to switch from DistinctIP to !DistinctIP, or from !DistinctIP to DistinctIP, on the same satellite, and we will probably never need to do so in an automated way. It can also be observed that this arrangement makes tests more complicated, because we often have to arrange for test nodes to have IP addresses in different /24 networks (a particular pain on macOS). Those two considerations, plus some pending work on the repair framework that will make repair take last_net into consideration, motivate this change. With this change, in the #2 place, we will _always_ return results that are distinct on last_net. We implement the DistinctIP preference, then, by making the #1 place (ResolveIPAndNetwork()) more flexible. When DistinctIP is enabled, last_net will be calculated as it was before. But when DistinctIP is _off_, last_net can be the same as address (IP and port). That will effectively implement !DistinctIP because every record will have a distinct last_net already. As a side effect, this flexibility will allow us to change the rules about last_net construction arbitrarily. We can do tests where last_net is set to the source IP, or to a /30 prefix, or a /16 prefix, etc., and be able to exercise the production logic without requiring a virtual network bridge. This change should be safe to make without any migration code, because all known production satellite deployments use DistinctIP, and the associated last_net values will not change for them. They will only change for satellites with !DistinctIP, which are mostly test deployments that can be recreated trivially. For those satellites which are both permanent and !DistinctIP, node selection will suddenly start acting as though DistinctIP is enabled, until the operator runs a single SQL update "UPDATE nodes SET last_net = last_ip_port". That can be done either before or after deploying software with this change. I also assert that this will not hurt performance for production deployments. It's true that adding the distinct requirement to node selection makes things a little slower, but the distinct requirement is already present for all production deployments, and they will see no change. Refs: https://github.com/storj/storj/issues/5391 Change-Id: I0e7e92498c3da768df5b4d5fb213dcd2d4862924
2023-02-28 22:57:39 +00:00
// LastNetFunc is the type of a function that will be used to derive a network from an ip and port.
type LastNetFunc func(config NodeSelectionConfig, ip net.IP, port string) (string, error)
// NewService returns a new Service.
func NewService(log *zap.Logger, db DB, nodeEvents nodeevents.DB, placementRules PlacementRules, satelliteAddr, satelliteName string, config Config) (*Service, error) {
err := config.Node.AsOfSystemTime.isValid()
if err != nil {
return nil, errs.Wrap(err)
}
var geoIP geoip.IPToCountry = geoip.NewMockIPToCountry(config.GeoIP.MockCountries)
if config.GeoIP.DB != "" {
geoIP, err = geoip.OpenMaxmindDB(config.GeoIP.DB)
if err != nil {
return nil, Error.Wrap(err)
}
}
defaultSelection := nodeselection.NodeFilters{}
if len(config.Node.UploadExcludedCountryCodes) > 0 {
set := location.NewFullSet()
for _, country := range config.Node.UploadExcludedCountryCodes {
countryCode := location.ToCountryCode(country)
if countryCode == location.None {
return nil, Error.New("invalid country %q", country)
}
set.Remove(countryCode)
}
defaultSelection = defaultSelection.WithCountryFilter(set)
}
uploadSelectionCache, err := NewUploadSelectionCache(log, db,
config.NodeSelectionCache.Staleness, config.Node,
defaultSelection, placementRules,
)
if err != nil {
return nil, errs.Wrap(err)
}
downloadSelectionCache, err := NewDownloadSelectionCache(log, db,
placementRules,
DownloadSelectionCacheConfig{
Staleness: config.NodeSelectionCache.Staleness,
OnlineWindow: config.Node.OnlineWindow,
AsOfSystemTime: config.Node.AsOfSystemTime,
})
if err != nil {
return nil, errs.Wrap(err)
}
return &Service{
log: log,
db: db,
nodeEvents: nodeEvents,
satelliteAddress: satelliteAddr,
satelliteName: satelliteName,
config: config,
GeoIP: geoIP,
UploadSelectionCache: uploadSelectionCache,
DownloadSelectionCache: downloadSelectionCache,
satellite/overlay: configurable meaning of last_net Up to now, we have been implementing the DistinctIP preference with code in two places: 1. On check-in, the last_net is determined by taking the /24 or /64 (in ResolveIPAndNetwork()) and we store it with the node record. 2. On node selection, a preference parameter defines whether to return results that are distinct on last_net. It can be observed that we have never yet had the need to switch from DistinctIP to !DistinctIP, or from !DistinctIP to DistinctIP, on the same satellite, and we will probably never need to do so in an automated way. It can also be observed that this arrangement makes tests more complicated, because we often have to arrange for test nodes to have IP addresses in different /24 networks (a particular pain on macOS). Those two considerations, plus some pending work on the repair framework that will make repair take last_net into consideration, motivate this change. With this change, in the #2 place, we will _always_ return results that are distinct on last_net. We implement the DistinctIP preference, then, by making the #1 place (ResolveIPAndNetwork()) more flexible. When DistinctIP is enabled, last_net will be calculated as it was before. But when DistinctIP is _off_, last_net can be the same as address (IP and port). That will effectively implement !DistinctIP because every record will have a distinct last_net already. As a side effect, this flexibility will allow us to change the rules about last_net construction arbitrarily. We can do tests where last_net is set to the source IP, or to a /30 prefix, or a /16 prefix, etc., and be able to exercise the production logic without requiring a virtual network bridge. This change should be safe to make without any migration code, because all known production satellite deployments use DistinctIP, and the associated last_net values will not change for them. They will only change for satellites with !DistinctIP, which are mostly test deployments that can be recreated trivially. For those satellites which are both permanent and !DistinctIP, node selection will suddenly start acting as though DistinctIP is enabled, until the operator runs a single SQL update "UPDATE nodes SET last_net = last_ip_port". That can be done either before or after deploying software with this change. I also assert that this will not hurt performance for production deployments. It's true that adding the distinct requirement to node selection makes things a little slower, but the distinct requirement is already present for all production deployments, and they will see no change. Refs: https://github.com/storj/storj/issues/5391 Change-Id: I0e7e92498c3da768df5b4d5fb213dcd2d4862924
2023-02-28 22:57:39 +00:00
LastNetFunc: MaskOffLastNet,
placementRules: placementRules,
}, nil
}
// Run runs the background processes needed for caches.
func (service *Service) Run(ctx context.Context) error {
return errs.Combine(sync2.Concurrently(
func() error { return service.UploadSelectionCache.Run(ctx) },
func() error { return service.DownloadSelectionCache.Run(ctx) },
)...)
}
// Close closes resources.
func (service *Service) Close() error {
return service.GeoIP.Close()
}
Satellite Peer (#1034) * add satellite peer * Add overlay * reorganize kademlia * add RunRefresh * add refresh to storagenode.Peer * add discovery * add agreements and metainfo * rename * add datarepair checker * add repair * add todo notes for audit * add testing interface * add into testplanet * fixes * fix compilation errors * fix compilation errors * make testplanet run * remove audit refrences * ensure that audit tests run * dev * checker tests compilable * fix discovery * fix compilation * fix * fix * dev * fix * disable auth * fixes * revert go.mod/sum * fix linter errors * fix * fix copyright * Add address param for SN dashboard (#1076) * Rename storj-sdk to storj-sim (#1078) * Storagenode logs and config improvements (#1075) * Add more info to SN logs * remove config-dir from user config * add output where config was stored * add message for successful connection * fix linter * remove storage.path from user config * resolve config path * move success message to info * log improvements * Remove captplanet (#1070) * pkg/server: include production cert (#1082) Change-Id: Ie8e6fe78550be83c3bd797db7a1e58d37c684792 * Generate Payments Report (#1079) * memory.Size: autoformat sizes based on value entropy (#1081) * Jj/bytes (#1085) * run tally and rollup * sets dev default tally and rollup intervals * nonessential storj-sim edits (#1086) * Closing context doesn't stop storage node (#1084) * Print when cancelled * Close properly * Don't log nil * Don't print error when closing dashboard * Fix panic in inspector if ping fails (#1088) * Consolidate identity management to identity cli commands (#1083) * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. * fixes * linters * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. * fixes * sava backups before saving signed certs * add "-prebuilt-test-cmds" test flag * linters * prepare cli tests for travis * linter fixes * more fixes * linter gods * sp/sdk/sim * remove ca.difficulty * remove unused difficulty * return setup to its rightful place * wip travis * Revert "wip travis" This reverts commit 56834849dcf066d3cc0a4f139033fc3f6d7188ca. * typo in travis.yaml * remove tests * remove more * make it only create one identity at a time for consistency * add config-dir for consitency * add identity creation to storj-sim * add flags * simplify * fix nolint and compile * prevent overwrite and pass difficulty, concurrency, and parent creds * goimports
2019-01-18 13:54:08 +00:00
// Get looks up the provided nodeID from the overlay.
func (service *Service) Get(ctx context.Context, nodeID storj.NodeID) (_ *NodeDossier, err error) {
2019-03-23 08:06:11 +00:00
defer mon.Task()(&ctx)(&err)
2018-12-17 18:47:26 +00:00
if nodeID.IsZero() {
return nil, ErrEmptyNode
}
return service.db.Get(ctx, nodeID)
2018-04-18 16:34:15 +01:00
}
// CachedGetOnlineNodesForGet returns a map of nodes from the download selection cache from the suppliedIDs.
func (service *Service) CachedGetOnlineNodesForGet(ctx context.Context, nodeIDs []storj.NodeID) (_ map[storj.NodeID]*nodeselection.SelectedNode, err error) {
defer mon.Task()(&ctx)(&err)
return service.DownloadSelectionCache.GetNodes(ctx, nodeIDs)
}
// GetOnlineNodesForAuditRepair returns a map of nodes for the supplied nodeIDs.
func (service *Service) GetOnlineNodesForAuditRepair(ctx context.Context, nodeIDs []storj.NodeID) (_ map[storj.NodeID]*NodeReputation, err error) {
defer mon.Task()(&ctx)(&err)
return service.db.GetOnlineNodesForAuditRepair(ctx, nodeIDs, service.config.Node.OnlineWindow)
}
// GetNodeIPsFromPlacement returns a map of node ip:port for the supplied nodeIDs. Results are filtered out by placement.
func (service *Service) GetNodeIPsFromPlacement(ctx context.Context, nodeIDs []storj.NodeID, placement storj.PlacementConstraint) (_ map[storj.NodeID]string, err error) {
defer mon.Task()(&ctx)(&err)
return service.DownloadSelectionCache.GetNodeIPsFromPlacement(ctx, nodeIDs, placement)
}
// IsOnline checks if a node is 'online' based on the collected statistics.
func (service *Service) IsOnline(node *NodeDossier) bool {
return time.Since(node.Reputation.LastContactSuccess) < service.config.Node.OnlineWindow
}
// FindStorageNodesForGracefulExit searches the overlay network for nodes that meet the provided requirements for graceful-exit requests.
func (service *Service) FindStorageNodesForGracefulExit(ctx context.Context, req FindStorageNodesRequest) (_ []*nodeselection.SelectedNode, err error) {
defer mon.Task()(&ctx)(&err)
return service.UploadSelectionCache.GetNodes(ctx, req)
}
// FindStorageNodesForUpload searches the for nodes in the cache that meet the provided requirements for upload.
func (service *Service) FindStorageNodesForUpload(ctx context.Context, req FindStorageNodesRequest) (_ []*nodeselection.SelectedNode, err error) {
defer mon.Task()(&ctx)(&err)
if service.config.Node.AsOfSystemTime.Enabled && service.config.Node.AsOfSystemTime.DefaultInterval < 0 {
req.AsOfSystemInterval = service.config.Node.AsOfSystemTime.DefaultInterval
}
selectedNodes, err := service.UploadSelectionCache.GetNodes(ctx, req)
if err != nil {
return selectedNodes, err
}
if len(selectedNodes) < req.RequestedCount {
excludedIDs := make([]string, 0)
for _, e := range req.ExcludedIDs {
excludedIDs = append(excludedIDs, e.String())
}
service.log.Warn("Not enough nodes are available from Node Cache",
zap.String("minVersion", req.MinimumVersion),
zap.Strings("excludedIDs", excludedIDs),
zap.Duration("asOfSystemInterval", req.AsOfSystemInterval),
zap.Int("requested", req.RequestedCount),
zap.Int("available", len(selectedNodes)),
zap.Uint16("placement", uint16(req.Placement)))
}
return selectedNodes, err
}
// InsertOfflineNodeEvents inserts offline events into node events.
func (service *Service) InsertOfflineNodeEvents(ctx context.Context, cooldown time.Duration, cutoff time.Duration, limit int) (count int, err error) {
defer mon.Task()(&ctx)(&err)
if !service.config.SendNodeEmails {
return 0, nil
}
nodes, err := service.db.GetOfflineNodesForEmail(ctx, service.config.Node.OnlineWindow, cutoff, cooldown, limit)
if err != nil {
return 0, err
}
count = len(nodes)
var successful storj.NodeIDList
for id, email := range nodes {
_, err = service.nodeEvents.Insert(ctx, email, id, nodeevents.Offline)
if err != nil {
service.log.Error("could not insert node offline into node events", zap.Error(err))
} else {
successful = append(successful, id)
}
}
if len(successful) > 0 {
err = service.db.UpdateLastOfflineEmail(ctx, successful, time.Now())
if err != nil {
return count, err
}
}
return count, err
}
// KnownReliable filters a set of nodes to reliable (online and qualified) nodes.
func (service *Service) KnownReliable(ctx context.Context, nodeIDs storj.NodeIDList) (onlineNodes []nodeselection.SelectedNode, offlineNodes []nodeselection.SelectedNode, err error) {
defer mon.Task()(&ctx)(&err)
// TODO add as of system time
return service.db.KnownReliable(ctx, nodeIDs, service.config.Node.OnlineWindow, 0)
}
// Reliable returns all nodes that are reliable (separated by whether they are currently online or offline).
func (service *Service) Reliable(ctx context.Context) (online []nodeselection.SelectedNode, offline []nodeselection.SelectedNode, err error) {
defer mon.Task()(&ctx)(&err)
// TODO add as of system tim.
return service.db.Reliable(ctx, service.config.Node.OnlineWindow, 0)
}
// UpdateReputation updates the DB columns for any of the reputation fields.
func (service *Service) UpdateReputation(ctx context.Context, id storj.NodeID, email string, request ReputationUpdate, reputationChanges []nodeevents.Type) (err error) {
defer mon.Task()(&ctx)(&err)
err = service.db.UpdateReputation(ctx, id, request)
if err != nil {
return err
}
if service.config.SendNodeEmails {
service.insertReputationNodeEvents(ctx, email, id, reputationChanges)
}
return nil
}
Add Version Information into KAD Network and SatelliteDB & Change Selection Process (#1648) * Initial Webserver Draft for Version Controlling * Rename type to avoid confusion * Move Function Calls into Version Package * Fix Linting and Language Typos * Fix Linting and Spelling Mistakes * Include Copyright * Include Copyright * Adjust Version-Control Server to return list of Versions * Linting * Improve Request Handling and Readability * Add Configuration File Option Add Systemd Service file * Add Logging to File * Smaller Changes * Add Semantic Versioning and refuses outdated Software from Startup (#1612) * implements internal Semantic Version library * adds version logging + reporting to process * Advance SemVer struct for easier handling * Add Accepted Version Store * Fix Function * Restructure * Type Conversion * Handle Version String properly * Add Note about array index * Set temporary Default Version * Add Copyright * Adding Version to Dashboard * Adding Version Info Log * Renaming and adding CheckerProcess * Iteration Sync * Iteration V2 * linting * made LogAndReportVersion a go routine * Refactor to Go Routine * Add Context to Go Routine and allow Operation if Lookup to Control Server fails * Handle Unmarshal properly * Linting * Relocate Version Checks * Relocating Version Check and specified default Version for now * Linting Error Prevention * Refuse Startup on outdated Version * Add Startup Check Function * Straighten Logging * Dont force Shutdown if --dev flag is set * Create full Service/Peer Structure for ControlServer * Linting * Straighting Naming * Finish VersionControl Service Layout * Improve Error Handling * Change Listening Address * Move Checker Function * Remove VersionControl Peer * Linting * Linting * Create VersionClient Service * Renaming * Add Version Client to Peer Definitions * Linting and Renaming * Linting * Remove Transport Checks for now * Move to Client Side Flag * Remove check * Linting * Transport Client Version Intro * Adding Version Client to Transport Client * Add missing parameter * Adding Version Check, to set Allowed = true * Set Default to true, testing * Restructuring Code * Uplink Changes * Add more proper Defaults * Renaming of Version struct * Dont pass Service use Pointer * Set Defaults for Versioning Checks * Put HTTP Server in go routine * Add Versioncontrol to Storj-Sim * Testplanet Fixes * Linting * Add Error Handling and new Server Struct * Move Lock slightly * Reduce Race Potentials * Remove unnecessary files * Linting * Add Proper Transport Handling * small fixes * add fence for allowed check * Add Startup Version Check and Service Naming * make errormessage private * Add Comments about VersionedClient * Linting * Remove Checks that refuse outgoing connections * Remove release cmd * Add Release Script * Linting * Update to use correct Values * Change Timestamp handling * Adding Protobuf changes back in * Adding SatelliteDB Changes and adding Storj Node Version to PB * Add Migration Table * Add Default Stats for Creation * Move to BigInt * Proper SQL Migration * Ensure minimum Version is passed to the node selection * Linting... * Remove VersionedClient and adjust smaller changes from prior merge * Linting * Fix PB Message Handling and Query for Node Selection * some future-proofing type changes Change-Id: I3cb5018dcccdbc9739fe004d859065992720caaf * fix a compiler error Change-Id: If66bb92d8b98e31cd618ecec9c6448ab9b037fa5 * Comment on Constant for Overlay * Remove NOT NULL and add epoch call as function * add versions to bootstrap and satellites Change-Id: I436944589ea5f21600cdd997742a84fe0b16e47b * Change Update Migration * Fix DB Migration * Increase Timeout temporarily, to see whats going on * Remove unnecessary const and vars Cleanup Function calls from deprecated NodeVersion struct * Updated Protopuf, removed depcreated Code from Inspector * Implement NodeVersion into InfoResponse * Regenerated locked.go * Linting * Fix Tests * Remove unnecessary constant * Update Function and Flag Description * Remove Empty Stat Creation * return properly with error * Remove unnecessary struct * simplify migration step * Update Inspector to return Version Info * Update local Endpoint Version Handling * Reset Travis Timeout * Add Default for CommitHash * single quotes
2019-04-10 07:04:24 +01:00
// UpdateNodeInfo updates node dossier with info requested from the node itself like node type, email, wallet, capacity, and version.
func (service *Service) UpdateNodeInfo(ctx context.Context, node storj.NodeID, nodeInfo *InfoResponse) (stats *NodeDossier, err error) {
defer mon.Task()(&ctx)(&err)
return service.db.UpdateNodeInfo(ctx, node, nodeInfo)
}
// SetNodeContained updates the contained field for the node record. If
// `contained` is true, the contained field in the record is set to the current
// database time, if it is not already set. If `contained` is false, the
// contained field in the record is set to NULL. All other fields are left
// alone.
func (service *Service) SetNodeContained(ctx context.Context, node storj.NodeID, contained bool) (err error) {
defer mon.Task()(&ctx)(&err)
return service.db.SetNodeContained(ctx, node, contained)
}
// UpdateCheckIn updates a single storagenode's check-in info if needed.
/*
The check-in info is updated in the database if:
(1) there is no previous entry and the node is allowed (id difficulty, etc);
(2) it has been too long since the last known entry; or
(3) the node hostname, IP address, port, wallet, sw version, or disk capacity
has changed.
Note that there can be a race between acquiring the previous entry and
performing the update, so if two updates happen at about the same time it is
not defined which one will end up in the database.
*/
func (service *Service) UpdateCheckIn(ctx context.Context, node NodeCheckInInfo, timestamp time.Time) (err error) {
defer mon.Task()(&ctx)(&err)
failureMeter := mon.Meter("geofencing_lookup_failed")
oldInfo, err := service.Get(ctx, node.NodeID)
if err != nil && !ErrNodeNotFound.Has(err) {
return Error.New("failed to get node info from DB")
}
if oldInfo == nil {
if !node.IsUp {
// this is a previously unknown node, and we couldn't pingback to verify that it even
// exists. Don't bother putting it in the db.
return nil
}
difficulty, err := node.NodeID.Difficulty()
if err != nil {
// this should never happen
return err
}
if int(difficulty) < service.config.MinimumNewNodeIDDifficulty {
return ErrLowDifficulty.New("node id difficulty is %d when %d is the minimum",
difficulty, service.config.MinimumNewNodeIDDifficulty)
}
node.CountryCode, err = service.GeoIP.LookupISOCountryCode(node.LastIPPort)
if err != nil {
failureMeter.Mark(1)
service.log.Debug("failed to resolve country code for node",
zap.String("node address", node.Address.Address),
zap.Stringer("Node ID", node.NodeID),
zap.Error(err))
}
return service.db.UpdateCheckIn(ctx, node, timestamp, service.config.Node)
}
lastUp, lastDown := oldInfo.Reputation.LastContactSuccess, oldInfo.Reputation.LastContactFailure
lastContact := lastUp
if lastContact.Before(lastDown) {
lastContact = lastDown
}
dbStale := lastContact.Add(service.config.NodeCheckInWaitPeriod).Before(timestamp) ||
(node.IsUp && lastUp.Before(lastDown)) || (!node.IsUp && lastDown.Before(lastUp))
addrChanged := !pb.AddressEqual(node.Address, oldInfo.Address)
walletChanged := (node.Operator == nil && oldInfo.Operator.Wallet != "") ||
(node.Operator != nil && oldInfo.Operator.Wallet != node.Operator.Wallet)
verChanged := (node.Version == nil && oldInfo.Version.Version != "") ||
(node.Version != nil && oldInfo.Version.Version != node.Version.Version)
spaceChanged := (node.Capacity == nil && oldInfo.Capacity.FreeDisk != 0) ||
(node.Capacity != nil && node.Capacity.FreeDisk != oldInfo.Capacity.FreeDisk)
node.CountryCode, err = service.GeoIP.LookupISOCountryCode(node.LastIPPort)
if err != nil {
failureMeter.Mark(1)
service.log.Debug("failed to resolve country code for node",
zap.String("node address", node.Address.Address),
zap.Stringer("Node ID", node.NodeID),
zap.Error(err))
}
if service.config.SendNodeEmails && service.config.Node.MinimumVersion != "" {
min, err := version.NewSemVer(service.config.Node.MinimumVersion)
if err != nil {
return err
}
v, err := version.NewSemVer(node.Version.GetVersion())
if err != nil {
return err
}
if v.Compare(min) == -1 {
node.VersionBelowMin = true
if oldInfo.LastSoftwareUpdateEmail == nil ||
oldInfo.LastSoftwareUpdateEmail.Add(service.config.NodeSoftwareUpdateEmailCooldown).Before(timestamp) {
_, err = service.nodeEvents.Insert(ctx, node.Operator.Email, node.NodeID, nodeevents.BelowMinVersion)
if err != nil {
service.log.Error("could not insert node software below minimum version into node events", zap.Error(err))
} else {
node.SoftwareUpdateEmailSent = true
}
}
}
}
if dbStale || addrChanged || walletChanged || verChanged || spaceChanged ||
oldInfo.LastNet != node.LastNet || oldInfo.LastIPPort != node.LastIPPort ||
oldInfo.CountryCode != node.CountryCode || node.SoftwareUpdateEmailSent {
err = service.db.UpdateCheckIn(ctx, node, timestamp, service.config.Node)
if err != nil {
return Error.Wrap(err)
}
if service.config.SendNodeEmails && node.IsUp && oldInfo.Reputation.LastContactSuccess.Add(service.config.Node.OnlineWindow).Before(timestamp) {
_, err = service.nodeEvents.Insert(ctx, node.Operator.Email, node.NodeID, nodeevents.Online)
return Error.Wrap(err)
}
return nil
}
service.log.Debug("ignoring unnecessary check-in",
zap.String("node address", node.Address.Address),
zap.Stringer("Node ID", node.NodeID))
mon.Event("unnecessary_node_check_in")
return nil
}
// GetMissingPieces returns the list of offline nodes and the corresponding pieces.
func (service *Service) GetMissingPieces(ctx context.Context, pieces metabase.Pieces) (missingPieces []uint16, err error) {
defer mon.Task()(&ctx)(&err)
// TODO this method will be removed completely in subsequent change
var nodeIDs storj.NodeIDList
missingPiecesMap := map[storj.NodeID]uint16{}
for _, p := range pieces {
nodeIDs = append(nodeIDs, p.StorageNode)
missingPiecesMap[p.StorageNode] = p.Number
}
onlineNodes, _, err := service.KnownReliable(ctx, nodeIDs)
if err != nil {
return nil, Error.New("error getting nodes %s", err)
}
for _, node := range onlineNodes {
delete(missingPiecesMap, node.ID)
}
return maps.Values(missingPiecesMap), nil
}
// DQNodesLastSeenBefore disqualifies nodes who have not been contacted since the cutoff time.
func (service *Service) DQNodesLastSeenBefore(ctx context.Context, cutoff time.Time, limit int) (count int, err error) {
defer mon.Task()(&ctx)(&err)
nodes, count, err := service.db.DQNodesLastSeenBefore(ctx, cutoff, limit)
if err != nil {
return 0, err
}
if service.config.SendNodeEmails {
for nodeID, email := range nodes {
_, err = service.nodeEvents.Insert(ctx, email, nodeID, nodeevents.Disqualified)
if err != nil {
service.log.Error("could not insert node disqualified into node events", zap.Error(err))
}
}
}
return count, err
}
// DisqualifyNode disqualifies a storage node.
func (service *Service) DisqualifyNode(ctx context.Context, nodeID storj.NodeID, reason DisqualificationReason) (err error) {
defer mon.Task()(&ctx)(&err)
email, err := service.db.DisqualifyNode(ctx, nodeID, time.Now().UTC(), reason)
if err != nil {
return err
}
if service.config.SendNodeEmails {
_, err = service.nodeEvents.Insert(ctx, email, nodeID, nodeevents.Disqualified)
if err != nil {
service.log.Error("could not insert node disqualified into node events")
}
}
return nil
}
// SelectAllStorageNodesDownload returns a nodes that are ready for downloading.
func (service *Service) SelectAllStorageNodesDownload(ctx context.Context, onlineWindow time.Duration, asOf AsOfSystemTimeConfig) (_ []*nodeselection.SelectedNode, err error) {
defer mon.Task()(&ctx)(&err)
return service.db.SelectAllStorageNodesDownload(ctx, onlineWindow, asOf)
}
satellite/overlay: configurable meaning of last_net Up to now, we have been implementing the DistinctIP preference with code in two places: 1. On check-in, the last_net is determined by taking the /24 or /64 (in ResolveIPAndNetwork()) and we store it with the node record. 2. On node selection, a preference parameter defines whether to return results that are distinct on last_net. It can be observed that we have never yet had the need to switch from DistinctIP to !DistinctIP, or from !DistinctIP to DistinctIP, on the same satellite, and we will probably never need to do so in an automated way. It can also be observed that this arrangement makes tests more complicated, because we often have to arrange for test nodes to have IP addresses in different /24 networks (a particular pain on macOS). Those two considerations, plus some pending work on the repair framework that will make repair take last_net into consideration, motivate this change. With this change, in the #2 place, we will _always_ return results that are distinct on last_net. We implement the DistinctIP preference, then, by making the #1 place (ResolveIPAndNetwork()) more flexible. When DistinctIP is enabled, last_net will be calculated as it was before. But when DistinctIP is _off_, last_net can be the same as address (IP and port). That will effectively implement !DistinctIP because every record will have a distinct last_net already. As a side effect, this flexibility will allow us to change the rules about last_net construction arbitrarily. We can do tests where last_net is set to the source IP, or to a /30 prefix, or a /16 prefix, etc., and be able to exercise the production logic without requiring a virtual network bridge. This change should be safe to make without any migration code, because all known production satellite deployments use DistinctIP, and the associated last_net values will not change for them. They will only change for satellites with !DistinctIP, which are mostly test deployments that can be recreated trivially. For those satellites which are both permanent and !DistinctIP, node selection will suddenly start acting as though DistinctIP is enabled, until the operator runs a single SQL update "UPDATE nodes SET last_net = last_ip_port". That can be done either before or after deploying software with this change. I also assert that this will not hurt performance for production deployments. It's true that adding the distinct requirement to node selection makes things a little slower, but the distinct requirement is already present for all production deployments, and they will see no change. Refs: https://github.com/storj/storj/issues/5391 Change-Id: I0e7e92498c3da768df5b4d5fb213dcd2d4862924
2023-02-28 22:57:39 +00:00
// ResolveIPAndNetwork resolves the target address and determines its IP and appropriate subnet IPv4 or subnet IPv6.
func (service *Service) ResolveIPAndNetwork(ctx context.Context, target string) (ip net.IP, port, network string, err error) {
// LastNetFunc is MaskOffLastNet, unless changed for a test.
return ResolveIPAndNetwork(ctx, target, service.config.Node, service.LastNetFunc)
}
// UpdateNodeTags persists all new and old node tags.
func (service *Service) UpdateNodeTags(ctx context.Context, tags []nodeselection.NodeTag) error {
return service.db.UpdateNodeTags(ctx, tags)
}
// GetNodeTags returns the node tags of a node.
func (service *Service) GetNodeTags(ctx context.Context, id storj.NodeID) (nodeselection.NodeTags, error) {
return service.db.GetNodeTags(ctx, id)
}
// GetLocationFromPlacement returns the value for `nodeselection.Location` that
// placement is currently annotated with.
func (service *Service) GetLocationFromPlacement(placement storj.PlacementConstraint) string {
return nodeselection.GetAnnotation(service.placementRules(placement), nodeselection.Location)
}
satellite/overlay: configurable meaning of last_net Up to now, we have been implementing the DistinctIP preference with code in two places: 1. On check-in, the last_net is determined by taking the /24 or /64 (in ResolveIPAndNetwork()) and we store it with the node record. 2. On node selection, a preference parameter defines whether to return results that are distinct on last_net. It can be observed that we have never yet had the need to switch from DistinctIP to !DistinctIP, or from !DistinctIP to DistinctIP, on the same satellite, and we will probably never need to do so in an automated way. It can also be observed that this arrangement makes tests more complicated, because we often have to arrange for test nodes to have IP addresses in different /24 networks (a particular pain on macOS). Those two considerations, plus some pending work on the repair framework that will make repair take last_net into consideration, motivate this change. With this change, in the #2 place, we will _always_ return results that are distinct on last_net. We implement the DistinctIP preference, then, by making the #1 place (ResolveIPAndNetwork()) more flexible. When DistinctIP is enabled, last_net will be calculated as it was before. But when DistinctIP is _off_, last_net can be the same as address (IP and port). That will effectively implement !DistinctIP because every record will have a distinct last_net already. As a side effect, this flexibility will allow us to change the rules about last_net construction arbitrarily. We can do tests where last_net is set to the source IP, or to a /30 prefix, or a /16 prefix, etc., and be able to exercise the production logic without requiring a virtual network bridge. This change should be safe to make without any migration code, because all known production satellite deployments use DistinctIP, and the associated last_net values will not change for them. They will only change for satellites with !DistinctIP, which are mostly test deployments that can be recreated trivially. For those satellites which are both permanent and !DistinctIP, node selection will suddenly start acting as though DistinctIP is enabled, until the operator runs a single SQL update "UPDATE nodes SET last_net = last_ip_port". That can be done either before or after deploying software with this change. I also assert that this will not hurt performance for production deployments. It's true that adding the distinct requirement to node selection makes things a little slower, but the distinct requirement is already present for all production deployments, and they will see no change. Refs: https://github.com/storj/storj/issues/5391 Change-Id: I0e7e92498c3da768df5b4d5fb213dcd2d4862924
2023-02-28 22:57:39 +00:00
// ResolveIPAndNetwork resolves the target address and determines its IP and appropriate last_net, as indicated.
func ResolveIPAndNetwork(ctx context.Context, target string, config NodeSelectionConfig, lastNetFunc LastNetFunc) (ip net.IP, port, network string, err error) {
2019-06-24 16:33:18 +01:00
defer mon.Task()(&ctx)(&err)
host, port, err := net.SplitHostPort(target)
if err != nil {
return nil, "", "", err
}
ipAddr, err := net.ResolveIPAddr("ip", host)
if err != nil {
return nil, "", "", err
}
2019-06-24 16:33:18 +01:00
satellite/overlay: configurable meaning of last_net Up to now, we have been implementing the DistinctIP preference with code in two places: 1. On check-in, the last_net is determined by taking the /24 or /64 (in ResolveIPAndNetwork()) and we store it with the node record. 2. On node selection, a preference parameter defines whether to return results that are distinct on last_net. It can be observed that we have never yet had the need to switch from DistinctIP to !DistinctIP, or from !DistinctIP to DistinctIP, on the same satellite, and we will probably never need to do so in an automated way. It can also be observed that this arrangement makes tests more complicated, because we often have to arrange for test nodes to have IP addresses in different /24 networks (a particular pain on macOS). Those two considerations, plus some pending work on the repair framework that will make repair take last_net into consideration, motivate this change. With this change, in the #2 place, we will _always_ return results that are distinct on last_net. We implement the DistinctIP preference, then, by making the #1 place (ResolveIPAndNetwork()) more flexible. When DistinctIP is enabled, last_net will be calculated as it was before. But when DistinctIP is _off_, last_net can be the same as address (IP and port). That will effectively implement !DistinctIP because every record will have a distinct last_net already. As a side effect, this flexibility will allow us to change the rules about last_net construction arbitrarily. We can do tests where last_net is set to the source IP, or to a /30 prefix, or a /16 prefix, etc., and be able to exercise the production logic without requiring a virtual network bridge. This change should be safe to make without any migration code, because all known production satellite deployments use DistinctIP, and the associated last_net values will not change for them. They will only change for satellites with !DistinctIP, which are mostly test deployments that can be recreated trivially. For those satellites which are both permanent and !DistinctIP, node selection will suddenly start acting as though DistinctIP is enabled, until the operator runs a single SQL update "UPDATE nodes SET last_net = last_ip_port". That can be done either before or after deploying software with this change. I also assert that this will not hurt performance for production deployments. It's true that adding the distinct requirement to node selection makes things a little slower, but the distinct requirement is already present for all production deployments, and they will see no change. Refs: https://github.com/storj/storj/issues/5391 Change-Id: I0e7e92498c3da768df5b4d5fb213dcd2d4862924
2023-02-28 22:57:39 +00:00
network, err = lastNetFunc(config, ipAddr.IP, port)
if err != nil {
return nil, "", "", err
2019-06-24 16:33:18 +01:00
}
satellite/overlay: configurable meaning of last_net Up to now, we have been implementing the DistinctIP preference with code in two places: 1. On check-in, the last_net is determined by taking the /24 or /64 (in ResolveIPAndNetwork()) and we store it with the node record. 2. On node selection, a preference parameter defines whether to return results that are distinct on last_net. It can be observed that we have never yet had the need to switch from DistinctIP to !DistinctIP, or from !DistinctIP to DistinctIP, on the same satellite, and we will probably never need to do so in an automated way. It can also be observed that this arrangement makes tests more complicated, because we often have to arrange for test nodes to have IP addresses in different /24 networks (a particular pain on macOS). Those two considerations, plus some pending work on the repair framework that will make repair take last_net into consideration, motivate this change. With this change, in the #2 place, we will _always_ return results that are distinct on last_net. We implement the DistinctIP preference, then, by making the #1 place (ResolveIPAndNetwork()) more flexible. When DistinctIP is enabled, last_net will be calculated as it was before. But when DistinctIP is _off_, last_net can be the same as address (IP and port). That will effectively implement !DistinctIP because every record will have a distinct last_net already. As a side effect, this flexibility will allow us to change the rules about last_net construction arbitrarily. We can do tests where last_net is set to the source IP, or to a /30 prefix, or a /16 prefix, etc., and be able to exercise the production logic without requiring a virtual network bridge. This change should be safe to make without any migration code, because all known production satellite deployments use DistinctIP, and the associated last_net values will not change for them. They will only change for satellites with !DistinctIP, which are mostly test deployments that can be recreated trivially. For those satellites which are both permanent and !DistinctIP, node selection will suddenly start acting as though DistinctIP is enabled, until the operator runs a single SQL update "UPDATE nodes SET last_net = last_ip_port". That can be done either before or after deploying software with this change. I also assert that this will not hurt performance for production deployments. It's true that adding the distinct requirement to node selection makes things a little slower, but the distinct requirement is already present for all production deployments, and they will see no change. Refs: https://github.com/storj/storj/issues/5391 Change-Id: I0e7e92498c3da768df5b4d5fb213dcd2d4862924
2023-02-28 22:57:39 +00:00
return ipAddr.IP, port, network, nil
}
// MaskOffLastNet truncates the target address to the configured CIDR ipv6Cidr or ipv6Cidr prefix,
// if DistinctIP is enabled in the config. Otherwise, it returns the joined IP and port.
func MaskOffLastNet(config NodeSelectionConfig, addr net.IP, port string) (string, error) {
if config.DistinctIP {
// Filter all IPv4 Addresses into /24 subnets, and filter all IPv6 Addresses into /64 subnets
return truncateIPToNet(addr, config.NetworkPrefixIPv4, config.NetworkPrefixIPv6)
2019-06-24 16:33:18 +01:00
}
satellite/overlay: configurable meaning of last_net Up to now, we have been implementing the DistinctIP preference with code in two places: 1. On check-in, the last_net is determined by taking the /24 or /64 (in ResolveIPAndNetwork()) and we store it with the node record. 2. On node selection, a preference parameter defines whether to return results that are distinct on last_net. It can be observed that we have never yet had the need to switch from DistinctIP to !DistinctIP, or from !DistinctIP to DistinctIP, on the same satellite, and we will probably never need to do so in an automated way. It can also be observed that this arrangement makes tests more complicated, because we often have to arrange for test nodes to have IP addresses in different /24 networks (a particular pain on macOS). Those two considerations, plus some pending work on the repair framework that will make repair take last_net into consideration, motivate this change. With this change, in the #2 place, we will _always_ return results that are distinct on last_net. We implement the DistinctIP preference, then, by making the #1 place (ResolveIPAndNetwork()) more flexible. When DistinctIP is enabled, last_net will be calculated as it was before. But when DistinctIP is _off_, last_net can be the same as address (IP and port). That will effectively implement !DistinctIP because every record will have a distinct last_net already. As a side effect, this flexibility will allow us to change the rules about last_net construction arbitrarily. We can do tests where last_net is set to the source IP, or to a /30 prefix, or a /16 prefix, etc., and be able to exercise the production logic without requiring a virtual network bridge. This change should be safe to make without any migration code, because all known production satellite deployments use DistinctIP, and the associated last_net values will not change for them. They will only change for satellites with !DistinctIP, which are mostly test deployments that can be recreated trivially. For those satellites which are both permanent and !DistinctIP, node selection will suddenly start acting as though DistinctIP is enabled, until the operator runs a single SQL update "UPDATE nodes SET last_net = last_ip_port". That can be done either before or after deploying software with this change. I also assert that this will not hurt performance for production deployments. It's true that adding the distinct requirement to node selection makes things a little slower, but the distinct requirement is already present for all production deployments, and they will see no change. Refs: https://github.com/storj/storj/issues/5391 Change-Id: I0e7e92498c3da768df5b4d5fb213dcd2d4862924
2023-02-28 22:57:39 +00:00
// The "network" here will be the full IP and port; that is, every node will be considered to
// be on a separate network, even if they all come from one IP (such as localhost).
return net.JoinHostPort(addr.String(), port), nil
}
2019-06-24 16:33:18 +01:00
satellite/overlay: configurable meaning of last_net Up to now, we have been implementing the DistinctIP preference with code in two places: 1. On check-in, the last_net is determined by taking the /24 or /64 (in ResolveIPAndNetwork()) and we store it with the node record. 2. On node selection, a preference parameter defines whether to return results that are distinct on last_net. It can be observed that we have never yet had the need to switch from DistinctIP to !DistinctIP, or from !DistinctIP to DistinctIP, on the same satellite, and we will probably never need to do so in an automated way. It can also be observed that this arrangement makes tests more complicated, because we often have to arrange for test nodes to have IP addresses in different /24 networks (a particular pain on macOS). Those two considerations, plus some pending work on the repair framework that will make repair take last_net into consideration, motivate this change. With this change, in the #2 place, we will _always_ return results that are distinct on last_net. We implement the DistinctIP preference, then, by making the #1 place (ResolveIPAndNetwork()) more flexible. When DistinctIP is enabled, last_net will be calculated as it was before. But when DistinctIP is _off_, last_net can be the same as address (IP and port). That will effectively implement !DistinctIP because every record will have a distinct last_net already. As a side effect, this flexibility will allow us to change the rules about last_net construction arbitrarily. We can do tests where last_net is set to the source IP, or to a /30 prefix, or a /16 prefix, etc., and be able to exercise the production logic without requiring a virtual network bridge. This change should be safe to make without any migration code, because all known production satellite deployments use DistinctIP, and the associated last_net values will not change for them. They will only change for satellites with !DistinctIP, which are mostly test deployments that can be recreated trivially. For those satellites which are both permanent and !DistinctIP, node selection will suddenly start acting as though DistinctIP is enabled, until the operator runs a single SQL update "UPDATE nodes SET last_net = last_ip_port". That can be done either before or after deploying software with this change. I also assert that this will not hurt performance for production deployments. It's true that adding the distinct requirement to node selection makes things a little slower, but the distinct requirement is already present for all production deployments, and they will see no change. Refs: https://github.com/storj/storj/issues/5391 Change-Id: I0e7e92498c3da768df5b4d5fb213dcd2d4862924
2023-02-28 22:57:39 +00:00
// truncateIPToNet truncates the target address to the given CIDR ipv4Cidr or ipv6Cidr prefix,
// according to which type of IP it is.
func truncateIPToNet(ipAddr net.IP, ipv4Cidr, ipv6Cidr int) (network string, err error) {
// If addr can be converted to 4byte notation, it is an IPv4 address, else its an IPv6 address
if ipv4 := ipAddr.To4(); ipv4 != nil {
mask := net.CIDRMask(ipv4Cidr, 32)
return ipv4.Mask(mask).String(), nil
}
if ipv6 := ipAddr.To16(); ipv6 != nil {
mask := net.CIDRMask(ipv6Cidr, 128)
return ipv6.Mask(mask).String(), nil
}
return "", fmt.Errorf("unable to get network for address %s", ipAddr.String())
}
// TestVetNode directly sets a node's vetted_at timestamp to make testing easier.
func (service *Service) TestVetNode(ctx context.Context, nodeID storj.NodeID) (vettedTime *time.Time, err error) {
vettedTime, err = service.db.TestVetNode(ctx, nodeID)
service.log.Warn("node vetted", zap.Stringer("node ID", nodeID), zap.Stringer("vetted time", vettedTime))
if err != nil {
service.log.Warn("error vetting node", zap.Stringer("node ID", nodeID))
return nil, err
}
err = service.UploadSelectionCache.Refresh(ctx)
service.log.Warn("nodecache refresh err", zap.Error(err))
return vettedTime, err
}
// TestUnvetNode directly sets a node's vetted_at timestamp to null to make testing easier.
func (service *Service) TestUnvetNode(ctx context.Context, nodeID storj.NodeID) (err error) {
err = service.db.TestUnvetNode(ctx, nodeID)
if err != nil {
service.log.Warn("error unvetting node", zap.Stringer("node ID", nodeID), zap.Error(err))
return err
}
err = service.UploadSelectionCache.Refresh(ctx)
service.log.Warn("nodecache refresh err", zap.Error(err))
return err
}
// TestNodeCountryCode directly sets a node's vetted_at timestamp to null to make testing easier.
func (service *Service) TestNodeCountryCode(ctx context.Context, nodeID storj.NodeID, countryCode string) (err error) {
err = service.db.TestNodeCountryCode(ctx, nodeID, countryCode)
if err != nil {
service.log.Warn("error updating node", zap.Stringer("node ID", nodeID), zap.Error(err))
return err
}
return nil
}
func (service *Service) insertReputationNodeEvents(ctx context.Context, email string, id storj.NodeID, repEvents []nodeevents.Type) {
defer mon.Task()(&ctx)(nil)
for _, event := range repEvents {
switch event {
case nodeevents.Disqualified:
_, err := service.nodeEvents.Insert(ctx, email, id, nodeevents.Disqualified)
if err != nil {
service.log.Error("could not insert node disqualified into node events", zap.Error(err))
}
case nodeevents.UnknownAuditSuspended:
_, err := service.nodeEvents.Insert(ctx, email, id, nodeevents.UnknownAuditSuspended)
if err != nil {
service.log.Error("could not insert node unknown audit suspended into node events", zap.Error(err))
}
case nodeevents.UnknownAuditUnsuspended:
_, err := service.nodeEvents.Insert(ctx, email, id, nodeevents.UnknownAuditUnsuspended)
if err != nil {
service.log.Error("could not insert node unknown audit unsuspended into node events", zap.Error(err))
}
case nodeevents.OfflineSuspended:
_, err := service.nodeEvents.Insert(ctx, email, id, nodeevents.OfflineSuspended)
if err != nil {
service.log.Error("could not insert node offline suspended into node events", zap.Error(err))
}
case nodeevents.OfflineUnsuspended:
_, err := service.nodeEvents.Insert(ctx, email, id, nodeevents.OfflineUnsuspended)
if err != nil {
service.log.Error("could not insert node offline unsuspended into node events", zap.Error(err))
}
default:
}
}
}