diff --git a/internal/testblobs/slow.go b/internal/testblobs/slow.go index ea6b3ff63..464303801 100644 --- a/internal/testblobs/slow.go +++ b/internal/testblobs/slow.go @@ -26,7 +26,7 @@ type SlowDB struct { func NewSlowDB(log *zap.Logger, db storagenode.DB) *SlowDB { return &SlowDB{ DB: db, - blobs: NewSlowBlobs(log, db.Pieces()), + blobs: newSlowBlobs(log, db.Pieces()), log: log, } } @@ -49,9 +49,9 @@ type SlowBlobs struct { log *zap.Logger } -// NewSlowBlobs creates a new slow blob store wrapping the provided blobs. +// newSlowBlobs creates a new slow blob store wrapping the provided blobs. // Use SetLatency to dynamically configure the latency of all operations. -func NewSlowBlobs(log *zap.Logger, blobs storage.Blobs) *SlowBlobs { +func newSlowBlobs(log *zap.Logger, blobs storage.Blobs) *SlowBlobs { return &SlowBlobs{ log: log, blobs: blobs, @@ -71,18 +71,64 @@ func (slow *SlowBlobs) Open(ctx context.Context, ref storage.BlobRef) (storage.B return slow.blobs.Open(ctx, ref) } +// OpenWithStorageFormat opens a reader for the already-located blob, avoiding the potential need +// to check multiple storage formats to find the blob. +func (slow *SlowBlobs) OpenWithStorageFormat(ctx context.Context, ref storage.BlobRef, formatVer storage.FormatVersion) (storage.BlobReader, error) { + slow.sleep() + return slow.blobs.OpenWithStorageFormat(ctx, ref, formatVer) +} + // Delete deletes the blob with the namespace and key. func (slow *SlowBlobs) Delete(ctx context.Context, ref storage.BlobRef) error { slow.sleep() return slow.blobs.Delete(ctx, ref) } +// Stat looks up disk metadata on the blob file +func (slow *SlowBlobs) Stat(ctx context.Context, ref storage.BlobRef) (storage.BlobInfo, error) { + slow.sleep() + return slow.blobs.Stat(ctx, ref) +} + +// StatWithStorageFormat looks up disk metadata for the blob file with the given storage format +// version. This avoids the potential need to check multiple storage formats for the blob +// when the format is already known. +func (slow *SlowBlobs) StatWithStorageFormat(ctx context.Context, ref storage.BlobRef, formatVer storage.FormatVersion) (storage.BlobInfo, error) { + slow.sleep() + return slow.blobs.StatWithStorageFormat(ctx, ref, formatVer) +} + +// WalkNamespace executes walkFunc for each locally stored blob in the given namespace. +// If walkFunc returns a non-nil error, WalkNamespace will stop iterating and return the +// error immediately. +func (slow *SlowBlobs) WalkNamespace(ctx context.Context, namespace []byte, walkFunc func(storage.BlobInfo) error) error { + slow.sleep() + return slow.blobs.WalkNamespace(ctx, namespace, walkFunc) +} + +// ListNamespaces returns all namespaces that might be storing data. +func (slow *SlowBlobs) ListNamespaces(ctx context.Context) ([][]byte, error) { + return slow.blobs.ListNamespaces(ctx) +} + // FreeSpace return how much free space left for writing. func (slow *SlowBlobs) FreeSpace() (int64, error) { slow.sleep() return slow.blobs.FreeSpace() } +// SpaceUsed adds up how much is used in all namespaces +func (slow *SlowBlobs) SpaceUsed(ctx context.Context) (int64, error) { + slow.sleep() + return slow.blobs.SpaceUsed(ctx) +} + +// SpaceUsedInNamespace adds up how much is used in the given namespace +func (slow *SlowBlobs) SpaceUsedInNamespace(ctx context.Context, namespace []byte) (int64, error) { + slow.sleep() + return slow.blobs.SpaceUsedInNamespace(ctx, namespace) +} + // SetLatency configures the blob store to sleep for delay duration for all // operations. A zero or negative delay means no sleep. 
func (slow *SlowBlobs) SetLatency(delay time.Duration) { diff --git a/pkg/pb/piecestore2.pb.go b/pkg/pb/piecestore2.pb.go index 0730960e1..d7d559550 100644 --- a/pkg/pb/piecestore2.pb.go +++ b/pkg/pb/piecestore2.pb.go @@ -26,6 +26,31 @@ var _ = time.Kitchen // proto package needs to be updated. const _ = proto.GoGoProtoPackageIsVersion2 // please upgrade the proto package +type PieceHeader_FormatVersion int32 + +const ( + PieceHeader_FORMAT_V0 PieceHeader_FormatVersion = 0 + PieceHeader_FORMAT_V1 PieceHeader_FormatVersion = 1 +) + +var PieceHeader_FormatVersion_name = map[int32]string{ + 0: "FORMAT_V0", + 1: "FORMAT_V1", +} + +var PieceHeader_FormatVersion_value = map[string]int32{ + "FORMAT_V0": 0, + "FORMAT_V1": 1, +} + +func (x PieceHeader_FormatVersion) String() string { + return proto.EnumName(PieceHeader_FormatVersion_name, int32(x)) +} + +func (PieceHeader_FormatVersion) EnumDescriptor() ([]byte, []int) { + return fileDescriptor_23ff32dd550c2439, []int{8, 0} +} + // Expected order of messages from uplink: // OrderLimit -> // repeated @@ -523,7 +548,88 @@ func (m *RetainResponse) XXX_DiscardUnknown() { var xxx_messageInfo_RetainResponse proto.InternalMessageInfo +// PieceHeader is used in piece storage to keep track of piece attributes. +type PieceHeader struct { + // the storage format version being used for this piece. The piece filename should agree with this. + // The inclusion of this field is intended to aid repairability when filenames are damaged. + FormatVersion PieceHeader_FormatVersion `protobuf:"varint,1,opt,name=format_version,json=formatVersion,proto3,enum=piecestore.PieceHeader_FormatVersion" json:"format_version,omitempty"` + // content hash of the piece + Hash []byte `protobuf:"bytes,2,opt,name=hash,proto3" json:"hash,omitempty"` + // timestamp when upload occurred, as given by the "timestamp" field in the original orders.PieceHash + CreationTime time.Time `protobuf:"bytes,3,opt,name=creation_time,json=creationTime,proto3,stdtime" json:"creation_time"` + // signature from uplink over the original orders.PieceHash (the corresponding PieceHashSigning + // is reconstructable using the piece id from the piecestore, the piece size from the + // filesystem (minus the piece header size), and these (hash, upload_time, signature) fields). 
+ Signature []byte `protobuf:"bytes,4,opt,name=signature,proto3" json:"signature,omitempty"` + // the OrderLimit authorizing storage of this piece, as signed by the satellite and sent by + // the uplink + OrderLimit OrderLimit `protobuf:"bytes,5,opt,name=order_limit,json=orderLimit,proto3" json:"order_limit"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *PieceHeader) Reset() { *m = PieceHeader{} } +func (m *PieceHeader) String() string { return proto.CompactTextString(m) } +func (*PieceHeader) ProtoMessage() {} +func (*PieceHeader) Descriptor() ([]byte, []int) { + return fileDescriptor_23ff32dd550c2439, []int{8} +} +func (m *PieceHeader) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_PieceHeader.Unmarshal(m, b) +} +func (m *PieceHeader) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_PieceHeader.Marshal(b, m, deterministic) +} +func (m *PieceHeader) XXX_Merge(src proto.Message) { + xxx_messageInfo_PieceHeader.Merge(m, src) +} +func (m *PieceHeader) XXX_Size() int { + return xxx_messageInfo_PieceHeader.Size(m) +} +func (m *PieceHeader) XXX_DiscardUnknown() { + xxx_messageInfo_PieceHeader.DiscardUnknown(m) +} + +var xxx_messageInfo_PieceHeader proto.InternalMessageInfo + +func (m *PieceHeader) GetFormatVersion() PieceHeader_FormatVersion { + if m != nil { + return m.FormatVersion + } + return PieceHeader_FORMAT_V0 +} + +func (m *PieceHeader) GetHash() []byte { + if m != nil { + return m.Hash + } + return nil +} + +func (m *PieceHeader) GetCreationTime() time.Time { + if m != nil { + return m.CreationTime + } + return time.Time{} +} + +func (m *PieceHeader) GetSignature() []byte { + if m != nil { + return m.Signature + } + return nil +} + +func (m *PieceHeader) GetOrderLimit() OrderLimit { + if m != nil { + return m.OrderLimit + } + return OrderLimit{} +} + func init() { + proto.RegisterEnum("piecestore.PieceHeader_FormatVersion", PieceHeader_FormatVersion_name, PieceHeader_FormatVersion_value) proto.RegisterType((*PieceUploadRequest)(nil), "piecestore.PieceUploadRequest") proto.RegisterType((*PieceUploadRequest_Chunk)(nil), "piecestore.PieceUploadRequest.Chunk") proto.RegisterType((*PieceUploadResponse)(nil), "piecestore.PieceUploadResponse") @@ -535,45 +641,53 @@ func init() { proto.RegisterType((*PieceDeleteResponse)(nil), "piecestore.PieceDeleteResponse") proto.RegisterType((*RetainRequest)(nil), "piecestore.RetainRequest") proto.RegisterType((*RetainResponse)(nil), "piecestore.RetainResponse") + proto.RegisterType((*PieceHeader)(nil), "piecestore.PieceHeader") } func init() { proto.RegisterFile("piecestore2.proto", fileDescriptor_23ff32dd550c2439) } var fileDescriptor_23ff32dd550c2439 = []byte{ - // 513 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xb4, 0x53, 0xcd, 0x6e, 0x13, 0x3d, - 0x14, 0xcd, 0xe4, 0x67, 0xd4, 0xef, 0x7e, 0x09, 0xa2, 0x86, 0xa2, 0x60, 0x09, 0x52, 0x86, 0xbf, - 0xac, 0xa6, 0x28, 0xdd, 0xa1, 0x52, 0xa4, 0x92, 0x05, 0x08, 0x10, 0x95, 0xa1, 0x1b, 0x36, 0x95, - 0x93, 0xb9, 0x49, 0x2d, 0x26, 0xe3, 0x61, 0xec, 0x08, 0xa9, 0xaf, 0xc0, 0x86, 0x47, 0x62, 0xc9, - 0x53, 0xc0, 0x82, 0xc7, 0x60, 0x83, 0xc6, 0x3f, 0x2d, 0x93, 0x86, 0x44, 0x20, 0xb1, 0x9a, 0xb1, - 0xef, 0x39, 0xf7, 0x1e, 0x9f, 0x63, 0xc3, 0x66, 0x2e, 0x70, 0x8c, 0x4a, 0xcb, 0x02, 0x07, 0x71, - 0x5e, 0x48, 0x2d, 0x09, 0x9c, 0x6f, 0x51, 0x98, 0xca, 0xa9, 0xb4, 0xfb, 0xb4, 0x37, 0x95, 0x72, - 0x9a, 0xe2, 0x8e, 0x59, 0x8d, 0xe6, 0x93, 
0x1d, 0x2d, 0x66, 0xa8, 0x34, 0x9f, 0xe5, 0x0e, 0xd0, - 0x96, 0x45, 0x82, 0x85, 0xb2, 0xab, 0xe8, 0x47, 0x00, 0xe4, 0xb0, 0xec, 0x74, 0x94, 0xa7, 0x92, - 0x27, 0x0c, 0xdf, 0xcf, 0x51, 0x69, 0xd2, 0x87, 0x56, 0x2a, 0x66, 0x42, 0x77, 0x83, 0xed, 0xa0, - 0xff, 0xff, 0x80, 0xc4, 0x8e, 0xf4, 0xaa, 0xfc, 0xbc, 0x28, 0x2b, 0xcc, 0x02, 0xc8, 0x6d, 0x68, - 0x99, 0x5a, 0xb7, 0x6e, 0x90, 0x9d, 0x0a, 0x92, 0xd9, 0x1a, 0x79, 0x08, 0xad, 0xf1, 0xc9, 0x3c, - 0x7b, 0xd7, 0x6d, 0x18, 0xd0, 0x9d, 0xf8, 0x5c, 0x7c, 0x7c, 0x71, 0x7a, 0xfc, 0xa4, 0xc4, 0x32, - 0x4b, 0x21, 0x77, 0xa1, 0x99, 0xc8, 0x0c, 0xbb, 0x4d, 0x43, 0xdd, 0xf4, 0xfd, 0x0d, 0xed, 0x29, - 0x57, 0x27, 0xcc, 0x94, 0xe9, 0x2e, 0xb4, 0x0c, 0x8d, 0x5c, 0x83, 0x50, 0x4e, 0x26, 0x0a, 0xad, - 0xf6, 0x06, 0x73, 0x2b, 0x42, 0xa0, 0x99, 0x70, 0xcd, 0x8d, 0xce, 0x36, 0x33, 0xff, 0xd1, 0x1e, - 0x5c, 0xa9, 0x8c, 0x57, 0xb9, 0xcc, 0x14, 0x9e, 0x8d, 0x0c, 0x56, 0x8e, 0x8c, 0xbe, 0x07, 0x70, - 0xd5, 0xec, 0x0d, 0xe5, 0x87, 0xec, 0x1f, 0xba, 0xb7, 0x57, 0x75, 0xef, 0xde, 0x05, 0xf7, 0x16, - 0xe6, 0x57, 0xfc, 0xa3, 0xfb, 0xeb, 0x8c, 0xb9, 0x01, 0x60, 0x90, 0xc7, 0x4a, 0x9c, 0xa2, 0x11, - 0xd2, 0x60, 0xff, 0x99, 0x9d, 0xd7, 0xe2, 0x14, 0xa3, 0x8f, 0x01, 0x6c, 0x2d, 0x4c, 0x71, 0x36, - 0x3d, 0xf2, 0xba, 0xec, 0x31, 0xef, 0xaf, 0xd0, 0x65, 0x19, 0x55, 0x61, 0x7f, 0x95, 0xd8, 0xbe, - 0xbb, 0xae, 0x43, 0x4c, 0x51, 0xe3, 0x1f, 0x1b, 0x1e, 0x6d, 0xb9, 0xc4, 0x3d, 0xdf, 0x0a, 0x8b, - 0x0a, 0xe8, 0x30, 0xd4, 0x5c, 0x64, 0xbe, 0xe3, 0x33, 0xe8, 0x8c, 0x0b, 0xe4, 0x5a, 0xc8, 0xec, - 0x38, 0xe1, 0xda, 0xdf, 0x05, 0x1a, 0xdb, 0xe7, 0x15, 0xfb, 0xe7, 0x15, 0xbf, 0xf1, 0xcf, 0xeb, - 0x60, 0xe3, 0xcb, 0xd7, 0x5e, 0xed, 0xd3, 0xb7, 0x5e, 0xc0, 0xda, 0x9e, 0x3a, 0xe4, 0x1a, 0xcb, - 0xe3, 0x4d, 0x44, 0xaa, 0x5d, 0xc8, 0x6d, 0xe6, 0x56, 0xd1, 0x65, 0xb8, 0xe4, 0x67, 0x5a, 0x15, - 0x83, 0xcf, 0x75, 0x80, 0xc3, 0x33, 0x0f, 0xc9, 0x4b, 0x08, 0xed, 0xc5, 0x24, 0x37, 0x57, 0x3f, - 0x18, 0xda, 0xfb, 0x6d, 0xdd, 0x9d, 0xaf, 0xd6, 0x0f, 0xc8, 0x11, 0x6c, 0xf8, 0x40, 0xc8, 0xf6, - 0xba, 0x3b, 0x44, 0x6f, 0xad, 0x4d, 0xb3, 0x6c, 0xfa, 0x20, 0x20, 0xcf, 0x21, 0xb4, 0x66, 0x2e, - 0x51, 0x59, 0x49, 0x69, 0x89, 0xca, 0x85, 0x14, 0x6a, 0xe4, 0x31, 0x84, 0xd6, 0x13, 0x72, 0xfd, - 0x57, 0x70, 0x25, 0x1b, 0x4a, 0x97, 0x95, 0x6c, 0x8b, 0x83, 0xe6, 0xdb, 0x7a, 0x3e, 0x1a, 0x85, - 0x26, 0x9e, 0xdd, 0x9f, 0x01, 0x00, 0x00, 0xff, 0xff, 0x1c, 0xd5, 0x7e, 0x72, 0x38, 0x05, 0x00, - 0x00, + // 634 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xb4, 0x54, 0x4f, 0x6f, 0x12, 0x4f, + 0x18, 0x66, 0x29, 0x90, 0xf2, 0xc2, 0x12, 0x3a, 0xbf, 0x5f, 0x0d, 0x6e, 0x54, 0xea, 0x6a, 0x95, + 0x8b, 0xdb, 0x4a, 0x4f, 0x9a, 0x5a, 0x23, 0x92, 0x46, 0x63, 0x9b, 0x36, 0x63, 0xdb, 0x83, 0x17, + 0x32, 0xc0, 0x2c, 0x6c, 0x84, 0x9d, 0x75, 0x67, 0xd0, 0xa4, 0x5f, 0xc1, 0x8b, 0x1f, 0xc9, 0x78, + 0xf2, 0x53, 0xe8, 0xc1, 0x8f, 0xe1, 0xc5, 0xcc, 0xcc, 0x2e, 0xb0, 0x40, 0x21, 0x35, 0xf1, 0x04, + 0xef, 0xff, 0x67, 0x9e, 0xf7, 0x79, 0x17, 0x36, 0x02, 0x8f, 0x76, 0x28, 0x17, 0x2c, 0xa4, 0x75, + 0x27, 0x08, 0x99, 0x60, 0x08, 0x26, 0x2e, 0x0b, 0x7a, 0xac, 0xc7, 0xb4, 0xdf, 0xaa, 0xf6, 0x18, + 0xeb, 0x0d, 0xe8, 0x8e, 0xb2, 0xda, 0x23, 0x77, 0x47, 0x78, 0x43, 0xca, 0x05, 0x19, 0x06, 0x51, + 0x42, 0x91, 0x85, 0x5d, 0x1a, 0x72, 0x6d, 0xd9, 0xbf, 0x0d, 0x40, 0xa7, 0xb2, 0xd3, 0x79, 0x30, + 0x60, 0xa4, 0x8b, 0xe9, 0x87, 0x11, 0xe5, 0x02, 0xd5, 0x20, 0x3b, 0xf0, 0x86, 0x9e, 0xa8, 0x18, + 0x5b, 0x46, 0xad, 0x50, 0x47, 0x4e, 0x54, 0x74, 0x22, 0x7f, 0x8e, 0x64, 0x04, 0xeb, 0x04, 0x74, + 0x0f, 0xb2, 
0x2a, 0x56, 0x49, 0xab, 0x4c, 0x33, 0x91, 0x89, 0x75, 0x0c, 0x3d, 0x85, 0x6c, 0xa7, + 0x3f, 0xf2, 0xdf, 0x57, 0xd6, 0x54, 0xd2, 0x7d, 0x67, 0x02, 0xde, 0x99, 0x9f, 0xee, 0xbc, 0x94, + 0xb9, 0x58, 0x97, 0xa0, 0x6d, 0xc8, 0x74, 0x99, 0x4f, 0x2b, 0x19, 0x55, 0xba, 0x11, 0xf7, 0x57, + 0x65, 0xaf, 0x08, 0xef, 0x63, 0x15, 0xb6, 0xf6, 0x20, 0xab, 0xca, 0xd0, 0x0d, 0xc8, 0x31, 0xd7, + 0xe5, 0x54, 0x63, 0x5f, 0xc3, 0x91, 0x85, 0x10, 0x64, 0xba, 0x44, 0x10, 0x85, 0xb3, 0x88, 0xd5, + 0x7f, 0x7b, 0x1f, 0xfe, 0x4b, 0x8c, 0xe7, 0x01, 0xf3, 0x39, 0x1d, 0x8f, 0x34, 0x96, 0x8e, 0xb4, + 0x7f, 0x19, 0xf0, 0xbf, 0xf2, 0x35, 0xd9, 0x27, 0xff, 0x1f, 0xb2, 0xb7, 0x9f, 0x64, 0xef, 0xc1, + 0x1c, 0x7b, 0x33, 0xf3, 0x13, 0xfc, 0x59, 0x07, 0xab, 0x88, 0xb9, 0x0d, 0xa0, 0x32, 0x5b, 0xdc, + 0xbb, 0xa4, 0x0a, 0xc8, 0x1a, 0xce, 0x2b, 0xcf, 0x5b, 0xef, 0x92, 0xda, 0x9f, 0x0d, 0xd8, 0x9c, + 0x99, 0x12, 0xd1, 0xf4, 0x2c, 0xc6, 0xa5, 0x9f, 0xf9, 0x70, 0x09, 0x2e, 0x5d, 0x91, 0x04, 0xf6, + 0x57, 0x1b, 0x3b, 0x88, 0xe4, 0xda, 0xa4, 0x03, 0x2a, 0xe8, 0xb5, 0x09, 0xb7, 0x37, 0xa3, 0x8d, + 0xc7, 0xf5, 0x1a, 0x98, 0x1d, 0x82, 0x89, 0xa9, 0x20, 0x9e, 0x1f, 0x77, 0x7c, 0x0d, 0x66, 0x27, + 0xa4, 0x44, 0x78, 0xcc, 0x6f, 0x75, 0x89, 0x88, 0xb5, 0x60, 0x39, 0xfa, 0xbc, 0x9c, 0xf8, 0xbc, + 0x9c, 0xb3, 0xf8, 0xbc, 0x1a, 0xeb, 0xdf, 0x7f, 0x54, 0x53, 0x5f, 0x7e, 0x56, 0x0d, 0x5c, 0x8c, + 0x4b, 0x9b, 0x44, 0x50, 0xf9, 0x3c, 0xd7, 0x1b, 0x88, 0x68, 0xc9, 0x45, 0x1c, 0x59, 0x76, 0x19, + 0x4a, 0xf1, 0xcc, 0x08, 0xc5, 0xb7, 0x34, 0x14, 0xb4, 0xc8, 0x28, 0x91, 0x8b, 0x3f, 0x82, 0x92, + 0xcb, 0xc2, 0x21, 0x11, 0xad, 0x8f, 0x34, 0xe4, 0x1e, 0xf3, 0x15, 0x8a, 0x52, 0x7d, 0x7b, 0x8e, + 0x69, 0x5d, 0xe0, 0x1c, 0xaa, 0xec, 0x0b, 0x9d, 0x8c, 0x4d, 0x77, 0xda, 0x94, 0x74, 0xf6, 0x09, + 0xef, 0xc7, 0x74, 0xca, 0xff, 0x89, 0x67, 0xca, 0x0f, 0x45, 0x24, 0xb1, 0x6b, 0x3e, 0x53, 0x06, + 0xd1, 0x2d, 0xc8, 0x73, 0xaf, 0xe7, 0x13, 0x31, 0x0a, 0xf5, 0xb1, 0x16, 0xf1, 0xc4, 0x81, 0x9e, + 0x40, 0x41, 0xed, 0xa4, 0xa5, 0xf7, 0x94, 0xbd, 0x6a, 0x4f, 0x8d, 0x8c, 0x6c, 0x8f, 0x81, 0x8d, + 0x3d, 0xf6, 0x23, 0x30, 0x13, 0xef, 0x42, 0x26, 0xe4, 0x0f, 0x4f, 0xf0, 0xf1, 0x8b, 0xb3, 0xd6, + 0xc5, 0x6e, 0x39, 0x35, 0x6d, 0x3e, 0x2e, 0x1b, 0xf5, 0xaf, 0x69, 0x80, 0xd3, 0x31, 0x3d, 0xe8, + 0x18, 0x72, 0xfa, 0xba, 0xd1, 0x9d, 0xe5, 0x5f, 0x1d, 0xab, 0x7a, 0x65, 0x3c, 0x5a, 0x4f, 0xaa, + 0x66, 0xa0, 0x73, 0x58, 0x8f, 0x55, 0x8d, 0xb6, 0x56, 0x1d, 0xa2, 0x75, 0x77, 0xe5, 0x49, 0xc8, + 0xa6, 0xbb, 0x06, 0x7a, 0x03, 0x39, 0xad, 0xc8, 0x05, 0x28, 0x13, 0x52, 0x5f, 0x80, 0x72, 0x46, + 0xca, 0x29, 0xf4, 0x1c, 0x72, 0x5a, 0x58, 0xe8, 0xe6, 0x74, 0x72, 0x42, 0xe0, 0x96, 0xb5, 0x28, + 0xa4, 0x5b, 0x34, 0x32, 0xef, 0xd2, 0x41, 0xbb, 0x9d, 0x53, 0xcb, 0xdf, 0xfb, 0x13, 0x00, 0x00, + 0xff, 0xff, 0x64, 0x65, 0x86, 0x27, 0x7d, 0x06, 0x00, 0x00, } // Reference imports to suppress errors if they are not otherwise used. diff --git a/pkg/pb/piecestore2.proto b/pkg/pb/piecestore2.proto index 22bf6d22b..01a05b454 100644 --- a/pkg/pb/piecestore2.proto +++ b/pkg/pb/piecestore2.proto @@ -90,3 +90,25 @@ message RetainRequest { message RetainResponse { } + +// PieceHeader is used in piece storage to keep track of piece attributes. +message PieceHeader { + enum FormatVersion { + FORMAT_V0 = 0; + FORMAT_V1 = 1; + } + // the storage format version being used for this piece. The piece filename should agree with this. + // The inclusion of this field is intended to aid repairability when filenames are damaged. 
+ FormatVersion format_version = 1; + // content hash of the piece + bytes hash = 2; + // timestamp when upload occurred, as given by the "timestamp" field in the original orders.PieceHash + google.protobuf.Timestamp creation_time = 3 [(gogoproto.stdtime) = true, (gogoproto.nullable) = false]; + // signature from uplink over the original orders.PieceHash (the corresponding PieceHashSigning + // is reconstructable using the piece id from the piecestore, the piece size from the + // filesystem (minus the piece header size), and these (hash, upload_time, signature) fields). + bytes signature = 4; + // the OrderLimit authorizing storage of this piece, as signed by the satellite and sent by + // the uplink + orders.OrderLimit order_limit = 5 [(gogoproto.nullable) = false]; +} \ No newline at end of file diff --git a/proto.lock b/proto.lock index 33f6d9d1e..7c95843d2 100644 --- a/proto.lock +++ b/proto.lock @@ -4941,6 +4941,20 @@ { "protopath": "pkg:/:pb:/:piecestore2.proto", "def": { + "enums": [ + { + "name": "PieceHeader.FormatVersion", + "enum_fields": [ + { + "name": "FORMAT_V0" + }, + { + "name": "FORMAT_V1", + "integer": 1 + } + ] + } + ], "messages": [ { "name": "PieceUploadRequest", @@ -5098,6 +5112,52 @@ }, { "name": "RetainResponse" + }, + { + "name": "PieceHeader", + "fields": [ + { + "id": 1, + "name": "format_version", + "type": "FormatVersion" + }, + { + "id": 2, + "name": "hash", + "type": "bytes" + }, + { + "id": 3, + "name": "creation_time", + "type": "google.protobuf.Timestamp", + "options": [ + { + "name": "(gogoproto.stdtime)", + "value": "true" + }, + { + "name": "(gogoproto.nullable)", + "value": "false" + } + ] + }, + { + "id": 4, + "name": "signature", + "type": "bytes" + }, + { + "id": 5, + "name": "order_limit", + "type": "orders.OrderLimit", + "options": [ + { + "name": "(gogoproto.nullable)", + "value": "false" + } + ] + } + ] } ], "services": [ diff --git a/satellite/gc/gc_test.go b/satellite/gc/gc_test.go index 7f3e6631f..35f1b7029 100644 --- a/satellite/gc/gc_test.go +++ b/satellite/gc/gc_test.go @@ -19,6 +19,7 @@ import ( "storj.io/storj/pkg/pb" "storj.io/storj/pkg/storj" "storj.io/storj/satellite" + "storj.io/storj/storage" ) // TestGarbageCollection does the following: @@ -76,9 +77,12 @@ func TestGarbageCollection(t *testing.T) { require.NoError(t, err) // Check that piece of the deleted object is on the storagenode - pieceInfo, err := targetNode.DB.PieceInfo().Get(ctx, satellite.ID(), deletedPieceID) + pieceAccess, err := targetNode.DB.Pieces().Stat(ctx, storage.BlobRef{ + Namespace: satellite.ID().Bytes(), + Key: deletedPieceID.Bytes(), + }) require.NoError(t, err) - require.NotNil(t, pieceInfo) + require.NotNil(t, pieceAccess) // The pieceInfo.GetPieceIDs query converts piece creation and the filter creation timestamps // to datetime in sql. This chops off all precision beyond seconds. 
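
For reference, a minimal sketch (illustrative only, not code from this diff) of how the PieceHeader message added above might be populated and serialized with gogo/protobuf; the hash, signature, and order-limit values are placeholders standing in for data from a real upload:

package example

import (
	"time"

	"github.com/gogo/protobuf/proto"

	"storj.io/storj/pkg/pb"
)

// buildHeader serializes a V1 piece header; the hash, signature, and order
// limit here are placeholders for values taken from the upload request.
func buildHeader() ([]byte, error) {
	header := &pb.PieceHeader{
		FormatVersion: pb.PieceHeader_FORMAT_V1,
		Hash:          []byte("content-hash-of-piece"),
		CreationTime:  time.Now(),
		Signature:     []byte("uplink-signature-over-piece-hash"),
		OrderLimit:    pb.OrderLimit{},
	}
	return proto.Marshal(header)
}
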
@@ -91,14 +95,20 @@ func TestGarbageCollection(t *testing.T) { gcService.Loop.TriggerWait() // Check that piece of the deleted object is not on the storagenode - pieceInfo, err = targetNode.DB.PieceInfo().Get(ctx, satellite.ID(), deletedPieceID) + pieceAccess, err = targetNode.DB.Pieces().Stat(ctx, storage.BlobRef{ + Namespace: satellite.ID().Bytes(), + Key: deletedPieceID.Bytes(), + }) require.Error(t, err) - require.Nil(t, pieceInfo) + require.Nil(t, pieceAccess) // Check that piece of the kept object is on the storagenode - pieceInfo, err = targetNode.DB.PieceInfo().Get(ctx, satellite.ID(), keptPieceID) + pieceAccess, err = targetNode.DB.Pieces().Stat(ctx, storage.BlobRef{ + Namespace: satellite.ID().Bytes(), + Key: keptPieceID.Bytes(), + }) require.NoError(t, err) - require.NotNil(t, pieceInfo) + require.NotNil(t, pieceAccess) }) } diff --git a/storage/blob.go b/storage/blob.go index d925733c8..7a4f8bc47 100644 --- a/storage/blob.go +++ b/storage/blob.go @@ -6,6 +6,7 @@ package storage import ( "context" "io" + "os" "github.com/zeebo/errs" ) @@ -13,6 +14,16 @@ import ( // ErrInvalidBlobRef is returned when an blob reference is invalid var ErrInvalidBlobRef = errs.Class("invalid blob ref") +// FormatVersion represents differing storage format version values. Different Blobs implementors +// might interpret different FormatVersion values differently, but they share a type so that there +// can be a common StorageFormatVersion() call on the interface. +// +// Changes in FormatVersion might affect how a Blobs or BlobReader or BlobWriter instance works, or +// they might only be relevant to some higher layer. A FormatVersion must be specified when writing +// a new blob, and the blob storage interface must store that value with the blob somehow, so that +// the same FormatVersion is returned later when reading that stored blob. +type FormatVersion int + // BlobRef is a reference to a blob type BlobRef struct { Namespace []byte @@ -32,17 +43,23 @@ type BlobReader interface { io.Closer // Size returns the size of the blob Size() (int64, error) + // StorageFormatVersion returns the storage format version associated with the blob. + StorageFormatVersion() FormatVersion } -// BlobWriter is an interface that groups Read, ReadAt, Seek and Close. +// BlobWriter defines the interface that must be satisfied for a general blob storage provider. +// BlobWriter instances are returned by the Create() method on Blobs instances. type BlobWriter interface { io.Writer + io.Seeker // Cancel discards the blob. Cancel(context.Context) error // Commit ensures that the blob is readable by others. Commit(context.Context) error // Size returns the size of the blob Size() (int64, error) + // StorageFormatVersion returns the storage format version associated with the blob. + StorageFormatVersion() FormatVersion } // Blobs is a blob storage interface @@ -52,8 +69,41 @@ type Blobs interface { Create(ctx context.Context, ref BlobRef, size int64) (BlobWriter, error) // Open opens a reader with the specified namespace and key Open(ctx context.Context, ref BlobRef) (BlobReader, error) + // OpenWithStorageFormat opens a reader for the already-located blob, avoiding the potential + // need to check multiple storage formats to find the blob. 
+ OpenWithStorageFormat(ctx context.Context, ref BlobRef, formatVer FormatVersion) (BlobReader, error) // Delete deletes the blob with the namespace and key Delete(ctx context.Context, ref BlobRef) error + // Stat looks up disk metadata on the blob file + Stat(ctx context.Context, ref BlobRef) (BlobInfo, error) + // StatWithStorageFormat looks up disk metadata for the blob file with the given storage format + // version. This avoids the potential need to check multiple storage formats for the blob + // when the format is already known. + StatWithStorageFormat(ctx context.Context, ref BlobRef, formatVer FormatVersion) (BlobInfo, error) // FreeSpace return how much free space left for writing FreeSpace() (int64, error) + // SpaceUsed adds up how much is used in all namespaces + SpaceUsed(ctx context.Context) (int64, error) + // SpaceUsedInNamespace adds up how much is used in the given namespace + SpaceUsedInNamespace(ctx context.Context, namespace []byte) (int64, error) + // ListNamespaces finds all namespaces in which keys might currently be stored. + ListNamespaces(ctx context.Context) ([][]byte, error) + // WalkNamespace executes walkFunc for each locally stored blob, stored with + // storage format V1 or greater, in the given namespace. If walkFunc returns a non-nil + // error, WalkNamespace will stop iterating and return the error immediately. The ctx + // parameter is intended to allow canceling iteration early. + WalkNamespace(ctx context.Context, namespace []byte, walkFunc func(BlobInfo) error) error +} + +// BlobInfo allows lazy inspection of a blob and its underlying file during iteration with +// WalkNamespace-type methods +type BlobInfo interface { + // BlobRef returns the relevant BlobRef for the blob + BlobRef() BlobRef + // StorageFormatVersion indicates the storage format version used to store the piece + StorageFormatVersion() FormatVersion + // FullPath gives the full path to the on-disk blob file + FullPath(ctx context.Context) (string, error) + // Stat does a stat on the on-disk blob file + Stat(ctx context.Context) (os.FileInfo, error) } diff --git a/storage/filestore/blob.go b/storage/filestore/blob.go index 80fd7c371..428c76b01 100644 --- a/storage/filestore/blob.go +++ b/storage/filestore/blob.go @@ -13,13 +13,39 @@ import ( "storj.io/storj/storage" ) +const ( + // FormatV0 is the identifier for storage format v0, which also corresponds to an absence of + // format version information. + FormatV0 storage.FormatVersion = 0 + // FormatV1 is the identifier for storage format v1 + FormatV1 storage.FormatVersion = 1 + + // Note: New FormatVersion values should be consecutive, as certain parts of this blob store + // iterate over them numerically and check for blobs stored with each version. +) + +const ( + // MaxFormatVersionSupported is the highest supported storage format version for reading, and + // the only supported storage format version for writing. If stored blobs claim a higher + // storage format version than this, or a caller requests _writing_ a storage format version + // which is not this, this software will not know how to perform the read or write and an error + // will be returned. + MaxFormatVersionSupported = FormatV1 + + // MinFormatVersionSupported is the lowest supported storage format version for reading. If + // stored blobs claim a lower storage format version than this, this software will not know how + // to perform the read and an error will be returned. 
+ MinFormatVersionSupported = FormatV0 +) + // blobReader implements reading blobs type blobReader struct { *os.File + formatVersion storage.FormatVersion } -func newBlobReader(file *os.File) *blobReader { - return &blobReader{file} +func newBlobReader(file *os.File, formatVersion storage.FormatVersion) *blobReader { + return &blobReader{file, formatVersion} } // Size returns how large is the blob. @@ -31,17 +57,29 @@ func (blob *blobReader) Size() (int64, error) { return stat.Size(), err } +// StorageFormatVersion gets the storage format version being used by the blob. +func (blob *blobReader) StorageFormatVersion() storage.FormatVersion { + return blob.formatVersion +} + // blobWriter implements writing blobs type blobWriter struct { - ref storage.BlobRef - store *Store - closed bool + ref storage.BlobRef + store *Store + closed bool + formatVersion storage.FormatVersion *os.File } -func newBlobWriter(ref storage.BlobRef, store *Store, file *os.File) *blobWriter { - return &blobWriter{ref, store, false, file} +func newBlobWriter(ref storage.BlobRef, store *Store, formatVersion storage.FormatVersion, file *os.File) *blobWriter { + return &blobWriter{ + ref: ref, + store: store, + closed: false, + formatVersion: formatVersion, + File: file, + } } // Cancel discards the blob. @@ -63,7 +101,7 @@ func (blob *blobWriter) Commit(ctx context.Context) (err error) { return Error.New("already closed") } blob.closed = true - err = blob.store.dir.Commit(ctx, blob.File, blob.ref) + err = blob.store.dir.Commit(ctx, blob.File, blob.ref, blob.formatVersion) return Error.Wrap(err) } @@ -75,3 +113,8 @@ func (blob *blobWriter) Size() (int64, error) { } return pos, err } + +// StorageFormatVersion indicates what storage format version the blob is using. +func (blob *blobWriter) StorageFormatVersion() storage.FormatVersion { + return blob.formatVersion +} diff --git a/storage/filestore/dir.go b/storage/filestore/dir.go index 210db68cf..b90e10256 100644 --- a/storage/filestore/dir.go +++ b/storage/filestore/dir.go @@ -11,6 +11,7 @@ import ( "math" "os" "path/filepath" + "strings" "sync" "github.com/zeebo/errs" @@ -21,6 +22,10 @@ import ( const ( blobPermission = 0600 dirPermission = 0700 + + v0PieceFileSuffix = "" + v1PieceFileSuffix = ".sj1" + unknownPieceFileSuffix = "/..error_unknown_format../" ) var pathEncoding = base32.NewEncoding("abcdefghijklmnopqrstuvwxyz234567").WithPadding(base32.NoPadding) @@ -81,8 +86,11 @@ func (dir *Dir) DeleteTemporary(ctx context.Context, file *os.File) (err error) return errs.Combine(closeErr, os.Remove(file.Name())) } -// blobToPath converts blob reference to a filepath in permanent storage -func (dir *Dir) blobToPath(ref storage.BlobRef) (string, error) { +// blobToBasePath converts a blob reference to a filepath in permanent storage. This may not be the +// entire path; blobPathForFormatVersion() must also be used. This is a separate call because this +// part of the filepath is constant, and blobPathForFormatVersion may need to be called multiple +// times with different storage.FormatVersion values. 
+func (dir *Dir) blobToBasePath(ref storage.BlobRef) (string, error) { if !ref.IsValid() { return "", storage.ErrInvalidBlobRef.New("") } @@ -90,14 +98,27 @@ func (dir *Dir) blobToPath(ref storage.BlobRef) (string, error) { namespace := pathEncoding.EncodeToString(ref.Namespace) key := pathEncoding.EncodeToString(ref.Key) if len(key) < 3 { - // ensure we always have at least + // ensure we always have enough characters to split [:2] and [2:] key = "11" + key } return filepath.Join(dir.blobsdir(), namespace, key[:2], key[2:]), nil } -// blobToTrashPath converts blob reference to a filepath in transient storage -// the files in trash are deleted in an interval (in case the initial deletion didn't work for some reason) +// blobPathForFormatVersion adjusts a bare blob path (as might have been generated by a call to +// blobToBasePath()) to what it should be for the given storage format version. +func blobPathForFormatVersion(path string, formatVersion storage.FormatVersion) string { + switch formatVersion { + case FormatV0: + return path + v0PieceFileSuffix + case FormatV1: + return path + v1PieceFileSuffix + } + return path + unknownPieceFileSuffix +} + +// blobToTrashPath converts a blob reference to a filepath in transient storage. +// The files in trash are deleted on an interval (in case the initial deletion didn't work for +// some reason). func (dir *Dir) blobToTrashPath(ref storage.BlobRef) string { var name []byte name = append(name, ref.Namespace...) @@ -105,8 +126,8 @@ func (dir *Dir) blobToTrashPath(ref storage.BlobRef) string { return filepath.Join(dir.garbagedir(), pathEncoding.EncodeToString(name)) } -// Commit commits temporary file to the permanent storage -func (dir *Dir) Commit(ctx context.Context, file *os.File, ref storage.BlobRef) (err error) { +// Commit commits the temporary file to permanent storage. +func (dir *Dir) Commit(ctx context.Context, file *os.File, ref storage.BlobRef, formatVersion storage.FormatVersion) (err error) { defer mon.Task()(&ctx)(&err) position, seekErr := file.Seek(0, io.SeekCurrent) truncErr := file.Truncate(position) @@ -119,11 +140,12 @@ func (dir *Dir) Commit(ctx context.Context, file *os.File, ref storage.BlobRef) return errs.Combine(seekErr, truncErr, syncErr, chmodErr, closeErr, removeErr) } - path, err := dir.blobToPath(ref) + path, err := dir.blobToBasePath(ref) if err != nil { removeErr := os.Remove(file.Name()) return errs.Combine(err, removeErr) } + path = blobPathForFormatVersion(path, formatVersion) mkdirErr := os.MkdirAll(filepath.Dir(path), dirPermission) if os.IsExist(mkdirErr) { @@ -144,69 +166,158 @@ func (dir *Dir) Commit(ctx context.Context, file *os.File, ref storage.BlobRef) return nil } -// Open opens the file with the specified ref -func (dir *Dir) Open(ctx context.Context, ref storage.BlobRef) (_ *os.File, err error) { +// Open opens the file with the specified ref. It may need to check in more than one location in +// order to find the blob, if it was stored with an older version of the storage node software. +// In cases where the storage format version of a blob is already known, OpenWithStorageFormat() +// will generally be a better choice. 
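
To make the suffix scheme concrete, a small in-package test along these lines (hypothetical, not part of this change) would pin down how blobPathForFormatVersion maps versions to file names:

package filestore

import "testing"

// TestBlobPathForFormatVersion checks the suffix mapping: V0 keeps the bare
// base path, V1 appends ".sj1".
func TestBlobPathForFormatVersion(t *testing.T) {
	base := "/storage/blobs/ns/ab/cdefg"

	if got := blobPathForFormatVersion(base, FormatV0); got != base {
		t.Fatalf("V0 path: expected %q, got %q", base, got)
	}
	if got := blobPathForFormatVersion(base, FormatV1); got != base+".sj1" {
		t.Fatalf("V1 path: expected %q, got %q", base+".sj1", got)
	}
}
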
+func (dir *Dir) Open(ctx context.Context, ref storage.BlobRef) (_ *os.File, _ storage.FormatVersion, err error) { defer mon.Task()(&ctx)(&err) - path, err := dir.blobToPath(ref) + path, err := dir.blobToBasePath(ref) + if err != nil { + return nil, FormatV0, err + } + for formatVer := MaxFormatVersionSupported; formatVer >= MinFormatVersionSupported; formatVer-- { + vPath := blobPathForFormatVersion(path, formatVer) + file, err := openFileReadOnly(vPath, blobPermission) + if err == nil { + return file, formatVer, nil + } + if !os.IsNotExist(err) { + return nil, FormatV0, Error.New("unable to open %q: %v", vPath, err) + } + } + return nil, FormatV0, os.ErrNotExist +} + +// OpenWithStorageFormat opens an already-located blob file with a known storage format version, +// which avoids the potential need to search through multiple storage formats to find the blob. +func (dir *Dir) OpenWithStorageFormat(ctx context.Context, blobRef storage.BlobRef, formatVer storage.FormatVersion) (_ *os.File, err error) { + defer mon.Task()(&ctx)(&err) + path, err := dir.blobToBasePath(blobRef) if err != nil { return nil, err } - file, err := openFileReadOnly(path, blobPermission) - if err != nil { - if os.IsNotExist(err) { - return nil, err - } - return nil, Error.New("unable to open %q: %v", path, err) + vPath := blobPathForFormatVersion(path, formatVer) + file, err := openFileReadOnly(vPath, blobPermission) + if err == nil { + return file, nil } - return file, nil + if os.IsNotExist(err) { + return nil, err + } + return nil, Error.New("unable to open %q: %v", vPath, err) } -// Delete deletes file with the specified ref +// Stat looks up disk metadata on the blob file. It may need to check in more than one location +// in order to find the blob, if it was stored with an older version of the storage node software. +// In cases where the storage format version of a blob is already known, StatWithStorageFormat() +// will generally be a better choice. +func (dir *Dir) Stat(ctx context.Context, ref storage.BlobRef) (_ storage.BlobInfo, err error) { + defer mon.Task()(&ctx)(&err) + path, err := dir.blobToBasePath(ref) + if err != nil { + return nil, err + } + for formatVer := MaxFormatVersionSupported; formatVer >= MinFormatVersionSupported; formatVer-- { + vPath := blobPathForFormatVersion(path, formatVer) + stat, err := os.Stat(vPath) + if err == nil { + return newBlobInfo(ref, vPath, stat, formatVer), nil + } + if !os.IsNotExist(err) { + return nil, Error.New("unable to stat %q: %v", vPath, err) + } + } + return nil, os.ErrNotExist +} + +// StatWithStorageFormat looks up disk metadata on the blob file with the given storage format +// version. This avoids the need for checking for the file in multiple different storage format +// types. +func (dir *Dir) StatWithStorageFormat(ctx context.Context, ref storage.BlobRef, formatVer storage.FormatVersion) (_ storage.BlobInfo, err error) { + defer mon.Task()(&ctx)(&err) + path, err := dir.blobToBasePath(ref) + if err != nil { + return nil, err + } + vPath := blobPathForFormatVersion(path, formatVer) + stat, err := os.Stat(vPath) + if err == nil { + return newBlobInfo(ref, vPath, stat, formatVer), nil + } + if os.IsNotExist(err) { + return nil, err + } + return nil, Error.New("unable to stat %q: %v", vPath, err) +} + +// Delete deletes blobs with the specified ref (in all supported storage formats). 
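
The intended call pattern for these lookups, sketched outside this diff (assuming a *Dir and a valid ref are already in hand): perform the format-version probe once via Stat, then reuse the reported version so later opens skip it:

package example

import (
	"context"
	"os"

	"storj.io/storj/storage"
	"storj.io/storj/storage/filestore"
)

// openKnownBlob locates a blob once with Stat (which may probe several format
// versions) and then reopens it without probing, by reusing the version that
// Stat reported.
func openKnownBlob(ctx context.Context, dir *filestore.Dir, ref storage.BlobRef) (*os.File, error) {
	info, err := dir.Stat(ctx, ref)
	if err != nil {
		return nil, err
	}
	return dir.OpenWithStorageFormat(ctx, ref, info.StorageFormatVersion())
}
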
func (dir *Dir) Delete(ctx context.Context, ref storage.BlobRef) (err error) { defer mon.Task()(&ctx)(&err) - path, err := dir.blobToPath(ref) + pathBase, err := dir.blobToBasePath(ref) if err != nil { return err } - trashPath := dir.blobToTrashPath(ref) - // move to trash folder, this is allowed for some OS-es - moveErr := rename(path, trashPath) + var ( + moveErr error + combinedErrors errs.Group + ) - // ignore concurrent delete - if os.IsNotExist(moveErr) { - return nil - } - if moveErr != nil { - trashPath = path + // Try deleting all possible paths, starting with the oldest format version. It is more + // likely, in the general case, that we will find the piece with the newest format version + // instead, but if we iterate backward here then we run the risk of a race condition: the + // piece might have existed with _SomeOldVer before the Delete call, and could then have + // been updated atomically with _MaxVer concurrently while we were iterating. If we iterate + // _forwards_, this race should not occur because it is assumed that pieces are never + // rewritten with an _older_ storage format version. + for i := MinFormatVersionSupported; i <= MaxFormatVersionSupported; i++ { + verPath := blobPathForFormatVersion(pathBase, i) + + // move to trash folder, this is allowed for some OS-es + moveErr = rename(verPath, trashPath) + if os.IsNotExist(moveErr) { + // no piece at that path; either it has a different storage format version or there + // was a concurrent delete. (this function is expected by callers to return a nil + // error in the case of concurrent deletes.) + continue + } + if moveErr != nil { + // piece could not be moved into the trash dir; we'll try removing it directly + trashPath = verPath + } + + // try removing the file + err = os.Remove(trashPath) + + // ignore concurrent deletes + if os.IsNotExist(err) { + // something is happening at the same time as this; possibly a concurrent delete, + // or possibly a rewrite of the blob. keep checking for more versions. + continue + } + + // the remove may have failed because of an open file handle. put it in a queue to be + // retried later. + if err != nil { + dir.mu.Lock() + dir.deleteQueue = append(dir.deleteQueue, trashPath) + dir.mu.Unlock() + } + + // ignore is-busy errors, they are still in the queue + // but no need to notify + if isBusy(err) { + err = nil + } + combinedErrors.Add(err) } - // try removing the file - err = os.Remove(trashPath) - - // ignore concurrent deletes - if os.IsNotExist(err) { - return nil - } - - // this may fail, because someone might be still reading it - if err != nil { - dir.mu.Lock() - dir.deleteQueue = append(dir.deleteQueue, trashPath) - dir.mu.Unlock() - } - - // ignore is busy errors, they are still in the queue - // but no need to notify - if isBusy(err) { - err = nil - } - - return err + return combinedErrors.Err() } -// GarbageCollect collects files that are pending deletion +// GarbageCollect collects files that are pending deletion. func (dir *Dir) GarbageCollect(ctx context.Context) (err error) { defer mon.Task()(&ctx)(&err) offset := int(math.MaxInt32) @@ -238,6 +349,144 @@ func (dir *Dir) GarbageCollect(ctx context.Context) (err error) { return nil } +const nameBatchSize = 1024 + +// ListNamespaces finds all known namespace IDs in use in local storage. They are not +// guaranteed to contain any blobs. 
+func (dir *Dir) ListNamespaces(ctx context.Context) (ids [][]byte, err error) { + defer mon.Task()(&ctx)(&err) + topBlobDir := dir.blobsdir() + openDir, err := os.Open(topBlobDir) + if err != nil { + return nil, err + } + defer func() { err = errs.Combine(err, openDir.Close()) }() + for { + dirNames, err := openDir.Readdirnames(nameBatchSize) + if err != nil && err != io.EOF { + return nil, err + } + if len(dirNames) == 0 { + return ids, nil + } + for _, name := range dirNames { + namespace, err := pathEncoding.DecodeString(name) + if err != nil { + // just an invalid directory entry, and not a namespace. probably + // don't need to pass on this error + continue + } + ids = append(ids, namespace) + } + } +} + +// WalkNamespace executes walkFunc for each locally stored blob, stored with storage format V1 or +// greater, in the given namespace. If walkFunc returns a non-nil error, WalkNamespace will stop +// iterating and return the error immediately. The ctx parameter is intended specifically to allow +// canceling iteration early. +func (dir *Dir) WalkNamespace(ctx context.Context, namespace []byte, walkFunc func(storage.BlobInfo) error) (err error) { + namespaceDir := pathEncoding.EncodeToString(namespace) + nsDir := filepath.Join(dir.blobsdir(), namespaceDir) + openDir, err := os.Open(nsDir) + if err != nil { + if os.IsNotExist(err) { + // job accomplished: there are no blobs in this namespace! + return nil + } + return err + } + defer func() { err = errs.Combine(err, openDir.Close()) }() + for { + // check for context done both before and after our readdir() call + if err := ctx.Err(); err != nil { + return err + } + subdirNames, err := openDir.Readdirnames(nameBatchSize) + if err != nil && err != io.EOF { + return err + } + if os.IsNotExist(err) || len(subdirNames) == 0 { + return nil + } + if err := ctx.Err(); err != nil { + return err + } + for _, keyPrefix := range subdirNames { + if len(keyPrefix) != 2 { + // just an invalid subdir; could be garbage of many kinds. 
probably + // don't need to pass on this error + continue + } + err := dir.walkNamespaceWithPrefix(ctx, namespace, nsDir, keyPrefix, walkFunc) + if err != nil { + return err + } + } + } +} + +func decodeBlobInfo(namespace []byte, keyPrefix, keyDir string, keyInfo os.FileInfo) (info storage.BlobInfo, ok bool) { + blobFileName := keyInfo.Name() + encodedKey := keyPrefix + blobFileName + formatVer := FormatV0 + if strings.HasSuffix(blobFileName, v1PieceFileSuffix) { + formatVer = FormatV1 + encodedKey = encodedKey[0 : len(encodedKey)-len(v1PieceFileSuffix)] + } + key, err := pathEncoding.DecodeString(encodedKey) + if err != nil { + return nil, false + } + ref := storage.BlobRef{ + Namespace: namespace, + Key: key, + } + return newBlobInfo(ref, filepath.Join(keyDir, blobFileName), keyInfo, formatVer), true +} + +func (dir *Dir) walkNamespaceWithPrefix(ctx context.Context, namespace []byte, nsDir, keyPrefix string, walkFunc func(storage.BlobInfo) error) (err error) { + keyDir := filepath.Join(nsDir, keyPrefix) + openDir, err := os.Open(keyDir) + if err != nil { + return err + } + defer func() { err = errs.Combine(err, openDir.Close()) }() + for { + // check for context done both before and after our readdir() call + if err := ctx.Err(); err != nil { + return err + } + keyInfos, err := openDir.Readdir(nameBatchSize) + if err != nil && err != io.EOF { + return err + } + if os.IsNotExist(err) || len(keyInfos) == 0 { + return nil + } + if err := ctx.Err(); err != nil { + return err + } + for _, keyInfo := range keyInfos { + if keyInfo.Mode().IsDir() { + continue + } + info, ok := decodeBlobInfo(namespace, keyPrefix, keyDir, keyInfo) + if !ok { + continue + } + err = walkFunc(info) + if err != nil { + return err + } + // also check for context done between every walkFunc callback. 
+ if err := ctx.Err(); err != nil { + return err + } + } + } +} + // removeAllContent deletes everything in the folder func removeAllContent(ctx context.Context, path string) (err error) { defer mon.Task()(&ctx)(&err) @@ -275,3 +524,35 @@ func (dir *Dir) Info() (DiskInfo, error) { } return diskInfoFromPath(path) } + +type blobInfo struct { + ref storage.BlobRef + path string + fileInfo os.FileInfo + formatVersion storage.FormatVersion +} + +func newBlobInfo(ref storage.BlobRef, path string, fileInfo os.FileInfo, formatVer storage.FormatVersion) storage.BlobInfo { + return &blobInfo{ + ref: ref, + path: path, + fileInfo: fileInfo, + formatVersion: formatVer, + } +} + +func (info *blobInfo) BlobRef() storage.BlobRef { + return info.ref +} + +func (info *blobInfo) StorageFormatVersion() storage.FormatVersion { + return info.formatVersion +} + +func (info *blobInfo) Stat(ctx context.Context) (os.FileInfo, error) { + return info.fileInfo, nil +} + +func (info *blobInfo) FullPath(ctx context.Context) (string, error) { + return info.path, nil +} diff --git a/storage/filestore/store.go b/storage/filestore/store.go index 25c612c49..bab8c5831 100644 --- a/storage/filestore/store.go +++ b/storage/filestore/store.go @@ -8,7 +8,8 @@ import ( "os" "github.com/zeebo/errs" - monkit "gopkg.in/spacemonkeygo/monkit.v2" + "go.uber.org/zap" + "gopkg.in/spacemonkeygo/monkit.v2" "storj.io/storj/storage" ) @@ -25,20 +26,21 @@ var ( // Store implements a blob store type Store struct { dir *Dir + log *zap.Logger } // New creates a new disk blob store in the specified directory -func New(dir *Dir) *Store { - return &Store{dir} +func New(log *zap.Logger, dir *Dir) *Store { + return &Store{dir: dir, log: log} } // NewAt creates a new disk blob store in the specified directory -func NewAt(path string) (*Store, error) { +func NewAt(log *zap.Logger, path string) (*Store, error) { dir, err := NewDir(path) if err != nil { return nil, Error.Wrap(err) } - return &Store{dir}, nil + return &Store{dir: dir, log: log}, nil } // Close closes the store. @@ -47,14 +49,42 @@ func (store *Store) Close() error { return nil } // Open loads blob with the specified hash func (store *Store) Open(ctx context.Context, ref storage.BlobRef) (_ storage.BlobReader, err error) { defer mon.Task()(&ctx)(&err) - file, err := store.dir.Open(ctx, ref) + file, formatVer, err := store.dir.Open(ctx, ref) if err != nil { if os.IsNotExist(err) { return nil, err } return nil, Error.Wrap(err) } - return newBlobReader(file), nil + return newBlobReader(file, formatVer), nil +} + +// OpenWithStorageFormat loads the already-located blob, avoiding the potential need to check multiple +// storage formats to find the blob. 
+func (store *Store) OpenWithStorageFormat(ctx context.Context, blobRef storage.BlobRef, formatVer storage.FormatVersion) (_ storage.BlobReader, err error) { + defer mon.Task()(&ctx)(&err) + file, err := store.dir.OpenWithStorageFormat(ctx, blobRef, formatVer) + if err != nil { + if os.IsNotExist(err) { + return nil, err + } + return nil, Error.Wrap(err) + } + return newBlobReader(file, formatVer), nil +} + +// Stat looks up disk metadata on the blob file +func (store *Store) Stat(ctx context.Context, ref storage.BlobRef) (_ storage.BlobInfo, err error) { + defer mon.Task()(&ctx)(&err) + info, err := store.dir.Stat(ctx, ref) + return info, Error.Wrap(err) +} + +// StatWithStorageFormat looks up disk metadata on the blob file with the given storage format version +func (store *Store) StatWithStorageFormat(ctx context.Context, ref storage.BlobRef, formatVer storage.FormatVersion) (_ storage.BlobInfo, err error) { + defer mon.Task()(&ctx)(&err) + info, err := store.dir.StatWithStorageFormat(ctx, ref, formatVer) + return info, Error.Wrap(err) } // Delete deletes blobs with the specified ref @@ -79,7 +109,45 @@ func (store *Store) Create(ctx context.Context, ref storage.BlobRef, size int64) if err != nil { return nil, Error.Wrap(err) } - return newBlobWriter(ref, store, file), nil + return newBlobWriter(ref, store, MaxFormatVersionSupported, file), nil +} + +// SpaceUsed adds up the space used in all namespaces for blob storage +func (store *Store) SpaceUsed(ctx context.Context) (space int64, err error) { + defer mon.Task()(&ctx)(&err) + + var totalSpaceUsed int64 + namespaces, err := store.ListNamespaces(ctx) + if err != nil { + return 0, Error.New("failed to enumerate namespaces: %v", err) + } + for _, namespace := range namespaces { + used, err := store.SpaceUsedInNamespace(ctx, namespace) + if err != nil { + return 0, Error.New("failed to sum space used: %v", err) + } + totalSpaceUsed += used + } + return totalSpaceUsed, nil +} + +// SpaceUsedInNamespace adds up how much is used in the given namespace for blob storage +func (store *Store) SpaceUsedInNamespace(ctx context.Context, namespace []byte) (int64, error) { + var totalUsed int64 + err := store.WalkNamespace(ctx, namespace, func(info storage.BlobInfo) error { + statInfo, statErr := info.Stat(ctx) + if statErr != nil { + store.log.Error("failed to stat blob", zap.Binary("namespace", namespace), zap.Binary("key", info.BlobRef().Key), zap.Error(statErr)) + // keep iterating; we want a best effort total here. + return nil + } + totalUsed += statInfo.Size() + return nil + }) + if err != nil { + return 0, err + } + return totalUsed, nil } // FreeSpace returns how much space left in underlying directory @@ -90,3 +158,33 @@ func (store *Store) FreeSpace() (int64, error) { } return info.AvailableSpace, nil } + +// ListNamespaces finds all known namespace IDs in use in local storage. They are not +// guaranteed to contain any blobs. +func (store *Store) ListNamespaces(ctx context.Context) (ids [][]byte, err error) { + return store.dir.ListNamespaces(ctx) +} + +// WalkNamespace executes walkFunc for each locally stored blob in the given namespace. If walkFunc +// returns a non-nil error, WalkNamespace will stop iterating and return the error immediately. The +// ctx parameter is intended specifically to allow canceling iteration early. 
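
As a usage illustration (not taken from this diff), ListNamespaces and WalkNamespace compose into a full-store traversal; this mirrors the shape SpaceUsed uses above:

package example

import (
	"context"

	"storj.io/storj/storage"
	"storj.io/storj/storage/filestore"
)

// countBlobs enumerates every namespace in the store and counts the blobs
// that WalkNamespace yields in each one, keyed by the raw namespace bytes.
func countBlobs(ctx context.Context, store *filestore.Store) (map[string]int, error) {
	counts := make(map[string]int)
	namespaces, err := store.ListNamespaces(ctx)
	if err != nil {
		return nil, err
	}
	for _, namespace := range namespaces {
		err := store.WalkNamespace(ctx, namespace, func(info storage.BlobInfo) error {
			counts[string(info.BlobRef().Namespace)]++
			return nil
		})
		if err != nil {
			return nil, err
		}
	}
	return counts, nil
}
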
+func (store *Store) WalkNamespace(ctx context.Context, namespace []byte, walkFunc func(storage.BlobInfo) error) (err error) { + return store.dir.WalkNamespace(ctx, namespace, walkFunc) +} + +// StoreForTest is a wrapper for Store that also allows writing new V0 blobs (in order to test +// situations involving those) +type StoreForTest struct { + *Store +} + +// CreateV0 creates a new V0 blob that can be written. This is only appropriate in test situations. +func (testStore *StoreForTest) CreateV0(ctx context.Context, ref storage.BlobRef) (_ storage.BlobWriter, err error) { + defer mon.Task()(&ctx)(&err) + + file, err := testStore.dir.CreateTemporaryFile(ctx, -1) + if err != nil { + return nil, Error.Wrap(err) + } + return newBlobWriter(ref, testStore.Store, FormatV0, file), nil +} diff --git a/storage/filestore/store_test.go b/storage/filestore/store_test.go index 7ed5579f9..0e40a399d 100644 --- a/storage/filestore/store_test.go +++ b/storage/filestore/store_test.go @@ -4,21 +4,32 @@ package filestore_test import ( - "errors" + "bytes" + "context" "io" "io/ioutil" "os" "path/filepath" + "sort" "testing" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/zeebo/errs" + "go.uber.org/zap/zaptest" + "storj.io/storj/internal/memory" "storj.io/storj/internal/testcontext" "storj.io/storj/internal/testrand" "storj.io/storj/storage" "storj.io/storj/storage/filestore" ) +const ( + namespaceSize = 32 + keySize = 32 +) + func TestStoreLoad(t *testing.T) { const blobSize = 8 << 10 const repeatCount = 16 @@ -26,8 +37,9 @@ func TestStoreLoad(t *testing.T) { ctx := testcontext.New(t) defer ctx.Cleanup() - store, err := filestore.NewAt(ctx.Dir("store")) + store, err := filestore.NewAt(zaptest.NewLogger(t), ctx.Dir("store")) require.NoError(t, err) + ctx.Check(store.Close) data := testrand.Bytes(blobSize) temp := make([]byte, len(data)) @@ -155,8 +167,9 @@ func TestDeleteWhileReading(t *testing.T) { ctx := testcontext.New(t) defer ctx.Cleanup() - store, err := filestore.NewAt(ctx.Dir("store")) + store, err := filestore.NewAt(zaptest.NewLogger(t), ctx.Dir("store")) require.NoError(t, err) + ctx.Check(store.Close) data := testrand.Bytes(blobSize) @@ -213,9 +226,301 @@ func TestDeleteWhileReading(t *testing.T) { if info.IsDir() { return nil } - return errors.New("found file " + path) + return errs.New("found file %q", path) }) if err != nil { t.Fatal(err) } } + +func writeABlob(ctx context.Context, t testing.TB, store *filestore.Store, blobRef storage.BlobRef, data []byte, formatVersion storage.FormatVersion) { + var ( + blobWriter storage.BlobWriter + err error + ) + switch formatVersion { + case filestore.FormatV0: + tStore := &filestore.StoreForTest{store} + blobWriter, err = tStore.CreateV0(ctx, blobRef) + case filestore.FormatV1: + blobWriter, err = store.Create(ctx, blobRef, int64(len(data))) + default: + t.Fatalf("please teach me how to make a V%d blob", formatVersion) + } + require.NoError(t, err) + require.Equal(t, formatVersion, blobWriter.StorageFormatVersion()) + _, err = blobWriter.Write(data) + require.NoError(t, err) + size, err := blobWriter.Size() + require.NoError(t, err) + assert.Equal(t, int64(len(data)), size) + err = blobWriter.Commit(ctx) + require.NoError(t, err) +} + +func verifyBlobHandle(t testing.TB, reader storage.BlobReader, expectDataLen int, expectFormat storage.FormatVersion) { + assert.Equal(t, expectFormat, reader.StorageFormatVersion()) + size, err := reader.Size() + require.NoError(t, err) + assert.Equal(t, int64(expectDataLen), size) 
+} + +func verifyBlobInfo(ctx context.Context, t testing.TB, blobInfo storage.BlobInfo, expectDataLen int, expectFormat storage.FormatVersion) { + assert.Equal(t, expectFormat, blobInfo.StorageFormatVersion()) + stat, err := blobInfo.Stat(ctx) + require.NoError(t, err) + assert.Equal(t, int64(expectDataLen), stat.Size()) +} + +func tryOpeningABlob(ctx context.Context, t testing.TB, store *filestore.Store, blobRef storage.BlobRef, expectDataLen int, expectFormat storage.FormatVersion) { + reader, err := store.Open(ctx, blobRef) + require.NoError(t, err) + verifyBlobHandle(t, reader, expectDataLen, expectFormat) + require.NoError(t, reader.Close()) + + blobInfo, err := store.Stat(ctx, blobRef) + require.NoError(t, err) + verifyBlobInfo(ctx, t, blobInfo, expectDataLen, expectFormat) + + blobInfo, err = store.StatWithStorageFormat(ctx, blobRef, expectFormat) + require.NoError(t, err) + verifyBlobInfo(ctx, t, blobInfo, expectDataLen, expectFormat) + + reader, err = store.OpenWithStorageFormat(ctx, blobInfo.BlobRef(), blobInfo.StorageFormatVersion()) + require.NoError(t, err) + verifyBlobHandle(t, reader, expectDataLen, expectFormat) + require.NoError(t, reader.Close()) +} + +func TestMultipleStorageFormatVersions(t *testing.T) { + ctx := testcontext.New(t) + defer ctx.Cleanup() + + store, err := filestore.NewAt(zaptest.NewLogger(t), ctx.Dir("store")) + require.NoError(t, err) + ctx.Check(store.Close) + + const blobSize = 1024 + + var ( + data = testrand.Bytes(blobSize) + namespace = testrand.Bytes(namespaceSize) + v0BlobKey = testrand.Bytes(keySize) + v1BlobKey = testrand.Bytes(keySize) + + v0Ref = storage.BlobRef{Namespace: namespace, Key: v0BlobKey} + v1Ref = storage.BlobRef{Namespace: namespace, Key: v1BlobKey} + ) + + // write a V0 blob + writeABlob(ctx, t, store, v0Ref, data, filestore.FormatV0) + + // write a V1 blob + writeABlob(ctx, t, store, v1Ref, data, filestore.FormatV1) + + // look up the different blobs with Open and Stat and OpenWithStorageFormat + tryOpeningABlob(ctx, t, store, v0Ref, len(data), filestore.FormatV0) + tryOpeningABlob(ctx, t, store, v1Ref, len(data), filestore.FormatV1) + + // write a V1 blob with the same ID as the V0 blob (to simulate it being rewritten as + // V1 during a migration), with different data so we can distinguish them + differentData := make([]byte, len(data)+2) + copy(differentData, data) + copy(differentData[len(data):], "\xff\x00") + writeABlob(ctx, t, store, v0Ref, differentData, filestore.FormatV1) + + // if we try to access the blob at that key, we should see only the V1 blob + tryOpeningABlob(ctx, t, store, v0Ref, len(differentData), filestore.FormatV1) + + // unless we ask specifically for a V0 blob + blobInfo, err := store.StatWithStorageFormat(ctx, v0Ref, filestore.FormatV0) + require.NoError(t, err) + verifyBlobInfo(ctx, t, blobInfo, len(data), filestore.FormatV0) + reader, err := store.OpenWithStorageFormat(ctx, blobInfo.BlobRef(), blobInfo.StorageFormatVersion()) + require.NoError(t, err) + verifyBlobHandle(t, reader, len(data), filestore.FormatV0) + require.NoError(t, reader.Close()) + + // delete the v0BlobKey; both the V0 and the V1 blobs should go away + err = store.Delete(ctx, v0Ref) + require.NoError(t, err) + + reader, err = store.Open(ctx, v0Ref) + require.Error(t, err) + assert.Nil(t, reader) +} + +// Check that the SpaceUsed and SpaceUsedInNamespace methods on filestore.Store +// work as expected. 
+func TestStoreSpaceUsed(t *testing.T) { + ctx := testcontext.New(t) + defer ctx.Cleanup() + + store, err := filestore.NewAt(zaptest.NewLogger(t), ctx.Dir("store")) + require.NoError(t, err) + ctx.Check(store.Close) + + var ( + namespace = testrand.Bytes(namespaceSize) + otherNamespace = testrand.Bytes(namespaceSize) + sizesToStore = []memory.Size{4093, 0, 512, 1, memory.MB} + ) + + spaceUsed, err := store.SpaceUsed(ctx) + require.NoError(t, err) + assert.Equal(t, int64(0), spaceUsed) + spaceUsed, err = store.SpaceUsedInNamespace(ctx, namespace) + require.NoError(t, err) + assert.Equal(t, int64(0), spaceUsed) + spaceUsed, err = store.SpaceUsedInNamespace(ctx, otherNamespace) + require.NoError(t, err) + assert.Equal(t, int64(0), spaceUsed) + + var totalSoFar memory.Size + for _, size := range sizesToStore { + contents := testrand.Bytes(size) + blobRef := storage.BlobRef{Namespace: namespace, Key: testrand.Bytes(keySize)} + + blobWriter, err := store.Create(ctx, blobRef, int64(len(contents))) + require.NoError(t, err) + _, err = blobWriter.Write(contents) + require.NoError(t, err) + err = blobWriter.Commit(ctx) + require.NoError(t, err) + totalSoFar += size + + spaceUsed, err := store.SpaceUsed(ctx) + require.NoError(t, err) + assert.Equal(t, int64(totalSoFar), spaceUsed) + spaceUsed, err = store.SpaceUsedInNamespace(ctx, namespace) + require.NoError(t, err) + assert.Equal(t, int64(totalSoFar), spaceUsed) + spaceUsed, err = store.SpaceUsedInNamespace(ctx, otherNamespace) + require.NoError(t, err) + assert.Equal(t, int64(0), spaceUsed) + } +} + +// Check that ListNamespaces and WalkNamespace work as expected. +func TestStoreTraversals(t *testing.T) { + ctx := testcontext.New(t) + defer ctx.Cleanup() + + store, err := filestore.NewAt(zaptest.NewLogger(t), ctx.Dir("store")) + require.NoError(t, err) + ctx.Check(store.Close) + + // invent some namespaces and store stuff in them + type namespaceWithBlobs struct { + namespace []byte + blobs []storage.BlobRef + } + const numNamespaces = 4 + recordsToInsert := make([]namespaceWithBlobs, numNamespaces) + + var namespaceBase = testrand.Bytes(namespaceSize) + for i := range recordsToInsert { + // give each namespace a similar ID but modified in the last byte to distinguish + recordsToInsert[i].namespace = make([]byte, len(namespaceBase)) + copy(recordsToInsert[i].namespace, namespaceBase) + recordsToInsert[i].namespace[len(namespaceBase)-1] = byte(i) + + // put varying numbers of blobs in the namespaces + recordsToInsert[i].blobs = make([]storage.BlobRef, i+1) + for j := range recordsToInsert[i].blobs { + recordsToInsert[i].blobs[j] = storage.BlobRef{ + Namespace: recordsToInsert[i].namespace, + Key: testrand.Bytes(keySize), + } + blobWriter, err := store.Create(ctx, recordsToInsert[i].blobs[j], 0) + require.NoError(t, err) + // also vary the sizes of the blobs so we can check Stat results + _, err = blobWriter.Write(testrand.Bytes(memory.Size(j))) + require.NoError(t, err) + err = blobWriter.Commit(ctx) + require.NoError(t, err) + } + } + + // test ListNamespaces + gotNamespaces, err := store.ListNamespaces(ctx) + require.NoError(t, err) + sort.Slice(gotNamespaces, func(i, j int) bool { + return bytes.Compare(gotNamespaces[i], gotNamespaces[j]) < 0 + }) + sort.Slice(recordsToInsert, func(i, j int) bool { + return bytes.Compare(recordsToInsert[i].namespace, recordsToInsert[j].namespace) < 0 + }) + for i, expected := range recordsToInsert { + require.Equalf(t, expected.namespace, gotNamespaces[i], "mismatch at index %d: recordsToInsert is %+v and 
gotNamespaces is %v", i, recordsToInsert, gotNamespaces) + } + + // test WalkNamespace + for _, expected := range recordsToInsert { + // this isn't strictly necessary, since the function closure below is not persisted + // past the end of a loop iteration, but this keeps the linter from complaining. + expected := expected + + // keep track of which blobs we visit with WalkNamespace + found := make([]bool, len(expected.blobs)) + + err = store.WalkNamespace(ctx, expected.namespace, func(info storage.BlobInfo) error { + gotBlobRef := info.BlobRef() + assert.Equal(t, expected.namespace, gotBlobRef.Namespace) + // find which blob this is in expected.blobs + blobIdentified := -1 + for i, expectedBlobRef := range expected.blobs { + if bytes.Equal(gotBlobRef.Key, expectedBlobRef.Key) { + found[i] = true + blobIdentified = i + } + } + // make sure this is a blob we actually put in + require.NotEqualf(t, -1, blobIdentified, + "WalkNamespace gave BlobRef %v, but I don't remember storing that", + gotBlobRef) + + // check BlobInfo sanity + stat, err := info.Stat(ctx) + require.NoError(t, err) + nameFromStat := stat.Name() + fullPath, err := info.FullPath(ctx) + require.NoError(t, err) + basePath := filepath.Base(fullPath) + assert.Equal(t, nameFromStat, basePath) + assert.Equal(t, int64(blobIdentified), stat.Size()) + assert.False(t, stat.IsDir()) + return nil + }) + require.NoError(t, err) + + // make sure all blobs were visited + for i := range found { + assert.True(t, found[i], + "WalkNamespace never yielded blob at index %d: %v", + i, expected.blobs[i]) + } + } + + // test WalkNamespace on a nonexistent namespace also + namespaceBase[len(namespaceBase)-1] = byte(numNamespaces) + err = store.WalkNamespace(ctx, namespaceBase, func(info storage.BlobInfo) error { + t.Fatal("this should not have been called") + return nil + }) + require.NoError(t, err) + + // check that WalkNamespace stops iterating after an error return + iterations := 0 + expectedErr := errs.New("an expected error") + err = store.WalkNamespace(ctx, recordsToInsert[numNamespaces-1].namespace, func(info storage.BlobInfo) error { + iterations++ + if iterations == 2 { + return expectedErr + } + return nil + }) + assert.Error(t, err) + assert.Equal(t, err, expectedErr) + assert.Equal(t, 2, iterations) +} diff --git a/storagenode/collector/service.go b/storagenode/collector/service.go index 48ecaae7e..725776250 100644 --- a/storagenode/collector/service.go +++ b/storagenode/collector/service.go @@ -11,7 +11,6 @@ import ( "go.uber.org/zap" monkit "gopkg.in/spacemonkeygo/monkit.v2" - "storj.io/storj/internal/memory" "storj.io/storj/internal/sync2" "storj.io/storj/storagenode/pieces" "storj.io/storj/storagenode/piecestore" @@ -28,18 +27,16 @@ type Config struct { type Service struct { log *zap.Logger pieces *pieces.Store - pieceinfos pieces.DB usedSerials piecestore.UsedSerials Loop sync2.Cycle } // NewService creates a new collector service. 
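[Reviewer note] The hunks below switch the collector from the pieceinfo DB to the piece store: expired pieces are now fetched with pieces.Store.GetExpired and failed disk deletions are recorded with pieces.Store.DeleteFailed. Condensed, the new collection loop in this file behaves roughly like this (sketch, not a verbatim copy; identifiers as in this diff, logging trimmed):

infos, err := service.pieces.GetExpired(ctx, now, batchSize)
if err != nil {
	return err
}
for _, expired := range infos {
	if err := service.pieces.Delete(ctx, expired.SatelliteID, expired.PieceID); err != nil {
		// the on-disk delete failed; mark the record instead of dropping it
		if errfailed := service.pieces.DeleteFailed(ctx, expired, now); errfailed != nil {
			service.log.Error("unable to update piece info", zap.Error(errfailed))
		}
		continue
	}
	count++
}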
-func NewService(log *zap.Logger, pieces *pieces.Store, pieceinfos pieces.DB, usedSerials piecestore.UsedSerials, config Config) *Service { +func NewService(log *zap.Logger, pieces *pieces.Store, usedSerials piecestore.UsedSerials, config Config) *Service { return &Service{ log: log, pieces: pieces, - pieceinfos: pieceinfos, usedSerials: usedSerials, Loop: *sync2.NewCycle(config.Interval), } @@ -76,15 +73,14 @@ func (service *Service) Collect(ctx context.Context, now time.Time) (err error) const batchSize = 1000 var count int64 - var bytes int64 defer func() { if count > 0 { - service.log.Info("collect", zap.Int64("count", count), zap.Stringer("size", memory.Size(bytes))) + service.log.Info("collect", zap.Int64("count", count)) } }() for k := 0; k < maxBatches; k++ { - infos, err := service.pieceinfos.GetExpired(ctx, now, batchSize) + infos, err := service.pieces.GetExpired(ctx, now, batchSize) if err != nil { return err } @@ -95,7 +91,7 @@ func (service *Service) Collect(ctx context.Context, now time.Time) (err error) for _, expired := range infos { err := service.pieces.Delete(ctx, expired.SatelliteID, expired.PieceID) if err != nil { - errfailed := service.pieceinfos.DeleteFailed(ctx, expired.SatelliteID, expired.PieceID, now) + errfailed := service.pieces.DeleteFailed(ctx, expired, now) if errfailed != nil { service.log.Error("unable to update piece info", zap.Stringer("satellite id", expired.SatelliteID), zap.Stringer("piece id", expired.PieceID), zap.Error(errfailed)) } @@ -103,14 +99,7 @@ func (service *Service) Collect(ctx context.Context, now time.Time) (err error) continue } - err = service.pieceinfos.Delete(ctx, expired.SatelliteID, expired.PieceID) - if err != nil { - service.log.Error("unable to delete piece info", zap.Stringer("satellite id", expired.SatelliteID), zap.Stringer("piece id", expired.PieceID), zap.Error(err)) - continue - } - count++ - bytes += expired.PieceSize } } diff --git a/storagenode/collector/service_test.go b/storagenode/collector/service_test.go index 82c8b5485..b0a67e15b 100644 --- a/storagenode/collector/service_test.go +++ b/storagenode/collector/service_test.go @@ -51,11 +51,11 @@ func TestCollector(t *testing.T) { // imagine we are 30 minutes in the future for _, storageNode := range planet.StorageNodes { - pieceinfos := storageNode.DB.PieceInfo() + pieceStore := storageNode.DB.Pieces() usedSerials := storageNode.DB.UsedSerials() // verify that we actually have some data on storage nodes - used, err := pieceinfos.SpaceUsed(ctx) + used, err := pieceStore.SpaceUsed(ctx) require.NoError(t, err) if used == 0 { // this storage node didn't get picked for storing data @@ -101,7 +101,7 @@ func TestCollector(t *testing.T) { // imagine we are 10 days in the future for _, storageNode := range planet.StorageNodes { - pieceinfos := storageNode.DB.PieceInfo() + pieceStore := storageNode.DB.Pieces() usedSerials := storageNode.DB.UsedSerials() // collect all the data @@ -109,7 +109,7 @@ func TestCollector(t *testing.T) { require.NoError(t, err) // verify that we deleted everything - used, err := pieceinfos.SpaceUsed(ctx) + used, err := pieceStore.SpaceUsed(ctx) require.NoError(t, err) require.Equal(t, int64(0), used) diff --git a/storagenode/console/service.go b/storagenode/console/service.go index 9c8a2bac3..53f810701 100644 --- a/storagenode/console/service.go +++ b/storagenode/console/service.go @@ -46,7 +46,7 @@ type Service struct { consoleDB DB bandwidthDB bandwidth.DB - pieceInfoDB pieces.DB + pieceStore *pieces.Store kademlia *kademlia.Kademlia version 
*version.Service nodestats *nodestats.Service @@ -59,7 +59,7 @@ type Service struct { } // NewService returns new instance of Service -func NewService(log *zap.Logger, consoleDB DB, bandwidth bandwidth.DB, pieceInfo pieces.DB, kademlia *kademlia.Kademlia, version *version.Service, +func NewService(log *zap.Logger, consoleDB DB, bandwidth bandwidth.DB, pieceStore *pieces.Store, kademlia *kademlia.Kademlia, version *version.Service, nodestats *nodestats.Service, allocatedBandwidth, allocatedDiskSpace memory.Size, walletAddress string, versionInfo version.Info) (*Service, error) { if log == nil { return nil, errs.New("log can't be nil") @@ -73,8 +73,8 @@ func NewService(log *zap.Logger, consoleDB DB, bandwidth bandwidth.DB, pieceInfo return nil, errs.New("bandwidth can't be nil") } - if pieceInfo == nil { - return nil, errs.New("pieceInfo can't be nil") + if pieceStore == nil { + return nil, errs.New("pieceStore can't be nil") } if version == nil { @@ -89,7 +89,7 @@ func NewService(log *zap.Logger, consoleDB DB, bandwidth bandwidth.DB, pieceInfo log: log, consoleDB: consoleDB, bandwidthDB: bandwidth, - pieceInfoDB: pieceInfo, + pieceStore: pieceStore, kademlia: kademlia, version: version, nodestats: nodestats, @@ -146,7 +146,7 @@ func (s *Service) GetBandwidthBySatellite(ctx context.Context, satelliteID storj func (s *Service) GetUsedStorageTotal(ctx context.Context) (_ *DiskSpaceInfo, err error) { defer mon.Task()(&ctx)(&err) - spaceUsed, err := s.pieceInfoDB.SpaceUsed(ctx) + spaceUsed, err := s.pieceStore.SpaceUsedForPieces(ctx) if err != nil { return nil, err } @@ -158,7 +158,7 @@ func (s *Service) GetUsedStorageTotal(ctx context.Context) (_ *DiskSpaceInfo, er func (s *Service) GetUsedStorageBySatellite(ctx context.Context, satelliteID storj.NodeID) (_ *DiskSpaceInfo, err error) { defer mon.Task()(&ctx)(&err) - spaceUsed, err := s.pieceInfoDB.SpaceUsedBySatellite(ctx, satelliteID) + spaceUsed, err := s.pieceStore.SpaceUsedBySatellite(ctx, satelliteID) if err != nil { return nil, err } diff --git a/storagenode/inspector/inspector.go b/storagenode/inspector/inspector.go index a0e1af1d4..fdf538de6 100644 --- a/storagenode/inspector/inspector.go +++ b/storagenode/inspector/inspector.go @@ -31,10 +31,10 @@ var ( // Endpoint does inspectory things type Endpoint struct { - log *zap.Logger - pieceInfo pieces.DB - kademlia *kademlia.Kademlia - usageDB bandwidth.DB + log *zap.Logger + pieceStore *pieces.Store + kademlia *kademlia.Kademlia + usageDB bandwidth.DB startTime time.Time pieceStoreConfig piecestore.OldConfig @@ -44,7 +44,7 @@ type Endpoint struct { // NewEndpoint creates piecestore inspector instance func NewEndpoint( log *zap.Logger, - pieceInfo pieces.DB, + pieceStore *pieces.Store, kademlia *kademlia.Kademlia, usageDB bandwidth.DB, pieceStoreConfig piecestore.OldConfig, @@ -52,7 +52,7 @@ func NewEndpoint( return &Endpoint{ log: log, - pieceInfo: pieceInfo, + pieceStore: pieceStore, kademlia: kademlia, usageDB: usageDB, pieceStoreConfig: pieceStoreConfig, @@ -65,7 +65,7 @@ func (inspector *Endpoint) retrieveStats(ctx context.Context) (_ *pb.StatSummary defer mon.Task()(&ctx)(&err) // Space Usage - totalUsedSpace, err := inspector.pieceInfo.SpaceUsed(ctx) + totalUsedSpace, err := inspector.pieceStore.SpaceUsedForPieces(ctx) if err != nil { return nil, err } diff --git a/storagenode/monitor/monitor.go b/storagenode/monitor/monitor.go index b4a71e961..60f92419e 100644 --- a/storagenode/monitor/monitor.go +++ b/storagenode/monitor/monitor.go @@ -38,7 +38,6 @@ type Service struct { log 
*zap.Logger routingTable *kademlia.RoutingTable store *pieces.Store - pieceInfo pieces.DB usageDB bandwidth.DB allocatedDiskSpace int64 allocatedBandwidth int64 @@ -49,12 +48,11 @@ type Service struct { // TODO: should it be responsible for monitoring actual bandwidth as well? // NewService creates a new storage node monitoring service. -func NewService(log *zap.Logger, routingTable *kademlia.RoutingTable, store *pieces.Store, pieceInfo pieces.DB, usageDB bandwidth.DB, allocatedDiskSpace, allocatedBandwidth int64, interval time.Duration, config Config) *Service { +func NewService(log *zap.Logger, routingTable *kademlia.RoutingTable, store *pieces.Store, usageDB bandwidth.DB, allocatedDiskSpace, allocatedBandwidth int64, interval time.Duration, config Config) *Service { return &Service{ log: log, routingTable: routingTable, store: store, - pieceInfo: pieceInfo, usageDB: usageDB, allocatedDiskSpace: allocatedDiskSpace, allocatedBandwidth: allocatedBandwidth, @@ -162,7 +160,7 @@ func (service *Service) updateNodeInformation(ctx context.Context) (err error) { func (service *Service) usedSpace(ctx context.Context) (_ int64, err error) { defer mon.Task()(&ctx)(&err) - usedSpace, err := service.pieceInfo.SpaceUsed(ctx) + usedSpace, err := service.store.SpaceUsedForPieces(ctx) if err != nil { return 0, err } @@ -181,7 +179,7 @@ func (service *Service) usedBandwidth(ctx context.Context) (_ int64, err error) // AvailableSpace returns available disk space for upload func (service *Service) AvailableSpace(ctx context.Context) (_ int64, err error) { defer mon.Task()(&ctx)(&err) - usedSpace, err := service.pieceInfo.SpaceUsed(ctx) + usedSpace, err := service.store.SpaceUsedForPieces(ctx) if err != nil { return 0, Error.Wrap(err) } diff --git a/storagenode/peer.go b/storagenode/peer.go index 63faeca16..cf42a5de2 100644 --- a/storagenode/peer.go +++ b/storagenode/peer.go @@ -53,7 +53,8 @@ type DB interface { Pieces() storage.Blobs Orders() orders.DB - PieceInfo() pieces.DB + V0PieceInfo() pieces.V0PieceInfoDB + PieceExpirationDB() pieces.PieceExpirationDB Bandwidth() bandwidth.DB UsedSerials() piecestore.UsedSerials Vouchers() vouchers.DB @@ -225,13 +226,12 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, config Config, ver return nil, errs.Combine(err, peer.Close()) } - peer.Storage2.Store = pieces.NewStore(peer.Log.Named("pieces"), peer.DB.Pieces()) + peer.Storage2.Store = pieces.NewStore(peer.Log.Named("pieces"), peer.DB.Pieces(), peer.DB.V0PieceInfo(), peer.DB.PieceExpirationDB()) peer.Storage2.Monitor = monitor.NewService( log.Named("piecestore:monitor"), peer.Kademlia.RoutingTable, peer.Storage2.Store, - peer.DB.PieceInfo(), peer.DB.Bandwidth(), config.Storage.AllocatedDiskSpace.Int64(), config.Storage.AllocatedBandwidth.Int64(), @@ -246,7 +246,6 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, config Config, ver peer.Storage2.Trust, peer.Storage2.Monitor, peer.Storage2.Store, - peer.DB.PieceInfo(), peer.DB.Orders(), peer.DB.Bandwidth(), peer.DB.UsedSerials(), @@ -285,7 +284,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, config Config, ver peer.Log.Named("console:service"), peer.DB.Console(), peer.DB.Bandwidth(), - peer.DB.PieceInfo(), + peer.Storage2.Store, peer.Kademlia.Service, peer.Version, peer.NodeStats, @@ -314,7 +313,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, config Config, ver { // setup storage inspector peer.Storage2.Inspector = inspector.NewEndpoint( peer.Log.Named("pieces:inspector"), - peer.DB.PieceInfo(), + 
peer.Storage2.Store, peer.Kademlia.Service, peer.DB.Bandwidth(), config.Storage, @@ -323,7 +322,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, config Config, ver pb.RegisterPieceStoreInspectorServer(peer.Server.PrivateGRPC(), peer.Storage2.Inspector) } - peer.Collector = collector.NewService(peer.Log.Named("collector"), peer.Storage2.Store, peer.DB.PieceInfo(), peer.DB.UsedSerials(), config.Collector) + peer.Collector = collector.NewService(peer.Log.Named("collector"), peer.Storage2.Store, peer.DB.UsedSerials(), config.Collector) peer.Bandwidth = bandwidth.NewService(peer.Log.Named("bandwidth"), peer.DB.Bandwidth(), config.Bandwidth) diff --git a/storagenode/pieces/db_test.go b/storagenode/pieces/db_test.go index 43cabdbd4..6576b5057 100644 --- a/storagenode/pieces/db_test.go +++ b/storagenode/pieces/db_test.go @@ -23,12 +23,12 @@ import ( "storj.io/storj/storagenode/storagenodedb/storagenodedbtest" ) -func TestPieceInfo(t *testing.T) { +func TestV0PieceInfo(t *testing.T) { storagenodedbtest.Run(t, func(t *testing.T, db storagenode.DB) { ctx := testcontext.New(t) defer ctx.Cleanup() - pieceinfos := db.PieceInfo() + pieceinfos := db.V0PieceInfo().(pieces.V0PieceInfoDBForTest) satellite0 := testidentity.MustPregeneratedSignedIdentity(0, storj.LatestIDVersion()) satellite1 := testidentity.MustPregeneratedSignedIdentity(1, storj.LatestIDVersion()) diff --git a/storagenode/pieces/pieceexpiration_test.go b/storagenode/pieces/pieceexpiration_test.go new file mode 100644 index 000000000..6aca04be9 --- /dev/null +++ b/storagenode/pieces/pieceexpiration_test.go @@ -0,0 +1,91 @@ +// Copyright (C) 2019 Storj Labs, Inc. +// See LICENSE for copying information. + +package pieces_test + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "storj.io/storj/internal/testcontext" + "storj.io/storj/internal/testrand" + "storj.io/storj/storagenode" + "storj.io/storj/storagenode/pieces" + "storj.io/storj/storagenode/storagenodedb/storagenodedbtest" +) + +func TestPieceExpirationDB(t *testing.T) { + // test GetExpired, SetExpiration, DeleteExpiration, DeleteFailed + storagenodedbtest.Run(t, func(t *testing.T, db storagenode.DB) { + ctx := testcontext.New(t) + defer ctx.Cleanup() + + expireDB := db.PieceExpirationDB() + + satelliteID := testrand.NodeID() + pieceID := testrand.PieceID() + expectedExpireInfo := pieces.ExpiredInfo{ + SatelliteID: satelliteID, + PieceID: pieceID, + InPieceInfo: false, + } + + // GetExpired with no matches + expiredPieceIDs, err := expireDB.GetExpired(ctx, time.Now(), 1000) + require.NoError(t, err) + require.Len(t, expiredPieceIDs, 0) + + // DeleteExpiration with no matches + found, err := expireDB.DeleteExpiration(ctx, satelliteID, pieceID) + require.NoError(t, err) + require.False(t, found) + + // DeleteFailed with no matches + err = expireDB.DeleteFailed(ctx, satelliteID, pieceID, time.Now()) + require.NoError(t, err) + + expireAt := time.Now() + + // SetExpiration normal usage + err = expireDB.SetExpiration(ctx, satelliteID, pieceID, expireAt) + require.NoError(t, err) + + // SetExpiration duplicate + err = expireDB.SetExpiration(ctx, satelliteID, pieceID, expireAt.Add(time.Hour)) + require.Error(t, err) + + // GetExpired normal usage + expiredPieceIDs, err = expireDB.GetExpired(ctx, expireAt.Add(time.Microsecond), 1000) + require.NoError(t, err) + require.Len(t, expiredPieceIDs, 1) + assert.Equal(t, expiredPieceIDs[0], expectedExpireInfo) + + deleteFailedAt := expireAt.Add(2 * time.Microsecond) + 
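[Reviewer note] The assertions below this point pin down a subtle boundary: after DeleteFailed is called with time t, GetExpired with a cutoff exactly equal to t skips the marked row, while any strictly later cutoff returns it again. In other words, a failed on-disk deletion keeps its expiration record and becomes visible again to later collector passes rather than being dropped.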
+ // DeleteFailed normal usage + err = expireDB.DeleteFailed(ctx, satelliteID, pieceID, deleteFailedAt) + require.NoError(t, err) + + // GetExpired filters out rows with deletion_failed_at = t + expiredPieceIDs, err = expireDB.GetExpired(ctx, deleteFailedAt, 1000) + require.NoError(t, err) + require.Len(t, expiredPieceIDs, 0) + expiredPieceIDs, err = expireDB.GetExpired(ctx, deleteFailedAt.Add(time.Microsecond), 1000) + require.NoError(t, err) + require.Len(t, expiredPieceIDs, 1) + assert.Equal(t, expiredPieceIDs[0], expectedExpireInfo) + + // DeleteExpiration normal usage + found, err = expireDB.DeleteExpiration(ctx, satelliteID, pieceID) + require.NoError(t, err) + require.True(t, found) + + // Should not be there anymore + expiredPieceIDs, err = expireDB.GetExpired(ctx, expireAt.Add(365*24*time.Hour), 1000) + require.NoError(t, err) + require.Len(t, expiredPieceIDs, 0) + }) +} diff --git a/storagenode/pieces/readwrite.go b/storagenode/pieces/readwrite.go index 5f9f91aa9..b8996acd1 100644 --- a/storagenode/pieces/readwrite.go +++ b/storagenode/pieces/readwrite.go @@ -5,18 +5,61 @@ package pieces import ( "context" + "encoding/binary" "hash" "io" + "github.com/gogo/protobuf/proto" + "github.com/zeebo/errs" + + "storj.io/storj/pkg/pb" "storj.io/storj/pkg/pkcrypto" "storj.io/storj/storage" + "storj.io/storj/storage/filestore" +) + +const ( + // V1PieceHeaderReservedArea is the amount of space to be reserved at the beginning of + // pieces stored with filestore.FormatV1 or greater. Serialized piece headers should be + // written into that space, and the remaining space afterward should be zeroes. + // V1PieceHeaderReservedArea includes the size of the framing field + // (v1PieceHeaderFrameSize). It has a constant size because: + // + // * We do not anticipate needing more than this. + // * We will be able to sum up all space used by a satellite (or all satellites) without + // opening and reading from each piece file (stat() is faster than open()). + // * This simplifies piece file writing (if we needed to know the exact header size + // before writing, then we'd need to spool the entire contents of the piece somewhere + // before we could calculate the hash and size). This way, we can simply reserve the + // header space, write the piece content as it comes in, and then seek back to the + // beginning and fill in the header. + // + // We put it at the beginning of piece files because: + // + // * If we put it at the end instead, we would have to seek to the end of a file (to find + // out the real size while avoiding race conditions with stat()) and then seek backward + // again to get the header, and then seek back to the beginning to get the content. + // Seeking on spinning platter hard drives is very slow compared to reading sequential + // bytes. + // * Putting the header in the middle of piece files might be entertaining, but it would + // also be silly. + // * If piece files are incorrectly truncated or not completely written, it will be + // much easier to identify those cases when the header is intact and findable. + // + // If more space than this is needed, we will need to use a new storage format version. + V1PieceHeaderReservedArea = 512 + + // v1PieceHeaderFramingSize is the size of the field used at the beginning of piece + // files to indicate the size of the marshaled piece header within the reserved header + // area (because protobufs are not self-delimiting, which is lame). 
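[Reviewer note] Taken together, the reserved-area constant above and the framing constant declared just below give every FormatV1 piece file this fixed layout (illustrative summary; N is the length of the serialized header):

	bytes [0, 2)      framing field: big-endian uint16 holding N
	bytes [2, 2+N)    serialized pb.PieceHeader (N is at most 510, enforced in Commit)
	bytes [2+N, 512)  zero padding for the rest of the reserved area
	bytes [512, ...)  the piece content itself, which is what Writer.Size() and Reader.Size() report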
+ v1PieceHeaderFramingSize = 2 ) // Writer implements a piece writer that writes content to blob store and calculates a hash. type Writer struct { - hash hash.Hash - blob storage.BlobWriter - size int64 + hash hash.Hash + blob storage.BlobWriter + pieceSize int64 // piece size only; i.e., not including piece header closed bool } @@ -24,6 +67,20 @@ type Writer struct { // NewWriter creates a new writer for storage.BlobWriter. func NewWriter(blob storage.BlobWriter) (*Writer, error) { w := &Writer{} + if blob.StorageFormatVersion() >= filestore.FormatV1 { + // We skip past the reserved header area for now- we want the header to be at the + // beginning of the file, to make it quick to seek there and also to make it easier + // to identify situations where a blob file has been truncated incorrectly. And we + // don't know what exactly is going to be in the header yet--we won't know what the + // hash or size or timestamp or expiration or signature fields need to be until we + // have received the whole piece. + // + // Once the writer calls Commit() on this writer, we will seek back to the beginning + // of the file and write the header. + if _, err := blob.Seek(V1PieceHeaderReservedArea, io.SeekStart); err != nil { + return nil, Error.Wrap(err) + } + } w.blob = blob w.hash = pkcrypto.NewHash() return w, nil @@ -32,7 +89,7 @@ func NewWriter(blob storage.BlobWriter) (*Writer, error) { // Write writes data to the blob and calculates the hash. func (w *Writer) Write(data []byte) (int, error) { n, err := w.blob.Write(data) - w.size += int64(n) + w.pieceSize += int64(n) _, _ = w.hash.Write(data[:n]) // guaranteed not to return an error if err == io.EOF { return n, err @@ -40,20 +97,78 @@ func (w *Writer) Write(data []byte) (int, error) { return n, Error.Wrap(err) } -// Size returns the amount of data written so far. -func (w *Writer) Size() int64 { return w.size } +// Size returns the amount of data written to the piece so far, not including the size of +// the piece header. +func (w *Writer) Size() int64 { return w.pieceSize } // Hash returns the hash of data written so far. func (w *Writer) Hash() []byte { return w.hash.Sum(nil) } // Commit commits piece to permanent storage. -func (w *Writer) Commit(ctx context.Context) (err error) { +func (w *Writer) Commit(ctx context.Context, pieceHeader *pb.PieceHeader) (err error) { defer mon.Task()(&ctx)(&err) if w.closed { return Error.New("already closed") } + + // point of no return: after this we definitely either commit or cancel w.closed = true - return Error.Wrap(w.blob.Commit(ctx)) + defer func() { + if err != nil { + err = Error.Wrap(errs.Combine(err, w.blob.Cancel(ctx))) + } else { + err = Error.Wrap(w.blob.Commit(ctx)) + } + }() + + formatVer := w.blob.StorageFormatVersion() + if formatVer == filestore.FormatV0 { + return nil + } + pieceHeader.FormatVersion = pb.PieceHeader_FormatVersion(formatVer) + headerBytes, err := proto.Marshal(pieceHeader) + if err != nil { + return err + } + mon.IntVal("storagenode_pieces_pieceheader_size").Observe(int64(len(headerBytes))) + if len(headerBytes) > (V1PieceHeaderReservedArea - v1PieceHeaderFramingSize) { + // This should never happen under normal circumstances, and it might deserve a panic(), + // but I'm not *entirely* sure this case can't be triggered by a malicious uplink. Are + // google.protobuf.Timestamp fields variable-width? 
+ mon.Meter("storagenode_pieces_pieceheader_overflow").Mark(len(headerBytes)) + return Error.New("marshaled piece header too big!") + } + size, err := w.blob.Size() + if err != nil { + return err + } + if _, err := w.blob.Seek(0, io.SeekStart); err != nil { + return err + } + + // We need to store some "framing" bytes first, because protobufs are not self-delimiting. + // In cases where the serialized pieceHeader is not exactly V1PieceHeaderReservedArea bytes + // (probably _all_ cases), without this marker, we wouldn't have any way to take the + // V1PieceHeaderReservedArea bytes from a piece blob and trim off the right number of zeroes + // at the end so that the protobuf unmarshals correctly. + var framingBytes [v1PieceHeaderFramingSize]byte + binary.BigEndian.PutUint16(framingBytes[:], uint16(len(headerBytes))) + if _, err = w.blob.Write(framingBytes[:]); err != nil { + return Error.New("failed writing piece framing field at file start: %v", err) + } + + // Now write the serialized header bytes. + if _, err = w.blob.Write(headerBytes); err != nil { + return Error.New("failed writing piece header at file start: %v", err) + } + + // seek back to the end, as blob.Commit will truncate from the current file position. + // (don't try to seek(0, io.SeekEnd), because dir.CreateTemporaryFile preallocs space + // and the actual end of the file might be far past the intended end of the piece.) + if _, err := w.blob.Seek(size, io.SeekStart); err != nil { + return err + } + return nil } // Cancel deletes any temporarily written data. @@ -68,9 +183,11 @@ func (w *Writer) Cancel(ctx context.Context) (err error) { // Reader implements a piece reader that reads content from blob store. type Reader struct { - blob storage.BlobReader - pos int64 - size int64 + formatVersion storage.FormatVersion + + blob storage.BlobReader + pos int64 // relative to file start; i.e., it includes piece header + pieceSize int64 // piece size only; i.e., not including piece header } // NewReader creates a new reader for storage.BlobReader. @@ -79,16 +196,80 @@ func NewReader(blob storage.BlobReader) (*Reader, error) { if err != nil { return nil, Error.Wrap(err) } + formatVersion := blob.StorageFormatVersion() + if formatVersion >= filestore.FormatV1 { + if size < V1PieceHeaderReservedArea { + return nil, Error.New("invalid piece file for storage format version %d: too small for header (%d < %d)", formatVersion, size, V1PieceHeaderReservedArea) + } + size -= V1PieceHeaderReservedArea + } - reader := &Reader{} - reader.blob = blob - reader.size = size - + reader := &Reader{ + formatVersion: formatVersion, + blob: blob, + pieceSize: size, + } return reader, nil } +// StorageFormatVersion returns the storage format version of the piece being read. +func (r *Reader) StorageFormatVersion() storage.FormatVersion { + return r.formatVersion +} + +// GetPieceHeader reads, unmarshals, and returns the piece header. It may only be called once, +// before any Read() calls. (Retrieving the header at any time could be supported, but for the sake +// of performance we need to understand why and how often that would happen.) +func (r *Reader) GetPieceHeader() (*pb.PieceHeader, error) { + if r.formatVersion < filestore.FormatV1 { + return nil, Error.New("Can't get piece header from storage format V0 reader") + } + if r.pos != 0 { + return nil, Error.New("GetPieceHeader called when not at the beginning of the blob stream") + } + // We need to read the size of the serialized header protobuf before we read the header + // itself. 
The headers aren't a constant size, although V1PieceHeaderReservedArea is + // constant. Without this marker, we wouldn't have any way to know how much of the + // reserved header area is supposed to make up the serialized header protobuf. + var headerBytes [V1PieceHeaderReservedArea]byte + framingBytes := headerBytes[:v1PieceHeaderFramingSize] + n, err := io.ReadFull(r.blob, framingBytes) + if err != nil { + return nil, Error.Wrap(err) + } + if n != v1PieceHeaderFramingSize { + return nil, Error.New("Could not read whole PieceHeader framing field") + } + r.pos += int64(n) + headerSize := binary.BigEndian.Uint16(framingBytes) + if headerSize > (V1PieceHeaderReservedArea - v1PieceHeaderFramingSize) { + return nil, Error.New("PieceHeader framing field claims impossible size of %d bytes", headerSize) + } + + // Now we can read the actual serialized header. + pieceHeaderBytes := headerBytes[v1PieceHeaderFramingSize : v1PieceHeaderFramingSize+headerSize] + n, err = io.ReadFull(r.blob, pieceHeaderBytes) + if err != nil { + return nil, Error.Wrap(err) + } + r.pos += int64(n) + + // Deserialize and return. + header := &pb.PieceHeader{} + if err := proto.Unmarshal(pieceHeaderBytes, header); err != nil { + return nil, Error.New("piece header: %v", err) + } + return header, nil +} + // Read reads data from the underlying blob, buffering as necessary. func (r *Reader) Read(data []byte) (int, error) { + if r.formatVersion >= filestore.FormatV1 && r.pos < V1PieceHeaderReservedArea { + // should only be necessary once per reader. or zero times, if GetPieceHeader is used + if _, err := r.blob.Seek(V1PieceHeaderReservedArea, io.SeekStart); err != nil { + return 0, Error.Wrap(err) + } + } n, err := r.blob.Read(data) r.pos += int64(n) if err == io.EOF { @@ -97,22 +278,37 @@ func (r *Reader) Read(data []byte) (int, error) { return n, Error.Wrap(err) } -// Seek seeks to the specified location. +// Seek seeks to the specified location within the piece content (ignoring the header). func (r *Reader) Seek(offset int64, whence int) (int64, error) { + if whence == io.SeekStart && r.formatVersion >= filestore.FormatV1 { + offset += V1PieceHeaderReservedArea + } if whence == io.SeekStart && r.pos == offset { return r.pos, nil } pos, err := r.blob.Seek(offset, whence) r.pos = pos + if r.formatVersion >= filestore.FormatV1 { + if pos < V1PieceHeaderReservedArea { + // any position within the file header should show as 0 here + pos = 0 + } else { + pos -= V1PieceHeaderReservedArea + } + } if err == io.EOF { return pos, err } return pos, Error.Wrap(err) } -// ReadAt reads data at the specified offset +// ReadAt reads data at the specified offset, which is relative to the piece content, +// not the underlying blob. The piece header is not reachable by this method. func (r *Reader) ReadAt(data []byte, offset int64) (int, error) { + if r.formatVersion >= filestore.FormatV1 { + offset += V1PieceHeaderReservedArea + } n, err := r.blob.ReadAt(data, offset) if err == io.EOF { return n, err @@ -120,8 +316,8 @@ func (r *Reader) ReadAt(data []byte, offset int64) (int, error) { return n, Error.Wrap(err) } -// Size returns the amount of data written so far. -func (r *Reader) Size() int64 { return r.size } +// Size returns the amount of data in the piece. +func (r *Reader) Size() int64 { return r.pieceSize } // Close closes the reader. 
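[Reviewer note] Pulling the Writer and Reader halves of this file together, a caller writes and then reads back a FormatV1 piece roughly as in the fragment below. This is a sketch only: store, satelliteID, pieceID, and content are assumed inputs, the store is a *pieces.Store wired up as in storagenode/peer.go, and it uses the io and time imports on top of those already in this file.

// write: the header is supplied at Commit time, once the hash and size are known
w, err := store.Writer(ctx, satelliteID, pieceID)
if err != nil {
	return err
}
if _, err := w.Write(content); err != nil {
	return errs.Combine(err, w.Cancel(ctx))
}
err = w.Commit(ctx, &pb.PieceHeader{
	Hash:         w.Hash(),
	CreationTime: time.Now(),
})
if err != nil {
	return err
}

// read: the header must be requested before the first Read
r, err := store.Reader(ctx, satelliteID, pieceID)
if err != nil {
	return err
}
defer func() { _ = r.Close() }()
if r.StorageFormatVersion() >= filestore.FormatV1 {
	header, err := r.GetPieceHeader()
	if err != nil {
		return err
	}
	_ = header // e.g. compare header.Hash against a hash of the content read below
}
buf := make([]byte, r.Size()) // Size() is the content length; the header is excluded
_, err = io.ReadFull(r, buf)
return err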
func (r *Reader) Close() error { diff --git a/storagenode/pieces/readwrite_test.go b/storagenode/pieces/readwrite_test.go index 46772fe88..15f2e086d 100644 --- a/storagenode/pieces/readwrite_test.go +++ b/storagenode/pieces/readwrite_test.go @@ -6,14 +6,19 @@ package pieces_test import ( "io" "testing" + "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/zap" + "go.uber.org/zap/zaptest" "storj.io/storj/internal/memory" "storj.io/storj/internal/testcontext" "storj.io/storj/internal/testrand" + "storj.io/storj/pkg/pb" "storj.io/storj/pkg/storj" + "storj.io/storj/storage" "storj.io/storj/storage/filestore" "storj.io/storj/storagenode/pieces" ) @@ -24,10 +29,10 @@ func BenchmarkReadWrite(b *testing.B) { dir, err := filestore.NewDir(ctx.Dir("pieces")) require.NoError(b, err) - blobs := filestore.New(dir) + blobs := filestore.New(zap.NewNop(), dir) defer ctx.Check(blobs.Close) - store := pieces.NewStore(zap.NewNop(), blobs) + store := pieces.NewStore(zap.NewNop(), blobs, nil, nil) // setup test parameters const blockSize = int(256 * memory.KiB) @@ -51,7 +56,7 @@ func BenchmarkReadWrite(b *testing.B) { data = data[n:] } - require.NoError(b, writer.Commit(ctx)) + require.NoError(b, writer.Commit(ctx, &pb.PieceHeader{})) } }) @@ -61,7 +66,7 @@ func BenchmarkReadWrite(b *testing.B) { require.NoError(b, err) _, err = writer.Write(source) require.NoError(b, err) - require.NoError(b, writer.Commit(ctx)) + require.NoError(b, writer.Commit(ctx, &pb.PieceHeader{})) } b.Run("Read", func(b *testing.B) { @@ -83,3 +88,131 @@ func BenchmarkReadWrite(b *testing.B) { } }) } + +func readAndWritePiece(t *testing.T, content []byte) { + ctx := testcontext.New(t) + defer ctx.Cleanup() + + dir, err := filestore.NewDir(ctx.Dir("pieces")) + require.NoError(t, err) + blobs := filestore.New(zaptest.NewLogger(t), dir) + defer ctx.Check(blobs.Close) + + store := pieces.NewStore(zaptest.NewLogger(t), blobs, nil, nil) + + // test parameters + satelliteID := testrand.NodeID() + pieceID := testrand.PieceID() + fakeHash := testrand.Bytes(32) + creationTime := time.Unix(1564362827, 18364029) + fakeSig := testrand.Bytes(32) + expirationTime := time.Unix(1595898827, 18364029) + + // write a V1 format piece + w, err := store.Writer(ctx, satelliteID, pieceID) + require.NoError(t, err) + if len(content) > 0 { + _, err = w.Write(content) + require.NoError(t, err) + } + + // make sure w.Size() works + assert.Equal(t, int64(len(content)), w.Size()) + + // commit the writer with the piece header, and close it + err = w.Commit(ctx, &pb.PieceHeader{ + Hash: fakeHash, + CreationTime: creationTime, + Signature: fakeSig, + OrderLimit: pb.OrderLimit{ + PieceExpiration: expirationTime.UTC(), + }, + }) + require.NoError(t, err) + + // open a reader + r, err := store.Reader(ctx, satelliteID, pieceID) + require.NoError(t, err) + defer ctx.Check(r.Close) + assert.Equal(t, filestore.MaxFormatVersionSupported, r.StorageFormatVersion()) + + // make sure r.Size() works + assert.Equal(t, int64(len(content)), r.Size()) + + // make sure seek-nowhere works as expected before piece header is read + pos, err := r.Seek(0, io.SeekCurrent) + require.NoError(t, err) + require.Equal(t, int64(0), pos) + + // read piece header + header, err := r.GetPieceHeader() + require.NoError(t, err) + assert.Equal(t, fakeHash, header.Hash) + assert.Truef(t, header.CreationTime.Equal(creationTime), + "header.CreationTime = %s, but expected creationTime = %s", header.CreationTime, creationTime) + assert.Equal(t, fakeSig, 
header.Signature) + require.NotZero(t, header.OrderLimit.PieceExpiration) + assert.Truef(t, header.OrderLimit.PieceExpiration.Equal(expirationTime), + "*header.ExpirationTime = %s, but expected expirationTime = %s", header.OrderLimit.PieceExpiration, expirationTime) + assert.Equal(t, pb.OrderLimit{PieceExpiration: expirationTime.UTC()}, header.OrderLimit) + assert.Equal(t, filestore.FormatV1, storage.FormatVersion(header.FormatVersion)) + + // make sure seek-nowhere works as expected after piece header is read too + // (from the point of view of the piece store, the file position has not moved) + pos, err = r.Seek(0, io.SeekCurrent) + require.NoError(t, err) + assert.Equal(t, int64(0), pos) + + // read piece contents + bufSize := memory.MB.Int() + if len(content) < bufSize { + bufSize = len(content) + } + buf := make([]byte, bufSize) + bytesRead, err := r.Read(buf) + require.NoError(t, err) + require.Equal(t, bufSize, bytesRead) + require.Equal(t, content[:len(buf)], buf) + + // GetPieceHeader should error here now + header, err = r.GetPieceHeader() + require.Error(t, err) + assert.Truef(t, pieces.Error.Has(err), "err is not a pieces.Error: %v", err) + assert.Nil(t, header) + + // check file position again + pos, err = r.Seek(0, io.SeekCurrent) + require.NoError(t, err) + require.Equal(t, int64(bufSize), pos) + + const miniReadSize = 256 + if len(content) > int(pos+miniReadSize) { + // Continuing to read should be ok + bytesRead, err = r.Read(buf[:miniReadSize]) + require.NoError(t, err) + require.Equal(t, miniReadSize, bytesRead) + require.Equal(t, content[int(memory.MB):int(memory.MB)+miniReadSize], buf[:miniReadSize]) + + // Perform a Seek that actually moves the file pointer + const startReadFrom = 11 + pos, err = r.Seek(startReadFrom, io.SeekStart) + require.NoError(t, err) + assert.Equal(t, int64(startReadFrom), pos) + + // And make sure that Seek had an effect + bytesRead, err = r.Read(buf[:miniReadSize]) + require.NoError(t, err) + require.Equal(t, miniReadSize, bytesRead) + require.Equal(t, content[startReadFrom:startReadFrom+miniReadSize], buf[:miniReadSize]) + } +} + +func TestReadWriteWithPieceHeader(t *testing.T) { + content := testrand.Bytes(30 * memory.MB) + readAndWritePiece(t, content) +} + +func TestEmptyPiece(t *testing.T) { + var content [0]byte + readAndWritePiece(t, content[:]) +} diff --git a/storagenode/pieces/store.go b/storagenode/pieces/store.go index f3d409ae5..450a8d842 100644 --- a/storagenode/pieces/store.go +++ b/storagenode/pieces/store.go @@ -10,12 +10,13 @@ import ( "github.com/zeebo/errs" "go.uber.org/zap" - monkit "gopkg.in/spacemonkeygo/monkit.v2" + "gopkg.in/spacemonkeygo/monkit.v2" "storj.io/storj/internal/memory" "storj.io/storj/pkg/pb" "storj.io/storj/pkg/storj" "storj.io/storj/storage" + "storj.io/storj/storage/filestore" ) const ( @@ -46,42 +47,108 @@ type Info struct { type ExpiredInfo struct { SatelliteID storj.NodeID PieceID storj.PieceID - PieceSize int64 + + // This can be removed when we no longer need to support the pieceinfo db. Its only purpose + // is to keep track of whether expired entries came from piece_expirations or pieceinfo. + InPieceInfo bool } -// DB stores meta information about a piece, the actual piece is stored in storage.Blobs -type DB interface { - // Add inserts Info to the database. - Add(context.Context, *Info) error +// PieceExpirationDB stores information about pieces with expiration dates. 
+type PieceExpirationDB interface { + // GetExpired gets piece IDs that expire or have expired before the given time + GetExpired(ctx context.Context, expiresBefore time.Time, limit int64) ([]ExpiredInfo, error) + // SetExpiration sets an expiration time for the given piece ID on the given satellite + SetExpiration(ctx context.Context, satellite storj.NodeID, pieceID storj.PieceID, expiresAt time.Time) error + // DeleteExpiration removes an expiration record for the given piece ID on the given satellite + DeleteExpiration(ctx context.Context, satellite storj.NodeID, pieceID storj.PieceID) (found bool, err error) + // DeleteFailed marks an expiration record as having experienced a failure in deleting the + // piece from the disk + DeleteFailed(ctx context.Context, satelliteID storj.NodeID, pieceID storj.PieceID, failedAt time.Time) error +} + +// V0PieceInfoDB stores meta information about pieces stored with storage format V0 (where +// metadata goes in the "pieceinfo" table in the storagenodedb). The actual pieces are stored +// behind something providing the storage.Blobs interface. +type V0PieceInfoDB interface { // Get returns Info about a piece. Get(ctx context.Context, satelliteID storj.NodeID, pieceID storj.PieceID) (*Info, error) - // GetPieceIDs gets pieceIDs using the satelliteID - GetPieceIDs(ctx context.Context, satelliteID storj.NodeID, createdBefore time.Time, limit int, cursor storj.PieceID) (pieceIDs []storj.PieceID, err error) // Delete deletes Info about a piece. Delete(ctx context.Context, satelliteID storj.NodeID, pieceID storj.PieceID) error // DeleteFailed marks piece deletion from disk failed DeleteFailed(ctx context.Context, satelliteID storj.NodeID, pieceID storj.PieceID, failedAt time.Time) error - // SpaceUsed returns the in memory value for disk space used by all pieces - SpaceUsed(ctx context.Context) (int64, error) - // CalculatedSpaceUsed calculates disk space used by all pieces - CalculatedSpaceUsed(ctx context.Context) (int64, error) - // SpaceUsedBySatellite calculates disk space used by all pieces by satellite - SpaceUsedBySatellite(ctx context.Context, satelliteID storj.NodeID) (int64, error) - // GetExpired gets orders that are expired and were created before some time + // GetExpired gets piece IDs stored with storage format V0 that expire or have expired + // before the given time GetExpired(ctx context.Context, expiredAt time.Time, limit int64) ([]ExpiredInfo, error) + // WalkSatelliteV0Pieces executes walkFunc for each locally stored piece, stored + // with storage format V0 in the namespace of the given satellite. If walkFunc returns a + // non-nil error, WalkSatelliteV0Pieces will stop iterating and return the error + // immediately. The ctx parameter is intended specifically to allow canceling iteration + // early. + WalkSatelliteV0Pieces(ctx context.Context, blobStore storage.Blobs, satellite storj.NodeID, walkFunc func(StoredPieceAccess) error) error +} + +// V0PieceInfoDBForTest is like V0PieceInfoDB, but adds on the Add() method so +// that test environments with V0 piece data can be set up. +type V0PieceInfoDBForTest interface { + V0PieceInfoDB + + // Add inserts Info to the database. This is only a valid thing to do, now, + // during tests, to replicate the environment of a storage node not yet fully + // migrated to V1 storage. + Add(context.Context, *Info) error +} + +// StoredPieceAccess allows inspection and manipulation of a piece during iteration with +// WalkSatellitePieces-type methods. 
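[Reviewer note] The interface below is what WalkSatellitePieces (defined further down in store.go) hands to its callback for both V0 and V1 pieces. A typical consumer, closely mirroring SpaceUsedBySatellite later in this file, looks like this sketch (written from outside the package; error handling simplified):

var total int64
err := store.WalkSatellitePieces(ctx, satelliteID, func(access pieces.StoredPieceAccess) error {
	size, sizeErr := access.ContentSize(ctx) // piece content only, header bytes excluded
	if sizeErr != nil {
		return nil // best effort: skip entries that cannot be statted
	}
	total += size
	return nil
})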
+type StoredPieceAccess interface { + storage.BlobInfo + + // PieceID gives the pieceID of the piece + PieceID() storj.PieceID + // Satellite gives the nodeID of the satellite which owns the piece + Satellite() (storj.NodeID, error) + // ContentSize gives the size of the piece content (not including the piece header, if + // applicable) + ContentSize(ctx context.Context) (int64, error) + // CreationTime returns the piece creation time as given in the original PieceHash (which is + // likely not the same as the file mtime). For non-FormatV0 pieces, this requires opening + // the file and unmarshaling the piece header. If exact precision is not required, ModTime() + // may be a better solution. + CreationTime(ctx context.Context) (time.Time, error) + // ModTime returns a less-precise piece creation time than CreationTime, but is generally + // much faster. For non-FormatV0 pieces, this gets the piece creation time from to the + // filesystem instead of the piece header. + ModTime(ctx context.Context) (time.Time, error) } // Store implements storing pieces onto a blob storage implementation. type Store struct { - log *zap.Logger - blobs storage.Blobs + log *zap.Logger + blobs storage.Blobs + v0PieceInfo V0PieceInfoDB + expirationInfo PieceExpirationDB + + // The value of reservedSpace is always added to the return value from the + // SpaceUsedForPieces() method. + // The reservedSpace field is part of an unfortunate hack that enables testing of low-space + // or no-space conditions. It is not (or should not be) used under regular operating + // conditions. + reservedSpace int64 +} + +// StoreForTest is a wrapper around Store to be used only in test scenarios. It enables writing +// pieces with older storage formats and allows use of the ReserveSpace() method. +type StoreForTest struct { + *Store } // NewStore creates a new piece store -func NewStore(log *zap.Logger, blobs storage.Blobs) *Store { +func NewStore(log *zap.Logger, blobs storage.Blobs, v0PieceInfo V0PieceInfoDB, expirationInfo PieceExpirationDB) *Store { return &Store{ - log: log, - blobs: blobs, + log: log, + blobs: blobs, + v0PieceInfo: v0PieceInfo, + expirationInfo: expirationInfo, } } @@ -100,6 +167,37 @@ func (store *Store) Writer(ctx context.Context, satellite storj.NodeID, pieceID return writer, Error.Wrap(err) } +// WriterForFormatVersion allows opening a piece writer with a specified storage format version. +// This is meant to be used externally only in test situations (thus the StoreForTest receiver +// type). +func (store StoreForTest) WriterForFormatVersion(ctx context.Context, satellite storj.NodeID, pieceID storj.PieceID, formatVersion storage.FormatVersion) (_ *Writer, err error) { + defer mon.Task()(&ctx)(&err) + + blobRef := storage.BlobRef{ + Namespace: satellite.Bytes(), + Key: pieceID.Bytes(), + } + var blob storage.BlobWriter + switch formatVersion { + case filestore.FormatV0: + fStore, ok := store.blobs.(*filestore.Store) + if !ok { + return nil, Error.New("can't make a WriterForFormatVersion with this blob store (%T)", store.blobs) + } + tStore := filestore.StoreForTest{Store: fStore} + blob, err = tStore.CreateV0(ctx, blobRef) + case filestore.FormatV1: + blob, err = store.blobs.Create(ctx, blobRef, preallocSize.Int64()) + default: + return nil, Error.New("please teach me how to make V%d pieces", formatVersion) + } + if err != nil { + return nil, Error.Wrap(err) + } + writer, err := NewWriter(blob) + return writer, Error.Wrap(err) +} + // Reader returns a new piece reader. 
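[Reviewer note on WriterForFormatVersion above] Outside of tests the store only writes at the newest format, so this method is reachable only through the StoreForTest wrapper. A test that needs a legacy blob on disk would use it roughly like this (sketch; it mirrors the writeAPiece helper in store_test.go later in this diff):

tStore := pieces.StoreForTest{Store: store}
w, err := tStore.WriterForFormatVersion(ctx, satelliteID, pieceID, filestore.FormatV0)
if err != nil {
	return err
}
if _, err := w.Write(data); err != nil {
	return errs.Combine(err, w.Cancel(ctx))
}
// Commit still takes a header argument, but for FormatV0 blobs there is no
// reserved header area and the header is simply not written to disk.
return w.Commit(ctx, &pb.PieceHeader{})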
func (store *Store) Reader(ctx context.Context, satellite storj.NodeID, pieceID storj.PieceID) (_ *Reader, err error) { defer mon.Task()(&ctx)(&err) @@ -118,6 +216,23 @@ func (store *Store) Reader(ctx context.Context, satellite storj.NodeID, pieceID return reader, Error.Wrap(err) } +// ReaderWithStorageFormat returns a new piece reader for a located piece, which avoids the +// potential need to check multiple storage formats to find the right blob. +func (store *Store) ReaderWithStorageFormat(ctx context.Context, satellite storj.NodeID, pieceID storj.PieceID, formatVersion storage.FormatVersion) (_ *Reader, err error) { + defer mon.Task()(&ctx)(&err) + ref := storage.BlobRef{Namespace: satellite.Bytes(), Key: pieceID.Bytes()} + blob, err := store.blobs.OpenWithStorageFormat(ctx, ref, formatVersion) + if err != nil { + if os.IsNotExist(err) { + return nil, err + } + return nil, Error.Wrap(err) + } + + reader, err := NewReader(blob) + return reader, Error.Wrap(err) +} + // Delete deletes the specified piece. func (store *Store) Delete(ctx context.Context, satellite storj.NodeID, pieceID storj.PieceID) (err error) { defer mon.Task()(&ctx)(&err) @@ -125,9 +240,164 @@ func (store *Store) Delete(ctx context.Context, satellite storj.NodeID, pieceID Namespace: satellite.Bytes(), Key: pieceID.Bytes(), }) + if err != nil { + return Error.Wrap(err) + } + // delete records in both the piece_expirations and pieceinfo DBs, wherever we find it. + // both of these calls should return no error if the requested record is not found. + if store.expirationInfo != nil { + _, err = store.expirationInfo.DeleteExpiration(ctx, satellite, pieceID) + } + if store.v0PieceInfo != nil { + err = errs.Combine(err, store.v0PieceInfo.Delete(ctx, satellite, pieceID)) + } return Error.Wrap(err) } +// GetV0PieceInfoDB returns this piece-store's reference to the V0 piece info DB (or nil, +// if this piece-store does not have one). This is ONLY intended for use with testing +// functionality. +func (store *Store) GetV0PieceInfoDB() V0PieceInfoDB { + return store.v0PieceInfo +} + +// WalkSatellitePieces executes walkFunc for each locally stored piece in the namespace of the +// given satellite. If walkFunc returns a non-nil error, WalkSatellitePieces will stop iterating +// and return the error immediately. The ctx parameter is intended specifically to allow canceling +// iteration early. +// +// Note that this method includes all locally stored pieces, both V0 and higher. +func (store *Store) WalkSatellitePieces(ctx context.Context, satellite storj.NodeID, walkFunc func(StoredPieceAccess) error) (err error) { + defer mon.Task()(&ctx)(&err) + // first iterate over all in V1 storage, then all in V0 + err = store.blobs.WalkNamespace(ctx, satellite.Bytes(), func(blobInfo storage.BlobInfo) error { + if blobInfo.StorageFormatVersion() < filestore.FormatV1 { + // we'll address this piece while iterating over the V0 pieces below. + return nil + } + pieceAccess, err := newStoredPieceAccess(store, blobInfo) + if err != nil { + // this is not a real piece blob. the blob store can't distinguish between actual piece + // blobs and stray files whose names happen to decode as valid base32. skip this + // "blob". 
+ return nil + } + return walkFunc(pieceAccess) + }) + if err == nil && store.v0PieceInfo != nil { + err = store.v0PieceInfo.WalkSatelliteV0Pieces(ctx, store.blobs, satellite, walkFunc) + } + return err +} + +// GetExpired gets piece IDs that are expired and were created before the given time +func (store *Store) GetExpired(ctx context.Context, expiredAt time.Time, limit int64) (_ []ExpiredInfo, err error) { + defer mon.Task()(&ctx)(&err) + + expired, err := store.expirationInfo.GetExpired(ctx, expiredAt, limit) + if err != nil { + return nil, err + } + if int64(len(expired)) < limit && store.v0PieceInfo != nil { + v0Expired, err := store.v0PieceInfo.GetExpired(ctx, expiredAt, limit-int64(len(expired))) + if err != nil { + return nil, err + } + expired = append(expired, v0Expired...) + } + return expired, nil +} + +// SetExpiration records an expiration time for the specified piece ID owned by the specified satellite +func (store *Store) SetExpiration(ctx context.Context, satellite storj.NodeID, pieceID storj.PieceID, expiresAt time.Time) (err error) { + return store.expirationInfo.SetExpiration(ctx, satellite, pieceID, expiresAt) +} + +// DeleteFailed marks piece as a failed deletion. +func (store *Store) DeleteFailed(ctx context.Context, expired ExpiredInfo, when time.Time) (err error) { + defer mon.Task()(&ctx)(&err) + + if expired.InPieceInfo { + return store.v0PieceInfo.DeleteFailed(ctx, expired.SatelliteID, expired.PieceID, when) + } + return store.expirationInfo.DeleteFailed(ctx, expired.SatelliteID, expired.PieceID, when) +} + +// SpaceUsedForPieces returns *an approximation of* the disk space used by all local pieces (both +// V0 and later). This is an approximation because changes may be being applied to the filestore as +// this information is collected, and because it is possible that various errors in directory +// traversal could cause this count to be undersized. +// +// Important note: this metric does not include space used by piece headers, whereas +// storj/filestore/store.(*Store).SpaceUsed() *does* include all space used by the blobs. +// +// The value of reservedSpace for this Store is added to the result, but this should only affect +// tests (reservedSpace should always be 0 in real usage). +func (store *Store) SpaceUsedForPieces(ctx context.Context) (int64, error) { + satellites, err := store.getAllStoringSatellites(ctx) + if err != nil { + return 0, err + } + var total int64 + for _, satellite := range satellites { + spaceUsed, err := store.SpaceUsedBySatellite(ctx, satellite) + if err != nil { + return 0, err + } + total += spaceUsed + } + return total + store.reservedSpace, nil +} + +func (store *Store) getAllStoringSatellites(ctx context.Context) ([]storj.NodeID, error) { + namespaces, err := store.blobs.ListNamespaces(ctx) + if err != nil { + return nil, err + } + satellites := make([]storj.NodeID, len(namespaces)) + for i, namespace := range namespaces { + satellites[i], err = storj.NodeIDFromBytes(namespace) + if err != nil { + return nil, err + } + } + return satellites, nil +} + +// SpaceUsedBySatellite calculates *an approximation of* how much disk space is used for local +// piece storage in the given satellite's namespace. This is an approximation because changes may +// be being applied to the filestore as this information is collected, and because it is possible +// that various errors in directory traversal could cause this count to be undersized. 
+// +// Important note: this metric does not include space used by piece headers, whereas +// storj/filestore/store.(*Store).SpaceUsedInNamespace() *does* include all space used by the +// blobs. +func (store *Store) SpaceUsedBySatellite(ctx context.Context, satelliteID storj.NodeID) (int64, error) { + var totalUsed int64 + err := store.WalkSatellitePieces(ctx, satelliteID, func(access StoredPieceAccess) error { + contentSize, statErr := access.ContentSize(ctx) + if statErr != nil { + store.log.Error("failed to stat", zap.Error(statErr), zap.String("pieceID", access.PieceID().String()), zap.String("satellite", satelliteID.String())) + // keep iterating; we want a best effort total here. + return nil + } + totalUsed += contentSize + return nil + }) + if err != nil { + return 0, err + } + return totalUsed, nil +} + +// ReserveSpace marks some amount of free space as used, even if it's not, so that future calls +// to SpaceUsedForPieces() are raised by this amount. Calls to ReserveSpace invalidate earlier +// calls, so ReserveSpace(0) undoes all prior space reservation. This should only be used in +// test scenarios. +func (store StoreForTest) ReserveSpace(amount int64) { + store.reservedSpace = amount +} + // StorageStatus contains information about the disk store is using. type StorageStatus struct { DiskUsed int64 @@ -146,3 +416,78 @@ func (store *Store) StorageStatus(ctx context.Context) (_ StorageStatus, err err DiskFree: diskFree, }, nil } + +type storedPieceAccess struct { + storage.BlobInfo + store *Store + pieceID storj.PieceID +} + +func newStoredPieceAccess(store *Store, blobInfo storage.BlobInfo) (storedPieceAccess, error) { + pieceID, err := storj.PieceIDFromBytes(blobInfo.BlobRef().Key) + if err != nil { + return storedPieceAccess{}, err + } + return storedPieceAccess{ + BlobInfo: blobInfo, + store: store, + pieceID: pieceID, + }, nil +} + +// PieceID returns the piece ID of the piece +func (access storedPieceAccess) PieceID() storj.PieceID { + return access.pieceID +} + +// Satellite returns the satellite ID that owns the piece +func (access storedPieceAccess) Satellite() (storj.NodeID, error) { + return storj.NodeIDFromBytes(access.BlobRef().Namespace) +} + +// ContentSize gives the size of the piece content (not including the piece header, if applicable) +func (access storedPieceAccess) ContentSize(ctx context.Context) (size int64, err error) { + defer mon.Task()(&ctx)(&err) + stat, err := access.Stat(ctx) + if err != nil { + return 0, err + } + size = stat.Size() + if access.StorageFormatVersion() >= filestore.FormatV1 { + size -= V1PieceHeaderReservedArea + } + return size, nil +} + +// CreationTime returns the piece creation time as given in the original PieceHash (which is likely +// not the same as the file mtime). This requires opening the file and unmarshaling the piece +// header. If exact precision is not required, ModTime() may be a better solution. +func (access storedPieceAccess) CreationTime(ctx context.Context) (cTime time.Time, err error) { + defer mon.Task()(&ctx)(&err) + satellite, err := access.Satellite() + if err != nil { + return time.Time{}, err + } + reader, err := access.store.ReaderWithStorageFormat(ctx, satellite, access.PieceID(), access.StorageFormatVersion()) + if err != nil { + return time.Time{}, err + } + header, err := reader.GetPieceHeader() + if err != nil { + return time.Time{}, err + } + return header.CreationTime, nil +} + +// ModTime returns a less-precise piece creation time than CreationTime, but is generally +// much faster. 
This gets the piece creation time from to the filesystem instead of the +// piece header. +func (access storedPieceAccess) ModTime(ctx context.Context) (mTime time.Time, err error) { + defer mon.Task()(&ctx)(&err) + stat, err := access.Stat(ctx) + if err != nil { + return time.Time{}, err + } + + return stat.ModTime(), nil +} diff --git a/storagenode/pieces/store_test.go b/storagenode/pieces/store_test.go index f5fe5feac..bb4a91da3 100644 --- a/storagenode/pieces/store_test.go +++ b/storagenode/pieces/store_test.go @@ -5,20 +5,29 @@ package pieces_test import ( "bytes" + "context" "io" + "io/ioutil" + "os" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/zap/zaptest" + "storj.io/storj/internal/memory" "storj.io/storj/internal/testcontext" "storj.io/storj/internal/testidentity" "storj.io/storj/internal/testrand" + "storj.io/storj/pkg/pb" "storj.io/storj/pkg/pkcrypto" "storj.io/storj/pkg/storj" + "storj.io/storj/storage" "storj.io/storj/storage/filestore" + "storj.io/storj/storagenode" "storj.io/storj/storagenode/pieces" + "storj.io/storj/storagenode/storagenodedb/storagenodedbtest" ) func TestPieces(t *testing.T) { @@ -28,10 +37,10 @@ func TestPieces(t *testing.T) { dir, err := filestore.NewDir(ctx.Dir("pieces")) require.NoError(t, err) - blobs := filestore.New(dir) + blobs := filestore.New(zaptest.NewLogger(t), dir) defer ctx.Check(blobs.Close) - store := pieces.NewStore(zaptest.NewLogger(t), blobs) + store := pieces.NewStore(zaptest.NewLogger(t), blobs, nil, nil) satelliteID := testidentity.MustPregeneratedSignedIdentity(0, storj.LatestIDVersion()).ID pieceID := storj.NewPieceID() @@ -53,7 +62,7 @@ func TestPieces(t *testing.T) { assert.Equal(t, hash.Sum(nil), writer.Hash()) // commit - require.NoError(t, writer.Commit(ctx)) + require.NoError(t, writer.Commit(ctx, &pb.PieceHeader{})) // after commit we should be able to call cancel without an error require.NoError(t, writer.Cancel(ctx)) } @@ -120,10 +129,309 @@ func TestPieces(t *testing.T) { // cancel writing require.NoError(t, writer.Cancel(ctx)) // commit should not fail - require.Error(t, writer.Commit(ctx)) + require.Error(t, writer.Commit(ctx, &pb.PieceHeader{})) // read should fail _, err = store.Reader(ctx, satelliteID, cancelledPieceID) assert.Error(t, err) } } + +func writeAPiece(ctx context.Context, t testing.TB, store *pieces.Store, satelliteID storj.NodeID, pieceID storj.PieceID, data []byte, atTime time.Time, expireTime *time.Time, formatVersion storage.FormatVersion) { + tStore := &pieces.StoreForTest{store} + writer, err := tStore.WriterForFormatVersion(ctx, satelliteID, pieceID, formatVersion) + require.NoError(t, err) + + _, err = writer.Write(data) + require.NoError(t, err) + size := writer.Size() + assert.Equal(t, int64(len(data)), size) + limit := pb.OrderLimit{} + if expireTime != nil { + limit.PieceExpiration = *expireTime + } + err = writer.Commit(ctx, &pb.PieceHeader{ + Hash: writer.Hash(), + CreationTime: atTime, + OrderLimit: limit, + }) + require.NoError(t, err) +} + +func verifyPieceHandle(t testing.TB, reader *pieces.Reader, expectDataLen int, expectCreateTime time.Time, expectFormat storage.FormatVersion) { + assert.Equal(t, expectFormat, reader.StorageFormatVersion()) + assert.Equal(t, int64(expectDataLen), reader.Size()) + if expectFormat != filestore.FormatV0 { + pieceHeader, err := reader.GetPieceHeader() + require.NoError(t, err) + assert.Equal(t, expectFormat, storage.FormatVersion(pieceHeader.FormatVersion)) + assert.Equal(t, 
expectCreateTime.UTC(), pieceHeader.CreationTime.UTC()) + } +} + +func tryOpeningAPiece(ctx context.Context, t testing.TB, store *pieces.Store, satelliteID storj.NodeID, pieceID storj.PieceID, expectDataLen int, expectTime time.Time, expectFormat storage.FormatVersion) { + reader, err := store.Reader(ctx, satelliteID, pieceID) + require.NoError(t, err) + verifyPieceHandle(t, reader, expectDataLen, expectTime, expectFormat) + require.NoError(t, reader.Close()) + + reader, err = store.ReaderWithStorageFormat(ctx, satelliteID, pieceID, expectFormat) + require.NoError(t, err) + verifyPieceHandle(t, reader, expectDataLen, expectTime, expectFormat) + require.NoError(t, reader.Close()) +} + +// Test that the piece store can still read V0 pieces that might be left over from a previous +// version, as well as V1 pieces. +func TestMultipleStorageFormatVersions(t *testing.T) { + ctx := testcontext.New(t) + defer ctx.Cleanup() + + blobs, err := filestore.NewAt(zaptest.NewLogger(t), ctx.Dir("store")) + require.NoError(t, err) + defer ctx.Check(blobs.Close) + + store := pieces.NewStore(zaptest.NewLogger(t), blobs, nil, nil) + + const pieceSize = 1024 + + var ( + data = testrand.Bytes(pieceSize) + satellite = testrand.NodeID() + v0PieceID = testrand.PieceID() + v1PieceID = testrand.PieceID() + now = time.Now().UTC() + ) + + // write a V0 piece + writeAPiece(ctx, t, store, satellite, v0PieceID, data, now, nil, filestore.FormatV0) + + // write a V1 piece + writeAPiece(ctx, t, store, satellite, v1PieceID, data, now, nil, filestore.FormatV1) + + // look up the different pieces with Reader and ReaderWithStorageFormat + tryOpeningAPiece(ctx, t, store, satellite, v0PieceID, len(data), now, filestore.FormatV0) + tryOpeningAPiece(ctx, t, store, satellite, v1PieceID, len(data), now, filestore.FormatV1) + + // write a V1 piece with the same ID as the V0 piece (to simulate it being rewritten as + // V1 during a migration) + differentData := append(data, 111, 104, 97, 105) + writeAPiece(ctx, t, store, satellite, v0PieceID, differentData, now, nil, filestore.FormatV1) + + // if we try to access the piece at that key, we should see only the V1 piece + tryOpeningAPiece(ctx, t, store, satellite, v0PieceID, len(differentData), now, filestore.FormatV1) + + // unless we ask specifically for a V0 piece + reader, err := store.ReaderWithStorageFormat(ctx, satellite, v0PieceID, filestore.FormatV0) + require.NoError(t, err) + verifyPieceHandle(t, reader, len(data), now, filestore.FormatV0) + require.NoError(t, reader.Close()) + + // delete the v0PieceID; both the V0 and the V1 pieces should go away + err = store.Delete(ctx, satellite, v0PieceID) + require.NoError(t, err) + + reader, err = store.Reader(ctx, satellite, v0PieceID) + require.Error(t, err) + require.True(t, os.IsNotExist(err)) + assert.Nil(t, reader) +} + +func TestGetExpired(t *testing.T) { + storagenodedbtest.Run(t, func(t *testing.T, db storagenode.DB) { + ctx := testcontext.New(t) + defer ctx.Cleanup() + + v0PieceInfo, ok := db.V0PieceInfo().(pieces.V0PieceInfoDBForTest) + require.True(t, ok, "V0PieceInfoDB can not satisfy V0PieceInfoDBForTest") + expirationInfo := db.PieceExpirationDB() + + store := pieces.NewStore(zaptest.NewLogger(t), db.Pieces(), v0PieceInfo, expirationInfo) + + now := time.Now().UTC() + testDates := []struct { + years, months, days int + }{ + {-20, -1, -2}, + {1, 6, 14}, + {0, -1, 0}, + {0, 0, 1}, + } + testPieces := make([]pieces.Info, 4) + for p := range testPieces { + testPieces[p] = pieces.Info{ + SatelliteID: testrand.NodeID(), + 
PieceID: testrand.PieceID(), + OrderLimit: &pb.OrderLimit{}, + UplinkPieceHash: &pb.PieceHash{}, + PieceExpiration: now.AddDate(testDates[p].years, testDates[p].months, testDates[p].days), + } + } + + // put testPieces 0 and 1 in the v0 pieceinfo db + err := v0PieceInfo.Add(ctx, &testPieces[0]) + require.NoError(t, err) + err = v0PieceInfo.Add(ctx, &testPieces[1]) + require.NoError(t, err) + + // put testPieces 2 and 3 in the piece_expirations db + err = expirationInfo.SetExpiration(ctx, testPieces[2].SatelliteID, testPieces[2].PieceID, testPieces[2].PieceExpiration) + require.NoError(t, err) + err = expirationInfo.SetExpiration(ctx, testPieces[3].SatelliteID, testPieces[3].PieceID, testPieces[3].PieceExpiration) + require.NoError(t, err) + + // GetExpired with limit 0 gives empty result + expired, err := store.GetExpired(ctx, now, 0) + require.NoError(t, err) + assert.Empty(t, expired) + + // GetExpired with limit 1 gives only 1 result, although there are 2 possible + expired, err = store.GetExpired(ctx, now, 1) + require.NoError(t, err) + require.Len(t, expired, 1) + assert.Equal(t, testPieces[2].PieceID, expired[0].PieceID) + assert.Equal(t, testPieces[2].SatelliteID, expired[0].SatelliteID) + assert.False(t, expired[0].InPieceInfo) + + // GetExpired with 2 or more gives all expired results correctly; one from + // piece_expirations, and one from pieceinfo + expired, err = store.GetExpired(ctx, now, 1000) + require.NoError(t, err) + require.Len(t, expired, 2) + assert.Equal(t, testPieces[2].PieceID, expired[0].PieceID) + assert.Equal(t, testPieces[2].SatelliteID, expired[0].SatelliteID) + assert.False(t, expired[0].InPieceInfo) + assert.Equal(t, testPieces[0].PieceID, expired[1].PieceID) + assert.Equal(t, testPieces[0].SatelliteID, expired[1].SatelliteID) + assert.True(t, expired[1].InPieceInfo) + }) +} + +func TestOverwriteV0WithV1(t *testing.T) { + storagenodedbtest.Run(t, func(t *testing.T, db storagenode.DB) { + ctx := testcontext.New(t) + defer ctx.Cleanup() + + v0PieceInfo, ok := db.V0PieceInfo().(pieces.V0PieceInfoDBForTest) + require.True(t, ok, "V0PieceInfoDB can not satisfy V0PieceInfoDBForTest") + expirationInfo := db.PieceExpirationDB() + + store := pieces.NewStore(zaptest.NewLogger(t), db.Pieces(), v0PieceInfo, expirationInfo) + + satelliteID := testrand.NodeID() + pieceID := testrand.PieceID() + v0Data := testrand.Bytes(4 * memory.MiB) + v1Data := testrand.Bytes(3 * memory.MiB) + + // write the piece as V0. We can't provide the expireTime via writeAPiece, because + // BlobWriter.Commit only knows how to store expiration times in piece_expirations. + v0CreateTime := time.Now().UTC() + v0ExpireTime := v0CreateTime.AddDate(5, 0, 0) + writeAPiece(ctx, t, store, satelliteID, pieceID, v0Data, v0CreateTime, nil, filestore.FormatV0) + // now put the piece in the pieceinfo db directly, because store won't do that for us. + // this is where the expireTime takes effect. 
+ err := v0PieceInfo.Add(ctx, &pieces.Info{ + SatelliteID: satelliteID, + PieceID: pieceID, + PieceSize: int64(len(v0Data)), + PieceCreation: v0CreateTime, + PieceExpiration: v0ExpireTime, + OrderLimit: &pb.OrderLimit{}, + UplinkPieceHash: &pb.PieceHash{}, + }) + require.NoError(t, err) + + // ensure we can see it via store.Reader + { + reader, err := store.Reader(ctx, satelliteID, pieceID) + require.NoError(t, err) + assert.Equal(t, int64(len(v0Data)), reader.Size()) + assert.Equal(t, filestore.FormatV0, reader.StorageFormatVersion()) + gotData, err := ioutil.ReadAll(reader) + require.NoError(t, err) + assert.Equal(t, v0Data, gotData) + require.NoError(t, reader.Close()) + } + + // ensure we can see it via WalkSatellitePieces + calledTimes := 0 + err = store.WalkSatellitePieces(ctx, satelliteID, func(access pieces.StoredPieceAccess) error { + calledTimes++ + require.Equal(t, 1, calledTimes) + gotCreateTime, err := access.CreationTime(ctx) + require.NoError(t, err) + assert.Equal(t, v0CreateTime, gotCreateTime) + gotSize, err := access.ContentSize(ctx) + require.NoError(t, err) + assert.Equal(t, int64(len(v0Data)), gotSize) + return nil + }) + require.NoError(t, err) + + // now "overwrite" the piece (write a new blob with the same id, but with V1 storage) + v1CreateTime := time.Now().UTC() + v1ExpireTime := v1CreateTime.AddDate(5, 0, 0) + writeAPiece(ctx, t, store, satelliteID, pieceID, v1Data, v1CreateTime, &v1ExpireTime, filestore.FormatV1) + + // ensure we can see it (the new piece) via store.Reader + { + reader, err := store.Reader(ctx, satelliteID, pieceID) + require.NoError(t, err) + assert.Equal(t, int64(len(v1Data)), reader.Size()) + assert.Equal(t, filestore.FormatV1, reader.StorageFormatVersion()) + gotData, err := ioutil.ReadAll(reader) + require.NoError(t, err) + assert.Equal(t, v1Data, gotData) + require.NoError(t, reader.Close()) + } + + // now _both_ pieces should show up under WalkSatellitePieces. this may + // be counter-intuitive, but the V0 piece still exists for now (so we can avoid + // hitting the pieceinfo db with every new piece write). I believe this is OK, because + // (a) I don't think that writing different pieces with the same piece ID is a normal + // use case, unless we make a V0->V1 migrator tool, which should know about these + // semantics; (b) the V0 piece should not ever become visible again to the user; it + // should not be possible under normal conditions to delete one without deleting the + // other. 
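As a side note on the walk that follows: a generic consumer of WalkSatellitePieces sees every stored copy, so after the overwrite above the same piece ID is reported once per storage format. Below is a minimal sketch of such a consumer, tallying piece count and content size per format version; the helper and its result type are illustrative assumptions, not code from this change.

package example

import (
    "context"

    "storj.io/storj/pkg/storj"
    "storj.io/storj/storage"
    "storj.io/storj/storagenode/pieces"
)

// formatTally is a hypothetical per-format summary for one satellite.
type formatTally struct {
    count       int
    contentSize int64
}

// tallyByFormat walks every piece stored for the satellite and groups piece
// count and content size by storage format version. Stat failures are skipped
// so the walk stays best effort.
func tallyByFormat(ctx context.Context, store *pieces.Store, satellite storj.NodeID) (map[storage.FormatVersion]formatTally, error) {
    tallies := make(map[storage.FormatVersion]formatTally)
    err := store.WalkSatellitePieces(ctx, satellite, func(access pieces.StoredPieceAccess) error {
        size, err := access.ContentSize(ctx)
        if err != nil {
            return nil // skip pieces we cannot stat
        }
        tally := tallies[access.StorageFormatVersion()]
        tally.count++
        tally.contentSize += size
        tallies[access.StorageFormatVersion()] = tally
        return nil
    })
    return tallies, err
}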
+ calledTimes = 0 + err = store.WalkSatellitePieces(ctx, satelliteID, func(access pieces.StoredPieceAccess) error { + calledTimes++ + switch calledTimes { + case 1: + // expect the V1 piece + assert.Equal(t, pieceID, access.PieceID()) + assert.Equal(t, filestore.FormatV1, access.StorageFormatVersion()) + gotCreateTime, err := access.CreationTime(ctx) + require.NoError(t, err) + assert.Equal(t, v1CreateTime, gotCreateTime) + gotSize, err := access.ContentSize(ctx) + require.NoError(t, err) + assert.Equal(t, int64(len(v1Data)), gotSize) + case 2: + // expect the V0 piece + assert.Equal(t, pieceID, access.PieceID()) + assert.Equal(t, filestore.FormatV0, access.StorageFormatVersion()) + gotCreateTime, err := access.CreationTime(ctx) + require.NoError(t, err) + assert.Equal(t, v0CreateTime, gotCreateTime) + gotSize, err := access.ContentSize(ctx) + require.NoError(t, err) + assert.Equal(t, int64(len(v0Data)), gotSize) + default: + t.Fatalf("calledTimes should be 1 or 2, but it is %d", calledTimes) + } + return nil + }) + require.NoError(t, err) + + // delete the pieceID; this should get both V0 and V1 + err = store.Delete(ctx, satelliteID, pieceID) + require.NoError(t, err) + + err = store.WalkSatellitePieces(ctx, satelliteID, func(access pieces.StoredPieceAccess) error { + t.Fatalf("this should not have been called. pieceID=%x, format=%d", access.PieceID(), access.StorageFormatVersion()) + return nil + }) + require.NoError(t, err) + }) +} diff --git a/storagenode/piecestore/endpoint.go b/storagenode/piecestore/endpoint.go index 69ae8d883..c9589ba37 100644 --- a/storagenode/piecestore/endpoint.go +++ b/storagenode/piecestore/endpoint.go @@ -120,7 +120,6 @@ type Endpoint struct { monitor *monitor.Service store *pieces.Store - pieceinfo pieces.DB orders orders.DB usage bandwidth.DB usedSerials UsedSerials @@ -129,7 +128,7 @@ type Endpoint struct { } // NewEndpoint creates a new piecestore endpoint. 
-func NewEndpoint(log *zap.Logger, signer signing.Signer, trust *trust.Pool, monitor *monitor.Service, store *pieces.Store, pieceinfo pieces.DB, orders orders.DB, usage bandwidth.DB, usedSerials UsedSerials, config Config) (*Endpoint, error) { +func NewEndpoint(log *zap.Logger, signer signing.Signer, trust *trust.Pool, monitor *monitor.Service, store *pieces.Store, orders orders.DB, usage bandwidth.DB, usedSerials UsedSerials, config Config) (*Endpoint, error) { return &Endpoint{ log: log, config: config, @@ -139,7 +138,6 @@ func NewEndpoint(log *zap.Logger, signer signing.Signer, trust *trust.Pool, moni monitor: monitor, store: store, - pieceinfo: pieceinfo, orders: orders, usage: usage, usedSerials: usedSerials, @@ -167,11 +165,7 @@ func (endpoint *Endpoint) Delete(ctx context.Context, delete *pb.PieceDeleteRequ return nil, Error.Wrap(err) } - // TODO: parallelize this and maybe return early - pieceInfoErr := endpoint.pieceinfo.Delete(ctx, delete.Limit.SatelliteId, delete.Limit.PieceId) - pieceErr := endpoint.store.Delete(ctx, delete.Limit.SatelliteId, delete.Limit.PieceId) - - if err := errs.Combine(pieceInfoErr, pieceErr); err != nil { + if err := endpoint.store.Delete(ctx, delete.Limit.SatelliteId, delete.Limit.PieceId); err != nil { // explicitly ignoring error because the errors // TODO: add more debug info endpoint.log.Error("delete failed", zap.Stringer("Piece ID", delete.Limit.PieceId), zap.Error(err)) @@ -327,44 +321,36 @@ func (endpoint *Endpoint) Upload(stream pb.Piecestore_UploadServer) (err error) } if message.Done != nil { - expectedHash := pieceWriter.Hash() - if err := endpoint.VerifyPieceHash(ctx, limit, message.Done, expectedHash); err != nil { + calculatedHash := pieceWriter.Hash() + if err := endpoint.VerifyPieceHash(ctx, limit, message.Done, calculatedHash); err != nil { return err // TODO: report grpc status internal server error } if message.Done.PieceSize != pieceWriter.Size() { return ErrProtocol.New("Size of finished piece does not match size declared by uplink! 
%d != %d", - message.Done.GetPieceSize(), pieceWriter.Size()) + message.Done.PieceSize, pieceWriter.Size()) } - if err := pieceWriter.Commit(ctx); err != nil { - return ErrInternal.Wrap(err) // TODO: report grpc status internal server error - } - - // TODO: do this in a goroutine { - // TODO: maybe this should be as a pieceWriter.Commit(ctx, info) - info := &pieces.Info{ - SatelliteID: limit.SatelliteId, - - PieceID: limit.PieceId, - PieceSize: pieceWriter.Size(), - PieceCreation: limit.OrderCreation, - PieceExpiration: limit.PieceExpiration, - - OrderLimit: limit, - UplinkPieceHash: message.Done, + info := &pb.PieceHeader{ + Hash: calculatedHash, + CreationTime: message.Done.Timestamp, + Signature: message.Done.GetSignature(), + OrderLimit: *limit, } - - if err := endpoint.pieceinfo.Add(ctx, info); err != nil { - ignoreCancelContext := context.Background() - deleteErr := endpoint.store.Delete(ignoreCancelContext, limit.SatelliteId, limit.PieceId) - return ErrInternal.Wrap(errs.Combine(err, deleteErr)) + if err := pieceWriter.Commit(ctx, info); err != nil { + return ErrInternal.Wrap(err) // TODO: report grpc status internal server error + } + if !limit.PieceExpiration.IsZero() { + err := endpoint.store.SetExpiration(ctx, limit.SatelliteId, limit.PieceId, limit.PieceExpiration) + if err != nil { + return ErrInternal.Wrap(err) // TODO: report grpc status internal server error + } } } storageNodeHash, err := signing.SignPieceHash(ctx, endpoint.signer, &pb.PieceHash{ PieceId: limit.PieceId, - Hash: expectedHash, + Hash: calculatedHash, PieceSize: pieceWriter.Size(), Timestamp: time.Now(), }) @@ -589,6 +575,53 @@ func (endpoint *Endpoint) saveOrder(ctx context.Context, limit *pb.OrderLimit, o } } +// ------------------------------------------------------------------------------------------------ +// On the correctness of using access.ModTime() in place of the more precise access.CreationTime() +// in Retain(): +// ------------------------------------------------------------------------------------------------ +// +// Background: for pieces not stored with storage.FormatV0, the access.CreationTime() value can +// only be retrieved by opening the piece file, and reading and unmarshaling the piece header. +// This is far slower than access.ModTime(), which gets the file modification time from the file +// system and only needs to do a stat(2) on the piece file. If we can make Retain() work with +// ModTime, we should. +// +// Possibility of mismatch: We do not force or require piece file modification times to be equal to +// or close to the CreationTime specified by the uplink, but we do expect that piece files will be +// written to the filesystem _after_ the CreationTime. We make the assumption already that storage +// nodes and satellites and uplinks have system clocks that are very roughly in sync (that is, they +// are out of sync with each other by less than an hour of real time, or whatever is configured as +// RetainTimeBuffer). So if an uplink is not lying about CreationTime and it uploads a piece that +// makes it to a storagenode's disk as quickly as possible, even in the worst-synchronized-clocks +// case we can assume that `ModTime > (CreationTime - RetainTimeBuffer)`. We also allow for storage +// node operators doing file system manipulations after a piece has been written. If piece files +// are copied between volumes and their attributes are not preserved, it will be possible for their +// modification times to be changed to something later in time. 
This still preserves the inequality +// relationship mentioned above, `ModTime > (CreationTime - RetainTimeBuffer)`. We only stipulate +// that storage node operators must not artificially change blob file modification times to be in +// the past. +// +// If there is a mismatch: in most cases, a mismatch between ModTime and CreationTime has no +// effect. In certain remaining cases, the only effect is that a piece file which _should_ be +// garbage collected survives until the next round of garbage collection. The only really +// problematic case is when there is a relatively new piece file which was created _after_ this +// node's Retain bloom filter started being built on the satellite, and is recorded in this +// storage node's blob store before the Retain operation has completed. Then, it might be possible +// for that new piece to be garbage collected incorrectly, because it does not show up in the +// bloom filter and the node incorrectly thinks that it was created before the bloom filter. +// But if the uplink is not lying about CreationTime and its clock drift versus the storage node +// is less than `RetainTimeBuffer`, and the ModTime on a blob file is correctly set from the +// storage node system time, then it is still true that `ModTime > (CreationTime - +// RetainTimeBuffer)`. +// +// The rule that storage node operators need to be aware of is only this: do not artificially set +// mtimes on blob files to be in the past. Let the filesystem manage mtimes. If blob files need to +// be moved or copied between locations, and this updates the mtime, that is ok. A secondary effect +// of this rule is that if the storage node's system clock needs to be changed forward by a +// nontrivial amount, mtimes on existing blobs should also be adjusted (by the same interval, +// ideally, but just running "touch" on all blobs is sufficient to avoid incorrect deletion of +// data). + // Retain keeps only piece ids specified in the request func (endpoint *Endpoint) Retain(ctx context.Context, retainReq *pb.RetainRequest) (res *pb.RetainResponse, err error) { defer mon.Task()(&ctx)(&err) @@ -613,48 +646,56 @@ func (endpoint *Endpoint) Retain(ctx context.Context, retainReq *pb.RetainReques return nil, status.Error(codes.InvalidArgument, Error.Wrap(err).Error()) } - const limit = 1000 - cursor := storj.PieceID{} numDeleted := 0 - hasMorePieces := true - for hasMorePieces { - // subtract some time to leave room for clock difference between the satellite and storage node - createdBefore := retainReq.GetCreationDate().Add(-endpoint.config.RetainTimeBuffer) + // subtract some time to leave room for clock difference between the satellite and storage node + createdBefore := retainReq.GetCreationDate().Add(-endpoint.config.RetainTimeBuffer) - pieceIDs, err := endpoint.pieceinfo.GetPieceIDs(ctx, peer.ID, createdBefore, limit, cursor) - if err != nil { - return nil, status.Error(codes.Internal, Error.Wrap(err).Error()) - } - for _, pieceID := range pieceIDs { - cursor = pieceID + endpoint.log.Info("Prepared to run a Retain request.", + zap.Time("createdBefore", createdBefore), + zap.Int64("filterSize", filter.Size()), + zap.String("satellite", peer.ID.String())) - if !filter.Contains(pieceID) { - endpoint.log.Sugar().Debugf("About to delete piece id (%s) from satellite (%s). 
RetainStatus: %s", pieceID.String(), peer.ID.String(), endpoint.config.RetainStatus.String()) - - // if retain status is enabled, delete pieceid - if endpoint.config.RetainStatus == RetainEnabled { - if err = endpoint.store.Delete(ctx, peer.ID, pieceID); err != nil { - endpoint.log.Error("failed to delete a piece", zap.Error(err)) - // continue because if we fail to delete from file system, - // we need to keep the pieceinfo so we can delete next time - continue - } - if err = endpoint.pieceinfo.Delete(ctx, peer.ID, pieceID); err != nil { - endpoint.log.Error("failed to delete piece info", zap.Error(err)) - } - } - - numDeleted++ - } - } - - hasMorePieces = (len(pieceIDs) == limit) - // We call Gosched() here because the GC process is expected to be long and we want to keep it at low priority, + err = endpoint.store.WalkSatellitePieces(ctx, peer.ID, func(access pieces.StoredPieceAccess) error { + // We call Gosched() when done because the GC process is expected to be long and we want to keep it at low priority, // so other goroutines can continue serving requests. - runtime.Gosched() - } + defer runtime.Gosched() + // See the comment above the Retain() function for a discussion on the correctness + // of using ModTime in place of the more precise CreationTime. + mTime, err := access.ModTime(ctx) + if err != nil { + endpoint.log.Error("failed to determine mtime of blob", zap.Error(err)) + // but continue iterating. + return nil + } + if !mTime.Before(createdBefore) { + return nil + } + pieceID := access.PieceID() + if !filter.Contains(pieceID) { + endpoint.log.Debug("About to delete piece id", + zap.String("satellite", peer.ID.String()), + zap.String("pieceID", pieceID.String()), + zap.String("retainStatus", endpoint.config.RetainStatus.String())) + // if retain status is enabled, delete pieceid + if endpoint.config.RetainStatus == RetainEnabled { + if err = endpoint.store.Delete(ctx, peer.ID, pieceID); err != nil { + endpoint.log.Error("failed to delete piece", + zap.String("satellite", peer.ID.String()), + zap.String("pieceID", pieceID.String()), + zap.Error(err)) + return nil + } + } + numDeleted++ + } + return nil + }) + if err != nil { + return nil, status.Error(codes.Internal, Error.Wrap(err).Error()) + } + mon.IntVal("garbage_collection_pieces_deleted").Observe(int64(numDeleted)) endpoint.log.Sugar().Debugf("Deleted %d pieces during retain. 
RetainStatus: %s", numDeleted, endpoint.config.RetainStatus.String()) return &pb.RetainResponse{}, nil diff --git a/storagenode/piecestore/endpoint_test.go b/storagenode/piecestore/endpoint_test.go index 42e86f81f..857156cb8 100644 --- a/storagenode/piecestore/endpoint_test.go +++ b/storagenode/piecestore/endpoint_test.go @@ -4,6 +4,7 @@ package piecestore_test import ( + "context" "crypto/tls" "crypto/x509" "io" @@ -235,7 +236,7 @@ func TestDownload(t *testing.T) { { // should err with piece ID not specified pieceID: storj.PieceID{2}, action: pb.PieceAction_GET, - errs: []string{"no such file or directory", "The system cannot find the path specified"}, + errs: []string{"file does not exist", "The system cannot find the path specified"}, }, { // should successfully download data pieceID: orderLimit.PieceId, @@ -506,8 +507,7 @@ func TestRetain(t *testing.T) { storagenodedbtest.Run(t, func(t *testing.T, db storagenode.DB) { ctx := testcontext.New(t) defer ctx.Cleanup() - pieceInfos := db.PieceInfo() - store := pieces.NewStore(zaptest.NewLogger(t), db.Pieces()) + store := pieces.NewStore(zaptest.NewLogger(t), db.Pieces(), db.V0PieceInfo(), db.PieceExpirationDB()) const numPieces = 1000 const numPiecesToKeep = 990 @@ -532,15 +532,15 @@ func TestRetain(t *testing.T) { require.NoError(t, err) uplink := testidentity.MustPregeneratedSignedIdentity(3, storj.LatestIDVersion()) - endpointEnabled, err := ps.NewEndpoint(zaptest.NewLogger(t), nil, trusted, nil, store, pieceInfos, nil, nil, nil, ps.Config{ + endpointEnabled, err := ps.NewEndpoint(zaptest.NewLogger(t), nil, trusted, nil, store, nil, nil, nil, ps.Config{ RetainStatus: ps.RetainEnabled, }) require.NoError(t, err) - endpointDisabled, err := ps.NewEndpoint(zaptest.NewLogger(t), nil, trusted, nil, store, pieceInfos, nil, nil, nil, ps.Config{ + endpointDisabled, err := ps.NewEndpoint(zaptest.NewLogger(t), nil, trusted, nil, store, nil, nil, nil, ps.Config{ RetainStatus: ps.RetainDisabled, }) require.NoError(t, err) - endpointDebug, err := ps.NewEndpoint(zaptest.NewLogger(t), nil, trusted, nil, store, pieceInfos, nil, nil, nil, ps.Config{ + endpointDebug, err := ps.NewEndpoint(zaptest.NewLogger(t), nil, trusted, nil, store, nil, nil, nil, ps.Config{ RetainStatus: ps.RetainDebug, }) require.NoError(t, err) @@ -597,10 +597,11 @@ func TestRetain(t *testing.T) { OrderLimit: &pb.OrderLimit{}, } - err = pieceInfos.Add(ctx, &pieceinfo0) + v0db := store.GetV0PieceInfoDB().(pieces.V0PieceInfoDBForTest) + err = v0db.Add(ctx, &pieceinfo0) require.NoError(t, err) - err = pieceInfos.Add(ctx, &pieceinfo1) + err = v0db.Add(ctx, &pieceinfo1) require.NoError(t, err) } @@ -624,11 +625,11 @@ func TestRetain(t *testing.T) { _, err = endpointDebug.Retain(ctxSatellite0, &retainReq) require.NoError(t, err) - satellite1Pieces, err := pieceInfos.GetPieceIDs(ctx, satellite1.ID, recentTime.Add(time.Duration(5)*time.Second), numPieces, storj.PieceID{}) + satellite1Pieces, err := getAllPieceIDs(ctx, store, satellite1.ID, recentTime.Add(time.Duration(5)*time.Second)) require.NoError(t, err) require.Equal(t, numPieces, len(satellite1Pieces)) - satellite0Pieces, err := pieceInfos.GetPieceIDs(ctx, satellite0.ID, recentTime.Add(time.Duration(5)*time.Second), numPieces, storj.PieceID{}) + satellite0Pieces, err := getAllPieceIDs(ctx, store, satellite0.ID, recentTime.Add(time.Duration(5)*time.Second)) require.NoError(t, err) require.Equal(t, numPieces, len(satellite0Pieces)) @@ -637,13 +638,13 @@ func TestRetain(t *testing.T) { require.NoError(t, err) // check we have deleted 
nothing for satellite1 - satellite1Pieces, err = pieceInfos.GetPieceIDs(ctx, satellite1.ID, recentTime.Add(time.Duration(5)*time.Second), numPieces, storj.PieceID{}) + satellite1Pieces, err = getAllPieceIDs(ctx, store, satellite1.ID, recentTime.Add(time.Duration(5)*time.Second)) require.NoError(t, err) require.Equal(t, numPieces, len(satellite1Pieces)) // check we did not delete recent pieces or retained pieces for satellite0 // also check that we deleted the correct pieces for satellite0 - satellite0Pieces, err = pieceInfos.GetPieceIDs(ctx, satellite0.ID, recentTime.Add(time.Duration(5)*time.Second), numPieces, storj.PieceID{}) + satellite0Pieces, err = getAllPieceIDs(ctx, store, satellite0.ID, recentTime.Add(time.Duration(5)*time.Second)) require.NoError(t, err) require.Equal(t, numPieces-numOldPieces, len(satellite0Pieces)) @@ -661,6 +662,21 @@ func TestRetain(t *testing.T) { }) } +func getAllPieceIDs(ctx context.Context, store *pieces.Store, satellite storj.NodeID, createdBefore time.Time) (pieceIDs []storj.PieceID, err error) { + err = store.WalkSatellitePieces(ctx, satellite, func(pieceAccess pieces.StoredPieceAccess) error { + mTime, err := pieceAccess.CreationTime(ctx) + if err != nil { + return err + } + if !mTime.Before(createdBefore) { + return nil + } + pieceIDs = append(pieceIDs, pieceAccess.PieceID()) + return nil + }) + return pieceIDs, err +} + // generateTestIDs generates n piece ids func generateTestIDs(n int) []storj.PieceID { ids := make([]storj.PieceID, n) diff --git a/storagenode/piecestore/verification_test.go b/storagenode/piecestore/verification_test.go index 725396004..f573861c6 100644 --- a/storagenode/piecestore/verification_test.go +++ b/storagenode/piecestore/verification_test.go @@ -5,7 +5,6 @@ package piecestore_test import ( "context" - "fmt" "testing" "time" @@ -26,7 +25,8 @@ import ( const oneWeek = 7 * 24 * time.Hour func TestOrderLimitPutValidation(t *testing.T) { - for i, tt := range []struct { + for _, tt := range []struct { + testName string useUnknownSatellite bool pieceID storj.PieceID action pb.PieceAction @@ -38,7 +38,8 @@ func TestOrderLimitPutValidation(t *testing.T) { availableSpace int64 err string }{ - { // unapproved satellite id + { + testName: "unapproved satellite id", useUnknownSatellite: true, pieceID: storj.PieceID{1}, action: pb.PieceAction_PUT, @@ -48,7 +49,8 @@ func TestOrderLimitPutValidation(t *testing.T) { limit: memory.KiB.Int64(), err: " is untrusted", }, - { // approved satellite id + { + testName: "approved satellite id", pieceID: storj.PieceID{2}, action: pb.PieceAction_PUT, serialNumber: storj.SerialNumber{2}, @@ -56,7 +58,8 @@ func TestOrderLimitPutValidation(t *testing.T) { orderExpiration: oneWeek, limit: 10 * memory.KiB.Int64(), }, - { // wrong action type + { + testName: "wrong action type", pieceID: storj.PieceID{3}, action: pb.PieceAction_GET, serialNumber: storj.SerialNumber{3}, @@ -65,7 +68,8 @@ func TestOrderLimitPutValidation(t *testing.T) { limit: memory.KiB.Int64(), err: "expected put or put repair action got GET", }, - { // piece expired + { + testName: "piece expired", pieceID: storj.PieceID{4}, action: pb.PieceAction_PUT, serialNumber: storj.SerialNumber{4}, @@ -74,7 +78,8 @@ func TestOrderLimitPutValidation(t *testing.T) { limit: memory.KiB.Int64(), err: "piece expired:", }, - { // limit is negative + { + testName: "limit is negative", pieceID: storj.PieceID{5}, action: pb.PieceAction_PUT, serialNumber: storj.SerialNumber{5}, @@ -83,7 +88,8 @@ func TestOrderLimitPutValidation(t *testing.T) { limit: -1, 
err: "order limit is negative", }, - { // order limit expired + { + testName: "order limit expired", pieceID: storj.PieceID{6}, action: pb.PieceAction_PUT, serialNumber: storj.SerialNumber{6}, @@ -92,7 +98,8 @@ func TestOrderLimitPutValidation(t *testing.T) { limit: memory.KiB.Int64(), err: "order expired:", }, - { // allocated bandwidth limit + { + testName: "allocated bandwidth limit", pieceID: storj.PieceID{7}, action: pb.PieceAction_PUT, serialNumber: storj.SerialNumber{7}, @@ -102,7 +109,8 @@ func TestOrderLimitPutValidation(t *testing.T) { availableBandwidth: 5 * memory.KiB.Int64(), err: "out of bandwidth", }, - { // allocated space limit + { + testName: "allocated space limit", pieceID: storj.PieceID{8}, action: pb.PieceAction_PUT, serialNumber: storj.SerialNumber{8}, @@ -113,69 +121,71 @@ func TestOrderLimitPutValidation(t *testing.T) { err: "out of space", }, } { - ctx := testcontext.New(t) - defer ctx.Cleanup() + tt := tt + t.Run(tt.testName, func(t *testing.T) { + ctx := testcontext.New(t) + defer ctx.Cleanup() - planet, err := testplanet.New(t, 1, 1, 1) - require.NoError(t, err) - defer ctx.Check(planet.Shutdown) - - planet.Start(ctx) - - // set desirable bandwidth - setBandwidth(ctx, t, planet, tt.availableBandwidth) - // set desirable space - setSpace(ctx, t, planet, tt.availableSpace) - - client, err := planet.Uplinks[0].DialPiecestore(ctx, planet.StorageNodes[0]) - require.NoError(t, err) - defer ctx.Check(client.Close) - - signer := signing.SignerFromFullIdentity(planet.Satellites[0].Identity) - satellite := planet.Satellites[0].Identity - if tt.useUnknownSatellite { - unapprovedSatellite, err := planet.NewIdentity() + planet, err := testplanet.New(t, 1, 1, 1) require.NoError(t, err) - signer = signing.SignerFromFullIdentity(unapprovedSatellite) - satellite = unapprovedSatellite - } + defer ctx.Check(planet.Shutdown) - orderLimit, piecePrivateKey := GenerateOrderLimit( - t, - satellite.ID, - planet.StorageNodes[0].ID(), - tt.pieceID, - tt.action, - tt.serialNumber, - tt.pieceExpiration, - tt.orderExpiration, - tt.limit, - ) + planet.Start(ctx) - orderLimit, err = signing.SignOrderLimit(ctx, signer, orderLimit) - require.NoError(t, err) + // set desirable bandwidth + setBandwidth(ctx, t, planet, tt.availableBandwidth) + // set desirable space + setSpace(ctx, t, planet, tt.availableSpace) - uploader, err := client.Upload(ctx, orderLimit, piecePrivateKey) - require.NoError(t, err) + client, err := planet.Uplinks[0].DialPiecestore(ctx, planet.StorageNodes[0]) + require.NoError(t, err) + defer ctx.Check(client.Close) - var writeErr error - buffer := make([]byte, memory.KiB) - for i := 0; i < 10; i++ { - testrand.Read(buffer) - _, writeErr = uploader.Write(buffer) - if writeErr != nil { - break + signer := signing.SignerFromFullIdentity(planet.Satellites[0].Identity) + satellite := planet.Satellites[0].Identity + if tt.useUnknownSatellite { + unapprovedSatellite, err := planet.NewIdentity() + require.NoError(t, err) + signer = signing.SignerFromFullIdentity(unapprovedSatellite) + satellite = unapprovedSatellite } - } - _, commitErr := uploader.Commit(ctx) - err = errs.Combine(writeErr, commitErr) - testIndex := fmt.Sprintf("#%d", i) - if tt.err != "" { - require.Error(t, err, testIndex) - require.Contains(t, err.Error(), tt.err, testIndex) - } else { - require.NoError(t, err, testIndex) - } + + orderLimit, piecePrivateKey := GenerateOrderLimit( + t, + satellite.ID, + planet.StorageNodes[0].ID(), + tt.pieceID, + tt.action, + tt.serialNumber, + tt.pieceExpiration, + 
tt.orderExpiration, + tt.limit, + ) + + orderLimit, err = signing.SignOrderLimit(ctx, signer, orderLimit) + require.NoError(t, err) + + uploader, err := client.Upload(ctx, orderLimit, piecePrivateKey) + require.NoError(t, err) + + var writeErr error + buffer := make([]byte, memory.KiB) + for i := 0; i < 10; i++ { + testrand.Read(buffer) + _, writeErr = uploader.Write(buffer) + if writeErr != nil { + break + } + } + _, commitErr := uploader.Commit(ctx) + err = errs.Combine(writeErr, commitErr) + if tt.err != "" { + require.Error(t, err) + require.Contains(t, err.Error(), tt.err) + } else { + require.NoError(t, err) + } + }) } } @@ -318,17 +328,6 @@ func setSpace(ctx context.Context, t *testing.T, planet *testplanet.Planet, spac for _, storageNode := range planet.StorageNodes { availableSpace, err := storageNode.Storage2.Monitor.AvailableSpace(ctx) require.NoError(t, err) - diff := (space - availableSpace) * -1 - now := time.Now() - err = storageNode.DB.PieceInfo().Add(ctx, &pieces.Info{ - SatelliteID: planet.Satellites[0].ID(), - PieceID: storj.PieceID{99}, - PieceSize: diff, - PieceCreation: now, - PieceExpiration: time.Time{}, - OrderLimit: &pb.OrderLimit{}, - UplinkPieceHash: &pb.PieceHash{}, - }) - require.NoError(t, err) + pieces.StoreForTest{Store: storageNode.Storage2.Store}.ReserveSpace(availableSpace - space) } } diff --git a/storagenode/storagenodedb/database.go b/storagenode/storagenodedb/database.go index eb008ac44..52324ddf3 100644 --- a/storagenode/storagenodedb/database.go +++ b/storagenode/storagenodedb/database.go @@ -54,7 +54,7 @@ func New(log *zap.Logger, config Config) (*DB, error) { if err != nil { return nil, err } - pieces := filestore.New(piecesDir) + pieces := filestore.New(log, piecesDir) infodb, err := newInfo(config.Info2) if err != nil { @@ -85,7 +85,7 @@ func NewTest(log *zap.Logger, storageDir string) (*DB, error) { if err != nil { return nil, err } - pieces := filestore.New(piecesDir) + pieces := filestore.New(log, piecesDir) infodb, err := NewInfoTest() if err != nil { diff --git a/storagenode/storagenodedb/infodb.go b/storagenode/storagenodedb/infodb.go index a2ff9875c..a16681dea 100644 --- a/storagenode/storagenodedb/infodb.go +++ b/storagenode/storagenodedb/infodb.go @@ -15,7 +15,7 @@ import ( "github.com/zeebo/errs" "go.uber.org/zap" - monkit "gopkg.in/spacemonkeygo/monkit.v2" + "gopkg.in/spacemonkeygo/monkit.v2" "storj.io/storj/internal/dbutil" "storj.io/storj/internal/dbutil/utccheck" @@ -52,10 +52,11 @@ type SQLDB interface { // InfoDB implements information database for piecestore. type InfoDB struct { - db SQLDB - bandwidthdb bandwidthdb - pieceinfo pieceinfo - location string + db SQLDB + bandwidthdb bandwidthdb + v0PieceInfo v0PieceInfo + pieceExpirationDB pieceExpirationDB + location string } // newInfo creates or opens InfoDB at the specified path. 
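Stepping back to the Retain() rewrite in storagenode/piecestore/endpoint.go above: the long comment there argues that ModTime is a safe stand-in for CreationTime as long as pieces newer than (bloom filter creation date minus RetainTimeBuffer) are never judged by the filter. The sketch below condenses that eligibility decision; it is hypothetical, and the bloom filter is abstracted as a plain predicate so the example stays self-contained.

package example

import (
    "context"
    "time"

    "storj.io/storj/pkg/storj"
    "storj.io/storj/storagenode/pieces"
)

// retainCandidates walks a satellite's pieces and returns the IDs the Retain
// logic would consider deletable: pieces whose mtime predates
// (filterCreation - retainTimeBuffer) and whose ID the bloom filter does not
// contain. inFilter stands in for the real bloom filter's Contains method.
func retainCandidates(ctx context.Context, store *pieces.Store, satellite storj.NodeID,
    filterCreation time.Time, retainTimeBuffer time.Duration,
    inFilter func(storj.PieceID) bool) (candidates []storj.PieceID, err error) {

    // leave room for clock drift between satellite, uplinks, and this node
    createdBefore := filterCreation.Add(-retainTimeBuffer)

    err = store.WalkSatellitePieces(ctx, satellite, func(access pieces.StoredPieceAccess) error {
        mTime, err := access.ModTime(ctx)
        if err != nil {
            return nil // cannot stat the blob; do not risk deleting it
        }
        if !mTime.Before(createdBefore) {
            return nil // too new to be judged by this bloom filter
        }
        if inFilter(access.PieceID()) {
            return nil // the satellite still knows about this piece
        }
        candidates = append(candidates, access.PieceID())
        return nil
    })
    return candidates, err
}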
@@ -72,8 +73,9 @@ func newInfo(path string) (*InfoDB, error) { dbutil.Configure(db, mon) infoDb := &InfoDB{db: db} - infoDb.pieceinfo = pieceinfo{InfoDB: infoDb} + infoDb.v0PieceInfo = v0PieceInfo{InfoDB: infoDb} infoDb.bandwidthdb = bandwidthdb{InfoDB: infoDb} + infoDb.pieceExpirationDB = pieceExpirationDB{InfoDB: infoDb} infoDb.location = path return infoDb, nil @@ -99,8 +101,9 @@ func NewInfoTest() (*InfoDB, error) { })) infoDb := &InfoDB{db: utccheck.New(db)} - infoDb.pieceinfo = pieceinfo{InfoDB: infoDb} + infoDb.v0PieceInfo = v0PieceInfo{InfoDB: infoDb} infoDb.bandwidthdb = bandwidthdb{InfoDB: infoDb} + infoDb.pieceExpirationDB = pieceExpirationDB{InfoDB: infoDb} return infoDb, nil } @@ -404,6 +407,22 @@ func (db *InfoDB) Migration() *migrate.Migration { return nil }), }, + { + Description: "Start piece_expirations table, deprecate pieceinfo table", + Version: 15, + Action: migrate.SQL{ + // new table to hold expiration data (and only expirations. no other pieceinfo) + `CREATE TABLE piece_expirations ( + satellite_id BLOB NOT NULL, + piece_id BLOB NOT NULL, + piece_expiration TIMESTAMP NOT NULL, -- date when it can be deleted + deletion_failed_at TIMESTAMP, + PRIMARY KEY (satellite_id, piece_id) + )`, + `CREATE INDEX idx_piece_expirations_piece_expiration ON piece_expirations(piece_expiration)`, + `CREATE INDEX idx_piece_expirations_deletion_failed_at ON piece_expirations(deletion_failed_at)`, + }, + }, }, } } diff --git a/storagenode/storagenodedb/pieceexpiration.go b/storagenode/storagenodedb/pieceexpiration.go new file mode 100644 index 000000000..4722d469f --- /dev/null +++ b/storagenode/storagenodedb/pieceexpiration.go @@ -0,0 +1,99 @@ +// Copyright (C) 2019 Storj Labs, Inc. +// See LICENSE for copying information. + +package storagenodedb + +import ( + "context" + "time" + + "github.com/zeebo/errs" + + "storj.io/storj/pkg/storj" + "storj.io/storj/storagenode/pieces" +) + +type pieceExpirationDB struct { + *InfoDB +} + +// PieceExpirationDB returns database for storing piece expiration data +func (db *DB) PieceExpirationDB() pieces.PieceExpirationDB { return db.info.PieceExpirationDB() } + +// PieceExpirationDB returns database for storing piece expiration data +func (db *InfoDB) PieceExpirationDB() pieces.PieceExpirationDB { return &db.pieceExpirationDB } + +// GetExpired gets piece IDs that expire or have expired before the given time +func (db *pieceExpirationDB) GetExpired(ctx context.Context, expiresBefore time.Time, limit int64) (expiredPieceIDs []pieces.ExpiredInfo, err error) { + defer mon.Task()(&ctx)(&err) + + rows, err := db.db.QueryContext(ctx, ` + SELECT satellite_id, piece_id + FROM piece_expirations + WHERE piece_expiration < ? + AND ((deletion_failed_at IS NULL) OR deletion_failed_at <> ?) + LIMIT ? 
+ `, expiresBefore.UTC(), expiresBefore.UTC(), limit) + if err != nil { + return nil, ErrInfo.Wrap(err) + } + defer func() { err = errs.Combine(err, rows.Close()) }() + + for rows.Next() { + var satelliteID storj.NodeID + var pieceID storj.PieceID + err = rows.Scan(&satelliteID, &pieceID) + if err != nil { + return nil, ErrInfo.Wrap(err) + } + expiredPieceIDs = append(expiredPieceIDs, pieces.ExpiredInfo{ + SatelliteID: satelliteID, + PieceID: pieceID, + InPieceInfo: false, + }) + } + return expiredPieceIDs, nil +} + +// SetExpiration sets an expiration time for the given piece ID on the given satellite +func (db *pieceExpirationDB) SetExpiration(ctx context.Context, satellite storj.NodeID, pieceID storj.PieceID, expiresAt time.Time) (err error) { + defer mon.Task()(&ctx)(&err) + + _, err = db.db.ExecContext(ctx, ` + INSERT INTO piece_expirations(satellite_id, piece_id, piece_expiration) + VALUES (?,?,?) + `, satellite, pieceID, expiresAt.UTC()) + return ErrInfo.Wrap(err) +} + +// DeleteExpiration removes an expiration record for the given piece ID on the given satellite +func (db *pieceExpirationDB) DeleteExpiration(ctx context.Context, satelliteID storj.NodeID, pieceID storj.PieceID) (found bool, err error) { + defer mon.Task()(&ctx)(&err) + + result, err := db.db.ExecContext(ctx, ` + DELETE FROM piece_expirations + WHERE satellite_id = ? AND piece_id = ? + `, satelliteID, pieceID) + if err != nil { + return false, err + } + numRows, err := result.RowsAffected() + if err != nil { + return false, err + } + return numRows > 0, nil +} + +// DeleteFailed marks an expiration record as having experienced a failure in deleting the piece +// from the disk +func (db *pieceExpirationDB) DeleteFailed(ctx context.Context, satelliteID storj.NodeID, pieceID storj.PieceID, when time.Time) (err error) { + defer mon.Task()(&ctx)(&err) + + _, err = db.db.ExecContext(ctx, ` + UPDATE piece_expirations + SET deletion_failed_at = ? + WHERE satellite_id = ? + AND piece_id = ? + `, when.UTC(), satelliteID, pieceID) + return ErrInfo.Wrap(err) +} diff --git a/storagenode/storagenodedb/pieceinfo.go b/storagenode/storagenodedb/pieceinfo.go index 36c36ab6d..608df2994 100644 --- a/storagenode/storagenodedb/pieceinfo.go +++ b/storagenode/storagenodedb/pieceinfo.go @@ -5,9 +5,7 @@ package storagenodedb import ( "context" - "database/sql" - "sync" - "sync/atomic" + "os" "time" "github.com/gogo/protobuf/proto" @@ -15,25 +13,23 @@ import ( "storj.io/storj/pkg/pb" "storj.io/storj/pkg/storj" + "storj.io/storj/storage" + "storj.io/storj/storage/filestore" "storj.io/storj/storagenode/pieces" ) -type pieceinfo struct { - // Moved to top of struct to resolve alignment issue with atomic operations on ARM - usedSpace int64 - loadSpaceOnce sync.Once - +type v0PieceInfo struct { *InfoDB } -// PieceInfo returns database for storing piece information -func (db *DB) PieceInfo() pieces.DB { return db.info.PieceInfo() } +// V0PieceInfo returns database for storing piece information +func (db *DB) V0PieceInfo() pieces.V0PieceInfoDB { return db.info.V0PieceInfo() } -// PieceInfo returns database for storing piece information -func (db *InfoDB) PieceInfo() pieces.DB { return &db.pieceinfo } +// V0PieceInfo returns database for storing piece information +func (db *InfoDB) V0PieceInfo() pieces.V0PieceInfoDB { return &db.v0PieceInfo } // Add inserts piece information into the database. 
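The pieceExpirationDB.SetExpiration implementation above backs the store.SetExpiration call that the new upload path makes instead of writing a pieceinfo row. Condensing the endpoint.go changes earlier in this diff, the flow is roughly: commit the blob together with a pb.PieceHeader, then record any expiration in piece_expirations. The sketch below assumes pieces.Store exposes a Writer constructor analogous to the test-only WriterForFormatVersion, and it simplifies the error handling.

package example

import (
    "context"

    "storj.io/storj/pkg/pb"
    "storj.io/storj/storagenode/pieces"
)

// storePiece is a hypothetical, condensed version of the storage node's new
// upload path: piece data and its attributes are committed together in a piece
// header, and the expiration (when present) goes to the expiration db.
func storePiece(ctx context.Context, store *pieces.Store, limit *pb.OrderLimit, uplinkHash *pb.PieceHash, data []byte) (err error) {
    writer, err := store.Writer(ctx, limit.SatelliteId, limit.PieceId)
    if err != nil {
        return err
    }
    defer func() {
        if err != nil {
            _ = writer.Cancel(ctx) // best-effort cleanup of the partial write
        }
    }()

    if _, err := writer.Write(data); err != nil {
        return err
    }

    // the header is stored alongside the piece content (storage format V1)
    header := &pb.PieceHeader{
        Hash:         writer.Hash(),
        CreationTime: uplinkHash.Timestamp,
        Signature:    uplinkHash.GetSignature(),
        OrderLimit:   *limit,
    }
    if err := writer.Commit(ctx, header); err != nil {
        return err
    }

    // expirations now live in piece_expirations rather than pieceinfo
    if !limit.PieceExpiration.IsZero() {
        return store.SetExpiration(ctx, limit.SatelliteId, limit.PieceId, limit.PieceExpiration)
    }
    return nil
}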
-func (db *pieceinfo) Add(ctx context.Context, info *pieces.Info) (err error) { +func (db *v0PieceInfo) Add(ctx context.Context, info *pieces.Info) (err error) { defer mon.Task()(&ctx)(&err) orderLimit, err := proto.Marshal(info.OrderLimit) @@ -59,41 +55,65 @@ func (db *pieceinfo) Add(ctx context.Context, info *pieces.Info) (err error) { VALUES (?,?,?,?,?,?,?,?) `), info.SatelliteID, info.PieceID, info.PieceSize, info.PieceCreation.UTC(), pieceExpiration, orderLimit, uplinkPieceHash, 0) - if err == nil { - db.loadSpaceUsed(ctx) - atomic.AddInt64(&db.usedSpace, info.PieceSize) - } return ErrInfo.Wrap(err) } -// GetPieceIDs gets pieceIDs using the satelliteID -func (db *pieceinfo) GetPieceIDs(ctx context.Context, satelliteID storj.NodeID, createdBefore time.Time, limit int, cursor storj.PieceID) (pieceIDs []storj.PieceID, err error) { - defer mon.Task()(&ctx)(&err) - +func (db *v0PieceInfo) getAllPiecesOwnedBy(ctx context.Context, blobStore storage.Blobs, satelliteID storj.NodeID) ([]v0StoredPieceAccess, error) { rows, err := db.db.QueryContext(ctx, db.Rebind(` - SELECT piece_id + SELECT piece_id, piece_size, piece_creation, piece_expiration FROM pieceinfo_ - WHERE satellite_id = ? AND datetime(piece_creation) < datetime(?) AND piece_id > ? + WHERE satellite_id = ? ORDER BY piece_id - LIMIT ? - `), satelliteID, createdBefore.UTC(), cursor, limit) + `), satelliteID) if err != nil { return nil, ErrInfo.Wrap(err) } defer func() { err = errs.Combine(err, rows.Close()) }() + var pieceInfos []v0StoredPieceAccess for rows.Next() { - var pieceID storj.PieceID - err = rows.Scan(&pieceID) + pieceInfos = append(pieceInfos, v0StoredPieceAccess{ + blobStore: blobStore, + satellite: satelliteID, + }) + thisAccess := &pieceInfos[len(pieceInfos)-1] + err = rows.Scan(&thisAccess.pieceID, &thisAccess.pieceSize, &thisAccess.creationTime, &thisAccess.expirationTime) if err != nil { - return pieceIDs, ErrInfo.Wrap(err) + return nil, ErrInfo.Wrap(err) } - pieceIDs = append(pieceIDs, pieceID) } - return pieceIDs, nil + return pieceInfos, nil +} + +// WalkSatelliteV0Pieces executes walkFunc for each locally stored piece, stored with storage +// format V0 in the namespace of the given satellite. If walkFunc returns a non-nil error, +// WalkSatelliteV0Pieces will stop iterating and return the error immediately. The ctx parameter +// parameter is intended specifically to allow canceling iteration early. +// +// If blobStore is nil, the .Stat() and .FullPath() methods of the provided StoredPieceAccess +// instances will not work, but otherwise everything should be ok. +func (db *v0PieceInfo) WalkSatelliteV0Pieces(ctx context.Context, blobStore storage.Blobs, satelliteID storj.NodeID, walkFunc func(pieces.StoredPieceAccess) error) (err error) { + defer mon.Task()(&ctx)(&err) + + // TODO: is it worth paging this query? we hope that SNs will not yet have too many V0 pieces. + pieceInfos, err := db.getAllPiecesOwnedBy(ctx, blobStore, satelliteID) + if err != nil { + return err + } + // note we must not keep a transaction open with the db when calling walkFunc; the callback + // might need to make db calls as well + for i := range pieceInfos { + if err := ctx.Err(); err != nil { + return err + } + if err := walkFunc(&pieceInfos[i]); err != nil { + return err + } + } + return nil } // Get gets piece information by satellite id and piece id. 
-func (db *pieceinfo) Get(ctx context.Context, satelliteID storj.NodeID, pieceID storj.PieceID) (_ *pieces.Info, err error) { +func (db *v0PieceInfo) Get(ctx context.Context, satelliteID storj.NodeID, pieceID storj.PieceID) (_ *pieces.Info, err error) { defer mon.Task()(&ctx)(&err) info := &pieces.Info{} info.SatelliteID = satelliteID @@ -132,36 +152,20 @@ func (db *pieceinfo) Get(ctx context.Context, satelliteID storj.NodeID, pieceID } // Delete deletes piece information. -func (db *pieceinfo) Delete(ctx context.Context, satelliteID storj.NodeID, pieceID storj.PieceID) (err error) { +func (db *v0PieceInfo) Delete(ctx context.Context, satelliteID storj.NodeID, pieceID storj.PieceID) (err error) { defer mon.Task()(&ctx)(&err) - var pieceSize int64 - err = db.db.QueryRowContext(ctx, db.Rebind(` - SELECT piece_size - FROM pieceinfo_ - WHERE satellite_id = ? AND piece_id = ? - `), satelliteID, pieceID).Scan(&pieceSize) - // Ignore no rows found errors - if err != nil && err != sql.ErrNoRows { - return ErrInfo.Wrap(err) - } _, err = db.db.ExecContext(ctx, db.Rebind(` DELETE FROM pieceinfo_ WHERE satellite_id = ? AND piece_id = ? `), satelliteID, pieceID) - if pieceSize != 0 && err == nil { - db.loadSpaceUsed(ctx) - - atomic.AddInt64(&db.usedSpace, -pieceSize) - } - return ErrInfo.Wrap(err) } // DeleteFailed marks piece as a failed deletion. -func (db *pieceinfo) DeleteFailed(ctx context.Context, satelliteID storj.NodeID, pieceID storj.PieceID, now time.Time) (err error) { +func (db *v0PieceInfo) DeleteFailed(ctx context.Context, satelliteID storj.NodeID, pieceID storj.PieceID, now time.Time) (err error) { defer mon.Task()(&ctx)(&err) _, err = db.db.ExecContext(ctx, db.Rebind(` @@ -174,12 +178,12 @@ func (db *pieceinfo) DeleteFailed(ctx context.Context, satelliteID storj.NodeID, return ErrInfo.Wrap(err) } -// GetExpired gets pieceinformation identites that are expired. -func (db *pieceinfo) GetExpired(ctx context.Context, expiredAt time.Time, limit int64) (infos []pieces.ExpiredInfo, err error) { +// GetExpired gets ExpiredInfo records for pieces that are expired. +func (db *v0PieceInfo) GetExpired(ctx context.Context, expiredAt time.Time, limit int64) (infos []pieces.ExpiredInfo, err error) { defer mon.Task()(&ctx)(&err) rows, err := db.db.QueryContext(ctx, db.Rebind(` - SELECT satellite_id, piece_id, piece_size + SELECT satellite_id, piece_id FROM pieceinfo_ WHERE piece_expiration IS NOT NULL AND piece_expiration < ? 
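With GetExpired implemented for both sources, the piece_expirations table earlier in this diff and the v0 pieceinfo query being trimmed here, pieces.Store.GetExpired can present a single merged view, which is what TestGetExpired exercises. Below is a rough sketch of how a periodic collector might consume it; the helper name, single-batch behaviour, and logging are assumptions for illustration.

package example

import (
    "context"
    "log"
    "time"

    "storj.io/storj/storagenode/pieces"
)

// collectExpiredOnce is a hypothetical helper: it asks the store for up to
// limit pieces whose expiration has passed and deletes their blobs.
// ExpiredInfo.InPieceInfo records whether the row came from the legacy v0
// pieceinfo table or from piece_expirations.
func collectExpiredOnce(ctx context.Context, store *pieces.Store, limit int64) (deleted int, err error) {
    expired, err := store.GetExpired(ctx, time.Now().UTC(), limit)
    if err != nil {
        return 0, err
    }
    for _, info := range expired {
        if err := store.Delete(ctx, info.SatelliteID, info.PieceID); err != nil {
            // best effort: log and continue; a real collector would also record
            // the failure so the row can be retried or skipped later
            log.Printf("delete of expired piece %s failed (inPieceInfo=%v): %v",
                info.PieceID.String(), info.InPieceInfo, err)
            continue
        }
        deleted++
    }
    return deleted, nil
}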
@@ -192,8 +196,8 @@ func (db *pieceinfo) GetExpired(ctx context.Context, expiredAt time.Time, limit } defer func() { err = errs.Combine(err, rows.Close()) }() for rows.Next() { - info := pieces.ExpiredInfo{} - err = rows.Scan(&info.SatelliteID, &info.PieceID, &info.PieceSize) + info := pieces.ExpiredInfo{InPieceInfo: true} + err = rows.Scan(&info.SatelliteID, &info.PieceID) if err != nil { return infos, ErrInfo.Wrap(err) } @@ -202,50 +206,83 @@ func (db *pieceinfo) GetExpired(ctx context.Context, expiredAt time.Time, limit return infos, nil } -// SpaceUsed returns disk space used by all pieces from cache -func (db *pieceinfo) SpaceUsed(ctx context.Context) (_ int64, err error) { - defer mon.Task()(&ctx)(&err) - db.loadSpaceUsed(ctx) - - return atomic.LoadInt64(&db.usedSpace), nil +type v0StoredPieceAccess struct { + blobStore storage.Blobs + satellite storj.NodeID + pieceID storj.PieceID + pieceSize int64 + creationTime time.Time + expirationTime *time.Time + blobInfo storage.BlobInfo } -func (db *pieceinfo) loadSpaceUsed(ctx context.Context) { - defer mon.Task()(&ctx)(nil) - db.loadSpaceOnce.Do(func() { - usedSpace, _ := db.CalculatedSpaceUsed(ctx) - atomic.AddInt64(&db.usedSpace, usedSpace) - }) +// PieceID returns the piece ID for the piece +func (v0Access v0StoredPieceAccess) PieceID() storj.PieceID { + return v0Access.pieceID } -// CalculatedSpaceUsed calculates disk space used by all pieces -func (db *pieceinfo) CalculatedSpaceUsed(ctx context.Context) (_ int64, err error) { - defer mon.Task()(&ctx)(&err) - var sum sql.NullInt64 - err = db.db.QueryRowContext(ctx, db.Rebind(` - SELECT SUM(piece_size) - FROM pieceinfo_ - `)).Scan(&sum) +// Satellite returns the satellite ID that owns the piece +func (v0Access v0StoredPieceAccess) Satellite() (storj.NodeID, error) { + return v0Access.satellite, nil +} - if err == sql.ErrNoRows || !sum.Valid { - return 0, nil +// BlobRef returns the relevant storage.BlobRef locator for the piece +func (v0Access v0StoredPieceAccess) BlobRef() storage.BlobRef { + return storage.BlobRef{ + Namespace: v0Access.satellite.Bytes(), + Key: v0Access.pieceID.Bytes(), } - return sum.Int64, err } -// SpaceUsed calculates disk space used by all pieces -func (db *pieceinfo) SpaceUsedBySatellite(ctx context.Context, satelliteID storj.NodeID) (_ int64, err error) { - defer mon.Task()(&ctx)(&err) - - var sum sql.NullInt64 - err = db.db.QueryRowContext(ctx, db.Rebind(` - SELECT SUM(piece_size) - FROM pieceinfo_ - WHERE satellite_id = ? - `), satelliteID).Scan(&sum) - - if err == sql.ErrNoRows || !sum.Valid { - return 0, nil +func (v0Access v0StoredPieceAccess) fillInBlobAccess(ctx context.Context) error { + if v0Access.blobInfo == nil { + if v0Access.blobStore == nil { + return errs.New("this v0StoredPieceAccess instance has no blobStore reference, and cannot look up the relevant blob") + } + blobInfo, err := v0Access.blobStore.StatWithStorageFormat(ctx, v0Access.BlobRef(), v0Access.StorageFormatVersion()) + if err != nil { + return err + } + v0Access.blobInfo = blobInfo } - return sum.Int64, err + return nil +} + +// ContentSize gives the size of the piece content (not including the piece header, if applicable) +func (v0Access v0StoredPieceAccess) ContentSize(ctx context.Context) (int64, error) { + return v0Access.pieceSize, nil +} + +// CreationTime returns the piece creation time as given in the original order (which is not +// necessarily the same as the file mtime). 
+func (v0Access v0StoredPieceAccess) CreationTime(ctx context.Context) (time.Time, error) { + return v0Access.creationTime, nil +} + +// ModTime returns the same thing as CreationTime for V0 blobs. The intent is for ModTime to +// be a little faster when CreationTime is too slow and the precision is not needed, but in +// this case we already have the exact creation time from the database. +func (v0Access v0StoredPieceAccess) ModTime(ctx context.Context) (time.Time, error) { + return v0Access.creationTime, nil +} + +// FullPath gives the full path to the on-disk blob file +func (v0Access v0StoredPieceAccess) FullPath(ctx context.Context) (string, error) { + if err := v0Access.fillInBlobAccess(ctx); err != nil { + return "", err + } + return v0Access.blobInfo.FullPath(ctx) +} + +// StorageFormatVersion indicates the storage format version used to store the piece +func (v0Access v0StoredPieceAccess) StorageFormatVersion() storage.FormatVersion { + return filestore.FormatV0 +} + +// Stat does a stat on the on-disk blob file +func (v0Access v0StoredPieceAccess) Stat(ctx context.Context) (os.FileInfo, error) { + if err := v0Access.fillInBlobAccess(ctx); err != nil { + return nil, err + } + return v0Access.blobInfo.Stat(ctx) } diff --git a/storagenode/storagenodedb/pieceinfo_test.go b/storagenode/storagenodedb/pieceinfo_test.go deleted file mode 100644 index bbb12a288..000000000 --- a/storagenode/storagenodedb/pieceinfo_test.go +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (C) 2019 Storj Labs, Inc. -// See LICENSE for copying information. - -package storagenodedb_test - -import ( - "testing" - "time" - - "github.com/stretchr/testify/require" - - "storj.io/storj/internal/testcontext" - "storj.io/storj/internal/testidentity" - "storj.io/storj/internal/testrand" - "storj.io/storj/pkg/pb" - "storj.io/storj/pkg/signing" - "storj.io/storj/pkg/storj" - "storj.io/storj/storagenode" - "storj.io/storj/storagenode/pieces" - "storj.io/storj/storagenode/storagenodedb/storagenodedbtest" -) - -// TestGetPieceIDs does the following: -// * Create 90 pieces -// * Request 50 pieces starting from the beginning. Expect 50 pieces. -// * Request 50 pieces starting from the end of the previous request. Expect 40 pieces. -// * Request 50 pieces starting from the end of the previous request. Expect 0 pieces. 
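The cursor-based GetPieceIDs query and the TestGetPieceIDs test being removed here are superseded by the walkers introduced in this change (WalkSatelliteV0Pieces for V0 rows, and WalkSatellitePieces on the piece store for all formats). A rough equivalent of the old listing is sketched below; it assumes WalkSatelliteV0Pieces is exposed on the pieces.V0PieceInfoDB interface (the interface itself is not shown in this excerpt) and passes a nil blob store since only the piece ID is used, which per the doc comment above only disables Stat() and FullPath().

package example

import (
    "context"

    "storj.io/storj/pkg/storj"
    "storj.io/storj/storagenode/pieces"
)

// listV0PieceIDs returns the IDs of all V0-format pieces stored for the given
// satellite, roughly what the removed GetPieceIDs paging query provided.
func listV0PieceIDs(ctx context.Context, db pieces.V0PieceInfoDB, satellite storj.NodeID) (ids []storj.PieceID, err error) {
    err = db.WalkSatelliteV0Pieces(ctx, nil, satellite, func(access pieces.StoredPieceAccess) error {
        ids = append(ids, access.PieceID())
        return nil
    })
    return ids, err
}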
-func TestGetPieceIDs(t *testing.T) { - storagenodedbtest.Run(t, func(t *testing.T, db storagenode.DB) { - ctx := testcontext.New(t) - defer ctx.Cleanup() - - pieceInfos := db.PieceInfo() - - satellite := testidentity.MustPregeneratedSignedIdentity(0, storj.LatestIDVersion()) - uplink := testidentity.MustPregeneratedSignedIdentity(3, storj.LatestIDVersion()) - totalPieces := 90 - for i := 0; i < totalPieces; i++ { - newID := testrand.PieceID() - - pieceHash, err := signing.SignPieceHash(ctx, - signing.SignerFromFullIdentity(uplink), - &pb.PieceHash{ - PieceId: newID, - Hash: []byte{0, 2, 3, 4, 5}, - }) - require.NoError(t, err) - - err = pieceInfos.Add(ctx, &pieces.Info{ - SatelliteID: satellite.ID, - PieceSize: 4, - PieceID: newID, - PieceCreation: time.Now().Add(-time.Minute), - UplinkPieceHash: pieceHash, - OrderLimit: &pb.OrderLimit{}, - }) - require.NoError(t, err) - } - - seen := make(map[storj.PieceID]bool) - - requestSize := 50 - cursor := storj.PieceID{} - - pieceIDs, err := pieceInfos.GetPieceIDs(ctx, satellite.ID, time.Now(), requestSize, cursor) - require.NoError(t, err) - require.Len(t, pieceIDs, 50) - for _, id := range pieceIDs { - require.False(t, seen[id]) - seen[id] = true - cursor = id - } - - pieceIDs, err = pieceInfos.GetPieceIDs(ctx, satellite.ID, time.Now(), requestSize, cursor) - require.NoError(t, err) - require.Len(t, pieceIDs, 40) - for _, id := range pieceIDs { - require.False(t, seen[id]) - seen[id] = true - cursor = id - } - - pieceIDs, err = pieceInfos.GetPieceIDs(ctx, satellite.ID, time.Now(), requestSize, cursor) - require.NoError(t, err) - require.Len(t, pieceIDs, 0) - }) -} diff --git a/storagenode/storagenodedb/testdata/sqlite.v15.sql b/storagenode/storagenodedb/testdata/sqlite.v15.sql new file mode 100644 index 000000000..dc1b842a1 --- /dev/null +++ b/storagenode/storagenodedb/testdata/sqlite.v15.sql @@ -0,0 +1,151 @@ +-- table for keeping serials that need to be verified against +CREATE TABLE used_serial_ ( + satellite_id BLOB NOT NULL, + serial_number BLOB NOT NULL, + expiration TIMESTAMP NOT NULL +); +-- primary key on satellite id and serial number +CREATE UNIQUE INDEX pk_used_serial_ ON used_serial_(satellite_id, serial_number); +-- expiration index to allow fast deletion +CREATE INDEX idx_used_serial_ ON used_serial_(expiration); + +-- certificate table for storing uplink/satellite certificates +CREATE TABLE certificate ( + cert_id INTEGER +); + +-- table for storing piece meta info +CREATE TABLE pieceinfo_ ( + satellite_id BLOB NOT NULL, + piece_id BLOB NOT NULL, + piece_size BIGINT NOT NULL, + piece_expiration TIMESTAMP, + + order_limit BLOB NOT NULL, + uplink_piece_hash BLOB NOT NULL, + uplink_cert_id INTEGER NOT NULL, + + deletion_failed_at TIMESTAMP, + piece_creation TIMESTAMP NOT NULL, + + FOREIGN KEY(uplink_cert_id) REFERENCES certificate(cert_id) +); +-- primary key by satellite id and piece id +CREATE UNIQUE INDEX pk_pieceinfo_ ON pieceinfo_(satellite_id, piece_id); +-- fast queries for expiration for pieces that have one +CREATE INDEX idx_pieceinfo__expiration ON pieceinfo_(piece_expiration) WHERE piece_expiration IS NOT NULL; + +-- table for storing bandwidth usage +CREATE TABLE bandwidth_usage ( + satellite_id BLOB NOT NULL, + action INTEGER NOT NULL, + amount BIGINT NOT NULL, + created_at TIMESTAMP NOT NULL +); +CREATE INDEX idx_bandwidth_usage_satellite ON bandwidth_usage(satellite_id); +CREATE INDEX idx_bandwidth_usage_created ON bandwidth_usage(created_at); + +-- table for storing all unsent orders +CREATE TABLE unsent_order ( + 
satellite_id BLOB NOT NULL, + serial_number BLOB NOT NULL, + + order_limit_serialized BLOB NOT NULL, + order_serialized BLOB NOT NULL, + order_limit_expiration TIMESTAMP NOT NULL, + + uplink_cert_id INTEGER NOT NULL, + + FOREIGN KEY(uplink_cert_id) REFERENCES certificate(cert_id) +); +CREATE UNIQUE INDEX idx_orders ON unsent_order(satellite_id, serial_number); + +-- table for storing all sent orders +CREATE TABLE order_archive_ ( + satellite_id BLOB NOT NULL, + serial_number BLOB NOT NULL, + + order_limit_serialized BLOB NOT NULL, + order_serialized BLOB NOT NULL, + + uplink_cert_id INTEGER NOT NULL, + + status INTEGER NOT NULL, + archived_at TIMESTAMP NOT NULL, + + FOREIGN KEY(uplink_cert_id) REFERENCES certificate(cert_id) +); + +-- table for storing vouchers +CREATE TABLE vouchers ( + satellite_id BLOB PRIMARY KEY NOT NULL, + voucher_serialized BLOB NOT NULL, + expiration TIMESTAMP NOT NULL +); + +CREATE TABLE bandwidth_usage_rollups ( + interval_start TIMESTAMP NOT NULL, + satellite_id BLOB NOT NULL, + action INTEGER NOT NULL, + amount BIGINT NOT NULL, + PRIMARY KEY ( interval_start, satellite_id, action ) +); + +-- table to hold expiration data (and only expirations. no other pieceinfo) +CREATE TABLE piece_expirations ( + satellite_id BLOB NOT NULL, + piece_id BLOB NOT NULL, + piece_expiration TIMESTAMP NOT NULL, -- date when it can be deleted + deletion_failed_at TIMESTAMP, + PRIMARY KEY ( satellite_id, piece_id ) +); +CREATE INDEX idx_piece_expirations_piece_expiration ON piece_expirations(piece_expiration); +CREATE INDEX idx_piece_expirations_deletion_failed_at ON piece_expirations(deletion_failed_at); + +INSERT INTO unsent_order VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',X'1eddef484b4c03f01332279032796972',X'0a101eddef484b4c03f0133227903279697212202b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf410001a201968996e7ef170a402fdfd88b6753df792c063c07c555905ffac9cd3cbd1c00022200ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac30002a20d00cf14f3c68b56321ace04902dec0484eb6f9098b22b31c6b3f82db249f191630643802420c08dfeb88e50510a8c1a5b9034a0c08dfeb88e50510a8c1a5b9035246304402204df59dc6f5d1bb7217105efbc9b3604d19189af37a81efbf16258e5d7db5549e02203bb4ead16e6e7f10f658558c22b59c3339911841e8dbaae6e2dea821f7326894',X'0a101eddef484b4c03f0133227903279697210321a47304502206d4c106ddec88140414bac5979c95bdea7de2e0ecc5be766e08f7d5ea36641a7022100e932ff858f15885ffa52d07e260c2c25d3861810ea6157956c1793ad0c906284','2019-04-01 16:01:35.9254586+00:00',1); + +INSERT INTO bandwidth_usage VALUES(X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',0,0,'2019-04-01 18:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',0,0,'2019-04-01 20:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',1,1,'2019-04-01 18:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',1,1,'2019-04-01 20:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',2,2,'2019-04-01 18:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',2,2,'2019-04-01 20:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',3,3,'2019-04-01 18:51:24.1074772+00:00'); 
+INSERT INTO bandwidth_usage VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',3,3,'2019-04-01 20:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',4,4,'2019-04-01 18:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',4,4,'2019-04-01 20:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',5,5,'2019-04-01 18:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',5,5,'2019-04-01 20:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',6,6,'2019-04-01 18:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',6,6,'2019-04-01 20:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',1,1,'2019-04-01 18:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',1,1,'2019-04-01 20:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',2,2,'2019-04-01 18:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',2,2,'2019-04-01 20:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',3,3,'2019-04-01 18:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',3,3,'2019-04-01 20:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',4,4,'2019-04-01 18:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',4,4,'2019-04-01 20:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',5,5,'2019-04-01 18:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',5,5,'2019-04-01 20:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',6,6,'2019-04-01 18:51:24.1074772+00:00'); +INSERT INTO bandwidth_usage VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',6,6,'2019-04-01 20:51:24.1074772+00:00'); + +INSERT INTO vouchers VALUES(X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000', X'd5e757fd8d207d1c46583fb58330f803dc961b71147308ff75ff1e72a0df6b0b', '2019-07-04 00:00:00.000000+00:00'); + +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 18:00:00+00:00',X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',0,0); +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 20:00:00+00:00',X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',0,0); +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 18:00:00+00:00',X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',1,1); +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 
20:00:00+00:00',X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',1,1); +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 18:00:00+00:00',X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',2,2); +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 20:00:00+00:00',X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',2,2); +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 18:00:00+00:00',X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',3,3); +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 20:00:00+00:00',X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',3,3); +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 18:00:00+00:00',X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',4,4); +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 20:00:00+00:00',X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',4,4); +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 18:00:00+00:00',X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',5,5); +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 20:00:00+00:00',X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',5,5); +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 18:00:00+00:00',X'0ed28abb2813e184a1e98b0f6605c4911ea468c7e8433eb583e0fca7ceac3000',6,6); +INSERT INTO bandwidth_usage_rollups VALUES('2019-07-12 20:00:00+00:00',X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',6,6); + +-- NEW DATA --
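For orientation, the statements below are a minimal, hypothetical sketch of the kind of queries the new piece_expirations table and its two indexes are built for (record an expiration, find expired pieces, note a failed deletion). The piece ID and timestamps are invented for illustration and are not part of the v15 fixture above.

-- illustration only, not fixture data: record when a piece may be deleted
INSERT INTO piece_expirations (satellite_id, piece_id, piece_expiration)
VALUES (X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000',
        X'0101010101010101010101010101010101010101010101010101010101010101', -- hypothetical piece id
        '2019-08-01 00:00:00+00:00');

-- find pieces past expiration that have not already failed deletion;
-- idx_piece_expirations_piece_expiration keeps this scan cheap
SELECT satellite_id, piece_id
FROM piece_expirations
WHERE piece_expiration <= '2019-08-02 00:00:00+00:00'
  AND deletion_failed_at IS NULL;

-- if deleting the blob fails, remember when, so the piece can be retried later
UPDATE piece_expirations
SET deletion_failed_at = '2019-08-02 00:05:00+00:00'
WHERE satellite_id = X'2b3a5863a41f25408a8f5348839d7a1361dbd886d75786bb139a8ca0bdf41000'
  AND piece_id = X'0101010101010101010101010101010101010101010101010101010101010101';

Keeping expirations in this narrow table, rather than in pieceinfo_, decouples expiration tracking from the per-piece order-limit and hash blobs that pieceinfo_ carries.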