storage/filestore: better error message on data corruption

A user on the forum was seeing the error "bad message", which was not
very helpful. This came from the ext4 filesystem, which uses the code
EBADMSG to indicate that it detected an invalid CRC, suggesting disk
corruption.

This change adds some explanatory information about probable disk
corruption to all errors coming from the (*blobInfo).Stat() call, which
is where storagenode fs corruption problems will usually manifest.

Refs: https://github.com/storj/storj/issues/5375
Change-Id: I87f4a800236050415c4191ef1a0fc952f9def315
This commit is contained in:
paul cannon 2023-01-30 08:39:31 -06:00
parent ed7c82439d
commit 2f04e20627
4 changed files with 82 additions and 0 deletions

View File

@ -8,6 +8,7 @@ import (
"context" "context"
"encoding/base32" "encoding/base32"
"errors" "errors"
"fmt"
"io" "io"
"math" "math"
"os" "os"
@ -871,6 +872,12 @@ func (info *blobInfo) Stat(ctx context.Context) (os.FileInfo, error) {
if info.fileInfo == nil { if info.fileInfo == nil {
fileInfo, err := os.Lstat(info.path) fileInfo, err := os.Lstat(info.path)
if err != nil { if err != nil {
if os.IsNotExist(err) {
return nil, err
}
if isLowLevelCorruptionError(err) {
return nil, &CorruptDataError{path: info.path, error: err}
}
return nil, err return nil, err
} }
if fileInfo.Mode().IsDir() { if fileInfo.Mode().IsDir() {
@ -884,3 +891,27 @@ func (info *blobInfo) Stat(ctx context.Context) (os.FileInfo, error) {
func (info *blobInfo) FullPath(ctx context.Context) (string, error) { func (info *blobInfo) FullPath(ctx context.Context) (string, error) {
return info.path, nil return info.path, nil
} }
// CorruptDataError is the error reported when a filesystem or disk failure
// points at data corruption.
//
// It is a dedicated type (rather than a formatted string) so that the
// explanatory message and the original error can travel together: the
// message is produced by Error, the original error is exposed via Unwrap.
type CorruptDataError struct {
	// path is the location that was being accessed when the failure occurred.
	path string
	// error is the underlying error being wrapped.
	error error
}

// Unwrap returns the wrapped underlying error, which lets errors.Is and
// errors.As look through a CorruptDataError.
func (e CorruptDataError) Unwrap() error {
	return e.error
}

// Path reports the path that was being accessed when the error occurred.
func (e CorruptDataError) Path() string {
	return e.path
}

// Error renders the explanatory message, embedding both the path and the
// wrapped error.
func (e CorruptDataError) Error() string {
	const format = "unrecoverable error accessing data on the storage file system (path=%v; error=%v). This is most likely due to disk bad sectors or a corrupted file system. Check your disk for bad sectors and integrity"
	return fmt.Sprintf(format, e.path, e.error)
}

View File

@ -0,0 +1,23 @@
// Copyright (C) 2023 Storj Labs, Inc.
// See LICENSE for copying information.
//go:build !unix
// +build !unix
package filestore
import (
"errors"
"os"
"strings"
)
// isLowLevelCorruptionError reports whether err should be treated as a sign
// of filesystem or disk corruption.
//
// In this (non-unix) build the only signal used is an *os.PathError anywhere
// in err's chain whose operation is "lstat"; no error codes are inspected.
// A nil err yields false.
func isLowLevelCorruptionError(err error) bool {
	// The operation name is compared case-insensitively because Go reports it
	// inconsistently: "lstat" on Linux and "Lstat" on Windows. EqualFold does
	// this without allocating a lowered copy of the string.
	var perr *os.PathError
	return errors.As(err, &perr) && strings.EqualFold(perr.Op, "lstat")
}

View File

@ -0,0 +1,27 @@
// Copyright (C) 2023 Storj Labs, Inc.
// See LICENSE for copying information.
//go:build unix
package filestore
import (
"errors"
"os"
"syscall"
)
// isLowLevelCorruptionError reports whether err should be treated as a sign
// of filesystem or disk corruption.
//
// It returns true when err's chain contains either an *os.PathError whose
// operation is "lstat", or a syscall.Errno equal to EBADMSG or EIO.
// Anything else, including a nil err, yields false.
func isLowLevelCorruptionError(err error) bool {
	// Any failed lstat counts, regardless of the underlying cause.
	var pathErr *os.PathError
	if errors.As(err, &pathErr) && pathErr.Op == "lstat" {
		return true
	}

	// Otherwise only specific low-level error codes qualify.
	var code syscall.Errno
	if !errors.As(err, &code) {
		return false
	}
	return code == syscall.EBADMSG || code == syscall.EIO
}

View File

@ -398,6 +398,7 @@ func (s *Service) retainPieces(ctx context.Context, req Request) (err error) {
// piece was deleted while we were scanning. // piece was deleted while we were scanning.
return nil return nil
} }
piecesSkipped++ piecesSkipped++
s.log.Warn("failed to determine mtime of blob", zap.Error(err)) s.log.Warn("failed to determine mtime of blob", zap.Error(err))
// but continue iterating. // but continue iterating.