Test that network stalls don't cause indefinite hangs in uplink (#1530)

* test-network-stalls tests... network stalls!

in particular, right now it just tests whether an uplink correctly
times out after some amount of time when one of the storage nodes
it's talking to suddenly goes offline.

This tool is meant to be run under `storj-sim network test`.
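For example, a run might look like this (sketch; the script's actual
repo path here is a guess, not taken from this change):

    storj-sim -x network test go run scripts/test-network-stalls.go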

Also included here:

* fix storj-sim-related test scripts on Mac

the default storj config dir on Mac has a space in it
('~/Library/Application Support/Storj'), which breaks everywhere it
shows up as an unquoted variable in an sh/bash script. that would be
easy enough to fix as a one-off, but quoting bash vars avoids a dozen
other potential problems too.
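for illustration (not itself part of the patch), the failure mode and
the fix look like this:

    dir="$HOME/Library/Application Support/Storj"
    ls $dir     # unquoted: word-splits into '.../Application' and 'Support/Storj'
    ls "$dir"   # quoted: a single argument, works as intended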

change a few places that used `head -c` to use `dd` instead. `head -c`
works, but is not as widely understood (as evidenced by each of these
scripts getting through code review, one at a time, with comments that
didn't match the byte counts actually used: `head -c 1024`, for
instance, was commented as creating a 1mb file).
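for instance (illustration only, mirroring the random_bytes_file helper
added in the scripts below), the dd spelling keeps the size arithmetic
visible:

    # 1mb of random bytes; bs=1x1024x1024 makes the intended size obvious
    dd if=/dev/urandom of="$TMP_DIR/small-upload-testfile" count=1 bs=1x1024x1024 >/dev/null 2>&1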

* storj-sim reports PIDs of worker processes to the tests it runs

so that the tests can cause unfortunate "accidents" to befall the worker
processes in the course of the test, and find out whether everything
reacts correctly.
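a test script run under storj-sim can then do something like this
(sketch; STORAGENODE_0_PID is one of the variables this change exports):

    kill -STOP "$STORAGENODE_0_PID"   # freeze the first storage node
    sleep 10                          # ...while the test exercises uplink
    kill -CONT "$STORAGENODE_0_PID"   # let it resume afterward

the new test-network-stalls tool does the equivalent from Go, sending
SIGSTOP/SIGCONT to the reported PID.
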
Author: paul cannon (committed via GitHub)
Date:   2019-03-20 08:58:07 -06:00
Commit: 1d78ddc3df (parent 0432f8f14d)
6 changed files with 312 additions and 70 deletions

View File

@@ -11,6 +11,7 @@ import (
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"syscall"
"time"
@@ -80,6 +81,7 @@ type Info struct {
Address string
Directory string
ID string
Pid int
Extra []string
}
@@ -110,6 +112,9 @@ func (info *Info) Env() []string {
if info.Directory != "" {
env = append(env, name+"_DIR="+info.Directory)
}
if info.Pid != 0 {
env = append(env, name+"_PID="+strconv.Itoa(info.Pid))
}
for _, extra := range info.Extra {
env = append(env, name+"_"+extra)
}
@@ -228,6 +233,7 @@ func (process *Process) Exec(ctx context.Context, command string) (err error) {
if err != nil {
return err
}
process.Info.Pid = cmd.Process.Pid
if command == "setup" || process.Address == "" {
// during setup we aren't starting the addresses, so we can release the dependencies immediately

View File

@@ -44,13 +44,18 @@ echo "=> Secret Key: $secret_access_key"
export AWS_ACCESS_KEY_ID="$access_key_id"
export AWS_SECRET_ACCESS_KEY="$secret_access_key"
aws configure set default.region us-east-1
echo "=> Making test files"
head -c 1024 </dev/urandom > "$TMP_DIR/small-upload-testfile" # create 1mb file of random bytes (inline)
head -c 5120 </dev/urandom > "$TMP_DIR/big-upload-testfile" # create 5mb file of random bytes (remote)
head -c 5 </dev/urandom > "$TMP_DIR/multipart-upload-testfile" # create 5kb file of random bytes (remote)
random_bytes_file(){
size=$1
output=$2
dd if=/dev/urandom of="$output" count=1 bs="$size" >/dev/null 2>&1
}
random_bytes_file 1x1024x1024 "$TMP_DIR/small-upload-testfile" # create 1mb file of random bytes (inline)
random_bytes_file 5x1024x1024 "$TMP_DIR/big-upload-testfile" # create 5mb file of random bytes (remote)
random_bytes_file 5x1024 "$TMP_DIR/multipart-upload-testfile" # create 5kb file of random bytes (remote)
echo "=> Making bucket"
aws s3 --endpoint=http://localhost:7777/ mb s3://bucket
@@ -89,24 +94,24 @@ aws s3 --endpoint=http://localhost:7777/ rb s3://bucket --force
echo "=> Comparing test files downloaded with uploaded versions"
if cmp "$TMP_DIR/small-upload-testfile" "$CMP_DIR/small-download-testfile"
then
echo "Downloaded file matches uploaded file"
echo "Downloaded file matches uploaded file"
else
echo "Downloaded file does not match uploaded file"
exit 1
echo "Downloaded file does not match uploaded file"
exit 1
fi
if cmp "$TMP_DIR/big-upload-testfile" "$CMP_DIR/big-download-testfile"
then
echo "Downloaded file matches uploaded file"
echo "Downloaded file matches uploaded file"
else
echo "Downloaded file does not match uploaded file"
exit 1
echo "Downloaded file does not match uploaded file"
exit 1
fi
if cmp "$TMP_DIR/multipart-upload-testfile" "$CMP_DIR/multipart-download-testfile"
then
echo "Downloaded file matches uploaded file"
echo "Downloaded file matches uploaded file"
else
echo "Downloaded file does not match uploaded file"
exit 1
echo "Downloaded file does not match uploaded file"
exit 1
fi

View File

@@ -0,0 +1,230 @@
// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
// +build ignore
// Tests whether the uplink tool correctly times out when one of the storage nodes it's talking to
// suddenly stops responding. In particular, this currently tests that happening during a Delete
// operation, because that is where we have observed indefinite hangs before.
package main
import (
"bytes"
"context"
"flag"
"fmt"
"io"
"io/ioutil"
"math/rand"
"os"
"os/exec"
"path/filepath"
"regexp"
"strconv"
"strings"
"syscall"
"time"
"github.com/zeebo/errs"
"storj.io/storj/internal/memory"
)
var (
numTries = flag.Int("num-tries", 20, "number of tries to cause a hang")
bucketName = flag.String("bucket", "bukkit", "name of bucket to use for test")
deleteTimeout = flag.Duration("timeout", 60*time.Second, "how long to wait for a delete to succeed or time out")
fileSize memory.Size = 5 * memory.MiB
tryAgain = errs.New("test needs to run again")
)
func init() {
flag.Var(&fileSize, "file-size", "size of test file to use")
}
type randDefaultSource struct{}
func (randSource *randDefaultSource) Read(p []byte) (int, error) {
return rand.Read(p)
}
func makeRandomContentsFile(path string, size memory.Size) (err error) {
outFile, err := os.Create(path)
if err != nil {
return err
}
defer func() {
err = errs.Combine(err, outFile.Close())
}()
if _, err := io.CopyN(outFile, &randDefaultSource{}, int64(size)); err != nil {
return err
}
return nil
}
type uplinkRunner struct {
execName string
configDir string
logLevel string
}
// Run runs the uplink executable with the given arguments, and hands back its
// output as well as an error if there were any problems with the execution or if
// the uplink exited non-zero.
func (ur *uplinkRunner) Run(ctx context.Context, args ...string) ([]byte, error) {
if ctx == nil {
ctx = context.Background()
}
cmdArgs := []string{"--config-dir", ur.configDir, "--log.level", ur.logLevel}
cmdArgs = append(cmdArgs, args...)
cmd := exec.CommandContext(ctx, ur.execName, cmdArgs...)
return cmd.CombinedOutput()
}
// skip the first four whitespace-delimited fields and keep the rest
var lsOutputRegexp = regexp.MustCompile(`(?m)^\s*(?:\S+\s+){4}(.*)$`)
func (ur *uplinkRunner) doesRemoteExist(remotePath string) (bool, error) {
pathParts := strings.Split(remotePath, "/")
if len(pathParts) < 2 {
return false, errs.New("invalid remote path %q", remotePath)
}
bucketAndDir := strings.Join(pathParts[:len(pathParts)-1], "/")
filenamePart := []byte(pathParts[len(pathParts)-1])
output, err := ur.Run(nil, "ls", bucketAndDir)
if err != nil {
return false, err
}
for _, matches := range lsOutputRegexp.FindAllSubmatch(output, -1) {
if bytes.Equal(matches[1], filenamePart) {
return true, nil
}
}
return false, nil
}
func storeFileAndCheck(uplink *uplinkRunner, srcFile, dstFile string) error {
if _, err := uplink.Run(nil, "cp", srcFile, dstFile); err != nil {
return errs.New("Could not copy file into storj-sim network: %v", err)
}
if exists, err := uplink.doesRemoteExist(dstFile); err != nil {
return errs.New("Could not check if file exists: %v", err)
} else if !exists {
return errs.New("Copied file not present in storj-sim network!")
}
return nil
}
func stallNode(ctx context.Context, proc *os.Process) {
// send node a SIGSTOP, which causes it to freeze as if being traced
proc.Signal(syscall.SIGSTOP)
// until the context is done
<-ctx.Done()
// then let the node continue again
proc.Signal(syscall.SIGCONT)
}
func deleteWhileStallingAndCheck(uplink *uplinkRunner, dstFile string, nodeProc *os.Process) error {
ctx, cancel := context.WithTimeout(context.Background(), *deleteTimeout)
defer cancel()
go stallNode(ctx, nodeProc)
output, err := uplink.Run(ctx, "rm", dstFile)
if err != nil {
if ctx.Err() == context.DeadlineExceeded {
// (uplink did not time out, but this test did)
return errs.New("uplink DID NOT time out waiting for stalled node 0 while issuing a delete")
}
return errs.New("Unexpected error trying to delete file %q from storj-sim network: %v", dstFile, err)
}
if exists, err := uplink.doesRemoteExist(dstFile); err != nil {
return errs.New("Failed to check if remote file %q was deleted: %v", dstFile, err)
} else if exists {
return errs.New("Deleted file still present in storj-sim network!")
}
if strings.Contains(string(output), "context deadline exceeded") {
// the uplink correctly timed out when one of the target nodes was stalled! all is well
return nil
}
// delete worked fine, which means our stall didn't hit at the right time and we need to try again
return tryAgain
}
func runTest() error {
// check run environment
configDir := os.Getenv("GATEWAY_0_DIR")
if configDir == "" {
return errs.New("This test should be run under storj-sim test ($GATEWAY_0_DIR not found).")
}
nodePid, err := strconv.Atoi(os.Getenv("STORAGENODE_0_PID"))
if err != nil {
return errs.New("Empty or invalid $STORAGENODE_0_PID: %v", err)
}
nodeProc, err := os.FindProcess(nodePid)
if err != nil {
return errs.New("No such process %v! $STORAGENODE_0_PID is wrong", nodePid)
}
// set up test
uplink := &uplinkRunner{
execName: "uplink",
configDir: configDir,
logLevel: "error",
}
tempDir, err := ioutil.TempDir("", "storj-test-network-stalls.")
if err != nil {
return err
}
bucket := "sj://" + *bucketName
srcFile := filepath.Join(tempDir, "to-storj-sim")
dstFile := bucket + "/in-storj-sim"
if err := makeRandomContentsFile(srcFile, fileSize); err != nil {
return errs.New("could not create test file with random contents: %v", err)
}
if _, err := uplink.Run(nil, "mb", bucket); err != nil {
return errs.New("could not create test bucket: %v", err)
}
defer func() {
// explicitly ignoring errors here; we don't much care if they fail,
// because this is best-effort
_, _ = uplink.Run(nil, "rm", dstFile)
_, _ = uplink.Run(nil, "rb", bucket)
}()
// run test
for i := 0; i < *numTries; i++ {
fmt.Printf("%d\n", i)
if err := storeFileAndCheck(uplink, srcFile, dstFile); err != nil {
return err
}
err := deleteWhileStallingAndCheck(uplink, dstFile, nodeProc)
if err == nil {
// success!
break
}
if err != tryAgain {
// unexpected error
return err
}
}
// clean up test. this part isn't deferred and run unconditionally because
// we want to inspect things when the test has failed.
return os.RemoveAll(tempDir)
}
func main() {
flag.Parse()
if err := runTest(); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
fmt.Println("SUCCESS")
}

View File

@@ -10,55 +10,61 @@ trap cleanup EXIT
SRC_DIR=$TMPDIR/source
DST_DIR=$TMPDIR/dst
mkdir -p $SRC_DIR $DST_DIR
mkdir -p "$SRC_DIR" "$DST_DIR"
aws configure set aws_access_key_id $GATEWAY_0_ACCESS_KEY
aws configure set aws_secret_access_key $GATEWAY_0_SECRET_KEY
aws configure set aws_access_key_id "$GATEWAY_0_ACCESS_KEY"
aws configure set aws_secret_access_key "$GATEWAY_0_SECRET_KEY"
aws configure set default.region us-east-1
head -c 1024 </dev/urandom > $SRC_DIR/small-upload-testfile # create 1mb file of random bytes (inline)
head -c 5120 </dev/urandom > $SRC_DIR/big-upload-testfile # create 5mb file of random bytes (remote)
head -c 5 </dev/urandom > $SRC_DIR/multipart-upload-testfile # create 5kb file of random bytes (remote)
random_bytes_file () {
size=$1
output=$2
dd if=/dev/urandom of="$output" count=1 bs="$size" >/dev/null 2>&1
}
random_bytes_file 1x1024x1024 "$SRC_DIR/small-upload-testfile" # create 1mb file of random bytes (inline)
random_bytes_file 5x1024x1024 "$SRC_DIR/big-upload-testfile" # create 5mb file of random bytes (remote)
random_bytes_file 5x1024 "$SRC_DIR/multipart-upload-testfile" # create 5kb file of random bytes (remote)
echo "Creating Bucket"
aws s3 --endpoint=http://$GATEWAY_0_ADDR mb s3://bucket
aws s3 --endpoint="http://$GATEWAY_0_ADDR" mb s3://bucket
echo "Uploading Files"
aws configure set default.s3.multipart_threshold 1TB
aws s3 --endpoint=http://$GATEWAY_0_ADDR cp $SRC_DIR/small-upload-testfile s3://bucket/small-testfile
aws s3 --endpoint=http://$GATEWAY_0_ADDR cp $SRC_DIR/big-upload-testfile s3://bucket/big-testfile
aws s3 --endpoint="http://$GATEWAY_0_ADDR" cp "$SRC_DIR/small-upload-testfile" s3://bucket/small-testfile
aws s3 --endpoint="http://$GATEWAY_0_ADDR" cp "$SRC_DIR/big-upload-testfile" s3://bucket/big-testfile
# Wait 5 seconds to trigger any error related to one of the different intervals
sleep 5
echo "Uploading Multipart File"
aws configure set default.s3.multipart_threshold 4KB
aws s3 --endpoint=http://$GATEWAY_0_ADDR cp $SRC_DIR/multipart-upload-testfile s3://bucket/multipart-testfile
aws s3 --endpoint="http://$GATEWAY_0_ADDR" cp "$SRC_DIR/multipart-upload-testfile" s3://bucket/multipart-testfile
echo "Downloading Files"
aws s3 --endpoint=http://$GATEWAY_0_ADDR ls s3://bucket
aws s3 --endpoint=http://$GATEWAY_0_ADDR cp s3://bucket/small-testfile $DST_DIR/small-download-testfile
aws s3 --endpoint=http://$GATEWAY_0_ADDR cp s3://bucket/big-testfile $DST_DIR/big-download-testfile
aws s3 --endpoint=http://$GATEWAY_0_ADDR cp s3://bucket/multipart-testfile $DST_DIR/multipart-download-testfile
aws s3 --endpoint=http://$GATEWAY_0_ADDR rb s3://bucket --force
aws s3 --endpoint="http://$GATEWAY_0_ADDR" ls s3://bucket
aws s3 --endpoint="http://$GATEWAY_0_ADDR" cp s3://bucket/small-testfile "$DST_DIR/small-download-testfile"
aws s3 --endpoint="http://$GATEWAY_0_ADDR" cp s3://bucket/big-testfile "$DST_DIR/big-download-testfile"
aws s3 --endpoint="http://$GATEWAY_0_ADDR" cp s3://bucket/multipart-testfile "$DST_DIR/multipart-download-testfile"
aws s3 --endpoint="http://$GATEWAY_0_ADDR" rb s3://bucket --force
if cmp $SRC_DIR/small-upload-testfile $DST_DIR/small-download-testfile
if cmp "$SRC_DIR/small-upload-testfile" "$DST_DIR/small-download-testfile"
then
echo "small-upload-testfile file matches uploaded file";
echo "small-upload-testfile file matches uploaded file";
else
echo "small-upload-testfile file does not match uploaded file";
echo "small-upload-testfile file does not match uploaded file";
fi
if cmp $SRC_DIR/big-upload-testfile $DST_DIR/big-download-testfile
if cmp "$SRC_DIR/big-upload-testfile" "$DST_DIR/big-download-testfile"
then
echo "big-upload-testfile file matches uploaded file";
echo "big-upload-testfile file matches uploaded file";
else
echo "big-upload-testfile file does not match uploaded file";
echo "big-upload-testfile file does not match uploaded file";
fi
if cmp $SRC_DIR/multipart-upload-testfile $DST_DIR/multipart-download-testfile
if cmp "$SRC_DIR/multipart-upload-testfile" "$DST_DIR/multipart-download-testfile"
then
echo "multipart-upload-testfile file matches uploaded file";
echo "multipart-upload-testfile file matches uploaded file";
else
echo "multipart-upload-testfile file does not match uploaded file";
fi
echo "multipart-upload-testfile file does not match uploaded file";
fi

View File

@@ -4,7 +4,7 @@ set +x
SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
make -C $SCRIPTDIR/.. install-sim
make -C "$SCRIPTDIR"/.. install-sim
# setup tmpdir for testfiles and cleanup
TMP=$(mktemp -d -t tmp.XXXXXXXXXX)
@@ -19,14 +19,14 @@ export STORJ_NETWORK_DIR=$TMP
storj-sim -x network setup
# run aws-cli tests
storj-sim -x network test bash $SCRIPTDIR/test-sim-aws.sh
storj-sim -x network test bash $SCRIPTDIR/test-uplink.sh
storj-sim -x network test bash "$SCRIPTDIR"/test-sim-aws.sh
storj-sim -x network test bash "$SCRIPTDIR"/test-uplink.sh
storj-sim -x network destroy
# setup the network with ipv6
storj-sim -x --host "::1" network setup
# aws-cli doesn't support gateway with ipv6 address, so change it to use localhost
find $STORJ_NETWORK_DIR/gateway -type f -name config.yaml -exec sed -i "s/server.address: \"\[::1\]/server.address: \"127.0.0.1/" {} +
find "$STORJ_NETWORK_DIR"/gateway -type f -name config.yaml -exec sed -i 's/server.address: "\[::1\]/server.address: "127.0.0.1/' '{}' +
# run aws-cli tests using ipv6
storj-sim -x --host "::1" network test bash $SCRIPTDIR/test-sim-aws.sh
storj-sim -x --host "::1" network test bash "$SCRIPTDIR"/test-sim-aws.sh
storj-sim -x network destroy

View File

@@ -14,51 +14,46 @@ BUCKET=bucket-123
SRC_DIR=$TMPDIR/source
DST_DIR=$TMPDIR/dst
mkdir -p $SRC_DIR $DST_DIR
mkdir -p "$SRC_DIR" "$DST_DIR"
head -c 1024 </dev/urandom > $SRC_DIR/small-upload-testfile # create 1mb file of random bytes (inline)
head -c 5120 </dev/urandom > $SRC_DIR/big-upload-testfile # create 5mb file of random bytes (remote)
head -c 5 </dev/urandom > $SRC_DIR/multipart-upload-testfile # create 5kb file of random bytes (remote)
random_bytes_file () {
size=$1
output=$2
dd if=/dev/urandom of="$output" count=1 bs="$size" >/dev/null 2>&1
}
uplink --config-dir $GATEWAY_0_DIR mb sj://$BUCKET/
random_bytes_file 2x1024 "$SRC_DIR/small-upload-testfile" # create 2kb file of random bytes (inline)
random_bytes_file 5x1024x1024 "$SRC_DIR/big-upload-testfile" # create 5mb file of random bytes (remote)
uplink --config-dir $GATEWAY_0_DIR cp $SRC_DIR/small-upload-testfile sj://$BUCKET/
uplink --config-dir $GATEWAY_0_DIR cp $SRC_DIR/big-upload-testfile sj://$BUCKET/
uplink --config-dir $GATEWAY_0_DIR cp $SRC_DIR/multipart-upload-testfile sj://$BUCKET/
uplink --config-dir "$GATEWAY_0_DIR" mb "sj://$BUCKET/"
uplink --config-dir $GATEWAY_0_DIR cp sj://$BUCKET/small-upload-testfile $DST_DIR
uplink --config-dir $GATEWAY_0_DIR cp sj://$BUCKET/big-upload-testfile $DST_DIR
uplink --config-dir $GATEWAY_0_DIR cp sj://$BUCKET/multipart-upload-testfile $DST_DIR
uplink --config-dir "$GATEWAY_0_DIR" cp "$SRC_DIR/small-upload-testfile" "sj://$BUCKET/"
uplink --config-dir "$GATEWAY_0_DIR" cp "$SRC_DIR/big-upload-testfile" "sj://$BUCKET/"
uplink --config-dir $GATEWAY_0_DIR rm sj://$BUCKET/small-upload-testfile
uplink --config-dir $GATEWAY_0_DIR rm sj://$BUCKET/big-upload-testfile
uplink --config-dir $GATEWAY_0_DIR rm sj://$BUCKET/multipart-upload-testfile
uplink --config-dir "$GATEWAY_0_DIR" cp "sj://$BUCKET/small-upload-testfile" "$DST_DIR"
uplink --config-dir "$GATEWAY_0_DIR" cp "sj://$BUCKET/big-upload-testfile" "$DST_DIR"
uplink --config-dir $GATEWAY_0_DIR ls sj://$BUCKET
uplink --config-dir "$GATEWAY_0_DIR" rm "sj://$BUCKET/small-upload-testfile"
uplink --config-dir "$GATEWAY_0_DIR" rm "sj://$BUCKET/big-upload-testfile"
uplink --config-dir $GATEWAY_0_DIR rb sj://$BUCKET
uplink --config-dir "$GATEWAY_0_DIR" ls "sj://$BUCKET"
if cmp $SRC_DIR/small-upload-testfile $DST_DIR/small-upload-testfile
uplink --config-dir "$GATEWAY_0_DIR" rb "sj://$BUCKET"
if cmp "$SRC_DIR/small-upload-testfile" "$DST_DIR/small-upload-testfile"
then
echo "small upload testfile matches uploaded file"
else
echo "small upload testfile does not match uploaded file"
fi
if cmp $SRC_DIR/big-upload-testfile $DST_DIR/big-upload-testfile
if cmp "$SRC_DIR/big-upload-testfile" "$DST_DIR/big-upload-testfile"
then
echo "big upload testfile matches uploaded file"
else
echo "big upload testfile does not match uploaded file"
fi
if cmp $SRC_DIR/multipart-upload-testfile $DST_DIR/multipart-upload-testfile
then
echo "multipart upload testfile matches uploaded file"
else
echo "multipart upload testfile does not match uploaded file"
fi
# check if all data files were removed
# FILES=$(find "$STORAGENODE_0_DIR/../" -type f -path "*/blob/*" ! -name "info.*")
# if [ -z "$FILES" ];
@@ -68,4 +63,4 @@ fi
# echo "not all data files removed from storage nodes:"
# echo $FILES
# exit 1
# fi