1d78ddc3df
* test-network-stalls tests... network stalls! in particular, right now, it just tests whether an uplink correctly times out after some amount of time when one of the nodes it's talking to suddenly goes offline. This tool is meant to be run under `storj-sim network test`. Also included here: * fix storj-sim-related test scripts on Mac the default storj config dir on Mac has a space in it ('~/Library/Application Support/Storj'), which breaks everywhere it shows up in an unquoted variable in a sh/bash script. easy enough to fix as a one-off, but quoting bash vars avoids a dozen other potential problems too. change a few things using `head -c` to use `dd`. `head -c` works, but is not as widely understood (as evidenced by each of these scripts getting through code review, one at a time, with the comments not matching the numbers actually used). * storj-sim reports PIDs of worker processes to test so that the tests can cause unfortunate "accidents" to befall the worker processes in the course of the test, and find out whether everything reacts correctly.
231 lines
6.5 KiB
Go
231 lines
6.5 KiB
Go
// Copyright (C) 2019 Storj Labs, Inc.
|
|
// See LICENSE for copying information.
|
|
|
|
// +build ignore
|
|
|
|
// Tests whether the uplink tool correctly times out when one of the storage nodes it's talking to
|
|
// suddenly stops responding. In particular, this currently tests that happening during a Delete
|
|
// operation, because that is where we have observed indefinite hangs before.
|
|
|
|
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"math/rand"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/zeebo/errs"
|
|
|
|
"storj.io/storj/internal/memory"
|
|
)
|
|
|
|
var (
|
|
numTries = flag.Int("num-tries", 20, "number of tries to cause a hang")
|
|
bucketName = flag.String("bucket", "bukkit", "name of bucket to use for test")
|
|
deleteTimeout = flag.Duration("timeout", 60*time.Second, "how long to wait for a delete to succeed or time out")
|
|
|
|
fileSize memory.Size = 5 * memory.MiB
|
|
|
|
tryAgain = errs.New("test needs to run again")
|
|
)
|
|
|
|
func init() {
|
|
flag.Var(&fileSize, "file-size", "size of test file to use")
|
|
}
|
|
|
|
// randDefaultSource adapts the package-level math/rand generator to the
// io.Reader shape, so it can be used as the source of a copy.
type randDefaultSource struct{}

// Read fills p using math/rand's package-level Read and passes its result
// through unchanged.
func (*randDefaultSource) Read(p []byte) (int, error) {
	n, err := rand.Read(p)
	return n, err
}
|
|
|
|
func makeRandomContentsFile(path string, size memory.Size) (err error) {
|
|
outFile, err := os.Create(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer func() {
|
|
err = errs.Combine(err, outFile.Close())
|
|
}()
|
|
if _, err := io.CopyN(outFile, &randDefaultSource{}, int64(size)); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// uplinkRunner bundles what is needed to invoke the uplink command-line tool:
// the executable to run plus the config directory and log level passed to
// every invocation.
type uplinkRunner struct {
	execName  string // executable name or path handed to exec.CommandContext
	configDir string // value for --config-dir
	logLevel  string // value for --log.level
}

// Run executes the uplink with the runner's standard options followed by args.
// It returns the combined stdout and stderr of the process, along with an
// error if the process could not be run or exited non-zero. A nil ctx is
// treated as context.Background(); cancelling ctx kills the process.
func (ur *uplinkRunner) Run(ctx context.Context, args ...string) ([]byte, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	fullArgs := make([]string, 0, 4+len(args))
	fullArgs = append(fullArgs, "--config-dir", ur.configDir, "--log.level", ur.logLevel)
	fullArgs = append(fullArgs, args...)
	return exec.CommandContext(ctx, ur.execName, fullArgs...).CombinedOutput()
}
|
|
|
|
// lsOutputRegexp skips the first four whitespace-delimited fields of a line
// and captures the rest of that line.
//
// Fields are separated by spaces and tabs only. Using \s here would also match
// newlines, so a line with fewer than five fields would borrow fields from the
// following line and cause that following line to be skipped.
var lsOutputRegexp = regexp.MustCompile(`(?m)^[ \t]*(?:\S+[ \t]+){4}(.*)$`)
|
|
|
|
func (ur *uplinkRunner) doesRemoteExist(remotePath string) (bool, error) {
|
|
pathParts := strings.Split(remotePath, "/")
|
|
if len(pathParts) < 2 {
|
|
return false, errs.New("invalid remote path %q", remotePath)
|
|
}
|
|
bucketAndDir := strings.Join(pathParts[:len(pathParts)-1], "/")
|
|
filenamePart := []byte(pathParts[len(pathParts)-1])
|
|
output, err := ur.Run(nil, "ls", bucketAndDir)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
for _, matches := range lsOutputRegexp.FindAllSubmatch(output, -1) {
|
|
if bytes.Equal(matches[1], filenamePart) {
|
|
return true, nil
|
|
}
|
|
}
|
|
return false, nil
|
|
}
|
|
|
|
func storeFileAndCheck(uplink *uplinkRunner, srcFile, dstFile string) error {
|
|
if _, err := uplink.Run(nil, "cp", srcFile, dstFile); err != nil {
|
|
return errs.New("Could not copy file into storj-sim network: %v", err)
|
|
}
|
|
if exists, err := uplink.doesRemoteExist(dstFile); err != nil {
|
|
return errs.New("Could not check if file exists: %v", err)
|
|
} else if !exists {
|
|
return errs.New("Copied file not present in storj-sim network!")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// stallNode freezes proc with SIGSTOP, blocks until ctx is done, and then
// resumes proc with SIGCONT.
//
// Failures to deliver either signal are reported on stderr, since the function
// has no error return. If the process could not be stopped there is nothing to
// resume, so the function returns right away in that case.
func stallNode(ctx context.Context, proc *os.Process) {
	// send node a SIGSTOP, which causes it to freeze as if being traced
	if err := proc.Signal(syscall.SIGSTOP); err != nil {
		fmt.Fprintf(os.Stderr, "warning: could not send SIGSTOP to process %d: %v\n", proc.Pid, err)
		return
	}
	// until the context is done
	<-ctx.Done()
	// then let the node continue again
	if err := proc.Signal(syscall.SIGCONT); err != nil {
		fmt.Fprintf(os.Stderr, "warning: could not send SIGCONT to process %d: %v\n", proc.Pid, err)
	}
}
|
|
|
|
func deleteWhileStallingAndCheck(uplink *uplinkRunner, dstFile string, nodeProc *os.Process) error {
|
|
ctx, cancel := context.WithTimeout(context.Background(), *deleteTimeout)
|
|
defer cancel()
|
|
|
|
go stallNode(ctx, nodeProc)
|
|
|
|
output, err := uplink.Run(ctx, "rm", dstFile)
|
|
if err != nil {
|
|
if ctx.Err() == context.DeadlineExceeded {
|
|
// (uplink did not time out, but this test did)
|
|
return errs.New("uplink DID NOT time out waiting for stalled node 0 while issuing a delete")
|
|
}
|
|
return errs.New("Unexpected error trying to delete file %q from storj-sim network: %v", dstFile, err)
|
|
}
|
|
if exists, err := uplink.doesRemoteExist(dstFile); err != nil {
|
|
return errs.New("Failed to check if remote file %q was deleted: %v", dstFile, err)
|
|
} else if exists {
|
|
return errs.New("Deleted file still present in storj-sim network!")
|
|
}
|
|
if strings.Contains(string(output), "context deadline exceeded") {
|
|
// the uplink correctly timed out when one of the target nodes was stalled! all is well
|
|
return nil
|
|
}
|
|
// delete worked fine, which means our stall didn't hit at the right time and we need to try again
|
|
return tryAgain
|
|
}
|
|
|
|
func runTest() error {
|
|
// check run environment
|
|
configDir := os.Getenv("GATEWAY_0_DIR")
|
|
if configDir == "" {
|
|
return errs.New("This test should be run under storj-sim test ($GATEWAY_0_DIR not found).")
|
|
}
|
|
nodePid, err := strconv.Atoi(os.Getenv("STORAGENODE_0_PID"))
|
|
if err != nil {
|
|
return errs.New("Empty or invalid $STORAGENODE_0_PID: %v", err)
|
|
}
|
|
nodeProc, err := os.FindProcess(nodePid)
|
|
if err != nil {
|
|
return errs.New("No such process %v! $STORAGENODE_0_PID is wrong", nodePid)
|
|
}
|
|
|
|
// set up test
|
|
uplink := &uplinkRunner{
|
|
execName: "uplink",
|
|
configDir: configDir,
|
|
logLevel: "error",
|
|
}
|
|
tempDir, err := ioutil.TempDir("", "storj-test-network-stalls.")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
bucket := "sj://" + *bucketName
|
|
srcFile := filepath.Join(tempDir, "to-storj-sim")
|
|
dstFile := bucket + "/in-storj-sim"
|
|
if err := makeRandomContentsFile(srcFile, fileSize); err != nil {
|
|
return errs.New("could not create test file with random contents: %v", err)
|
|
}
|
|
if _, err := uplink.Run(nil, "mb", bucket); err != nil {
|
|
return errs.New("could not create test bucket: %v", err)
|
|
}
|
|
defer func() {
|
|
// explicitly ignoring errors here; we don't much care if they fail,
|
|
// because this is best-effort
|
|
_, _ = uplink.Run(nil, "rm", dstFile)
|
|
_, _ = uplink.Run(nil, "rb", bucket)
|
|
}()
|
|
|
|
// run test
|
|
for i := 0; i < *numTries; i++ {
|
|
fmt.Printf("%d\n", i)
|
|
|
|
if err := storeFileAndCheck(uplink, srcFile, dstFile); err != nil {
|
|
return err
|
|
}
|
|
|
|
err := deleteWhileStallingAndCheck(uplink, dstFile, nodeProc)
|
|
if err == nil {
|
|
// success!
|
|
break
|
|
}
|
|
if err != tryAgain {
|
|
// unexpected error
|
|
return err
|
|
}
|
|
}
|
|
|
|
// clean up test. this part isn't deferred and run unconditionally because
|
|
// we want to inspect things when the test has failed.
|
|
return os.RemoveAll(tempDir)
|
|
}
|
|
|
|
func main() {
|
|
flag.Parse()
|
|
|
|
if err := runTest(); err != nil {
|
|
fmt.Fprintln(os.Stderr, err)
|
|
os.Exit(1)
|
|
}
|
|
fmt.Println("SUCCESS")
|
|
}
|