Skip to content

fix: separate signals for passive, active, and forced shutdown #12358

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cli/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ func (r *RootCmd) workspaceAgent() *clibase.Cmd {
args := append(os.Args, "--no-reap")
err := reaper.ForkReap(
reaper.WithExecArgs(args...),
reaper.WithCatchSignals(InterruptSignals...),
reaper.WithCatchSignals(StopSignals...),
)
if err != nil {
logger.Error(ctx, "agent process reaper unable to fork", slog.Error(err))
Expand All @@ -144,7 +144,7 @@ func (r *RootCmd) workspaceAgent() *clibase.Cmd {
// Note that we don't want to handle these signals in the
// process that runs as PID 1, that's why we do this after
// the reaper forked.
ctx, stopNotify := inv.SignalNotifyContext(ctx, InterruptSignals...)
ctx, stopNotify := inv.SignalNotifyContext(ctx, StopSignals...)
defer stopNotify()

// DumpHandler does signal handling, so we call it after the
Expand Down
2 changes: 1 addition & 1 deletion cli/exp_scaletest.go
Original file line number Diff line number Diff line change
Expand Up @@ -890,7 +890,7 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
Handler: func(inv *clibase.Invocation) (err error) {
ctx := inv.Context()

notifyCtx, stop := signal.NotifyContext(ctx, InterruptSignals...) // Checked later.
notifyCtx, stop := signal.NotifyContext(ctx, StopSignals...) // Checked later.
defer stop()
ctx = notifyCtx

Expand Down
2 changes: 1 addition & 1 deletion cli/externalauth.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ fi
Handler: func(inv *clibase.Invocation) error {
ctx := inv.Context()

ctx, stop := inv.SignalNotifyContext(ctx, InterruptSignals...)
ctx, stop := inv.SignalNotifyContext(ctx, StopSignals...)
defer stop()

client, err := r.createAgentClient()
Expand Down
2 changes: 1 addition & 1 deletion cli/gitaskpass.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ func (r *RootCmd) gitAskpass() *clibase.Cmd {
Handler: func(inv *clibase.Invocation) error {
ctx := inv.Context()

ctx, stop := inv.SignalNotifyContext(ctx, InterruptSignals...)
ctx, stop := inv.SignalNotifyContext(ctx, StopSignals...)
defer stop()

user, host, err := gitauth.ParseAskpass(inv.Args[0])
Expand Down
2 changes: 1 addition & 1 deletion cli/gitssh.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ func (r *RootCmd) gitssh() *clibase.Cmd {

// Catch interrupt signals to ensure the temporary private
// key file is cleaned up on most cases.
ctx, stop := inv.SignalNotifyContext(ctx, InterruptSignals...)
ctx, stop := inv.SignalNotifyContext(ctx, StopSignals...)
defer stop()

// Early check so errors are reported immediately.
Expand Down
37 changes: 31 additions & 6 deletions cli/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,16 +337,18 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.

// Register signals early on so that graceful shutdown can't
// be interrupted by additional signals. Note that we avoid
// shadowing cancel() (from above) here because notifyStop()
// shadowing cancel() (from above) here because stopCancel()
// restores default behavior for the signals. This protects
// the shutdown sequence from abruptly terminating things
// like: database migrations, provisioner work, workspace
// cleanup in dev-mode, etc.
//
// To get out of a graceful shutdown, the user can send
// SIGQUIT with ctrl+\ or SIGKILL with `kill -9`.
notifyCtx, notifyStop := inv.SignalNotifyContext(ctx, InterruptSignals...)
defer notifyStop()
stopCtx, stopCancel := signalNotifyContext(ctx, inv, StopSignalsNoInterrupt...)
defer stopCancel()
interruptCtx, interruptCancel := signalNotifyContext(ctx, inv, InterruptSignals...)
defer interruptCancel()

cacheDir := vals.CacheDir.String()
err = os.MkdirAll(cacheDir, 0o700)
Expand Down Expand Up @@ -1028,13 +1030,18 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
hangDetector.Start()
defer hangDetector.Close()

waitForProvisionerJobs := false
// Currently there is no way to ask the server to shut
// itself down, so any exit signal will result in a non-zero
// exit of the server.
var exitErr error
select {
case <-notifyCtx.Done():
exitErr = notifyCtx.Err()
case <-stopCtx.Done():
exitErr = stopCtx.Err()
waitForProvisionerJobs = true
_, _ = io.WriteString(inv.Stdout, cliui.Bold("Stop caught, waiting for provisioner jobs to complete and gracefully exiting. Use ctrl+\\ to force quit"))
case <-interruptCtx.Done():
exitErr = interruptCtx.Err()
_, _ = io.WriteString(inv.Stdout, cliui.Bold("Interrupt caught, gracefully exiting. Use ctrl+\\ to force quit"))
case <-tunnelDone:
exitErr = xerrors.New("dev tunnel closed unexpectedly")
Expand Down Expand Up @@ -1082,7 +1089,16 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
defer wg.Done()

r.Verbosef(inv, "Shutting down provisioner daemon %d...", id)
err := shutdownWithTimeout(provisionerDaemon.Shutdown, 5*time.Second)
timeout := 5 * time.Second
if waitForProvisionerJobs {
// It can last for a long time...
timeout = 30 * time.Minute
}

err := shutdownWithTimeout(func(ctx context.Context) error {
// We only want to cancel active jobs if we aren't exiting gracefully.
return provisionerDaemon.Shutdown(ctx, !waitForProvisionerJobs)
}, timeout)
if err != nil {
cliui.Errorf(inv.Stderr, "Failed to shut down provisioner daemon %d: %s\n", id, err)
return
Expand Down Expand Up @@ -2512,3 +2528,12 @@ func escapePostgresURLUserInfo(v string) (string, error) {

return v, nil
}

func signalNotifyContext(ctx context.Context, inv *clibase.Invocation, sig ...os.Signal) (context.Context, context.CancelFunc) {
// On Windows, some of our signal functions lack support.
// If we pass in no signals, we should just return the context as-is.
if len(sig) == 0 {
return context.WithCancel(ctx)
}
return inv.SignalNotifyContext(ctx, sig...)
}
2 changes: 1 addition & 1 deletion cli/server_createadminuser.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ func (r *RootCmd) newCreateAdminUserCommand() *clibase.Cmd {
logger = logger.Leveled(slog.LevelDebug)
}

ctx, cancel := inv.SignalNotifyContext(ctx, InterruptSignals...)
ctx, cancel := inv.SignalNotifyContext(ctx, StopSignals...)
defer cancel()

if newUserDBURL == "" {
Expand Down
43 changes: 42 additions & 1 deletion cli/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"net/url"
"os"
"path/filepath"
"reflect"
"runtime"
"strconv"
"strings"
Expand Down Expand Up @@ -1605,7 +1606,7 @@ func TestServer_Production(t *testing.T) {
}

//nolint:tparallel,paralleltest // This test cannot be run in parallel due to signal handling.
func TestServer_Shutdown(t *testing.T) {
func TestServer_InterruptShutdown(t *testing.T) {
t.Skip("This test issues an interrupt signal which will propagate to the test runner.")

if runtime.GOOS == "windows" {
Expand Down Expand Up @@ -1638,6 +1639,46 @@ func TestServer_Shutdown(t *testing.T) {
require.NoError(t, err)
}

func TestServer_GracefulShutdown(t *testing.T) {
t.Parallel()
if runtime.GOOS == "windows" {
// Sending interrupt signal isn't supported on Windows!
t.SkipNow()
}
ctx, cancelFunc := context.WithCancel(context.Background())
defer cancelFunc()

root, cfg := clitest.New(t,
"server",
"--in-memory",
"--http-address", ":0",
"--access-url", "http://example.com",
"--provisioner-daemons", "1",
"--cache-dir", t.TempDir(),
)
var stopFunc context.CancelFunc
root = root.WithTestSignalNotifyContext(t, func(parent context.Context, signals ...os.Signal) (context.Context, context.CancelFunc) {
if !reflect.DeepEqual(cli.StopSignalsNoInterrupt, signals) {
return context.WithCancel(ctx)
}
var ctx context.Context
ctx, stopFunc = context.WithCancel(parent)
return ctx, stopFunc
})
serverErr := make(chan error, 1)
pty := ptytest.New(t).Attach(root)
go func() {
serverErr <- root.WithContext(ctx).Run()
}()
_ = waitAccessURL(t, cfg)
// It's fair to assume `stopFunc` isn't nil here, because the server
// has started and access URL is propagated.
stopFunc()
pty.ExpectMatch("waiting for provisioner jobs to complete")
err := <-serverErr
require.NoError(t, err)
}

func BenchmarkServerHelp(b *testing.B) {
// server --help is a good proxy for measuring the
// constant overhead of each command.
Expand Down
17 changes: 16 additions & 1 deletion cli/signal_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,23 @@ import (
"syscall"
)

var InterruptSignals = []os.Signal{
// StopSignals is the list of signals that are used for handling
// shutdown behavior.
var StopSignals = []os.Signal{
os.Interrupt,
syscall.SIGTERM,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a breaking change in all CLI commands that handle signals. They used to handle TERM but no longer do.

syscall.SIGHUP,
}

// StopSignals is the list of signals that are used for handling
// graceful shutdown behavior.
var StopSignalsNoInterrupt = []os.Signal{
syscall.SIGTERM,
syscall.SIGHUP,
}

// InterruptSignals is the list of signals that are used for handling
// immediate shutdown behavior.
var InterruptSignals = []os.Signal{
os.Interrupt,
}
10 changes: 9 additions & 1 deletion cli/signal_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,12 @@ import (
"os"
)

var InterruptSignals = []os.Signal{os.Interrupt}
var StopSignals = []os.Signal{
os.Interrupt,
}

var StopSignalsNoInterrupt = []os.Signal{}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I love how Windows leaves you SOL. 🥲

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No doubt!


var InterruptSignals = []os.Signal{
os.Interrupt,
}
2 changes: 1 addition & 1 deletion cli/ssh.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func (r *RootCmd) ssh() *clibase.Cmd {
// session can persist for up to 72 hours, since we set a long
// timeout on the Agent side of the connection. In particular,
// OpenSSH sends SIGHUP to terminate a proxy command.
ctx, stop := inv.SignalNotifyContext(inv.Context(), InterruptSignals...)
ctx, stop := inv.SignalNotifyContext(inv.Context(), StopSignals...)
defer stop()
ctx, cancel := context.WithCancel(ctx)
defer cancel()
Expand Down
2 changes: 1 addition & 1 deletion cli/templatepull_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ func TestTemplatePull_ToDir(t *testing.T) {

require.NoError(t, inv.Run())

// Validate behaviour of choosing template name in the absence of an output path argument.
// Validate behavior of choosing template name in the absence of an output path argument.
destPath := actualDest
if destPath == "" {
destPath = template.Name
Expand Down
2 changes: 1 addition & 1 deletion coderd/coderdtest/coderdtest.go
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,7 @@ func (c *provisionerdCloser) Close() error {
c.closed = true
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
shutdownErr := c.d.Shutdown(ctx)
shutdownErr := c.d.Shutdown(ctx, true)
closeErr := c.d.Close()
if shutdownErr != nil {
return shutdownErr
Expand Down
19 changes: 14 additions & 5 deletions enterprise/cli/provisionerdaemons.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,10 @@ func (r *RootCmd) provisionerDaemonStart() *clibase.Cmd {
ctx, cancel := context.WithCancel(inv.Context())
defer cancel()

notifyCtx, notifyStop := inv.SignalNotifyContext(ctx, agpl.InterruptSignals...)
defer notifyStop()
stopCtx, stopCancel := inv.SignalNotifyContext(ctx, agpl.StopSignalsNoInterrupt...)
defer stopCancel()
interruptCtx, interruptCancel := inv.SignalNotifyContext(ctx, agpl.InterruptSignals...)
defer interruptCancel()

tags, err := agpl.ParseProvisionerTags(rawTags)
if err != nil {
Expand Down Expand Up @@ -212,10 +214,17 @@ func (r *RootCmd) provisionerDaemonStart() *clibase.Cmd {
Metrics: metrics,
})

waitForProvisionerJobs := false
var exitErr error
select {
case <-notifyCtx.Done():
exitErr = notifyCtx.Err()
case <-stopCtx.Done():
exitErr = stopCtx.Err()
_, _ = fmt.Fprintln(inv.Stdout, cliui.Bold(
"Stop caught, waiting for provisioner jobs to complete and gracefully exiting. Use ctrl+\\ to force quit",
))
waitForProvisionerJobs = true
case <-interruptCtx.Done():
exitErr = interruptCtx.Err()
_, _ = fmt.Fprintln(inv.Stdout, cliui.Bold(
"Interrupt caught, gracefully exiting. Use ctrl+\\ to force quit",
))
Expand All @@ -225,7 +234,7 @@ func (r *RootCmd) provisionerDaemonStart() *clibase.Cmd {
cliui.Errorf(inv.Stderr, "Unexpected error, shutting down server: %s\n", exitErr)
}

err = srv.Shutdown(ctx)
err = srv.Shutdown(ctx, waitForProvisionerJobs)
if err != nil {
return xerrors.Errorf("shutdown: %w", err)
}
Expand Down
2 changes: 1 addition & 1 deletion enterprise/cli/proxyserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ func (r *RootCmd) proxyServer() *clibase.Cmd {
//
// To get out of a graceful shutdown, the user can send
// SIGQUIT with ctrl+\ or SIGKILL with `kill -9`.
notifyCtx, notifyStop := inv.SignalNotifyContext(ctx, cli.InterruptSignals...)
notifyCtx, notifyStop := inv.SignalNotifyContext(ctx, cli.StopSignals...)
defer notifyStop()

// Clean up idle connections at the end, e.g.
Expand Down
2 changes: 1 addition & 1 deletion enterprise/coderd/provisionerdaemons_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ func TestProvisionerDaemonServe(t *testing.T) {
build := coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
require.Equal(t, codersdk.WorkspaceStatusRunning, build.Status)

err = pd.Shutdown(ctx)
err = pd.Shutdown(ctx, false)
require.NoError(t, err)
err = terraformServer.Close()
require.NoError(t, err)
Expand Down
9 changes: 6 additions & 3 deletions provisionerd/provisionerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -474,15 +474,18 @@ func (p *Server) isClosed() bool {
}
}

// Shutdown triggers a graceful exit of each registered provisioner.
func (p *Server) Shutdown(ctx context.Context) error {
// Shutdown gracefully exists with the option to cancel the active job.
// If false, it will wait for the job to complete.
//
//nolint:revive
func (p *Server) Shutdown(ctx context.Context, cancelActiveJob bool) error {
p.mutex.Lock()
p.opts.Logger.Info(ctx, "attempting graceful shutdown")
if !p.shuttingDownB {
close(p.shuttingDownCh)
p.shuttingDownB = true
}
if p.activeJob != nil {
if cancelActiveJob && p.activeJob != nil {
p.activeJob.Cancel()
}
p.mutex.Unlock()
Expand Down
Loading