Skip to content

Commit 8d160ae

Browse files
committed
fix: separate signals for passive, active, and forced shutdown
`SIGTERM`: Passive shutdown stopping provisioner daemons from accepting new jobs but waiting for existing jobs to successfully complete. `SIGINT` (old existing behavior): Notify provisioner daemons to cancel in-flight jobs, wait 5s for jobs to be exited, then force quit. `SIGKILL`: Untouched from before, will force-quit.
1 parent b2a5e2f commit 8d160ae

File tree

8 files changed

+74
-20
lines changed

8 files changed

+74
-20
lines changed

cli/server.go

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -337,16 +337,18 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
337337

338338
// Register signals early on so that graceful shutdown can't
339339
// be interrupted by additional signals. Note that we avoid
340-
// shadowing cancel() (from above) here because notifyStop()
340+
// shadowing cancel() (from above) here because stopCancel()
341341
// restores default behavior for the signals. This protects
342342
// the shutdown sequence from abruptly terminating things
343343
// like: database migrations, provisioner work, workspace
344344
// cleanup in dev-mode, etc.
345345
//
346346
// To get out of a graceful shutdown, the user can send
347347
// SIGQUIT with ctrl+\ or SIGKILL with `kill -9`.
348-
notifyCtx, notifyStop := inv.SignalNotifyContext(ctx, InterruptSignals...)
349-
defer notifyStop()
348+
stopCtx, stopCancel := signalNotifyContext(ctx, inv, StopSignals...)
349+
defer stopCancel()
350+
interruptCtx, interruptCancel := signalNotifyContext(ctx, inv, InterruptSignals...)
351+
defer interruptCancel()
350352

351353
cacheDir := vals.CacheDir.String()
352354
err = os.MkdirAll(cacheDir, 0o700)
@@ -1028,13 +1030,18 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
10281030
hangDetector.Start()
10291031
defer hangDetector.Close()
10301032

1033+
graceful := false
10311034
// Currently there is no way to ask the server to shut
10321035
// itself down, so any exit signal will result in a non-zero
10331036
// exit of the server.
10341037
var exitErr error
10351038
select {
1036-
case <-notifyCtx.Done():
1037-
exitErr = notifyCtx.Err()
1039+
case <-stopCtx.Done():
1040+
exitErr = stopCtx.Err()
1041+
graceful = true
1042+
_, _ = io.WriteString(inv.Stdout, cliui.Bold("Stop caught, waiting for provisioner jobs to complete and gracefully exiting. Use ctrl+\\ to force quit"))
1043+
case <-interruptCtx.Done():
1044+
exitErr = interruptCtx.Err()
10381045
_, _ = io.WriteString(inv.Stdout, cliui.Bold("Interrupt caught, gracefully exiting. Use ctrl+\\ to force quit"))
10391046
case <-tunnelDone:
10401047
exitErr = xerrors.New("dev tunnel closed unexpectedly")
@@ -1082,7 +1089,16 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
10821089
defer wg.Done()
10831090

10841091
r.Verbosef(inv, "Shutting down provisioner daemon %d...", id)
1085-
err := shutdownWithTimeout(provisionerDaemon.Shutdown, 5*time.Second)
1092+
timeout := 5 * time.Second
1093+
if graceful {
1094+
// It can last for a long time...
1095+
timeout = 30 * time.Minute
1096+
}
1097+
1098+
err := shutdownWithTimeout(func(ctx context.Context) error {
1099+
// We only want to cancel active jobs if we aren't exiting gracefully.
1100+
return provisionerDaemon.Shutdown(ctx, !graceful)
1101+
}, timeout)
10861102
if err != nil {
10871103
cliui.Errorf(inv.Stderr, "Failed to shut down provisioner daemon %d: %s\n", id, err)
10881104
return
@@ -2512,3 +2528,12 @@ func escapePostgresURLUserInfo(v string) (string, error) {
25122528

25132529
return v, nil
25142530
}
2531+
2532+
func signalNotifyContext(ctx context.Context, inv *clibase.Invocation, sig ...os.Signal) (context.Context, context.CancelFunc) {
2533+
// On Windows, some of our signal functions lack support.
2534+
// If we pass in no signals, we should just return the context as-is.
2535+
if len(sig) == 0 {
2536+
return context.WithCancel(ctx)
2537+
}
2538+
return inv.SignalNotifyContext(ctx, sig...)
2539+
}

cli/signal_unix.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,24 @@ import (
77
"syscall"
88
)
99

10+
// StopSignals are used to gracefully exit.
11+
// An example is exiting provisioner daemons but not canceling
12+
// jobs, allowing a successful and clean exit.
13+
var StopSignals = []os.Signal{
14+
syscall.SIGTERM,
15+
}
16+
17+
// InterruptSignals are used to less gracefully exit.
18+
// An example is canceling a job, waiting for a timeout,
19+
// and then exiting.
1020
var InterruptSignals = []os.Signal{
21+
// SIGINT
1122
os.Interrupt,
12-
syscall.SIGTERM,
1323
syscall.SIGHUP,
1424
}
25+
26+
// KillSignals will force exit.
27+
var KillSignals = []os.Signal{
28+
// SIGKILL
29+
os.Kill,
30+
}

cli/signal_windows.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,14 @@ import (
66
"os"
77
)
88

9-
var InterruptSignals = []os.Signal{os.Interrupt}
9+
// Check the UNIX file for the comments on the signals
10+
11+
var StopSignals = []os.Signal{}
12+
13+
var InterruptSignals = []os.Signal{
14+
os.Interrupt,
15+
}
16+
17+
var KillSignals = []os.Signal{
18+
os.Kill,
19+
}

coderd/coderdtest/coderdtest.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -498,7 +498,7 @@ func (c *provisionerdCloser) Close() error {
498498
c.closed = true
499499
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
500500
defer cancel()
501-
shutdownErr := c.d.Shutdown(ctx)
501+
shutdownErr := c.d.Shutdown(ctx, true)
502502
closeErr := c.d.Close()
503503
if shutdownErr != nil {
504504
return shutdownErr

enterprise/cli/provisionerdaemons.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ func (r *RootCmd) provisionerDaemonStart() *clibase.Cmd {
225225
cliui.Errorf(inv.Stderr, "Unexpected error, shutting down server: %s\n", exitErr)
226226
}
227227

228-
err = srv.Shutdown(ctx)
228+
err = srv.Shutdown(ctx, false)
229229
if err != nil {
230230
return xerrors.Errorf("shutdown: %w", err)
231231
}

enterprise/coderd/provisionerdaemons_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,7 @@ func TestProvisionerDaemonServe(t *testing.T) {
440440
build := coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
441441
require.Equal(t, codersdk.WorkspaceStatusRunning, build.Status)
442442

443-
err = pd.Shutdown(ctx)
443+
err = pd.Shutdown(ctx, false)
444444
require.NoError(t, err)
445445
err = terraformServer.Close()
446446
require.NoError(t, err)

provisionerd/provisionerd.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -474,15 +474,18 @@ func (p *Server) isClosed() bool {
474474
}
475475
}
476476

477-
// Shutdown triggers a graceful exit of each registered provisioner.
478-
func (p *Server) Shutdown(ctx context.Context) error {
477+
// Shutdown gracefully exists with the option to cancel the active job.
478+
// If false, it will wait for the job to complete.
479+
//
480+
//nolint:revive
481+
func (p *Server) Shutdown(ctx context.Context, cancelActiveJob bool) error {
479482
p.mutex.Lock()
480483
p.opts.Logger.Info(ctx, "attempting graceful shutdown")
481484
if !p.shuttingDownB {
482485
close(p.shuttingDownCh)
483486
p.shuttingDownB = true
484487
}
485-
if p.activeJob != nil {
488+
if cancelActiveJob && p.activeJob != nil {
486489
p.activeJob.Cancel()
487490
}
488491
p.mutex.Unlock()

provisionerd/provisionerd_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -671,7 +671,7 @@ func TestProvisionerd(t *testing.T) {
671671
}),
672672
})
673673
require.Condition(t, closedWithin(updateChan, testutil.WaitShort))
674-
err := server.Shutdown(context.Background())
674+
err := server.Shutdown(context.Background(), false)
675675
require.NoError(t, err)
676676
require.Condition(t, closedWithin(completeChan, testutil.WaitShort))
677677
require.NoError(t, server.Close())
@@ -762,7 +762,7 @@ func TestProvisionerd(t *testing.T) {
762762
require.Condition(t, closedWithin(completeChan, testutil.WaitShort))
763763
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
764764
defer cancel()
765-
require.NoError(t, server.Shutdown(ctx))
765+
require.NoError(t, server.Shutdown(ctx, false))
766766
require.NoError(t, server.Close())
767767
})
768768

@@ -853,7 +853,7 @@ func TestProvisionerd(t *testing.T) {
853853
require.Condition(t, closedWithin(completeChan, testutil.WaitShort))
854854
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
855855
defer cancel()
856-
require.NoError(t, server.Shutdown(ctx))
856+
require.NoError(t, server.Shutdown(ctx, false))
857857
require.NoError(t, server.Close())
858858
})
859859

@@ -944,7 +944,7 @@ func TestProvisionerd(t *testing.T) {
944944
t.Log("completeChan closed")
945945
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
946946
defer cancel()
947-
require.NoError(t, server.Shutdown(ctx))
947+
require.NoError(t, server.Shutdown(ctx, false))
948948
require.NoError(t, server.Close())
949949
})
950950

@@ -1039,7 +1039,7 @@ func TestProvisionerd(t *testing.T) {
10391039
require.Condition(t, closedWithin(completeChan, testutil.WaitShort))
10401040
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
10411041
defer cancel()
1042-
require.NoError(t, server.Shutdown(ctx))
1042+
require.NoError(t, server.Shutdown(ctx, false))
10431043
require.NoError(t, server.Close())
10441044
assert.Equal(t, ops[len(ops)-1], "CompleteJob")
10451045
assert.Contains(t, ops[0:len(ops)-1], "Log: Cleaning Up | ")
@@ -1076,7 +1076,7 @@ func createProvisionerd(t *testing.T, dialer provisionerd.Dialer, connector prov
10761076
t.Cleanup(func() {
10771077
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
10781078
defer cancel()
1079-
_ = server.Shutdown(ctx)
1079+
_ = server.Shutdown(ctx, false)
10801080
_ = server.Close()
10811081
})
10821082
return server

0 commit comments

Comments
 (0)