Skip to content

Commit 5715f44

Browse files
committed
fix: separate signals for passive, active, and forced shutdown
`SIGTERM`: Passive shutdown stopping provisioner daemons from accepting new jobs but waiting for existing jobs to successfully complete. `SIGINT` (old existing behavior): Notify provisioner daemons to cancel in-flight jobs, wait 5s for jobs to be exited, then force quit. `SIGKILL`: Untouched from before, will force-quit.
1 parent b2a5e2f commit 5715f44

File tree

11 files changed

+84
-30
lines changed

11 files changed

+84
-30
lines changed

cli/organization.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ func (r *RootCmd) switchOrganization() *clibase.Cmd {
6060
conf := r.createConfig()
6161
orgs, err := client.OrganizationsByUser(inv.Context(), codersdk.Me)
6262
if err != nil {
63-
return fmt.Errorf("failed to get organizations: %w", err)
63+
return xerrors.Errorf("failed to get organizations: %w", err)
6464
}
6565
// Keep the list of orgs sorted
6666
slices.SortFunc(orgs, func(a, b codersdk.Organization) int {
@@ -84,7 +84,7 @@ func (r *RootCmd) switchOrganization() *clibase.Cmd {
8484
if switchToOrg == "" {
8585
err := conf.Organization().Delete()
8686
if err != nil && !errors.Is(err, os.ErrNotExist) {
87-
return fmt.Errorf("failed to unset organization: %w", err)
87+
return xerrors.Errorf("failed to unset organization: %w", err)
8888
}
8989
_, _ = fmt.Fprintf(inv.Stdout, "Organization unset\n")
9090
} else {
@@ -107,7 +107,7 @@ func (r *RootCmd) switchOrganization() *clibase.Cmd {
107107
// Always write the uuid to the config file. Names can change.
108108
err := conf.Organization().Write(orgs[index].ID.String())
109109
if err != nil {
110-
return fmt.Errorf("failed to write organization to config file: %w", err)
110+
return xerrors.Errorf("failed to write organization to config file: %w", err)
111111
}
112112
}
113113

@@ -123,7 +123,7 @@ func (r *RootCmd) switchOrganization() *clibase.Cmd {
123123
}
124124
return sdkError
125125
}
126-
return fmt.Errorf("failed to get current organization: %w", err)
126+
return xerrors.Errorf("failed to get current organization: %w", err)
127127
}
128128

129129
_, _ = fmt.Fprintf(inv.Stdout, "Current organization context set to %s (%s)\n", current.Name, current.ID.String())
@@ -213,7 +213,7 @@ func (r *RootCmd) currentOrganization() *clibase.Cmd {
213213
typed, ok := data.([]codersdk.Organization)
214214
if !ok {
215215
// This should never happen
216-
return "", fmt.Errorf("expected []Organization, got %T", data)
216+
return "", xerrors.Errorf("expected []Organization, got %T", data)
217217
}
218218
return stringFormat(typed)
219219
}),
@@ -250,7 +250,7 @@ func (r *RootCmd) currentOrganization() *clibase.Cmd {
250250
case "current":
251251
stringFormat = func(orgs []codersdk.Organization) (string, error) {
252252
if len(orgs) != 1 {
253-
return "", fmt.Errorf("expected 1 organization, got %d", len(orgs))
253+
return "", xerrors.Errorf("expected 1 organization, got %d", len(orgs))
254254
}
255255
return fmt.Sprintf("Current CLI Organization: %s (%s)\n", orgs[0].Name, orgs[0].ID.String()), nil
256256
}
@@ -275,7 +275,7 @@ func (r *RootCmd) currentOrganization() *clibase.Cmd {
275275
default:
276276
stringFormat = func(orgs []codersdk.Organization) (string, error) {
277277
if len(orgs) != 1 {
278-
return "", fmt.Errorf("expected 1 organization, got %d", len(orgs))
278+
return "", xerrors.Errorf("expected 1 organization, got %d", len(orgs))
279279
}
280280
return fmt.Sprintf("Organization: %s (%s)\n", orgs[0].Name, orgs[0].ID.String()), nil
281281
}

cli/organizationmanage.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ func (r *RootCmd) createOrganization() *clibase.Cmd {
3434
// from creating it.
3535
existing, _ := client.OrganizationByName(inv.Context(), orgName)
3636
if existing.ID != uuid.Nil {
37-
return fmt.Errorf("organization %q already exists", orgName)
37+
return xerrors.Errorf("organization %q already exists", orgName)
3838
}
3939

4040
_, err := cliui.Prompt(inv, cliui.PromptOptions{
@@ -53,7 +53,7 @@ func (r *RootCmd) createOrganization() *clibase.Cmd {
5353
Name: orgName,
5454
})
5555
if err != nil {
56-
return fmt.Errorf("failed to create organization: %w", err)
56+
return xerrors.Errorf("failed to create organization: %w", err)
5757
}
5858

5959
_, _ = fmt.Fprintf(inv.Stdout, "Organization %s (%s) created.\n", organization.Name, organization.ID)

cli/root.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -715,7 +715,7 @@ func CurrentOrganization(r *RootCmd, inv *clibase.Invocation, client *codersdk.C
715715
if selected == "" && conf.Organization().Exists() {
716716
org, err := conf.Organization().Read()
717717
if err != nil {
718-
return codersdk.Organization{}, fmt.Errorf("read selected organization from config file %q: %w", conf.Organization(), err)
718+
return codersdk.Organization{}, xerrors.Errorf("read selected organization from config file %q: %w", conf.Organization(), err)
719719
}
720720
selected = org
721721
}

cli/server.go

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -337,16 +337,18 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
337337

338338
// Register signals early on so that graceful shutdown can't
339339
// be interrupted by additional signals. Note that we avoid
340-
// shadowing cancel() (from above) here because notifyStop()
340+
// shadowing cancel() (from above) here because stopCancel()
341341
// restores default behavior for the signals. This protects
342342
// the shutdown sequence from abruptly terminating things
343343
// like: database migrations, provisioner work, workspace
344344
// cleanup in dev-mode, etc.
345345
//
346346
// To get out of a graceful shutdown, the user can send
347347
// SIGQUIT with ctrl+\ or SIGKILL with `kill -9`.
348-
notifyCtx, notifyStop := inv.SignalNotifyContext(ctx, InterruptSignals...)
349-
defer notifyStop()
348+
stopCtx, stopCancel := signalNotifyContext(ctx, inv, StopSignals...)
349+
defer stopCancel()
350+
interruptCtx, interruptCancel := signalNotifyContext(ctx, inv, InterruptSignals...)
351+
defer interruptCancel()
350352

351353
cacheDir := vals.CacheDir.String()
352354
err = os.MkdirAll(cacheDir, 0o700)
@@ -1028,13 +1030,18 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
10281030
hangDetector.Start()
10291031
defer hangDetector.Close()
10301032

1033+
graceful := false
10311034
// Currently there is no way to ask the server to shut
10321035
// itself down, so any exit signal will result in a non-zero
10331036
// exit of the server.
10341037
var exitErr error
10351038
select {
1036-
case <-notifyCtx.Done():
1037-
exitErr = notifyCtx.Err()
1039+
case <-stopCtx.Done():
1040+
exitErr = stopCtx.Err()
1041+
graceful = true
1042+
_, _ = io.WriteString(inv.Stdout, cliui.Bold("Stop caught, waiting for provisioner jobs to complete and gracefully exiting. Use ctrl+\\ to force quit"))
1043+
case <-interruptCtx.Done():
1044+
exitErr = interruptCtx.Err()
10381045
_, _ = io.WriteString(inv.Stdout, cliui.Bold("Interrupt caught, gracefully exiting. Use ctrl+\\ to force quit"))
10391046
case <-tunnelDone:
10401047
exitErr = xerrors.New("dev tunnel closed unexpectedly")
@@ -1082,7 +1089,16 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
10821089
defer wg.Done()
10831090

10841091
r.Verbosef(inv, "Shutting down provisioner daemon %d...", id)
1085-
err := shutdownWithTimeout(provisionerDaemon.Shutdown, 5*time.Second)
1092+
timeout := 5 * time.Second
1093+
if graceful {
1094+
// It can last for a long time...
1095+
timeout = 30 * time.Minute
1096+
}
1097+
1098+
err := shutdownWithTimeout(func(ctx context.Context) error {
1099+
// We only want to cancel active jobs if we aren't exiting gracefully.
1100+
return provisionerDaemon.Shutdown(ctx, !graceful)
1101+
}, timeout)
10861102
if err != nil {
10871103
cliui.Errorf(inv.Stderr, "Failed to shut down provisioner daemon %d: %s\n", id, err)
10881104
return
@@ -2512,3 +2528,12 @@ func escapePostgresURLUserInfo(v string) (string, error) {
25122528

25132529
return v, nil
25142530
}
2531+
2532+
func signalNotifyContext(ctx context.Context, inv *clibase.Invocation, sig ...os.Signal) (context.Context, context.CancelFunc) {
2533+
// On Windows, some of our signal functions lack support.
2534+
// If we pass in no signals, we should just return the context as-is.
2535+
if len(sig) == 0 {
2536+
return context.WithCancel(ctx)
2537+
}
2538+
return inv.SignalNotifyContext(ctx, sig...)
2539+
}

cli/signal_unix.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,24 @@ import (
77
"syscall"
88
)
99

10+
// StopSignals are used to gracefully exit.
11+
// An example is exiting provisioner daemons but not canceling
12+
// jobs, allowing a successful and clean exit.
13+
var StopSignals = []os.Signal{
14+
syscall.SIGTERM,
15+
}
16+
17+
// InterruptSignals are used to less gracefully exit.
18+
// An example is canceling a job, waiting for a timeout,
19+
// and then exiting.
1020
var InterruptSignals = []os.Signal{
21+
// SIGINT
1122
os.Interrupt,
12-
syscall.SIGTERM,
1323
syscall.SIGHUP,
1424
}
25+
26+
// KillSignals will force exit.
27+
var KillSignals = []os.Signal{
28+
// SIGKILL
29+
os.Kill,
30+
}

cli/signal_windows.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,14 @@ import (
66
"os"
77
)
88

9-
var InterruptSignals = []os.Signal{os.Interrupt}
9+
// Check the UNIX file for the comments on the signals
10+
11+
var StopSignals = []os.Signal{}
12+
13+
var InterruptSignals = []os.Signal{
14+
os.Interrupt,
15+
}
16+
17+
var KillSignals = []os.Signal{
18+
os.Kill,
19+
}

coderd/coderdtest/coderdtest.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -498,7 +498,7 @@ func (c *provisionerdCloser) Close() error {
498498
c.closed = true
499499
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
500500
defer cancel()
501-
shutdownErr := c.d.Shutdown(ctx)
501+
shutdownErr := c.d.Shutdown(ctx, true)
502502
closeErr := c.d.Close()
503503
if shutdownErr != nil {
504504
return shutdownErr

enterprise/cli/provisionerdaemons.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ func (r *RootCmd) provisionerDaemonStart() *clibase.Cmd {
225225
cliui.Errorf(inv.Stderr, "Unexpected error, shutting down server: %s\n", exitErr)
226226
}
227227

228-
err = srv.Shutdown(ctx)
228+
err = srv.Shutdown(ctx, false)
229229
if err != nil {
230230
return xerrors.Errorf("shutdown: %w", err)
231231
}

enterprise/coderd/provisionerdaemons_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,7 @@ func TestProvisionerDaemonServe(t *testing.T) {
440440
build := coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
441441
require.Equal(t, codersdk.WorkspaceStatusRunning, build.Status)
442442

443-
err = pd.Shutdown(ctx)
443+
err = pd.Shutdown(ctx, false)
444444
require.NoError(t, err)
445445
err = terraformServer.Close()
446446
require.NoError(t, err)

provisionerd/provisionerd.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -474,15 +474,18 @@ func (p *Server) isClosed() bool {
474474
}
475475
}
476476

477-
// Shutdown triggers a graceful exit of each registered provisioner.
478-
func (p *Server) Shutdown(ctx context.Context) error {
477+
// Shutdown gracefully exists with the option to cancel the active job.
478+
// If false, it will wait for the job to complete.
479+
//
480+
//nolint:revive
481+
func (p *Server) Shutdown(ctx context.Context, cancelActiveJob bool) error {
479482
p.mutex.Lock()
480483
p.opts.Logger.Info(ctx, "attempting graceful shutdown")
481484
if !p.shuttingDownB {
482485
close(p.shuttingDownCh)
483486
p.shuttingDownB = true
484487
}
485-
if p.activeJob != nil {
488+
if cancelActiveJob && p.activeJob != nil {
486489
p.activeJob.Cancel()
487490
}
488491
p.mutex.Unlock()

0 commit comments

Comments
 (0)