Skip to content

Commit c321bdc

Browse files
authored
feat(telemetry): instrument connection lifecycle (#942)
Instruments SSH process and reconnecting-WebSocket lifecycles with structured telemetry, wired into the primary and remote-scoped Coder API clients.

- **SSH process**: emit `ssh.process.discovered` (with `found=true|false` and attempts), `ssh.process.lost` (causes: `stale_network_info`, `missing_network_info`), `ssh.process.recovered`, `ssh.process.replaced` (always emitted, with `wasLost`), and `ssh.process.disposed` as a terminal event.
- **Network sampling**: `ssh.network.sampled` on p2p flip, DERP change, latency change (≥25ms OR ≥20% ratio), or 60s heartbeat.
- **WebSocket**: emit `connection.opened`, `connection.dropped`, `connection.state_transitioned`, and `connection.reconnect_resolved` (cycle outcome via `result`, so failed cycles aren't mistakenly counted as successes).
- **Wiring**: telemetry threaded through long-lived primary/remote `CoderApi` instances; throwaway clients stay opt-in.
- **Robustness**: dispose unblocks all concurrent `delay()` waits via a `Set`; cert-refresh reconnects are distinguishable from manual ones; `reset()` clears `#reconnectCycle`.

Closes #905
1 parent 96f9fee commit c321bdc

13 files changed

Lines changed: 1397 additions & 139 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
local telemetry.
2424
- Local telemetry now records `http.requests` rollups for per-route HTTP
2525
health without emitting one event per request.
26+
- Connection lifecycle now records local telemetry: SSH process
27+
discovery/loss/recovery with sampled network info, and reconnecting
28+
WebSocket open, drop, reconnect, and state transitions, so connection
29+
stability is captured alongside other local telemetry.
2630

2731
### Fixed
2832

src/api/coderApi.ts

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import {
3939
import {
4040
ConnectionState,
4141
ReconnectingWebSocket,
42+
type ReconnectingWebSocketOptions,
4243
type SocketFactory,
4344
} from "../websocket/reconnectingWebSocket";
4445
import { SseConnection } from "../websocket/sseConnection";
@@ -95,6 +96,7 @@ export class CoderApi extends Api implements vscode.Disposable {
9596

9697
private constructor(
9798
private readonly output: Logger,
99+
private readonly telemetry: TelemetryReporter,
98100
private readonly httpRequestsTelemetry: HttpRequestsTelemetry,
99101
) {
100102
super();
@@ -104,7 +106,9 @@ export class CoderApi extends Api implements vscode.Disposable {
104106
/**
105107
* Create a new CoderApi instance with the provided configuration.
106108
* Automatically sets up logging interceptors, certificate handling,
107-
* and HTTP request telemetry that emits via the given reporter.
109+
* HTTP request telemetry, and WebSocket connection telemetry. All
110+
* telemetry routes through the single reporter passed in (defaults to
111+
* NOOP_TELEMETRY_REPORTER for throwaway clients).
108112
*/
109113
static create(
110114
baseUrl: string,
@@ -113,7 +117,7 @@ export class CoderApi extends Api implements vscode.Disposable {
113117
telemetry: TelemetryReporter = NOOP_TELEMETRY_REPORTER,
114118
): CoderApi {
115119
const httpRequestsTelemetry = new HttpRequestsTelemetry(telemetry);
116-
const client = new CoderApi(output, httpRequestsTelemetry);
120+
const client = new CoderApi(output, telemetry, httpRequestsTelemetry);
117121
client.setCredentials(baseUrl, token);
118122

119123
setupInterceptors(client, output, httpRequestsTelemetry);
@@ -463,18 +467,21 @@ export class CoderApi extends Api implements vscode.Disposable {
463467
private async createReconnectingSocket<TData>(
464468
socketFactory: SocketFactory<TData>,
465469
): Promise<ReconnectingWebSocket<TData>> {
470+
const options: ReconnectingWebSocketOptions = {
471+
onCertificateRefreshNeeded: async () => {
472+
const refreshCommand = getRefreshCommand();
473+
if (!refreshCommand) {
474+
return false;
475+
}
476+
return refreshCertificates(refreshCommand, this.output);
477+
},
478+
telemetry: this.telemetry,
479+
};
480+
466481
const reconnectingSocket = await ReconnectingWebSocket.create<TData>(
467482
socketFactory,
468483
this.output,
469-
{
470-
onCertificateRefreshNeeded: async () => {
471-
const refreshCommand = getRefreshCommand();
472-
if (!refreshCommand) {
473-
return false;
474-
}
475-
return refreshCertificates(refreshCommand, this.output);
476-
},
477-
},
484+
options,
478485
() => this.reconnectingSockets.delete(reconnectingSocket),
479486
);
480487

src/instrumentation/ssh.ts

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
import type { NetworkInfo } from "../remote/sshProcess";
2+
import type { TelemetryReporter } from "../telemetry/reporter";
3+
4+
/** Heartbeat interval: a network sample is emitted once this much time has passed since the last emitted one. */
const NETWORK_SAMPLE_INTERVAL_MS = 60 * 1_000;
/** Relative latency change (fraction of the previous latency) that counts as meaningful. */
const NETWORK_LATENCY_CHANGE_RATIO = 0.2;
/** Absolute latency change, in milliseconds, that counts as meaningful. */
const NETWORK_LATENCY_MIN_ABSOLUTE_CHANGE_MS = 25;

/** Cause attached to the `ssh.process.lost` event. */
export type ProcessLossCause = "stale_network_info" | "missing_network_info";

/** Snapshot of the most recently emitted network sample, kept for comparison with later readings. */
interface NetworkSample {
  /** `performance.now()` reading taken when the sample was emitted. */
  readonly emittedAtMs: number;
  readonly p2p: boolean;
  readonly preferredDerp: string;
  readonly latencyMs: number;
}

/** Outcome of a process discovery run; `pid` is `undefined` when no process was found. */
interface ProcessDiscoveryResult {
  readonly pid: number | undefined;
  readonly attempts: number;
}
21+
22+
/**
 * Records SSH process lifecycle events and throttled network samples through
 * a telemetry reporter.
 *
 * All timestamps are `performance.now()` readings, so the reported durations
 * are differences between two readings of the same clock.
 */
export class SshTelemetry {
  readonly #telemetry: TelemetryReporter;
  /** Start time of the tracked process; `undefined` while nothing is tracked. */
  #processStartedAtMs: number | undefined;
  /** Time the tracked process was reported lost; `undefined` while it is not lost. */
  #processLostAtMs: number | undefined;
  /** Last sample that was actually emitted; `undefined` forces the next sample to emit. */
  #lastNetworkSample: NetworkSample | undefined;

  public constructor(telemetry: TelemetryReporter) {
    this.#telemetry = telemetry;
  }

  /**
   * Runs `fn` inside an `ssh.process.discovered` trace, recording whether a
   * pid was found (`found`) and how many attempts were made (`attempts`).
   *
   * @returns the discovered pid, or `undefined` when `fn` found none.
   */
  public traceProcessDiscovery(
    fn: () => Promise<ProcessDiscoveryResult>,
  ): Promise<number | undefined> {
    return this.#telemetry.trace("ssh.process.discovered", async (span) => {
      const outcome = await fn();
      const found = outcome.pid !== undefined;
      span.setProperty("found", String(found));
      span.setMeasurement("attempts", outcome.attempts);
      return outcome.pid;
    });
  }

  /** Marks the start of a process and clears any previous lost state. */
  public processStarted(): void {
    this.#processLostAtMs = undefined;
    this.#processStartedAtMs = performance.now();
  }

  /**
   * Emits `ssh.process.lost` with the uptime so far. Ignored when no process
   * is tracked or when the process is already marked lost, so repeated calls
   * produce a single event.
   */
  public processLost(cause: ProcessLossCause): void {
    const startedAtMs = this.#processStartedAtMs;
    const alreadyLost = this.#processLostAtMs !== undefined;
    if (startedAtMs === undefined || alreadyLost) {
      return;
    }

    const lostAtMs = performance.now();
    this.#processLostAtMs = lostAtMs;
    this.#telemetry.log(
      "ssh.process.lost",
      { cause },
      { uptimeMs: lostAtMs - startedAtMs },
    );
  }

  /**
   * Emits `ssh.process.recovered` with the time spent lost, then clears the
   * lost state. Ignored when the process is not currently marked lost.
   */
  public processRecovered(): void {
    const lostAtMs = this.#processLostAtMs;
    if (lostAtMs === undefined) {
      return;
    }

    this.#telemetry.log(
      "ssh.process.recovered",
      {},
      { recoveryDurationMs: performance.now() - lostAtMs },
    );
    this.#processLostAtMs = undefined;
  }

  /**
   * Handover to a different SSH process. When a previous process is tracked,
   * emits `ssh.process.replaced` whether or not that process was already lost
   * (replacement is operationally distinct from recovery). Tracking then
   * restarts from now and the last network sample is forgotten.
   */
  public processReplaced(): void {
    const handoverAtMs = performance.now();
    const startedAtMs = this.#processStartedAtMs;
    const lostAtMs = this.#processLostAtMs;

    if (startedAtMs !== undefined) {
      const previousUptimeMs = handoverAtMs - startedAtMs;
      // `lostDurationMs` is only reported when the old process had been lost.
      const measurements: Record<string, number> =
        lostAtMs === undefined
          ? { previousUptimeMs }
          : { previousUptimeMs, lostDurationMs: handoverAtMs - lostAtMs };
      this.#telemetry.log(
        "ssh.process.replaced",
        { wasLost: String(lostAtMs !== undefined) },
        measurements,
      );
    }

    this.#processStartedAtMs = handoverAtMs;
    this.#processLostAtMs = undefined;
    this.#lastNetworkSample = undefined;
  }

  /**
   * Terminal teardown signal. Emits `ssh.process.disposed` regardless of
   * prior lost state so consumers always see a session-ending event, then
   * clears all tracked state. Ignored when no process is tracked.
   */
  public disposed(): void {
    const startedAtMs = this.#processStartedAtMs;
    if (startedAtMs === undefined) {
      return;
    }

    const disposedAtMs = performance.now();
    const wasLost = this.#processLostAtMs !== undefined;
    this.#telemetry.log(
      "ssh.process.disposed",
      { wasLost: String(wasLost) },
      { uptimeMs: disposedAtMs - startedAtMs },
    );

    this.#lastNetworkSample = undefined;
    this.#processLostAtMs = undefined;
    this.#processStartedAtMs = undefined;
  }

  /**
   * Emits `ssh.network.sampled` for `network` when there is no previous
   * sample or when `shouldEmitSample` says the reading is worth reporting;
   * otherwise the reading is dropped.
   */
  public networkSampled(network: NetworkInfo): void {
    const sampledAtMs = performance.now();
    const last = this.#lastNetworkSample;
    if (last !== undefined && !shouldEmitSample(last, network, sampledAtMs)) {
      return;
    }

    const { p2p, preferred_derp: preferredDerp, latency: latencyMs } = network;

    // Remember what was emitted so later readings are compared against it.
    this.#lastNetworkSample = {
      emittedAtMs: sampledAtMs,
      p2p,
      preferredDerp,
      latencyMs,
    };
    this.#telemetry.log(
      "ssh.network.sampled",
      { p2p: String(p2p), preferredDerp },
      {
        latencyMs,
        downloadMbits: bytesPerSecondToMbits(network.download_bytes_sec),
        uploadMbits: bytesPerSecondToMbits(network.upload_bytes_sec),
      },
    );
  }
}
145+
146+
/**
 * Decides whether `current` should be emitted given the `previous` emitted
 * sample: yes when the heartbeat interval has elapsed, when p2p flipped, when
 * the preferred DERP changed, or when latency moved by a meaningful amount.
 */
function shouldEmitSample(
  previous: NetworkSample,
  current: NetworkInfo,
  now: number,
): boolean {
  const heartbeatDue = now - previous.emittedAtMs >= NETWORK_SAMPLE_INTERVAL_MS;
  const routeChanged =
    current.p2p !== previous.p2p ||
    current.preferred_derp !== previous.preferredDerp;
  return (
    heartbeatDue ||
    routeChanged ||
    hasMeaningfulLatencyChange(current.latency, previous.latencyMs)
  );
}
163+
164+
/**
 * Reports whether latency moved enough to be worth emitting: either by at
 * least the absolute threshold or by at least the configured fraction of the
 * previous value. A previous latency of zero has no meaningful ratio, so any
 * non-zero current value counts as a change.
 */
function hasMeaningfulLatencyChange(
  current: number,
  previous: number,
): boolean {
  if (previous === 0) {
    return current !== 0;
  }
  const delta = Math.abs(current - previous);
  if (delta >= NETWORK_LATENCY_MIN_ABSOLUTE_CHANGE_MS) {
    return true;
  }
  return delta / Math.abs(previous) >= NETWORK_LATENCY_CHANGE_RATIO;
}
177+
178+
/** Converts a throughput in bytes per second to megabits per second (1 Mbit = 1,000,000 bits). */
function bytesPerSecondToMbits(bytesPerSecond: number): number {
  const bitsPerSecond = bytesPerSecond * 8;
  return bitsPerSecond / 1_000_000;
}

0 commit comments

Comments
 (0)