Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 21 additions & 4 deletions cdk/src/constructs/approval-metrics-publisher-consumer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import * as path from 'path';
import { Duration, RemovalPolicy } from 'aws-cdk-lib';
import * as cloudwatch from 'aws-cdk-lib/aws-cloudwatch';
import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
import { FilterCriteria, FilterRule, StartingPosition, Architecture, Runtime } from 'aws-cdk-lib/aws-lambda';
import { DynamoEventSource, SqsDlq } from 'aws-cdk-lib/aws-lambda-event-sources';
Expand Down Expand Up @@ -84,6 +85,8 @@ export interface ApprovalMetricsPublisherConsumerProps {
export class ApprovalMetricsPublisherConsumer extends Construct {
public readonly fn: lambda.NodejsFunction;
public readonly dlq: sqs.Queue;
/** CloudWatch alarm that fires when the DLQ has at least one poison-pill record. */
public readonly dlqAlarm: cloudwatch.IAlarm;

constructor(scope: Construct, id: string, props: ApprovalMetricsPublisherConsumerProps) {
super(scope, id);
Expand All @@ -93,10 +96,7 @@ export class ApprovalMetricsPublisherConsumer extends Construct {
this.dlq = new sqs.Queue(this, 'ApprovalMetricsPublisherDlq', {
// Persistent failures (malformed records the handler's
// per-record try/catch throws on three times in a row) land
// here for operator inspection. Alarm wiring is deferred to
// Chunk 10 follow-ups — until a notification channel is wired
// to SNS, an alarm on ``ApproximateNumberOfMessagesVisible``
// would fire into the void.
// here for operator inspection.
retentionPeriod: Duration.days(14),
enforceSSL: true,
});
Expand Down Expand Up @@ -153,6 +153,23 @@ export class ApprovalMetricsPublisherConsumer extends Construct {
filters: [agentMilestoneFilter],
}));

// #117: alarm on DLQ depth so poison-pill records don't silently
// accumulate without operator visibility.
this.dlqAlarm = new cloudwatch.Alarm(this, 'DlqMessageAlarm', {
metric: this.dlq.metricApproximateNumberOfMessagesVisible({
period: Duration.minutes(5),
statistic: 'Maximum',
}),
threshold: 1,
evaluationPeriods: 1,
comparisonOperator: cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
alarmDescription:
'ApprovalMetricsPublisher DLQ has at least one message; investigate poison records. ' +
'Check CloudWatch Logs for the ApprovalMetricsPublisherFn error that caused the DLQ send. ' +
'See: https://github.com/aws-samples/sample-autonomous-cloud-coding-agents/issues/117',
treatMissingData: cloudwatch.TreatMissingData.NOT_BREACHING,
});

NagSuppressions.addResourceSuppressions(this.fn, [
{
id: 'AwsSolutions-IAM4',
Expand Down
20 changes: 20 additions & 0 deletions cdk/src/constructs/fanout-consumer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import * as path from 'path';
import { Duration, RemovalPolicy } from 'aws-cdk-lib';
import * as cloudwatch from 'aws-cdk-lib/aws-cloudwatch';
import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
import * as iam from 'aws-cdk-lib/aws-iam';
import { StartingPosition, Architecture, Runtime } from 'aws-cdk-lib/aws-lambda';
Expand Down Expand Up @@ -106,6 +107,8 @@ export interface FanOutConsumerProps {
export class FanOutConsumer extends Construct {
public readonly fn: lambda.NodejsFunction;
public readonly dlq: sqs.Queue;
/** CloudWatch alarm that fires when the DLQ has at least one poison-pill record. */
public readonly dlqAlarm: cloudwatch.IAlarm;

constructor(scope: Construct, id: string, props: FanOutConsumerProps) {
super(scope, id);
Expand Down Expand Up @@ -185,6 +188,23 @@ export class FanOutConsumer extends Construct {
reportBatchItemFailures: true,
}));

// #117: alarm on DLQ depth so poison-pill records don't silently
// accumulate without operator visibility.
this.dlqAlarm = new cloudwatch.Alarm(this, 'DlqMessageAlarm', {
metric: this.dlq.metricApproximateNumberOfMessagesVisible({
period: Duration.minutes(5),
statistic: 'Maximum',
}),
threshold: 1,
evaluationPeriods: 1,
comparisonOperator: cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
alarmDescription:
'FanOutConsumer DLQ has at least one message; investigate poison records. ' +
'Check CloudWatch Logs for the FanOutFn error that caused the DLQ send. ' +
'See: https://github.com/aws-samples/sample-autonomous-cloud-coding-agents/issues/117',
treatMissingData: cloudwatch.TreatMissingData.NOT_BREACHING,
});

NagSuppressions.addResourceSuppressions(this.fn, [
{
id: 'AwsSolutions-IAM4',
Expand Down
15 changes: 15 additions & 0 deletions cdk/test/constructs/approval-metrics-publisher-consumer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -189,4 +189,19 @@ describe('ApprovalMetricsPublisherConsumer', () => {
// expect exactly one Table in the synthesized stack.
template.resourceCountIs('AWS::DynamoDB::Table', 1);
});

test('creates a CloudWatch alarm on DLQ ApproximateNumberOfMessagesVisible (#117)', () => {
const { template } = createStack();

template.hasResourceProperties('AWS::CloudWatch::Alarm', {
MetricName: 'ApproximateNumberOfMessagesVisible',
Namespace: 'AWS/SQS',
Threshold: 1,
EvaluationPeriods: 1,
ComparisonOperator: 'GreaterThanOrEqualToThreshold',
TreatMissingData: 'notBreaching',
Statistic: 'Maximum',
Period: 300,
});
});
});
20 changes: 20 additions & 0 deletions cdk/test/constructs/fanout-consumer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -170,4 +170,24 @@ describe('FanOutConsumer', () => {
expect(vars.TASK_TABLE_NAME).toBeUndefined();
}
});

test('creates a CloudWatch alarm on DLQ ApproximateNumberOfMessagesVisible (#117)', () => {
const app = new App();
const stack = new Stack(app, 'TestStack');
new FanOutConsumer(stack, 'FanOut', {
taskEventsTable: makeTaskEventsTable(stack),
});
const template = Template.fromStack(stack);

template.hasResourceProperties('AWS::CloudWatch::Alarm', {
MetricName: 'ApproximateNumberOfMessagesVisible',
Namespace: 'AWS/SQS',
Threshold: 1,
EvaluationPeriods: 1,
ComparisonOperator: 'GreaterThanOrEqualToThreshold',
TreatMissingData: 'notBreaching',
Statistic: 'Maximum',
Period: 300,
});
});
});
21 changes: 13 additions & 8 deletions docs/design/CEDAR_HITL_GATES.md
Original file line number Diff line number Diff line change
Expand Up @@ -1585,21 +1585,26 @@ Extend `TaskDashboard` (`cdk/src/constructs/task-dashboard.ts`). These are read-

Every `agent_milestone("approval_*")` event carries `trace_id` / `span_id`. A span `hitl.approval_wait` brackets the PreToolUse poll loop: `span.duration = decided_at - created_at`. `hitl.approval_race_loss` emitted when the agent's local timeout fired <5s before a late user decision (useful for tuning).

### 11.5 CloudWatch alarms — deferred (notification-channel gated)
### 11.5 CloudWatch alarms

**DLQ-depth alarms (shipped):** CloudWatch alarms on `ApproximateNumberOfMessagesVisible >= 1` (5-min period, Maximum statistic, `treatMissingData: NOT_BREACHING`) are deployed for:

- **FanOutConsumer DLQ** — poison-pill DynamoDB Stream records that failed three consecutive Lambda invocations.
- **ApprovalMetricsPublisher DLQ** — same failure mode for the metrics-publisher consumer.

These alarms transition to `ALARM` state in CloudWatch and appear in the console/dashboard, providing operator visibility into silent record loss. They ship **without an `addAlarmAction` / SNS notification target** — operators must check the CloudWatch Alarms console or configure a subscription manually. This is an intentional intermediate step: alarm state is durable and queryable even without push notifications, and prevents poison records from accumulating silently for the full 14-day DLQ retention window.

**Follow-up — notification channel wiring:** Once an operational notification channel (SNS topic → Slack / PagerDuty / email) is provisioned, add `alarm.addAlarmAction(new SnsAction(topic))` to both alarms. No metric or alarm restructuring is needed.

**Additional alarms (not yet shipped):** The following remain deferred until the notification channel exists (alarm-without-action provides limited value for rate/latency conditions that require human triage):

Operator-facing CloudWatch alarms that would page on:
- High approval-timeout rate (users not responding, notifications broken)
- Tasks stuck in AWAITING_APPROVAL beyond `timeout_s + 60s` (reconciler failure)
- High approval-write failure rate (DDB throttled or IAM drift)
- Approval-gate cap hit (suspicious retry loop)
- Publisher / fanout DLQ non-empty (persistent consumer-side poison pills)
- `MetricEmitSkipped` sustained > 0 (publisher schema mismatch — agent / publisher version skew)
- `MetricsPublisherHeartbeat` flat-line (publisher pipeline broken)

…are **out of scope for v1** because the project does not yet have a notification channel (Slack / PagerDuty / SNS topic / email distribution list) configured for operational alerts. Adding alarms without a notification channel produces CloudWatch widgets that nobody sees — no safety benefit.

**Plumbing status (post-Chunk 8):** the supporting metric data now flows as native CloudWatch metrics in namespace `ABCA/Cedar-HITL` via `ApprovalMetricsPublisherFn` (§11.3). Alarm wiring becomes a per-threshold `cloudwatch.Alarm` + `SnsAction`; no additional metric-extraction infra is needed. The remaining gap is the SNS topic + subscriber wiring itself — when that lands, the alarms above are a small bounded follow-up (not a multi-PR metrics build-out as they were pre-Chunk-8).

---

## 12. Security model
Expand Down Expand Up @@ -2070,7 +2075,7 @@ See §17.18 for the off-hours escalation future-work primitive, and §13.14 for
**Future work — polish (tracked in §17):**
- CLI inline streaming prompt (UX research first)
- `approve --defer` / allowlist revocation (`bgagent revoke-approval`)
- CloudWatch alarm plumbing (§11.5) — deferred until an operational notification channel is available
- CloudWatch alarm SNS notification wiring (§11.5) — DLQ-depth alarms ship without an action target; add `SnsAction` once a notification channel is provisioned
- More soft-deny policies in the default set based on real usage
- Persistent recent-decision cache (if container-restart telemetry justifies it)
- Persistent per-minute rate limit (if restart amplification becomes significant)
Expand Down
21 changes: 13 additions & 8 deletions docs/src/content/docs/architecture/Cedar-hitl-gates.md
Original file line number Diff line number Diff line change
Expand Up @@ -1589,21 +1589,26 @@ Extend `TaskDashboard` (`cdk/src/constructs/task-dashboard.ts`). These are read-

Every `agent_milestone("approval_*")` event carries `trace_id` / `span_id`. A span `hitl.approval_wait` brackets the PreToolUse poll loop: `span.duration = decided_at - created_at`. `hitl.approval_race_loss` emitted when the agent's local timeout fired <5s before a late user decision (useful for tuning).

### 11.5 CloudWatch alarms — deferred (notification-channel gated)
### 11.5 CloudWatch alarms

**DLQ-depth alarms (shipped):** CloudWatch alarms on `ApproximateNumberOfMessagesVisible >= 1` (5-min period, Maximum statistic, `treatMissingData: NOT_BREACHING`) are deployed for:

- **FanOutConsumer DLQ** — poison-pill DynamoDB Stream records that failed three consecutive Lambda invocations.
- **ApprovalMetricsPublisher DLQ** — same failure mode for the metrics-publisher consumer.

These alarms transition to `ALARM` state in CloudWatch and appear in the console/dashboard, providing operator visibility into silent record loss. They ship **without an `addAlarmAction` / SNS notification target** — operators must check the CloudWatch Alarms console or configure a subscription manually. This is an intentional intermediate step: alarm state is durable and queryable even without push notifications, and prevents poison records from accumulating silently for the full 14-day DLQ retention window.

**Follow-up — notification channel wiring:** Once an operational notification channel (SNS topic → Slack / PagerDuty / email) is provisioned, add `alarm.addAlarmAction(new SnsAction(topic))` to both alarms. No metric or alarm restructuring is needed.

**Additional alarms (not yet shipped):** The following remain deferred until the notification channel exists (alarm-without-action provides limited value for rate/latency conditions that require human triage):

Operator-facing CloudWatch alarms that would page on:
- High approval-timeout rate (users not responding, notifications broken)
- Tasks stuck in AWAITING_APPROVAL beyond `timeout_s + 60s` (reconciler failure)
- High approval-write failure rate (DDB throttled or IAM drift)
- Approval-gate cap hit (suspicious retry loop)
- Publisher / fanout DLQ non-empty (persistent consumer-side poison pills)
- `MetricEmitSkipped` sustained > 0 (publisher schema mismatch — agent / publisher version skew)
- `MetricsPublisherHeartbeat` flat-line (publisher pipeline broken)

…are **out of scope for v1** because the project does not yet have a notification channel (Slack / PagerDuty / SNS topic / email distribution list) configured for operational alerts. Adding alarms without a notification channel produces CloudWatch widgets that nobody sees — no safety benefit.

**Plumbing status (post-Chunk 8):** the supporting metric data now flows as native CloudWatch metrics in namespace `ABCA/Cedar-HITL` via `ApprovalMetricsPublisherFn` (§11.3). Alarm wiring becomes a per-threshold `cloudwatch.Alarm` + `SnsAction`; no additional metric-extraction infra is needed. The remaining gap is the SNS topic + subscriber wiring itself — when that lands, the alarms above are a small bounded follow-up (not a multi-PR metrics build-out as they were pre-Chunk-8).

---

## 12. Security model
Expand Down Expand Up @@ -2074,7 +2079,7 @@ See §17.18 for the off-hours escalation future-work primitive, and §13.14 for
**Future work — polish (tracked in §17):**
- CLI inline streaming prompt (UX research first)
- `approve --defer` / allowlist revocation (`bgagent revoke-approval`)
- CloudWatch alarm plumbing (§11.5) — deferred until an operational notification channel is available
- CloudWatch alarm SNS notification wiring (§11.5) — DLQ-depth alarms ship without an action target; add `SnsAction` once a notification channel is provisioned
- More soft-deny policies in the default set based on real usage
- Persistent recent-decision cache (if container-restart telemetry justifies it)
- Persistent per-minute rate limit (if restart amplification becomes significant)
Expand Down
Loading