Skip to content

Commit 27f63d9

Browse files
kaizenccgithub-actions
and
github-actions
authored
fix: remove replica lag metric (#1696)
We can no longer measure replica lag because NPM's new protocol does not allow us to query individual packages on the replica. ``` curl -H "npm-replication-opt-in: true" "https://replicate.npmjs.com/registry/construct-hub-probe" "Not Found"% ``` I can't think of a way to measure this now, so I propose to just rip it out so that we can migrate to the new protocol. ---- *By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license* --------- Signed-off-by: github-actions <[email protected]> Co-authored-by: github-actions <[email protected]>
1 parent 1917cc1 commit 27f63d9

File tree

6 files changed

+121
-340
lines changed

6 files changed

+121
-340
lines changed

src/__tests__/__snapshots__/construct-hub.test.ts.snap

+95-175
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/__tests__/devapp/__snapshots__/snapshot.test.ts.snap

+23-39
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/package-sources/npmjs.ts

+1-28
Original file line numberDiff line numberDiff line change
@@ -589,20 +589,11 @@ export class NpmJs implements IPackageSource {
589589
const period = Duration.minutes(5);
590590

591591
const alarm = new MathExpression({
592-
// When the npm replica is sufficiently behind the primary, the package source will not be
593-
// able to register new canary package versions within the SLA. In such cases, there is
594-
// nothing that can be done except for waiting until the replica has finally caught up. We
595-
// hence suppress the alarm if the replica lag is getting within 3 evaluation periods of the
596-
// visibility SLA.
597-
expression: `IF(FILL(mLag, REPEAT) < ${Math.max(
598-
visibilitySla.toSeconds() - 3 * period.toSeconds(),
599-
3 * period.toSeconds()
600-
)}, MAX([mDwell, mTTC]))`,
592+
expression: 'MAX([mDwell, mTTC])',
601593
period,
602594
usingMetrics: {
603595
mDwell: canary.metricDwellTime(),
604596
mTTC: canary.metricTimeToCatalog(),
605-
mLag: canary.metricEstimatedNpmReplicaLag(),
606597
},
607598
}).createAlarm(canary, 'Alarm', {
608599
alarmName: `${canary.node.path}/SLA-Breached`,
@@ -708,24 +699,6 @@ export class NpmJs implements IPackageSource {
708699
],
709700
rightYAxis: { min: 0 },
710701
}),
711-
new GraphWidget({
712-
height: 6,
713-
width: 12,
714-
title: 'Observed lag of replicate.npmjs.com',
715-
left: [
716-
canary.metricEstimatedNpmReplicaLag({
717-
label: `Replica lag (${packageName})`,
718-
}),
719-
],
720-
leftAnnotations: [
721-
{
722-
color: '#ffa500',
723-
label: visibilitySla.toHumanString(),
724-
value: visibilitySla.toSeconds(),
725-
},
726-
],
727-
leftYAxis: { min: 0 },
728-
}),
729702
];
730703
}
731704
}

src/package-sources/npmjs/canary/constants.ts

-8
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,6 @@ export const enum MetricName {
2929
*/
3030
TRACKED_VERSION_COUNT = 'TrackedVersionCount',
3131

32-
/**
33-
* The estimated lag between the npm registry replica (replicate.npmjs.com)
34-
* and the primary registry (registry.npmjs.com). This cannot be measured
35-
* directly because the primary does not expose the relevant CouchDB endpoints,
36-
* so we use the probe package to get a low-resolution view of this.
37-
*/
38-
NPM_REPLICA_LAG = 'EstimatedNpmReplicaLag',
39-
4032
/**
4133
* A metric tracking whether the npm registry replica (replicate.npmjs.com)
4234
* is down. The value is 1 when the replica is detected to be down, and 0

src/package-sources/npmjs/canary/index.ts

-20
Original file line numberDiff line numberDiff line change
@@ -85,26 +85,6 @@ export class NpmJsPackageCanary extends Construct {
8585
});
8686
}
8787

88-
/**
89-
* The estimated lag between the npm registry replica (replcate.npmjs.com) and
90-
* the primary registry (registry.npmjs.com).
91-
*
92-
* IMPORTANT NOTE: This is based on the difference in modified timestamps for
93-
* the probe package between the two and hence has a granularly no better than
94-
* the publishing interval of this. Since the construct-hub-probe package is
95-
* only published every 3 hours approximately, this metric has a resolution
96-
* that is strictly worse than 3 hours.
97-
*/
98-
public metricEstimatedNpmReplicaLag(opts?: MetricOptions): Metric {
99-
return new Metric({
100-
period: Duration.minutes(5),
101-
statistic: Statistic.MAXIMUM,
102-
...opts,
103-
metricName: MetricName.NPM_REPLICA_LAG,
104-
namespace: METRICS_NAMESPACE,
105-
});
106-
}
107-
10888
/**
10989
* A metric tracking whether the npm registry replica (replicate.npmjs.com)
11090
* is down. The value is 1 when the replica is detected to be down, and 0

src/package-sources/npmjs/canary/npmjs-package-canary.lambda.ts

+2-70
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ const REPLICA_REQUEST_TIMEOUT_MS = 30_000;
2525
/**
2626
* This package canary monitors the availability of the versions of a specified
2727
* package in the ConstructHub catalog. It publishes metrics that help
28-
* understand how much time passes between a pakcage appearing in the public
28+
* understand how much time passes between a package appearing in the public
2929
* registry and its availability in the ConstructHub instance.
3030
*
3131
* From the moment a package has been published, and until it appeared in
@@ -74,8 +74,6 @@ export async function handler(event: unknown): Promise<void> {
7474
updateLatestIfNeeded(state, latest);
7575

7676
try {
77-
const replicaLag = await stateService.npmReplicaLagSeconds(packageName);
78-
7977
await metricScope((metrics) => async () => {
8078
// Clear out default dimensions as we don't need those. See https://github.com/awslabs/aws-embedded-metrics-node/issues/73.
8179
metrics.setDimensions({});
@@ -89,16 +87,6 @@ export async function handler(event: unknown): Promise<void> {
8987
(await stateService.isNpmReplicaDown()) ? 1 : 0,
9088
Unit.None
9189
);
92-
93-
// If we weren't able to calculate the replica's lag, then simply
94-
// don't report the metric.
95-
if (replicaLag !== undefined) {
96-
metrics.putMetric(
97-
MetricName.NPM_REPLICA_LAG,
98-
replicaLag,
99-
Unit.Seconds
100-
);
101-
}
10290
})();
10391

10492
for (const versionState of [
@@ -384,63 +372,6 @@ export class CanaryStateService {
384372
}
385373
}
386374

387-
/**
388-
* Estimate how far behind the NPM replica is compared to the live NPM
389-
* registry. If the NPM replica is down, return undefined.
390-
*/
391-
public async npmReplicaLagSeconds(
392-
packageName: string
393-
): Promise<number | undefined> {
394-
const encodedPackageName = encodeURIComponent(packageName);
395-
396-
console.log(`Measuring NPM replica lag using ${packageName}...`);
397-
398-
const primaryDate = await getModifiedTimestamp(`registry.npmjs.org`);
399-
400-
let replicaDate;
401-
try {
402-
replicaDate = await getModifiedTimestamp(
403-
`replicate.npmjs.com/registry`,
404-
REPLICA_REQUEST_TIMEOUT_MS
405-
);
406-
} catch (e) {
407-
if (e instanceof Error && e.message.includes('HTTP 504')) {
408-
console.log(
409-
`Warning: error fetching replicate.npmjs.com: ${e.toString()}`
410-
);
411-
// There is no value to report
412-
return undefined;
413-
} else {
414-
throw e;
415-
}
416-
}
417-
418-
const deltaMs = primaryDate.getTime() - replicaDate.getTime();
419-
420-
console.log(`Timestamp on primary: ${primaryDate.toISOString()}`);
421-
console.log(
422-
`Timestamp on replica: ${replicaDate.toISOString()} (${
423-
deltaMs / 3_600_000
424-
} hours behind)`
425-
);
426-
427-
// We return in seconds... The millisecond resolution is silly here since the probe package is
428-
// only published approximately once every three hours. We use seconds only because this is the
429-
// largest available time unit in CloudWatch.
430-
return deltaMs / 1_000;
431-
432-
async function getModifiedTimestamp(
433-
baseUrl: string,
434-
timeoutMillis?: number
435-
) {
436-
const isoDate = await getJSON(
437-
`https://${baseUrl}/${encodedPackageName}`,
438-
{ jsonPath: ['time', 'modified'], timeoutMillis }
439-
);
440-
return new Date(isoDate);
441-
}
442-
}
443-
444375
private key(packageName: string): string {
445376
return `${ObjectKey.STATE_PREFIX}${packageName}${ObjectKey.STATE_SUFFIX}`;
446377
}
@@ -534,6 +465,7 @@ function getJSON(
534465
headers: {
535466
Accept: 'application/json',
536467
'Accept-Encoding': 'identity',
468+
'npm-replication-opt-in': 'true', // can be deleted after May 29: https://github.com/orgs/community/discussions/152515
537469
},
538470
timeout: timeoutMillis,
539471
},

0 commit comments

Comments
 (0)