opentelemetry.io/scripts/double-check-refcache-400s.mjs

199 lines
5.9 KiB
JavaScript
Executable File

#!/usr/bin/env node
import fs from 'fs/promises';
import { getUrlStatus, isHttp2XX } from './get-url-status.mjs';
import { exit } from 'process';
// Path of the refcache JSON file that this script reads and rewrites.
const CACHE_FILE = 'static/refcache.json';
// Cache entries whose URL starts with this prefix are skipped, not re-checked.
const GOOGLE_DOCS_URL = 'https://docs.google.com/';
// When true, a 2XX entry whose URL has a fragment is only treated as done if
// its LastSeen carries the "fragment ok" marker defined below. Set from the
// command line at the bottom of this file.
let checkForFragments = false;
// Maximum number of cache entries updated per run. A value of 0 is falsy in
// the limit check, so it disables the limit. Can be overridden from the
// command line.
let maxNumEntriesToUpdate = 3;
// 404 entries under this prefix are re-checked instead of being skipped.
const cratesIoURL = 'https://crates.io/crates/';
// Magic numbers that we use to determine if a URL with a fragment has been
// checked with this script. Since we can't add new fields to the cache, we
// encode "magic" values in the LastSeen field.
// "Fragment ok" marker: seconds === 12 and milliseconds === 345.
const fragSecondsOk = 12;
const fragMillisecondsOk = 345;
// "Fragment invalid" marker: seconds === 59 and milliseconds === 999.
const fragSecondsInvalid = 59;
const fragMillisecondsInvalid = 999;
/**
 * Tells whether a cache entry has a 2XX status AND a LastSeen timestamp that
 * carries the "fragment ok" marker (seconds and milliseconds equal to
 * fragSecondsOk / fragMillisecondsOk).
 *
 * @param {number} StatusCode - Status stored in the cache entry.
 * @param {Date} lastSeenDate - The entry's LastSeen value as a Date.
 * @returns {boolean} True only when all three conditions hold.
 */
function isHttp2XXForFragments(StatusCode, lastSeenDate) {
  const statusIs2XX = isHttp2XX(StatusCode);
  // Hand back the helper's own falsy result so callers see the same value.
  if (!statusIs2XX) return statusIs2XX;
  if (lastSeenDate.getSeconds() !== fragSecondsOk) return false;
  return lastSeenDate.getMilliseconds() === fragMillisecondsOk;
}
/**
 * Tells whether a LastSeen timestamp carries the "fragment invalid" marker
 * (seconds and milliseconds equal to fragSecondsInvalid /
 * fragMillisecondsInvalid).
 *
 * @param {number} StatusCode - Accepted for signature symmetry with
 *   isHttp2XXForFragments; it is not consulted by this check.
 * @param {Date} lastSeenDate - The entry's LastSeen value as a Date.
 * @returns {boolean} True when both marker components match.
 */
function is4XXForFragments(StatusCode, lastSeenDate) {
  const secondsMatch = lastSeenDate.getSeconds() === fragSecondsInvalid;
  if (!secondsMatch) return false;
  return lastSeenDate.getMilliseconds() === fragMillisecondsInvalid;
}
/**
 * Loads CACHE_FILE and parses it as JSON.
 *
 * On any read or parse failure the error message is printed to stderr and
 * the process exits with status 1.
 *
 * @returns {Promise<any>} The parsed cache contents.
 */
async function readRefcache() {
  try {
    return JSON.parse(await fs.readFile(CACHE_FILE, 'utf8'));
  } catch (error) {
    const reason = error.message;
    console.error(`Error reading ${CACHE_FILE}:`, reason);
    process.exit(1);
  }
}
/**
 * Serializes the cache as 2-space-indented JSON followed by a newline,
 * writes it to CACHE_FILE, and logs a confirmation.
 *
 * @param {object} cache - The cache object to persist.
 * @returns {Promise<void>}
 */
async function writeRefcache(cache) {
  const serialized = JSON.stringify(cache, null, 2) + '\n';
  await fs.writeFile(CACHE_FILE, serialized, 'utf8');
  console.log(`Wrote updated ${CACHE_FILE}.`);
}
// Re-checks the status of refcache entries whose cached status is not 2XX,
// and rewrites the cache file when at least one entry was updated.
//
// Entries skipped without a network request:
//   - 2XX entries (when checkForFragments is set, a URL with a fragment must
//     additionally carry the "fragment ok" marker in LastSeen);
//   - 404 entries, except crates.io URLs;
//   - URLs with a fragment whose LastSeen carries the "fragment invalid"
//     marker;
//   - Google Docs URLs.
//
// The loop stops once maxNumEntriesToUpdate entries have been updated (a
// value of 0 disables the limit). A summary and per-status counts are logged
// at the end.
async function retry400sAndUpdateCache() {
console.log(`Checking ${CACHE_FILE} for 4XX status URLs ...`);
const cache = await readRefcache();
let updatedCount = 0;
let entriesCount = 0;
let urlWithFragmentCount = 0;
let urlWithInvalidFragCount = 0;
// Histogram keyed by status (plus a fragment label when checkForFragments).
let statusCounts = {};
for (const [url, details] of Object.entries(cache)) {
entriesCount++;
const parsedUrl = new URL(url);
if (parsedUrl.hash) urlWithFragmentCount++;
const { StatusCode, LastSeen } = details;
const lastSeenDate = new Date(LastSeen);
countStatuses(StatusCode, parsedUrl, lastSeenDate, statusCounts);
// Nothing to do for entries that are already OK. In fragment mode, a URL
// with a fragment only counts as OK once it carries the "fragment ok" marker.
if (
checkForFragments && parsedUrl.hash
? isHttp2XXForFragments(StatusCode, lastSeenDate)
: isHttp2XX(StatusCode)
) {
// process.stdout.write('.');
continue;
}
// Skip (and log) 404s other than crates.io, and URLs with a fragment that
// were already marked as having an invalid fragment.
if (
(StatusCode === 404 &&
// Handles special case of crates.io. For details, see:
// https://github.com/rust-lang/crates.io/issues/788
!url.startsWith(cratesIoURL)) ||
(parsedUrl.hash && is4XXForFragments(StatusCode, lastSeenDate))
) {
console.log(
`Skipping ${StatusCode}: ${url} (last seen ${lastSeenDate.toLocaleDateString()})${
is4XXForFragments(StatusCode, lastSeenDate) ? ' INVALID FRAGMENT' : ''
}`,
);
// Note: every skipped URL that has a fragment is counted here, including
// plain 404s that do not carry the "fragment invalid" marker.
if (parsedUrl.hash) urlWithInvalidFragCount++;
continue;
}
if (url.startsWith(GOOGLE_DOCS_URL)) {
// console.log(`Skipping Google Docs URL (for now): ${url}.`);
// process.stdout.write('.');
continue;
/*
URLs are of the form:
https://docs.google.com/document/d/15vR7D1x2tKd7u3zaTF0yH1WaHkUr2T4hhr7OyiZgmBg/edit?tab=t.0#heading=h.4xuru5ljcups
We can simply check for the presence of the heading query parameter value in the page.
"ps_hdid":"h.4xuru5ljcups" # cSpell:disable-line
*/
}
// Enforce the per-run update limit; a limit of 0 is falsy and disables it.
if (maxNumEntriesToUpdate && updatedCount >= maxNumEntriesToUpdate) {
console.log(`Updated max of ${maxNumEntriesToUpdate} entries, exiting.`);
break;
}
process.stdout.write(
`Checking${
parsedUrl.hash ? ` for fragment in` : `:`
} ${url} (was ${StatusCode}) ... `,
);
// NOTE(review): getUrlStatus is defined in get-url-status.mjs; the comment
// further down suggests it can return a custom 4XX status for a bad
// fragment -- confirm against that module.
let status = await getUrlStatus(url);
console.log(`${status}.`);
let now = new Date();
if (parsedUrl.hash) {
if (isHttp2XX(status)) {
// Encode that the fragment was checked and is valid.
now.setSeconds(fragSecondsOk);
now.setMilliseconds(fragMillisecondsOk);
} else {
status = StatusCode; // Keep the original status, rather than our custom 4XX status.
// Encode that the fragment was checked and is invalid.
now.setSeconds(fragSecondsInvalid);
now.setMilliseconds(fragMillisecondsInvalid);
urlWithInvalidFragCount++;
}
} else if (!isHttp2XX(status)) {
// URL without a fragment that is still failing: leave its entry untouched.
continue;
}
// Record the new status and timestamp (with marker, for fragment URLs).
cache[url] = {
StatusCode: status,
LastSeen: now.toISOString(),
};
updatedCount++;
}
// Only rewrite the cache file when something actually changed.
if (updatedCount) {
await writeRefcache(cache);
} else {
console.log(`No updates needed.`);
}
console.log(
`Processed ${entriesCount} URLs${
checkForFragments
? ` (${urlWithFragmentCount} with fragments, ${urlWithInvalidFragCount} are invalid)`
: ''
}`,
);
for (const [status, count] of Object.entries(statusCounts)) {
console.log(`Status ${status}: ${count}`);
}
}
/**
 * Increments the histogram bucket for one cache entry.
 *
 * The bucket key is the status code. When checkForFragments is enabled the
 * key is suffixed with ' no frag' for URLs without a fragment, or with
 * ' frag ok' / ' frag er' depending on isHttp2XXForFragments.
 *
 * @param {number} StatusCode - Status stored in the cache entry.
 * @param {URL} parsedUrl - Parsed entry URL; only its `hash` is read.
 * @param {Date} lastSeenDate - The entry's LastSeen value as a Date.
 * @param {object} statusCounts - Histogram object, mutated in place.
 */
function countStatuses(StatusCode, parsedUrl, lastSeenDate, statusCounts) {
  let bucket = StatusCode;
  if (checkForFragments) {
    let label = ' no frag';
    if (parsedUrl.hash) {
      const verdict = isHttp2XXForFragments(StatusCode, lastSeenDate)
        ? 'ok'
        : 'er';
      label = ' frag ' + verdict;
    }
    bucket += label;
  }
  const previous = statusCounts[bucket] || 0;
  statusCounts[bucket] = previous + 1;
}
/**
 * Reads a non-negative integer command-line flag from `process.argv`.
 *
 * Both `--flag=N` and `--flag N` forms are accepted. Only an exact flag name
 * (optionally followed by `=`) matches, so a longer flag that merely shares
 * the prefix is not mistaken for this one.
 *
 * If the flag is present but its value is missing, not a number, or
 * negative, an error is printed to stderr and the process exits with
 * status 1.
 *
 * @param {string} flagName - Flag to look for, e.g. '--max-num-to-update'.
 * @returns {number|undefined} The parsed value, or undefined when the flag
 *   is absent.
 */
function getNumericFlagValue(flagName) {
  const { argv } = process;
  const flagIndex = argv.findIndex(
    (arg) => arg === flagName || arg.startsWith(`${flagName}=`),
  );
  if (flagIndex === -1) return;

  const flagArg = argv[flagIndex];
  // `--flag N` takes the next argument; `--flag=N` takes the text after '='.
  const valueArg =
    flagArg === flagName
      ? argv[flagIndex + 1]
      : flagArg.slice(flagName.length + 1);

  const value = Number.parseInt(valueArg, 10);
  // NaN covers both a missing value and a non-numeric one; without this
  // check it would slip past the `< 0` test and be silently ignored.
  if (Number.isNaN(value) || value < 0) {
    console.error(
      `ERROR: invalid value for ${flagName}: ${valueArg}. ` +
        `Must be a number >= 0.`,
    );
    process.exit(1);
  }
  return value;
}
// Script entry point: apply command-line overrides to the module-level
// settings, then run the re-check.
const _maxNumEntriesToUpdateFlag = getNumericFlagValue('--max-num-to-update');
// An absent flag yields undefined, which fails the >= 0 comparison and
// leaves the default in place.
if (_maxNumEntriesToUpdateFlag >= 0) {
  maxNumEntriesToUpdate = _maxNumEntriesToUpdateFlag;
}
checkForFragments = ['--check-for-fragments', '-f'].some((flag) =>
  process.argv.includes(flag),
);
await retry400sAndUpdateCache();