#!/usr/bin/env node import puppeteer from 'puppeteer'; // Consider using puppeteer-core import { URL } from 'url'; const DOCS_ORACLE_URL = 'https://docs.oracle.com/'; const STATUS_OK_BUT_FRAG_NOT_FOUND = 422; const cratesIoURL = 'https://crates.io/crates/'; let verbose = false; export function log(...args) { if (!verbose) return; const lastArg = args[args.length - 1]; if (typeof lastArg === 'string' && lastArg.endsWith(' ')) { process.stdout.write(args.join(' ')); } else { console.log(...args); } } // Check for fragment and corresponding anchor ID in page. async function checkForFragment(_url, page, status) { // FIXME: htmltest seems to mistakenly double escape '+' in URLs, and // ampersands as `\u0026`. Let's attempt to patch that there. TODO: address // this upstream; at least create an issue. // DISABLING FOR NOW: const url = _url; // .replace(/+/g, '+').replace(/\\u0026/g, '&'); const parsedUrl = new URL(url); if (parsedUrl.hash) { let fragmentID = parsedUrl.hash.substring(1); // Remove the leading '#' // if (url.startsWith(DOCS_ORACLE_URL)) { // Would also need for GitHub.com fragmentID = decodeURIComponent(fragmentID); // } let anchorExists = // // Look for ID attribute in the page. // (await page.evaluate((id) => { return !!document.getElementById(id); }, fragmentID)) || // // Look for named anchors // (await page.evaluate((name) => { const elt = document.querySelector(`a[name="${name}"]`); return !!elt; }, fragmentID)) || // // Github.com repo special cases // (url.startsWith('https://github.com/') && (await anchorExistsInGitHub(page, fragmentID))); if (!anchorExists) status = STATUS_OK_BUT_FRAG_NOT_FOUND; } return status; } async function anchorExistsInGitHub(page, fragmentID) { if (/L\d+(-L\d+)?/.test(fragmentID)) { // Handle line references in GitHub repos. return await page.evaluate((name) => { // Look for references to the fragment in the page, possibly with an // `-ov-file` suffix (used as anchors of tabs in repo landing pages). return !!document.querySelector('div.highlighted-line'); }, fragmentID); } // Handle other fragment references in GitHub repos, link references // to files (such as README), or to headings inside of displayed markdown. return await page.evaluate((name) => { // Look for references to the fragment in the page, possibly with an // `-ov-file` suffix (used as anchors of tabs in repo landing pages). const elt = document.querySelector( `a[href="#${name}"], a[href="#${name}-ov-file"]`, ); return !!elt; }, fragmentID); } async function getUrlHeadless(url) { // Get the URL, headless, while trying our best to avoid triggering // bot-rejection from some servers. Returns the HTTP status code. log(`Fetch ${url} headless ... `); let browser; try { // cSpell:ignore KHTML const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'; browser = await puppeteer.launch({ headless: true, args: [ '--no-sandbox', '--disable-setuid-sandbox', `--user-agent=${userAgent}`, ], }); const page = await browser.newPage(); await page.setUserAgent(userAgent); await page.setExtraHTTPHeaders({ 'Accept-Language': 'en-US,en;q=0.9', }); const response = await page.goto(url, { waitUntil: 'networkidle2', timeout: 10_000, }); if (!response) throw new Error('No response from server.'); let status = response.status(); const title = await page.title(); // Handles special case of crates.io. For details, see: // https://github.com/rust-lang/crates.io/issues/788 if (url.startsWith(cratesIoURL)) { const crateName = url.split('/').pop(); // E.g. 'https://crates.io/crates/opentelemetry-sdk' -> 'opentelemetry-sdk' const crateNameRegex = new RegExp(crateName.replace(/-/g, '[-_]')); // Crate found if title starts with createName (in kebab or snake case) if (!crateNameRegex.test(title)) status = 404; } status = await checkForFragment(url, page, status); log(`${status}; page title: '${title}'`); return status; } catch (error) { console.error(`Error: ${error.message}`); return null; } finally { if (browser) await browser.close(); } } async function getUrlInBrowser(url) { let browser; try { browser = await puppeteer.launch({ headless: false }); const page = await browser.newPage(); const response = await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000, }); if (!response) throw new Error('No response from server.'); let status = response.status(); const title = await page.title(); status = await checkForFragment(url, page, status); log(`${status}; page title: '${title}'`); return status; } catch (error) { console.error(`Error fetching ${url}:`, error.message); return null; } finally { if (browser) await browser.close(); } } export function isHttp2XX(status) { return status && status >= 200 && status < 300; } export async function getUrlStatus(url, _verbose = false) { verbose = _verbose; let status = await getUrlHeadless(url); // If headless fetch fails, try in browser for non-404 statuses if (!isHttp2XX(status) && status !== 404 && status !== 422) { log(`\n\t retrying in browser ... `); status = await getUrlInBrowser(url); } return status; } async function mainCLI() { const url = process.argv[2]; verbose = !process.argv.includes('--quiet'); if (!url) { console.error(`Usage: ${process.argv[1]} URL`); process.exit(1); } const status = await getUrlStatus(url, verbose); if (!verbose) console.log(status); process.exit(isHttp2XX(status) ? 0 : 1); } // Only run if script is executed directly (CLI) if (import.meta.url === `file://${process.argv[1]}`) await mainCLI();