From a3b7cfd886a7fc2419e11c5d696cf466da814b6f Mon Sep 17 00:00:00 2001 From: Mauren Berti <698465+stormqueen1990@users.noreply.github.com> Date: Fri, 14 Jul 2023 13:07:49 -0400 Subject: [PATCH] Add in-place substitution option for linkchecker.py (#41983) * Add in-place replacement option for linkchecker.py Add a new flag '-w' to enable an experimental in-place replacement for Markdown links only. * Apply suggestions from code review Use formatted string literals instead of simple concatenation. Co-authored-by: Matt Boersma * Remove other paths that should not be changed. * Add more logic to remove paths that start with http or paths that are already linking to the localized page (i.e. start with '/'). * Apply suggestions from code review Simplify expressions. Co-authored-by: Matt Boersma * Avoid updating pages in English. * Fix syntax error in set comprehension * Expand on documentation for new -w flag * Update documentation for linkchecker.py in README * Add a blurb with information about the new -w switch that describes what it does and what is the purpose of adding this behaviour change. * Update the previously existing description to match the currently available script flags. --------- Co-authored-by: Matt Boersma --- scripts/README.md | 31 +++++++++++++++++-------------- scripts/linkchecker.py | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index 42e1afcfcd..54b032775c 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -7,12 +7,12 @@ | `test_examples.sh` | This script tests whether a change affects example files bundled in the website. | | `check-headers-file.sh` | This script checks the headers if you are in a production environment. | | `diff_l10n_branches.py` | This script generates a report of outdated contents in `content/` directory by comparing two l10n team milestone branches. 
| -| `hash-files.sh` | This script emits as hash for the files listed in $@ | -| `linkchecker.py` | This a link checker for Kubernetes documentation website. | -| `lsync.sh` | This script checks if the English version of a page has changed since a localized page has been committed. | -| `replace-capture.sh` | This script sets K8S_WEBSITE in your env to your docs website root or rely on this script to determine it automatically | -| `check-ctrlcode.py` | This script finds control-code(0x00-0x1f) in text files. | -| `ja/verify-spelling.sh` | This script finds Japanese words that are against the guideline. | +| `hash-files.sh` | This script emits as hash for the files listed in $@ | +| `linkchecker.py` | This a link checker for Kubernetes documentation website. | +| `lsync.sh` | This script checks if the English version of a page has changed since a localized page has been committed. | +| `replace-capture.sh` | This script sets K8S_WEBSITE in your env to your docs website root or rely on this script to determine it automatically | +| `check-ctrlcode.py` | This script finds control-code(0x00-0x1f) in text files. | +| `ja/verify-spelling.sh` | This script finds Japanese words that are against the guideline. | @@ -104,14 +104,17 @@ This script emits as hash for the files listed in $@. ## linkchecker.py This a link checker for Kubernetes documentation website. -- We cover the following cases for the language you provide via `-l`, which - defaults to 'en'. -- If the language specified is not English (`en`), we check if you are - actually using the localized links. For example, if you specify `zh` as - the language, and for link target `/docs/foo/bar`, we check if the English - version exists AND if the Chinese version exists as well. A checking record - is produced if the link can use the localized version. - +- If the language for the files scanned is not English (`en`), we check if you + are actually using the localized links. 
For example, if you specify a filter + similar to `content/zh-cn/docs/**/*.md`, we check if the English version + exists AND if the Chinese version exists as well. A checking record is + produced if the link can use the localized version. +- If the language specified is not English (`en`), a checking record is produced, + and the `-w` switch is used, the script will perform in-place substitutions + for links that have the format `/docs` and currently have a localized version + available. This is an experimental feature and aims to reduce the amount of + work required to update links to point to localized content. It currently + works for Markdown files only. ``` Usage: linkchecker.py -h diff --git a/scripts/linkchecker.py b/scripts/linkchecker.py index 3f719b26c3..4a2d9e1774 100755 --- a/scripts/linkchecker.py +++ b/scripts/linkchecker.py @@ -328,6 +328,7 @@ def check_target(page, anchor, target): return None msg = ("Localized page detected, please append '/%s' to the target" % LANG) + return new_record("ERROR", msg, target) # taget might be a redirect entry @@ -390,7 +391,7 @@ def check_apiref_target(target, anchor): target+"#"+anchor) -def validate_links(page): +def validate_links(page, in_place_edit): """Find and validate links on a content page. The checking records are consolidated into the global variable RESULT. 
@@ -410,10 +411,34 @@ def validate_links(page): matches = regex.findall(content) records = [] + target_records = [] for m in matches: r = check_target(page, m[0], m[1]) if r: records.append(r) + target_records.append(m[1]) + + # if multiple records are the same they need not be checked repeatedly + # remove paths that are not relative too + target_records = {item for item in target_records + if not item.startswith("http") and + not item.startswith(f"/{LANG}")} + + # English-language pages don't have "en" in their path + if in_place_edit and target_records and LANG != "en": + updated_data = [] + for line in data: + if any(rec in line for rec in target_records): + for rec in target_records: + line = line.replace( + f"({rec})", + # assumes unlocalized links are in "/docs/..." format + f"(/{LANG}{rec})") + updated_data.append(line) + + with open(page, "w") as f: + for line in updated_data: + f.write(line) # searches for pattern: {{< api-reference page="" anchor="" apiref_re = r"{{ *< *api-reference page=\"([^\"]*?)\" *anchor=\"(.*?)\"" @@ -455,6 +480,9 @@ def parse_arguments(): metavar="", help=("File pattern to scan. " "(default='content/en/docs/**/*.md')")) + PARSER.add_argument("-w", dest="in_place_edit", action="store_true", + help="[EXPERIMENTAL] Turns on in-place replacement " + "for localized content.") return PARSER.parse_args() @@ -500,7 +528,7 @@ def main(): folders = [f for f in glob.glob(ARGS.filter, recursive=True)] for page in folders: - validate_links(page) + validate_links(page, ARGS.in_place_edit) dump_result()