426 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
			
		
		
	
	
			426 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
| #!/usr/bin/env python3
 | |
| #
 | |
| # This a link checker for Kubernetes documentation website.
 | |
| # - We cover the following cases for the language you provide via `-l`, which
 | |
| #   defaults to 'en'.
 | |
| # - If the language specified is not English (`en`), we check if you are
 | |
| #   actually using the localized links. For example, if you specify `zh` as
 | |
| #   the language, and for link target `/docs/foo/bar`, we check if the English
 | |
| #   version exists AND if the Chinese version exists as well. A checking record
 | |
| #   is produced if the link can use the localized version.
 | |
| #
 | |
| # Usage: linkchecker.py -h
 | |
| #
 | |
| # Cases handled:
 | |
| #
 | |
| # - [foo](#bar)                         : ignored currently
 | |
| # + [foo](http://bar)                   : insecure links to external site
 | |
| # + [foo](https://k8s.io/website/...)   : hardcoded site domain name
 | |
| #
 | |
| # + [foo](/<lang>/docs/bar/...)  : where <lang> is not 'en'
 | |
| #   + /<lang>/docs/bar           : contains shortcode, so ignore, or
 | |
| #   + /<lang>/docs/bar           : is a image link (ignore currently), or
 | |
| #   + /<lang>/docs/bar           : points to shared (non-localized) page, or
 | |
| #   + /<lang>/docs/bar.md        : exists for current lang, or
 | |
| #   + /<lang>/docs/bar/_index.md : exists for current lang, or
 | |
| #   + /<lang>/docs/bar/          : is a redirect entry, or
 | |
| #   + /<lang>/docs/bar           : is something we don't understand, then ERR
 | |
| #
 | |
| # + [foo](/docs/bar/...)
 | |
| #   + /docs/bar                : contains shortcode, so ignore, or
 | |
| #   + /docs/bar                : is a image link (ignore currently), or
 | |
| #   + /docs/bar                : points to a shared (non-localized) page, or
 | |
| #   + /docs/bar.md             : exists for current lang, or
 | |
| #   + /docs/bar/_index.md      : exists for current lang, or
 | |
| #   + /docs/bar                : is a redirect entry, or
 | |
| #   + /docs/bar                : is something we don't understand
 | |
| #
 | |
| 
 | |
| import argparse
 | |
| import glob
 | |
| import os
 | |
| import re
 | |
| import sys
 | |
| 
 | |
| # These are the bad links that doesn't hurt, though good to fix
 | |
| BAD_LINK_TYPES = {
 | |
|     "B01": {
 | |
|         "reason": "Using bad protocol",
 | |
|         "level": "WARNING",
 | |
|     },
 | |
|     "B02": {
 | |
|         "reason": "Link target is a redirect entry",
 | |
|         "level": "WARNING",
 | |
|     },
 | |
|     "B03": {
 | |
|         "reason": "Intra-site linkes should use relative path",
 | |
|         "level": "WARNING",
 | |
|     },
 | |
| }
 | |
| 
 | |
| # Constants for colored printing
 | |
| C_RED = "\033[31m"
 | |
| C_GREEN = "\033[32m"
 | |
| C_YELLOW = "\033[33m"
 | |
| C_GRAY  = "\033[90m"
 | |
| C_CYAN = "\033[36m"
 | |
| C_END = "\033[0m"
 | |
| 
 | |
| # Command line arguments shared across functions
 | |
| ARGS = None
 | |
| # Global result dictionary keyed by page examined
 | |
| RESULT = {}
 | |
| # Cached redirect entries
 | |
| REDIRECTS = {}
 | |
| 
 | |
| 
 | |
| def new_record(level, message, target):
 | |
|     """Create new checking record.
 | |
| 
 | |
|     :param level: Record severity level, one of 'INFO', 'WARNING' and 'ERROR'
 | |
|     :param message: Error message string
 | |
|     :param target: The link target in question
 | |
|     :returns: A string representation the checking result, may contain ASCII
 | |
|               coded terminal colors, or None if the record is suppressed.
 | |
|     """
 | |
|     global ARGS
 | |
| 
 | |
|     # Skip info when verbose
 | |
|     if ARGS.verbose == False and level == "INFO":
 | |
|         return None
 | |
| 
 | |
|     result = None
 | |
|     if ARGS.no_color:
 | |
|         result = target + ": " + message
 | |
|     else:
 | |
|         target = C_GRAY + target + C_END
 | |
|         if level == "INFO":
 | |
|             result =  target + ": " + C_GREEN  + message + C_END 
 | |
|         elif level == "WARNING":
 | |
|             result = target + ": " + C_YELLOW+ message + C_END
 | |
|         else:  # default to error
 | |
|             result = target + ": " + C_RED + message + C_END
 | |
| 
 | |
|     return result
 | |
| 
 | |
| 
 | |
| def dump_result():
 | |
|     """Dump result to stdout."""
 | |
|     global RESULT, ARGS
 | |
| 
 | |
|     for path, path_output in RESULT.items():
 | |
|         norm_path = os.path.normpath(path)
 | |
|         if ARGS.no_color:
 | |
|             print("File: " + norm_path)
 | |
|         else:
 | |
|             print(C_CYAN + "File: " + norm_path + C_END)
 | |
|         for p in path_output:
 | |
|             print(" "*4 + p)
 | |
|     return
 | |
| 
 | |
| 
 | |
| def strip_comments(content):
 | |
|     """Manual striping of comments from file content.
 | |
| 
 | |
|     Many localized content pages contain original English content in comments.
 | |
|     These comments have to be stripped out before analyzing the links.
 | |
|     Doing this using regular expression is difficult. Even the grep tool is
 | |
|     not suitable for this use case.
 | |
| 
 | |
|     NOTE: We strived to preserve line numbers when producing the resulted
 | |
|     text. This can be useful in future if we want to print out the line
 | |
|     numbers for bad links.
 | |
|     """
 | |
|     result = []
 | |
|     in_comment = False
 | |
|     for line in content:
 | |
|         idx1 = line.find("<!--")
 | |
|         idx2 = line.find("-->")
 | |
|         if not in_comment:
 | |
|             # only care if new comment started
 | |
|             if idx1 < 0:
 | |
|                 result.append(line)
 | |
|                 continue
 | |
| 
 | |
|             # single line comment
 | |
|             if idx2 > 0:
 | |
|                 result.append(line[:idx1] + line[idx2+4:])
 | |
|                 continue
 | |
|             result.append(line[:idx1])
 | |
|             in_comment = True
 | |
|             continue
 | |
| 
 | |
|         # already in comment block
 | |
|         if idx2 < 0:  # ignore whole line
 | |
|             result.append("")
 | |
|             continue
 | |
|         result.append(line[idx2+4:])
 | |
|         in_comment = False
 | |
| 
 | |
|     return result
 | |
| 
 | |
| 
 | |
| def normalize_filename(name, ftype="markdown"):
 | |
|     """Guess the filename based on a link target.
 | |
| 
 | |
|     This function only deals with regular files.
 | |
|     """
 | |
|     if name.endswith("/"):
 | |
|         name = name[:-1]
 | |
|     if ftype == "markdown":
 | |
|         name += ".md"
 | |
|     else:
 | |
|         name += ".html"
 | |
|     return name
 | |
| 
 | |
| 
 | |
| def check_file_exists(base, path, ftype="markdown"):
 | |
|     """Check if the target file exists.
 | |
| 
 | |
|     NOTE: We build a normalized path using 'base' and 'path' values. Suppose
 | |
|     the resulted path string is 'foo/bar', we check if 'foo/bar.md' exists,
 | |
|     AND we check if 'foo/bar/_index.md' exists.
 | |
| 
 | |
|     :param base: The base directory to begin with
 | |
|     :param path: The link target which is a relative path string
 | |
|     :returns: A boolean indicating whether the target file exists.
 | |
|     """
 | |
|     # NOTE: anchor is ignored, can be a todo item
 | |
|     parts = path.split("#")
 | |
| 
 | |
|     fn = normalize_filename(parts[0], ftype=ftype)
 | |
|     target = base + fn
 | |
| 
 | |
|     if os.path.isfile(target):
 | |
|         return True
 | |
| 
 | |
|     dir_name = base + parts[0]
 | |
|     if os.path.isdir(dir_name):
 | |
|         if os.path.isfile(dir_name + "/_index.md"):
 | |
|             return True
 | |
|         if os.path.isfile(dir_name + "/_index.html"):
 | |
|             return True
 | |
|         # /docs/contribute/style/hugo-shortcodes/ has this
 | |
|         if os.path.isfile(dir_name + "/index.md"):
 | |
|             return True
 | |
|     return False
 | |
| 
 | |
| 
 | |
| def get_redirect(path):
 | |
|     """Check if the path exists in the redirect database.
 | |
| 
 | |
|     NOTE: We do NOT check if the redirect target is there or not. We do an
 | |
|     **exact** matching for redirection entries.
 | |
|     :returns: The redirect target if any, or None if not found.
 | |
|     """
 | |
|     global REDIRECTS
 | |
| 
 | |
|     def _check_redirect(t):
 | |
|         for key, value in REDIRECTS.items():
 | |
|             if key == t:  # EXACT MATCH
 | |
|                 return value
 | |
|         return None
 | |
| 
 | |
|     # NOTE: anchor is ignored, can be a future todo
 | |
|     parts = path.split("#")
 | |
|     target = parts[0]
 | |
|     if not target.endswith("/"):
 | |
|         target += "/"
 | |
| 
 | |
|     new_target = _check_redirect(target)
 | |
|     last_target = new_target
 | |
|     while new_target:
 | |
|         new_target = _check_redirect(new_target)
 | |
|         if new_target is None:
 | |
|             break
 | |
|         last_target = new_target
 | |
| 
 | |
|     return last_target
 | |
| 
 | |
| 
 | |
| def check_target(page, anchor, target):
 | |
|     """Check a link from anchor to target on provided page.
 | |
| 
 | |
|     :param page: Currently not used. Passed here in case we want to check the
 | |
|                  in-page links in the future.
 | |
|     :param anchor: Anchor string from the content page. This is provided to
 | |
|                 help handle cases where target is empty.
 | |
|     :param target: The link target string to check
 | |
|     :returns: A checking record (string) if errors found, or None if we can
 | |
|               find the target link.
 | |
|     """
 | |
|     target = target.strip()
 | |
|     # B01: bad protocol
 | |
|     if target.startswith("http://"):
 | |
|         return new_record("WARNING", "Use HTTPS rather than HTTP", target)
 | |
| 
 | |
|     # full link
 | |
|     if target.startswith("https://"):
 | |
|         # B03: self link, should revise to relative path
 | |
|         if (target.startswith("https://k8s.io/docs") or
 | |
|                 target.startswith("https://kubernetes.io/docs")):
 | |
|             return new_record("ERROR", "Should use relative paths", target)
 | |
|         # external link, skip
 | |
|         return new_record("INFO", "External link, skipped", target)
 | |
| 
 | |
|     # in-page link
 | |
|     # TODO: check if the target anchor does exists
 | |
|     if target.startswith("#"):
 | |
|         return new_record("INFO", "In-page link, skipped", target)
 | |
| 
 | |
|     # Link has shortcode
 | |
|     if target.find("{{") > 0:
 | |
|         return new_record("INFO", "Link has shortcode, skipped", target)
 | |
| 
 | |
|     # TODO: check links to examples
 | |
|     if target.startswith("/examples/"):
 | |
|         return new_record("WARNING", "Examples link, skipped", target)
 | |
| 
 | |
|     # it is an embedded image
 | |
|     # TODO: an image might get translated as well
 | |
|     if target.endswith(".png") or target.endswith(".svg"):
 | |
|         return new_record("INFO", "Link to image, skipped", target)
 | |
| 
 | |
|     # link to English or localized page
 | |
|     if (target.startswith("/docs/") or
 | |
|             target.startswith("/" + ARGS.lang + "/docs/")):
 | |
| 
 | |
|         # target is shared reference (kubectl or kubernetes-api?
 | |
|         if (target.find("/docs/reference/generated/kubectl/") >= 0 or
 | |
|                 target.find("/docs/reference/generated/kubernetes-api/") >= 0):
 | |
|             if check_file_exists(ROOT + "/static", target, "html"):
 | |
|                 return None
 | |
|             return new_record("ERROR", "Missing shared reference", target)
 | |
| 
 | |
|         # target is a markdown (.md) or a "<dir>/_index.md"?
 | |
|         if target.startswith("/docs/"):
 | |
|             base = os.path.join(ROOT, "content", "en")
 | |
|         else:
 | |
|             # localized target
 | |
|             base = os.path.join(ROOT, "content")
 | |
|         ok = check_file_exists(base, target)
 | |
|         if ok:
 | |
|             # We do't do additional checks for English site even if it has
 | |
|             # links to a non-English page
 | |
|             if ARGS.lang == "en":
 | |
|                 return None
 | |
| 
 | |
|             # If we are already checking localized link, fine
 | |
|             if target.startswith("/" + ARGS.lang + "/docs/"):
 | |
|                 return None
 | |
| 
 | |
|             # additional check for localization even if English target exists
 | |
|             base = os.path.join(ROOT, "content", ARGS.lang)
 | |
|             found = check_file_exists(base, target)
 | |
|             if not found:
 | |
|                 # Still to be translated
 | |
|                 return None
 | |
|             msg = ("Localized page detected, please append '/%s' to the target"
 | |
|                    % ARGS.lang)
 | |
|             return new_record("ERROR", "Link not using localized page", target)
 | |
| 
 | |
|         # taget might be a redirect entry
 | |
|         real_target = get_redirect(target)
 | |
|         if real_target:
 | |
|             msg = ("Link using redirect records, should use %s instead" %
 | |
|                    real_target)
 | |
|             return new_record("WARNING", msg, target)
 | |
|         return new_record("ERROR", "Missing link for [%s]" % anchor, target)
 | |
| 
 | |
|     msg = "Link may be wrong for the anchor [%s]" % anchor
 | |
|     return new_record("WARNING", msg, target)
 | |
| 
 | |
| 
 | |
| def validate_links(page):
 | |
|     """Find and validate links on a content page.
 | |
| 
 | |
|     The checking records are consolidated into the global variable RESULT.
 | |
|     """
 | |
|     try:
 | |
|         with open(page, "r") as f:
 | |
|             data = f.readlines()
 | |
|     except Exception as ex:
 | |
|         print("[Error] failed in reading markdown file: " + str(ex))
 | |
|         return
 | |
| 
 | |
|     content = "\n".join(strip_comments(data))
 | |
| 
 | |
|     # Single results: searches for pattern: []()
 | |
|     link_pattern = r"\[([`/\w\s\n]*)\]\(([^\)]*)\)"
 | |
|     regex = re.compile(link_pattern)
 | |
| 
 | |
|     matches = regex.findall(content)
 | |
|     records = []
 | |
|     for m in matches:
 | |
|         r = check_target(page, m[0], m[1])
 | |
|         if r:
 | |
|             records.append(r)
 | |
|     if len(records):
 | |
|         RESULT[page] = records
 | |
| 
 | |
| 
 | |
| def parse_arguments():
 | |
|     """Argument parser.
 | |
| 
 | |
|     Result is returned and saved into global variable ARGS.
 | |
|     """
 | |
|     parser = argparse.ArgumentParser(description="Links checker for docs.")
 | |
|     parser.add_argument("-l", dest="lang", default="en", metavar="<LANG>",
 | |
|                         help=("two letter language code, e.g. 'zh'. "
 | |
|                               "(default='en')"))
 | |
|     parser.add_argument("-v", dest="verbose", action="store_true",
 | |
|                         help="switch on verbose level")
 | |
|     parser.add_argument("-f", dest="filter", default="/docs/**/*.md",
 | |
|                         metavar="<FILTER>",
 | |
|                         help=("File pattern to scan, e.g. '/docs/foo.md'. "
 | |
|                               "(default='/docs/**/*.md')"))
 | |
|     parser.add_argument("-n", "--no-color", action="store_true",
 | |
|                         help="Suppress colored printing.")
 | |
| 
 | |
|     return parser.parse_args()
 | |
| 
 | |
| 
 | |
| def main():
 | |
|     """The main entry of the program."""
 | |
|     global ARGS, ROOT, REDIRECTS
 | |
| 
 | |
|     ARGS = parse_arguments()
 | |
|     print("Language: " + ARGS.lang)
 | |
|     ROOT = os.path.join(os.path.dirname(__file__), '..')
 | |
|     content_dir = os.path.join(ROOT, 'content')
 | |
|     lang_dir = os.path.join(content_dir, ARGS.lang)
 | |
| 
 | |
|     # read redirects data
 | |
|     redirects_fn = os.path.join(ROOT, "static", "_redirects")
 | |
|     try:
 | |
|         with open(redirects_fn, "r") as f:
 | |
|             data = f.readlines()
 | |
|         for item in data:
 | |
|             parts = item.split()
 | |
|             # There are entries without 301 specified
 | |
|             if len(parts) < 2:
 | |
|                 continue
 | |
|             entry = parts[0]
 | |
|             # There are some entries not ended with "/"
 | |
|             if entry.endswith("/"):
 | |
|                 REDIRECTS[entry] = parts[1]
 | |
|             else:
 | |
|                 REDIRECTS[entry + "/"] = parts[1]
 | |
| 
 | |
|     except Exception as ex:
 | |
|         print("[Error] failed in reading redirects file: " + str(ex))
 | |
|         return
 | |
| 
 | |
|     folders = [f for f in glob.glob(lang_dir + ARGS.filter, recursive=True)]
 | |
|     for page in folders:
 | |
|         validate_links(page)
 | |
| 
 | |
|     dump_result()
 | |
| 
 | |
|     # Done
 | |
|     print("Completed link validation.")
 | |
| 
 | |
| 
 | |
| if __name__ == '__main__':
 | |
|     sys.exit(main())
 |