From 7fde0426dfc9d966cfaabb00192019ee3c81b84d Mon Sep 17 00:00:00 2001 From: s-kawamura-w664 Date: Tue, 30 Mar 2021 07:12:44 +0000 Subject: [PATCH] Add script for detecting bad characters. Co-authored-by: Shu Muto --- scripts/README.md | 26 +++++++++++++++++++ scripts/check-ctrlcode.py | 54 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100755 scripts/check-ctrlcode.py diff --git a/scripts/README.md b/scripts/README.md index 4f7ee7eec3..02e3e66ff0 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -11,6 +11,7 @@ | `linkchecker.py` | This a link checker for Kubernetes documentation website. | | `lsync.sh` | This script checks if the English version of a page has changed since a localized page has been committed. | | `replace-capture.sh` | This script sets K8S_WEBSITE in your env to your docs website root or rely on this script to determine it automatically | +| `check-ctrlcode.py` | This script finds control-code(0x00-0x1f) in text files. | @@ -152,3 +153,28 @@ The following command checks a subdirectory: ./scripts/lsync.sh content/zh/docs/concepts/ +## check-ctrlcode.py + +This script finds control codes (0x00-0x1f, except HT, LF and CR) in text files. +Such characters are displayed as illegal characters in a browser. + +``` +Usage: ./check-ctrlcode.py <dir> <ext> + + <dir> Specify the directory to check. + <ext> Specify the extension to check. + +For example, run it as follows. + + ./check-ctrlcode.py ../content/en/ .md + +The output has the following format. + + "{0} <L{1}:{2}:{3}> : {4}" + + {0} : The path of the file that contains a control code. + {1} : The line number where the control code was found. + {2} : The column number where the control code was found. + {3} : The control code that was found. + {4} : The text of that line in the file. 
#!/usr/bin/env python3
"""Find control codes in text files below a directory (scripts/check-ctrlcode.py).

Usage: ./check-ctrlcode.py <dir> <ext>

    <dir>  Directory to check (searched recursively).
    <ext>  File extension to check, including the dot (for example ".md").

Each matching file is read as UTF-8 and scanned line by line for characters
in the range 0x00-0x1f, excluding 0x09 (HT), 0x0a (LF) and 0x0d (CR).
"""

import os
import sys
import re

# 0x00-0x1f except 0x09 (HT), 0x0a (LF) and 0x0d (CR). Compiled once here
# instead of once per scanned line.
_CTRL_PATTERN = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")


def main():
    """Parse the command line and scan the requested directory.

    Exits with status 1 when the argument count is wrong or when the given
    path is not an existing directory.
    """
    args = sys.argv
    if len(args) != 3:
        print("Usage: ./check-ctrlcode.py <dir> <ext>")
        sys.exit(1)

    dirpath, ext = args[1], args[2]

    fullpath = os.path.abspath(dirpath)
    if not os.path.isdir(fullpath):
        print("Directory not found.")
        sys.exit(1)

    check_dir(fullpath, ext)


def check_dir(path, ext):
    """Recursively check every file under *path* whose extension equals *ext*.

    Entries whose name starts with "." (files and directories alike) are
    skipped. Entries are visited in sorted order so that the report is
    reproducible from run to run.
    """
    for name in sorted(os.listdir(path)):
        if name.startswith("."):
            continue
        fullpath = os.path.join(path, name)
        if os.path.isdir(fullpath):
            check_dir(fullpath, ext)
        elif os.path.splitext(name)[1] == ext:
            check_ctrlcode(fullpath)


def check_ctrlcode(filepath):
    """Print one report per line of *filepath* that contains a control code.

    Only the first control code on a line is reported. Each report has the
    form ``{path} <L{line}:{column}:{code}> : {text}`` followed by an empty
    line, where line and column are 1-based, code is the UTF-8 encoded
    control character and text is the line without its newline.

    A file that cannot be decoded as UTF-8 is reported on stderr and the
    scan carries on, so one badly encoded file does not abort the whole run.
    """
    try:
        with open(filepath, encoding='utf-8') as f:
            for lineno, text in enumerate(f, start=1):
                m = _CTRL_PATTERN.search(text)
                if m is None:
                    continue
                # The match is a single character, so end() is its 1-based column.
                pos = m.end()
                ctrl = m.group().encode("utf-8")
                print("{0} <L{1}:{2}:{3}> : {4}\n".format(
                    filepath, lineno, pos, ctrl, text.replace('\n', '')))
    except UnicodeDecodeError as err:
        print("{0} : cannot decode as UTF-8 ({1})".format(filepath, err),
              file=sys.stderr)


if __name__ == "__main__":
    main()