mirror of https://github.com/kubeflow/website.git
Add script to validate urls in .md (#1684)
* Add script to validate urls in .md * add detailed README comments * only consider > 400 response as invalid * address comments
This commit is contained in:
parent
ee98e0101a
commit
ef02bb2826
|
|
@ -0,0 +1,94 @@
|
|||
# This script finds .md files under a directory and its subdirectories, extracts
|
||||
# http/https URLs from .md files and validates them.
|
||||
#
|
||||
# This script can be run periodically on kubeflow/website source repository
|
||||
# to find outdated URLs, which indicate possible outdated document sections.
|
||||
#
|
||||
# To run this script, type the following on the command line:
|
||||
# python3.8 validate-urls.py -d /path/to/kubeflow/website/content/docs
|
||||
#
|
||||
# Input:
|
||||
# The path of a directory that contains .md files as `-d` command line flag.
|
||||
#
|
||||
# Output:
|
||||
# STDOUT logs in the format of `<file>: <URL> , <status>` and a summary of all
|
||||
# invalid URLs at the end.
|
||||
#
|
||||
# Dependency:
|
||||
# You may need to install the `requests` Python package via command line:
|
||||
# python3.8 -m pip install requests
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import requests
|
||||
|
||||
# Command-line interface: one optional flag selecting the docs tree to scan.
parser = argparse.ArgumentParser(
    description='Validate all URLs in the kubeflow.org website')

parser.add_argument(
    '-d', '--dir',
    dest='input_dir',
    nargs='?',
    # Default assumes the script runs from the parent of the repo checkout.
    default='kubeflow/website/content',
    help='Path to the doc content folder. (Default: %(default)s)',
)
|
||||
|
||||
# Matches http/https URLs embedded in Markdown text. Raw string is required:
# the escapes (\-, \., \$) inside the character classes are invalid escape
# sequences in a plain string literal (SyntaxWarning on modern Python).
# The first character class deliberately excludes digits.
HTTP_PATTERN = re.compile(
    r'http[s]?://[a-zA-Z\-_?/*\.#\$][a-zA-Z0-9\-_?/*\.#\$]+')
|
||||
|
||||
# Patterns in this white list are considered valid and are never fetched.
# Raw strings: \. in a plain literal is an invalid escape sequence
# (SyntaxWarning on modern Python); behavior is otherwise unchanged.
WHITE_LIST = [
    re.compile(r'http[s]?://localhost'),          # local-only example URLs
    re.compile(r'http[s]?://\.\.'),               # placeholders like https://......
    re.compile(r'https://path/to/component.yaml'),  # doc placeholder path
    re.compile(r'https://github.com/kubeflow/kfctl/releases/tag')
]


def should_skip(url):
    """Return True if `url` matches a WHITE_LIST pattern (anchored at start)."""
    return any(p.match(url) for p in WHITE_LIST)
|
||||
|
||||
def _find_md_files(root):
    """Recursively collect the paths of all .md files under `root`."""
    md_files = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            if name.endswith(".md"):
                md_files.append(os.path.join(dirpath, name))
    return md_files


def main():
    """Extract URLs from every .md file under --dir and HEAD-check each one.

    Logs `<file>: URL <url> , Status <code>` per URL and prints a summary
    table of problematic (4xx or unreachable) URLs at the end.
    """
    args = parser.parse_args()

    # Map of file path (relative to input_dir) -> list of URLs found in it.
    # Distinct name from the per-file loop variable below: the original code
    # reused `urls` for both the dict and each file's list, shadowing the dict.
    urls_by_file = {}
    for path in _find_md_files(args.input_dir):
        # Markdown docs in the repo are UTF-8; be explicit rather than
        # depending on the platform default encoding.
        with open(path, "r", encoding="utf-8") as f:
            found = HTTP_PATTERN.findall(f.read())
        if found:
            urls_by_file[path[len(args.input_dir):]] = found

    problematic_urls = []  # (relative file path, url, status code or "FAIL")
    for rel_path, urls in urls_by_file.items():
        for url in urls:
            if should_skip(url):
                print(f"skipping {url} ")
                continue
            print(f"{rel_path}: URL {url}", end='')
            try:
                # Timeout so a single unresponsive host cannot hang the
                # whole run; the original call could block forever.
                r = requests.head(url, timeout=30)
                print(f" , Status {r.status_code}")
                # Only 4xx responses count as broken links; 5xx is usually
                # a transient server-side problem, not an outdated URL.
                if 400 <= r.status_code < 500:
                    problematic_urls.append((rel_path, url, r.status_code))
            except Exception as e:
                # Connection errors, DNS failures, timeouts: record as FAIL
                # but keep checking the remaining URLs.
                print(e)
                problematic_urls.append((rel_path, url, "FAIL"))

    print("\nSummary:\n")
    for entry in problematic_urls:
        print(f"|{entry[0]} | {entry[1]} | {entry[2]}|")


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue