website/hack/gen-content.py

180 lines
5.6 KiB
Python
Executable File

#!/usr/bin/env python3
# This was inspired by
# http://sigs.k8s.io/contributor-site/hack/gen-content.sh
# Check out
# external-sources/README.md for some instructions on how
# the file format works.
import csv
import os
import re
import shutil
import sys
import subprocess
import tempfile
# Workaround to make this work in Netlify...
LOCAL_PY_PATH = '/opt/buildhome/python3.8/lib/python3.8/site-packages/'
if LOCAL_PY_PATH not in sys.path:
sys.path.append(LOCAL_PY_PATH)
import pathlib
global link_mapping
link_mapping = []
'''
We are adding basic Front-Matter here.
`docs` files can't have front-matter and # (h1)
'''
def rewrite_header(out_file, title=None, docs=False, weight=None):
lines = open(out_file, 'r').readlines()
if not title or title == '-':
title = os.path.basename(out_file).split('.md')[0].title()
header_lines = [
'---\n',
'title: {}\n'.format(title),
'importedDoc: true\n'
]
if weight:
header_lines += ['weight: {}\n'.format(weight)]
header_lines += [
'---\n',
'\n'
]
file_desc = open(out_file, 'w')
file_desc.writelines(header_lines)
for line in lines:
if not docs or not line.startswith('# ') or lines.index(line) >= 4:
if not line.startswith('<!-- '): #FML!
file_desc.write(line)
file_desc.close()
class Repo():
def __init__(self, external_sources_dir, repo_fn):
self.temp_dir = tempfile.mkdtemp()
self.repo_id = repo_fn.split(external_sources_dir)[1][1:]
self.gh_source = 'https://github.com/{}/'.format(self.repo_id).strip('/')
self.repo_fn = repo_fn
self.dest = os.path.join(self.temp_dir, self.repo_id)
self.file_list = []
global link_mapping
with open(self.repo_fn, 'r') as file_desc:
csv_reader = csv.reader(file_desc)
for line in csv_reader:
self.file_list += [[entry.strip('/') for entry in line]]
link_mapping += [[self.gh_source]+[entry.strip('/') for entry in line]]
def __del__(self):
shutil.rmtree(self.temp_dir)
def clone_repo(self):
subprocess.call([
'git', 'clone', '--depth=1', '-q',
self.gh_source, self.dest])
def rewrite_links(self, out_file):
'''
Types of links in Markdown
1: <url> # hard to find - we have markdown that's mixed with html
2: [label](url)
3: [url]
4: [label]: url together with [label]
'''
with open(out_file, 'r') as file_desc:
content = file_desc.read()
# links_1 = re.findall(r'\<.+?\>', content, re.MULTILINE)
links_2 = re.findall(r'\[.+?\]\((.+?)\)', content, re.MULTILINE)
links_3 = re.findall(r'\[(.+?)]\s+', content, re.MULTILINE)
links_4 = re.findall(r'\[(.+?)\]\:\s+.+?', content, re.MULTILINE)
# set() because we only want to review every link just once
# str.split('#')[0] because we ignore anchors
links = set([a.split('#')[0]
for a in links_2+links_3+links_4
if not a.startswith('#')])
link_rewrites = {}
for link in links:
if not link:
continue
global link_mapping
for entry in link_mapping:
if link == entry[1]:
link_rewrites[link] = '/{}'.format(entry[2].lower().split('.md')[0])
elif link.startswith(self.gh_source) and link.endswith(entry[1]) and \
self.gh_source == entry[0]:
link_rewrites[link] = '/{}'.format(entry[2].lower().split('.md')[0])
if link not in link_rewrites and os.path.exists(os.path.join(self.dest, link)) and \
not link.startswith('https://') and not link.startswith('http://'):
link_rewrites[link] = os.path.join(self.gh_source, 'blob/main', link)
for key in link_rewrites:
content = content.replace(key, link_rewrites[key])
with open(out_file, 'w') as file_desc:
file_desc.write(content)
def copy_files(self, content_dir):
for entry in self.file_list:
out_file = os.path.join(content_dir, entry[1])
shutil.copyfile(
os.path.join(self.dest, entry[0]),
out_file)
docs = entry[1].startswith('flux/') or \
entry[1].startswith('flagger/')
title = None
weight = None
if len(entry) == 4:
weight = entry[3]
if len(entry) >= 3:
title = entry[2]
rewrite_header(out_file, title=title, docs=docs, weight=weight)
self.rewrite_links(out_file)
def get_repo_list(external_sources_dir):
repos = []
file_refs = pathlib.Path(external_sources_dir).glob('*/*')
for file in file_refs:
repo_fn = str(file)
if os.path.isfile(repo_fn):
repos += [Repo(external_sources_dir, repo_fn)]
return repos
def main():
repo_root = os.path.realpath(
os.path.join(os.path.dirname(__file__), '..'))
if os.getcwd() != repo_root:
print('Please run this script from top-level of the repository.')
sys.exit(1)
content_dir = os.path.join(repo_root, 'content/en')
external_sources_dir = os.path.join(repo_root, 'external-sources')
repos = get_repo_list(external_sources_dir)
for repo in repos:
repo.clone_repo()
for repo in repos:
repo.copy_files(content_dir)
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
print("Aborted.", sys.stderr)
sys.exit(1)