# spec/tools/specification_parser/specification_parser.py

import re
import glob
import json
from os.path import curdir, abspath, join, splitext, isfile
from os import walk
# RFC 2119 requirement-level keywords, written as regex literals.
# Order matters: find_rfc_2119_keyword() returns the first pattern that
# matches, so earlier entries take priority when a passage bolds more than
# one keyword. The multi-word forms ("MUST NOT", etc.) never collide with
# their prefixes because every search is anchored by literal ** markers.
rfc_2119_keywords_regexes = [
    r"MUST",
    r"REQUIRED",
    r"SHALL",
    r"MUST NOT",
    r"SHALL NOT",
    r"SHOULD",
    r"RECOMMENDED",
    r"SHOULD NOT",
    r"NOT RECOMMENDED",
    r"MAY",
    r"OPTIONAL",
]
def get_ignored_path_globs(root):
    """Read glob patterns from the ``.specignore`` file in *root*.

    Returns a list of glob strings; blank lines and ``#`` comment lines
    are skipped. Returns an empty list when no ``.specignore`` exists.
    """
    file_name = join(root, ".specignore")
    if not isfile(file_name):
        return []
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_name, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        # Keep non-empty lines that are not comments (single pass instead of
        # three intermediate lists).
        return [g for g in stripped if g and not g.startswith("#")]
def get_ignored_paths(root):
    """Expand the ``.specignore`` globs under *root* into concrete paths.

    Returns a set of absolute paths: matched files directly, plus every
    ``.md`` file beneath any matched directory.
    """
    globbed_paths = set()
    for pattern in get_ignored_path_globs(root):
        # Anchor each pattern at *root* (where .specignore lives) so ignore
        # rules work even when the process cwd differs from *root*. The
        # original globbed relative to cwd, which only worked because the
        # script is invoked with root == cwd.
        globbed_paths.update(glob.glob(join(root, pattern), recursive=True))
    ignored_files = set()
    for path in globbed_paths:
        if isfile(path):
            ignored_files.add(path)
        else:
            # A directory match ignores every markdown file beneath it.
            ignored_files.update(glob.glob(join(path, "**/*.md"), recursive=True))
    return ignored_files
def find_markdown_file_paths(root):
    """Collect the paths of all non-ignored .md files under *root*."""
    ignored = get_ignored_paths(root)
    found = []
    for dir_path, _, names in walk(root):
        for name in names:
            candidate = join(dir_path, name)
            # Skip anything matched by .specignore.
            if candidate in ignored:
                continue
            if splitext(candidate)[1] == ".md":
                found.append(candidate)
    return found
def clean_content(content):
    """Turn quoted markdown requirement text into plain prose."""
    # Keep only blockquote lines; everything else (including blanks) is
    # dropped.
    quoted = [ln for ln in content.splitlines() if ln.strip().startswith('>')]
    content = '\n'.join(quoted)
    # Strip the bold markers around RFC 2119 keywords: **MUST** -> MUST.
    for keyword in rfc_2119_keywords_regexes:
        content = re.sub(r"\*\*" + keyword + r"\*\*", keyword, content)
    # Collapse the blockquote markers (and joining newlines) into spaces.
    return re.sub(r"\n?>\s*", " ", content.strip()).strip()
def find_rfc_2119_keyword(content):
    """Return the first bolded RFC 2119 keyword in *content*, else None."""
    for keyword in rfc_2119_keywords_regexes:
        # Keywords only count when bolded: **KEYWORD**.
        if re.search(r"\*\*" + keyword + r"\*\*", content):
            return keyword
    return None
def parsed_content_to_hierarchy(parsed_content):
    'Turns a bunch of headline & content pairings into a tree of requirements'
    content_tree = []
    # Stack of [level, headline, node] entries, newest (deepest) at index 0.
    # `level` is the raw run of '#' characters from the heading, so
    # len(level) encodes the heading depth.
    headline_stack = []
    node = lambda l,h,c: {'level': l, 'headline': h, 'content': c, 'children': []}
    for level, headline, content in parsed_content:
        try:
            if len(headline_stack) == 0: # top-most node
                cur = node(level, headline, content)
                content_tree.append(cur)
                headline_stack.insert(0, [level, headline, cur])
            elif len(headline_stack[0][0]) >= len(level): # Sibling or parent node
                if len(headline_stack[0][0]) > len(level): # parent, right?
                    # Moving up a level: drop the deeper entry first.
                    # NOTE(review): only one extra pop — a jump of more than
                    # one heading level at once may leave stale entries;
                    # confirm against real spec documents.
                    headline_stack.pop(0)
                # Drop the current sibling so the new node replaces it.
                # NOTE(review): if the stack holds only one entry here this
                # raises IndexError, which the except below swallows — the
                # heading is then silently skipped.
                headline_stack.pop(0)
                if len(headline_stack) == 0:
                    parent = content_tree
                else:
                    parent = headline_stack[0][2]['children']
                cur = node(level, headline, content)
                parent.append(cur)
                headline_stack.insert(0, [level, headline, cur])
            elif len(level) > len(headline_stack[0][0]): # child node
                # TODO: emit warning if headlines are too deep
                cur = node(level, headline, content)
                parent = headline_stack[0][2]
                parent['children'].append(cur)
                headline_stack.insert(0, [level, headline, cur])
            else:
                # NOTE(review): the two elif branches above cover every
                # non-empty-stack case (>= and >), so this branch appears
                # unreachable.
                headline_stack.pop(0)
        except Exception as k:
            # NOTE(review): broad catch that only prints — malformed input
            # is dropped rather than reported to the caller.
            print(k);
    # Specify a root so we know that everything is a node all the way down.
    # The root's level is the int 0 (not a '#' string); it never re-enters
    # the loop above, so len() is never applied to it.
    root = node(0, '', '')
    root['children'] = content_tree
    return content_tree_to_spec(root)
def gen_node(ct):
    """Turn a content-tree node into a requirements node, or None.

    Only headlines containing 'requirement' or 'condition' produce a node.
    """
    headline = ct['headline']
    content = ct['content']
    match = re.search(r'(?P<req>(requirement|condition)[^\n]+)', headline, re.IGNORECASE)
    if match is None:
        return None
    identifier = match.group('req')
    return {
        'id': identifier,
        # Lower-cased id with every non-word character replaced by '_'.
        'machine_id': re.sub(r"[^\w]", "_", identifier.lower()),
        'content': clean_content(content),
        'RFC 2119 keyword': find_rfc_2119_keyword(content),
        'children': [],
    }
def content_tree_to_spec(ct):
    """Recursively convert a content tree into requirement spec nodes.

    Returns a single node dict, a list of nodes (when this node is not a
    requirement itself but has requirement descendants), or None (no
    requirements anywhere below this node).
    """
    current = gen_node(ct)
    children = []
    for child_result in (content_tree_to_spec(c) for c in ct['children']):
        # Each child yields None (skip), a node dict (append), or a list of
        # promoted grandchildren (merge into this level).
        if child_result is None:
            continue
        if isinstance(child_result, list):
            children.extend(child_result)
        else:
            children.append(child_result)
    if current is None:
        # Not a requirement node: promote any requirement children upward.
        return children if children else None
    current['children'] = children
    return current
def parse(markdown_file_path):
    """Parse one markdown file into a requirements hierarchy."""
    # Headings of depth >= 4 followed by a '>' blockquote of content.
    pattern = re.compile(
        r'^(?P<level>####+)(?P<headline>[^\n]+)\n+?.*?\n+?(?P<rest>>\s[^#?]*)',
        re.MULTILINE,
    )
    with open(markdown_file_path, "r") as source:
        matches = pattern.findall(source.read())
    return parsed_content_to_hierarchy(matches)
def write_json_specifications(requirements):
    """Write each markdown file's requirement sections as a sibling .json.

    *requirements* maps a markdown file path to its requirement sections;
    the output path swaps the extension for ``.json``.
    """
    for md_path, sections in requirements.items():
        json_path = splitext(md_path)[0] + ".json"
        with open(json_path, "w") as out:
            json.dump(sections, out, indent=4)
if __name__ == "__main__":
    combined = {"rules": []}
    # abspath(curdir) is already a complete path; the previous
    # single-argument join() around it was a no-op.
    for markdown_file_path in find_markdown_file_paths(abspath(curdir)):
        result = parse(markdown_file_path)
        if result:
            combined['rules'].extend(result)

    def _id_sort_key(rule):
        # Sort numerically on the dotted id suffix, e.g. "Requirement 1.10.2"
        # -> [1, 10, 2], so 1.10 sorts after 1.9. (The original comprehension
        # shadowed its own lambda parameter `x`.)
        return [int(part) for part in rule['id'].split(' ')[-1].split('.')]

    combined['rules'] = sorted(combined['rules'], key=_id_sort_key)
    with open('./specification.json', 'w') as f:
        json.dump(combined, f, indent=4)