Add more details to generate-devstats-repo-sql.py

There was still a lot of manual massaging of the script's
output that needed to happen before it was usable as a PR
to cncf/devstats.

I have now tried to encode as much of that knowledge as possible
in code and comments here, so that the output of the script can
overwrite the existing file.

- add header/footer
- add comment pointing to this script
- add special case group for "Kubernetes"
- add support for committees that own code
- (manually) keep track of old repo names for history
This commit is contained in:
Aaron Crickenberger 2019-06-26 13:05:43 -07:00
parent f784eb4ab8
commit 7bd7f2c344
1 changed files with 121 additions and 18 deletions

View File

@ -34,50 +34,153 @@ import json
import re
import sys
# sql template for assigning one repo group; it is filled in via str.format
# with the group name and a ",\n"-joined list of quoted repo names
# NOTE(review): two template assignments appear interleaved here --
# update_gha_repos_template (hard-codes a 'SIG ' prefix) and
# repo_group_sql_template (takes the full group name) -- presumably the
# before/after versions of the same constant; confirm which one is live
update_gha_repos_template = """
update gha_repos set repo_group = 'SIG {}' where name in (
repo_group_sql_template = """
update gha_repos set repo_group = '{}' where name in (
{}
);
"""
# NOTE(review): only the signature and docstring of repos_from_sig are
# visible at this point; repos_from_k8s_group further down carries a
# near-identical docstring and looks like its renamed successor -- confirm
def repos_from_sig(sig):
"""Returns a list of org/repos given a sig"""
# copied from github.com/cncf/devstats/scripts/kubernetes/repo_groups.sql,
# if this differs, consider cncf the authoritative source and update this
#
# written verbatim as the very first output of write_repo_groups_sql; the
# first sql comment points readers of the generated file back to this script
repo_groups_sql_header = """-- generated by github.com/kubernetes/community/hack/generate-devstats-repo-sql.py
-- Add repository groups
"""
# copied from github.com/cncf/devstats/scripts/kubernetes/repo_groups.sql,
# if this differs, consider cncf the authoritative source and update this
#
# written verbatim as the very last output of write_repo_groups_sql; the sql:
#   - keeps the catch-all 'Other' repo group update commented out
#   - sets each repo's alias to the most recent dup_repo_name found in
#     gha_events for that repo id, falling back to the repo's own name
#   - forces the alias 'kubernetes/kubernetes' for any name ending in
#     'kubernetes' or equal to 'kubernetes/'
#   - finally reports the number of repos per non-null repo_group
repo_groups_sql_footer = """
-- All other unknown repositories should have 'Other' repository group
-- update gha_repos set repo_group = 'Other' where repo_group is null;
-- By default alias is the newest repo name for given repo ID
update
gha_repos r
set
alias = coalesce((
select e.dup_repo_name
from
gha_events e
where
e.repo_id = r.id
order by
e.created_at desc
limit 1
), name)
;
update gha_repos set alias = 'kubernetes/kubernetes' where name like '%kubernetes' or name = 'kubernetes/';
select
repo_group,
count(*) as number_of_repos
from
gha_repos
where
repo_group is not null
group by
repo_group
order by
number_of_repos desc,
repo_group asc;
"""
# repo groups that are not derived from the community groups yaml; each entry
# has a 'name' (used as the repo_group value) and 'repos' (the repo names put
# in the sql `where name in (...)` list); write_repo_groups_sql emits these
# before any sig or committee group
special_case_groups = [{
# the main repo has no single owner and has gone by many names
'name': 'Kubernetes',
'repos': [
'kubernetes/kubernetes',
'GoogleCloudPlatform/kubernetes',
'kubernetes',
'kubernetes/'
]
}]
# devstats isn't aware of repo renames or migrations; we need to keep
# old repo names in its sql groups present for historical purposes;
#
# when reconciling deletions from repo_groups.sql by this script, use
# github.com/kubernetes/org issues to determine why; renamed, migrated,
# or used-and-retired repos belong here; unused/deleted repos do not
#
# keys are a group's `dir` value; write_repo_groups_sql looks each group up
# by `dir` and unions the listed names with the repos computed for the group
renamed_repos = {
'sig-architecture': [
'kubernetes/contrib',
],
'sig-api-machinery': [
'kubernetes-incubator/apiserver-builder',
],
'sig-cluster-lifecycle': [
'kubernetes-incubator/kubespray',
],
'sig-multicluster': [
'kubernetes-sigs/federation-v2',
],
'sig-node': [
'kubernetes-incubator/node-feature-discovery',
],
'sig-pm': [
'kubernetes/features',
],
'sig-service-catalog': [
'kubernetes-incubator/service-catalog',
]
}
def repos_from_k8s_group(k8s_group):
    """Returns a sorted list of org/repos given a kubernetes community group

    A repo counts as owned by the group when one of the group's subprojects
    lists the repo's top-level OWNERS file (org/repo/OWNERS) among its
    `owners` urls. OWNERS files deeper inside a repo do not make the whole
    repo owned by the group.

    k8s_group is a mapping; its optional `subprojects` entry is a list of
    mappings that each carry an `owners` list of raw.githubusercontent.com
    urls. A missing or null `subprojects` yields an empty list.
    """
    repos = set()
    # `subprojects` may be absent or explicitly null in the yaml
    subprojects = k8s_group.get('subprojects') or []
    for sp in subprojects:
        for uri in sp['owners']:
            # strip the host and the branch, leaving org/repo/path/to/OWNERS
            owners_path = re.sub(r"https://raw.githubusercontent.com/(.*)/master/(.*)", r"\1/\2", uri)
            path_parts = owners_path.split('/')
            # org/repo is owned by k8s_group if org/repo/OWNERS is in one of their subprojects
            # (the length guard keeps malformed/short entries from raising IndexError)
            if len(path_parts) > 2 and path_parts[2] == 'OWNERS':
                repos.add('/'.join(path_parts[0:2]))
    return sorted(repos)
# writes one `update gha_repos` statement per entry of sigs['sigs'] that has
# at least one repo, using update_gha_repos_template
# NOTE(review): a second `def write_repo_groups_sql(k8s_groups, fp)` appears
# further down and would rebind this name; this version also depends on
# repos_from_sig, whose body is not visible here -- presumably this is the
# superseded implementation; confirm before relying on it
def write_repo_groups_sql(sigs, fp):
for sig in sigs['sigs']:
repos = repos_from_sig(sig)
if len(repos):
fp.write(
update_gha_repos_template.format(
sig['name'],
',\n'.join([' \'{}\''.format(r) for r in repos])))
def k8s_group_name(k8s_group):
    """Returns the repo group name to use for a kubernetes community group

    The name is derived from the group's `dir` prefix:
      - 'sig-*'       -> "SIG <name>"
      - 'committee-*' -> "<name> Committee"
      - anything else -> "UNKNOWN <dir>"

    Raises KeyError if a sig or committee entry has no `name`.
    """
    # `dir` may be missing or explicitly null in the yaml; treat both as ''
    # so an odd entry shows up as "UNKNOWN " instead of crashing
    group_dir = k8s_group.get('dir') or ''
    if group_dir.startswith('sig-'):
        return "SIG " + k8s_group['name']
    if group_dir.startswith('committee-'):
        return k8s_group['name'] + " Committee"
    return "UNKNOWN " + group_dir
def write_repo_groups_template(name, repos, fp):
    """Writes one repo group sql statement for the given repos to fp

    name is the repo group name, repos the repo names assigned to it, and fp
    a writable file-like object. Nothing is written when repos is empty.
    """
    if not repos:
        return
    # one quoted repo name per line, comma separated
    quoted_names = [' \'{}\''.format(repo) for repo in repos]
    statement = repo_group_sql_template.format(name, ',\n'.join(quoted_names))
    fp.write(statement)
def write_repo_groups_sql(k8s_groups, fp):
    """Writes the complete repo groups sql for the given community groups to fp

    Output order: the header, the special case groups, one group per sig,
    one group per committee, then the footer.
    """
    fp.write(repo_groups_sql_header)

    for special in special_case_groups:
        write_repo_groups_template(special['name'], special['repos'], fp)

    for group_type in ('sigs', 'committees'):
        for group in k8s_groups[group_type]:
            current = set(repos_from_k8s_group(group))
            # keep old repo names in the group so history stays attributed
            historical = set(renamed_repos.get(group['dir'], []))
            merged = list(current | historical)
            merged.sort()
            write_repo_groups_template(k8s_group_name(group), merged, fp)

    fp.write(repo_groups_sql_footer)
def main(sigs_yaml, repo_groups_sql):
    """Loads the community groups yaml and writes the repo groups sql

    sigs_yaml is the path of the yaml file to read; repo_groups_sql is the
    path of the sql file to (over)write, or None to write to stdout.
    """
    with open(sigs_yaml) as fp:
        # load exactly once: a second load from the same, already-consumed
        # stream would not return the document again
        k8s_groups = yaml.round_trip_load(fp)

    if repo_groups_sql is not None:
        with open(repo_groups_sql, 'w') as fp:
            write_repo_groups_sql(k8s_groups, fp)
    else:
        write_repo_groups_sql(k8s_groups, sys.stdout)
if __name__ == '__main__':
PARSER = argparse.ArgumentParser(
description='Do things with sigs.yaml')
description='Generate a repo_groups.sql intended for github.com/cncf/devstats/scripts/kubernetes/repo_groups.sql')
PARSER.add_argument(
'--sigs-yaml',
default='./sigs.yaml',