mirror of https://github.com/kubeflow/examples.git
31 lines
1.3 KiB
SQL
31 lines
1.3 KiB
SQL
SELECT
|
|
url as issue_url
|
|
-- replace more than one white-space character in a row with a single space
|
|
, REGEXP_REPLACE(title, r"\s{2,}", ' ') as issue_title
|
|
, REGEXP_REPLACE(body, r"\s{2,}", ' ') as body
|
|
|
|
FROM(
|
|
SELECT
|
|
JSON_EXTRACT(payload, '$.issue.html_url') as url
|
|
-- extract the title and body removing parentheses, brackets, and quotes
|
|
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) as title
|
|
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) as body
|
|
FROM `githubarchive.day.2017*`
|
|
WHERE
|
|
-- 70 random days in 2017 (because it costs money to query these tables!!)
|
|
_TABLE_SUFFIX BETWEEN '0101' and '1231'
|
|
and type="IssuesEvent"
|
|
-- Only want the issue at a specific point otherwise will have duplicates
|
|
and JSON_EXTRACT(payload, '$.action') = "\"opened\""
|
|
) as tbl
|
|
|
|
WHERE
|
|
-- the body must be at least 8 words long and the title at least 3 words long
|
|
-- this is an arbitrary way to filter out empty or sparse issues
|
|
ARRAY_LENGTH(SPLIT(body, ' ')) >= 6
|
|
and ARRAY_LENGTH(SPLIT(title, ' ')) >= 3
|
|
-- filter out issues that have really long titles or bodies
|
|
-- (these are outliers, and will slow tokenization down).
|
|
and LENGTH(title) <= 400
|
|
and LENGTH(body) <= 2000
|