mirror of https://github.com/kubeflow/examples.git
				
				
				
			
		
			
				
	
	
		
			31 lines
		
	
	
		
			1.3 KiB
		
	
	
	
		
			SQL
		
	
	
	
			
		
		
	
	
			31 lines
		
	
	
		
			1.3 KiB
		
	
	
	
		
			SQL
		
	
	
	
| SELECT
 | |
|   url as issue_url
 | |
|   -- replace more than one white-space character in a row with a single space
 | |
| , REGEXP_REPLACE(title, r"\s{2,}", ' ') as issue_title
 | |
| , REGEXP_REPLACE(body, r"\s{2,}", ' ') as body
 | |
| 
 | |
| FROM(
 | |
|     SELECT
 | |
|         JSON_EXTRACT(payload, '$.issue.html_url') as url
 | |
|         -- extract the title and body removing parentheses, brackets, and quotes
 | |
|       , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) as title
 | |
|       , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`", ' '))) as body
 | |
|     FROM `githubarchive.day.2017*`
 | |
|     WHERE 
 | |
|       -- 70 random days in 2017 (because it costs money to query these tables!!)  
 | |
|           _TABLE_SUFFIX BETWEEN '0101' and '1231'
 | |
|       and type="IssuesEvent" 
 | |
|       -- Only want the issue at a specific point otherwise will have duplicates
 | |
|       and JSON_EXTRACT(payload, '$.action') = "\"opened\"" 
 | |
| ) as tbl
 | |
| 
 | |
| WHERE 
 | |
|   -- the body must be at least 8 words long and the title at least 3 words long
 | |
|   --  this is an arbitrary way to filter out empty or sparse issues
 | |
|       ARRAY_LENGTH(SPLIT(body, ' ')) >= 6
 | |
|   and ARRAY_LENGTH(SPLIT(title, ' ')) >= 3
 | |
|   -- filter out issues that have really long titles or bodies
 | |
|   --    (these are outliers, and will slow tokenization down).
 | |
|   and LENGTH(title) <= 400
 | |
|   and LENGTH(body) <= 2000
 |