mirror of https://github.com/kubeflow/examples.git
27 lines
914 B
Python
27 lines
914 B
Python
import argparse
|
|
import pandas as pd
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
# Parse command-line flags.
parser = argparse.ArgumentParser(
    description="Sample rows from a CSV file and split them into train/test CSV files."
)
# --input_csv is required: without it pd.read_csv(None) fails with an opaque
# error, so let argparse reject the invocation up front instead.
parser.add_argument(
    "--input_csv",
    required=True,
    help="Location of the CSV file to read.",
)
parser.add_argument(
    "--sample_size",
    type=int,
    default=2000000,
    help="Maximum number of rows to sample from the input before splitting "
         "(capped at the number of rows actually available).",
)
# NOTE: the output flags stay optional for backward compatibility. When one is
# omitted, DataFrame.to_csv(None) returns the CSV text instead of writing a
# file, so that split is not persisted.
parser.add_argument(
    "--output_traindf_csv",
    help="Destination CSV for the training split.",
)
parser.add_argument(
    "--output_testdf_csv",
    help="Destination CSV for the test split.",
)
args = parser.parse_args()
print(args)

pd.set_option('display.max_colwidth', 500)

# Read in a data sample (2M rows by default, for speed of the tutorial).
full_df = pd.read_csv(args.input_csv)

# DataFrame.sample (replace=False) raises ValueError when n exceeds the number
# of rows, so never ask for more rows than the input contains.
sample_size = min(args.sample_size, len(full_df))
sampled_df = full_df.sample(n=sample_size)

# Hold out 10% of the sampled rows as the test set.
traindf, testdf = train_test_split(sampled_df, test_size=.10)

# Print stats about the shape of the data.
print('Train: {:,} rows {:,} columns'.format(traindf.shape[0], traindf.shape[1]))
print('Test: {:,} rows {:,} columns'.format(testdf.shape[0], testdf.shape[1]))

# Store output as CSV.
traindf.to_csv(args.output_traindf_csv)
testdf.to_csv(args.output_testdf_csv)