# Source: https://github.com/awslabs/amazon-sagemaker-examples/blob/master/ground_truth_labeling_jobs/from_unlabeled_data_to_deployed_machine_ # learning_model_ground_truth_demo_image_classification/from_unlabeled_data_to_deployed_machine_learning_model_ground_truth_demo_image_ # classification.ipynb import itertools import json import numpy as np import boto3 BUCKET = "" EXP_NAME = "mini-image-classification/ground-truth-demo" # Make sure the bucket is in the same region as this notebook. region = boto3.session.Session().region_name s3 = boto3.client("s3") bucket_region = s3.head_bucket(Bucket=BUCKET)["ResponseMetadata"]["HTTPHeaders"][ "x-amz-bucket-region" ] assert ( bucket_region == region ), "You S3 bucket {} and this notebook need to be in the same region.".format(BUCKET) # Process the Open Images annotations. with open("openimgs-annotations.csv", "r") as f: all_labels = [line.strip().split(",") for line in f.readlines()] # Extract image ids in each of our desired classes. ims = {} ims["Musical Instrument"] = [ label[0] for label in all_labels if (label[2] == "/m/04szw" and label[3] == "1") ][:500] ims["Fruit"] = [ label[0] for label in all_labels if (label[2] == "/m/02xwb" and label[3] == "1") ][:371] ims["Fruit"].remove( "02a54f6864478101" ) # This image contains personal information, let's remove it from our dataset. num_classes = len(ims) # If running the short version of the demo, reduce each class count 50 times. for key in ims.keys(): ims[key] = set(ims[key][: int(len(ims[key]) / 50)]) # Copy the images to our local bucket. print("Copying images to bucket") s3 = boto3.client("s3") for img_id, img in enumerate(itertools.chain.from_iterable(ims.values())): copy_source = {"Bucket": "open-images-dataset", "Key": "test/{}.jpg".format(img)} s3.copy(copy_source, BUCKET, "{}/images/{}.jpg".format(EXP_NAME, img)) # Create and upload the input manifests. input_data_paths = [ "s3://{}/{}/images/{}.jpg".format(BUCKET, EXP_NAME, img) for img in itertools.chain.from_iterable(ims.values()) ] # Shuffle input paths in place. np.random.shuffle(input_data_paths) dataset_size = len(input_data_paths) train_test_split_index = round(dataset_size * 0.8) print("Number of training samples: " + str(train_test_split_index)) print("Number of validation samples: " + str(dataset_size - train_test_split_index)) train_data_paths = input_data_paths[:train_test_split_index] validation_data_paths = input_data_paths[train_test_split_index:] with open("train.manifest", "w") as f: for img_path in train_data_paths: f.write('{"source-ref": "' + img_path + '"}\n') with open("validation.manifest", "w") as f: for img_path in validation_data_paths: f.write('{"source-ref": "' + img_path + '"}\n') s3.upload_file("train.manifest", BUCKET, EXP_NAME + "/" + "train.manifest") s3.upload_file("validation.manifest", BUCKET, EXP_NAME + "/" + "validation.manifest") print("Uploaded manifests at s3://{}/{}".format(BUCKET, EXP_NAME)) # Specify categories CLASS_LIST = list(ims.keys()) print("Label space is {}".format(CLASS_LIST)) json_body = {"labels": [{"label": label} for label in CLASS_LIST]} with open("class_labels.json", "w") as f: json.dump(json_body, f) s3.upload_file("class_labels.json", BUCKET, EXP_NAME + "/class_labels.json") # Create UI template img_examples = [ "https://s3.amazonaws.com/open-images-dataset/test/{}".format(img_id) for img_id in ["0634825fc1dcc96b.jpg", "0415b6a36f3381ed.jpg"] ] def make_template(test_template=False, save_fname="instructions.template"): template = r"""

Dear Annotator, please tell me whether what you can see in the image. Thank you!

Example "Musical Instrument".

Example "Fruit".

""".format( *img_examples, categories_str=str(CLASS_LIST) if test_template else "{{ task.input.labels | to_json | escape }}" ) with open(save_fname, "w") as f: f.write(template) make_template(test_template=True, save_fname="instructions.html") make_template(test_template=False, save_fname="instructions.template") s3.upload_file("instructions.template", BUCKET, EXP_NAME + "/instructions.template")