pipelines/samples/contrib/aws-samples/ground_truth_pipeline_demo/prep_inputs.py

135 lines
4.8 KiB
Python

# Source: https://github.com/awslabs/amazon-sagemaker-examples/blob/master/ground_truth_labeling_jobs/from_unlabeled_data_to_deployed_machine_
# learning_model_ground_truth_demo_image_classification/from_unlabeled_data_to_deployed_machine_learning_model_ground_truth_demo_image_
# classification.ipynb
import itertools
import json
import numpy as np
import boto3
# Destination bucket and key prefix under which every artifact of this script is stored.
BUCKET = "<your-bucket-name>"
EXP_NAME = "mini-image-classification/ground-truth-demo"

# Make sure the bucket is in the same region as this notebook.
region = boto3.session.Session().region_name
s3 = boto3.client("s3")
# The bucket's region is read from the "x-amz-bucket-region" header of the
# HeadBucket response.
bucket_region = s3.head_bucket(Bucket=BUCKET)["ResponseMetadata"]["HTTPHeaders"][
    "x-amz-bucket-region"
]
# Explicit check instead of `assert`, so that it still runs under `python -O`.
if bucket_region != region:
    raise RuntimeError(
        "Your S3 bucket {} and this notebook need to be in the same region.".format(
            BUCKET
        )
    )
# Process the Open Images annotations.
# Every line is split on commas. The code below reads column 0 as the image id,
# column 2 as the label code and column 3 as a flag that must equal "1".
with open("openimgs-annotations.csv", "r") as f:
    all_labels = [line.strip().split(",") for line in f]


def _image_ids_for(label_code, limit):
    """Return at most `limit` image ids whose row has `label_code` and flag "1".

    Rows with fewer than four columns (e.g. blank lines) are skipped instead of
    raising IndexError.
    """
    matching_ids = (
        row[0]
        for row in all_labels
        if len(row) >= 4 and row[2] == label_code and row[3] == "1"
    )
    return list(itertools.islice(matching_ids, limit))


# Extract image ids in each of our desired classes.
ims = {
    "Musical Instrument": _image_ids_for("/m/04szw", 500),
    "Fruit": _image_ids_for("/m/02xwb", 371),
}
# This image contains personal information, let's remove it from our dataset.
# Guarded so that an annotation file without this id does not raise ValueError.
if "02a54f6864478101" in ims["Fruit"]:
    ims["Fruit"].remove("02a54f6864478101")
num_classes = len(ims)
# Keep only the first 1/50th (rounded down) of each class. This reduction is
# applied unconditionally here.
ims = {
    class_name: set(image_ids[: len(image_ids) // 50])
    for class_name, image_ids in ims.items()
}
# Copy the images to our local bucket.
# Reuses the module-level `s3` client created earlier in this script; the
# previously unused enumerate() index has been dropped.
print("Copying images to bucket")
for img in itertools.chain.from_iterable(ims.values()):
    # Source objects are read from the "open-images-dataset" bucket under "test/".
    copy_source = {"Bucket": "open-images-dataset", "Key": "test/{}.jpg".format(img)}
    s3.copy(copy_source, BUCKET, "{}/images/{}.jpg".format(EXP_NAME, img))
# Create and upload the input manifests.
input_data_paths = [
    "s3://{}/{}/images/{}.jpg".format(BUCKET, EXP_NAME, img)
    for img in itertools.chain.from_iterable(ims.values())
]
# Shuffle input paths in place.
np.random.shuffle(input_data_paths)
dataset_size = len(input_data_paths)
# The first 80% (rounded) of the shuffled paths go to training, the rest to validation.
train_test_split_index = round(dataset_size * 0.8)
print("Number of training samples: " + str(train_test_split_index))
print("Number of validation samples: " + str(dataset_size - train_test_split_index))
train_data_paths = input_data_paths[:train_test_split_index]
validation_data_paths = input_data_paths[train_test_split_index:]


def _write_manifest(manifest_fname, img_paths):
    """Write one JSON object per line, {"source-ref": <path>}, for each image path.

    json.dumps is used so that paths are always emitted as valid, escaped JSON
    strings rather than being spliced into a hand-built literal.
    """
    with open(manifest_fname, "w") as manifest_file:
        manifest_file.writelines(
            json.dumps({"source-ref": img_path}) + "\n" for img_path in img_paths
        )


_write_manifest("train.manifest", train_data_paths)
_write_manifest("validation.manifest", validation_data_paths)
s3.upload_file("train.manifest", BUCKET, EXP_NAME + "/" + "train.manifest")
s3.upload_file("validation.manifest", BUCKET, EXP_NAME + "/" + "validation.manifest")
print("Uploaded manifests at s3://{}/{}".format(BUCKET, EXP_NAME))
# Specify categories
# The label space is the set of class names used as keys of `ims`.
CLASS_LIST = [class_name for class_name in ims]
print("Label space is {}".format(CLASS_LIST))
# Label file layout: {"labels": [{"label": <class name>}, ...]}
json_body = {"labels": [dict(label=class_name) for class_name in CLASS_LIST]}
with open("class_labels.json", "w") as labels_file:
    labels_file.write(json.dumps(json_body))
s3.upload_file(
    "class_labels.json",
    BUCKET,
    "{}/class_labels.json".format(EXP_NAME),
)
# Create UI template
# URLs of the two example images embedded in the annotator instructions.
img_examples = list(
    map(
        "https://s3.amazonaws.com/open-images-dataset/test/{}".format,
        ("0634825fc1dcc96b.jpg", "0415b6a36f3381ed.jpg"),
    )
)
def make_template(
    test_template=False,
    save_fname="instructions.template",
    class_list=None,
    examples=None,
):
    """Render the image-classification worker UI template and write it to a file.

    Args:
        test_template: When true, the ``categories`` attribute is filled with the
            string form of the class list, so the file can be opened directly as
            a preview. When false, the literal placeholder
            ``{{ task.input.labels | to_json | escape }}`` is written instead,
            to be substituted when the template is rendered later.
        save_fname: Path of the file the rendered template is written to.
        class_list: Class names used when ``test_template`` is true. Defaults to
            the module-level ``CLASS_LIST``.
        examples: Two image URLs shown in the short instructions, in the order
            "Musical Instrument" example then "Fruit" example. Defaults to the
            module-level ``img_examples``.

    Returns:
        None. The only effect is writing ``save_fname``.
    """
    # Fall back to the module-level values so existing callers keep working.
    if class_list is None:
        class_list = CLASS_LIST
    if examples is None:
        examples = img_examples

    if test_template:
        categories_str = str(list(class_list))
    else:
        categories_str = "{{ task.input.labels | to_json | escape }}"

    # Quadruple braces in the `src` attribute survive str.format() as a literal
    # "{{ ... }}" placeholder; the two bare "{}" slots receive the example URLs.
    template = r"""<script src="https://assets.crowd.aws/crowd-html-elements.js"></script>
<crowd-form>
<crowd-image-classifier
name="crowd-image-classifier"
src="{{{{ task.input.taskObject | grant_read_access }}}}"
header="Dear Annotator, please tell me what you can see in the image. Thank you!"
categories="{categories_str}"
>
<full-instructions header="Image classification instructions">
</full-instructions>
<short-instructions>
<p>Dear Annotator, please tell me whether what you can see in the image. Thank you!</p>
<p><img src="{}" style="max-width:100%">
<br>Example "Musical Instrument". </p>
<p><img src="{}" style="max-width:100%">
<br>Example "Fruit".</p>
</short-instructions>
</crowd-image-classifier>
</crowd-form>""".format(
        *examples,
        categories_str=categories_str,
    )
    with open(save_fname, "w") as f:
        f.write(template)
# Render a preview whose categories attribute holds the concrete class list ...
make_template(save_fname="instructions.html", test_template=True)
# ... and the template carrying the labels placeholder, which is the one uploaded.
make_template(save_fname="instructions.template", test_template=False)
s3.upload_file(
    "instructions.template",
    BUCKET,
    "{}/instructions.template".format(EXP_NAME),
)