#!/bin/sh
#
# This script pulls and extracts all files from an image in Docker Hub.
#
# Copyright (c) 2020-2023, Jeremy Lin
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

# Platform that is pulled when no -p option is given.
PLATFORM_DEFAULT=linux/amd64
# Platform to pull; replaced by the argument of the -p option.
PLATFORM=${PLATFORM_DEFAULT}
# Directory the image is extracted into; replaced by the argument of -o.
OUT_DIR=./output

# Print the short usage text (synopsis and option list) to stdout.
# The defaults shown are whatever PLATFORM and OUT_DIR hold at call time.
usage() {
    cat <<EOF
This script pulls and extracts all files from an image in Docker Hub.

$0 [OPTIONS...] IMAGE[:REF]

IMAGE can be a community user image (like 'some-user/some-image') or a
Docker official image (like 'hello-world', which contains no '/').

REF is either a tag name or a full SHA-256 image digest (with a 'sha256:' prefix).
The default ref is the 'latest' tag.

Options:

  -p PLATFORM  Pull image for the specified platform (default: ${PLATFORM})
               For a given image on Docker Hub, the 'Tags' tab lists the
               platforms supported for that image.
  -o OUT_DIR   Extract image to the specified output dir (default: ${OUT_DIR})
  -h           Show help with usage examples
EOF
}

# Print the short usage text followed by a list of example invocations.
usage_detailed() {
    usage
    # One argument per output line; an empty argument yields a blank line.
    printf '%s\n' \
        "" \
        "Examples:" \
        "" \
        "# Pull and extract all files in the 'hello-world' image tagged 'latest'." \
        "\$ $0 hello-world:latest" \
        "" \
        "# Same as above; ref defaults to the 'latest' tag." \
        "\$ $0 hello-world" \
        "" \
        "# Pull the 'hello-world' image for the 'linux/arm64/v8' platform." \
        "\$ $0 -p linux/arm64/v8 hello-world" \
        "" \
        "# Pull an image by digest." \
        "\$ $0 hello-world:sha256:90659bf80b44ce6be8234e6ff90a1ac34acbeb826903b02cfa0da11c82cbc042"
}

# Invoked with no arguments at all: show the detailed help and exit
# successfully.
case $# in
    0)
        usage_detailed
        exit 0
        ;;
esac

# Parse the command-line options. The leading ':' in the option string puts
# getopts into silent mode, so unknown options and missing option arguments
# are reported by the '?' and ':' arms below instead of by getopts itself.
while getopts ':ho:p:' opt; do
    case "${opt}" in
        h)
            usage_detailed
            exit 0
            ;;
        o) OUT_DIR="${OPTARG}" ;;
        p) PLATFORM="${OPTARG}" ;;
        :)
            echo "ERROR: Argument required for option '-${OPTARG}'."
            echo
            usage
            exit 1
            ;;
        '?')
            echo "ERROR: Invalid option '-${OPTARG}'."
            echo
            usage
            exit 1
            ;;
    esac
done
# Discard the options that were parsed, leaving only the operands.
shift "$((OPTIND - 1))"

# At least one operand (the image to pull) must remain after the options.
[ "$#" -ge 1 ] || {
    echo "ERROR: Image to pull must be specified."
    echo
    usage
    exit 1
}

# An existing output directory only triggers a warning; any other kind of
# existing file at that path is a fatal error. A path that does not exist
# yet passes silently.
if [ -d "${OUT_DIR}" ]; then
    echo "WARNING: Output dir already exists. If it contains a previous extracted image,"
    echo "there may be errors when trying to overwrite files with read-only permissions."
    echo
elif [ -e "${OUT_DIR}" ]; then
    echo "ERROR: Output dir already exists, but is not a directory."
    exit 1
fi

# Succeed when the shell can resolve a `curl` command. The location that
# `command -v` would print is discarded.
have_curl() { command -v curl 1>/dev/null; }

# Succeed when the shell can resolve a `wget` command. The location that
# `command -v` would print is discarded.
have_wget() { command -v wget 1>/dev/null; }

# One of the two supported HTTP clients has to be installed; wget is only
# probed when curl is missing.
have_curl || have_wget || {
    echo "This script requires either curl or wget."
    exit 1
}

image_spec="$1"

# The image name is everything before the first ':'.
image="${image_spec%%:*}"
case "${image}" in
    */*) ;;
    # Docker official images are in the 'library' namespace.
    *) image="library/${image}" ;;
esac

# The ref is everything after the first ':'. A spec without any ':' gets
# the default tag.
case "${image_spec}" in
    *:*)
        ref="${image_spec#*:}"
        ;;
    *)
        echo "Defaulting ref to tag 'latest'..."
        ref=latest
        ;;
esac

# Split platform (OS/arch/variant) into separate variables.
# A platform specifier doesn't always include the `variant` component; the
# variables for any missing trailing components are set to the empty string.
#
# The `IFS=/` assignment is a prefix on the `read` command, so it applies to
# that single invocation only. The shell's own IFS is left untouched, and
# there is nothing to save and restore around it.
IFS=/ read -r OS ARCH VARIANT <<EOF
${PLATFORM}
EOF

# Given a JSON input on stdin, extract the string value associated with the
# specified key. This avoids an extra dependency on a tool like `jq`.
#
# Arguments: $1 - the key to look up. It is interpolated into a regular
#                 expression, so it must not contain regex metacharacters
#                 or double quotes.
# Outputs:   every non-empty string value found for that key, one per line.
#            Nothing is printed when the key is absent or its value is not a
#            non-empty string (e.g. `null`, `""`, or a number).
extract() {
    # Extract "<key>":"<val>" (assumes key/val won't contain double quotes).
    # The colon may have whitespace on either side.
    # "One or more non-quote characters" is spelled `[^"][^"]*` because `\+`
    # is a GNU extension that POSIX basic regular expressions do not define.
    # The key is used directly as $1 so that the non-POSIX `local` keyword is
    # not needed under #!/bin/sh.
    grep -o "\"$1\"[[:space:]]*:[[:space:]]*\"[^\"][^\"]*\"" |
    # Extract just <val> by deleting the last '"', and then greedily deleting
    # everything up to '"'.
    sed -e 's/"$//' -e 's/.*"//'
}

# Fetch a URL to stdout. Up to two header arguments may be specified:
#
#   fetch <url> [name1: value1] [name2: value2]
#
# curl is used when available, otherwise wget. The exit status is that of
# whichever tool was run.
fetch() {
    if have_curl; then
        # Rewrite the positional parameters into curl's argument order:
        # header options first, URL last.
        case $# in
            2) set -- -H "$2" "$1" ;;
            3) set -- -H "$2" -H "$3" "$1" ;;
        esac
        curl -sSL "$@"
        return
    fi

    # Same rewrite for wget, which spells the header option `--header`.
    case $# in
        2) set -- --header "$2" "$1" ;;
        3) set -- --header "$2" --header "$3" "$1" ;;
    esac
    wget -qO- "$@"
}

# https://docs.docker.com/docker-hub/api/latest/#tag/repositories
manifest_list_url="https://hub.docker.com/v2/repositories/${image}/tags/${ref}"

# If the ref is already a SHA-256 image digest, then we don't need to look up anything.
# NOTE(review): an empty ref (e.g. an image spec ending in ':') also satisfies
# this test and is passed through as an empty digest.
if [ -z "${ref##sha256:*}" ]; then
    digest="${ref}"
else
    echo "Getting multi-arch manifest list..."
    NL='
'
    digest=$(fetch "${manifest_list_url}" |
        # Break up the single-line JSON output into separate lines by adding
        # newlines before and after the chars '[', ']', '{', and '}'.
        # This uses the \${NL} syntax because some BSD variants of sed don't
        # support \n syntax in the replacement string, but instead require
        # a literal newline preceded by a backslash.
        sed -e 's/\([][{}]\)/\'"${NL}"'\1\'"${NL}"'/g' |
        # Extract the "images":[...] list.
        sed -n '/"images":/,/]/ p' |
        # Each image's details are now on a separate line, e.g.
        # "architecture":"arm64","features":"","variant":"v8","digest":"sha256:054c85801c4cb41511b176eb0bf13a2c4bbd41611ddd70594ec3315e88813524","os":"linux","os_features":"","os_version":null,"size":828724,"status":"active","last_pulled":"2022-09-02T22:46:48.240632Z","last_pushed":"2022-09-02T00:42:45.69226Z"
        # The image details are interspersed with lines of stray punctuation,
        # so grep for an arbitrary string that must be in these lines.
        grep architecture |
        # Search for an image that matches the platform.
        # The loop variable is named `entry` rather than `image` so that it
        # cannot be confused with the repository name held in ${image}.
        # Each entry is passed on with a quoted printf, so the JSON text is
        # never subject to word splitting, pathname expansion, or backslash
        # interpretation by echo.
        while read -r entry; do
            # Arch is probably most likely to be unique, so check that first.
            arch="$(printf '%s\n' "${entry}" | extract 'architecture')"
            if [ "${arch}" != "${ARCH}" ]; then continue; fi

            os="$(printf '%s\n' "${entry}" | extract 'os')"
            if [ "${os}" != "${OS}" ]; then continue; fi

            # An entry without a string variant yields an empty value here,
            # which matches a platform given without a variant component.
            variant="$(printf '%s\n' "${entry}" | extract 'variant')"
            if [ "${variant}" = "${VARIANT}" ]; then
                printf '%s\n' "${entry}" | extract 'digest'
                break
            fi
        done)

    if [ -n "${digest}" ]; then
        echo "Platform ${PLATFORM} resolved to '${digest}'..."
    else
        echo "No image digest found. Verify that the image, ref, and platform are valid."
        exit 1
    fi
fi

# https://docs.docker.com/registry/spec/auth/token/#how-to-authenticate
api_token_url="https://auth.docker.io/token?service=registry.docker.io&scope=repository:${image}:pull"

# https://github.com/docker/distribution/blob/master/docs/spec/api.md#pulling-an-image-manifest
manifest_url="https://registry-1.docker.io/v2/${image}/manifests/${digest}"

# https://github.com/docker/distribution/blob/master/docs/spec/api.md#pulling-a-layer
blobs_base_url="https://registry-1.docker.io/v2/${image}/blobs"

echo "Getting API token..."
token=$(fetch "${api_token_url}" | extract 'token')
# Without a token every later registry request would fail, and the failure
# would only surface as the less specific "No layers returned" message below.
if [ -z "${token}" ]; then
    echo "No API token returned. Verify that the image is valid and that auth.docker.io is reachable."
    exit 1
fi
auth_header="Authorization: Bearer ${token}"

# https://github.com/distribution/distribution/blob/main/docs/spec/manifest-v2-2.md
docker_manifest_v2="application/vnd.docker.distribution.manifest.v2+json"

# https://github.com/opencontainers/image-spec/blob/main/manifest.md
oci_manifest_v1="application/vnd.oci.image.manifest.v1+json"

# Docker Hub can return either type of manifest format. Most images seem to
# use the Docker format for now, but the OCI format will likely become more
# common as features that require that format become enabled by default
# (e.g., https://github.com/docker/build-push-action/releases/tag/v3.3.0).
accept_header="Accept: ${docker_manifest_v2},${oci_manifest_v1}"

echo "Getting image manifest for ${image}:${ref}..."
# ${layers} ends up holding one layer digest per line, in manifest order.
layers=$(fetch "${manifest_url}" "${auth_header}" "${accept_header}" |
             # Extract `digest` values only after the `layers` section appears.
             sed -n '/"layers":/,$ p' |
             extract 'digest')

if [ -z "${layers}" ]; then
    echo "No layers returned. Verify that the image and ref are valid."
    exit 1
fi

# Stop here if the output dir cannot be created; otherwise every layer
# extraction below would fail and the script would still report success.
if ! mkdir -p "${OUT_DIR}"; then
    echo "ERROR: Could not create output dir '${OUT_DIR}'."
    exit 1
fi

# ${layers} is deliberately left unquoted: it holds one digest per line and
# relies on word splitting to iterate over them in order.
for layer in $layers; do
    hash="${layer#sha256:}"
    echo "Fetching and extracting layer ${hash}..."
    fetch "${blobs_base_url}/${layer}" "${auth_header}" | gzip -d | tar -C "${OUT_DIR}" -xf -
    # Ref: https://github.com/moby/moby/blob/master/image/spec/v1.2.md#creating-an-image-filesystem-changeset
    #      https://github.com/moby/moby/blob/master/pkg/archive/whiteouts.go
    # Search for "whiteout" files to indicate files deleted in this layer.
    # The `IFS=` prefix only affects `read`, and the loop runs in a pipeline
    # subshell, so the shell's own IFS needs no saving or restoring.
    # NOTE(review): for an opaque-directory marker ('.wh..wh..opq') this logic
    # removes only the marker itself; confirm whether the sibling entries from
    # earlier layers should be removed as well.
    find "${OUT_DIR}" -name '.wh.*' | while IFS= read -r f; do
        dir="${f%/*}"
        wh_file="${f##*/}"
        file="${wh_file#.wh.}"
        # Delete both the whiteout file and the whited-out file.
        rm -rf -- "${dir}/${wh_file}" "${dir}/${file}"
    done
done

echo "Image contents extracted into ${OUT_DIR}."