# -*- coding: utf-8 -*-
"""
Author -- Michael Widrich, Andreas Schörgenhumer
Contact -- schoergenhumer@ml.jku.at
Date -- 07.06.2022
###############################################################################
The following copyright statement applies to all code within this file.
Copyright statement:
This material, no matter whether in printed or electronic form,
may be used for personal and non-commercial educational use only.
Any reproduction of this manuscript, no matter whether as a whole or in parts,
no matter whether in printed or in electronic form, requires explicit prior
acceptance of the authors.
###############################################################################
"""
import argparse
import bz2
import gzip
import lzma
import os
import zipfile

import dill as pkl
import numpy as np
import onnx
import onnxruntime

TEST_DATA_PATH = r"/daten/challenge/django/data/datasets/image_inpainting_2022/test.zip"

def load_data(file: str):
    """Loads a pickled (dill) object from "file", transparently decompressing based on the file extension."""
    if file.endswith(".zip"):
        # "mode" cannot be "rb", so set it manually to "r" (still need the parameter or the function invocation fails)
        # noinspection PyUnusedLocal
        def zip_open(file_, mode):
            with zipfile.ZipFile(file_, "r") as myzip:
                return myzip.open(myzip.namelist()[0])
        
        open_fn = zip_open
    elif file.endswith(".bz2"):
        open_fn = bz2.open
    elif file.endswith(".xz"):
        open_fn = lzma.open
    elif file.endswith(".gz"):
        open_fn = gzip.open
    else:
        open_fn = open
    with open_fn(file, "rb") as pfh:
        return pkl.load(pfh)
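

# Illustrative round-trip sketch for "load_data" (not part of the original scoring pipeline;
# the path "my_submission.pkl.gz" and this helper are hypothetical): saving a list of uint8
# arrays with gzip yields a file whose ".gz" extension selects the gzip branch above.
def _demo_load_data_round_trip(path: str = "my_submission.pkl.gz"):
    predictions = [np.zeros(10, dtype=np.uint8), np.full(5, 255, dtype=np.uint8)]
    with gzip.open(path, "wb") as fh:
        pkl.dump(predictions, fh)
    loaded = load_data(path)  # Decompression is inferred from the ".gz" extension
    assert all(np.array_equal(a, b) for a, b in zip(predictions, loaded))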


def rmse(predictions: list, targets: list):
    def rmse_(prediction_array: np.ndarray, target_array: np.ndarray):
        if prediction_array.shape != target_array.shape:
            raise IndexError(f"Target shape is {target_array.shape} but prediction shape is {prediction_array.shape}")
        prediction_array, target_array = np.asarray(prediction_array, np.float64), np.asarray(target_array, np.float64)
        return np.sqrt(np.mean((prediction_array - target_array) ** 2))
    
    # Compute RMSE for each sample
    rmses = [rmse_(prediction, target) for prediction, target in zip(predictions, targets)]
    return np.mean(rmses)
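
# Worked example for "rmse" above (made-up numbers): a single sample with prediction
# [2, 2] and target [0, 4] has squared errors 4 and 4, so its per-sample RMSE is
# sqrt((4 + 4) / 2) = 2.0; the returned value is the mean of these per-sample RMSEs.
#   rmse([np.array([2, 2], dtype=np.uint8)], [np.array([0, 4], dtype=np.uint8)])  # -> 2.0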


def scoring_file(prediction_file: str, target_file: str):
    """Computes the mean RMSE loss on two lists of numpy arrays stored in the pickle files "prediction_file" and
    "target_file".
    
    Computation of mean RMSE loss, as used in the challenge for exercise 5. See files "example_testset.pkl" and
    "example_submission_random.pkl" for an example testing set and example targets, respectively. The real testing set
    (without targets) will be available as download (see assignment sheet 2).
    
    Parameters
    ----------
    prediction_file: str
        File path of prediction file. Has to be a pickle file (or dill file) and contain a list of numpy arrays of
        dtype uint8, as specified in assignment sheet 2. The file can optionally be compressed, which will be
        automatically determined based on its file extension, of which the following are supported:
        > ".zip": zip compression (https://docs.python.org/3/library/zipfile.html, including the requirement of the
          zlib module: https://docs.python.org/3/library/zlib.html)
        > ".gz": gzip compression (https://docs.python.org/3/library/gzip.html, also requires the zlib module)
        > ".bz2": bzip2 compression (https://docs.python.org/3/library/bz2.html)
        > ".xz": lzma compression (https://docs.python.org/3/library/lzma.html)
        If none of these file extensions match, it is assumed to be a raw pickle file.
    target_file: str
        File path of target file. Has to be a pickle file (or dill file) and contain a list of numpy arrays of dtype
        uint8, as specified in assignment sheet 2. The file can optionally be compressed (refer to "prediction_file"
        above for more details). This file will not be available for the challenge.
    """
    # Load predictions
    predictions = load_data(prediction_file)
    if not isinstance(predictions, list):
        raise TypeError(f"Expected a list of numpy arrays as pickle file. "
                        f"Got {type(predictions)} object in pickle file instead.")
    if not all(isinstance(prediction, np.ndarray) and np.uint8 == prediction.dtype
               for prediction in predictions):
        raise TypeError("List of predictions contains elements which are not numpy arrays of dtype uint8")
    
    # Load targets
    targets = load_data(target_file)
    if len(targets) != len(predictions):
        raise IndexError(f"List of targets has {len(targets)} elements "
                         f"but list of submitted predictions has {len(predictions)} elements.")
    
    return rmse(predictions, targets)
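
# Hypothetical usage of "scoring_file" (both file names are made-up examples):
#   scoring_file(prediction_file="my_submission.pkl.gz", target_file="targets.pkl")
# loads both pickle files (decompressing the ".gz" submission via gzip) and returns
# the mean RMSE between the submitted predictions and the targets.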


def make_predictions(onnx_model_rt, test_data: dict):
    n_samples = len(test_data["input_arrays"])
    
    # Create predictions for each sample (one by one)
    predictions = []
    for sample_i in range(n_samples):
        # Normalize input by maximal value
        input_array = test_data["input_arrays"][sample_i].astype(np.float32) / 255
        known_array = test_data["known_arrays"][sample_i].astype(np.float32)
        # Stack both inputs for the network
        input_array = np.concatenate([input_array, known_array], axis=0)
        # Pretend we have a minibatch dimension
        inputs = input_array[None]  # Adds empty dimension
        # Get outputs for network
        inputs_rt = {onnx_model_rt.get_inputs()[0].name: inputs}
        outputs = onnx_model_rt.run(None, inputs_rt)[0]  # Get first return value
        # We pretended to have a minibatch dimension -> remove this dimension
        outputs = outputs[0]
        if outputs.shape != known_array.shape:
            raise ValueError(f"Unbatched model output shape is {outputs.shape} but should be {known_array.shape}")
        # Get actual prediction from (entire) raw model output (only the unknown pixels are scored)
        prediction = outputs[known_array <= 0]
        # De-normalize prediction
        prediction = prediction * 255
        # Clip the predictions to a valid range (we know our prediction values can only be in range 0-255 because of
        # the uint8 datatype!)
        prediction = np.clip(prediction, a_min=0, a_max=255)
        # Challenge server wants uint8 datatype for predictions
        prediction = np.asarray(prediction, dtype=np.uint8)
        # Add prediction for sample to list
        predictions.append(prediction)
    return predictions
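

# Minimal sketch of the dictionary layout that "make_predictions" expects, with shapes
# taken from the "scoring_model" docstring below (3x100x100 uint8 images; image and known
# array are concatenated to 6 input channels). All values are dummies for illustration.
def _demo_test_data(n_samples: int = 2) -> dict:
    rng = np.random.default_rng(seed=0)
    known = np.ones((3, 100, 100), dtype=np.uint8)
    known[:, 40:60, 40:60] = 0  # Mark a block of pixels as unknown (to be predicted)
    return {
        "input_arrays": [rng.integers(0, 256, size=(3, 100, 100), dtype=np.uint8)
                         for _ in range(n_samples)],
        "known_arrays": [known.copy() for _ in range(n_samples)],
        "offsets": [(0, 0)] * n_samples,
        "spacings": [(1, 1)] * n_samples,
        "sample_ids": [f"sample_{i}" for i in range(n_samples)],
    }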


def scoring_model(model_file: str, test_file: str, target_file: str):
    """
    Computation of mean RMSE loss, as used in the challenge for exercise 5. The targets are loaded from the specified
    "target_file" (pickle file containing a list of numpy arrays), whereas the predictions are created using the model
    stored at "model_file" on the original testset input data stored at "test_file".
    
    Parameters
    ----------
    model_file : str
        File path of the stored (trained) model. The model must be in ONNX format, and the model output must be the
        entire image (rather than only the predicted missing pixel values, as is the case when directly submitting
        the predictions via the pickled list of numpy arrays; see function "scoring_file"). The actual predictions are
        extracted from this entire image output automatically. The input to the model will be the concatenated image
        data and the known array data from the original testset input data, and the batch size is fixed to 1, i.e.,
        the input shape is (N=1, C=6, H=100, W=100). The output of the model (the entire image) is thus expected to
        be (N=1, C=3, H=100, W=100), from which the actual predictions are extracted (given the known array).
    test_file: str
        File path of the original testset input data, which is a pickle file containing a dictionary with the
        following entries: "input_arrays" (list of numpy arrays), "known_arrays" (list of numpy arrays), "offsets"
        (list of integer 2-tuples), "spacings" (list of integer 2-tuples), "sample_ids" (list of strings). The file
        can optionally be compressed, which will be automatically determined based on its file extension, of which
        the following are supported:
        > ".zip": zip compression (https://docs.python.org/3/library/zipfile.html, including the requirement of the
          zlib module: https://docs.python.org/3/library/zlib.html)
        > ".gz": gzip compression (https://docs.python.org/3/library/gzip.html, also requires the zlib module)
        > ".bz2": bzip2 compression (https://docs.python.org/3/library/bz2.html)
        > ".xz": lzma compression (https://docs.python.org/3/library/lzma.html)
        If none of these file extensions match, it is assumed to be a raw pickle file.
    target_file: str
        File path of target file. Has to be a pickle file (or dill file) and contain a list of numpy arrays of dtype
        uint8, as specified in assignment sheet 2. The file can optionally be compressed (refer to "test_file" above
        for more details). This file will not be available for the challenge.
    """
    targets = load_data(target_file)
    
    # Verify the model before creating the inference session
    model = onnx.load_model(model_file)
    onnx.checker.check_model(model)
    onnx_model_rt = onnxruntime.InferenceSession(model_file)
    
    test_data = load_data(test_file)
    predictions = make_predictions(onnx_model_rt, test_data)
    return rmse(predictions, targets)
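
# Sketch of how a model could be exported so that "scoring_model" accepts it (assuming
# PyTorch; "net" is a hypothetical trained torch.nn.Module). The dummy input fixes the
# expected shape (N=1, C=6, H=100, W=100) from the docstring above. Kept as a comment
# since this scoring script itself does not depend on torch:
#   import torch
#   dummy_input = torch.zeros(1, 6, 100, 100, dtype=torch.float32)
#   torch.onnx.export(net, dummy_input, "my_model.onnx")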


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--submission", type=str, help="Path to submission file")
    parser.add_argument("--target", type=str, default=None, help="Path to target file")
    args = parser.parse_args()
    
    # Infer the type of submission: 1) exported ONNX model or 2) predictions file
    if args.submission.endswith(".onnx"):
        rmse_loss = scoring_model(model_file=args.submission, test_file=TEST_DATA_PATH, target_file=args.target)
    else:
        # Prediction files are too big to keep, so ensure that they are always deleted after use (deletion is
        # currently disabled, hence the commented-out cleanup below)
        try:
            rmse_loss = scoring_file(prediction_file=args.submission, target_file=args.target)
        finally:
            pass
            # if os.path.exists(args.submission):
            #     os.remove(args.submission)
    print(rmse_loss)