Add validation code and correct the saving of the pickle file
Compress.py | 44 lines (new file)
						| @@ -0,0 +1,44 @@ | ||||
| import importlib | ||||
| import shutil | ||||
|  | ||||
|  | ||||
| def compress(filename: str): | ||||
|     supp_compr_algo = {"zip": "zipfile", "gzip": "gzip", "bzip2": "bz2", "lzma": "lzma"} | ||||
|     extensions = {"zip": ".zip", "gzip": ".gz", "bzip2": ".bz2", "lzma": ".xz"} | ||||
|     assert set(supp_compr_algo.keys()) == set(extensions.keys()) | ||||
|  | ||||
|     compression = "bzip2" | ||||
|  | ||||
|     if compression not in supp_compr_algo: | ||||
|         raise ValueError( | ||||
|             f"Unknown compression algorithm '{compression}'; must be one of {list(supp_compr_algo.keys())}") | ||||
|  | ||||
|     try_compression( | ||||
|         file=filename, | ||||
|         name=compression, | ||||
|         module_name=supp_compr_algo[compression], | ||||
|         extension=extensions[compression], | ||||
|         function=zip_compression if compression == "zip" else compression_open_context_manager | ||||
|     ) | ||||
|     print(f"Successfully compressed '{filename}' to '{filename + extensions[compression]}' " | ||||
|           f"using {compression} as compression algorithm") | ||||
|  | ||||
|  | ||||
| def try_compression(file: str, name: str, module_name: str, extension: str, function: callable): | ||||
|     try: | ||||
|         compression_module = importlib.import_module(module_name) | ||||
|  | ||||
|         function(file, compression_module, extension) | ||||
|     except ImportError as ex: | ||||
|         raise ImportError(f"compression='{name}' failed: required module could not be loaded ({ex})") | ||||
|  | ||||
|  | ||||
| def compression_open_context_manager(file: str, module, extension: str): | ||||
|     with open(file, "rb") as f_in: | ||||
|         with module.open(file + extension, "wb") as f_out: | ||||
|             shutil.copyfileobj(f_in, f_out) | ||||
|  | ||||
|  | ||||
| def zip_compression(file: str, module, extension: str): | ||||
|     with module.ZipFile(file + extension, "w", compression=module.ZIP_DEFLATED) as z: | ||||
|         z.write(file) | ||||
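For reference, a minimal usage sketch of the new Compress module (assuming `impaintmodel.pkl` exists in the working directory; note that `compress()` currently hard-codes bzip2 as the algorithm):

```python
import bz2
import pickle

import Compress

# Produces "impaintmodel.pkl.bz2" next to the original file
Compress.compress("impaintmodel.pkl")

# The compressed pickle can be read back the same way Scoring.load_data does it
with bz2.open("impaintmodel.pkl.bz2", "rb") as fh:
    data = pickle.load(fh)
```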
| @@ -17,8 +17,8 @@ class ImageDataset(Dataset): | ||||
|     def __init__(self, image_dir): | ||||
|         self.image_files = sorted(glob.glob(os.path.join(image_dir, "**", "*.jpg"), recursive=True)) | ||||
|         # Mean and std arrays could also be defined as class attributes | ||||
|         self.norm_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) | ||||
|         self.norm_std = np.array([0.229, 0.224, 0.225], dtype=np.float32) | ||||
|         # self.norm_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) | ||||
|         # self.norm_std = np.array([0.229, 0.224, 0.225], dtype=np.float32) | ||||
|  | ||||
|     def __getitem__(self, index): | ||||
|         # Open image file, convert to numpy array and scale to [0, 1] | ||||
| @@ -29,12 +29,7 @@ class ImageDataset(Dataset): | ||||
|             transforms.CenterCrop(size=(IMG_SIZE, IMG_SIZE)), | ||||
|         ]) | ||||
|         target_image = resize_transforms(target_image) | ||||
|  | ||||
|         # normalize image from 0-1 | ||||
|         target_image = np.array(target_image, dtype=np.float64) / 255.0 | ||||
|  | ||||
|         # Perform normalization for each channel | ||||
|         # image = (image - self.norm_mean) / self.norm_std | ||||
|         target_image = preprocess(target_image) | ||||
|  | ||||
|         # calculate image with black grid | ||||
|         doomed_image = ex4.ex4(target_image, (5, 5), (4, 4)) | ||||
| @@ -48,17 +43,36 @@ class ImageDataset(Dataset): | ||||
|         return len(self.image_files) | ||||
|  | ||||
|  | ||||
| def preprocess(input: np.array) -> np.array: | ||||
|     # normalize image from 0-1 | ||||
|     target_image = np.array(input, dtype=np.float64) / 255.0 | ||||
|  | ||||
|     # Perform normalization for each channel | ||||
|     # image = (image - self.norm_mean) / self.norm_std | ||||
|  | ||||
|     return target_image | ||||
|  | ||||
|  | ||||
| # postprocess should be the inverse function of preprocess! | ||||
| def postprocess(input: np.array) -> np.array: | ||||
|     target_image = (input * 255.0).astype(np.uint8) | ||||
|     return target_image | ||||
|  | ||||
|  | ||||
| def get_image_loader(path: str): | ||||
|     image_dataset = ImageDataset(path) | ||||
|     totlen = len(image_dataset) | ||||
|     trains, tests = torch.utils.data.dataset.random_split(image_dataset, (int(totlen * .7), totlen - int(totlen * .7)), | ||||
|  | ||||
|     test_set_size = .001 | ||||
|     trains, tests = torch.utils.data.dataset.random_split(image_dataset, lengths=(totlen - int(totlen * test_set_size), | ||||
|                                                                                   int(totlen * test_set_size)), | ||||
|                                                           generator=torch.Generator().manual_seed(42)) | ||||
|  | ||||
|     train_loader = DataLoader( | ||||
|         trains, | ||||
|         shuffle=True,  # shuffle the order of our samples | ||||
|         batch_size=5,  # stack 5 samples into a minibatch | ||||
|         num_workers=2  # no background workers (see comment below) | ||||
|         num_workers=4  # use 4 background worker processes | ||||
|     ) | ||||
|  | ||||
|     test_loader = DataLoader( | ||||
|   | ||||
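Since `postprocess` is meant to invert `preprocess`, a quick round-trip check helps catch scaling bugs early. A minimal sketch, assuming both functions are importable from the DataLoader module:

```python
import numpy as np

from DataLoader import preprocess, postprocess

image = np.random.randint(0, 256, size=(100, 100, 3), dtype=np.uint8)
restored = postprocess(preprocess(image))

assert restored.dtype == np.uint8 and restored.shape == image.shape
# Float rounding followed by the truncating uint8 cast can shift single values
# by one; rounding with np.rint before the cast would make the round trip exact.
assert np.max(np.abs(restored.astype(np.int16) - image.astype(np.int16))) <= 1
```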
| @@ -26,16 +26,16 @@ def train_model(): | ||||
|     nn.train()  # init with train mode | ||||
|     nn.to(device)  # send net to device available | ||||
|  | ||||
|     optimizer = torch.optim.SGD(nn.parameters(), lr=0.1)  # todo adjust parameters and lr | ||||
|     optimizer = torch.optim.AdamW(nn.parameters(), lr=0.1, weight_decay=1e-5)  # todo adjust parameters and lr | ||||
|     loss_function = torch.nn.MSELoss() | ||||
|     n_epochs = 15  # todo epcchs here | ||||
|     n_epochs = 10  # todo: tune the number of epochs | ||||
|  | ||||
|     # todo: check why the model has to be cast to double precision | ||||
|     nn.double() | ||||
|  | ||||
|     train_sample_size = len(train_loader) | ||||
|     losses = [] | ||||
|     best_eval_loss = 0 | ||||
|     best_eval_loss = np.inf | ||||
|     for epoch in range(n_epochs): | ||||
|         print(f"Epoch {epoch}/{n_epochs}\n") | ||||
|         i = 0 | ||||
| @@ -55,12 +55,12 @@ def train_model(): | ||||
|                 end='') | ||||
|             i += train_loader.batch_size | ||||
|  | ||||
|             # eval model every 500th element | ||||
|             if i % 500 == 0: | ||||
|             # eval model every 15th sample | ||||
|             if i % 15 == 0: | ||||
|                 print(f"\nEvaluating model") | ||||
|                 eval_loss = eval_model(nn, test_loader, loss_function, device) | ||||
|                 print(f"Evalution loss={eval_loss}") | ||||
|                 if eval_loss > best_eval_loss: | ||||
|                 if eval_loss < best_eval_loss: | ||||
|                     best_eval_loss = eval_loss | ||||
|                     save_model(nn) | ||||
|  | ||||
|   | ||||
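The training loop now starts `best_eval_loss` at `np.inf` and checkpoints whenever a lower evaluation loss is seen. A standalone sketch of that pattern (the `keep_best` helper and the loss values are purely illustrative):

```python
import numpy as np


def keep_best(eval_losses):
    """Track the smallest evaluation loss, 'saving' whenever it improves."""
    best_eval_loss = np.inf  # start at +inf so the first evaluation always improves it
    for step, eval_loss in enumerate(eval_losses):
        if eval_loss < best_eval_loss:  # lower is better for a loss
            best_eval_loss = eval_loss
            print(f"step {step}: new best loss {eval_loss:.4f} -> save checkpoint")
    return best_eval_loss


keep_best([0.31, 0.27, 0.29, 0.25])  # checkpoints at steps 0, 1 and 3
```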
Scoring.py | 206 lines (new file)
						| @@ -0,0 +1,206 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
| """ | ||||
| Author -- Michael Widrich, Andreas Schörgenhumer | ||||
| Contact -- schoergenhumer@ml.jku.at | ||||
| Date -- 07.06.2022 | ||||
|  | ||||
| ############################################################################### | ||||
|  | ||||
| The following copyright statement applies to all code within this file. | ||||
|  | ||||
| Copyright statement: | ||||
| This  material,  no  matter  whether  in  printed  or  electronic  form, | ||||
| may  be  used  for personal  and non-commercial educational use only. | ||||
| Any reproduction of this manuscript, no matter whether as a whole or in parts, | ||||
| no matter whether in printed or in electronic form, requires explicit prior | ||||
| acceptance of the authors. | ||||
|  | ||||
| ############################################################################### | ||||
|  | ||||
| """ | ||||
| import argparse | ||||
| import bz2 | ||||
| import gzip | ||||
| import lzma | ||||
| import os | ||||
| import zipfile | ||||
|  | ||||
| import dill as pkl | ||||
| import numpy as np | ||||
| import onnx | ||||
| import onnxruntime | ||||
|  | ||||
| TEST_DATA_PATH = r"/daten/challenge/django/data/datasets/image_inpainting_2022/test.zip" | ||||
|  | ||||
|  | ||||
| def load_data(file: str): | ||||
|     if file.endswith(".zip"): | ||||
|         # "mode" cannot be "rb", so set it manually to "r" (still need the parameter or the function invocation fails) | ||||
|         # noinspection PyUnusedLocal | ||||
|         def zip_open(file_, mode): | ||||
|             with zipfile.ZipFile(file_, "r") as myzip: | ||||
|                 return myzip.open(myzip.namelist()[0]) | ||||
|  | ||||
|         open_fn = zip_open | ||||
|     elif file.endswith(".bz2"): | ||||
|         open_fn = bz2.open | ||||
|     elif file.endswith(".xz"): | ||||
|         open_fn = lzma.open | ||||
|     elif file.endswith(".gz"): | ||||
|         open_fn = gzip.open | ||||
|     else: | ||||
|         open_fn = open | ||||
|     with open_fn(file, "rb") as pfh: | ||||
|         return pkl.load(pfh) | ||||
|  | ||||
|  | ||||
| def rmse(predictions: list, targets: list): | ||||
|     def rmse_(prediction_array: np.ndarray, target_array: np.ndarray): | ||||
|         if prediction_array.shape != target_array.shape: | ||||
|             raise IndexError(f"Target shape is {target_array.shape} but prediction shape is {prediction_array.shape}") | ||||
|         prediction_array, target_array = np.asarray(prediction_array, np.float64), np.asarray(target_array, np.float64) | ||||
|         return np.sqrt(np.mean((prediction_array - target_array) ** 2)) | ||||
|  | ||||
|     # Compute RMSE for each sample | ||||
|     rmses = [rmse_(prediction, target) for prediction, target in zip(predictions, targets)] | ||||
|     return np.mean(rmses) | ||||
|  | ||||
|  | ||||
| def scoring_file(prediction_file: str, target_file: str): | ||||
|     """Computes the mean RMSE loss on two lists of numpy arrays stored in pickle files prediction_file and targets_file | ||||
|  | ||||
|     Computation of mean RMSE loss, as used in the challenge for exercise 5. See files "example_testset.pkl" and | ||||
|     "example_submission_random.pkl" for an example test set and example targets, respectively. The real test set | ||||
|     (without targets) will be available as download (see assignment sheet 2). | ||||
|  | ||||
|     Parameters | ||||
|     ---------- | ||||
|     prediction_file: str | ||||
|         File path of prediction file. Has to be a pickle file (or dill file) and contain a list of numpy arrays of dtype | ||||
|         uint8, as specified in assignment sheet 2. The file can optionally be compressed, which will be automatically | ||||
|         determined based on its file extension, of which the following are supported: | ||||
|         > ".zip": zip compression (https://docs.python.org/3/library/zipfile.html, including the requirement of the zlib | ||||
|           module: https://docs.python.org/3/library/zlib.html) | ||||
|         > ".gz": gzip compression (https://docs.python.org/3/library/gzip.html, also requires the zlib module) | ||||
|         > ".bz2": bzip2 compression (https://docs.python.org/3/library/bz2.html) | ||||
|         > ".xz": lzma compression (https://docs.python.org/3/library/lzma.html) | ||||
|         If none of these file extensions match, it is assumed to be a raw pickle file. | ||||
|     target_file: str | ||||
|         File path of target file. Has to be a pickle file (or dill file) and contain a list of numpy arrays of dtype | ||||
|         uint8, as specified in assignment sheet 2. The file can optionally be compressed (refer to "predictions_file" | ||||
|         above for more details). This file will not be available for the challenge. | ||||
|     """ | ||||
|     # Load predictions | ||||
|     predictions = load_data(prediction_file) | ||||
|     if not isinstance(predictions, list): | ||||
|         raise TypeError(f"Expected a list of numpy arrays as pickle file. " | ||||
|                         f"Got {type(predictions)} object in pickle file instead.") | ||||
|     if not all([isinstance(prediction, np.ndarray) and np.uint8 == prediction.dtype | ||||
|                 for prediction in predictions]): | ||||
|         raise TypeError("List of predictions contains elements which are not numpy arrays of dtype uint8") | ||||
|  | ||||
|     # Load targets | ||||
|     targets = load_data(target_file) | ||||
|     if len(targets) != len(predictions): | ||||
|         raise IndexError(f"list of targets has {len(targets)} elements " | ||||
|                          f"but list of submitted predictions has {len(predictions)} elements.") | ||||
|  | ||||
|     return rmse(predictions, targets) | ||||
|  | ||||
|  | ||||
| def make_predictions(onnx_model_rt, test_data: np.ndarray): | ||||
|     n_samples = len(test_data["input_arrays"]) | ||||
|  | ||||
|     # Create predictions for each sample (one by one) | ||||
|     predictions = [] | ||||
|     for sample_i in range(n_samples): | ||||
|         # Normalize input by maximal value | ||||
|         input_array = test_data["input_arrays"][sample_i].astype(np.float32) / 255 | ||||
|         known_array = test_data["known_arrays"][sample_i].astype(np.float32) | ||||
|         # Stack both inputs for the network | ||||
|         input_array = np.concatenate([input_array, known_array], axis=0) | ||||
|         # Pretend we have a minibatch dimension | ||||
|         inputs = input_array[None]  # Adds empty dimension | ||||
|  | ||||
|         # Get outputs for network | ||||
|         inputs_rt = {onnx_model_rt.get_inputs()[0].name: inputs} | ||||
|         outputs = onnx_model_rt.run(None, inputs_rt)[0]  # Get first return value | ||||
|         # We pretended to have a minibatch dimension -> remove this dimension | ||||
|         outputs = outputs[0] | ||||
|         if outputs.shape != known_array.shape: | ||||
|             raise ValueError(f"Unbatched model output shape is {outputs.shape} but should be {known_array.shape}") | ||||
|         # Get actual prediction from (entire) raw model output | ||||
|         prediction = outputs[known_array <= 0] | ||||
|  | ||||
|         # De-normalize prediction | ||||
|         prediction = prediction * 255 | ||||
|         # Clip the predictions to a valid range (we know our prediction values can only be in range 0-255 because of | ||||
|         # uint8 datatype!) | ||||
|         prediction = np.clip(prediction, a_min=0, a_max=255) | ||||
|         # Challenge server wants uint8 datatype for predictions | ||||
|         prediction = np.asarray(prediction, dtype=np.uint8) | ||||
|         # Add prediction for sample to list | ||||
|         predictions.append(prediction) | ||||
|  | ||||
|     return predictions | ||||
|  | ||||
|  | ||||
| def scoring_model(model_file: str, test_file: str, target_file: str): | ||||
|     """ | ||||
|     Computation of mean RMSE loss, as used in the challenge for exercise 5. The targets are loaded from the specified | ||||
|     "target_file" (pickle file containing list of numpy arrays), whereas the predictions are created using the model | ||||
|     stored at "model_file" using the original testset input data stored at "test_file". | ||||
|  | ||||
|     Parameters | ||||
|     ---------- | ||||
|     model_file : str | ||||
|         File path of the stored (trained) model. The model must be in ONNX format, and the model output must be the | ||||
|         entire image (rather than only the predicted missing pixel values as it is the case when directly submitting | ||||
|         the predictions via the pickled list of numpy arrays; see function "scoring_file"). The actual predictions are | ||||
|         extracted from this entire image output automatically. The input to the model will be the concatenated image | ||||
|         data and the known array data from the original testset input data, and the batch size is fixed to 1, i.e., | ||||
|         the input shape is (N=1, C=6, H=100, W=100). The output of the model (the entire image) is thus expected to | ||||
|         be (N=1, C=3, H=100, W=100), from which the actual predictions are extracted (given the known array). | ||||
|     test_file: str | ||||
|         File path of the original testset input data, which is a pickle file containing a dictionary with the following | ||||
|         entries: "input_arrays" (list of numpy arrays), "known_arrays" (list of numpy arrays), "offsets" (list of | ||||
|         integer 2-tuples), "spacings" (list of integer 2-tuples), "sample_ids" (list of strings). The file can | ||||
|         optionally be compressed, which will be automatically determined based on its file extension, of which the | ||||
|         following are supported: | ||||
|         > ".zip": zip compression (https://docs.python.org/3/library/zipfile.html, including the requirement of the zlib | ||||
|           module: https://docs.python.org/3/library/zlib.html) | ||||
|         > ".gz": gzip compression (https://docs.python.org/3/library/gzip.html, also requires the zlib module) | ||||
|         > ".bz2": bzip2 compression (https://docs.python.org/3/library/bz2.html) | ||||
|         > ".xz": lzma compression (https://docs.python.org/3/library/lzma.html) | ||||
|         If none of these file extensions match, it is assumed to be a raw pickle file. | ||||
|     target_file: str | ||||
|         File path of target file. Has to be a pickle file (or dill file) and contain a list of numpy arrays of dtype | ||||
|         uint8, as specified in assignment sheet 2. The file can optionally be compressed (refer to "test_file" above | ||||
|         for more details). This file will not be available for the challenge. | ||||
|     """ | ||||
|     targets = load_data(target_file) | ||||
|     model = onnx.load_model(model_file) | ||||
|     onnx.checker.check_model(model) | ||||
|     onnx_model_rt = onnxruntime.InferenceSession(model_file) | ||||
|     test_data = load_data(test_file) | ||||
|     predictions = make_predictions(onnx_model_rt, test_data) | ||||
|     return rmse(predictions, targets) | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     parser = argparse.ArgumentParser() | ||||
|     parser.add_argument("--submission", type=str, help="Path to submission file") | ||||
|     parser.add_argument("--target", type=str, default=None, help="Path to target file") | ||||
|     args = parser.parse_args() | ||||
|     # Infer the type of submission: 1) exported ONNX model or 2) predictions file | ||||
|     if args.submission.endswith(".onnx"): | ||||
|         mse_loss = scoring_model(model_file=args.submission, test_file=TEST_DATA_PATH, target_file=args.target) | ||||
|     else: | ||||
|         # Prediction files are too big to keep; deletion after scoring is currently disabled | ||||
|         try: | ||||
|             mse_loss = scoring_file(prediction_file=args.submission, target_file=args.target) | ||||
|         finally: | ||||
|             pass | ||||
|             # if os.path.exists(args.submission): | ||||
|             #     os.remove(args.submission) | ||||
|     print(mse_loss) | ||||
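For local testing, Scoring.py can also be used programmatically. A hedged sketch using the example files mentioned in the docstrings (file names are placeholders for whatever is actually on disk):

```python
from Scoring import scoring_file, scoring_model

# Score a submitted predictions file against known targets
loss = scoring_file(prediction_file="example_submission_random.pkl",
                    target_file="example_targets.pkl")
print(loss)

# Or score an exported ONNX model against the original test inputs
# loss = scoring_model(model_file="impaintmodel.onnx",
#                      test_file="example_testset.pkl",
#                      target_file="example_targets.pkl")
```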
netio.py | 38 lines
						| @@ -1,14 +1,46 @@ | ||||
| import torch | ||||
| import pickle | ||||
| import sys | ||||
|  | ||||
| import numpy as np | ||||
| import torch | ||||
| import Compress | ||||
| import DataLoader | ||||
| from Net import ImageNN | ||||
|  | ||||
| MODEL_PATH = 'impaintmodel.pt' | ||||
| PICKEL_PATH = 'impaintmodel.pkl' | ||||
|  | ||||
|  | ||||
| def save_model(model: torch.nn.Module): | ||||
|     torch.save(model, 'impaintmodel.pt') | ||||
|     print(f"Saved raw model to {MODEL_PATH}") | ||||
|     torch.save(model, MODEL_PATH) | ||||
|  | ||||
|     # read the provided testing pickle file | ||||
|     print("Generating pickle file with privided test data") | ||||
|     model.eval() | ||||
|     with open('testing/inputs.pkl', 'rb') as handle: | ||||
|         with open(PICKEL_PATH, 'wb') as writehandle: | ||||
|             b: dict = pickle.load(handle) | ||||
|             outarr = [] | ||||
|             i = 0 | ||||
|             piclen = len(b['input_arrays']) | ||||
|             for pic in b['input_arrays']: | ||||
|                 pic = DataLoader.preprocess(pic) | ||||
|                 out = model(torch.from_numpy(pic)) | ||||
|                 out = DataLoader.postprocess(out.detach().numpy()) | ||||
|                 # collect the predictions; they are written as one list below, | ||||
|                 # which is the structure the scoring script expects | ||||
|                 outarr.append(out) | ||||
|  | ||||
|                 print( | ||||
|                     f'\rApplying model [{i}/{piclen}] {sys.getsizeof(outarr)}', end='') | ||||
|                 i += 1 | ||||
|             # dump the whole list in a single call so that one pickle.load | ||||
|             # returns the complete list of predictions | ||||
|             pickle.dump(outarr, writehandle, protocol=pickle.HIGHEST_PROTOCOL) | ||||
|  | ||||
|     # compress the generated pickle arr | ||||
|     Compress.compress(PICKEL_PATH) | ||||
|  | ||||
|  | ||||
| def load_model(): | ||||
|     model = ImageNN() | ||||
|     model.load_state_dict(torch.load('impaintmodel.pt')) | ||||
|     model.load_state_dict(torch.load(MODEL_PATH)) | ||||
|     model.eval() | ||||
|     return model | ||||
|  | ||||
|   | ||||
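Because `scoring_file` expects one pickled list of uint8 numpy arrays, the file produced by `save_model` can be sanity-checked with a few lines (a sketch; the file name comes from `PICKEL_PATH` above):

```python
import pickle

import numpy as np

with open("impaintmodel.pkl", "rb") as fh:
    predictions = pickle.load(fh)

assert isinstance(predictions, list)
assert all(isinstance(p, np.ndarray) and p.dtype == np.uint8 for p in predictions)
print(f"{len(predictions)} predictions, first shape: {predictions[0].shape}")
```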
| @@ -1,7 +0,0 @@ | ||||
| unittest\unittest_input_0\00.jpg | ||||
| unittest\unittest_input_0\01.jpg | ||||
| unittest\unittest_input_0\02.jpg | ||||
| unittest\unittest_input_0\04.jpg | ||||
| unittest\unittest_input_0\05.jpg | ||||
| unittest\unittest_input_0\subfolder\06.jpg | ||||
| unittest\unittest_input_0\subfolder\07.jpg | ||||
| @@ -1,9 +0,0 @@ | ||||
| unittest\unittest_input_1\08.jpg | ||||
| unittest\unittest_input_1\09.jpg | ||||
| unittest\unittest_input_1\11.jpg | ||||
| unittest\unittest_input_1\12.jpg | ||||
| unittest\unittest_input_1\13.jpg | ||||
| unittest\unittest_input_1\14.jpg | ||||
| unittest\unittest_input_1\subfolder\15.jpg | ||||
| unittest\unittest_input_1\subfolder\subsubfolder\16.jpg | ||||
| unittest\unittest_input_1\subfolder\subsubfolder\17.jpg | ||||
(18 binary JPG files removed from the unittest data; sizes 18–94 KiB)