add lots of validation code and correct saving of pickle file

This commit is contained in:
lukas-heiligenbrunner 2022-06-29 17:20:16 +02:00
parent 8cf208cfc9
commit 79734a8d75
29 changed files with 315 additions and 35 deletions

Compress.py (new file, 44 lines)

@@ -0,0 +1,44 @@
import importlib
import shutil


def compress(filename: str):
    # supported compression algorithms and the stdlib modules / file extensions they map to
    supp_compr_algo = {"zip": "zipfile", "gzip": "gzip", "bzip2": "bz2", "lzma": "lzma"}
    extensions = {"zip": ".zip", "gzip": ".gz", "bzip2": ".bz2", "lzma": ".xz"}
    assert set(supp_compr_algo.keys()) == set(extensions.keys())

    compression = "bzip2"
    if compression not in supp_compr_algo:
        raise ValueError(
            f"Unknown compression algorithm '{compression}'; must be one of {list(supp_compr_algo.keys())}")

    try_compression(
        file=filename,
        name=compression,
        module_name=supp_compr_algo[compression],
        extension=extensions[compression],
        function=zip_compression if compression == "zip" else compression_open_context_manager
    )
    print(f"Successfully compressed '{filename}' to '{filename + extensions[compression]}' "
          f"using {compression} as compression algorithm")


def try_compression(file: str, name: str, module_name: str, extension: str, function: callable):
    try:
        compression_module = importlib.import_module(module_name)
        function(file, compression_module, extension)
    except ImportError as ex:
        raise ImportError(f"compression='{name}' failed: required module could not be loaded ({ex})")


def compression_open_context_manager(file: str, module, extension: str):
    # gzip/bz2/lzma all expose a file-like open(); simply stream the raw file through it
    with open(file, "rb") as f_in:
        with module.open(file + extension, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)


def zip_compression(file: str, module, extension: str):
    # zipfile has no open()-style streaming API, so write the file as a single archive member
    with module.ZipFile(file + extension, "w", compression=module.ZIP_DEFLATED) as z:
        z.write(file)
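
As a small usage sketch (the pickle file name here is hypothetical): compress() always takes the hard-coded bzip2 branch, so it writes a .bz2 archive next to its input, which the matching stdlib module can read back.

import bz2
import pickle

from Compress import compress

with open("predictions.pkl", "wb") as fh:  # hypothetical example file
    pickle.dump([1, 2, 3], fh, protocol=pickle.HIGHEST_PROTOCOL)

compress("predictions.pkl")  # writes "predictions.pkl.bz2" and prints a success message

# read the compressed pickle back with the matching stdlib module
with bz2.open("predictions.pkl.bz2", "rb") as fh:
    print(pickle.load(fh))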

@@ -17,8 +17,8 @@ class ImageDataset(Dataset):
def __init__(self, image_dir):
self.image_files = sorted(glob.glob(os.path.join(image_dir, "**", "*.jpg"), recursive=True))
# Mean and std arrays could also be defined as class attributes
- self.norm_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
- self.norm_std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
+ # self.norm_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
+ # self.norm_std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
def __getitem__(self, index):
# Open image file, convert to numpy array and scale to [0, 1]
@@ -29,12 +29,7 @@ class ImageDataset(Dataset):
transforms.CenterCrop(size=(IMG_SIZE, IMG_SIZE)),
])
target_image = resize_transforms(target_image)
- # normalize image from 0-1
- target_image = np.array(target_image, dtype=np.float64) / 255.0
- # Perform normalization for each channel
- # image = (image - self.norm_mean) / self.norm_std
+ target_image = preprocess(target_image)
# calculate image with black grid
doomed_image = ex4.ex4(target_image, (5, 5), (4, 4))
@@ -48,17 +43,36 @@ class ImageDataset(Dataset):
return len(self.image_files)
def preprocess(input: np.array) -> np.array:
# normalize image from 0-1
target_image = np.array(input, dtype=np.float64) / 255.0
# Perform normalization for each channel
# image = (image - self.norm_mean) / self.norm_std
return target_image
# postprocess should be the inverse function of preprocess!
def postprocess(input: np.array) -> np.array:
target_image = (input * 255.0).astype(np.uint8)
return target_image
def get_image_loader(path: str):
image_dataset = ImageDataset(path)
totlen = len(image_dataset)
- trains, tests = torch.utils.data.dataset.random_split(image_dataset, (int(totlen * .7), totlen - int(totlen * .7)),
+ test_set_size = .001
+ trains, tests = torch.utils.data.dataset.random_split(image_dataset, lengths=(totlen - int(totlen * test_set_size),
+                                                                               int(totlen * test_set_size)),
generator=torch.Generator().manual_seed(42))
train_loader = DataLoader(
trains,
shuffle=True, # shuffle the order of our samples
batch_size=5,  # stack 5 samples to a minibatch
- num_workers=2  # no background workers (see comment below)
+ num_workers=4  # use 4 background workers for data loading
)
test_loader = DataLoader(
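
postprocess() is documented above as the inverse of preprocess(). A quick round-trip check, sketched here under the assumption that the module is importable as DataLoader (as save_model in this commit does via DataLoader.preprocess / DataLoader.postprocess):

import numpy as np

import DataLoader

# random uint8 test image in channels-first layout (the shape is only an example)
img = np.random.randint(0, 256, size=(3, 100, 100), dtype=np.uint8)

scaled = DataLoader.preprocess(img)        # float64, values in [0, 1]
restored = DataLoader.postprocess(scaled)  # uint8, values back in [0, 255]

assert scaled.dtype == np.float64 and 0.0 <= scaled.min() and scaled.max() <= 1.0
# postprocess truncates instead of rounding, so allow an off-by-one difference
assert np.abs(img.astype(np.int16) - restored.astype(np.int16)).max() <= 1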

@@ -26,16 +26,16 @@ def train_model():
nn.train() # init with train mode
nn.to(device) # send net to device available
- optimizer = torch.optim.SGD(nn.parameters(), lr=0.1)  # todo adjust parameters and lr
+ optimizer = torch.optim.AdamW(nn.parameters(), lr=0.1, weight_decay=1e-5)  # todo adjust parameters and lr
loss_function = torch.nn.MSELoss()
- n_epochs = 15  # todo epcchs here
+ n_epochs = 10  # todo epochs here
# todo: check why double precision (nn.double()) is needed here
nn.double()
train_sample_size = len(train_loader)
losses = []
- best_eval_loss = 0
+ best_eval_loss = np.inf
for epoch in range(n_epochs):
print(f"Epoch {epoch}/{n_epochs}\n")
i = 0
@@ -55,12 +55,12 @@ def train_model():
end='')
i += train_loader.batch_size
- # eval model every 500th element
- if i % 500 == 0:
+ # eval model every 15 samples
+ if i % 15 == 0:
print(f"\nEvaluating model")
eval_loss = eval_model(nn, test_loader, loss_function, device)
print(f"Evalution loss={eval_loss}")
- if eval_loss > best_eval_loss:
+ if eval_loss < best_eval_loss:
best_eval_loss = eval_loss
save_model(nn)
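
eval_model() is called above but its body is not part of this diff. A typical shape for such a helper, assuming each test batch yields an (input, target) pair and that the loss is averaged over batches (an assumption, not the repository's implementation):

import torch

def eval_model_sketch(net, loader, loss_fn, device):
    # hypothetical stand-in for the eval_model() used above
    net.eval()
    total, batches = 0.0, 0
    with torch.no_grad():
        for doomed, target in loader:
            doomed, target = doomed.to(device), target.to(device)
            total += loss_fn(net(doomed), target).item()
            batches += 1
    net.train()  # restore training mode for the caller
    return total / max(batches, 1)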

Scoring.py (new file, 206 lines)

@@ -0,0 +1,206 @@
# -*- coding: utf-8 -*-
"""
Author -- Michael Widrich, Andreas Schörgenhumer
Contact -- schoergenhumer@ml.jku.at
Date -- 07.06.2022
###############################################################################
The following copyright statement applies to all code within this file.
Copyright statement:
This material, no matter whether in printed or electronic form,
may be used for personal and non-commercial educational use only.
Any reproduction of this manuscript, no matter whether as a whole or in parts,
no matter whether in printed or in electronic form, requires explicit prior
acceptance of the authors.
###############################################################################
"""
import argparse
import bz2
import gzip
import lzma
import os
import zipfile
import dill as pkl
import numpy as np
import onnx
import onnxruntime
TEST_DATA_PATH = r"/daten/challenge/django/data/datasets/image_inpainting_2022/test.zip"
def load_data(file: str):
if file.endswith(".zip"):
# "mode" cannot be "rb", so set it manually to "r" (still need the parameter or the function invocation fails)
# noinspection PyUnusedLocal
def zip_open(file_, mode):
with zipfile.ZipFile(file_, "r") as myzip:
return myzip.open(myzip.namelist()[0])
open_fn = zip_open
elif file.endswith(".bz2"):
open_fn = bz2.open
elif file.endswith(".xz"):
open_fn = lzma.open
elif file.endswith(".gz"):
open_fn = gzip.open
else:
open_fn = open
with open_fn(file, "rb") as pfh:
return pkl.load(pfh)
def rmse(predictions: list, targets: list):
def rmse_(prediction_array: np.ndarray, target_array: np.ndarray):
if prediction_array.shape != target_array.shape:
raise IndexError(f"Target shape is {target_array.shape} but prediction shape is {prediction_array.shape}")
prediction_array, target_array = np.asarray(prediction_array, np.float64), np.asarray(target_array, np.float64)
return np.sqrt(np.mean((prediction_array - target_array) ** 2))
# Compute RMSE for each sample
rmses = [rmse_(prediction, target) for prediction, target in zip(predictions, targets)]
return np.mean(rmses)
def scoring_file(prediction_file: str, target_file: str):
"""Computes the mean RMSE loss on two lists of numpy arrays stored in pickle files prediction_file and targets_file
Computation of mean RMSE loss, as used in the challenge for exercise 5. See files "example_testset.pkl" and
"example_submission_random.pkl" for an example test set and example targets, respectively. The real test set
(without targets) will be available as download (see assignment sheet 2).
Parameters
----------
prediction_file: str
File path of prediction file. Has to be a pickle file (or dill file) and contain a list of numpy arrays of dtype
uint8, as specified in assignment sheet 2. The file can optionally be compressed, which will be automatically
determined based on its file extension, of which the following are supported:
> ".zip": zip compression (https://docs.python.org/3/library/zipfile.html, including the requirement of the zlib
module: https://docs.python.org/3/library/zlib.html)
> ".gz": gzip compression (https://docs.python.org/3/library/gzip.html, also requires the zlib module)
> ".bz2": bzip2 compression (https://docs.python.org/3/library/bz2.html)
> ".xz": lzma compression (https://docs.python.org/3/library/lzma.html)
If none of these file extensions match, it is assumed to be a raw pickle file.
target_file: str
File path of target file. Has to be a pickle file (or dill file) and contain a list of numpy arrays of dtype
uint8, as specified in assignment sheet 2. The file can optionally be compressed (refer to "prediction_file"
above for more details). This file will not be available for the challenge.
"""
# Load predictions
predictions = load_data(prediction_file)
if not isinstance(predictions, list):
raise TypeError(f"Expected a list of numpy arrays as pickle file. "
f"Got {type(predictions)} object in pickle file instead.")
if not all([isinstance(prediction, np.ndarray) and np.uint8 == prediction.dtype
for prediction in predictions]):
raise TypeError("List of predictions contains elements which are not numpy arrays of dtype uint8")
# Load targets
targets = load_data(target_file)
if len(targets) != len(predictions):
raise IndexError(f"list of targets has {len(targets)} elements "
f"but list of submitted predictions has {len(predictions)} elements.")
return rmse(predictions, targets)
def make_predictions(onnx_model_rt, test_data: np.ndarray):
n_samples = len(test_data["input_arrays"])
# Create predictions for each sample (one by one)
predictions = []
for sample_i in range(n_samples):
# Normalize input by maximal value
input_array = test_data["input_arrays"][sample_i].astype(np.float32) / 255
known_array = test_data["known_arrays"][sample_i].astype(np.float32)
# Stack both inputs for the network
input_array = np.concatenate([input_array, known_array], axis=0)
# Pretend we have a minibatch dimension
inputs = input_array[None] # Adds empty dimension
# Get outputs for network
inputs_rt = {onnx_model_rt.get_inputs()[0].name: inputs}
outputs = onnx_model_rt.run(None, inputs_rt)[0] # Get first return value
# We pretended to have a minibatch dimension -> remove this dimension
outputs = outputs[0]
if outputs.shape != known_array.shape:
raise ValueError(f"Unbatched model output shape is {outputs.shape} but should be {known_array.shape}")
# Get actual prediction from (entire) raw model output
prediction = outputs[known_array <= 0]
# De-normalize prediction
prediction = prediction * 255
# Clip the predictions to a valid range (we know our prediction values can only be in range 0-255 because of
# uint8 datatype!)
prediction = np.clip(prediction, a_min=0, a_max=255)
# Challenge server wants uint8 datatype for predictions
prediction = np.asarray(prediction, dtype=np.uint8)
# Add prediction for sample to list
predictions.append(prediction)
return predictions
def scoring_model(model_file: str, test_file: str, target_file: str):
"""
Computation of mean RMSE loss, as used in the challenge for exercise 5. The targets are loaded from the specified
"target_file" (pickle file containing list of numpy arrays), whereas the predictions are created using the model
stored at "model_file" using the original testset input data stored at "test_file".
Parameters
----------
model_file : str
File path of the stored (trained) model. The model must be in ONNX format, and the model output must be the
entire image (rather than only the predicted missing pixel values as it is the case when directly submitting
the predictions via the pickled list of numpy arrays; see function "scoring_file"). The actual predictions are
extracted from this entire image output automatically. The input to the model will be the concatenated image
data and the known array data from the original testset input data, and the batch size is fixed to 1, i.e.,
the input shape is (N=1, C=6, H=100, W=100). The output of the model (the entire image) is thus expected to
be (N=1, C=3, H=100, W=100), from which the actual predictions are extracted (given the known array).
test_file: str
File path of the original testset input data, which is a pickle file containing a dictionary with the following
entries: "input_arrays" (list of numpy arrays), "known_arrays" (list of numpy arrays), "offsets" (list of
integer 2-tuples), "spacings" (list of integer 2-tuples), "sample_ids" (list of strings). The file can
optionally be compressed, which will be automatically determined based on its file extension, of which the
following are supported:
> ".zip": zip compression (https://docs.python.org/3/library/zipfile.html, including the requirement of the zlib
module: https://docs.python.org/3/library/zlib.html)
> ".gz": gzip compression (https://docs.python.org/3/library/gzip.html, also requires the zlib module)
> ".bz2": bzip2 compression (https://docs.python.org/3/library/bz2.html)
> ".xz": lzma compression (https://docs.python.org/3/library/lzma.html)
If none of these file extensions match, it is assumed to be a raw pickle file.
target_file: str
File path of target file. Has to be a pickle file (or dill file) and contain a list of numpy arrays of dtype
uint8, as specified in assignment sheet 2. The file can optionally be compressed (refer to "test_file" above
for more details). This file will not be available for the challenge.
"""
targets = load_data(target_file)
model = onnx.load_model(model_file)
onnx.checker.check_model(model)
onnx_model_rt = onnxruntime.InferenceSession(model_file)
test_data = load_data(test_file)
predictions = make_predictions(onnx_model_rt, test_data)
return rmse(predictions, targets)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--submission", type=str, help="Path to submission file")
parser.add_argument("--target", type=str, default=None, help="Path to target file")
args = parser.parse_args()
# Infer the type of submission: 1) exported ONNX model or 2) predictions file
if args.submission.endswith(".onnx"):
mse_loss = scoring_model(model_file=args.submission, test_file=TEST_DATA_PATH, target_file=args.target)
else:
# Prediction files are too big to keep, so ensure that they are always deleted after use
try:
mse_loss = scoring_file(prediction_file=args.submission, target_file=args.target)
finally:
pass
# if os.path.exists(args.submission):
# os.remove(args.submission)
print(mse_loss)
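
The docstrings above require a submission to be one pickled list of uint8 numpy arrays. A minimal self-check, sketched with toy data (it assumes Scoring's dependencies dill, onnx and onnxruntime are installed so the module imports cleanly): scoring a prediction file against itself must yield an RMSE of 0.

import pickle

import numpy as np

from Scoring import rmse, scoring_file

# toy "submission": a list of uint8 arrays, as required by scoring_file
preds = [np.random.randint(0, 256, size=(200,), dtype=np.uint8) for _ in range(3)]

with open("toy_predictions.pkl", "wb") as fh:
    pickle.dump(preds, fh, protocol=pickle.HIGHEST_PROTOCOL)

print(rmse(preds, preds))                                          # -> 0.0
print(scoring_file("toy_predictions.pkl", "toy_predictions.pkl"))  # -> 0.0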

@@ -1,14 +1,46 @@
import torch
import pickle
import sys
import numpy as np
import torch
import Compress
import DataLoader
from Net import ImageNN
MODEL_PATH = 'impaintmodel.pt'
PICKEL_PATH = 'impaintmodel.pkl'
def save_model(model: torch.nn.Module):
- torch.save(model, 'impaintmodel.pt')
+ print(f"Saved raw model to {MODEL_PATH}")
+ torch.save(model, MODEL_PATH)
# read the provided testing pickle file
print("Generating pickle file with privided test data")
model.eval()
with open('testing/inputs.pkl', 'rb') as handle:
with open(PICKEL_PATH, 'wb') as writehandle:
b: dict = pickle.load(handle)
outarr = []
i=0
piclen = len(b['input_arrays'])
for pic in b['input_arrays']:
pic = DataLoader.preprocess(pic)
out = model(torch.from_numpy(pic))
out = DataLoader.postprocess(out.detach().numpy())
pickle.dump(out, writehandle, protocol=pickle.HIGHEST_PROTOCOL)
print(
f'\rApplying model [{i}/{piclen}] {sys.getsizeof(outarr)}',end='')
i += 1
# compress the generated pickle arr
Compress.compress(PICKEL_PATH)
def load_model():
model = ImageNN()
- model.load_state_dict(torch.load('impaintmodel.pt'))
+ model.load_state_dict(torch.load(MODEL_PATH))
model.eval()
return model
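
Two consistency points are worth noting here: Scoring.load_data reads a single pickled object (one list of arrays), whereas the loop in save_model above calls pickle.dump once per sample, and load_model() expects a state dict although save_model() stores the full module via torch.save(model, ...). Below is a sketch of a save/load pair that matches both expectations; the helper names and STATE_PATH are hypothetical, while testing/inputs.pkl, Compress, DataLoader and ImageNN are taken from the code above.

import pickle

import torch

import Compress
import DataLoader
from Net import ImageNN

STATE_PATH = "impaintmodel_state.pt"    # hypothetical path for the state dict
PREDICTIONS_PATH = "impaintmodel.pkl"

def save_model_sketch(model: torch.nn.Module):
    torch.save(model.state_dict(), STATE_PATH)  # pairs with load_state_dict() below
    model.eval()
    with open('testing/inputs.pkl', 'rb') as handle:
        data: dict = pickle.load(handle)
    predictions = []
    with torch.no_grad():
        for pic in data['input_arrays']:
            out = model(torch.from_numpy(DataLoader.preprocess(pic)))
            predictions.append(DataLoader.postprocess(out.numpy()))
    # dump the whole list once, which is what Scoring.load_data expects to read back
    with open(PREDICTIONS_PATH, 'wb') as writehandle:
        pickle.dump(predictions, writehandle, protocol=pickle.HIGHEST_PROTOCOL)
    Compress.compress(PREDICTIONS_PATH)

def load_model_sketch() -> torch.nn.Module:
    model = ImageNN()
    model.load_state_dict(torch.load(STATE_PATH))
    model.eval()
    return model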

@@ -1,7 +0,0 @@
unittest\unittest_input_0\00.jpg
unittest\unittest_input_0\01.jpg
unittest\unittest_input_0\02.jpg
unittest\unittest_input_0\04.jpg
unittest\unittest_input_0\05.jpg
unittest\unittest_input_0\subfolder\06.jpg
unittest\unittest_input_0\subfolder\07.jpg

@@ -1,9 +0,0 @@
unittest\unittest_input_1\08.jpg
unittest\unittest_input_1\09.jpg
unittest\unittest_input_1\11.jpg
unittest\unittest_input_1\12.jpg
unittest\unittest_input_1\13.jpg
unittest\unittest_input_1\14.jpg
unittest\unittest_input_1\subfolder\15.jpg
unittest\unittest_input_1\subfolder\subsubfolder\16.jpg
unittest\unittest_input_1\subfolder\subsubfolder\17.jpg

18 binary image files (deleted unittest JPEGs, 18–94 KiB each) are not shown.