diff --git a/typstalt/introduction.typ b/typstalt/introduction.typ
new file mode 100644
index 0000000..7da5a88
--- /dev/null
+++ b/typstalt/introduction.typ
@@ -0,0 +1,31 @@
= Introduction
== Motivation
Anomaly detection is of essential importance, especially in the industrial and automotive fields.
Many assembly lines require visual inspection to find defects, often with the help of camera systems.
Machine learning has advanced the field considerably in recent years.
PatchCore and EfficientAD are state-of-the-art algorithms that are trained on defect-free data only and then detect anomalies in unseen (but similar) data.
One of their drawbacks is the need for large amounts of training data and long training times.
Few-shot learning might be a suitable alternative with substantially reduced training time.

In this thesis the performance of three few-shot learning algorithms is compared in the field of anomaly detection.
Moreover, few-shot learning might be able not only to detect anomalies but also to classify the anomaly type.

== Research Questions

=== Is few-shot learning a suitable fit for anomaly detection?

Should few-shot learning be used for anomaly detection tasks?
How does it compare to well-established algorithms such as PatchCore or EfficientAD?

=== How does imbalancing the shot number affect performance?
Does giving the few-shot learner more defect-free than anomalous samples improve the model performance?

=== How do the three methods (ResNet, CAML, P$>$M$>$F) perform in detecting the anomaly class?
How much does the performance improve if the model only has to decide whether a sample is anomalous or not?
How does it compare to PatchCore and EfficientAD?

=== Extra: How does Euclidean distance compare to cosine similarity when using ResNet as a feature extractor?
Preliminary experiments with different distance measures suggest that the results are largely the same.
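The sketch below (with hypothetical 512-dimensional embedding vectors) may explain this observation: for L2-normalised embeddings the squared Euclidean distance equals exactly twice the cosine distance, so nearest-neighbour rankings coincide.

```python
import numpy as np

def euclidean(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.linalg.norm(a - b))

def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
    return 1.0 - float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

# hypothetical ResNet embeddings of two samples
a, b = np.random.rand(512), np.random.rand(512)

# after L2 normalisation: ||a - b||^2 = 2 - 2 cos(a, b) = 2 * cosine_distance
a, b = a / np.linalg.norm(a), b / np.linalg.norm(b)
print(euclidean(a, b) ** 2, 2 * cosine_distance(a, b))  # identical values
```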
== Outline
todo
diff --git a/typstalt/main.pdf b/typstalt/main.pdf
new file mode 100644
index 0000000..93d461c
Binary files /dev/null and b/typstalt/main.pdf differ
diff --git a/typstalt/main.typ b/typstalt/main.typ
new file mode 100644
index 0000000..02d2a7a
--- /dev/null
+++ b/typstalt/main.typ
@@ -0,0 +1,82 @@
#import "@preview/springer-spaniel:0.1.0"
#import springer-spaniel.ctheorems: * // provides "proof", "theorem", "lemma"

// Set citation style
#set cite(style: "iso-690-author-date") // page info visible
//#set cite(style: "iso-690-numeric") // page info visible
//#set cite(style: "springer-basic") // no additional info visible (page number in square brackets)
//#set cite(style: "alphanumeric") // page info not visible


#show: springer-spaniel.template(
  title: [Few-Shot Learning for Anomaly Detection -- Bachelor Thesis for AI],
  authors: (
    (
      name: "Lukas Heiligenbrunner",
      institute: "Johannes Kepler University",
      address: "Linz, Austria",
      email: "lukas.heiligenbrunner@gmail.com"
    ),
    // ... and so on
  ),
  abstract: lorem(75),

  // debug: true, // Highlights structural elements and links
  // frame: 1pt, // A border around the page for white on white display
  // printer-test: true, // Suitably placed CMYK printer tests
)

#let date = datetime.today() // not today: datetime(year: 1969, month: 9, day: 6,)
#let k-number = "k12345678"

// set equation and heading numbering
#set math.equation(numbering: "(1)")
#set heading(numbering: "1.1")


// Pagebreak after level 1 headings
#show heading.where(level: 1): it => [
  #pagebreak(weak: true)
  #it
]

// show reference targets in brackets
#show ref: it => {
  let el = it.element
  if el != none and el.func() == heading {
    [#it (#el.body)]
  } else [#it]
}

// style table-of-contents
#show outline.entry.where(
  level: 1
): it => {
  v(1em, weak: true)
  strong(it)
}

// Table of contents.
#outline(
  title: {
    text(1.3em, weight: 700, "Contents")
    v(10mm)
  },
  indent: 2em,
  depth: 3
)
#pagebreak(weak: false)

#include "introduction.typ"
#include "materialandmethods.typ"

= Section Heading
== Subsection Heading
=== Subsubsection Heading
==== Paragraph Heading
===== Subparagraph Heading

#set par(leading: 0.7em, first-line-indent: 0em, justify: true)
#bibliography("sources.bib", style: "apa")
diff --git a/typstalt/materialandmethods.typ b/typstalt/materialandmethods.typ
new file mode 100644
index 0000000..cdd0980
--- /dev/null
+++ b/typstalt/materialandmethods.typ
@@ -0,0 +1,121 @@
= Material and Methods

== Material

=== MVTec AD
MVTec AD is a dataset for benchmarking anomaly detection methods with a focus on industrial inspection.
It contains over 5000 high-resolution images divided into fifteen different object and texture categories.
Each category comprises a set of defect-free training images and a test set of images with various kinds of defects as well as images without defects.

// todo source for https://www.mvtec.com/company/research/datasets/mvtec-ad

// todo example image
//\begin{figure}
//  \centering
//  \includegraphics[width=\linewidth/2]{../rsc/muffin_chiauaua_poster}
//  \caption{Sample images from dataset. \cite{muffinsvschiuahuakaggle_poster}}
//  \label{fig:roc-example}
//\end{figure}


== Methods

=== Few-Shot Learning
Few-shot learning is a subfield of machine learning which aims to train a classification model with just a few samples, or in the extreme case none at all.
In contrast to traditional supervised learning, where a huge amount of labeled data is required to generalize well to unseen data, a few-shot model must work with scarce data and is therefore prone to overfitting to the few training samples.

Typically a few-shot learning task consists of a support set and a query set.
The support set contains the training data and the query set the evaluation data, mimicking the real-world use case.
A common way to describe a few-shot learning problem is the n-way k-shot notation.
For example, a task with 3 target classes and 5 training samples per class is a 3-way 5-shot classification problem.

A classical example of how such a model might work is a prototypical network.
These models learn a representation of each class and classify new examples based on proximity to these representations in an embedding space.

The first and simplest method in this bachelor thesis uses a plain ResNet to calculate those embeddings and is essentially a simple prototypical network.
See // todo link to this section
// todo proper source
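A minimal sketch of how such a prototypical baseline could look is shown below; the pretrained torchvision ResNet-18 backbone and all function names are illustrative assumptions, not the exact thesis implementation. Class prototypes are the mean support embeddings, and queries are assigned to the nearest prototype.

```python
import torch
from torchvision.models import resnet18, ResNet18_Weights

# pretrained ResNet-18 without its classification head as feature extractor
backbone = torch.nn.Sequential(
    *list(resnet18(weights=ResNet18_Weights.DEFAULT).children())[:-1]
)
backbone.eval()

@torch.no_grad()
def embed(images: torch.Tensor) -> torch.Tensor:
    """Map a batch of images (N, 3, 224, 224) to flat embeddings (N, 512)."""
    return backbone(images).flatten(1)

@torch.no_grad()
def classify(support: torch.Tensor, labels: torch.Tensor, query: torch.Tensor) -> torch.Tensor:
    """Assign each query image to the class of the nearest prototype."""
    emb = embed(support)
    # one prototype per class: the mean embedding of its support samples
    protos = torch.stack([emb[labels == c].mean(0) for c in labels.unique()])
    dists = torch.cdist(embed(query), protos)   # Euclidean distances (Q, C)
    return dists.argmin(dim=1)                  # index into labels.unique()
```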
=== Generalisation from few samples

=== PatchCore

// todo also show values how they perform on MVTec AD

=== EfficientAD
todo stuff @patchcorepaper
// https://arxiv.org/pdf/2106.08265
todo stuff @efficientADpaper
// https://arxiv.org/pdf/2303.14535

=== Jupyter Notebook

A Jupyter notebook is a shareable document which combines code and its output, text and visualizations.
The notebook along with the editor provides an environment for fast prototyping and data analysis.
It is widely used in the data science, mathematics and machine learning communities.

In the context of this practical work it can be used to test and evaluate the active learning loop before implementing it in a Dagster pipeline. @jupyter

=== CNN
Convolutional neural networks are especially suitable model architectures for processing images, speech and audio signals.
A CNN typically consists of convolutional layers, pooling layers and fully connected layers.
Convolutional layers are a set of learnable kernels (filters).
Each filter performs a convolution operation by sliding a window over the image; at each position the dot product between the filter and the underlying image patch produces one entry of a feature map.
Convolutional layers capture features like edges, textures or shapes.
Pooling layers downsample the feature maps created by the convolutional layers.
This reduces the computational complexity of the overall network and helps against overfitting.
Common pooling layers include average and max pooling.
Finally, after some convolutional layers the feature map is flattened and passed to a network of fully connected layers to perform a classification or regression task. @cnnintro
@cnnarchitecture shows a typical binary classification task.

#figure(
  image("rsc/cnn_architecture.png", width: 80%),
  caption: [Architecture of a convolutional neural network. @cnnarchitectureimg],
) <cnnarchitecture>

=== ResNet

Residual neural networks are a special type of neural network architecture.
They are especially good for deep learning and have been used in many state-of-the-art computer vision tasks.
The main idea behind ResNet is the skip connection, a direct connection from one layer to a later layer that bypasses the layers in between.
This helps to avoid the vanishing gradient problem and eases the training of very deep networks.
ResNet has proven to be very successful in many computer vision tasks and is used in this practical work for the classification task.
There are several different ResNet architectures, the most common being ResNet-18, ResNet-34, ResNet-50, ResNet-101 and ResNet-152. @resnet

Since the dataset is relatively small and the two-class classification task is relatively easy (for such a large model), the ResNet-18 architecture is used in this practical work.

=== CAML
Todo
=== P$>$M$>$F
Todo

=== Softmax

The softmax function @softmax @liang2017soft converts a vector of $K$ real numbers into a probability distribution.
It is a generalization of the sigmoid function and often used as an activation layer in neural networks.

$
sigma(bold(z))_j = e^(z_j) / (sum_(k=1)^K e^(z_k)) quad "for" j = 1, dots, K
$ <softmax>

The softmax function is closely related to the Boltzmann distribution and was first introduced in the 19th century @Boltzmann.
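For illustration, a minimal NumPy sketch of @softmax (not part of the thesis code); subtracting the maximum first exploits the shift-invariance of softmax and avoids numerical overflow:

```python
import numpy as np

def softmax(z: np.ndarray) -> np.ndarray:
    z = z - z.max()   # stabilise: softmax is invariant to constant shifts
    e = np.exp(z)
    return e / e.sum()

p = softmax(np.array([1.0, 2.0, 3.0]))
print(p)         # approximately [0.090 0.245 0.665]
print(p.sum())   # 1.0 -- a valid probability distribution
```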
=== Cross Entropy Loss
Cross entropy loss is a well-established loss function in machine learning.
Equation @crelformal @crossentropy shows the formal general definition of the cross entropy loss,
and equation @crelbinary is the special case for binary classification tasks.

$
H(p,q) = -sum_(x in cal(X)) p(x) log q(x)
$ <crelformal>

$
H(p,q) = -(p log(q) + (1-p) log(1-q))
$ <crelbinary>

$
cal(L)(p,q) = -1/cal(B) sum_(i=1)^(cal(B)) (p_i log(q_i) + (1-p_i) log(1-q_i))
$ <crelbinarybatch>

Equation @crelbinarybatch @handsonaiI is the binary cross entropy loss for a batch of size $cal(B)$ and is used for model training in this practical work.
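A minimal NumPy sketch of @crelbinarybatch, for illustration only; deep learning frameworks ship optimised implementations such as PyTorch's `torch.nn.functional.binary_cross_entropy`:

```python
import numpy as np

def binary_cross_entropy(p: np.ndarray, q: np.ndarray, eps: float = 1e-12) -> float:
    """Mean binary cross entropy over a batch.

    p: true labels in {0, 1}; q: predicted probabilities in (0, 1).
    """
    q = np.clip(q, eps, 1.0 - eps)  # avoid log(0)
    return float(-np.mean(p * np.log(q) + (1 - p) * np.log(1 - q)))

labels = np.array([1.0, 0.0, 1.0])
preds = np.array([0.9, 0.2, 0.6])
print(binary_cross_entropy(labels, preds))  # ~0.28
```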
=== Mathematical modeling of problem
diff --git a/typstalt/rsc/cnn_architecture.png b/typstalt/rsc/cnn_architecture.png
new file mode 100644
index 0000000..f5588ff
Binary files /dev/null and b/typstalt/rsc/cnn_architecture.png differ
diff --git a/typstalt/sources.bib b/typstalt/sources.bib
new file mode 100644
index 0000000..8999b63
--- /dev/null
+++ b/typstalt/sources.bib
@@ -0,0 +1,92 @@
%! Author = lukas
%! Date = 4/9/24

@article{crossentropy,
  ISSN = {00359246},
  URL = {http://www.jstor.org/stable/2984087},
  abstract = {This paper deals first with the relationship between the theory of probability and the theory of rational behaviour. A method is then suggested for encouraging people to make accurate probability estimates, a connection with the theory of information being mentioned. Finally Wald's theory of statistical decision functions is summarised and generalised and its relation to the theory of rational behaviour is discussed.},
  author = {I. J. Good},
  journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
  number = {1},
  pages = {107--114},
  publisher = {[Royal Statistical Society, Wiley]},
  title = {Rational Decisions},
  urldate = {2024-05-23},
  volume = {14},
  year = {1952}
}

@misc{efficientADpaper,
  title = {EfficientAD: Accurate Visual Anomaly Detection at Millisecond-Level Latencies},
  author = {Kilian Batzner and Lars Heckler and Rebecca König},
  year = {2024},
  eprint = {2303.14535},
  archivePrefix = {arXiv},
  primaryClass = {cs.CV},
  url = {https://arxiv.org/abs/2303.14535}
}

@misc{patchcorepaper,
  title = {Towards Total Recall in Industrial Anomaly Detection},
  author = {Karsten Roth and Latha Pemula and Joaquin Zepeda and Bernhard Schölkopf and Thomas Brox and Peter Gehler},
  year = {2022},
  eprint = {2106.08265},
  archivePrefix = {arXiv},
  primaryClass = {cs.CV},
  url = {https://arxiv.org/abs/2106.08265}
}

@misc{jupyter,
  title = {{Project Jupyter Documentation}},
  howpublished = {\url{https://docs.jupyter.org/en/latest/}},
  year = {2024},
  note = {[Online; accessed 13-May-2024]}
}

@misc{cnnintro,
  title = {An Introduction to Convolutional Neural Networks},
  author = {Keiron O'Shea and Ryan Nash},
  year = {2015},
  eprint = {1511.08458},
  archivePrefix = {arXiv},
  primaryClass = {cs.NE}
}

@misc{cnnarchitectureimg,
  title = {{What are convolutional neural networks?}},
  howpublished = {\url{https://cointelegraph.com/explained/what-are-convolutional-neural-networks}},
  year = {2024},
  note = {[Online; accessed 12-April-2024]}
}

@inproceedings{liang2017soft,
  title = {Soft-margin softmax for deep classification},
  author = {Liang, Xuezhi and Wang, Xiaobo and Lei, Zhen and Liao, Shengcai and Li, Stan Z},
  booktitle = {International Conference on Neural Information Processing},
  pages = {413--421},
  year = {2017},
  organization = {Springer}
}

@inbook{Boltzmann,
  place = {Cambridge},
  series = {Cambridge Library Collection - Physical Sciences},
  title = {Studien über das Gleichgewicht der lebendigen Kraft zwischen bewegten materiellen Punkten},
  booktitle = {Wissenschaftliche Abhandlungen},
  publisher = {Cambridge University Press},
  author = {Boltzmann, Ludwig},
  editor = {Hasenöhrl, Friedrich},
  year = {2012},
  pages = {49--96},
  collection = {Cambridge Library Collection - Physical Sciences}
}

@misc{resnet,
  title = {Deep Residual Learning for Image Recognition},
  author = {Kaiming He and Xiangyu Zhang and Shaoqing Ren and Jian Sun},
  year = {2015},
  eprint = {1512.03385},
  archivePrefix = {arXiv},
  primaryClass = {cs.CV}
}