From 32b98841b390853a78cf6aa98733f7f4037e2a3e Mon Sep 17 00:00:00 2001
From: lukas-heilgenbrunner
Date: Wed, 5 Jun 2024 13:42:26 +0200
Subject: [PATCH] add some citations to crossentropy

---
 src/materialandmethods.tex | 19 +++++++++----------
 src/sources.bib            |  8 ++++++++
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/src/materialandmethods.tex b/src/materialandmethods.tex
index c63556b..875beca 100644
--- a/src/materialandmethods.tex
+++ b/src/materialandmethods.tex
@@ -115,6 +115,7 @@ When using the accuracy as the performance metric it doesn't reveal much about t
 There might be many true-positives and rarely any true-negatives and the accuracy is still good.
 The ROC curve helps with this problem and visualizes the true-positives and false-positives on a line plot.
 The more the curve ascents the upper-left or bottom-right corner the better the classifier gets.
+Figure~\ref{fig:roc-example} shows an example of a ROC curve with differently performing classifiers.
 
 \begin{figure}
     \centering
@@ -160,7 +161,7 @@ Figure~\ref{fig:cnn-architecture} shows a typical binary classification task.
 
 \subsubsection{Softmax}
-The Softmax function~\ref{eq:softmax}\cite{liang2017soft} converts $n$ numbers of a vector into a probability distribution.
+The Softmax function~\eqref{eq:softmax}\cite{liang2017soft} converts a vector of $n$ real numbers into a probability distribution.
 Its a generalization of the Sigmoid function and often used as an Activation Layer in neural networks.
 \begin{equation}\label{eq:softmax}
     \sigma(\mathbf{z})_j = \frac{e^{z_j}}{\sum_{k=1}^K e^{z_k}} \; for j\coloneqq\{1,\dots,K\}
 \end{equation}
@@ -171,8 +172,8 @@ The softmax function has high similarities with the Boltzmann distribution and w
 
 \subsubsection{Cross Entropy Loss}
 Cross Entropy Loss is a well established loss function in machine learning.
-\eqref{eq:crelformal} shows the formal general definition of the Cross Entropy Loss.
-And~\eqref{eq:crelbinary} is the special case of the general Cross Entropy Loss for binary classification tasks.
+Equation~\eqref{eq:crelformal}\cite{crossentropy} shows the general definition of the Cross Entropy Loss.
+Equation~\eqref{eq:crelbinary} is the special case of the general Cross Entropy Loss for binary classification tasks.
 
 \begin{align}
     H(p,q) &= -\sum_{x\in\mathcal{X}} p(x)\, \log q(x)\label{eq:crelformal}\\
@@ -180,7 +181,7 @@ And~\eqref{eq:crelbinary} is the special case of the general Cross Entropy Loss
     \mathcal{L}(p,q) &= - \frac1N \sum_{i=1}^{\mathcal{B}} (p_i \log q_i + (1-p_i) \log(1-q_i))\label{eq:crelbinarybatch}
 \end{align}
 
-$\mathcal{L}(p,q)$~\eqref{eq:crelbinarybatch} is the Binary Cross Entropy Loss for a batch of size $\mathcal{B}$ and used for model training in this PW.\cite{crossentropy}
+Equation~\eqref{eq:crelbinarybatch}\cite{handsonaiI} defines the Binary Cross Entropy Loss $\mathcal{L}(p,q)$ for a batch of size $\mathcal{B}$ and is used for model training in this Practical Work.
 
 \subsubsection{Mathematical modeling of problem}\label{subsubsec:mathematicalmodeling}
 
@@ -188,7 +189,7 @@ Here the task is modeled as a mathematical problem to get a better understanding
 The model is defined as $g(\pmb{x};\pmb{w})$ where $\pmb{w}$ are the model weights and $\mathcal{X}$ the input samples.\cite{suptechniques}
 We define two hyperparameters, the batch size $\mathcal{B}$ and the sample size $\mathcal{S}$ where $\mathcal{B} < \mathcal{S}$.
 
-In every active learning loop iteration we sample $\mathcal{S}$ random samples~\eqref{eq:batchdef}\cite{suptechniques} from our total unlabeled sample set $\mathcal{X}_U \subset \mathcal{X}$.
+In every active learning loop iteration we sample $\mathcal{S}$ random samples as in equation~\eqref{eq:batchdef}\cite{suptechniques} from our total unlabeled sample set $\mathcal{X}_U \subset \mathcal{X}$.
 
 \begin{equation}
 \label{eq:batchdef}
@@ -203,20 +204,18 @@ z = g(\pmb{x};\pmb{w})
 \end{equation}
 
 Those predictions might have any numerical value and have to be squeezed into a proper distribution which sums up to 1.
-The Softmax function has exactly this effect: $\sum^\mathcal{S}_{i=1}\sigma(z)_i=1$.
+The Softmax function has exactly this effect: $\sum^\mathcal{S}_{i=1}\sigma(z)_i=1$.\cite{handsonaiI}
 Since we have a two class problem the Softmax results in two result values, the two probabilities of how certain one class is a match.
 We want to calculate the distance to the class center and the more far away a prediction is from the center the more certain it is.
 Vice versa, the more centered the predictions are the more uncertain the prediction is.
 Labels $0$ and $1$ result in a class center of $\frac{0+1}{2}=\frac{1}{2}$.
-That means taking the absolute value of the prediction minus the class center results in the certainty of the sample~\eqref{eq:certainty}.
+That means taking the absolute value of the prediction minus the class center results in the certainty of the sample~\eqref{eq:certainty}.\cite{activelearning}
 
 \begin{align}
 \label{eq:certainty}
     S(z) = | 0.5 - \sigma(\mathbf{z})_0| \; \textit{or} \; S(z) = \max \sigma(\mathbf{z}) - 0.5
 \end{align}
-\cite{activelearning}
-
 With the help of this metric the pseudo predictions can be sorted by the score $S(z)$.
 
 We define $\text{min}_n(S)$ and $\text{max}_n(S)$ respectively in equation~\ref{eq:minnot} and equation~\ref{eq:maxnot} to define a short form of taking a subsection of the minimum or maximum of a set.
@@ -230,7 +229,7 @@ We define $\text{min}_n(S)$ and $\text{max}_n(S)$ respectively in equation~\ref{
 
 This notation helps to define which subsets of samples to give the user for labeling.
 There are different ways how this subset can be chosen.
-In this PW we do the obvious experiments with High-Certainty first in paragraph~\ref{par:low-certainty-first}, Low-Certainty\cite{certainty-based-al} first in paragraph~\ref{par:high-certainty-first}.
+In this Practical Work we run the obvious experiments, High-Certainty first in paragraph~\ref{par:high-certainty-first} and Low-Certainty first~\cite{certainty-based-al} in paragraph~\ref{par:low-certainty-first}.
 Furthermore, the two mixtures between them, half-high and half-low certain and only the middle section of the sorted certainty scores.
 
 \paragraph{Low certainty first}\label{par:low-certainty-first}
diff --git a/src/sources.bib b/src/sources.bib
index e05a407..cbbda88 100644
--- a/src/sources.bib
+++ b/src/sources.bib
@@ -134,6 +134,14 @@ doi = {10.1007/978-0-387-85820-3_23}
   publisher={Johannes Kepler Universität Linz}
 }
 
+@misc{handsonaiI,
+  author = {Andreas Schörgenhumer and Bernhard Schäfl and Michael Widrich},
+  title = {Lecture notes in Hands On AI I, Unit 4 \& 5},
+  month = {October},
+  year = {2021},
+  publisher = {Johannes Kepler Universität Linz}
+}
+
 @online{ROCWikipedia,
   author = "Wikimedia Commons",
   title = "Receiver operating characteristic",
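
For a concrete reading of the Cross Entropy Loss hunk, the following is a minimal NumPy sketch (not taken from the patched repository) of the Softmax in eq:softmax and the batch Binary Cross Entropy in eq:crelbinarybatch, under the assumption that the 1/N prefactor is meant as an average over the batch of size B; the names softmax and binary_cross_entropy are placeholders.

import numpy as np

def softmax(z):
    # eq:softmax - exponentiate and normalize so each row sums to 1
    e = np.exp(z - z.max(axis=-1, keepdims=True))  # shift for numerical stability
    return e / e.sum(axis=-1, keepdims=True)

def binary_cross_entropy(p, q, eps=1e-12):
    # eq:crelbinarybatch - batch-averaged binary cross entropy
    # p: true labels in {0, 1}, q: predicted probabilities for class 1
    q = np.clip(q, eps, 1.0 - eps)  # avoid log(0)
    return -np.mean(p * np.log(q) + (1.0 - p) * np.log(1.0 - q))

# toy batch of three two-class predictions
logits = np.array([[2.0, -1.0], [0.1, 0.2], [-3.0, 1.5]])
probs = softmax(logits)                      # rows of probs sum to 1
labels = np.array([0.0, 1.0, 1.0])
print(binary_cross_entropy(labels, probs[:, 1]))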
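
The certainty-based selection from the mathematical modeling hunks can be sketched the same way, again as an illustrative assumption rather than the project's actual code: pseudo-predictions are turned into probabilities with the Softmax, scored with S(z) = |0.5 - sigma(z)_0| as in eq:certainty, and the n lowest- or highest-scoring samples are taken, mirroring the min_n / max_n notation; certainty_scores and min_max_n are hypothetical names.

import numpy as np

def certainty_scores(logits):
    # eq:certainty - distance of the softmax output from the class center 0.5
    e = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probs = e / e.sum(axis=-1, keepdims=True)
    return np.abs(0.5 - probs[:, 0])          # equivalently: probs.max(axis=-1) - 0.5

def min_max_n(scores, n):
    # min_n / max_n: indices of the n lowest- and n highest-certainty samples
    order = np.argsort(scores)                # ascending, most uncertain first
    return order[:n], order[-n:]

# toy pseudo-predictions for S = 5 unlabeled samples, select n = 2 for labeling
pseudo_logits = np.array([[0.1, 0.0], [2.5, -2.0], [0.3, 0.2], [-1.0, 4.0], [0.0, 0.05]])
scores = certainty_scores(pseudo_logits)
low_n, high_n = min_max_n(scores, n=2)
print(scores, low_n, high_n)

Sorting the scores in ascending order puts the most uncertain samples first, which corresponds to the Low-Certainty-first experiment; taking the other end of the ordering gives High-Certainty first.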