diff --git a/presentation/main.tex b/presentation/main.tex index d88b9a4..c48eeb1 100644 --- a/presentation/main.tex +++ b/presentation/main.tex @@ -5,14 +5,19 @@ %---------------------------------------------------------------------------------------- \usetheme{focus} + \usepackage[utf8]{inputenc} \usepackage{booktabs} \usepackage{amsmath} +\usepackage{amssymb} +\usepackage{amsfonts} +\usepackage{bbm} \usepackage{hyperref} \usepackage{graphicx} \usepackage{listings} \usepackage{xcolor} +\usepackage{mathtools} % Farbdefinitionen \definecolor{backgroundcoloreq}{RGB}{180,140,0} @@ -21,6 +26,9 @@ \definecolor{codepurple}{rgb}{0.58,0,0.82} \definecolor{codeorange}{RGB}{190,100,0} +% we want to use the default calligraphic alphabet +\DeclareMathAlphabet{\mathcal}{OMS}{cmbrs}{m}{n} + \lstset{ language=C, @@ -75,20 +83,20 @@ %---------------------------------------------------------------------------------------- % todo pic of action - \section{The goal} + \section{The Goal} \begin{frame}{The goal} \begin{itemize} - \item train model - \item recognize action of person - \item from video [$\approx$10sec] - \item eg.: + \item Train model + \item Recognize action of person + \item From video [$\approx$10sec] + \item E.g.: \begin{itemize} \item brushing hair \item riding bike \item dancing \item playing violin \end{itemize} - \item as generic as possible + \item As generic as possible \end{itemize} \end{frame} @@ -173,6 +181,7 @@ % SECTION 2 %---------------------------------------------------------------------------------------- + \section{Cross-Model Pseudo-Labeling} \begin{frame}[allowframebreaks]{Papers approach} @@ -196,21 +205,138 @@ \begin{itemize} \item Cross-Model Pseudo-Labeling - \item Primary backbone + \item Primary backbone (large model) \item Supplemented by lightweight auxiliary network \begin{itemize} - \item Different structure - \item Fewer channels + \item Different structure + \item Fewer channels \end{itemize} \item Different representation of data complements 
primary backbone \end{itemize} \end{frame} - - \begin{frame}{Performance glance} - todo the pic of the performance graph + \begin{frame}{Structure Visualization} + \includegraphics[scale=.17]{rsc/structure} \end{frame} + \begin{frame}{Performance Perspectives} + \includegraphics[scale=.205]{rsc/performance_comparison} + \end{frame} + + + \section{Give me the math!} + + \begin{frame}{Definitions} + \begin{itemize} + \item Labeled data set of size $N_l$\\ + $\mathcal{V} = \{(v_1,y_1), \dots, (v_{N_l}, y_{N_l})\}$ + \item Unlabeled data set of size $N_u$\\ + $\mathcal{U} = \{u_1, \dots, u_{N_u}\}$ + \item In general $\lvert\mathcal{U}\rvert \gg \lvert\mathcal{V}\rvert$ + \end{itemize} + \end{frame} + + \begin{frame}[allowframebreaks]{How the existing method \textit{FixMatch} works} + \begin{itemize} + \item $\mathbbm{1} \coloneqq \text{Indicator Function}$ + \item $B_u \coloneqq \text{Batch size}$ + \item $\mathcal{T} \coloneqq \text{Confidence Threshold}$ + \item $F(\mathcal{T}_{\text{strong}}(u_i)) \coloneqq \text{Class distribution}$ + \item $p_i \coloneqq F(\mathcal{T}_{\text{weak}}(u_i))$ + \item $\hat{y}_i \coloneqq \arg \max(p_i) \coloneqq \text{Pseudo Label}$ + \item $\mathcal{H} \coloneqq \text{Cross-entropy loss}$ + \item $\mathcal{L}_u \coloneqq \text{Loss on the unlabeled data}$ + \item $F \coloneqq \text{Model}$ + \end{itemize} + \begin{align*} + \mathcal{L}_u = \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i) \geq \mathcal{T}) \mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i))) + \end{align*} + + \framebreak + + \begin{itemize} + \item $\mathbbm{1}(\max(p_i) \geq \mathcal{T})$ + \begin{itemize} + \item `confidence-based masking' + \item retain label only if largest probability is above threshold + \item keep only `high confidence' labels + \end{itemize} + \item $\mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i)))$ + \begin{itemize} + \item `consistency regularization' + \item cross-entropy loss between strongly augmented and weakly 
augmented data + \end{itemize} + \end{itemize} + + \end{frame} + + \begin{frame}[allowframebreaks]{CMPL (Cross-Model Pseudo-Labeling)} + \begin{itemize} + \item $F(\cdot) \coloneqq \text{Primary backbone}$ + \item $A(\cdot) \coloneqq \text{Auxiliary network}$ + \item Learning on labeled data + \begin{align*} + \mathcal{L}_s^F &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,F(\mathcal{T}^F_{\text{standard}}(v_i)))\\ + \mathcal{L}_s^A &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,A(\mathcal{T}^F_{\text{standard}}(v_i))) + \end{align*} + \item $\mathcal{T}^F_{\text{standard}}(v_i) \coloneqq \text{standard augmentations for action recognition}$ + \end{itemize} + + \framebreak + + \begin{itemize} + \item Learning on unlabeled data + \begin{align*} + \mathcal{L}_u^F &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^A) \geq \mathcal{T}) \mathcal{H}(\hat{y}_i^A,F(\mathcal{T}_{\text{strong}}(u_i)))\\ + \mathcal{L}_u^A &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^F) \geq \mathcal{T}) \mathcal{H}(\hat{y}_i^F,A(\mathcal{T}_{\text{strong}}(u_i))) + \end{align*} + \item Complete training objective + \begin{align*} + \mathcal{L} = (\mathcal{L}_s^F + \mathcal{L}_s^A) + \lambda(\mathcal{L}_u^F + \mathcal{L}_u^A) + \end{align*} + \item $\lambda \coloneqq \text{Balancing coefficient for unsupervised loss}$ + \end{itemize} + \end{frame} + + \section{Implementation} + + \begin{frame}{Networks} + \begin{itemize} + \item Auxiliary network + \begin{itemize} + \item sub-network of primary model + \item 3D-ResNet18 + \item \textbf{3D-ResNet50$\times$1/4} + \end{itemize} + \item Backbone network + \begin{itemize} + \item larger version of aux-net + \item \textbf{3D-ResNet50} + \end{itemize} + \end{itemize} + \end{frame} + + \begin{frame}{Dataset} + \begin{itemize} + \item Kinetics-400 + \begin{itemize} + \item 400 categories + \item 240k/20k training/validation samples + \end{itemize} + \item UCF-101 + \begin{itemize} + \item 101 classes + \item 9.5k/4k training/validation 
samples + \end{itemize} + \item $\approx$10sec per video + \item 1\% or 10\% labeled subsets, sampled in a balanced way from the distribution + \end{itemize} + \end{frame} + + + \begin{frame}{Performance Results} + \includegraphics[scale=.65]{rsc/results} + \end{frame} % --- THE END diff --git a/presentation/rsc/performance_comparison.jpg b/presentation/rsc/performance_comparison.jpg new file mode 100644 index 0000000..3432a89 Binary files /dev/null and b/presentation/rsc/performance_comparison.jpg differ diff --git a/presentation/rsc/results.jpg b/presentation/rsc/results.jpg new file mode 100644 index 0000000..778f306 Binary files /dev/null and b/presentation/rsc/results.jpg differ diff --git a/presentation/rsc/structure.jpg b/presentation/rsc/structure.jpg new file mode 100644 index 0000000..57eab05 Binary files /dev/null and b/presentation/rsc/structure.jpg differ diff --git a/presentation/sources.bib b/presentation/sources.bib index 126ae6f..0791896 100644 --- a/presentation/sources.bib +++ b/presentation/sources.bib @@ -1,18 +1,16 @@ -@misc{structtutorialspoint, - Title = {struct basics}, - howpublished = {\url{https://www.tutorialspoint.com/cprogramming/c_structures.htm}}, - note = {Aufgerufen: 2020-04} +@InProceedings{Xu_2022_CVPR, + author = {Xu, Yinghao and Wei, Fangyun and Sun, Xiao and Yang, Ceyuan and Shen, Yujun and Dai, Bo and Zhou, Bolei and Lin, Stephen}, + title = {Cross-Model Pseudo-Labeling for Semi-Supervised Action Recognition}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2022}, + pages = {2959--2968} } -@misc{structchowto, - Title = {struct basics}, - howpublished = {\url{http://www.c-howto.de/tutorial/strukturierte-datentypen/strukturen/}}, - note = {Aufgerufen: 2020-04} +@online{knuthwebsite, + author = "Kihyuk Sohn and David Berthelot and Chun-Liang Li and others", + title = "FixMatch: Simplifying Semi-Supervised Learning with Consistency and Confidence", + url = 
"https://arxiv.org/abs/2001.07685", + addendum = "(accessed: 20.03.2023)", + keywords = "FixMatch, semi-supervised" } - -@misc{pointertutorialspoint, - Title = {Pointer basics}, - howpublished = {\url{https://www.tutorialspoint.com/cprogramming/c_pointers.htm}}, - note = {Aufgerufen: 2020-04} -} -