\documentclass[usenames,dvipsnames]{beamer}
%----------------------------------------------------------------------------------------
% Structures and Pointers talk
% 20.04.2020
% NOTE(review): this header looks carried over from an earlier deck; the title
% set below is "Cross-Model Pseudo-Labeling" -- confirm and update.
%----------------------------------------------------------------------------------------

\usetheme[nofirafonts]{focus}

% Encoding, tables, math, links, graphics
\usepackage[utf8]{inputenc}
\usepackage{booktabs}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{bbm}
\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{xcolor}
\usepackage{mathtools}

% Fonts: the theme is loaded with `nofirafonts`, so Fira is loaded here by hand.
% \PassOptionsToPackage is kept so the option is handed over before the load.
\usepackage[T1]{fontenc}
\PassOptionsToPackage{sfdefault}{FiraSans}
\usepackage{FiraSans}
\usepackage{FiraMono}

% Color definitions
\definecolor{backgroundcoloreq}{RGB}{180,140,0}
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{codeorange}{RGB}{190,100,0}

% Select the calligraphic alphabet used by \mathcal
\DeclareMathAlphabet{\mathcal}{OMS}{cmbrs}{m}{n}

%----------------------------------------------------------------------------------------
% TITLE SLIDE
%----------------------------------------------------------------------------------------

\title{Cross-Model Pseudo-Labeling}
\subtitle{for Semi-Supervised Action Recognition}
\author{Lukas Heiligenbrunner}
\date{\today}

%------------------------------------------------

\begin{document}

%------------------------------------------------

\begin{frame}
    \maketitle
\end{frame}

%----------------------------------------------------------------------------------------
% SECTION 1
%----------------------------------------------------------------------------------------

% todo pic of action
\section{The Goal}

\begin{frame}{The goal}
    \begin{itemize}
        \item Train model
        \item Recognize action of person
        \item From video [$\approx$10sec]
        \item E.g.:
        \begin{itemize}
            \item brushing hair
            \item riding bike
            \item dancing
            \item playing violin
        \end{itemize}
        \item As generic as possible
    \end{itemize}
\end{frame}
%----------------------------------------------------------------------------------------
% SECTION 2
%----------------------------------------------------------------------------------------

\section{The Problem} % Section title slide, unnumbered

%------------------------------------------------

\begin{frame}{Missing Labels}
    \begin{itemize}
        \item Supervised action recognition
        \begin{itemize}
            \item lots of labeled samples necessary
            \item videos
        \end{itemize}
        \item Labeling samples very expensive
        \begin{itemize}
            \item Avoid!
        \end{itemize}
        \item Tremendous amount of unlabeled data
        \begin{itemize}
            \item YouTube
        \end{itemize}
        \item Using semi-supervised learning might be beneficial
    \end{itemize}
\end{frame}

%------------------------------------------------

\begin{frame}{What's semi-supervised learning all about?}
    \begin{itemize}
        \item Supervised learning
        \begin{itemize}
            \item Data samples
            \item Target labels
            \item Each sample is associated to target label
        \end{itemize}
        \item Unsupervised learning
        \begin{itemize}
            \item Data samples
            \item target is to find patterns in data
            \item without supervision
        \end{itemize}
        \item Semi-Supervised learning
        \begin{itemize}
            \item combination of both
            \item have labeled \& unlabeled data
            \item labeled data guides learning process
            \item unlabeled helps to gain additional information
            \item goal is performance improvement
        \end{itemize}
    \end{itemize}
\end{frame}

%------------------------------------------------

\begin{frame}[allowframebreaks]{What's already been done}
    \begin{itemize}
        \item Pseudo-labeling
        \item Train model on labeled data
        \begin{itemize}
            \item E.g.\ 1\%/10\% of data labeled
        \end{itemize}
        \item Confidence of prediction [Threshold]
        \item Use pseudo-labels to predict unlabeled data
    \end{itemize}

    \framebreak

    \begin{itemize}
        \item quantity and quality of pseudo-labels
        \item significant impact on main model accuracy!
        \item we want to improve pseudo-label framework as much as possible
    \end{itemize}
\end{frame}

%----------------------------------------------------------------------------------------
% SECTION 3
%----------------------------------------------------------------------------------------

\section{Cross-Model Pseudo-Labeling}

\begin{frame}[allowframebreaks]{Paper's approach}
    \begin{itemize}
        \item Based on complementary representations of model
        \item Models of different size
        \item Different structural bias $\rightarrow$ different category-wise performance
        \item Small model
        \begin{itemize}
            \item lower capacity
            \item better captures temporal dynamics in recognizing actions
            \item scene changes/motion over time
        \end{itemize}
        \item Large model
        \begin{itemize}
            \item better learns spatial semantics
            \item to distinguish different action instances
            \item localize/identify objects in specific scene
        \end{itemize}
    \end{itemize}

    \framebreak

    \begin{itemize}
        \item Cross-Model Pseudo-Labeling
        \item Primary backbone (large model)
        \item Supplemented by lightweight auxiliary network
        \begin{itemize}
            \item Different structure
            \item Fewer channels (smaller)
        \end{itemize}
        \item Different representation of data complements primary backbone
    \end{itemize}
\end{frame}

\begin{frame}{Structure Visualization}
    \includegraphics[scale=.17]{rsc/structure}
\end{frame}

\begin{frame}{Performance Perspectives}
    \begin{itemize}
        \item 1\% labeled data + 400 Labels
        \item Kinetics-400 dataset
    \end{itemize}
    \includegraphics[scale=.205]{rsc/performance_comparison}
\end{frame}

%----------------------------------------------------------------------------------------
% SECTION 4
%----------------------------------------------------------------------------------------

\section{Give me the math!}

\begin{frame}{Definitions}
    \begin{itemize}
        \item Labeled data set of size $N_l$\\
        $\mathcal{V} = \{(v_1,y_1), \dots, (v_{N_l}, y_{N_l})\}$
        \item Unlabeled data set of size $N_u$\\
        $\mathcal{U} = \{u_1, \dots, u_{N_u}\}$
        \item in general $\lvert\mathcal{U}\rvert \gg \lvert\mathcal{V}\rvert$
    \end{itemize}
\end{frame}

\begin{frame}[allowframebreaks]{How existing method \emph{FixMatch} works}
    % Symbol legend for the unlabeled-data loss shown below.
    \begin{itemize}
        \item $B_u \coloneqq \text{Batchsize}$
        \item $\tau \coloneqq \text{Confidence Threshold (Hyperparameter)}$
        \item $F(\mathcal{T}_{\text{strong}}(u_i)) \coloneqq \text{Class distribution}$
        \item $p_i \coloneqq F(\mathcal{T}_{\text{weak}}(u_i))$
        \item $\hat{y}_i \coloneqq \arg\max(p_i) \quad \text{(Pseudo Label)}$
        \item $\mathcal{H} \coloneqq \text{Cross-entropy loss}$
        \item $\mathcal{L}_u \coloneqq \text{Loss on the unlabeled data}$
        \item $F \coloneqq \text{Model}$
        \item $\mathbbm{1} \coloneqq \text{Indicator Function}$
    \end{itemize}
    \begin{align*}
        \mathcal{L}_u = \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i) \geq \tau) \mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i)))
    \end{align*}

    \framebreak

    \begin{itemize}
        \item $\mathbbm{1}(\max(p_i) \geq \tau)$
        \begin{itemize}
            \item `confidence-based masking'
            \item retain label only if largest probability is above threshold
            \item keep only `high confidence' labels
        \end{itemize}
        \item $\mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i)))$
        \begin{itemize}
            \item `consistency regularization'
            \item cross-entropy loss of strong augmented and weak augmented data
        \end{itemize}
    \end{itemize}
\end{frame}

\begin{frame}[allowframebreaks]{CMPL (Cross-Model Pseudo-Labeling)}
    \begin{itemize}
        \item $F(\cdot) \coloneqq \text{Primary backbone}$
        \item $A(\cdot) \coloneqq \text{Auxiliary network}$
        \item Learning on labeled data
        % NOTE(review): the auxiliary loss reuses \mathcal{T}^F_{standard};
        % confirm against the paper whether A has its own augmentation (\mathcal{T}^A).
        \begin{align*}
            \mathcal{L}_s^F &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,F(\mathcal{T}^F_{\text{standard}}(v_i)))\\
            \mathcal{L}_s^A &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,A(\mathcal{T}^F_{\text{standard}}(v_i)))
        \end{align*}
        \item $\mathcal{T}^F_{\text{standard}}(v_i) \coloneqq \text{standard augmentations for action recognition}$
    \end{itemize}

    \framebreak

    \begin{itemize}
        \item Learning on unlabeled data
        % Each network is trained on the pseudo-labels produced by the other one.
        \begin{align*}
            \mathcal{L}_u^F &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^A) \geq \tau) \mathcal{H}(\hat{y}_i^A,F(\mathcal{T}_{\text{strong}}(u_i)))\\
            \mathcal{L}_u^A &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^F) \geq \tau) \mathcal{H}(\hat{y}_i^F,A(\mathcal{T}_{\text{strong}}(u_i)))
        \end{align*}
        \item Complete training objective
        \begin{align*}
            \mathcal{L} = (\mathcal{L}_s^F + \mathcal{L}_s^A) + \lambda(\mathcal{L}_u^F + \mathcal{L}_u^A)
        \end{align*}
        \item $\lambda \coloneqq \text{Balancing coefficient for unsupervised loss}$
    \end{itemize}
\end{frame}

%----------------------------------------------------------------------------------------
% SECTION 5
%----------------------------------------------------------------------------------------

\section{Implementation}

\begin{frame}{Networks}
    \begin{itemize}
        \item Auxiliary Network
        \begin{itemize}
            \item sub-network of primary model
            \item 3D-ResNet18
            \item \textbf{3D-ResNet50x1/4}
        \end{itemize}
        \item Backbone network
        \begin{itemize}
            \item larger version of aux-net
            \item \textbf{3D-ResNet50}
        \end{itemize}
    \end{itemize}
\end{frame}

\begin{frame}{Dataset}
    \begin{itemize}
        \item Kinetics-400
        \begin{itemize}
            \item 400 categories
            \item 240k/20k training/validation samples
        \end{itemize}
        \item UCF-101
        \begin{itemize}
            \item 101 classes
            \item 9.5k/4k training/validation samples
        \end{itemize}
        \item $\approx$10sec every video
        \item 1\% or 10\% labeled subsets balanced sampled from distribution
    \end{itemize}
\end{frame}

\begin{frame}{Performance Results}
    \includegraphics[scale=.65]{rsc/results}
\end{frame}

% --- THE END

\begin{frame}[focus]
    Thanks for your Attention!
\end{frame}

%----------------------------------------------------------------------------------------
% CLOSING/SUPPLEMENTARY SLIDES
%----------------------------------------------------------------------------------------

\appendix

\begin{frame}{Sources}
    \nocite{*} % Display all references regardless of if they were cited
    \bibliography{sources}
    \bibliographystyle{plain}
\end{frame}

\end{document}