\title{Cross-Model Pseudo-Labeling}
\subtitle{for Semi-Supervised Action Recognition}
\author{Lukas Heiligenbrunner}
\section{The Goal}
\begin{frame}{The goal}
\item Train model
\item Recognize action of person
\item From video [$\approx$10sec]
\item E.g.:
\item brushing hair
\item riding bike
\item dancing
\item playing violin
\item As generic as possible
\section{The Problem} % Section title slide, unnumbered
\begin{frame}{Missing Labels}
\item Supervised action recognition
\item lots of labeled samples necessary
\item videos
\item Labeling Samples very expensive
\item Avoid!
\item Tremendous amount of unlabeled data
\item YouTube
\item Using semi-supervised learning might be beneficial
\begin{frame}{What's semi-supervised learning all about?}
\item Supervised learning
\item Data samples
\item Target labels
\item Each sample is associated with a target label
\item Unsupervised learning
\item Data samples
\item target is to find patterns in data
\item without supervision
\item Semi-Supervised learning
\item combination of both
\item have labeled \& unlabeled data
\item labeled data guides learning process
\item unlabeled data helps to gain additional information
\item goal is performance improvement
\begin{frame}[allowframebreaks]{What's already been done}
\item Pseudo-labeling
\item Train model on labeled data
\item E.g.\ 1\%/10\% of data labeled
\item Confidence of prediction [Threshold]
\item Use pseudo-labels to predict unlabeled data
\item quantity and quality of pseudo-labels
\item significant impact on main model accuracy!
\item we want to improve pseudo-label framework as much as possible
\section{Cross-Model Pseudo-Labeling}
\begin{frame}[allowframebreaks]{The paper's approach}
\item Based on complementary representations of models
\item Models of different size
\item Different structural bias $\rightarrow$ different category-wise performance
\item Small model
\item lower capacity
\item better captures temporal dynamics in recognizing actions
\item scene changes/motion over time
\item Large model
\item better learns spatial semantics
\item to distinguish different action instances
\item localize/identify objects in specific scene
\item Cross-Model Pseudo-Labeling
\item Primary backbone (large model)
\item Supplemented by lightweight auxiliary network
\item Different structure
\item Fewer channels (smaller)
\item Different representation of data complements primary backbone
\begin{frame}{Structure Visualization}
\begin{frame}{Performance Perspectives}
\item 1\% labeled data + 400 Labels
\item Kinetics-400 dataset
\section{Give me the math!}
\item Labeled data set of size $N_l$\\
$\mathcal{V} = \{(v_1,y_1), \dots, (v_{N_l}, y_{N_l})\}$
\item Unlabeled data set of size $N_u$\\
$\mathcal{U} = \{u_1, \dots, u_{N_u}\}$
\item in general $\lvert\mathcal{U}\rvert \gg \lvert\mathcal{V}\rvert$
\begin{frame}[allowframebreaks]{How existing method \textit{FixMatch} works}
\item $B_u \coloneqq \text{Batchsize}$
\item $\tau \coloneqq \text{Confidence Threshold (Hyperparameter)}$
\item $F(\mathcal{T}_{\text{strong}}(u_i)) \coloneqq \text{Class distribution}$
\item $p_i \coloneqq F(\mathcal{T}_{\text{weak}}(u_i))$
\item $\hat{y}_i \coloneqq \arg \max(p_i) \coloneqq \text{Pseudo Label}$
\item $\mathcal{H} \coloneqq \text{Cross-entropy loss}$
\item $\mathcal{L}_u \coloneqq \text{Loss on the unlabeled data}$
\item $F \coloneqq \text{Model}$
\item $\mathbbm{1} \coloneqq \text{Indicator Function}$
\[
\mathcal{L}_u = \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i) \geq \tau) \mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i)))
\]
\item $\mathbbm{1}(\max(p_i) \geq \tau)$
\item `confidence-based masking'
\item retain label only if largest probability is above threshold
\item keep only `high confidence' labels
\item $\mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i)))$
\item `consistency regularization'
\item cross-entropy loss between strongly augmented and weakly augmented data
\begin{frame}[allowframebreaks]{CMPL (Cross-Model Pseudo-Labeling)}
\item $F(\cdot) \coloneqq \text{Primary backbone}$
\item $A(\cdot) \coloneqq \text{Auxiliary network}$
\item Learning on labeled data
\begin{align*}
\mathcal{L}_s^F &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,F(\mathcal{T}^F_{\text{standard}}(v_i)))\\
\mathcal{L}_s^A &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,A(\mathcal{T}^F_{\text{standard}}(v_i)))
\end{align*}
\item $\mathcal{T}^F_{\text{standard}}(v_i) \coloneqq \text{standard augmentations for action recognition}$
\item Learning on unlabeled data
\begin{align*}
\mathcal{L}_u^F &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^A) \geq \tau) \mathcal{H}(\hat{y}_i^A,F(\mathcal{T}_{\text{strong}}(u_i)))\\
\mathcal{L}_u^A &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^F) \geq \tau) \mathcal{H}(\hat{y}_i^F,A(\mathcal{T}_{\text{strong}}(u_i)))
\end{align*}
\item Complete training objective
\[
\mathcal{L} = (\mathcal{L}_s^F + \mathcal{L}_s^A) + \lambda(\mathcal{L}_u^F + \mathcal{L}_u^A)
\]
\item $\lambda \coloneqq \text{Balancing coefficient for unsupervised loss}$
\item Auxiliary Network
\item sub-network of primary model
\item 3D-ResNet18
\item \textbf{3D-ResNet50x1/4}
\item Backbone network
\item larger version of aux-net
\item \textbf{3D-ResNet50}
\item Kinetics-400
\item 400 categories
\item 240k/20k training/validation samples
\item UCF-101
\item 101 classes
\item 9.5k/4k training/validation samples
\item $\approx$10sec every video
\item 1\% or 10\% labeled subsets balanced sampled from distribution
\begin{frame}{Performance Results}
Thanks for your attention!
