Seminar_in_AI/presentation/main.tex

349 lines
12 KiB
TeX

\documentclass[usenames,dvipsnames]{beamer}
%----------------------------------------------------------------------------------------
% Struktur und Pointer Referat
% 20.04.2020
%----------------------------------------------------------------------------------------
\usetheme[nofirafonts]{focus}
\usepackage[utf8]{inputenc}
\usepackage{booktabs}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{bbm}
\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{xcolor}
\usepackage{mathtools}
\RequirePackage[T1]{fontenc}
\PassOptionsToPackage{sfdefault}{FiraSans}
\RequirePackage{FiraSans}
\RequirePackage{FiraMono}
% Color definitions
\definecolor{backgroundcoloreq}{RGB}{180,140,0}
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{codeorange}{RGB}{190,100,0}
% we want to use the default calligraphic alphabet
\DeclareMathAlphabet{\mathcal}{OMS}{cmbrs}{m}{n}
%----------------------------------------------------------------------------------------
% TITLE SLIDE
%----------------------------------------------------------------------------------------
\title{Cross-Model Pseudo-Labeling}
\subtitle{for Semi-Supervised Action Recognition}
\author{Lukas Heiligenbrunner}
\date{\today}
%------------------------------------------------
\begin{document}
%------------------------------------------------
\begin{frame}
\maketitle
\end{frame}
%----------------------------------------------------------------------------------------
% SECTION 1
%----------------------------------------------------------------------------------------
% todo pic of action
\section{The Goal}
\begin{frame}{The goal}
\begin{itemize}
\item Train model
\item Recognize action of person
\item From video [$\approx$10sec]
\item E.g.:
\begin{itemize}
\item brushing hair
\item riding bike
\item dancing
\item playing violin
\end{itemize}
\item As generic as possible
\end{itemize}
\end{frame}
%----------------------------------------------------------------------------------------
% SECTION 2
%----------------------------------------------------------------------------------------
\section{The Problem} % Section title slide, unnumbered
%------------------------------------------------
\begin{frame}{Missing Labels}
\begin{itemize}
\item Supervised action recognition
\begin{itemize}
\item lots of labeled samples necessary
\item videos
\end{itemize}
\item Labeling Samples very expensive
\begin{itemize}
\item Avoid!
\end{itemize}
\item Tremendous amount of unlabeled data
\begin{itemize}
\item YouTube
\end{itemize}
\item Using semi-supervised learning might be beneficial
\end{itemize}
\end{frame}
%------------------------------------------------
\begin{frame}{What is semi-supervised learning all about?}
\begin{itemize}
\item Supervised learning
\begin{itemize}
\item Data samples
\item Target labels
\item Each sample is associated with a target label
\end{itemize}
\item Unsupervised learning
\begin{itemize}
\item Data samples
\item target is to find patterns in data
\item without supervision
\end{itemize}
\item Semi-Supervised learning
\begin{itemize}
\item combination of both
\item have labeled \& unlabeled data
\item labeled data guides learning process
\item unlabeled data helps to gain additional information
\item goal is performance improvement
\end{itemize}
\end{itemize}
\end{frame}
%------------------------------------------------
\begin{frame}[allowframebreaks]{What's already been done}
\begin{itemize}
\item Pseudo-labeling
\item Train model on labeled data
\begin{itemize}
\item E.g.\ 1\%/10\% of data labeled
\end{itemize}
\item Confidence of prediction [Threshold]
\item Use confident predictions as pseudo-labels for unlabeled data
\end{itemize}
\framebreak
\begin{itemize}
\item quantity and quality of pseudo-labels
\item significant impact on main model accuracy!
\item we want to improve pseudo-label framework as much as possible
\end{itemize}
\end{frame}
%----------------------------------------------------------------------------------------
% SECTION 3
%----------------------------------------------------------------------------------------
\section{Cross-Model Pseudo-Labeling}
\begin{frame}[allowframebreaks]{The paper's approach}
\begin{itemize}
\item Based on complementary representations of models
\item Models of different size
\item Different structural bias $\rightarrow$ different category-wise performance
\item Small model
\begin{itemize}
\item lower capacity
\item better captures temporal dynamics in recognizing actions
\item scene changes/motion over time
\end{itemize}
\item Large model
\begin{itemize}
\item better learns spatial semantics
\item to distinguish different action instances
\item localize/identify objects in specific scene
\end{itemize}
\end{itemize}
\framebreak
\begin{itemize}
\item Cross-Model Pseudo-Labeling
\item Primary backbone (large model)
\item Supplemented by lightweight auxiliary network
\begin{itemize}
\item Different structure
\item Fewer channels (smaller)
\end{itemize}
\item Different representation of data complements primary backbone
\end{itemize}
\end{frame}
\begin{frame}{Structure Visualization}
\includegraphics[scale=.17]{rsc/structure}
\end{frame}
\begin{frame}{Performance Perspectives}
\begin{itemize}
\item 1\% labeled data + 400 Labels
\item Kinetics-400 dataset
\end{itemize}
\includegraphics[scale=.205]{rsc/performance_comparison}
\end{frame}
\section{Give me the math!}
\begin{frame}{Definitions}
\begin{itemize}
\item Labeled data set of size $N_l$\\
$\mathcal{V} = \{(v_1,y_1), \dots, (v_{N_l}, y_{N_l})\}$
\item Unlabeled data set of size $N_u$\\
$\mathcal{U} = \{u_1, \dots, u_{N_u}\}$
\item in general $\lvert\mathcal{U}\rvert \gg \lvert\mathcal{V}\rvert$
\end{itemize}
\end{frame}
\begin{frame}[allowframebreaks]{How existing method \textit{FixMatch} works}
\begin{itemize}
\item $B_u \coloneqq \text{Batchsize}$
\item $\tau \coloneqq \text{Confidence Threshold (Hyperparameter)}$
\item $F(\mathcal{T}_{\text{strong}}(u_i)) \coloneqq \text{Class distribution}$
\item $p_i \coloneqq F(\mathcal{T}_{\text{weak}}(u_i))$
\item $\hat{y}_i \coloneqq \arg \max(p_i) \coloneqq \text{Pseudo Label}$
\item $\mathcal{H} \coloneqq \text{Cross-entropy loss}$
\item $\mathcal{L}_u \coloneqq \text{Loss on the unlabeled data}$
\item $F \coloneqq \text{Model}$
\item $\mathbbm{1} \coloneqq \text{Indicator Function}$
\end{itemize}
\begin{align*}
\mathcal{L}_u = \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i) \geq \tau) \mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i)))
\end{align*}
\framebreak
\begin{itemize}
\item $\mathbbm{1}(\max(p_i) \geq \tau)$
\begin{itemize}
\item `confidence-based masking'
\item retain label only if largest probability is above threshold
\item keep only `high confidence' labels
\end{itemize}
\item $\mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i)))$
\begin{itemize}
\item `consistency regularization'
\item cross-entropy between pseudo-label (weak augmentation) and prediction (strong augmentation)
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[allowframebreaks]{CMPL (Cross-Model Pseudo-Labeling)}
\begin{itemize}
\item $F(\cdot) \coloneqq \text{Primary backbone}$
\item $A(\cdot) \coloneqq \text{Auxiliary network}$
\item Learning on labeled data
\begin{align*}
\mathcal{L}_s^F &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,F(\mathcal{T}^F_{\text{standard}}(v_i)))\\
\mathcal{L}_s^A &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,A(\mathcal{T}^F_{\text{standard}}(v_i)))
\end{align*}
\item $\mathcal{T}^F_{\text{standard}}(v_i) \coloneqq \text{standard augmentations for action recognition}$
\end{itemize}
\framebreak
\begin{itemize}
\item Learning on unlabeled data
\begin{align*}
\mathcal{L}_u^F &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^A) \geq \tau) \mathcal{H}(\hat{y}_i^A,F(\mathcal{T}_{\text{strong}}(u_i)))\\
\mathcal{L}_u^A &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^F) \geq \tau) \mathcal{H}(\hat{y}_i^F,A(\mathcal{T}_{\text{strong}}(u_i)))
\end{align*}
\item Complete training objective
\begin{align*}
\mathcal{L} = (\mathcal{L}_s^F + \mathcal{L}_s^A) + \lambda(\mathcal{L}_u^F + \mathcal{L}_u^A)
\end{align*}
\item $\lambda \coloneqq \text{Balancing coefficient for unsupervised loss}$
\end{itemize}
\end{frame}
\section{Implementation}
\begin{frame}{Networks}
\begin{itemize}
\item Auxiliary Network
\begin{itemize}
\item sub-network of primary model
\item 3D-ResNet18
\item \textbf{3D-ResNet50$\times$1/4}
\end{itemize}
\item Backbone network
\begin{itemize}
\item larger version of aux-net
\item \textbf{3D-ResNet50}
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{Dataset}
\begin{itemize}
\item Kinetics-400
\begin{itemize}
\item 400 categories
\item 240k/20k training/validation samples
\end{itemize}
\item UCF-101
\begin{itemize}
\item 101 classes
\item 9.5k/4k training/validation samples
\end{itemize}
\item Every video is $\approx$10\,sec long
\item 1\% or 10\% labeled subsets, sampled in a balanced way from the distribution
\end{itemize}
\end{frame}
\begin{frame}{Performance Results}
\includegraphics[scale=.65]{rsc/results}
\end{frame}
% --- THE END
\begin{frame}[focus]
Thanks for your attention!
\end{frame}
%----------------------------------------------------------------------------------------
% CLOSING/SUPPLEMENTARY SLIDES
%----------------------------------------------------------------------------------------
\appendix
\begin{frame}{Sources}
\nocite{*} % Display all references regardless of if they were cited
\bibliography{sources}
\bibliographystyle{plain}
\end{frame}
\end{document}