352 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
			
		
		
	
	
			352 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
\documentclass[usenames,dvipsnames]{beamer}
 | 
						|
%----------------------------------------------------------------------------------------
 | 
						|
%	Struktur und Pointer Referat
 | 
						|
%   20.04.2020
 | 
						|
%----------------------------------------------------------------------------------------
 | 
						|
\usetheme[nofirafonts]{focus}
 | 
						|
 | 
						|
 | 
						|
\usepackage[utf8]{inputenc}
 | 
						|
 | 
						|
\usepackage{booktabs}
 | 
						|
\usepackage{amsmath}
 | 
						|
\usepackage{amssymb}
 | 
						|
\usepackage{amsfonts}
 | 
						|
\usepackage{bbm}
 | 
						|
\usepackage{hyperref}
 | 
						|
\usepackage{graphicx}
 | 
						|
\usepackage{xcolor}
 | 
						|
\usepackage{mathtools}
 | 
						|
 | 
						|
\RequirePackage[T1]{fontenc}
 | 
						|
 | 
						|
\PassOptionsToPackage{sfdefault}{FiraSans}
 | 
						|
\RequirePackage{FiraSans}
 | 
						|
 | 
						|
\RequirePackage{FiraMono}
 | 
						|
 | 
						|
% Color definitions
 | 
						|
\definecolor{backgroundcoloreq}{RGB}{180,140,0}
 | 
						|
\definecolor{codegreen}{rgb}{0,0.6,0}
 | 
						|
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
 | 
						|
\definecolor{codepurple}{rgb}{0.58,0,0.82}
 | 
						|
\definecolor{codeorange}{RGB}{190,100,0}
 | 
						|
 | 
						|
% we want to use the default calligraphic alphabet
 | 
						|
\DeclareMathAlphabet{\mathcal}{OMS}{cmbrs}{m}{n}
 | 
						|
 | 
						|
 | 
						|
%----------------------------------------------------------------------------------------
 | 
						|
%	 TITLE SLIDE
 | 
						|
%----------------------------------------------------------------------------------------
 | 
						|
 | 
						|
\title{Cross-Model Pseudo-Labeling}
 | 
						|
 | 
						|
\subtitle{for Semi-Supervised Action Recognition}
 | 
						|
 | 
						|
\author{Lukas Heiligenbrunner}
 | 
						|
 | 
						|
\date{\today}
 | 
						|
 | 
						|
%------------------------------------------------
 | 
						|
 | 
						|
\begin{document}
 | 
						|
 | 
						|
%------------------------------------------------
 | 
						|
 | 
						|
    \begin{frame}
 | 
						|
        \maketitle
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
 | 
						|
%----------------------------------------------------------------------------------------
 | 
						|
%	 SECTION 1
 | 
						|
%----------------------------------------------------------------------------------------
 | 
						|
% todo pic of action
 | 
						|
 | 
						|
 | 
						|
    \section{The Goal}
 | 
						|
    \begin{frame}{The goal}
 | 
						|
        \begin{itemize}
 | 
						|
            \item Train model
 | 
						|
            \item Recognize action of person
 | 
						|
            \item From video [$\approx$10sec]
 | 
						|
            \item E.g.:
 | 
						|
            \begin{itemize}
 | 
						|
                \item brushing hair
 | 
						|
                \item riding bike
 | 
						|
                \item dancing
 | 
						|
                \item playing violin
 | 
						|
            \end{itemize}
 | 
						|
            \item As generic as possible
 | 
						|
        \end{itemize}
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
%----------------------------------------------------------------------------------------
 | 
						|
%	 SECTION 2
 | 
						|
%----------------------------------------------------------------------------------------
 | 
						|
 | 
						|
 | 
						|
    \section{The Problem} % Section title slide, unnumbered
 | 
						|
 | 
						|
%------------------------------------------------
 | 
						|
 | 
						|
    \begin{frame}{Missing Labels}
 | 
						|
        \begin{itemize}
 | 
						|
            \item Supervised action recognition
 | 
						|
            \begin{itemize}
 | 
						|
                \item lots of labeled samples necessary
 | 
						|
                \item videos
 | 
						|
            \end{itemize}
 | 
						|
            \item Labeling samples is very expensive
 | 
						|
            \begin{itemize}
 | 
						|
                \item Avoid!
 | 
						|
            \end{itemize}
 | 
						|
            \item Tremendous amount of unlabeled data
 | 
						|
            \begin{itemize}
 | 
						|
                \item YouTube
 | 
						|
            \end{itemize}
 | 
						|
            \item Using semi-supervised learning might be beneficial
 | 
						|
        \end{itemize}
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
%------------------------------------------------
 | 
						|
 | 
						|
    \begin{frame}{What is semi-supervised learning all about?}
 | 
						|
        \begin{itemize}
 | 
						|
            \item Supervised learning
 | 
						|
            \begin{itemize}
 | 
						|
                \item Data samples
 | 
						|
                \item Target labels
 | 
						|
                \item Each sample is associated with a target label
 | 
						|
            \end{itemize}
 | 
						|
            \item Unsupervised learning
 | 
						|
            \begin{itemize}
 | 
						|
                \item Data samples
 | 
						|
                \item target is to find patterns in data
 | 
						|
                \item without supervision
 | 
						|
            \end{itemize}
 | 
						|
            \item Semi-Supervised learning
 | 
						|
            \begin{itemize}
 | 
						|
                \item combination of both
 | 
						|
                \item have labeled \& unlabeled data
 | 
						|
                \item labeled data guides learning process
 | 
						|
                \item unlabeled data helps to gain additional information
 | 
						|
                \item goal is performance improvement
 | 
						|
            \end{itemize}
 | 
						|
        \end{itemize}
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
%------------------------------------------------
 | 
						|
 | 
						|
    \begin{frame}[allowframebreaks]{What's already been done}
 | 
						|
        \begin{itemize}
 | 
						|
            \item Pseudo-labeling
 | 
						|
            \item Train model on labeled data
 | 
						|
            \begin{itemize}
 | 
						|
                \item E.g.\ 1\%/10\% of data labeled
 | 
						|
            \end{itemize}
 | 
						|
            \item Predict pseudo-labels from unlabeled data
 | 
						|
            \item Confidence of prediction [Threshold]
 | 
						|
            \item Drop/Use prediction to train model further
 | 
						|
            \item Finally use pseudo-labels + 1\%/10\% labeled data to train main model
 | 
						|
 | 
						|
        \end{itemize}
 | 
						|
 | 
						|
        \framebreak
 | 
						|
        \begin{itemize}
 | 
						|
            \item quantity and quality of pseudo-labels
 | 
						|
            \item significant impact on main model accuracy!
 | 
						|
            \item we want to improve pseudo-label framework as much as possible
 | 
						|
        \end{itemize}
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
%----------------------------------------------------------------------------------------
 | 
						|
%	 SECTION 3
 | 
						|
%----------------------------------------------------------------------------------------
 | 
						|
 | 
						|
 | 
						|
    \section{Cross-Model Pseudo-Labeling}
 | 
						|
 | 
						|
    \begin{frame}[allowframebreaks]{Paper's approach}
 | 
						|
        \begin{itemize}
 | 
						|
            \item Based on complementary representations of models
 | 
						|
            \item Models of different size
 | 
						|
            \item Different structural-bias $\rightarrow$ different category-wise performance
 | 
						|
            \item Small model
 | 
						|
            \begin{itemize}
 | 
						|
                \item lower capacity
 | 
						|
                \item better captures temporal dynamics in recognizing actions
 | 
						|
                \item scene changes/motion over time
 | 
						|
            \end{itemize}
 | 
						|
            \item Large model
 | 
						|
            \begin{itemize}
 | 
						|
                \item better learns spatial semantics
 | 
						|
                \item to distinguish different action instances
 | 
						|
                \item localize/identify objects in specific scene
 | 
						|
            \end{itemize}
 | 
						|
        \end{itemize}
 | 
						|
 | 
						|
        \framebreak
 | 
						|
 | 
						|
        \begin{itemize}
 | 
						|
            \item Cross-Model Pseudo-Labeling
 | 
						|
            \item Primary backbone (large model)
 | 
						|
            \item Supplemented by lightweight auxiliary network
 | 
						|
            \begin{itemize}
 | 
						|
                \item Different structure
 | 
						|
                \item Fewer channels (smaller)
 | 
						|
            \end{itemize}
 | 
						|
            \item Different representation of data complements primary backbone
 | 
						|
        \end{itemize}
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
    \begin{frame}{Structure Visualization}
 | 
						|
        \includegraphics[scale=.17]{rsc/structure}
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
    \begin{frame}{Performance Perspectives}
 | 
						|
        \begin{itemize}
 | 
						|
            \item 1\% labeled data + 400 Labels
 | 
						|
            \item Kinetics-400 dataset
 | 
						|
        \end{itemize}
 | 
						|
        \includegraphics[scale=.205]{rsc/performance_comparison}
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
 | 
						|
    \section{Give me the math!}
 | 
						|
 | 
						|
    \begin{frame}{Definitions}
 | 
						|
        \begin{itemize}
 | 
						|
            \item Labeled data set of size $N_l$\\
 | 
						|
            $\mathcal{V} = \{(v_1,y_1), \dots, (v_{N_l}, y_{N_l})\}$
 | 
						|
            \item Unlabeled data set of size $N_u$\\
 | 
						|
            $\mathcal{U} = \{u_1, \dots, u_{N_u}\}$
 | 
						|
            \item in general $\lvert\mathcal{U}\rvert \gg \lvert\mathcal{V}\rvert$
 | 
						|
        \end{itemize}
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
    \begin{frame}[allowframebreaks]{How existing method \textit{FixMatch} works}
 | 
						|
        \begin{itemize}
 | 
						|
            \item $B_u \coloneqq \text{Batchsize}$
 | 
						|
            \item $\tau \coloneqq \text{Confidence Threshold (Hyperparameter)}$
 | 
						|
            \item $F(\mathcal{T}_{\text{strong}}(u_i)) \coloneqq \text{Class distribution}$
 | 
						|
            \item $p_i \coloneqq F(\mathcal{T}_{\text{weak}}(u_i))$
 | 
						|
            \item $\hat{y}_i \coloneqq \arg \max(p_i) \coloneqq \text{Pseudo Label}$
 | 
						|
            \item $\mathcal{H} \coloneqq \text{Cross-entropy loss}$
 | 
						|
            \item $\mathcal{L}_u \coloneqq \text{Loss on the unlabeled data}$
 | 
						|
            \item $F \coloneqq \text{Model}$
 | 
						|
            \item $\mathbbm{1} \coloneqq \text{Indicator Function}$
 | 
						|
        \end{itemize}
 | 
						|
        \begin{align*}
 | 
						|
            \mathcal{L}_u = \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i) \geq \tau) \mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i)))
 | 
						|
        \end{align*}
 | 
						|
 | 
						|
        \framebreak
 | 
						|
 | 
						|
        \begin{itemize}
 | 
						|
            \item $\mathbbm{1}(\max(p_i) \geq \tau)$
 | 
						|
            \begin{itemize}
 | 
						|
                \item `confidence-based masking'
 | 
						|
                \item retain label only if largest probability is above threshold
 | 
						|
                \item keep only `high confidence' labels
 | 
						|
            \end{itemize}
 | 
						|
            \item $\mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i)))$
 | 
						|
            \begin{itemize}
 | 
						|
                \item `consistency regularization'
 | 
						|
                \item cross-entropy loss between strongly augmented and weakly augmented data
 | 
						|
            \end{itemize}
 | 
						|
        \end{itemize}
 | 
						|
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
    \begin{frame}[allowframebreaks]{CMPL (Cross-Model Pseudo-Labeling)}
 | 
						|
        \begin{itemize}
 | 
						|
            \item $F(\cdot) \coloneqq \text{Primary backbone}$
 | 
						|
            \item $A(\cdot) \coloneqq \text{Auxiliary network}$
 | 
						|
            \item Learning on labeled data
 | 
						|
            \begin{align*}
 | 
						|
                \mathcal{L}_s^F &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,F(\mathcal{T}^F_{\text{standard}}(v_i)))\\
 | 
						|
                \mathcal{L}_s^A &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,A(\mathcal{T}^F_{\text{standard}}(v_i)))
 | 
						|
            \end{align*}
 | 
						|
            \item $\mathcal{T}^F_{\text{standard}}(v_i) \coloneqq \text{standard augmentations for action recognition}$
 | 
						|
        \end{itemize}
 | 
						|
 | 
						|
        \framebreak
 | 
						|
 | 
						|
        \begin{itemize}
 | 
						|
            \item Learning on unlabeled data
 | 
						|
            \begin{align*}
 | 
						|
                \mathcal{L}_u^F &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^A) \geq \tau) \mathcal{H}(\hat{y}_i^A,F(\mathcal{T}_{\text{strong}}(u_i)))\\
 | 
						|
                \mathcal{L}_u^A &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^F) \geq \tau) \mathcal{H}(\hat{y}_i^F,A(\mathcal{T}_{\text{strong}}(u_i)))
 | 
						|
            \end{align*}
 | 
						|
            \item Complete training objective
 | 
						|
            \begin{align*}
 | 
						|
                \mathcal{L} = (\mathcal{L}_s^F + \mathcal{L}_s^A) + \lambda(\mathcal{L}_u^F + \mathcal{L}_u^A)
 | 
						|
            \end{align*}
 | 
						|
            \item $\lambda \coloneqq \text{Balancing coefficient for unsupervised loss}$
 | 
						|
        \end{itemize}
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
 | 
						|
    \section{Implementation}
 | 
						|
 | 
						|
    \begin{frame}{Networks}
 | 
						|
        \begin{itemize}
 | 
						|
            \item Auxiliary Network
 | 
						|
            \begin{itemize}
 | 
						|
                \item sub-network of primary model
 | 
						|
                \item 3D-ResNet18
 | 
						|
                \item \textbf{3D-ResNet50x1/4}
 | 
						|
            \end{itemize}
 | 
						|
            \item Backbone network
 | 
						|
            \begin{itemize}
 | 
						|
                \item larger version of aux-net
 | 
						|
                \item \textbf{3D-ResNet50}
 | 
						|
            \end{itemize}
 | 
						|
        \end{itemize}
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
    \begin{frame}{Dataset}
 | 
						|
        \begin{itemize}
 | 
						|
            \item Kinetics-400
 | 
						|
            \begin{itemize}
 | 
						|
                \item 400 categories
 | 
						|
                \item 240k/20k training/validation samples
 | 
						|
            \end{itemize}
 | 
						|
            \item UCF-101
 | 
						|
            \begin{itemize}
 | 
						|
                \item 101 classes
 | 
						|
                \item 9.5k/4k training/validation samples
 | 
						|
            \end{itemize}
 | 
						|
            \item $\approx$10sec every video
 | 
						|
            \item 1\% or 10\% labeled subsets balanced sampled from distribution
 | 
						|
        \end{itemize}
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
 | 
						|
    \begin{frame}{Performance Results}
 | 
						|
        \includegraphics[scale=.65]{rsc/results}
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
 | 
						|
    % ---  THE END
 | 
						|
 | 
						|
    \begin{frame}[focus]
 | 
						|
        Thanks for your Attention!
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
%----------------------------------------------------------------------------------------
 | 
						|
%	 CLOSING/SUPPLEMENTARY SLIDES
 | 
						|
%----------------------------------------------------------------------------------------
 | 
						|
 | 
						|
    \appendix
 | 
						|
 | 
						|
    \begin{frame}{Sources}
 | 
						|
        \nocite{*} % Display all references regardless of if they were cited
 | 
						|
        \bibliography{sources}
 | 
						|
        \bibliographystyle{plain}
 | 
						|
    \end{frame}
 | 
						|
 | 
						|
\end{document}
 |