diff --git a/presentation/main.tex b/presentation/main.tex index 2efa5d3..e2d6da9 100644 --- a/presentation/main.tex +++ b/presentation/main.tex @@ -40,12 +40,9 @@ % TITLE SLIDE %---------------------------------------------------------------------------------------- -\title{Cross-Model Pseudo-Labeling} - -\subtitle{for Semi-Supervised Action Recognition} - +\title{De-Cluttering Scatterplots} +\subtitle{with Integral Images} \author{Lukas Heiligenbrunner} - \date{\today} %------------------------------------------------ @@ -59,274 +56,264 @@ \end{frame} -%---------------------------------------------------------------------------------------- -% SECTION 1 -%---------------------------------------------------------------------------------------- -% todo pic of action +%---------------------------------------------------------------------------------------- +% SECTION 1: INTRODUCTION +%---------------------------------------------------------------------------------------- - \section{The Goal} - \begin{frame}{The goal} + \section{Introduction} + + \begin{frame}{Goal of the Paper} \begin{itemize} - \item Train model - \item Recognize action of person - \item From video [$\approx$10sec] - \item E.g.: + \item Scatterplots are fundamental for exploring multidimensional data + \item But: with large datasets they suffer from \textbf{overplotting} + \item Dense regions obscure structure, samples become inaccessible + \item Goal: \begin{itemize} - \item brushing hair - \item riding bike - \item dancing - \item playing violin + \item Reduce clutter + \item Preserve neighborhood relations + \item Achieve uniform sample distribution + \item Maintain interpretability \end{itemize} - \item As generic as possible \end{itemize} \end{frame} %---------------------------------------------------------------------------------------- -% SECTION 2 +% SECTION 2: PROBLEM %---------------------------------------------------------------------------------------- + \section{Problem: Overplotting} - \section{The Problem} % Section title slide, unnumbered - -%------------------------------------------------ - - \begin{frame}{Missing Labels} + \begin{frame}{Why Scatterplots Clutter} \begin{itemize} - \item Supervised action recoginition + \item Modern datasets: millions of samples + \item Pixel resolution fixed → many samples map to the same pixel + \item Consequences: \begin{itemize} - \item lots of labeled samples necessary - \item videos + \item Occlusion of clusters + outliers + \item Loss of density information + \item Hard to select individual items + \item Misleading visual perception \end{itemize} - \item Labeling Samples very expensive - \begin{itemize} - \item Avoid! 
- \end{itemize} - \item Tremendous amount of unlabled data - \begin{itemize} - \item YouTube - \end{itemize} - \item Using semi-supervised learning might be benefitial + \item A method is needed to \textbf{declutter} without losing structure \end{itemize} \end{frame} -%------------------------------------------------ - - \begin{frame}{What's all about Semi supervised?} + \begin{frame}{Limitations of Traditional Approaches} \begin{itemize} - \item Supervised learning + \item Transparency-based methods \begin{itemize} - \item Data samples - \item Target labels - \item Each sample is associated to target label + \item Improve density perception + \item But still lose individual sample visibility \end{itemize} - \item Unsupervised learning + \item Down-sampling \begin{itemize} - \item Data samples - \item target is to find patterns in data - \item without supervision + \item Removes data → not acceptable for analysis \end{itemize} - \item Semi-Supervised learning + \item Local spatial distortions \begin{itemize} - \item combination of both - \item have labeled \& unlabeled data - \item labeled data guides learning process - \item unlabled helps to gain additional information - \item goal is performance improvement + \item Risk of collisions + \item Often non-monotonic mappings + \end{itemize} + \item Need a \textbf{global}, \textbf{smooth}, \textbf{monotonic}, \textbf{collision-free} method + \end{itemize} + \end{frame} + + %---------------------------------------------------------------------------------------- +% SECTION 3: BACKGROUND +%---------------------------------------------------------------------------------------- + + \section{Background: Density Fields \& Integral Images} + + \begin{frame}{Density Estimation} + \begin{itemize} + \item Given samples $z_i = (x_i, y_i)$ + \item Build smoothed density: + \[ + d_r(x,y) = \sum_{p=1}^n \varphi_r(x-x_p, y-y_p) + \] + \item Typically Gaussian kernel + \item Add global constant $d_0$ for stability: + \[ + d(i,j) = d_r(i,j) + d_0 + \] + \item Ensures no empty regions → avoids singular mappings + \end{itemize} + \end{frame} + + \begin{frame}{Integral Images (InIms)} + \begin{itemize} + \item Integral images compute cumulative sums over regions + \item Four standard tables: + \[ + \alpha,\beta,\gamma,\delta + \] + \item Four tilted (45°) tables: + \[ + \alpha_t, \beta_t, \gamma_t, \delta_t + \] + \item Each encodes global density distribution + \item Key advantage: + \begin{itemize} + \item Displacements depend on \textbf{global density}, not local neighborhood + \item Avoids collisions \end{itemize} \end{itemize} \end{frame} -%------------------------------------------------ + %---------------------------------------------------------------------------------------- +% SECTION 4: METHOD +%---------------------------------------------------------------------------------------- - \begin{frame}[allowframebreaks]{What's already been done} + \section{Density-Equalizing Mapping} + + \begin{frame}{Original Mapping (Molchanov \& Linsen)} \begin{itemize} - \item Pseudo-labeling - \item Train model on labeled data + \item Prior work defined mapping: + \[ + t(x,y; d) = \frac{ + \alpha q_1 + \beta q_2 + \gamma q_3 + \delta q_4 + + \alpha_t (x,1) + \beta_t (1,y) + \gamma_t (x,0) + \delta_t (0,y) + }{2C} + \] + \item But: \begin{itemize} - \item Eg. 
1\%/10\% of data labeled + \item Not identity for uniform density + \item Iteration unstable + \item Does not converge to equalized distribution \end{itemize} - \item Predict pseudo-labels from unlabeled data - \item Confidence of prediction [Threshold] - \item Drop/Use prediction to train model further - \item Finally use pseudo-labels + 1/10\% to train main model - \end{itemize} + \end{frame} - \framebreak + \begin{frame}{Corrected Mapping (This Paper)} \begin{itemize} - \item quantity and quality of pseudo-labels - \item significant impact on main model accuracy! - \item we want to improve pseudo-label framework as much as possible + \item Compute deformation for true density $d$ + \item Compute deformation for constant density $d_0$ + \item Subtract: + \[ + t(x,y) = (x,y) + t(x,y; d) - t(x,y; d_0) + \] + \item This ensures: + \begin{itemize} + \item Identity for uniform density + \item Smooth monotonic deformation + \item Progressive convergence to equalization + \item No overlap of regions + \end{itemize} + \end{itemize} + \end{frame} + + \begin{frame}{Iterative Algorithm Overview} + \begin{enumerate} + \item Rasterize and smooth density + \item Compute integral images + \item Compute corrected deformation $t(x,y)$ + \item Apply bi-linear interpolation to sample positions + \item Iterate until: + \begin{itemize} + \item Time budget reached + \item Uniformity threshold reached + \end{itemize} + \end{enumerate} + \end{frame} +%---------------------------------------------------------------------------------------- +% SECTION 5: IMPLEMENTATION +%---------------------------------------------------------------------------------------- + + \section{GPU Implementation} + + \begin{frame}{Efficient GPU Computation} + \begin{itemize} + \item All major steps implemented on GPU: + \begin{itemize} + \item Density accumulation + \item Gaussian smoothing + \item Integral image computation + \end{itemize} + \item Fast multi-pass reduction for InIms + \item Complexity: + \[ + O(n + m) + \] + where $m = 2^k \times 2^k$ is texture resolution + \item Achieves interactive rates for millions of samples \end{itemize} \end{frame} %---------------------------------------------------------------------------------------- -% SECTION 2 +% SECTION 6: VISUAL ENCODING %---------------------------------------------------------------------------------------- + \section{Visual Encoding of Deformation} - \section{Cross-Model Pseudo-Labeling} - - \begin{frame}[allowframebreaks]{Papers approach} + \begin{frame}{Problem After Deformation} \begin{itemize} - \item Based on complementary-representations of model - \item Models of different size - \item Different structural-bias $\rightarrow$ different category-wise performance - \item Small model + \item After equalization: \begin{itemize} - \item lower capacity - \item better captures temporal dynamics in recognizing actions - \item scene changes/motion over time + \item Local densities lost + \item Cluster shapes distorted + \item Distances no longer meaningful \end{itemize} - \item Large model - \begin{itemize} - \item better learns spatial semantics - \item to distinguish different action instances - \item localize/identify objects in specific scene - \end{itemize} - \end{itemize} - - \framebreak - - \begin{itemize} - \item Cross-Model Pseudo-Labeling - \item Primary backbone (large model) - \item Supplemented by lightweight auxiliary network - \begin{itemize} - \item Different structure - \item Fewer channels (smaller) - \end{itemize} - \item Different representation of 
data complements primary backbone + \item Need additional encodings to preserve structure \end{itemize} \end{frame} - \begin{frame}{Structure Visualization} - \includegraphics[scale=.17]{rsc/structure} - \end{frame} - - \begin{frame}{Performance Perspectives} + \begin{frame}{Three Proposed Encodings} \begin{itemize} - \item 1\% labeled data + 400 Labels - \item Kinetics-400 dataset - \end{itemize} - \includegraphics[scale=.205]{rsc/performance_comparison} - \end{frame} - - - \section{Give me the math!} - - \begin{frame}{Definitions} - \begin{itemize} - \item Labeled data set of size $N_l$\\ - $\mathcal{V} = \{(v_1,y_1), \dots, (v_{N_l}, y_{N_l})\}$ - \item Unlabeled data set of size $N_u$\\ - $\mathcal{U} = \{u_1, \dots, u_{N_u}\}$ - \item in general $\lvert\mathcal{U}\rvert \gg \lvert\mathcal{V}\rvert$\\ - \end{itemize} - \end{frame} - - \begin{frame}[allowframebreaks]{How existing method \textit{FixMatch} works} - \begin{itemize} - \item $B_u \coloneqq \text{Batchsize}$ - \item $\tau \coloneqq \text{Confidence Threshold (Hyperparameter)}$ - \item $F(\mathcal{T}_{\text{strong}}(u_i)) \coloneqq \text{Class distribution}$ - \item $p_i \coloneqq F(\mathcal{T}_{\text{weak}}(u_i))$ - \item $\hat{y}_i \coloneqq \arg \max(p_i) \coloneqq \text{Pseudo Label}$ - \item $\mathcal{H} \coloneqq \text{Cross-entropy loss}$ - \item $\mathcal{L}_u \coloneqq \text{Loss on the unlabeled data}$ - \item $F \coloneqq \text{Model}$ - \item $\mathbbm{1} \coloneqq \text{Indicator Function}$ - \end{itemize} - \begin{align*} - \mathcal{L}_u = \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i) \geq \tau) \mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i))) - \end{align*} - - \framebreak - - \begin{itemize} - \item $\mathbbm{1}(\max(p_i) \geq \tau)$ + \item \textbf{Deformed grid lines} \begin{itemize} - \item 'confidence-based masking' - \item retain label only if largest probability is above threshold - \item keep only 'high confidence' labels + \item Show local expansion / contraction \end{itemize} - \item $\mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i)))$ + \item \textbf{Background density texture} \begin{itemize} - \item 'consistency regularization' - \item cross-entropy loss of strong augmented and weak augmented data + \item Shows cluster cores after deformation \end{itemize} - \end{itemize} - - \end{frame} - - \begin{frame}[allowframebreaks]{CMPL (Cross-Model Pseudo-Labeling)} - \begin{itemize} - \item $F(\cdot) \coloneqq \text{Primary backbone}$ - \item $A(\cdot) \coloneqq \text{Auxiliary network}$ - \item Learning on labeled data - \begin{align*} - \mathcal{L}_s^F &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,F(\mathcal{T}^F_{\text{standard}}(v_i)))\\ - \mathcal{L}_s^A &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,A(\mathcal{T}^F_{\text{standard}}(v_i))) - \end{align*} - \item $\mathcal{T}^F_{\text{standard}}(v_i) \coloneqq \text{standard augmentations for action recognition}$ - \end{itemize} - - \framebreak - - \begin{itemize} - \item Learning on unlabeled data - \begin{align*} - \mathcal{L}_u^F &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^A) \geq \tau) \mathcal{H}(\hat{y}_i^A,F(\mathcal{T}_{\text{strong}}(u_i)))\\ - \mathcal{L}_u^A &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^F) \geq \tau) \mathcal{H}(\hat{y}_i^F,A(\mathcal{T}_{\text{strong}}(u_i)))\\ - \end{align*} - \item Complete training objective - \begin{align*} - \mathcal{L} = (\mathcal{L}_s^F + \mathcal{L}_s^A) + \lambda(\mathcal{L}_u^F + \mathcal{L}_u^A) - \end{align*} - \item $\lambda \coloneqq 
\text{Balancing coefficient for unsupervised loss}$ - \end{itemize} - \end{frame} - - - \section{Implementation} - - \begin{frame}{Networks} - \begin{itemize} - \item Auxiliary Network + \item \textbf{Contour lines} \begin{itemize} - \item sub-network of primary model - \item 3D-ResNet18 - \item \textbf{3D-ResNet50x1/4} - \end{itemize} - \item Backbone network - \begin{itemize} - \item larger version of aux-net - \item \textbf{3D-ResNet50} + \item Reveal subcluster structure \end{itemize} \end{itemize} \end{frame} - \begin{frame}{Dataset} +%---------------------------------------------------------------------------------------- +% SECTION 7: RESULTS +%---------------------------------------------------------------------------------------- + + \section{Results} + + \begin{frame}{Performance} \begin{itemize} - \item Kinetics-400 + \item Runs at interactive frame rates: \begin{itemize} - \item 400 categories - \item 240k/20k training/validation samples + \item e.g. 4M samples in $\approx 28$ ms per iteration \end{itemize} - \item UCF-101 - \begin{itemize} - \item 101 classes - \item 9.5k/4k training/validation samples - \end{itemize} - \item $\approx$10sec every video - \item 1\% or 10\% labeled subsets balanced sampled from distribution + \item Standard deviation of samples/bin decreases monotonically + \item Overplotting fraction also decreases monotonically \end{itemize} + \centering + \includegraphics[scale=0.4]{rsc/results} \end{frame} - - \begin{frame}{Performance Results} - \includegraphics[scale=.65]{rsc/results} + \begin{frame}{User Study} + \begin{itemize} + \item 25 participants, 3 tasks: + \begin{enumerate} + \item Estimate cluster size + \item Sort clusters by size + \item Select clusters (lasso) + \end{enumerate} + \item Findings: + \begin{itemize} + \item Size estimation (T1): regularized significantly better + \item Sorting (T2): regularized significantly better + \item Cluster selection (T3): + \begin{itemize} + \item Grid encoding: worst + \item Background texture: better + \item Original scatterplot: best + \end{itemize} + \end{itemize} + \end{itemize} \end{frame}
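
The "Integral Images (InIms)" frame above lists the cumulative tables the deformation is built from. As a point of reference, the sketch below builds a single standard summed-area table and answers an arbitrary box-sum query with four lookups. This is only the classic table; the paper additionally uses three more straight and four 45°-tilted tables, which are not reproduced here, and all names (`integral_image`, `box_sum`, the 512×512 test density) are illustrative assumptions rather than the paper's implementation.

```python
# Minimal summed-area-table sketch (standard table only, no tilted variants).
import numpy as np

def integral_image(d):
    """I[i, j] = sum of d[:i, :j]; zero-padded so queries need no bounds checks."""
    return np.pad(d.cumsum(axis=0).cumsum(axis=1), ((1, 0), (1, 0)))

def box_sum(I, r0, r1, c0, c1):
    """Sum of d[r0:r1, c0:c1] from four table lookups, independent of box size."""
    return I[r1, c1] - I[r0, c1] - I[r1, c0] + I[r0, c0]

density = np.random.rand(512, 512)          # stand-in for the rasterized density
I = integral_image(density)
assert np.isclose(box_sum(I, 10, 60, 20, 90), density[10:60, 20:90].sum())
```

Constant-cost queries of this kind are what keep the per-iteration work in the $O(n + m)$ range quoted on the GPU slide, since each table is built once per iteration and then read in O(1) per lookup.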
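
To make the loop on the "Iterative Algorithm Overview" frame concrete, here is a deliberately simplified stand-in: instead of the paper's displacement built from the eight integral images, it equalizes only the two marginal densities through their cumulative sums, but it keeps the same structure of rasterize, smooth, add $d_0$, remap, iterate. All identifiers and parameter values (`equalize_step`, `res`, `sigma`, `d0`) are assumptions for illustration, not taken from the paper.

```python
# Simplified rasterize -> integrate -> displace -> iterate loop.
# The displacement equalizes only the x- and y-marginals via their CDFs;
# the paper's actual mapping uses the eight integral images and is not reproduced.
import numpy as np
from scipy.ndimage import gaussian_filter

def equalize_step(pts, res=256, sigma=4.0, d0=1e-3):
    """One de-cluttering iteration on points in the unit square."""
    # 1) rasterize and smooth the density, add the global offset d_0
    hist, _, _ = np.histogram2d(pts[:, 0], pts[:, 1],
                                bins=res, range=[[0, 1], [0, 1]])
    d = gaussian_filter(hist, sigma) + d0
    # 2) cumulative tables of the x- and y-marginals (integral-image analogue)
    cdf_x = np.cumsum(d.sum(axis=1))
    cdf_x /= cdf_x[-1]
    cdf_y = np.cumsum(d.sum(axis=0))
    cdf_y /= cdf_y[-1]
    # 3) monotone remap of the sample positions (1D analogue of the bilinear lookup)
    grid = np.linspace(0.0, 1.0, res)
    return np.column_stack([np.interp(pts[:, 0], grid, cdf_x),
                            np.interp(pts[:, 1], grid, cdf_y)])

# iterate until a time budget or a uniformity threshold is reached
pts = np.clip(np.random.randn(100_000, 2) * 0.1 + 0.5, 0.0, 1.0)
for _ in range(5):
    pts = equalize_step(pts)
```

Because $d_0 > 0$ is added to every bin, each cumulative table is strictly increasing, so the remap stays monotone and collision-free; this is the same role the global offset plays on the "Density Estimation" frame.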