Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f3405340cf | |||
| 34c9757f6c | |||
| 29715ff95d | |||
| 74ae33c267 | |||
| a586a15f8c | |||
| 1384d2036e | |||
| 905bad7af3 |
@@ -1,4 +1,4 @@
|
||||
\documentclass[usenames,dvipsnames]{beamer}
|
||||
\documentclass[usenames,dvipsnames, aspectratio=169]{beamer}
|
||||
%----------------------------------------------------------------------------------------
|
||||
% Struktur und Pointer Referat
|
||||
% 20.04.2020
|
||||
@@ -40,12 +40,9 @@
|
||||
% TITLE SLIDE
|
||||
%----------------------------------------------------------------------------------------
|
||||
|
||||
\title{Cross-Model Pseudo-Labeling}
|
||||
|
||||
\subtitle{for Semi-Supervised Action Recognition}
|
||||
|
||||
\title{De-Cluttering Scatterplots}
|
||||
\subtitle{with Integral Images}
|
||||
\author{Lukas Heiligenbrunner}
|
||||
|
||||
\date{\today}
|
||||
|
||||
%------------------------------------------------
|
||||
@@ -59,276 +56,351 @@
|
||||
\end{frame}
|
||||
|
||||
|
||||
%----------------------------------------------------------------------------------------
|
||||
% SECTION 1
|
||||
%----------------------------------------------------------------------------------------
|
||||
% todo pic of action
|
||||
|
||||
|
||||
\section{The Goal}
|
||||
\begin{frame}{The goal}
|
||||
\begin{itemize}
|
||||
\item Train model
|
||||
\item Recognize action of person
|
||||
\item From video [$\approx$10sec]
|
||||
\item E.g.:
|
||||
\begin{itemize}
|
||||
\item brushing hair
|
||||
\item riding bike
|
||||
\item dancing
|
||||
\item playing violin
|
||||
\end{itemize}
|
||||
\item As generic as possible
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
%----------------------------------------------------------------------------------------
|
||||
% SECTION 2
|
||||
% SECTION 1: INTRODUCTION
|
||||
%----------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
\section{The Problem} % Section title slide, unnumbered
|
||||
|
||||
%------------------------------------------------
|
||||
|
||||
\begin{frame}{Missing Labels}
|
||||
\begin{itemize}
|
||||
\item Supervised action recognition
|
||||
\begin{itemize}
|
||||
\item lots of labeled samples necessary
|
||||
\item videos
|
||||
\end{itemize}
|
||||
\item Labeling samples is very expensive
|
||||
\begin{itemize}
|
||||
\item Avoid!
|
||||
\end{itemize}
|
||||
\item Tremendous amount of unlabeled data
|
||||
\begin{itemize}
|
||||
\item YouTube
|
||||
\end{itemize}
|
||||
\item Using semi-supervised learning might be beneficial
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
%------------------------------------------------
|
||||
|
||||
\begin{frame}{What's all about Semi supervised?}
|
||||
\begin{itemize}
|
||||
\item Supervised learning
|
||||
\begin{itemize}
|
||||
\item Data samples
|
||||
\item Target labels
|
||||
\item Each sample is associated to target label
|
||||
\end{itemize}
|
||||
\item Unsupervised learning
|
||||
\begin{itemize}
|
||||
\item Data samples
|
||||
\item target is to find patterns in data
|
||||
\item without supervision
|
||||
\end{itemize}
|
||||
\item Semi-Supervised learning
|
||||
\begin{itemize}
|
||||
\item combination of both
|
||||
\item have labeled \& unlabeled data
|
||||
\item labeled data guides learning process
|
||||
\item unlabeled data helps to gain additional information
|
||||
\item goal is performance improvement
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
%------------------------------------------------
|
||||
|
||||
\begin{frame}[allowframebreaks]{What's already been done}
|
||||
\begin{itemize}
|
||||
\item Pseudo-labeling
|
||||
\item Train model on labeled data
|
||||
\begin{itemize}
|
||||
\item E.g.\ 1\%/10\% of data labeled
|
||||
\end{itemize}
|
||||
\item Predict pseudo-labels from unlabeled data
|
||||
\item Confidence of prediction [Threshold]
|
||||
\item Drop/Use prediction to train model further
|
||||
\item Finally use pseudo-labels + 1/10\% to train main model
|
||||
|
||||
\end{itemize}
|
||||
|
||||
\framebreak
|
||||
\begin{itemize}
|
||||
\item quantity and quality of pseudo-labels
|
||||
\item significant impact on main model accuracy!
|
||||
\item we want to improve pseudo-label framework as much as possible
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
%----------------------------------------------------------------------------------------
|
||||
% SECTION 2
|
||||
% SECTION 2: PROBLEM
|
||||
%----------------------------------------------------------------------------------------
|
||||
|
||||
\section{Introduction}
|
||||
|
||||
\section{Cross-Model Pseudo-Labeling}
|
||||
|
||||
\begin{frame}[allowframebreaks]{Papers approach}
|
||||
\begin{frame}{Problem: Scatterplots Clutter}
|
||||
\begin{itemize}
|
||||
\item Based on complementary-representations of model
|
||||
\item Models of different size
|
||||
\item Different structural-bias $\rightarrow$ different category-wise performance
|
||||
\item Small model
|
||||
\item Scatterplots are fundamental for exploring multidimensional data
|
||||
\item Modern datasets: millions of samples
|
||||
\item Pixel resolution fixed $\rightarrow$ many samples map to the same pixel
|
||||
\item This results in \textbf{overplotting}
|
||||
\item Consequences:
|
||||
\begin{itemize}
|
||||
\item lower capacity
|
||||
\item better captures temporal dynamics in recognizing actions
|
||||
\item scene changes/motion over time
|
||||
\item Occlusion of clusters
|
||||
\item Loss of density information
|
||||
\item Hard to select and see individual items
|
||||
%\item Misleading visual perception
|
||||
\end{itemize}
|
||||
\item Large model
|
||||
\item A method is needed to \textbf{declutter} without losing structure
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\centering
|
||||
\includegraphics[scale=0.8]{rsc/overplotting}
|
||||
\footnotesize\text{Source: \cite{statisticsglobe_overplotting_r}}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Goal of the Paper}
|
||||
\begin{itemize}
|
||||
\item Goal:
|
||||
\begin{itemize}
|
||||
\item better learns spatial semantics
|
||||
\item to distinguish different action instances
|
||||
\item localize/identify objects in specific scene
|
||||
\item Reduce clutter
|
||||
\item Preserve neighborhood relations
|
||||
\item Achieve uniform sample distribution
|
||||
\item Maintain interpretability
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\framebreak
|
||||
|
||||
\begin{frame}{Limitations of Traditional Approaches}
|
||||
\begin{itemize}
|
||||
\item Cross-Model Pseudo-Labeling
|
||||
\item Primary backbone (large model)
|
||||
\item Supplemented by lightweight auxiliary network
|
||||
\item Transparency-based methods
|
||||
\begin{itemize}
|
||||
\item Different structure
|
||||
\item Fewer channels (smaller)
|
||||
\item Improve density perception
|
||||
\item But still lose individual sample visibility
|
||||
\end{itemize}
|
||||
\item Different representation of data complements primary backbone
|
||||
\item Down-sampling
|
||||
\begin{itemize}
|
||||
\item Removes data $\rightarrow$ not acceptable for analysis
|
||||
\end{itemize}
|
||||
\item Local spatial distortions
|
||||
\begin{itemize}
|
||||
\item Risk of collisions
|
||||
\item Often non-monotonic mappings
|
||||
\end{itemize}
|
||||
\item Need a \textbf{global}, \textbf{smooth}, \textbf{monotonic}, \textbf{collision-free} method
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
%----------------------------------------------------------------------------------------
|
||||
% SECTION 3: BACKGROUND
|
||||
%----------------------------------------------------------------------------------------
|
||||
|
||||
\section{Background:\\Density Fields \& Integral Images}
|
||||
|
||||
\begin{frame}{Density Estimation}
|
||||
\begin{itemize}
|
||||
\item Given samples $z_i = (x_i, y_i)$
|
||||
\item Build smoothed density:
|
||||
\[
|
||||
d_r(x,y) = \sum_{p=1}^n \varphi_r(x-x_p, y-y_p)
|
||||
\]
|
||||
\item Typically Gaussian kernel
|
||||
\item Add global constant $d_0$ for stability:
|
||||
\[
|
||||
d(i,j) = d_r(i,j) + d_0
|
||||
\]
|
||||
\item Ensures no empty regions $\rightarrow$ avoids singular mappings
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Structure Visualization}
|
||||
\includegraphics[scale=.17]{rsc/structure}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Performance Perspectives}
|
||||
\begin{frame}{Integral Images (InIms) I}
|
||||
\begin{itemize}
|
||||
\item 1\% labeled data + 400 Labels
|
||||
\item Kinetics-400 dataset
|
||||
\end{itemize}
|
||||
\includegraphics[scale=.205]{rsc/performance_comparison}
|
||||
\end{frame}
|
||||
|
||||
|
||||
\section{Give me the math!}
|
||||
|
||||
\begin{frame}{Definitions}
|
||||
\begin{itemize}
|
||||
\item Labeled data set of size $N_l$\\
|
||||
$\mathcal{V} = \{(v_1,y_1), \dots, (v_{N_l}, y_{N_l})\}$
|
||||
\item Unlabeled data set of size $N_u$\\
|
||||
$\mathcal{U} = \{u_1, \dots, u_{N_u}\}$
|
||||
\item in general $\lvert\mathcal{U}\rvert \gg \lvert\mathcal{V}\rvert$\\
|
||||
\item Integral images compute cumulative sums over regions
|
||||
\item Four standard tables:
|
||||
\[
|
||||
\alpha,\beta,\gamma,\delta
|
||||
\]
|
||||
\item Four tilted ($45^\circ$) tables:
|
||||
\[
|
||||
\alpha_t, \beta_t, \gamma_t, \delta_t
|
||||
\]
|
||||
\item Each encodes global density distribution
|
||||
\item Key advantage:
|
||||
\begin{itemize}
|
||||
\item Displacements depend on \textbf{global density}, not local neighborhood
|
||||
\item Avoids collisions
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[allowframebreaks]{How existing method \textit{FixMatch} works}
|
||||
\begin{frame}{Integral Images (InIms) II}
|
||||
\centering
|
||||
\includegraphics[scale=0.3]{rsc/2408.06513v1_page_6_5}\\
|
||||
\footnotesize\text{Source: \cite{Rave_2025}}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Integral Images (InIms) III}
|
||||
\centering
|
||||
\includegraphics[scale=0.3]{rsc/2408.06513v1_page_6_6}\\
|
||||
\footnotesize\text{Source: \cite{Rave_2025}}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Integral Images (InIms) IV}
|
||||
\centering
|
||||
\includegraphics[scale=0.3]{rsc/2408.06513v1_page_6_7}\\
|
||||
\footnotesize\text{Source: \cite{Rave_2025}}
|
||||
\end{frame}
|
||||
|
||||
%----------------------------------------------------------------------------------------
|
||||
% SECTION 4: METHOD
|
||||
%----------------------------------------------------------------------------------------
|
||||
|
||||
\section{Density-Equalizing Mapping}
|
||||
|
||||
\begin{frame}{Goal of the Mapping}
|
||||
\begin{itemize}
|
||||
\item $B_u \coloneqq \text{Batchsize}$
|
||||
\item $\tau \coloneqq \text{Confidence Threshold (Hyperparameter)}$
|
||||
\item $F(\mathcal{T}_{\text{strong}}(u_i)) \coloneqq \text{Class distribution}$
|
||||
\item $p_i \coloneqq F(\mathcal{T}_{\text{weak}}(u_i))$
|
||||
\item $\hat{y}_i \coloneqq \arg \max(p_i) \coloneqq \text{Pseudo Label}$
|
||||
\item $\mathcal{H} \coloneqq \text{Cross-entropy loss}$
|
||||
\item $\mathcal{L}_u \coloneqq \text{Loss on the unlabeled data}$
|
||||
\item $F \coloneqq \text{Model}$
|
||||
\item $\mathbbm{1} \coloneqq \text{Indicator Function}$
|
||||
\item We want to transform the scatterplot domain so that:
|
||||
\begin{itemize}
|
||||
\item dense regions expand
|
||||
\item sparse regions contract
|
||||
\item overall density becomes approximately uniform
|
||||
\end{itemize}
|
||||
\item The deformation must be:
|
||||
\begin{itemize}
|
||||
\item smooth
|
||||
\item globally consistent
|
||||
\item monotonic (no point order swaps)
|
||||
\item free of collisions
|
||||
\end{itemize}
|
||||
\item To achieve this, we compute a \textbf{density–driven displacement field}.
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Corrected Mapping: Key Idea}
|
||||
\begin{itemize}
|
||||
\item Let $t(x,y; d)$ be the deformation computed from the
|
||||
\textbf{actual density field} $d(x,y)$.
|
||||
\item This deformation is built from cumulative sums of density
|
||||
through the integral images.
|
||||
\item Problem: even for \textbf{constant density}, $t(x,y; d_0)$
|
||||
is \emph{not} zero (due to construction of the integral tables).
|
||||
\item Therefore:\\
|
||||
We subtract the deformation caused by constant density.
|
||||
\end{itemize}
|
||||
\begin{align*}
|
||||
\mathcal{L}_u = \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i) \geq \tau) \mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i)))
|
||||
T(x,y) = (x,y) \;+\; t(x,y; d) \;-\; t(x,y; d_0) \;
|
||||
\end{align*}
|
||||
|
||||
\framebreak
|
||||
|
||||
\begin{itemize}
|
||||
\item $\mathbbm{1}(\max(p_i) \geq \tau)$
|
||||
\begin{itemize}
|
||||
\item ``confidence-based masking''
|
||||
\item retain label only if largest probability is above threshold
|
||||
\item keep only ``high confidence'' labels
|
||||
\end{itemize}
|
||||
\item $\mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i)))$
|
||||
\begin{itemize}
|
||||
\item ``consistency regularization''
|
||||
\item cross-entropy loss of strong augmented and weak augmented data
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[allowframebreaks]{CMPL (Cross-Model Pseudo-Labeling)}
|
||||
\begin{itemize}
|
||||
\item $F(\cdot) \coloneqq \text{Primary backbone}$
|
||||
\item $A(\cdot) \coloneqq \text{Auxiliary network}$
|
||||
\item Learning on labeled data
|
||||
\begin{align*}
|
||||
\mathcal{L}_s^F &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,F(\mathcal{T}^F_{\text{standard}}(v_i)))\\
|
||||
\mathcal{L}_s^A &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,A(\mathcal{T}^F_{\text{standard}}(v_i)))
|
||||
\end{align*}
|
||||
\item $\mathcal{T}^F_{\text{standard}}(v_i) \coloneqq \text{standard augmentations for action recognition}$
|
||||
\end{itemize}
|
||||
|
||||
\framebreak
|
||||
|
||||
\begin{itemize}
|
||||
\item Learning on unlabeled data
|
||||
\begin{align*}
|
||||
\mathcal{L}_u^F &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^A) \geq \tau) \mathcal{H}(\hat{y}_i^A,F(\mathcal{T}_{\text{strong}}(u_i)))\\
|
||||
\mathcal{L}_u^A &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^F) \geq \tau) \mathcal{H}(\hat{y}_i^F,A(\mathcal{T}_{\text{strong}}(u_i)))\\
|
||||
\end{align*}
|
||||
\item Complete training objective
|
||||
\begin{align*}
|
||||
\mathcal{L} = (\mathcal{L}_s^F + \mathcal{L}_s^A) + \lambda(\mathcal{L}_u^F + \mathcal{L}_u^A)
|
||||
\end{align*}
|
||||
\item $\lambda \coloneqq \text{Balancing coefficient for unsupervised loss}$
|
||||
\item $T(x,y)$ is the \textbf{corrected mapping}.
|
||||
\item For uniform density: $t(x,y; d) = t(x,y; d_0)$ $\rightarrow$ identity mapping.
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
% \begin{frame}{Why the Corrected Mapping Works}
|
||||
% \begin{itemize}
|
||||
% \item \textbf{Identity on uniform density}
|
||||
% \begin{itemize}
|
||||
% \item Without correction: the old mapping distorted even uniform fields.
|
||||
% \item With correction: uniform density $\rightarrow$ no deformation.
|
||||
% \end{itemize}
|
||||
% \item \textbf{Monotonicity}
|
||||
% \begin{itemize}
|
||||
% \item The corrected mapping guarantees no coordinate inversions.
|
||||
% \item Order of points is preserved along both axes.
|
||||
% \end{itemize}
|
||||
% \item \textbf{Smoothness}
|
||||
% \begin{itemize}
|
||||
% \item The mapping is built from integral images (global cumulative fields),
|
||||
% \item yielding slow, continuous changes.
|
||||
% \end{itemize}
|
||||
% \item \textbf{Stability in iteration}
|
||||
% \begin{itemize}
|
||||
% \item As the density becomes more equalized, $t(x,y;d)$ approaches $t(x,y;d_0)$.
|
||||
% \item Mapping naturally converges toward identity.
|
||||
% \end{itemize}
|
||||
% \item \textbf{No collisions}
|
||||
% \begin{itemize}
|
||||
% \item Global, monotonic deformation prevents points from crossing paths.
|
||||
% \end{itemize}
|
||||
% \end{itemize}
|
||||
% \end{frame}
|
||||
|
||||
\section{Implementation}
|
||||
|
||||
\begin{frame}{Networks}
|
||||
\begin{itemize}
|
||||
\item Auxiliary Network
|
||||
\begin{frame}{Iterative Algorithm Overview}
|
||||
\begin{enumerate}
|
||||
\item Rasterize and smooth density
|
||||
\item Compute integral images
|
||||
\item Compute corrected deformation $t(x,y)$
|
||||
\item Apply bilinear interpolation to sample positions
|
||||
\item Iterate until:
|
||||
\begin{itemize}
|
||||
\item sub-network of primary model
|
||||
\item 3D-ResNet18
|
||||
\item \textbf{3D-ResNet50x1/4}
|
||||
\item Time budget reached
|
||||
\item Uniformity threshold reached
|
||||
\end{itemize}
|
||||
\item Backbone network
|
||||
\end{enumerate}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\centering
|
||||
\begin{figure}
|
||||
\centering
|
||||
\begin{minipage}{0.4\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{rsc/2408.06513v1_page_7_1}
|
||||
|
||||
\vspace{4pt}
|
||||
\footnotesize MNIST Dataset (UMAP)~\cite{Rave_2025}
|
||||
\end{minipage}
|
||||
\begin{minipage}{0.15\textwidth}
|
||||
\centering
|
||||
$\Longrightarrow$
|
||||
\end{minipage}
|
||||
\begin{minipage}{0.4\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{rsc/2408.06513v1_page_7_2}
|
||||
|
||||
\vspace{4pt}
|
||||
\footnotesize Visual encoding of the density-equalizing transform (32 Iterations)~\cite{Rave_2025}
|
||||
\end{minipage}
|
||||
\label{fig:figure}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
%----------------------------------------------------------------------------------------
|
||||
% SECTION 6: VISUAL ENCODING
|
||||
%----------------------------------------------------------------------------------------
|
||||
|
||||
\section{Visual Encoding of Deformation}
|
||||
|
||||
\begin{frame}{Problem After Deformation}
|
||||
\begin{itemize}
|
||||
\item After equalization:
|
||||
\begin{itemize}
|
||||
\item larger version of aux-net
|
||||
\item \textbf{3D-ResNet50}
|
||||
\item Local densities lost
|
||||
\item Cluster shapes distorted
|
||||
\item Distances no longer meaningful
|
||||
\end{itemize}
|
||||
\item Need additional encodings to preserve structure
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Three Proposed Encodings I}
|
||||
\begin{itemize}
|
||||
\item \textbf{Deformed grid lines}
|
||||
\begin{itemize}
|
||||
\item Show local expansion / contraction
|
||||
\end{itemize}
|
||||
\item \textbf{Background density texture}
|
||||
\begin{itemize}
|
||||
\item Shows cluster cores after deformation
|
||||
\end{itemize}
|
||||
\item \textbf{Contour lines}
|
||||
\begin{itemize}
|
||||
\item Reveal subcluster structure
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Dataset}
|
||||
\begin{itemize}
|
||||
\item Kinetics-400
|
||||
\begin{itemize}
|
||||
\item 400 categories
|
||||
\item 240k/20k training/validation samples
|
||||
\end{itemize}
|
||||
\item UCF-101
|
||||
\begin{itemize}
|
||||
\item 101 classes
|
||||
\item 9.5k/4k training/validation samples
|
||||
\end{itemize}
|
||||
\item $\approx$10sec every video
|
||||
\item 1\% or 10\% labeled subsets sampled in a class-balanced way from the distribution
|
||||
\end{itemize}
|
||||
\begin{frame}{Three Proposed Encodings II}
|
||||
\centering
|
||||
\begin{figure}
|
||||
\centering
|
||||
\begin{minipage}{0.3\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{rsc/2408.06513v1_page_7_2}
|
||||
|
||||
\vspace{4pt}
|
||||
\footnotesize Deformed grid lines~\cite{Rave_2025}
|
||||
\end{minipage}
|
||||
\begin{minipage}{0.3\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{rsc/2408.06513v1_page_7_3}
|
||||
|
||||
\vspace{4pt}
|
||||
\footnotesize Background density texture~\cite{Rave_2025}
|
||||
\end{minipage}
|
||||
\begin{minipage}{0.3\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{rsc/2408.06513v1_page_7_4}
|
||||
|
||||
\vspace{4pt}
|
||||
\footnotesize Contour lines~\cite{Rave_2025}
|
||||
\end{minipage}
|
||||
\label{fig:figure2}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
%----------------------------------------------------------------------------------------
|
||||
% SECTION 5: IMPLEMENTATION
|
||||
%----------------------------------------------------------------------------------------
|
||||
|
||||
\begin{frame}{Performance Results}
|
||||
\includegraphics[scale=.65]{rsc/results}
|
||||
|
||||
\begin{frame}{Example I}
|
||||
\centering
|
||||
\includegraphics[scale=0.1]{rsc/2408.06513v1_page_8_1}\\
|
||||
\footnotesize\text{Source: \cite{Rave_2025}}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Example II}
|
||||
\centering
|
||||
\includegraphics[scale=0.1]{rsc/2408.06513v1_page_8_2}\\
|
||||
\footnotesize\text{Source: \cite{Rave_2025}}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Example III}
|
||||
\centering
|
||||
\includegraphics[scale=0.1]{rsc/2408.06513v1_page_8_3}\\
|
||||
\footnotesize\text{Source: \cite{Rave_2025}}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Example IV}
|
||||
\centering
|
||||
\includegraphics[scale=0.1]{rsc/2408.06513v1_page_8_4}\\
|
||||
\footnotesize\text{Source: \cite{Rave_2025}}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Example V}
|
||||
\centering
|
||||
\includegraphics[scale=0.1]{rsc/2408.06513v1_page_8_5}\\
|
||||
\footnotesize\text{Source: \cite{Rave_2025}}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Example VI}
|
||||
\centering
|
||||
\includegraphics[scale=0.1]{rsc/2408.06513v1_page_8_6}\\
|
||||
\footnotesize\text{Source: \cite{Rave_2025}}
|
||||
\end{frame}
|
||||
|
||||
% --- THE END
|
||||
|
||||
@@ -342,10 +414,58 @@
|
||||
|
||||
\appendix
|
||||
|
||||
\section{Backup Slides}\label{sec:backup}
|
||||
|
||||
\begin{frame}{Efficient GPU Computation}
|
||||
\begin{itemize}
|
||||
\item All major steps implemented on GPU:
|
||||
\begin{itemize}
|
||||
\item Density accumulation $\rightarrow$ vertex + fragment shader
|
||||
\item Gaussian smoothing $\rightarrow$ 2 compute-shader passes
|
||||
\item Integral image computation $\rightarrow$ fragment shader
|
||||
\end{itemize}
|
||||
\item Achieves interactive rates for millions of samples
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Performance}
|
||||
\begin{itemize}
|
||||
\item Runs at interactive frame rates:
|
||||
\begin{itemize}
|
||||
\item e.g. 4M samples in $\approx 3$ ms per iteration
|
||||
\end{itemize}
|
||||
%\item Standard deviation of samples/bin decreases monotonically
|
||||
%\item Overplotting fraction also decreases monotonically
|
||||
\end{itemize}
|
||||
\centering
|
||||
\includegraphics[scale=0.4]{rsc/img}\\
|
||||
Source:~\cite{Rave_2025}
|
||||
\end{frame}
|
||||
|
||||
\section{Math: Domain Transformation}
|
||||
\begin{frame}{Domain Transformation (Molchanov \& Linsen)}
|
||||
\begin{itemize}
|
||||
\item Integral Images $\rightarrow$ Transformation mapping
|
||||
\item Definition:
|
||||
\[
|
||||
t(x,y; d) = \frac{
|
||||
\alpha q_1 + \beta q_2 + \gamma q_3 + \delta q_4
|
||||
+ \alpha_t (x,1) + \beta_t (1,y) + \gamma_t (x,0) + \delta_t (0,y)
|
||||
}{2C}
|
||||
\]
|
||||
\item Problems:
|
||||
\begin{itemize}
|
||||
\item Not identity for uniform density
|
||||
\item Iteration unstable
|
||||
\item Does not converge to equalized distribution
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
|
||||
\begin{frame}{Sources}
|
||||
\nocite{*} % Display all references regardless of if they were cited
|
||||
\bibliography{sources}
|
||||
\bibliographystyle{plain}
|
||||
\end{frame}
|
||||
|
||||
\end{document}
|
||||
|
||||
BIN
presentation/rsc/2408.06513v1_page_6_5.png
Normal file
|
After Width: | Height: | Size: 30 KiB |
BIN
presentation/rsc/2408.06513v1_page_6_6.png
Normal file
|
After Width: | Height: | Size: 39 KiB |
BIN
presentation/rsc/2408.06513v1_page_6_7.png
Normal file
|
After Width: | Height: | Size: 44 KiB |
BIN
presentation/rsc/2408.06513v1_page_7_1.png
Normal file
|
After Width: | Height: | Size: 196 KiB |
BIN
presentation/rsc/2408.06513v1_page_7_2.png
Normal file
|
After Width: | Height: | Size: 1.0 MiB |
BIN
presentation/rsc/2408.06513v1_page_7_3.png
Normal file
|
After Width: | Height: | Size: 1.6 MiB |
BIN
presentation/rsc/2408.06513v1_page_7_4.png
Normal file
|
After Width: | Height: | Size: 1001 KiB |
BIN
presentation/rsc/2408.06513v1_page_8_1.png
Normal file
|
After Width: | Height: | Size: 279 KiB |
BIN
presentation/rsc/2408.06513v1_page_8_2.png
Normal file
|
After Width: | Height: | Size: 403 KiB |
BIN
presentation/rsc/2408.06513v1_page_8_3.png
Normal file
|
After Width: | Height: | Size: 548 KiB |
BIN
presentation/rsc/2408.06513v1_page_8_4.png
Normal file
|
After Width: | Height: | Size: 746 KiB |
BIN
presentation/rsc/2408.06513v1_page_8_5.png
Normal file
|
After Width: | Height: | Size: 927 KiB |
BIN
presentation/rsc/2408.06513v1_page_8_6.png
Normal file
|
After Width: | Height: | Size: 1.1 MiB |
BIN
presentation/rsc/img.png
Normal file
|
After Width: | Height: | Size: 49 KiB |
BIN
presentation/rsc/overplotting.png
Normal file
|
After Width: | Height: | Size: 188 KiB |
|
Before Width: | Height: | Size: 226 KiB |
|
Before Width: | Height: | Size: 438 KiB |
|
Before Width: | Height: | Size: 470 KiB |
@@ -1,16 +1,20 @@
|
||||
@InProceedings{Xu_2022_CVPR,
|
||||
author = {Xu, Yinghao and Wei, Fangyun and Sun, Xiao and Yang, Ceyuan and Shen, Yujun and Dai, Bo and Zhou, Bolei and Lin, Stephen},
|
||||
title = {Cross-Model Pseudo-Labeling for Semi-Supervised Action Recognition},
|
||||
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
|
||||
month = {June},
|
||||
year = {2022},
|
||||
pages = {2959-2968}
|
||||
}
|
||||
@article{Rave_2025,
|
||||
title={De-Cluttering Scatterplots With Integral Images},
|
||||
volume={31},
|
||||
ISSN={2160-9306},
|
||||
url={http://dx.doi.org/10.1109/TVCG.2024.3381453},
|
||||
DOI={10.1109/tvcg.2024.3381453},
|
||||
number={4},
|
||||
journal={IEEE Transactions on Visualization and Computer Graphics},
|
||||
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
|
||||
author={Rave, Hennes and Molchanov, Vladimir and Linsen, Lars},
|
||||
year={2025},
|
||||
month=apr, pages={2114--2126} }
|
||||
|
||||
@online{knuthwebsite,
|
||||
author = "Kihyuk Sohn, David Berthelot, Chun-Liang Li",
|
||||
title = "FixMatch: Simplifying Semi-Supervised Learning with Consistency and Confidence",
|
||||
url = "https://arxiv.org/abs/2001.07685",
|
||||
addendum = "(accessed: 20.03.2023)",
|
||||
keywords = "FixMatch, semi-supervised"
|
||||
@online{statisticsglobe_overplotting_r,
|
||||
author = {Statistics Globe},
|
||||
title = {Avoid Overplotting in R (4 Examples) | Point Size, Opacity \& Color},
|
||||
year = {2025},
|
||||
url = {https://statisticsglobe.com/avoid-overplotting-r},
|
||||
note = {Accessed: 2025-11-23}
|
||||
}
|
||||
|
||||
266
summary/main.tex
@@ -1,9 +1,12 @@
|
||||
\DocumentMetadata{}
|
||||
\documentclass[sigconf]{acmart}
|
||||
\usepackage{amsmath}
|
||||
\usepackage{bbm}
|
||||
\usepackage{mathtools}
|
||||
|
||||
\usepackage[inline]{enumitem}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{subcaption}
|
||||
|
||||
\settopmatter{printacmref=false} % Removes citation information below abstract
|
||||
\renewcommand\footnotetextcopyrightpermission[1]{} % removes footnote with conference information in first column
|
||||
@@ -15,7 +18,7 @@
|
||||
\providecommand\BibTeX{{%
|
||||
\normalfont B\kern-0.5em{\scshape i\kern-0.25em b}\kern-0.8em\TeX}}}
|
||||
|
||||
\acmConference{Cross-Model Pseudo-Labeling}{2023}{Linz}
|
||||
\acmConference{De-cluttering Scatterplots with Integral Images}{2026}{Linz}
|
||||
|
||||
%%
|
||||
%% end of the preamble, start of the body of the document source.
|
||||
@@ -24,7 +27,7 @@
|
||||
%%
|
||||
%% The "title" command has an optional parameter,
|
||||
%% allowing the author to define a "short title" to be used in page headers.
|
||||
\title{Cross-Model Pseudo-Labeling for Semi-Supervised Action recognition}
|
||||
\title{De-cluttering Scatterplots with Integral Images}
|
||||
|
||||
%%
|
||||
%% The "author" command and its associated commands are used to define
|
||||
@@ -37,7 +40,7 @@
|
||||
\affiliation{%
|
||||
\institution{Johannes Kepler University Linz}
|
||||
\city{Linz}
|
||||
\state{Upperaustria}
|
||||
\state{Upper Austria}
|
||||
\country{Austria}
|
||||
\postcode{4020}
|
||||
}
|
||||
@@ -48,22 +51,22 @@
|
||||
%% other information printed in the page headers. This command allows
|
||||
%% the author to define a more concise list
|
||||
%% of authors' names for this purpose.
|
||||
\renewcommand{\shortauthors}{Lukas Heilgenbrunner}
|
||||
\renewcommand{\shortauthors}{Lukas Heiligenbrunner}
|
||||
|
||||
%%
|
||||
%% The abstract is a short summary of the work to be presented in the
|
||||
%% article.
|
||||
\begin{abstract}
|
||||
Cross-Model Pseudo-Labeling is a new framework for generating Pseudo-Labels
|
||||
for supervised learning tasks where only a subset of true labels is known.
|
||||
It builds upon the existing approach of FixMatch and improves it further by
|
||||
using two different sized models complementing each other.
|
||||
Visualizing large multidimensional datasets using classical scatterplots often results in severe overplotting and visual clutter, which obscure data density and hinder the analysis of underlying structures.
|
||||
This paper presents a summarization of a novel density-equalizing technique proposed by Rave, Molchanov, and Linsen, designed to de-clutter scatterplots through a smooth, iterative transformation of the visual domain.
|
||||
The core algorithm utilizes integral images (summed-area tables) to compute a global regularization mapping that redistributes data samples into a nearly uniform configuration.
|
||||
Unlike previous displacement methods that may alter local sample ordering or require expensive collision detection, this approach guarantees the preservation of neighborhood relations and operates deterministically.
|
||||
\end{abstract}
|
||||
|
||||
%%
|
||||
%% Keywords. The author(s) should pick words that accurately describe
|
||||
%% the work being presented. Separate the keywords with commas.
|
||||
\keywords{neural networks, videos, pseudo-labeling, action recognition}
|
||||
\keywords{Scatterplots, Overplotting, Integral Images, Density Equalization}
|
||||
|
||||
%\received{20 February 2007}
|
||||
%\received[revised]{12 March 2009}
|
||||
% @@ -74,172 +77,135 @@  (stray diff hunk marker, commented out)
|
||||
%% information and builds the first part of the formatted document.
|
||||
\maketitle
|
||||
|
||||
\section{Introduction}\label{sec:introduction}
|
||||
For most supervised learning tasks, large numbers of training samples are essential.
|
||||
With too little training data the model will not generalize well and will not fit a real-world task.
|
||||
Labeling datasets is commonly seen as an expensive task and wants to be avoided as much as possible.
|
||||
That is why there is a machine-learning field called semi-supervised learning.
|
||||
The general approach is to train a model that predicts Pseudo-Labels which then can be used to train the main model.
|
||||
\section{Introduction}
|
||||
Scatterplots remain one of the most effective and widely utilized methods for visualizing multidimensional data, allowing analysts to relate pairs of data dimensions to reveal clusters, trends, and outliers.
|
||||
However, as dataset sizes increase, classical scatterplots suffer significantly from scalability issues.
|
||||
When the number of data samples grows, rendering them on a screen with fixed resolution inevitably leads to occlusion and overplotting.
|
||||
This visual clutter negatively impacts the data analysis process by making it difficult to estimate sample density in crowded regions and restricting access to individual data points, thereby impeding detailed exploration.
|
||||
|
||||
The goal of this paper is video action recognition.
|
||||
Given are approximately 10 seconds long videos which should be classified.
|
||||
In this paper datasets with 400 and 101 different classes are used.
|
||||
The proposed approach is tested with 1\% and 10\% of known labels of all data points.
|
||||
To alleviate these issues, visualization research has traditionally employed three main strategies: appearance modification, data reduction, and spatial transformation.
|
||||
Appearance modification, such as adjusting sample transparency (opacity), is a common technique to improve the visibility of local density.
|
||||
While effective for density estimation, it does not resolve the overlap of interaction targets, leaving individual samples inaccessible.
|
||||
Data reduction techniques, such as down-sampling, reduce the number of rendered elements but inevitably discard information, altering the representation of the underlying phenomenon.
|
||||
|
||||
\section{Semi-Supervised learning}\label{sec:semi-supervised-learning}
|
||||
In traditional supervised learning we have a labeled dataset.
|
||||
Each datapoint is associated with a corresponding target label.
|
||||
The goal is to fit a model to predict the labels from datapoints.
|
||||
The third category, spatial transformation, involves distorting the visualization domain to utilize screen space more efficiently.
|
||||
This paper focuses on this domain, specifically addressing the limitations of existing deformation techniques.
|
||||
Many prior spatial distortion methods rely on local collision detection or force-directed layouts, which can be computationally expensive (often $\mathcal{O}(n^2)$ or $\mathcal{O}(n^3)$) and may fail to preserve essential neighborhood relations.
|
||||
Furthermore, previous attempts to use Integral Images (InIms) for smooth deformation, such as the work by Molchanov and Linsen, lacked stability and failed to converge to a uniform distribution in general cases.
|
||||
|
||||
In traditional unsupervised learning there are also datapoints but no labels are known.
|
||||
The goal is to find patterns or structures in the data.
|
||||
Moreover, it can be used for clustering or downprojection.
|
||||
The following summary examines the methodologies and contributions detailed in \emph{De-Cluttering Scatterplots with Integral Images} by Hennes Rave, Vladimir Molchanov, and Lars Linsen.
|
||||
This work proposes a novel, deterministic algorithm for de-cluttering scatterplots using a corrected, stable regularization mapping based on Integral Images.
|
||||
Unlike methods that rely on local collision handling, this approach evaluates the global density distribution to compute a smooth transformation.
|
||||
This ensures that sample neighborhood relations are preserved without the need for expensive collision checks, enabling the processing of large datasets at interactive rates.
|
||||
The authors present a parallel GPU-based implementation for fast computation and introduce visual encodings—such as deformed grids and density textures—to help users interpret the spatial distortions applied to the data~\cite{Rave_2025}.
|
||||
|
||||
Those two techniques combined yield semi-supervised learning.
|
||||
Some of the labels are known, but for most of the data we have only the raw datapoints.
|
||||
The basic idea is that the unlabeled data can significantly improve the model performance when used in combination with the labeled data.
|
||||
\section{Method}
|
||||
The core of the de-cluttering technique is a deterministic, iterative algorithm that uses global density information to redistribute data samples.
|
||||
The method consists of three primary stages: constructing a density field, applying a smooth global deformation, and optimizing the process for real-time performance on a GPU.
|
||||
|
||||
\section{FixMatch}\label{sec:fixmatch}
|
||||
There is an already existing approach called FixMatch.
|
||||
This was introduced in a Google Research paper from 2020~\cite{fixmatch}.
|
||||
The key idea of FixMatch is to leverage the unlabeled data by predicting pseudo-labels out of the known labels.
|
||||
Then both, the known labels and the predicted ones are used side by side to train the model.
|
||||
The labeled samples guide the learning process and the unlabeled samples gain additional information.
|
||||
|
||||
Not every pseudo prediction is kept to train the model further.
|
||||
A confidence threshold is defined to evaluate how ``confident'' the model is about its prediction.
|
||||
The prediction is dropped if the model is not confident enough.
|
||||
The quantity and quality of the obtained labels is crucial and they have a significant impact on the overall accuracy.
|
||||
This means improving the pseudo-label framework as much as possible is essential.
|
||||
|
||||
FixMatch results in some major limitations.
|
||||
It relies on a single model for generating pseudo-labels which can introduce errors and uncertainty in the labels.
|
||||
Incorrect pseudo-labels may affect the learning process negatively.
|
||||
Furthermore, FixMatch uses a comparably small model for label prediction which has a limited capacity.
|
||||
This can negatively affect the learning process as well.
|
||||
%There is no measure defined how certain the model is about its prediction.
|
||||
%Such a measure improves overall performance by filtering noisy and unsure predictions.
|
||||
Cross-Model Pseudo-Labeling tries to address all of those limitations.
|
||||
|
||||
\subsection{Math of FixMatch}\label{subsec:math-of-fixmatch}
|
||||
Equation~\ref{eq:fixmatch} defines the loss-function that trains the model.
|
||||
The sum over a batch size $B_u$ takes the average loss of this batch and should be familiar.
|
||||
The input data is augmented in two different ways.
|
||||
At first there is a weak augmentation $\mathcal{T}_{\text{weak}}(\cdot)$ which only applies basic transformations such as filtering and blurring.
|
||||
Moreover, there is the strong augmentation $\mathcal{T}_{\text{strong}}(\cdot)$ which does cropouts and random augmentations.
|
||||
\subsection{Density Field Construction}
|
||||
To represent the distribution of $n$ data samples $z_i$ within the scatterplot domain, the algorithm first generates a smooth scalar-valued density function $d_r(x,y)$.
|
||||
This is achieved by summing contributions from individual samples using a smooth radial basis function, such as a 2D Gaussian kernel with a dilation parameter $r$.
|
||||
To ensure numerical stability, particularly in ``empty'' regions where the mapping might otherwise become singular, a global constant $d_0$ is added to the density field:
|
||||
|
||||
\begin{equation}
|
||||
\label{eq:fixmatch}
|
||||
\mathcal{L}_u = \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i) \geq \tau) \mathcal{H}(\hat{y}_i,F(\mathcal{T}_{\text{strong}}(u_i)))
|
||||
\label{eq:dfc}
|
||||
d(x,y) = d_r(x,y) + d_0
|
||||
\end{equation}
|
||||
|
||||
The indicator function $\mathbbm{1}(\cdot)$ applies a principle called ``confidence-based masking''.
|
||||
It retains a label only if its largest probability is above a threshold $\tau$.
|
||||
Where $p_i \coloneqq F(\mathcal{T}_{\text{weak}}(u_i))$ is a model evaluation with a weakly augmented input.
|
||||
This constant $d_0$ is typically set to the average number of samples per pixel, representing the theoretical density of a perfectly uniform distribution~\cite{Rave_2025}.
|
||||
|
||||
\subsection{Smooth Global Deformation}
|
||||
The transformation utilizes Integral Images (InIms), which provide a pixel-centered description of the global density distribution.
|
||||
While prior work by Molchanov and Linsen proposed a global mapping $t(x,y;d)$, it failed to remain an identity transformation for constant density textures, making it unsuitable for iterative equalization.
|
||||
|
||||
This work introduces a corrected transformation $t(x,y)$ that subtracts the ``defect'' mapping of a constant texture $d_0$ from the original formula:
|
||||
|
||||
\begin{equation}
|
||||
\label{eq:crossentropy}
|
||||
\mathcal{H}(\hat{y}_i, y_i) = -\sum_{i=1} y_i \cdot \log(\hat{y}_i)
|
||||
\label{eq:sgd}
|
||||
t(x,y) = (x,y) + t(x,y; d) - t(x,y; d_0)
|
||||
\end{equation}
|
||||
|
||||
The second part $\mathcal{H}(\cdot, \cdot)$ is a standard Cross-entropy loss function which takes two inputs, the predicted and the true label.
|
||||
$\hat{y}_i$, the obtained pseudo-label and $F(\mathcal{T}_{\text{strong}}(u_i))$, a model evaluation with strong augmentation.
|
||||
The indicator function evaluates in $0$ if the pseudo prediction is not confident and the current loss evaluation will be dropped.
|
||||
Otherwise it evaluates to 1 and it will be kept and trains the model further.
|
||||
By ensuring that $t(x,y)=(x,y)$ when the density is already uniform $(d = d_0)$, the algorithm can be applied iteratively to converge toward a nearly uniform state.
|
||||
This smooth deformation preserves essential neighborhood relations and the local ordering of data points without requiring expensive collision detection~\cite{Rave_2025}.
|
||||
|
||||
\section{Cross-Model Pseudo-Labeling}\label{sec:cross-model-pseudo-labeling}
|
||||
The newly invented approach of this paper is called Cross-Model Pseudo-Labeling (CMPL)\cite{Xu_2022_CVPR}.
|
||||
Figure~\ref{fig:cmpl-structure} visualizes the structure of CMPL\@.
|
||||
Two different models, a smaller auxiliary model and a larger model are defined.
|
||||
They provide pseudo-labels for each other.
|
||||
The two different models have a different structural bias which leads to complementary representations.
|
||||
This symmetric design yields a boost in performance.
|
||||
The SG label means ``Stop Gradient''.
|
||||
The loss function evaluations are fed into the opposite model as loss.
|
||||
The two models train each other.
|
||||
\subsection{Implementation and Visual Encodings}
|
||||
To maintain interactivity with datasets containing millions of points, the authors developed a parallel GPU-based scheme using compute shaders to calculate InIms in linear $\mathcal{O}(n)$ time.
|
||||
This multi-pass approach computes column and full integrals to determine new sample positions via bi-linear interpolation.
|
||||
Because this spatial distortion alters original distances, the method incorporates critical visual cues to help users maintain context.
|
||||
These include deformed regular grids to show area expansion, density background textures to highlight original cluster locations, and contour lines to define the boundaries of the original data structures~\cite{Rave_2025}.
|
||||
|
||||
\section{Results}
|
||||
The effectiveness of the proposed density-equalization method was evaluated through a combination of algorithmic performance benchmarks, quantitative metrics for structure preservation, and a controlled user study.
|
||||
|
||||
\begin{figure}[h]
|
||||
\begin{figure}[htbp]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{../presentation/rsc/structure}
|
||||
\caption{Architecture of Cross-Model Pseudo-Labeling}
|
||||
\label{fig:cmpl-structure}
|
||||
% First Row
|
||||
\begin{subfigure}[b]{0.2\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{../presentation/rsc/2408.06513v1_page_7_1}
|
||||
\caption{Original Scatterplot}
|
||||
\label{fig:original}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}[b]{0.2\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{../presentation/rsc/2408.06513v1_page_7_2}
|
||||
\caption{Density Estimation}
|
||||
\label{fig:density}
|
||||
\end{subfigure}
|
||||
|
||||
\begin{subfigure}[b]{0.2\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{../presentation/rsc/2408.06513v1_page_7_3}
|
||||
\caption{Result with background texture}
|
||||
\label{fig:grid}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}[b]{0.2\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{../presentation/rsc/2408.06513v1_page_7_4}
|
||||
\caption{Result with contour lines}
|
||||
\label{fig:final}
|
||||
\end{subfigure}
|
||||
|
||||
\caption{MNIST Dataset with CMAP applied and then de-cluttered using InIms~\cite{Rave_2025}}\label{fig:figure}
|
||||
\label{fig:2x2grid}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Math of CMPL}\label{subsec:math}
|
||||
The loss function of CMPL is similar to the one explained above.
|
||||
But we have to differ from the loss generated from the supervised samples where the labels are known and the unsupervised loss where no labels are available.
|
||||
\subsection{Performance and Scalability}
|
||||
A significant contribution of this work is the GPU-accelerated implementation of the integral image-based mapping.
|
||||
By leveraging parallel compute shaders, the algorithm achieves linear time complexity, $\mathcal{O}(n)$, relative to the number of data points.
|
||||
Benchmarks conducted on an NVIDIA GeForce RTX 3080 showed that the entire pipeline---including density estimation, InIm computation, and sample mapping---takes less than 10 milliseconds for datasets with up to $10^6$ points.
|
||||
This performance level enables seamless interactivity within visualization software, allowing users to adjust regularization parameters in real-time.
|
||||
|
||||
The two equations~\ref{eq:cmpl-losses1} and~\ref{eq:cmpl-losses2} are normal Cross-Entropy loss functions generated with the supervised labels of the two separate models.
|
||||
\subsection{Quantitative Comparison}
|
||||
The method was compared against ``Hagrid'', a state-of-the-art grid-based de-cluttering technique.
|
||||
Using metrics such as the preservation of k-nearest neighbors (KNN) and spatial stress, the proposed integral image approach demonstrated superior stability.
|
||||
Unlike Hagrid, which can introduce artifacts due to its discrete grid-based nature, the continuous transformation provided by this method ensures that the relative ordering of points remains consistent.
|
||||
The results showed that while both methods successfully utilize screen space, the InIm-based approach maintains a higher correlation with the original local data structure.
|
||||
|
||||
\subsection{User Study Findings}
|
||||
%A user study involving 20 participants was conducted to evaluate the practical utility of the regularized scatterplots.
|
||||
%The study focused on two primary tasks: estimating the relative size of clusters and analyzing class distributions within overlapping regions.
|
||||
|
||||
\begin{align}
|
||||
\label{eq:cmpl-losses1}
|
||||
\mathcal{L}_s^F &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,F(\mathcal{T}^F_{\text{standard}}(v_i)))\\
|
||||
\label{eq:cmpl-losses2}
|
||||
\mathcal{L}_s^A &= \frac{1}{B_l} \sum_{i=1}^{B_l} \mathcal{H}(y_i,A(\mathcal{T}^F_{\text{standard}}(v_i)))
|
||||
\end{align}
|
||||
%\begin{enumerate}
|
||||
%\item \textbf{Accuracy:} Participants were significantly more accurate at estimating the number of samples within dense clusters when using the regularized view compared to the original scatterplot (even with transparency).
|
||||
%\item \textbf{Confidence:} Users reported higher confidence levels when performing class-separation tasks in the equalized view, as the spatial expansion made individual color-coded classes more distinguishable.
|
||||
%\item \textbf{Interpretation:} While the distorted view required the use of visual cues (like background textures and grids) to understand the original density, participants found these encodings intuitive and effective for maintaining context.
|
||||
%\end{enumerate}
|
||||
|
||||
Equation~\ref{eq:cmpl-loss3} and~\ref{eq:cmpl-loss4} are the unsupervised losses.
|
||||
They are very similar to FixMatch, but it is important to note that the confidence-based masking is applied to the opposite corresponding model.
|
||||
A controlled study involving 20 participants demonstrated that the regularized view significantly improves accuracy in estimating sample counts within dense clusters compared to traditional scatterplots.
|
||||
Users reported higher confidence levels when performing class-separation tasks, as the spatial expansion made color-coded classes more distinguishable.
|
||||
Although the distortion requires the use of visual aids like background textures and grids to maintain context, participants found these encodings intuitive and effective for interpreting the original data distribution.
|
||||
|
||||
\begin{align}
|
||||
\label{eq:cmpl-loss3}
|
||||
\mathcal{L}_u^F &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^A) \geq \tau) \mathcal{H}(\hat{y}_i^A,F(\mathcal{T}_{\text{strong}}(u_i)))\\
|
||||
\label{eq:cmpl-loss4}
|
||||
\mathcal{L}_u^A &= \frac{1}{B_u} \sum_{i=1}^{B_u} \mathbbm{1}(\max(p_i^F) \geq \tau) \mathcal{H}(\hat{y}_i^F,A(\mathcal{T}_{\text{strong}}(u_i)))
|
||||
\end{align}
|
||||
\section{Conclusion}
|
||||
This paper presented a robust and scalable visualization technique for de-cluttering dense scatterplots using density-equalizing transformations.
|
||||
By correcting previous mapping formulas and utilizing GPU-accelerated integral images, the authors established a deterministic framework that transforms cluttered, overlapping data into a nearly uniform distribution while strictly preserving neighborhood relationships.
|
||||
|
||||
Finally to train the main objective a overall loss is calculated by simply summing all the losses.
|
||||
The loss is regulated by a hyperparameter $\lambda$ to enhance the importance of the supervised loss.
|
||||
|
||||
\begin{equation}
|
||||
\label{eq:loss-main-obj}
|
||||
\mathcal{L} = (\mathcal{L}_s^F + \mathcal{L}_s^A) + \lambda(\mathcal{L}_u^F + \mathcal{L}_u^A)
|
||||
\end{equation}
|
||||
|
||||
\section{Architecture}\label{sec:Architecture}
|
||||
The used model architectures depend highly on the task to be performed.
|
||||
In this case the task is video action recognition.
|
||||
A 3D-ResNet50 was chosen for the main model and a smaller 3D-ResNet18 for the auxiliary model.
|
||||
|
||||
\section{Performance}\label{sec:performance}
|
||||
|
||||
In figure~\ref{fig:results} a performance comparison is shown between just using the supervised samples for training against some different pseudo label frameworks.
|
||||
One can clearly see that the performance gain with the new CMPL framework is quite significant.
|
||||
For evaluation the Kinetics-400 and UCF-101 datasets are used.
|
||||
And as a backbone model a 3D-ResNet18 and 3D-ResNet50 are used.
|
||||
Even when only 1\% of true labels are known for the UCF-101 dataset 25.1\% of the labels could be predicted right.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{../presentation/rsc/results}
|
||||
\caption{Performance comparisons between CMPL, FixMatch and supervised learning only}
|
||||
\label{fig:results}
|
||||
\end{figure}
|
||||
|
||||
\section{Further schemes}\label{sec:further-schemes}
|
||||
How the pseudo-labels are generated may impact the overall performance.
|
||||
In this paper the pseudo-labels are obtained by the cross-model approach.
|
||||
But there might be other strategies as well.
|
||||
For example:
|
||||
\begin{enumerate*}
|
||||
\item Self-First: Each network uses just its own prediction if it is confident enough.
|
||||
If not, it uses its sibling net prediction.
|
||||
\item Opposite-First: Each net prioritizes the prediction of the sibling network.
|
||||
\item Maximum: The most confident prediction is leveraged.
|
||||
\item Average: The two predictions are averaged before deriving the pseudo-label
|
||||
\end{enumerate*}.
|
||||
|
||||
Those are just other approaches one can keep in mind.
|
||||
This doesn't mean they are better, in fact they performed even worse in this study.
|
||||
|
||||
\section{Conclusion}\label{sec:conclusion}
|
||||
In conclusion, Cross-Model Pseudo-Labeling demonstrates the potential to significantly advance the field of semi-supervised action recognition.
|
||||
Cross-Model Pseudo-Labeling outperforms the supervised-only approach over several experiments by a multiple.
|
||||
It surpasses most of the other existing pseudo-labeling frameworks.
|
||||
Through the integration of main and auxiliary models, consistency regularization, and uncertainty estimation, CMPL offers a powerful framework for leveraging unlabeled data and improving model performance.
|
||||
It paves the way for more accurate and efficient action recognition systems.
|
||||
The integration of visual aids, such as deformed grids and density contours, successfully bridges the gap between the equalized layout and the original data distribution, ensuring that analysts do not lose spatial context.
|
||||
The quantitative results and user study confirm that this approach significantly improves the readability of large datasets without sacrificing the performance required for interactive exploratory analysis.
|
||||
Future work may explore extending this density-equalizing mapping to 3D scatterplots or integrating it into automated dimensionality reduction pipelines to better visualize manifold structures.
|
||||
|
||||
%%
|
||||
%% The next two lines define the bibliography style to be used, and
|
||||
|
||||
% @@ -1,16 +1,12 @@  (stray diff hunk marker, commented out)
|
||||
@InProceedings{Xu_2022_CVPR,
|
||||
author = {Xu, Yinghao and Wei, Fangyun and Sun, Xiao and Yang, Ceyuan and Shen, Yujun and Dai, Bo and Zhou, Bolei and Lin, Stephen},
|
||||
title = {Cross-Model Pseudo-Labeling for Semi-Supervised Action Recognition},
|
||||
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
|
||||
month = {June},
|
||||
year = {2022},
|
||||
pages = {2959-2968}
|
||||
}
|
||||
|
||||
@online{fixmatch,
|
||||
author = "Kihyuk Sohn, David Berthelot, Chun-Liang Li",
|
||||
title = "FixMatch: Simplifying Semi-Supervised Learning with Consistency and Confidence",
|
||||
url = "https://arxiv.org/abs/2001.07685",
|
||||
addendum = "(accessed: 20.03.2023)",
|
||||
keywords = "FixMatch, semi-supervised"
|
||||
}
|
||||
@article{Rave_2025,
|
||||
title={De-Cluttering Scatterplots With Integral Images},
|
||||
volume={31},
|
||||
ISSN={2160-9306},
|
||||
url={http://dx.doi.org/10.1109/TVCG.2024.3381453},
|
||||
DOI={10.1109/tvcg.2024.3381453},
|
||||
number={4},
|
||||
journal={IEEE Transactions on Visualization and Computer Graphics},
|
||||
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
|
||||
author={Rave, Hennes and Molchanov, Vladimir and Linsen, Lars},
|
||||
year={2025},
|
||||
month=apr, pages={2114–2126} }
|
||||
|
||||