Commit 4017a588 authored by Christoph Lampert

now it compiled

parent 42b5ee08
......@@ -33,7 +33,7 @@
%\usepackage{times}\usefonttheme{professionalfonts} % obsolete
%\usefonttheme[onlymath]{serif}
\boldmath
\usepackage[orientation=portrait,size=a0,scale=1.4, debug]{beamerposter}
\usepackage[orientation=landscape,size=a1,scale=1.4, debug]{beamerposter}
% change list indention level
% \setdefaultleftmargin{3em}{}{}{}{}{}
......@@ -72,7 +72,7 @@
%\title{\LARGE iCaRL: incremental Classifier and Representation Learning}
\title{Computer Vision and Machine Learning}
\author{}
\author{~}
\institute{\vskip-.5\baselineskip\large Institute of Science and Technology (IST) Austria, 3400 Klosterneuburg, Austria}
%\institute{~}%Christoph Lampert} %\textsuperscript{1} ENS Rennes (Ecole Normale Sup\'{e}rieure de Rennes), Rennes, France \textsuperscript{2} IST Austria (Institute of Science and Technology Austria), Klosterneuburg, Austria}
......@@ -80,7 +80,7 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newlength{\columnheight}
\setlength{\columnheight}{95cm}
\setlength{\columnheight}{40cm}
\setlength{\columnsep}{1cm}
\renewcommand{\P}{\pmb{\mathbb{P}}}
......@@ -139,8 +139,14 @@
\vspace*{-1.5cm}
\begin{block}{\Large People}
\newcommand{\peopleheight}{6cm}
\vskip-1cm
\begin{columns}[t]
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% First Column
\ \ \ \begin{column}{.49\textwidth}
\begin{block}{\Large Our Research}
\newcommand{\peopleheight}{4cm}
\begin{center}
\includegraphics[height=\peopleheight{}]{people/clampert.jpg} ~~
\includegraphics[height=\peopleheight{}]{people/akolesnikov-new.jpg} ~~
......@@ -152,136 +158,59 @@
\includegraphics[height=\peopleheight{}]{people/gsperl.jpg} ~~
\includegraphics[height=\peopleheight{}]{people/azimin.jpg} ~~
\end{center}
\end{block}
\vskip-1cm
\begin{columns}[t]
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% First Column
\ \ \ \begin{column}{.49\textwidth}
\begin{block}{\Large Multi-Task Learning}
\bigskip
Object categorization methods are trained to recognize \textbf{1000s of classes}:
\medskip
\centerline{\includegraphics[width=.9\textwidth]{2010-imagenet}}%
\medskip
\blue{Standard training requires:}
\begin{itemize}
\item random order access to 100s of GB of training data,
\item many days to weeks of training time.
\end{itemize}
\bigskip
\medskip
\centerline{\includegraphics[width=.9\textwidth]{2010-imagenet-new}}%
\medskip
\end{block}
\blue{What if a few new classes need to be included?}
\begin{itemize}
\item training must be re-run for all classes
\begin{block}{\Large Example: Fine-tuning}
$\rightarrow$ huge computational cost, all training data must be kept around {\color{orange}\large \Frowny{}}
\end{itemize}
\begin{center}
yes%\includegraphics{dummy} ~~
\end{center}
\bigskip
Potential solution: \bblue{class-incremental learning}
\end{block}
\begin{block}{\Large Example: Distillation}
\vskip4\blockskip
\begin{block}{\Large Conditional Risk}
\textbf{Fixed data representation:}
\begin{itemize}
\item retrain classifiers on data subset with biased regularization {\scriptsize [Kuzborskij \etal, 2013]}
\item represent classes by \blue{mean feature vectors} {\scriptsize [Mensink \etal, 2012], [Ristin \etal, 2014]}
\end{itemize}
\textbf{Learning the data representation:}
\begin{itemize}
\item grow neural network incrementally, fixing parts that are responsible for earlier class decisions
{\scriptsize [Mandziuk, Shastri. 1998], \dots, [Rusu \etal, 2016]}
\item continuously generate patterns to prevent forgetting {\scriptsize [Ans, Rousset. 2000]}
\begin{center}
yes%\includegraphics{dummy} ~~
\end{center}
\item multi-task setting: preserve network activations by \blue{distillation} {\scriptsize [Li, Hoiem. 2016]}
\end{itemize}
\end{block}
\vskip4\blockskip
\begin{block}{\Large iCaRL} % {\scriptsize [arXiv \dots]}}
We incrementally learn \blue{classifiers and features} with a fixed-size network.
\end{column}
%
%Notation:
\begin{itemize}
\item $f^1(x),\dots,f^T(x)$: probabilistic network outputs for (up to) $T$ classes
\item $\bphi$: (current) feature representation defined by the network
%\item $K$: size of extra memory that we can use to store images/exemplars
\item $t$: number of classes observed so far
\item $P^y=(p^y_1,\dots,p^y_{m})$: set of exemplar images for class $y=1,\dots,t$
\end{itemize}
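%
% A minimal Python sketch of the state implied by the notation above (the class
% name ICaRLState and its attribute names are illustrative, not from the iCaRL code):
%
%   class ICaRLState:
%       """Quantities tracked by the incremental learner."""
%       def __init__(self, phi, T):
%           self.phi = phi        # feature representation \bphi (network up to the feature layer)
%           self.T = T            # maximum number of class outputs f^1, ..., f^T
%           self.t = 0            # number of classes observed so far
%           self.exemplars = []   # exemplars[y] = P^y, list of stored exemplar images
%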
\bigskip
\bblue{iCaRL component 1: exemplar-based classification.}
\begin{itemize}
\item a new image, $x$, is classified by the \blue{nearest-mean-of-exemplars} rule
$$ y^\ast = \operatorname*{argmin}_{y=1,\dots,t} \Big\|\bphi(x) - \mu^y\Big\|
\qquad\text{for }\
\mu^y=\frac{1}{m}\sum_{j=1}^{m}\bphi(p^y_j).$$
\end{itemize}
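%
% A minimal NumPy sketch of the nearest-mean-of-exemplars rule above, assuming
% phi(x) returns a feature vector (function and argument names are illustrative):
%
%   import numpy as np
%
%   def classify(x, phi, exemplars):
%       """Assign x to the class whose exemplar mean mu^y is closest to phi(x)."""
%       feat = phi(x)
%       means = [np.mean([phi(p) for p in P_y], axis=0) for P_y in exemplars]
%       dists = [np.linalg.norm(feat - mu) for mu in means]
%       return int(np.argmin(dists))   # y* among the t observed classes
%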
\ \ \begin{column}{.495\textwidth}
\bigskip
\bblue{iCaRL component 2: representation learning.}
For new data $X^y=\{x^y_1,\dots,x^y_{n_y}\}$ of classes $y=t\!+\!1,\dots,t'$
\begin{itemize}
\item create a training set from the new training examples and the stored exemplars
$$\mathcal{D} \leftarrow \bigcup_{y=t+1,\dots,t'}\!\!\!\{(x,y) : x\in X^y\}
\ \cup\!\!\bigcup_{y=1,\dots,t} \!\!\!\{(x,y) : x\in P^y\} $$
\item for all $x_i\in\mathcal{D}$, store network outputs $a^y_i = f^y(x_i)$ of classes $y=1,\dots,t$
\item update the network parameters, $\theta$, using BackProp on loss function
$$\ell(\theta) = -\!\!\!\!\!\!\sum_{(x_i,y_i)\in\mathcal{D}}\!\!\!\![\ \log( f^{y_i}(x_i;\theta))+\sum_{y=1}^{t}\, a^y_i\log( f^y(x_i;\theta) )\ ].$$
\end{itemize}
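%
% A minimal NumPy sketch of the loss above, assuming the probabilistic outputs
% f^y(x_i; theta) for a batch are given as an array (names are illustrative):
%
%   import numpy as np
%
%   def icarl_loss(outputs, labels, old_outputs, t):
%       """outputs:     (n, T) current outputs f^y(x_i; theta)
%          labels:      (n,)   ground-truth labels y_i
%          old_outputs: (n, t) stored outputs a^y_i of the t old classes
%       """
%       n = outputs.shape[0]
%       cls_term  = np.log(outputs[np.arange(n), labels])            # classification term
%       dist_term = np.sum(old_outputs * np.log(outputs[:, :t]), 1)  # distillation term
%       return -np.sum(cls_term + dist_term)
%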
\begin{block}{\Large Example: Class-incremental learning}
\begin{center}
yes%\includegraphics{dummy} ~~
\end{center}
\bigskip
\bblue{iCaRL component 3: exemplar selection.}
Object categorization methods are trained to recognize \textbf{1000s of classes}:
When the number of observed classes increases from $t$ to $t'$: set $m'=\frac{K}{t'}$.
\begin{itemize}
\item for classes $y=1,\dots,t$, keep exemplars $p^y_{1},\dots,p^y_{m'}$, discard others % $p^j_{m'_j+1},\dots,p^j_{m_j}$
\item for classes $y=t\!+\!1,\dots,t'$, find new exemplars $p^y_{1},\dots,p^y_{m'}$ as
\begin{align*}
p^y_k \leftarrow\!\argmin\limits_{x\in X^y} \Big\| \frac{1}{n_y}\sum_{i=1}^{n_y}\!\!\bphi(x^y_i) - \frac{1}{k}\Big[\bphi(x)+\sum_{j=1}^{k-1}\!\!\bphi(p^y_j)\Big] \Big\|
\ \text{for $k=1,\dots,m'$}.
\end{align*}
%\begin{align*}
%\operatorname*{argmin}_{|I|=m'_j} \Big\| \frac{1}{n_j}\sum_{i=1}^{n_j}\bphi(x^j_i) - \frac{1}{m'_j}\sum_{i\in I}\bphi(x^j_i) \Big\|
%\end{align*}
\end{itemize}
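%
% A minimal NumPy sketch of the greedy exemplar selection above for one new
% class, with m_new = m' = K/t' (function and variable names are illustrative):
%
%   import numpy as np
%
%   def select_exemplars(X_y, phi, m_new):
%       """Greedily pick exemplars whose running feature mean tracks the class mean."""
%       feats = np.stack([phi(x) for x in X_y])   # \bphi(x^y_i) for all n_y images
%       class_mean = feats.mean(axis=0)
%       exemplars, chosen = [], []
%       for k in range(1, m_new + 1):
%           partial = np.sum(chosen, axis=0) if chosen else 0.0
%           dists = np.linalg.norm(class_mean - (feats + partial) / k, axis=1)
%           best = int(np.argmin(dists))
%           exemplars.append(X_y[best]); chosen.append(feats[best])
%       return exemplars
%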
\end{block}
%
%
%\STATE // form combined training set:
%%
%\STATE // store network outputs with pre-update parameters:
%\FOR{$j=1,\dots,s-1$}
%\STATE $a^j_i\leftarrow f^j(x_i)$ \quad for all $(x_i,\cdot)\in\D$
%\ENDFOR
%%\STATE run network training (\eg BackProp) with loss function
%\begin{align*}
%\ell(\Theta) = -\!\!\!\!\!\!\sum_{(x_i,y_i)\in\D}&\!\!\!\!\!\!\big[\log( f^{y_i}(x_i))+\sum_{j=1}^{s-1}\, a^j_i\log( f^j(x_i) )\big]
%\end{align*}
%\textcolor{red}{\textbf{is this the right way to handle EXEMPLARS???}}
\begin{block}{\Large Example: Multi-task Learning}
\end{column}
%
\begin{center}
yes%\includegraphics{dummy} ~~
\end{center}
\bigskip
Object categorization methods are trained to recognize \textbf{1000s of classes}:
\ \ \begin{column}{.495\textwidth}
\begin{block}{\Large Multi-output Distillation}
\end{block}
\begin{block}{\Large Example: Multi-output Distillation}
\textbf{Situation:}
\begin{itemize}
\item classes appear sequentially (or in batches) % $c_1,c_2,\dots,c_T$
......@@ -294,96 +223,8 @@ p^y_k \leftarrow\!\argmin\limits_{x\in X^y} \Big\| \frac{1}{n_y}\sum_{i=1}^{n_y}
\item for any number of observed classes, $t$, learn a multi-class classifier% for $c_1,\dots,c_t$
\item store a certain number, $K$, of images (a few hundred to a few thousand)
\end{itemize}
\bigskip
\textbf{We do not want to/we cannot:}
\begin{itemize}
\item retrain from scratch whenever new classes become available
\item store all training examples (could be millions)
\end{itemize}
\bigskip
\textbf{The dilemma:}
\begin{itemize}
\item \textcolor{blue}{fixing the data representation}: suboptimal results on new classes. % {\tiny [Tommasi \etal, 2015]}.
\item \textcolor{blue}{continuously improving the representation}: classifiers for earlier classes deteriorate over time
("catastrophic forgetting/interference").{\scriptsize [McCloskey, Cohen. 1989]} % (catastrophic forgetting) {\tiny []}.
% Ratcliff, R. (1990) Connectionist models of recognition memory: Constraints imposed by learning and forgetting functions. Psychological Review,97, 285-308
\end{itemize}
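%
% A minimal Python sketch of the class-incremental protocol described above;
% the model methods are hypothetical placeholders, not an actual API:
%
%   def run_protocol(class_batches, model, K, evaluate):
%       """Classes arrive in batches; after each batch, classify all classes
%       seen so far while storing at most K exemplar images in total."""
%       seen = 0
%       for batch in class_batches:                      # e.g. 10 new classes at a time
%           model.update_representation(batch)           # uses new data + stored exemplars
%           seen += len(batch)
%           model.manage_exemplars(per_class=K // seen)  # fixed total budget K
%           evaluate(model, num_classes=seen)            # multi-class accuracy over all seen classes
%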
\end{block}
\vskip4\blockskip
\begin{block}{\Large Flexible Fine-tuning}
\vskip4\blockskip
\mbox{
\parbox{.48\textwidth}{
\!\!\!\textbf{CIFAR-100:}
\begin{itemize}
\item 100 classes, in batches of 10
\item 32-layer ResNet {\scriptsize [He \etal, 2015]}
\item evaluated by top-1 accuracy
\item number of exemplars: 2000
\end{itemize}
}
\parbox{.48\textwidth}{
\textbf{ImageNet ILSVRC 2012 (subset):}
\begin{itemize}
\item 100 classes, in batches of 10
\item 18-layer ResNet {\scriptsize [He \etal, 2015]}
\item evaluated by top-5 accuracy
\item number of exemplars: 2000
\end{itemize}
}}
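%
% The two evaluation protocols above, recorded as plain Python config dicts
% (keys are illustrative):
%
%   BENCHMARKS = {
%       "CIFAR-100":            dict(classes=100, classes_per_batch=10,
%                                    backbone="ResNet-32", metric="top-1", exemplars=2000),
%       "ILSVRC-2012 (subset)": dict(classes=100, classes_per_batch=10,
%                                    backbone="ResNet-18", metric="top-5", exemplars=2000),
%   }
%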
\vskip4\blockskip
\textbf{Baselines:}
\begin{itemize}
\item fixed representation: freeze representation after first batch of classes
\item finetuning: ordinary NN learning, finetune whenever new classes come in
\item LwF: \emph{``Learning without Forgetting''} {\scriptsize [Li, Hoiem. 2016]}, classify using the network outputs themselves
%\item LwF+proto: like LwF, but with prototypes used for representation learning
\item iNCM: like iCaRL, but store all images and classify with true class means
\end{itemize}
\end{block}
\vskip4\blockskip
\begin{block}{\Large Summary}
\begin{itemize}
\item iCaRL learns incrementally with a fixed memory footprint
\item much better results than baselines, on par with (intractable) iNCM
\end{itemize}
\end{block}
\bigskip\hrule\medskip\tiny
%[Thrun \etal, "Learning one more thing", \dots]
[Ans, Rousset. \emph{``Neural networks with a self-refreshing memory: Knowledge transfer in sequential learning tasks without catastrophic forgetting''}, Connection Science 12(1), 2000]
[He, Zhang, Ren, Sun. \emph{``Deep residual learning for image recognition''}, arXiv:1512.03385, 2015]
[Hinton, Vinyals, Dean. \emph{``Distilling the Knowledge in a Neural Network''}, NIPS Workshop on Deep Learning, 2014]
[Kuzborskij, Orabona, Caputo. \emph{``From N to N+1: Multiclass transfer incremental learning''}, CVPR 2013]
[Mandziuk, Shastri. \emph{``Incremental class learning approach and its application to handwritten digit recognition''}, Information Sciences, 2002]
[McCloskey, Cohen. \emph{``Catastrophic interference in connectionist networks: The sequential learning problem''}, The Psychology of Learning and Motivation, 1989]
[Mensink, Verbeek, Perronnin, Csurka. \emph{``Distance-based image classification: Generalizing to new classes at near-zero cost''}, TPAMI 2013]
[Li, Hoiem. \emph{``Learning without forgetting''}, ECCV 2016]
[Ristin, Guillaumin, Gall, van Gool. \emph{``Incremental learning of NCM forests for large-scale image classification''}, CVPR 2014]
[Rusu, Rabinowitz, Desjardins, Soyer, Kirkpatrick, Kavukcuoglu, Pascanu, Hadsell. \emph{``Progressive neural networks''}, arXiv:1606.04671 [cs.LG], 2016]
%[Ans, "Sequential Learning in Distributed Neural Networks without Catastrophic Forgetting: A Single and Realistic Self-Refreshing Memory Can Do It", Neural Information Processing--Letters and Reviews. 2004]
%[He, Zhang, Ren, Sun. Deep residual learning for image recognition. CVPR 2016]
%[Rocco De Rosa, Thomas Mensink, and Barbara Caputo, "Online Open World Recognition"] arXiv
\end{column}
\end{columns}
......