% poster/neurips.tex

% arara: lualatex: { shell: true }
% arara: lualatex: { shell: true }
% arara: lualatex: { shell: true, synctex: true }

% A2  : 420 x 594 mm    |
% 2A0 : 1189 x 1682 mm  > Factor 2.83 => 11pt ~ 31 pt
% 1m  : 1000 x 1414     > Factor 2.38 => 12pt ~ 28 pt

% A3  : 297 x 420 mm    |
% 2A0 : 1189 x 1682 mm  > Factor 4 => 11pt ~ 44 pt
% 1m  : 1000 x 1414     > Factor 3.36 => 10pt ~ 34 pt
\PassOptionsToPackage{force}{filehook} % see https://tex.stackexchange.com/questions/513051/filehook-error-with-memoir-after-update-texlive-2019-in-oct-15
\documentclass[10pt]{article}

\usepackage{luatex85}

% layout
\usepackage[a3paper,landscape]{geometry}

% math support
\usepackage{mathtools,amssymb}

% fonts
% microtype is loaded with factor=0, i.e. without character protrusion.
\RequirePackage[factor=0]{microtype} % no protrusion
\usepackage{unicode-math}
% Enable TeX-style input ligatures for all fonts selected below.
\defaultfontfeatures{Ligatures=TeX}
% Main text font: use the Berling files in ./fonts/ if they are available,
% otherwise fall back to Libertinus Serif.
\IfFileExists{fonts/Berling.otf}{%
  % load fonts of official UU design
  % (in the *Font options, `*` stands for the font name given to \setmainfont,
  % so the files looked up are Berling-Bold.otf, Berling-Italic.otf, ...)
  \setmainfont{Berling}[%
  Path=./fonts/,
  Extension=.otf,
  BoldFont=*-Bold,
  ItalicFont=*-Italic,
  BoldItalicFont=*-BoldItalic]
}{%
  \setmainfont{Libertinus Serif}
}
% Sans-serif, monospace, and math fonts always come from the Libertinus family.
\setsansfont{Libertinus Sans}
\setmonofont{Libertinus Mono}
\setmathfont{Libertinus Math}

\usepackage{bm}

% language support
\usepackage{polyglossia}
\setdefaultlanguage{english}
\usepackage{csquotes}

% better looking tables
\usepackage{booktabs}

% colors
\usepackage[CMYK]{xcolor}
\usepackage{UUcolorPantone}

% \hl{<content>}: highlight <content> in bold (text via \bfseries, math via
% \boldmath) and in the colour uured (presumably provided by UUcolorPantone).
% The \begingroup...\endgroup pair keeps the font and colour changes local.
\newcommand{\hl}[1]{\begingroup\bfseries\boldmath\color{uured}#1\endgroup}

% graphics
\usepackage{graphicx}
\usepackage{svg}
\svgpath{{./figures/}}

% captions
\usepackage{caption,subcaption}
\captionsetup{font=scriptsize}

% fancy lists
\usepackage{enumitem}
\setlist{leftmargin=*,itemsep=0pt}
\setlist[itemize,1]{label={\color{uured}$\blacktriangleright$}}

% hyperlinks
% hidelinks: keep the links created by \cref clickable, but draw neither
% coloured frames nor coloured text for them on the poster.
\usepackage[hidelinks]{hyperref}

% boxes
\usepackage[poster,xparse,raster]{tcolorbox}

% poster settings
% Global configuration of the tcbposter environment used in the document body.
\tcbposterset{
  % coverage: options of the box that provides the poster background
  % (here with a white interior)
  coverage =
  {
    spread,
    interior style={color=white},
  },
  % poster: layout grid; the \posterbox placements below refer to these
  % 10 columns (column=..., span=...)
  poster =
  {
    columns=10,
    rows=1,
  },
  % boxes: default options shared by all poster boxes; individual boxes
  % override e.g. colback in their own option lists
  boxes =
  {
    enhanced standard jigsaw,
    sharp corners=downhill,
    arc=3pt,
    boxrule=1pt,
    lower separated=false,
    % colors
    coltext=black,
    colback=white,
    colframe=black,
    coltitle=black,
    colbacktitle=uulightgrey,
    % fonts
    fonttitle=\bfseries\large,
    % subtitles
    % (style of the headings produced by \tcbsubtitle inside the boxes)
    subtitle style=
    {
      frame empty,
      hbox,
      rounded corners=east,
      arc=8pt,
      coltext=white!50!uulightgrey,
      colback=black!10!uudarkgrey,
    },
  }
}

% plots
% pgfplots (with pgfplotstable) and the TikZ libraries used by the figures of
% this poster; compat=1.16 pins the pgfplots behaviour to that version.
\usepackage{pgfplots,pgfplotstable}
\pgfplotsset{compat=1.16}
\usetikzlibrary{positioning,arrows,arrows.meta,calc,decorations.markings,intersections,patterns}

% groupplots: grouped axes; fillbetween: `fill between` as used for the
% shaded p-value area in the calibration test illustration
\usepgfplotslibrary{groupplots,fillbetween}
\usetikzlibrary{plotmarks}

% Use colorbrewer
% (makes the Dark2 palette and its colours Dark2-A, Dark2-B, ... available)
\usepgfplotslibrary{colorbrewer}
\pgfplotsset{
  % initialize Dark2-8:
  cycle list/Dark2-8,
  % combine it with 'mark list*':
  cycle multiindex* list={
    mark list*\nextlist
    Dark2-8\nextlist
  },
}

% plotting options
% every axis gets a light grey background
\pgfplotsset{every axis/.append style={axis background style={fill=gray!10}}}

% automatic references
\usepackage{cleveref}

% notation shortcuts (all intended for math mode)

% probability measure, expectation, and transpose of the argument
\newcommand*{\Prob}{\mathbb{P}}
\newcommand*{\Expect}{\mathbb{E}}
\newcommand*{\transpose}[1]{{#1}^{\mathsf{T}}}

% distributions
\newcommand*{\Dir}{\mathup{Dir}}
\newcommand*{\Categorical}{\mathup{Cat}}

% calibration errors: expected (ECE), general (CE), kernel (KCE), and
% squared kernel (SKCE) calibration error
\newcommand*{\ECE}{\mathup{ECE}}
\newcommand*{\measure}{\mathup{CE}}
\newcommand*{\kernelmeasure}{\mathup{KCE}}
\newcommand*{\squaredkernelmeasure}{\mathup{SKCE}}

% estimators of the SKCE, distinguished by their subscript (b, uq, ul)
\newcommand*{\biasedestimator}{\widehat{\squaredkernelmeasure}_{\mathup{b}}}
\newcommand*{\unbiasedestimator}{\widehat{\squaredkernelmeasure}_{\mathup{uq}}}
\newcommand*{\linearestimator}{\widehat{\squaredkernelmeasure}_{\mathup{ul}}}

% metadata
\title{Calibration tests in multi-class classification:\\ A unifying framework}
\author{David Widmann$^\star$ Fredrik Lindsten$^\ddagger$ Dave Zachariah$^\star$}
\date{}

% The poster reads its metadata from the keys /my poster/<name> via
% \pgfkeysvalueof. Title, author, and date point to the internal macros filled
% by \title, \author, and \date above (hence the \makeatletter); they are
% stored unexpanded and only expand when the keys are used.
\makeatletter
\pgfkeyssetvalue{/my poster/title}{\@title}
\pgfkeyssetvalue{/my poster/author}{\@author}
\pgfkeyssetvalue{/my poster/date}{\@date}
\makeatother

% Affiliations (matching the author marks) and contact addresses.
\pgfkeyssetvalue{/my poster/institute}{$^\star$Department of Information Technology, Uppsala University, Sweden $^\ddagger$Division of Statistics and Machine Learning, Linköping University, Sweden}
\pgfkeyssetvalue{/my poster/contact}{david.widmann@it.uu.se fredrik.lindsten@liu.se dave.zachariah@it.uu.se}

\pagestyle{empty}

\begin{document}
\begin{tcbposter}

  % title
  % Frameless box at the top of the poster, spanning columns 1-6. The underlay
  % draws the two logos: LiU is attached to the top right corner of the box
  % frame, UU is placed to the left of it.
  \posterbox[blankest,interior engine=path,halign=left,valign=top,
  underlay =
  {%
    \node[below left,inner sep=0pt,outer sep=0pt] (LiU) at (frame.north east) {\includegraphics[width=3.5cm]{figures/logos/LiU.pdf}};%
    \node[inner sep=0pt,outer sep=0pt,below left=0mm and 5mm of LiU.north west] (UU) {\includegraphics[width=2.5cm]{figures/logos/UU.pdf}};%
  }]{name=title,column=1,span=6,below=top}{%
    % title, authors, and affiliations, read from the /my poster/... keys
    \Huge\textbf{\pgfkeysvalueof{/my poster/title}}\\[1ex]
    \large\pgfkeysvalueof{/my poster/author}\\[1ex]
    \normalsize\pgfkeysvalueof{/my poster/institute}%
  }%

  % footline
  % Frameless strip across all 10 columns at the bottom of the poster, set in
  % small monospace type: date on the left, contact addresses on the right.
  \posterbox[blankest,top=2pt,bottom=2pt,valign=center,fontupper=\ttfamily\small,interior engine=path,interior style={color=uumidgrey}%
  ]{name=footline,column=1,span=10,above=bottom}{%
    \pgfkeysvalueof{/my poster/date}\hfill\pgfkeysvalueof{/my poster/contact}%
  }%

  \posterbox[adjusted title={Motivation -- what is a calibrated model?}, colback=blondsvag]{name=calibration,column=3,span=4,below=title}{
    \begin{tcolorbox}[colback=blondstark]
      \begin{center}
        A \hl{calibrated model} yields predictions consistent with empirically observed frequencies.
      \end{center}
    \end{tcolorbox}

    \tcbsubtitle{Collision detection system}

    Consider a model that predicts if there is an object, a human, or an animal ahead of a car.

    \begin{minipage}[c]{0.57\linewidth}
        \begin{center}
          % Pipeline for a single input: image -> model -> predicted probabilities.
          \begin{tikzpicture}
            % input image with its label above
            \node[draw, inner sep=2mm] (image) at (0, 0) {\includesvg[height=8mm]{car}};
            \node[above=2mm of image, anchor=base, font=\scriptsize] {Input $X$};

            % model, placed to the right of the input
            \node[draw, fill=gronskasvag, right=0.75cm of image, inner sep=2mm] (model)
            {\includesvg[height=8mm]{gear}};
            \node[above=2mm of model, anchor=base, font=\scriptsize] {Model $g$};
            \draw [->] (image) -- (model);

            % prediction: one column per class icon with the predicted probability below
            \node[draw, right=0.75cm of model, minimum height=1.2cm, font=\scriptsize, align=center] (prediction)
            {\begin{tabular}{@{}ccc@{}}
               \includesvg[width=6mm]{barrier} & \includesvg[width=6mm]{pedestrian} & \includesvg[width=6mm]{bear} \\
               80\% & 0\% & 20\% \\
             \end{tabular}};
           \node[above=2mm of prediction, anchor=base, font=\scriptsize] {Prediction $g(X) \in \Delta^m$};
           \draw [->] (model) -- (prediction);
         \end{tikzpicture}
       \end{center}
     \end{minipage}%
     \begin{minipage}[c]{0.43\linewidth}
       We use $m$ for the number of classes, and
       $\Delta^m \coloneqq \{ z \in [0,1]^m \colon \|z\|_1 = 1\}$ for the
       $(m-1)$-dimensional probability simplex.
     \end{minipage}\vspace*{\baselineskip}

     If the model is calibrated we know that for all inputs with this
     prediction there is an object ahead 80\% of the time, a human 0\%
     of the time, and an animal 20\% of the time.

     \begin{center}
       % Several inputs with the same prediction: the predicted probabilities are
       % compared with the empirically observed class frequencies.
       \begin{tikzpicture}
         % grid of input images (car0 ... car4, followed by dots)
         \node[minimum height=1.2cm, inner sep=2mm] (image) at (0, 0)
         {\begin{tabular}{@{}ccc@{}}
            \includesvg[height=3mm]{car0} & \includesvg[height=3mm]{car1} & \includesvg[height=3mm]{car2} \\
            \includesvg[height=3mm]{car3} & \includesvg[height=3mm]{car4} & $\cdots$ \\
          \end{tabular}};

        % model
        \node[draw, fill=gronskasvag, right=0.75cm of image, inner sep=2mm] (model)
        {\includesvg[height=8mm]{gear}};
        \draw [->] (image) -- (model);

        % common prediction of all inputs
        \node[draw, right=0.75cm of model, minimum height=1.2cm, font=\scriptsize, align=center] (prediction)
        {\begin{tabular}{@{}ccc@{}}
           \includesvg[width=6mm]{barrier} & \includesvg[width=6mm]{pedestrian} & \includesvg[width=6mm]{bear} \\
           80\% & 0\% & 20\% \\
         \end{tabular}};
        \draw [->] (model) -- (prediction);

        % observed classes: four of the five shown inputs fall under the barrier
        % column, none under pedestrian, and one under bear
        \node[right=1cm of prediction] (empirical)
        {\begin{tabular}{@{}cccccc@{}} \toprule
           \multicolumn{4}{c}{\includesvg[width=3mm]{barrier}} & \includesvg[width=3mm]{pedestrian} & \includesvg[width=3mm]{bear} \\ \midrule
           \includesvg[height=3mm]{car0} & \includesvg[height=3mm]{car2} & \includesvg[height=3mm]{car3} & \includesvg[height=3mm]{car4} & & \includesvg[height=3mm]{car1} \\
           $\vdots$ & $\vdots$ & $\vdots$ & $\vdots$ & & $\vdots$ \\ \bottomrule
         \end{tabular}};
        % labels; the prediction label is aligned vertically with label (A)
        \node[above=2mm of empirical, anchor=base, font=\scriptsize] (A) {Empirical frequency $r(g(X)) \in \Delta^m$};
        \node[font=\scriptsize] at (prediction |- A) {Prediction $g(X) \in \Delta^m$};

        % large red equality sign halfway between prediction and empirical frequency
        \path (prediction) -- node [font=\boldmath\Huge, color=uured, align=center, midway] {$=$} (empirical);
      \end{tikzpicture}
    \end{center}
}

  \posterbox[adjusted title={Quantifying calibration -- a unifying framework}, colback=gryningmellan]{name=error,column=1,span=3,between=calibration and footline}{
    \begin{tcolorbox}[colback=blondstark]
      We define the \hl{calibration error}~($\measure$) of model $g$ with respect to a class $\mathcal{F}$ of functions $f \colon \Delta^m \to \mathbb{R}^m$ as
      \begin{equation*}
        \measure[\mathcal{F}, g] \coloneqq \sup_{f \in \mathcal{F}} \Expect\left[\transpose{(r(g(X)) - g(X))} f(g(X)) \right],
      \end{equation*}
      where $r(g(X)) \in \Delta^m$ is the empirical frequency of prediction $g(X)$.
    \end{tcolorbox}

    By design, if model $g$ is calibrated then the $\measure$ is zero, regardless of $\mathcal{F}$.

    \tcbsubtitle{Kernel calibration error}

    \begin{tcolorbox}[colback=blondstark]
      We define the \hl{kernel calibration error} ($\kernelmeasure$)
      of model $g$ with respect to a kernel
      $k \colon \Delta^m \times \Delta^m \to \mathbb{R}^{m \times m}$ as
      \begin{equation*}
        \kernelmeasure[k, g] \coloneqq \measure[\mathcal{F}, g],
      \end{equation*}
      where $\mathcal{F}$ is the unit ball in the reproducing kernel
      Hilbert space corresponding to $k$.
    \end{tcolorbox}

    If $k$ is a universal kernel, then the $\kernelmeasure$ is zero if
    and only if $g$ is calibrated.

    \tcbsubtitle{Relation to existing measures}
    \begin{itemize}
    \item For common distances $d$ the expected calibration error ($\ECE$)
      \begin{equation}\label{eq:ece}
        \ECE[d, g] = \Expect[d(r(g(X)), g(X))]
      \end{equation}
      can be formulated as a $\measure$.

    \item The framework captures the maximum mean calibration error as well.
    \end{itemize}
  }

  \posterbox[adjusted title=The paper in 30 seconds, colback=blondmellan]{name=summary,column=1,span=2,between=title and error}{
    \begin{itemize}
    \item We propose a \hl{unifying framework} of calibration errors
      that allows us to derive a new \hl{kernel calibration error} with
      \hl{unbiased and consistent estimators}.
    \item Calibration error estimates are not interpretable. Instead we
      can conduct hypothesis tests of calibration.
    \item In contrast to existing approaches, the KCE enables
      well-founded bounds and approximations of the p-value for
      calibration tests.
    \end{itemize}

    \tcbsubtitle{Take with you}
    \begin{itemize}
    \item Kernel calibration error (KCE) with unbiased and consistent estimators
    \item Calibration errors have no meaningful unit or scale
    \item Reliable calibration tests with the KCE
    \end{itemize}
  }

  \posterbox[adjusted title={Estimating the calibration error}, colback=gronskasvag]{name=estimation,column=4,span=3,between=calibration and footline}{
    We want to estimate the $\measure$ of model $g$ using a validation
    data set $\{(X_i, Y_i)\}_{i=1}^n$ of i.i.d.\ pairs of inputs and labels.

    \tcbsubtitle{Kernel calibration error}

    \begin{tcolorbox}[colback=blondstark]
      If $\Expect[\|k(g(X), g(X))\|] < \infty$, then the \hl{squared kernel
      calibration error}
      $\squaredkernelmeasure[k, g] \coloneqq \kernelmeasure^2[k,g]$ is
      given by
      \begin{equation}\label{eq:skce}
        \squaredkernelmeasure[k, g] = \Expect\left[\transpose{(e_Y - g(X))} k(g(X), g(X')) {(e_{Y'} - g(X'))} \right],
      \end{equation}
      where $(X', Y')$ is an independent copy of $(X, Y)$ and
      $e_i \in \Delta^m$ denotes the $i$th unit vector.
    \end{tcolorbox}

    For $i,j \in \{1,\ldots,n\}$, let
    $h_{i,j} \coloneqq \transpose{(e_{Y_i} - g(X_i))} k(g(X_i), g(X_j)) (e_{Y_j} - g(X_j))$.

    \begin{tcolorbox}[colback=blondstark]
      If $\Expect[\|k(g(X),g(X))\|] < \infty$, then \hl{consistent estimators}
      of the $\squaredkernelmeasure$ are:
      \begin{center}
        \begin{tabular}{llll} \toprule
          Notation & Definition & Properties & Complexity\\ \midrule
          $\biasedestimator$ & $n^{-2} \sum_{i,j=1}^n h_{i,j}$ & biased & $O(n^2)$ \\
          $\unbiasedestimator$ & $ {\binom{n}{2}}^{-1} \sum_{1 \leq i < j \leq n} h_{i,j}$ & unbiased & $O(n^2)$ \\
          $\linearestimator$ & $ {\lfloor n/2\rfloor}^{-1} \sum_{i = 1}^{\lfloor n / 2\rfloor} h_{2i-1,2i}$ & unbiased & $O(n)$ \\ \bottomrule
        \end{tabular}
      \end{center}
    \end{tcolorbox}

    \tcbsubtitle{Relation to the expected calibration error}

    Standard estimators of the $\ECE$ are usually biased and inconsistent.
    The main difficulty is the estimation of the empirical frequencies
    $r(g(X))$ in \cref{eq:ece}. For the $\kernelmeasure$ there is no need
    to estimate them due to \cref{eq:skce}!
  }

  \posterbox[adjusted title={Is my model calibrated?}, colback=sandsvag]{name=statistics,column=7,span=4,below=top}{
    In general, calibration errors have no meaningful unit or scale.
    This renders it difficult to interpret an estimated non-zero error.

    \tcbsubtitle{Calibration tests}
    \begin{minipage}[t]{0.35\linewidth}
      \vspace*{0pt}
      We can use the calibration error estimates to perform a
      statistical test of the null hypothesis
      \begin{equation*}
        H_0 \coloneqq \text{\enquote{the model is calibrated}}.
      \end{equation*}
    \end{minipage}%
    \begin{minipage}[t]{0.65\linewidth}
      \vspace*{0pt}
      \begin{center}
        % Illustration of a calibration test: distribution of the calibration
        % error estimate under H_0, the observed estimate, and the p-value as
        % the area under the density to the right of the observed estimate.
        \begin{tikzpicture}[
          % density of the normal distribution with mean \m and standard
          % deviation \s, evaluated at the plot variable x
          declare function={normal(\m,\s)=1/(\s*sqrt(2*pi))*exp(-(x-\m)^2/(2*\s^2));},
          % mixture of two normal densities with weights \p and 1-\p
          declare function={binormal(\ma,\sa,\mb,\sb,\p)=(\p*normal(\ma,\sa)+(1-\p)*normal(\mb,\sb));}
          ]

          \begin{axis}[
            domain = -0.1:0.2,
            no marks,
            xlabel = calibration error estimate,
            ylabel = density,
            grid=major,
            ymin = 0,
            tick label style={font=\tiny},
            label style={font=\small},
            width = 0.75\linewidth,
            height = 0.33\linewidth,
            legend pos=outer north east,
            legend cell align=left,
            legend style=
            {
              fill=none,
              draw=none,
              inner sep={0pt},
              font=\small,
              align=left,
            }
            ]

            % vertical line at the observed estimate (x = 0.07)
            \draw [Dark2-A, thick] (0.07,\pgfkeysvalueof{/pgfplots/ymin}) -- (0.07,\pgfkeysvalueof{/pgfplots/ymax}) node [at end, above, anchor=south east, sloped, font=\small] {observed};

            % vertical line at zero, the calibration error of a calibrated model
            \draw[Dark2-B, thick] (0,\pgfkeysvalueof{/pgfplots/ymin}) -- (0,\pgfkeysvalueof{/pgfplots/ymax}) node [at end, above, anchor=south east, sloped, font=\small] {calibrated};

            % mixture model of normal distributions
            \addplot+ [color=Dark2-B, dashed, thick, samples=31, smooth, name path=A] {binormal(-0.05,0.01,0.05,0.03,0.5)};
            \addlegendentry{distribution\\ under $H_0$};

            % indicate p-value
            % (area between the density and the x-axis, clipped to x >= 0.07)
            \path [name path=B] (\pgfkeysvalueof{/pgfplots/xmin},0) -- (\pgfkeysvalueof{/pgfplots/xmax},0);
            \addplot+ [draw=Dark2-C, pattern color=Dark2-C, pattern={north east lines}] fill between [of=A and B, soft clip={domain=0.07:0.2}];
            \addlegendentry{p-value};

            % add comment
            \node[anchor=west, align=left, text=Dark2-C, font=\small] (annotation) at (0.075, 10) {reject $H_0$ if the \\p-value is small};
            \draw[->, >=stealth, thick, Dark2-C] (annotation) -- (0.08, 1);
          \end{axis}
        \end{tikzpicture}
      \end{center}
    \end{minipage}

    \begin{tcolorbox}[colback=blondstark]
      We derive \hl{well-founded bounds and approximations} of the p-value
      based on the $\squaredkernelmeasure$.
    \end{tcolorbox}
  }

  \posterbox[adjusted title={Experiments}, colback=gryningmellan]{name=experiment,column=7,span=4,between=statistics and footline}{
    We sample $10^4$ synthetic data sets $\{(g(X_i), Y_i)\}_{i=1}^{250}$
    from three generative models with $10$ classes by sampling
    predictions $g(X_i) \sim \Dir(0.1, \dots, 0.1)$ and labels $Y_i$
    conditionally on $g(X_i)$ from
    \begin{equation*}
      \symbf{M1}\colon \, \Categorical(g(X_i)), \quad
      \symbf{M2}\colon \, 0.5\Categorical(g(X_i)) + 0.5\Categorical(1,0,\dots,0), \quad
      \symbf{M3}\colon \, \Categorical(0.1, \dots, 0.1).
    \end{equation*}
    Model $\symbf{M1}$ is calibrated, and models $\symbf{M2}$ and
    $\symbf{M3}$ are uncalibrated.

    \tcbsubtitle{Calibration error estimates}

    \begin{minipage}[t]{0.35\linewidth}
      \vspace*{0pt}
      We show the distribution of a standard estimator of the $\ECE$,
      denoted by $\widehat{\ECE}$, and of the three proposed estimators
      of the $\squaredkernelmeasure$ with kernel
      \begin{equation*}
        k(x, y) = \exp{(- \|x - y\| / \nu)} \symbf{I}_{10},
      \end{equation*}
      where the kernel bandwidth $\nu > 0$ is chosen by the median
      heuristic.

      \vspace{\baselineskip}

      The solid line indicates the sample mean of the estimates, and the
      dashed line displays the true calibration error.
    \end{minipage}%
    \begin{minipage}[t]{0.65\linewidth}
      \vspace*{0pt}
      \begin{center}
        \input{figures/errors_comparison.tex}
      \end{center}
    \end{minipage}

    \vspace{\baselineskip}

    We see that the standard estimator of the $\ECE$ exhibits both
    negative and positive bias, whereas, as guaranteed theoretically,
    $\biasedestimator$ is biased upwards and $\unbiasedestimator$
    and $\linearestimator$ are unbiased.

    \tcbsubtitle{Empirical test errors}

    \begin{minipage}[t]{0.4\linewidth}
      \vspace*{0pt}
      We evaluate the derived bounds $\symbf{D}_{\mathup{b}}$,
      $\symbf{D}_{\mathup{uq}}$, and $\symbf{D}_{\mathup{ul}}$, and
      approximations $\symbf{A}_{\mathup{uq}}$ and
      $\symbf{A}_{\mathup{l}}$ of the p-value based on the
      $\squaredkernelmeasure$. We compare them with a previously
      proposed hypothesis test for the standard $\ECE$ estimator
      ($\symbf{C}$). We show the empirical test errors computed
      from the p-value approximations for different significance
      levels.

      \vspace{\baselineskip}

      We see that consistency resampling can lead to unreliable
      calibration tests. Bounds $\symbf{D}_{\mathup{b}}$,
      $\symbf{D}_{\mathup{uq}}$, and $\symbf{D}_{\mathup{ul}}$ yield
      reliable but usually not powerful tests, whereas
      based on approximations $\symbf{A}_{\mathup{uq}}$ and
      $\symbf{A}_{\mathup{l}}$ we obtain reliable and powerful
      calibration tests in our experiments.
    \end{minipage}%
    \begin{minipage}[t]{0.6\linewidth}
      \vspace*{0pt}
      \begin{center}
        \input{figures/pvalues_comparison.tex}
      \end{center}
    \end{minipage}
  }
\end{tcbposter}
\end{document}