-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrobusthist.m
107 lines (97 loc) · 4.11 KB
/
robusthist.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
% [barh, lohix, counts, xbars] = robusthist(x, nbin, lohiq)
%
% plots histograms in which extremely small or large values are captured
% in flanking bins with a different colour, extreme values are defined via
% the quantiles of the sampled distribution
%
% This is to make histograms of very heavy-tailed distributions more
% informative. hist.m would produce very wide bins. Then a lot of the
% interesting parts of the distribution may be hidden, because of the low
% bin resolution. histc.m allows to exclude large values, or to make bins
% with different widths. Excluding is not good, because then the heavy tail
% is hidden, and different bin widths look ugly. So here you can at least
% see how many data points are in the tail, but maintain a tidy look of the
% remaining distribution.
%
% Tails of the distribution are defined via a lower and an upper quantile.
% From these a lower (xmin) and upper (xmax) data value is
% calculated as limits. Then the lower tail of the distribution is defined
% as all data values x < xmin. The upper tail of the distribution is
% defined as all data values x >= xmax. Each tail then is represented by
% one bar adjacent to the edges of the bars representing the centre of the
% distribution.
%
% in:
% x - samples from a distribution, a vector
% N = length
% nbin - number of bins to display
% lohiq - quantiles used to define the upper and lower tail of
% the distribution which should be excluded from
% computing bar widths and positions
% lohiq(1) - lower quantile
% lohiq(2) - upper quantile
% [default: [0 1]]
% out:
% barh - handles to plotted bar series objects
% barh(1) - centre of distribution
% barh(2) - lower tail
% barh(3) - upper tail
% lohix - data values xmin and xmax corresponding to quantiles
% in lohiq in the provided empirical distribution
% = [xmin xmax]
% counts - computed counts for each bin
% [1, nbin+1] = size
% counts(1) - count for lower tail
% counts(2:end-2) - counts for centre of distribution
% counts(end-1) - count for upper tail
% counts(end) - count of occurrence of Inf
% xbars - positions of bars on x-axis (centre of bars)
% [1, nbin] = size
% xbars(1) - lower tail bar
% xbars(2:end-1) - centre bars
% xbars(nbin) - upper tail bar
% author:
% Sebastian Bitzer ([email protected])
function [barh, lohix, counts, xbars] = robusthist(x, nbin, lohiq)
if nargin < 3
lohiq = [0 1];
end
N = length(x);
% find x-values corresponding to cutoff quantiles
x = sort(x(:))';
cdf = (.5 : N) / N;
lohix = nan(1,2);
% lower cutoff
% find empirical quantile closest to lower cutoff quantile
[~, ind] = min((cdf - lohiq(1)).^2);
% find the two empirical quantiles around that, but take care of there
% potentially being no smaller quantile in the data
ind1 = max(ind-1, 1);
ind2 = ind + 1;
% get the data value corresponding to the lower cutoff quantile by
% interpolating between data values of the three empirical quantiles,
% extrapolation is allowed for the case that the cutoff quantile is
% smaller than the smallest empirical quantile
lohix(1) = interp1( cdf(ind1:ind2), ...
x(ind1:ind2) + (0:1e-12:(ind2-ind1)*1e-12), lohiq(1), 'linear', 'extrap');
% upper cutoff (same idea as for lower cutoff above)
[~, ind] = min((cdf - lohiq(2)).^2);
ind1 = ind - 1;
ind2 = min(ind+1, N);
lohix(2) = interp1( cdf(ind1:ind2), ...
x(ind1:ind2) + (0:1e-12:(ind2-ind1)*1e-12), lohiq(2), 'linear', 'extrap');
% define bin edges for histc
xbars = linspace(lohix(1), lohix(2), nbin-1);
edges = [-Inf, xbars, Inf];
% define bar centres for bar
dx = xbars(2) - xbars(1);
xbars = [xbars - dx / 2, xbars(end) + dx/2];
% count
counts = histc(x, edges);
% plot histogram
ax = gca;
barh(1) = bar(ax, xbars(1:end-1), counts(1:end-2), 'hist');
hold on
barh(2) = bar(ax, [xbars(1) - dx, xbars(1)], [0, counts(1)], 'hist');
barh(3) = bar(ax, [xbars(end), xbars(end) + dx], counts(end-1:end), 'hist');
set(barh(2:3), 'FaceColor', [.6 .6 .6])