Skip to content

Commit db11eca

Browse files
committed
+
0 parents  commit db11eca

File tree

8 files changed

+497
-0
lines changed

8 files changed

+497
-0
lines changed

.vscode/launch.json

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
// Use IntelliSense to find out which attributes exist for C# debugging
3+
// Use hover for the description of the existing attributes
4+
// For further information visit https://github.com/OmniSharp/omnisharp-vscode/blob/master/debugger-launchjson.md
5+
"version": "0.2.0",
6+
"configurations": [
7+
{
8+
"name": ".NET Core Launch (console)",
9+
"type": "coreclr",
10+
"request": "launch",
11+
"preLaunchTask": "build",
12+
// If you have changed target frameworks, make sure to update the program path.
13+
"program": "${workspaceFolder}/bin/Debug/netstandard2.0/stats.dll",
14+
"args": [],
15+
"cwd": "${workspaceFolder}",
16+
// For more information about the 'console' field, see https://github.com/OmniSharp/omnisharp-vscode/blob/master/debugger-launchjson.md#console-terminal-window
17+
"console": "internalConsole",
18+
"stopAtEntry": false,
19+
"internalConsoleOptions": "openOnSessionStart"
20+
},
21+
{
22+
"name": ".NET Core Attach",
23+
"type": "coreclr",
24+
"request": "attach",
25+
"processId": "${command:pickProcess}"
26+
}
27+
,]
28+
}

.vscode/tasks.json

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"version": "2.0.0",
3+
"tasks": [
4+
{
5+
"label": "build",
6+
"command": "dotnet",
7+
"type": "process",
8+
"args": [
9+
"build",
10+
"${workspaceFolder}/stats.csproj"
11+
],
12+
"problemMatcher": "$msCompile"
13+
}
14+
]
15+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,334 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Diagnostics;
4+
using System.Linq;
5+
6+
namespace Stats.Impl.Classification.JenksFisher
7+
{
8+
public class JenksFisherClassifier
9+
{
10+
private readonly List<ValueCountPair> _cumulValues;
11+
private readonly int _numValues;
12+
private readonly int _numBreaks;
13+
private readonly int _bufferSize;
14+
private double[] _previousSsm;
15+
private double[] _currentSsm;
16+
private readonly int[] _classBreaks;
17+
private int _classBreaksIndex;
18+
private int _completedRows;
19+
20+
/// <summary>
21+
/// Constructor that initializes main variables used in fisher calculation of natural breaks.
22+
/// </summary>
23+
/// <param name="vcpc">ordered list of pairs of values to occurrence counts.</param>
24+
/// <param name="k">number of breaks to find.</param>
25+
private JenksFisherClassifier(ValueCountPair[] vcpc, int k)
26+
{
27+
_cumulValues = new List<ValueCountPair>();
28+
_numValues = vcpc.Length;
29+
_numBreaks = k;
30+
_bufferSize = _numValues - (k - 1);
31+
_previousSsm = new double[_bufferSize];
32+
_currentSsm = new double[_bufferSize];
33+
_classBreaks = new int[_bufferSize * (_numBreaks - 1)];
34+
_classBreaksIndex = 0;
35+
_completedRows = 0;
36+
37+
var cwv = 0.0;
38+
var cw = 0;
39+
40+
for (var i = 0; i != _numValues; ++i)
41+
{
42+
var currPair = vcpc[i];
43+
Debug.Assert(i == 0 || currPair.Value >= vcpc[i - 1].Value); // PRECONDITION: the value sequence must be strictly increasing
44+
45+
var w = currPair.Count;
46+
Debug.Assert(w > 0); // PRECONDITION: all weights must be positive
47+
48+
cw += w;
49+
Debug.Assert(cw >= w); // No overflow? No loss of precision?
50+
51+
cwv += w * currPair.Value;
52+
_cumulValues.Add(new ValueCountPair(cwv, cw));
53+
54+
if (i < _bufferSize)
55+
_previousSsm[i] = cwv * cwv / cw; // prepare sum of squared means for first class. Last (k-1) values are omitted
56+
}
57+
}
58+
59+
/// <summary>
60+
/// Does the internal processing to actually create the breaks.
61+
/// </summary>
62+
/// <param name="breakCount">number of breaks.</param>
63+
/// <param name="values">asc ordered input of values and their occurence counts.</param>
64+
/// <returns>collection of breaks.</returns>
65+
public static double[] Classify(ValueCountPair[] values, int breakCount)
66+
{
67+
var breaksArray = new double[breakCount];
68+
var m = values.Length;
69+
70+
Debug.Assert(breakCount <= m); // PRECONDITION
71+
72+
if (breakCount == 0)
73+
return breaksArray;
74+
75+
var jf = new JenksFisherClassifier(values, breakCount);
76+
77+
if (breakCount > 1)
78+
{
79+
// runs the actual calculation
80+
jf.CalcAll();
81+
82+
var lastClassBreakIndex = jf.FindMaxBreakIndex(jf._bufferSize - 1, 0, jf._bufferSize);
83+
84+
while (--breakCount != 0)
85+
{
86+
// assign the break values to the result
87+
breaksArray[breakCount] = values[lastClassBreakIndex + breakCount].Value;
88+
89+
Debug.Assert(lastClassBreakIndex < jf._bufferSize);
90+
91+
if (breakCount > 1)
92+
{
93+
jf._classBreaksIndex -= jf._bufferSize;
94+
lastClassBreakIndex = jf._classBreaks[jf._classBreaksIndex + lastClassBreakIndex];
95+
}
96+
}
97+
98+
Debug.Assert(jf._classBreaks[jf._classBreaksIndex] == jf._classBreaks[0]);
99+
}
100+
101+
Debug.Assert(breakCount == 0);
102+
103+
breaksArray[0] = values[0].Value; // break for the first class is the minimum of the dataset.
104+
105+
return breaksArray;
106+
}
107+
108+
/// <summary>
109+
/// Main entry point for creation of Jenks-Fisher natural breaks.
110+
/// </summary>
111+
/// <param name="values">array of the values, do not need to be sorted.</param>
112+
/// <param name="breakCount">number of breaks to create.</param>
113+
/// <returns>collection with breaks.</returns>
114+
public static double[] Classify(double[] values, int breakCount)
115+
{
116+
var sortedUniqueValueCounts = GetValueCountPairs(values);
117+
double[] breaksArray;
118+
119+
if (sortedUniqueValueCounts.Length > breakCount)
120+
breaksArray = Classify(sortedUniqueValueCounts, breakCount);
121+
else
122+
{
123+
var i = 0;
124+
125+
breaksArray = new double[sortedUniqueValueCounts.Length];
126+
127+
foreach (var vcp in sortedUniqueValueCounts)
128+
{
129+
breaksArray[i] = vcp.Value;
130+
131+
i++;
132+
}
133+
}
134+
135+
var result = new List<double>(breaksArray.Length);
136+
137+
result.AddRange(breaksArray);
138+
139+
return result.ToArray();
140+
}
141+
142+
/// <summary>
143+
/// Gets sum of weighs for elements with index b..e.
144+
/// </summary>
145+
/// <param name="b">index of begin element.</param>
146+
/// <param name="e">index of end element.</param>
147+
/// <returns>sum of weights.</returns>
148+
private int GetSumOfWeights(int b, int e)
149+
{
150+
Debug.Assert(b != 0); // First element always belongs to class 0, thus queries should never include it.
151+
Debug.Assert(b <= e);
152+
Debug.Assert(e < _numValues);
153+
154+
var res = _cumulValues[e].Count;
155+
156+
res -= _cumulValues[b - 1].Count;
157+
158+
return res;
159+
}
160+
161+
/// <summary>
162+
/// Gets sum of weighed values for elements with index b..e.
163+
/// </summary>
164+
/// <param name="b">index of begin element.</param>
165+
/// <param name="e">index of end element.</param>
166+
/// <returns>the cumul. sum of the values*weight.</returns>
167+
private double GetSumOfWeightedValues(int b, int e)
168+
{
169+
Debug.Assert(b != 0);
170+
Debug.Assert(b <= e);
171+
Debug.Assert(e < _numValues);
172+
173+
var res = _cumulValues[e].Value;
174+
175+
res -= _cumulValues[b - 1].Value;
176+
177+
return res;
178+
}
179+
180+
/// <summary>
181+
/// Gets the Squared Mean for elements within index b..e, multiplied by weight. Note that
182+
/// n*mean^2 = sum^2/n when mean := sum/n.
183+
/// </summary>
184+
/// <param name="b">index of begin element.</param>
185+
/// <param name="e">index of end element.</param>
186+
/// <returns>the sum of squared mean.</returns>
187+
private double GetSsm(int b, int e)
188+
{
189+
var res = GetSumOfWeightedValues(b, e);
190+
191+
return res * res / GetSumOfWeights(b, e);
192+
}
193+
194+
/// <summary>
195+
/// <para>
196+
/// Finds CB[i+completedRows] given that the result is at least
197+
/// bp+(completedRows-1) and less than ep+(completedRows-1).
198+
/// </para>
199+
/// <para>
200+
/// Complexity: O(ep-bp) &lt;= O(m).
201+
/// </para>
202+
/// </summary>
203+
/// <param name="i">startIndex.</param>
204+
/// <param name="bp">endindex.</param>
205+
/// <param name="ep"></param>
206+
/// <returns>The index.</returns>
207+
private int FindMaxBreakIndex(int i, int bp, int ep)
208+
{
209+
Debug.Assert(bp < ep);
210+
Debug.Assert(bp <= i);
211+
Debug.Assert(ep <= i + 1);
212+
Debug.Assert(i < _bufferSize);
213+
Debug.Assert(ep <= _bufferSize);
214+
215+
var minSsm = _previousSsm[bp] + GetSsm(bp + _completedRows, i + _completedRows);
216+
var foundP = bp;
217+
218+
while (++bp < ep)
219+
{
220+
var currSsm = _previousSsm[bp] + GetSsm(bp + _completedRows, i + _completedRows);
221+
222+
if (currSsm > minSsm)
223+
{
224+
minSsm = currSsm;
225+
226+
foundP = bp;
227+
}
228+
}
229+
230+
_currentSsm[i] = minSsm;
231+
232+
return foundP;
233+
}
234+
235+
/// <summary>
236+
/// <para>
237+
/// Find CB[i+completedRows] for all i&gt;=bi and i&lt;ei given that the
238+
/// results are at least bp+(completedRows-1) and less than ep+(completedRows-1).
239+
/// </para>
240+
/// <para>
241+
/// Complexity: O(log(ei-bi)*Max((ei-bi),(ep-bp)))&lt;= O(m*log(m)).
242+
/// </para>
243+
/// </summary>
244+
private void CalcRange(int bi, int ei, int bp, int ep)
245+
{
246+
Debug.Assert(bi <= ei);
247+
Debug.Assert(ep <= ei);
248+
Debug.Assert(bp <= bi);
249+
250+
if (bi == ei)
251+
return;
252+
253+
Debug.Assert(bp < ep);
254+
255+
var mi = (int)Math.Floor((bi + ei) / 2.0);
256+
var mp = FindMaxBreakIndex(mi, bp, Math.Min(ep, mi + 1));
257+
258+
Debug.Assert(bp <= mp);
259+
Debug.Assert(mp < ep);
260+
Debug.Assert(mp <= mi);
261+
262+
// solve first half of the sub-problems with lower 'half' of possible outcomes
263+
CalcRange(bi, mi, bp, Math.Min(mi, mp + 1));
264+
265+
_classBreaks[_classBreaksIndex + mi] = mp; // store result for the middle element.
266+
267+
// solve second half of the sub-problems with upper 'half' of possible outcomes
268+
CalcRange(mi + 1, ei, mp, ep);
269+
}
270+
271+
/// <summary>
272+
/// Swaps the content of the two lists with each other.
273+
/// </summary>
274+
private void SwapArrays()
275+
{
276+
var temp = new double[_previousSsm.Length];
277+
278+
Array.Copy(_previousSsm, temp, _previousSsm.Length);
279+
280+
_previousSsm = new double[_currentSsm.Length];
281+
282+
Array.Copy(_currentSsm, _previousSsm, _currentSsm.Length);
283+
284+
_currentSsm = new double[temp.Length];
285+
286+
Array.Copy(temp, _currentSsm, temp.Length);
287+
}
288+
289+
/// <summary>
290+
/// <para>Starting point of calculation of breaks.</para>
291+
/// <para>Complexity: O(m*log(m)*k).</para>
292+
/// </summary>
293+
private void CalcAll()
294+
{
295+
if (_numBreaks >= 2)
296+
{
297+
_classBreaksIndex = 0;
298+
for (_completedRows = 1; _completedRows < _numBreaks - 1; ++_completedRows)
299+
{
300+
CalcRange(0, _bufferSize, 0, _bufferSize); // complexity: O(m*log(m))
301+
302+
SwapArrays();
303+
_classBreaksIndex += _bufferSize;
304+
}
305+
}
306+
}
307+
308+
/// <summary>
309+
/// Calculates the occurence count of given values and returns them.
310+
/// </summary>
311+
/// <param name="values"></param>
312+
/// <returns>Occurences of values.</returns>
313+
private static ValueCountPair[] GetValueCountPairs(IReadOnlyCollection<double> values)
314+
{
315+
var result = new List<ValueCountPair>();
316+
var vcpMap = new Dictionary<double, ValueCountPair>();
317+
318+
foreach (var value in values)
319+
{
320+
if (!vcpMap.ContainsKey(value))
321+
{
322+
var vcp = new ValueCountPair(value, 1);
323+
324+
vcpMap.Add(value, vcp);
325+
result.Add(vcp);
326+
}
327+
else
328+
vcpMap[value].Count++;
329+
}
330+
331+
return result.OrderBy(o => o.Value).ToArray();
332+
}
333+
}
334+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
namespace Stats.Impl.Classification.JenksFisher
2+
{
3+
public class ValueCountPair
4+
{
5+
public ValueCountPair(double value, int count)
6+
{
7+
Value = value;
8+
Count = count;
9+
}
10+
11+
public double Value { get; set; }
12+
13+
public int Count { get; set; }
14+
}
15+
}

0 commit comments

Comments
 (0)