forked from stanford-futuredata/macrobase
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Mean shift movers query (stanford-futuredata#264)
Introduce a new query type (Classifier+Summarizer) that returns itemsets that have minimal support where the mean of some metric has shifted from inliers to outliers.
- Loading branch information
Showing
10 changed files
with
588 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
{ | ||
"pipeline": "CubePipeline", | ||
"inputURI": "csv://lib/src/test/resources/sample_cubedshift.csv", | ||
"classifier": "countmeanshift", | ||
"metric": "time", | ||
"predicate": "==", | ||
"cutoff": "1", | ||
"meanColumn": "meanLatency", | ||
"countColumn": "count", | ||
"summarizer": "countmeanshift", | ||
"attributes": [ | ||
"location", | ||
"version", | ||
"language" | ||
], | ||
"meanShiftRatio": 1.1, | ||
"minSupport": 0.05, | ||
"numThreads": 1 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
103 changes: 103 additions & 0 deletions
103
...in/java/edu/stanford/futuredata/macrobase/analysis/classify/CountMeanShiftClassifier.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
package edu.stanford.futuredata.macrobase.analysis.classify; | ||
|
||
import edu.stanford.futuredata.macrobase.analysis.classify.stats.MBPredicate; | ||
import edu.stanford.futuredata.macrobase.datamodel.DataFrame; | ||
import edu.stanford.futuredata.macrobase.util.MacroBaseException; | ||
|
||
import java.util.function.DoublePredicate; | ||
import java.util.function.Predicate; | ||
|
||
public class CountMeanShiftClassifier extends Classifier { | ||
private Predicate<String> strPredicate; | ||
private DoublePredicate doublePredicate; | ||
private DataFrame output; | ||
private String metricColumnName; | ||
private String meanColumnName; | ||
private boolean isStrPredicate; | ||
public static String outlierCountColumnName = "_OUTLIERCOUNT"; | ||
public static String inlierCountColumnName = "_INLIERCOUNT"; | ||
public static String outlierMeanSumColumnName = "_OUTLIERMEANSUM"; | ||
public static String inlierMeanSumColumnName = "_INLIERMEANSUM"; | ||
|
||
/** | ||
* @param metricColumnName Column on which to classify outliers | ||
* @param meanColumnName Column containing means whose shifts will be explained | ||
* @param predicateStr Predicate used for classification: "==" or "!=" | ||
* @param sentinel String sentinel value used when evaluating the predicate to determine outlier | ||
* @throws MacroBaseException | ||
*/ | ||
public CountMeanShiftClassifier( | ||
final String metricColumnName, | ||
final String meanColumnName, | ||
final String predicateStr, | ||
final String sentinel | ||
) throws MacroBaseException { | ||
super(meanColumnName); | ||
this.metricColumnName = metricColumnName; | ||
this.meanColumnName = meanColumnName; | ||
this.strPredicate = MBPredicate.getStrPredicate(predicateStr, sentinel); | ||
this.isStrPredicate = true; | ||
} | ||
|
||
/** | ||
* @param metricColumnName Column on which to classify outliers | ||
* @param meanColumnName Column containing means whose shifts will be explained | ||
* @param predicateStr Predicate used for classification: "==", "!=", "<", ">", "<=", or ">=" | ||
* @param sentinel Double sentinel value used when evaluating the predicate to determine outlier | ||
*/ | ||
public CountMeanShiftClassifier( | ||
final String metricColumnName, | ||
final String meanColumnName, | ||
final String predicateStr, | ||
final double sentinel | ||
) throws MacroBaseException { | ||
super(meanColumnName); | ||
this.metricColumnName = metricColumnName; | ||
this.meanColumnName = meanColumnName; | ||
this.doublePredicate = MBPredicate.getDoublePredicate(predicateStr, sentinel); | ||
this.isStrPredicate = false; | ||
} | ||
|
||
/** | ||
* Scan through the metric column, and evaluate the predicate on every value in the column. The ``input'' DataFrame | ||
* remains unmodified; a copy is created and all modifications are made on the copy. Then store counts and | ||
* meancounts for both outliers and inliers. | ||
* @throws Exception | ||
*/ | ||
@Override | ||
public void process(DataFrame input) throws Exception { | ||
String[] stringMetrics = null; | ||
if (isStrPredicate) | ||
stringMetrics = input.getStringColumnByName(metricColumnName); | ||
double[] doubleMetrics = null; | ||
if (!isStrPredicate) | ||
doubleMetrics = input.getDoubleColumnByName(metricColumnName); | ||
output = input.copy(); | ||
double[] totalMeanColumn = input.getDoubleColumnByName(meanColumnName); | ||
int len = totalMeanColumn.length; | ||
double[] outlierCountColumn = new double[len]; | ||
double[] inlierCountColumn = new double[len]; | ||
double[] outlierMeanColumn = new double[len]; | ||
double[] inlierMeanColumn = new double[len]; | ||
for (int i = 0; i < len; i++) { | ||
if ((isStrPredicate && strPredicate.test(stringMetrics[i])) || | ||
(!isStrPredicate && doublePredicate.test(doubleMetrics[i]))) { | ||
outlierCountColumn[i] = 1.0; | ||
outlierMeanColumn[i] = totalMeanColumn[i]; | ||
} else { | ||
inlierCountColumn[i] = 1.0; | ||
inlierMeanColumn[i] = totalMeanColumn[i]; | ||
} | ||
} | ||
output.addColumn(outlierCountColumnName, outlierCountColumn); | ||
output.addColumn(inlierCountColumnName, inlierCountColumn); | ||
output.addColumn(outlierMeanSumColumnName, outlierMeanColumn); | ||
output.addColumn(inlierMeanSumColumnName, inlierMeanColumn); | ||
} | ||
|
||
|
||
@Override | ||
public DataFrame getResults() { | ||
return output; | ||
} | ||
} |
Oops, something went wrong.