Skip to content

Commit

Permalink
Bitmaps/mod bit sets (stanford-futuredata#265)
Browse files Browse the repository at this point in the history
* copied code from old branch to new

* coded hybrid system

* removed Arrays.sort and manually sorted

* fixed null ptr bug

* copied code from old branch to new

* coded hybrid system

* removed Arrays.sort and manually sorted

* fixed null ptr bug

* copied code from old branch to new

* coded hybrid system

* removed Arrays.sort and manually sorted

* updated threshold

* creating synthetic datasets

* testing roaring bitmaps

* Debugging options

* Hybrid Apriori System

* updated Dockerfile

* Support threshold bug in AttributeEncoder fixed

* BitMap -> Bitmap

* Support threshold bug in AttributeEncoder fixed for all-bitmaps

* Simplified hybrid system

* Better cardinalityThreshold

* hacky one-to-one and many-to-one strict FDs

* Infrastructure for bitmaps cost model

* Experimental bitmap cost model

* Revised andCardinalityCost

* Replacing RoaringBitmaps with BitSets

* Don't materialize ands

* Properly treat multiple same-cardinality columns

* Per-instance bitmap decision

* Better threshold application

* Bitmap Ratio Threshold now a configurable parameter

* Generalization of encodeAttributesWithSupport

* Revert "Generalization of encodeAttributesWithSupport"

This reverts commit 7b2e61d.

* Better timing

* Documentation

* AttributeEncoder documentation

* AttributeEncoder documentation

* Update gitignore

* Merge

* Address Firas comments

* Update .travis.yml

Co-authored-by: Firas Abuzaid <[email protected]>
Co-authored-by: Sahaana Suri <[email protected]>
  • Loading branch information
3 people committed Jan 10, 2020
1 parent 19d70ea commit ae7b655
Show file tree
Hide file tree
Showing 23 changed files with 1,616 additions and 188 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
data/
conf/
*.class
.classpath
.project
.settings/*
*.log
*.csv

# Mobile Tools for Java (J2ME)
.mtj.tmp/
Expand Down
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
language: java
jdk: oraclejdk8
jdk: openjdk11
install: mvn install -DskipTests=true -Dgpg.skip=true -Dmaven.javadoc.skip=true -B -V
notifications:
slack: stanford-futuredata:qmO6Keu8ifOyXHsmSQ97CeLH
Expand Down
Empty file modified bin/batch.sh
100755 → 100644
Empty file.
Empty file modified bin/cli.sh
100755 → 100644
Empty file.
Empty file modified bin/frontend.sh
100755 → 100644
Empty file.
Empty file modified bin/macrobase-sql
100755 → 100644
Empty file.
Empty file modified bin/server.sh
100755 → 100644
Empty file.
Empty file modified bin/streaming.sh
100755 → 100644
Empty file.
Empty file modified build.sh
100755 → 100644
Empty file.
Empty file modified core/demo/query.sh
100755 → 100644
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ public class BasicBatchPipeline implements Pipeline {
private boolean pctileLow;
private String predicateStr;
private int numThreads;
private int bitmapRatioThreshold;

private String summarizerType;
private List<String> attributes;
Expand All @@ -41,6 +42,9 @@ public class BasicBatchPipeline implements Pipeline {
private double minRiskRatio;
private double meanShiftRatio;

private boolean useFDs;
private int[] functionalDependencies;


public BasicBatchPipeline (PipelineConfig conf) {
inputURI = conf.get("inputURI");
Expand Down Expand Up @@ -71,6 +75,22 @@ public BasicBatchPipeline (PipelineConfig conf) {
minRiskRatio = conf.get("minRatioMetric", 3.0);
minSupport = conf.get("minSupport", 0.01);
numThreads = conf.get("numThreads", Runtime.getRuntime().availableProcessors());
bitmapRatioThreshold = conf.get("bitmapRatioThreshold", 256);


// If FDs are being used, parse them into bitmaps. For now, all FDs must be in the first 31 attributes
useFDs = conf.get("useFDs", false);
if (useFDs) {
ArrayList<ArrayList<Integer>> rawDependencies = conf.get("functionalDependencies");
functionalDependencies = new int[attributes.size()];
for (ArrayList<Integer> dependency : rawDependencies) {
for (int i : dependency) {
for (int j : dependency) {
if (i != j) functionalDependencies[i] |= (1 << j);
}
}
}
}
meanColumn = Optional.ofNullable(conf.get("meanColumn"));
meanShiftRatio = conf.get("meanShiftRatio", 1.0);
}
Expand Down Expand Up @@ -131,7 +151,10 @@ public BatchSummarizer getSummarizer(String outlierColumnName) throws MacroBaseE
summarizer.setAttributes(attributes);
summarizer.setMinSupport(minSupport);
summarizer.setMinRatioMetric(minRiskRatio);
summarizer.setBitmapRatioThreshold(bitmapRatioThreshold);
summarizer.setNumThreads(numThreads);
summarizer.setFDUsage(useFDs);
summarizer.setFDValues(functionalDependencies);
return summarizer;
}
case "countmeanshift": {
Expand Down
Empty file modified lib/genCP.sh
100755 → 100644
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ public abstract class BatchSummarizer implements Operator<DataFrame, Explanation
protected int numThreads = Runtime.getRuntime().availableProcessors();
protected String ratioMetric = "global_ratio";
protected int maxOrder = 3;
protected boolean useFDs = false;
protected int[] functionalDependencies;

/**
* Adjust this to tune the significance (e.g. number of rows affected) of the results returned.
Expand Down Expand Up @@ -83,4 +85,14 @@ public BatchSummarizer setMaxOrder(final int maxOrder) throws MacroBaseException
this.maxOrder = maxOrder;
return this;
}

/**
 * Enables or disables the use of functional dependencies (FDs) by this summarizer.
 * The flag is only stored here; it defaults to {@code false} when never set.
 *
 * @param useFDs {@code true} to use functional dependencies, {@code false} otherwise
 * @return this summarizer, so that setter calls can be chained
 */
public BatchSummarizer setFDUsage(final boolean useFDs) {
this.useFDs = useFDs;
return this;
}

/**
 * Stores the functional-dependency table for this summarizer.
 * The array reference is kept as-is (no copy is made), so later changes made by the
 * caller to the array are visible to this summarizer.
 *
 * @param functionalDependencies per-attribute dependency values; may be {@code null}
 *        (presumably only meaningful when FD usage is enabled; verify against callers)
 * @return this summarizer, so that setter calls can be chained
 */
public BatchSummarizer setFDValues(final int[] functionalDependencies) {
this.functionalDependencies = functionalDependencies;
return this;
}
}
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
package edu.stanford.futuredata.macrobase.analysis.summary.aplinear;

import edu.stanford.futuredata.macrobase.analysis.summary.BatchSummarizer;
import edu.stanford.futuredata.macrobase.analysis.summary.util.AttributeEncoder;
import edu.stanford.futuredata.macrobase.analysis.summary.util.qualitymetrics.AggregationOp;
import edu.stanford.futuredata.macrobase.analysis.summary.util.qualitymetrics.QualityMetric;
import edu.stanford.futuredata.macrobase.analysis.summary.util.AttributeEncoder;
import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;

/**
* Generic summarizer superclass that can be customized with
* different quality metrics and input sources. Subclasses are responsible
Expand All @@ -21,10 +22,11 @@ public abstract class APLSummarizer extends BatchSummarizer {
APrioriLinear aplKernel;
List<QualityMetric> qualityMetricList;
List<Double> thresholds;
private double[][] globalAggregateCols;
private double[][] globalAggregateCols = null;

protected long numEvents = 0;
protected long numOutliers = 0;
protected int bitmapRatioThreshold = 256;

public abstract List<String> getAggregateNames();
public abstract AggregationOp[] getAggregationOps();
Expand Down Expand Up @@ -59,7 +61,7 @@ public void process(DataFrame input) throws Exception {
int[][] encoded = getEncoded(input.getStringColsByName(attributes), input);
long elapsed = System.currentTimeMillis() - startTime;
log.info("Encoded in: {} ms", elapsed);
log.info("Distinct values encoded: {}", encoder.getNextKey() - 1);
log.info("Encoded Categories: {}", encoder.getNextKey() - 1);

thresholds = getThresholds();
qualityMetricList = getQualityMetricList();
Expand All @@ -80,7 +82,10 @@ public void process(DataFrame input) throws Exception {
numThreads,
encoder.getBitmap(),
encoder.getOutlierList(),
encoder.getIsBitmapEncodedArray()
encoder.getColCardinalities(),
useFDs,
functionalDependencies,
bitmapRatioThreshold
);
log.info("Number of results: {}", aplResults.size());
numOutliers = (long)getNumberOutliers(aggregateColumns);
Expand All @@ -99,8 +104,14 @@ public APLExplanation getResults() {
return explanation;
}

/**
 * Overrides the bitmap ratio threshold held by this summarizer (field default: 256).
 * The value is only stored here; no validation is performed.
 *
 * @param bitmapRatioThreshold the new threshold value
 */
public void setBitmapRatioThreshold(int bitmapRatioThreshold) {
this.bitmapRatioThreshold = bitmapRatioThreshold;
}

/**
 * Stores the supplied global aggregate columns on this summarizer.
 * The array reference is kept as-is (no copy is made).
 *
 * @param globalAggregateCols the aggregate columns to store; may be {@code null}
 */
public void setGlobalAggregateCols(double[][] globalAggregateCols) {
this.globalAggregateCols = globalAggregateCols;
}



}
Loading

0 comments on commit ae7b655

Please sign in to comment.