summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java161
1 files changed, 83 insertions, 78 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java
index 6b2f9e13..f3f9c101 100644
--- a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java
+++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java
@@ -34,11 +34,14 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair;
import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.correlation.WeightedPearsonCorrelationDistanceFunction;
import de.lmu.ifi.dbs.elki.evaluation.roc.ROC;
@@ -52,7 +55,6 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair;
-import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair;
import de.lmu.ifi.dbs.elki.workflow.InputStep;
/**
@@ -84,9 +86,9 @@ import de.lmu.ifi.dbs.elki.workflow.InputStep;
@Reference(authors = "E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel", title = "On Evaluation of Outlier Rankings and Outlier Scores", booktitle = "Proc. 12th SIAM International Conference on Data Mining (SDM), Anaheim, CA, 2012.")
public class GreedyEnsembleExperiment extends AbstractApplication {
/**
- * Get static logger
+ * Get static logger.
*/
- private static final Logging logger = Logging.getLogger(GreedyEnsembleExperiment.class);
+ private static final Logging LOG = Logging.getLogger(GreedyEnsembleExperiment.class);
/**
* The data input part.
@@ -101,11 +103,10 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
/**
* Constructor.
*
- * @param verbose Verbosity
* @param inputstep Input step
*/
- public GreedyEnsembleExperiment(boolean verbose, InputStep inputstep) {
- super(verbose);
+ public GreedyEnsembleExperiment(InputStep inputstep) {
+ super();
this.inputstep = inputstep;
}
@@ -114,22 +115,23 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
// Note: the database contains the *result vectors*, not the original data
// points.
final Database database = inputstep.getDatabase();
- final Relation<NumberVector<?, ?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
+ final Relation<NumberVector<?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
+ final NumberVector.Factory<NumberVector<?>, ?> factory = RelationUtil.getNumberVectorFactory(relation);
final Relation<String> labels = DatabaseUtil.guessLabelRepresentation(database);
- final DBID firstid = labels.iterDBIDs().getDBID();
+ final DBID firstid = DBIDUtil.deref(labels.iterDBIDs());
final String firstlabel = labels.get(firstid);
if(!firstlabel.matches("bylabel")) {
throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!");
}
// Dimensionality and reference vector
- final int dim = DatabaseUtil.dimensionality(relation);
- final NumberVector<?, ?> refvec = relation.get(firstid);
+ final int dim = RelationUtil.dimensionality(relation);
+ final NumberVector<?> refvec = relation.get(firstid);
// Build the positive index set for ROC AUC.
Set<Integer> positive = new TreeSet<Integer>();
for(int d = 0; d < dim; d++) {
- if(refvec.doubleValue(d + 1) > 0) {
+ if(refvec.doubleValue(d) > 0) {
positive.add(d);
}
}
@@ -140,18 +142,17 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
// Find the top-k for each ensemble member
{
for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- DBID id = iditer.getDBID();
// Skip "by label", obviously
- if(firstid.sameDBID(id)) {
+ if(DBIDUtil.equal(firstid, iditer)) {
continue;
}
- final NumberVector<?, ?> vec = relation.get(id);
+ final NumberVector<?> vec = relation.get(iditer);
TiedTopBoundedHeap<DoubleIntPair> heap = new TiedTopBoundedHeap<DoubleIntPair>(estimated_outliers, Collections.reverseOrder());
for(int i = 0; i < dim; i++) {
- heap.add(new DoubleIntPair(vec.doubleValue(i + 1), i));
+ heap.add(new DoubleIntPair(vec.doubleValue(i), i));
}
if(heap.size() >= 2 * estimated_outliers) {
- logger.warning("Too many ties. Expected: " + estimated_outliers + " got: " + heap.size());
+ LOG.warning("Too many ties. Expected: " + estimated_outliers + " got: " + heap.size());
}
for(DoubleIntPair pair : heap) {
if(outliers_seen[pair.second] == 0) {
@@ -164,34 +165,33 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
}
}
}
- logger.verbose("Merged top " + estimated_outliers + " outliers to: " + union_outliers + " outliers");
+ LOG.verbose("Merged top " + estimated_outliers + " outliers to: " + union_outliers + " outliers");
// Build the final weight vector.
final double[] estimated_weights = new double[dim];
final double[] estimated_truth = new double[dim];
updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
- NumberVector<?, ?> estimated_truth_vec = refvec.newNumberVector(estimated_truth);
+ NumberVector<?> estimated_truth_vec = factory.newNumberVector(estimated_truth);
- PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> wdist = getDistanceFunction(estimated_weights);
- PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> tdist = wdist;
+ PrimitiveDoubleDistanceFunction<NumberVector<?>> wdist = getDistanceFunction(estimated_weights);
+ PrimitiveDoubleDistanceFunction<NumberVector<?>> tdist = wdist;
// Build the naive ensemble:
final double[] naiveensemble = new double[dim];
{
for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- DBID id = iditer.getDBID();
- if(firstid.equals(id)) {
+ if(DBIDUtil.equal(firstid, iditer)) {
continue;
}
- final NumberVector<?, ?> vec = relation.get(id);
+ final NumberVector<?> vec = relation.get(iditer);
for(int d = 0; d < dim; d++) {
- naiveensemble[d] += vec.doubleValue(d + 1);
+ naiveensemble[d] += vec.doubleValue(d);
}
}
for(int d = 0; d < dim; d++) {
naiveensemble[d] /= (relation.size() - 1);
}
}
- NumberVector<?, ?> naivevec = refvec.newNumberVector(naiveensemble);
+ NumberVector<?> naivevec = factory.newNumberVector(naiveensemble);
// Compute single AUC scores and estimations.
// Remember the method most similar to the estimation
@@ -204,72 +204,70 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
{
// Compute individual scores
for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- DBID id = iditer.getDBID();
- if(firstid.equals(id)) {
+ if(DBIDUtil.equal(firstid, iditer)) {
continue;
}
// fout.append(labels.get(id));
- final NumberVector<?, ?> vec = relation.get(id);
+ final NumberVector<?> vec = relation.get(iditer);
double auc = computeROCAUC(vec, positive, dim);
double estimated = wdist.doubleDistance(vec, estimated_truth_vec);
double cost = tdist.doubleDistance(vec, refvec);
- logger.verbose("ROC AUC: " + auc + " estimated " + estimated + " cost " + cost + " " + labels.get(id));
+ LOG.verbose("ROC AUC: " + auc + " estimated " + estimated + " cost " + cost + " " + labels.get(iditer));
if(auc > bestauc) {
bestauc = auc;
- bestaucstr = labels.get(id);
+ bestaucstr = labels.get(iditer);
}
if(cost < bestcost) {
bestcost = cost;
- bestcoststr = labels.get(id);
+ bestcoststr = labels.get(iditer);
}
if(estimated < bestest) {
bestest = estimated;
- bestid = id;
+ bestid = DBIDUtil.deref(iditer);
}
}
}
// Initialize ensemble with "best" method
- logger.verbose("Distance function: " + wdist);
- logger.verbose("Initial estimation of outliers: " + union_outliers);
- logger.verbose("Initializing ensemble with: " + labels.get(bestid));
+ LOG.verbose("Distance function: " + wdist);
+ LOG.verbose("Initial estimation of outliers: " + union_outliers);
+ LOG.verbose("Initializing ensemble with: " + labels.get(bestid));
ModifiableDBIDs ensemble = DBIDUtil.newArray(bestid);
ModifiableDBIDs enscands = DBIDUtil.newHashSet(relation.getDBIDs());
enscands.remove(bestid);
enscands.remove(firstid);
final double[] greedyensemble = new double[dim];
{
- final NumberVector<?, ?> vec = relation.get(bestid);
+ final NumberVector<?> vec = relation.get(bestid);
for(int i = 0; i < dim; i++) {
- greedyensemble[i] = vec.doubleValue(i + 1);
+ greedyensemble[i] = vec.doubleValue(i);
}
}
// Greedily grow the ensemble
final double[] testensemble = new double[dim];
while(enscands.size() > 0) {
- NumberVector<?, ?> greedyvec = refvec.newNumberVector(greedyensemble);
+ NumberVector<?> greedyvec = factory.newNumberVector(greedyensemble);
// Weighting factors for combining:
double s1 = ensemble.size() / (ensemble.size() + 1.);
double s2 = 1. / (ensemble.size() + 1.);
final int heapsize = enscands.size();
- TopBoundedHeap<DoubleObjPair<DBID>> heap = new TopBoundedHeap<DoubleObjPair<DBID>>(heapsize, Collections.reverseOrder());
+ TopBoundedHeap<DoubleDBIDPair> heap = new TopBoundedHeap<DoubleDBIDPair>(heapsize, Collections.reverseOrder());
for (DBIDIter iter = enscands.iter(); iter.valid(); iter.advance()) {
- DBID id = iter.getDBID();
- final NumberVector<?, ?> vec = relation.get(id);
+ final NumberVector<?> vec = relation.get(iter);
double diversity = wdist.doubleDistance(vec, greedyvec);
- heap.add(new DoubleObjPair<DBID>(diversity, id));
+ heap.add(DBIDUtil.newPair(diversity, iter));
}
while(heap.size() > 0) {
- DBID bestadd = heap.poll().second;
+ DBIDRef bestadd = heap.poll();
enscands.remove(bestadd);
// Update ensemble:
- final NumberVector<?, ?> vec = relation.get(bestadd);
+ final NumberVector<?> vec = relation.get(bestadd);
for(int i = 0; i < dim; i++) {
- testensemble[i] = greedyensemble[i] * s1 + vec.doubleValue(i + 1) * s2;
+ testensemble[i] = greedyensemble[i] * s1 + vec.doubleValue(i) * s2;
}
- NumberVector<?, ?> testvec = refvec.newNumberVector(testensemble);
+ NumberVector<?> testvec = factory.newNumberVector(testensemble);
double oldd = wdist.doubleDistance(estimated_truth_vec, greedyvec);
double newd = wdist.doubleDistance(estimated_truth_vec, testvec);
// logger.verbose("Distances: " + oldd + " vs. " + newd);
@@ -286,7 +284,7 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
// Update target vectors and weights
TiedTopBoundedHeap<DoubleIntPair> oheap = new TiedTopBoundedHeap<DoubleIntPair>(estimated_outliers, Collections.reverseOrder());
for(int i = 0; i < dim; i++) {
- oheap.add(new DoubleIntPair(vec.doubleValue(i + 1), i));
+ oheap.add(new DoubleIntPair(vec.doubleValue(i), i));
}
for(DoubleIntPair pair : oheap) {
assert (outliers_seen[pair.second] > 0);
@@ -298,14 +296,14 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
}
if(refresh) {
updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
- estimated_truth_vec = refvec.newNumberVector(estimated_truth);
+ estimated_truth_vec = factory.newNumberVector(estimated_truth);
}
}
}
}
}
// Build the improved ensemble:
- StringBuffer greedylbl = new StringBuffer();
+ StringBuilder greedylbl = new StringBuilder();
{
for (DBIDIter iter = ensemble.iter(); iter.valid(); iter.advance()) {
if(greedylbl.length() > 0) {
@@ -314,27 +312,27 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
greedylbl.append(labels.get(iter));
}
}
- NumberVector<?, ?> greedyvec = refvec.newNumberVector(greedyensemble);
- logger.verbose("Estimated outliers remaining: " + union_outliers);
- logger.verbose("Greedy ensemble: " + greedylbl.toString());
+ NumberVector<?> greedyvec = factory.newNumberVector(greedyensemble);
+ LOG.verbose("Estimated outliers remaining: " + union_outliers);
+ LOG.verbose("Greedy ensemble: " + greedylbl.toString());
- logger.verbose("Best single ROC AUC: " + bestauc + " (" + bestaucstr + ")");
- logger.verbose("Best single cost: " + bestcost + " (" + bestcoststr + ")");
+ LOG.verbose("Best single ROC AUC: " + bestauc + " (" + bestaucstr + ")");
+ LOG.verbose("Best single cost: " + bestcost + " (" + bestcoststr + ")");
// Evaluate the naive ensemble and the "shrunk" ensemble
double naiveauc, naivecost;
{
naiveauc = computeROCAUC(naivevec, positive, dim);
naivecost = tdist.doubleDistance(naivevec, refvec);
- logger.verbose("Naive ensemble AUC: " + naiveauc + " cost: " + naivecost);
- logger.verbose("Naive ensemble Gain: " + gain(naiveauc, bestauc, 1) + " cost gain: " + gain(naivecost, bestcost, 0));
+ LOG.verbose("Naive ensemble AUC: " + naiveauc + " cost: " + naivecost);
+ LOG.verbose("Naive ensemble Gain: " + gain(naiveauc, bestauc, 1) + " cost gain: " + gain(naivecost, bestcost, 0));
}
double greedyauc, greedycost;
{
greedyauc = computeROCAUC(greedyvec, positive, dim);
greedycost = tdist.doubleDistance(greedyvec, refvec);
- logger.verbose("Greedy ensemble AUC: " + greedyauc + " cost: " + greedycost);
- logger.verbose("Greedy ensemble Gain to best: " + gain(greedyauc, bestauc, 1) + " cost gain: " + gain(greedycost, bestcost, 0));
- logger.verbose("Greedy ensemble Gain to naive: " + gain(greedyauc, naiveauc, 1) + " cost gain: " + gain(greedycost, naivecost, 0));
+ LOG.verbose("Greedy ensemble AUC: " + greedyauc + " cost: " + greedycost);
+ LOG.verbose("Greedy ensemble Gain to best: " + gain(greedyauc, bestauc, 1) + " cost gain: " + gain(greedycost, bestcost, 0));
+ LOG.verbose("Greedy ensemble Gain to naive: " + gain(greedyauc, naiveauc, 1) + " cost gain: " + gain(greedycost, naivecost, 0));
}
{
MeanVariance meanauc = new MeanVariance();
@@ -347,33 +345,32 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
{
DBIDs random = DBIDUtil.randomSample(candidates, ensemble.size(), (long)i);
for (DBIDIter iter = random.iter(); iter.valid(); iter.advance()) {
- DBID id = iter.getDBID();
- assert (!firstid.equals(id));
+ assert (!DBIDUtil.equal(firstid, iter));
// logger.verbose("Using: "+labels.get(id));
- final NumberVector<?, ?> vec = relation.get(id);
+ final NumberVector<?> vec = relation.get(iter);
for(int d = 0; d < dim; d++) {
- randomensemble[d] += vec.doubleValue(d + 1);
+ randomensemble[d] += vec.doubleValue(d);
}
}
for(int d = 0; d < dim; d++) {
randomensemble[d] /= ensemble.size();
}
}
- NumberVector<?, ?> randomvec = refvec.newNumberVector(randomensemble);
+ NumberVector<?> randomvec = factory.newNumberVector(randomensemble);
double auc = computeROCAUC(randomvec, positive, dim);
meanauc.put(auc);
double cost = tdist.doubleDistance(randomvec, refvec);
meancost.put(cost);
}
- logger.verbose("Random ensemble AUC: " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev()));
- logger.verbose("Random ensemble Gain: " + gain(meanauc.getMean(), bestauc, 1));
- logger.verbose("Greedy improvement: " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations.");
- logger.verbose("Random ensemble Cost: " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meanauc.getSampleStddev()));
- logger.verbose("Random ensemble Gain: " + gain(meancost.getMean(), bestcost, 0));
- logger.verbose("Greedy improvement: " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations.");
- logger.verbose("Naive ensemble Gain to random: " + gain(naiveauc, meanauc.getMean(), 1) + " cost gain: " + gain(naivecost, meancost.getMean(), 0));
- logger.verbose("Random ensemble Gain to naive: " + gain(meanauc.getMean(), naiveauc, 1) + " cost gain: " + gain(meancost.getMean(), naivecost, 0));
- logger.verbose("Greedy ensemble Gain to random: " + gain(greedyauc, meanauc.getMean(), 1) + " cost gain: " + gain(greedycost, meancost.getMean(), 0));
+ LOG.verbose("Random ensemble AUC: " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev()));
+ LOG.verbose("Random ensemble Gain: " + gain(meanauc.getMean(), bestauc, 1));
+ LOG.verbose("Greedy improvement: " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations.");
+ LOG.verbose("Random ensemble Cost: " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meanauc.getSampleStddev()));
+ LOG.verbose("Random ensemble Gain: " + gain(meancost.getMean(), bestcost, 0));
+ LOG.verbose("Greedy improvement: " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations.");
+ LOG.verbose("Naive ensemble Gain to random: " + gain(naiveauc, meanauc.getMean(), 1) + " cost gain: " + gain(naivecost, meancost.getMean(), 0));
+ LOG.verbose("Random ensemble Gain to naive: " + gain(meanauc.getMean(), naiveauc, 1) + " cost gain: " + gain(meancost.getMean(), naivecost, 0));
+ LOG.verbose("Greedy ensemble Gain to random: " + gain(greedyauc, meanauc.getMean(), 1) + " cost gain: " + gain(greedycost, meancost.getMean(), 0));
}
}
@@ -390,21 +387,29 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
}
}
- private PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> getDistanceFunction(double[] estimated_weights) {
+ private PrimitiveDoubleDistanceFunction<NumberVector<?>> getDistanceFunction(double[] estimated_weights) {
// return new WeightedSquaredEuclideanDistanceFunction(estimated_weights);
// return new WeightedLPNormDistanceFunction(1.0, estimated_weights);
return new WeightedPearsonCorrelationDistanceFunction(estimated_weights);
}
- private double computeROCAUC(NumberVector<?, ?> vec, Set<Integer> positive, int dim) {
+ private double computeROCAUC(NumberVector<?> vec, Set<Integer> positive, int dim) {
final DoubleIntPair[] scores = new DoubleIntPair[dim];
for(int d = 0; d < dim; d++) {
- scores[d] = new DoubleIntPair(vec.doubleValue(d + 1), d);
+ scores[d] = new DoubleIntPair(vec.doubleValue(d), d);
}
Arrays.sort(scores, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR));
return XYCurve.areaUnderCurve(ROC.materializeROC(dim, positive, Arrays.asList(scores).iterator()));
}
+ /**
+ * Compute the gain coefficient.
+ *
+ * @param score New score
+ * @param ref Reference score
+ * @param optimal Maximum score possible
+ * @return Gain
+ */
double gain(double score, double ref, double optimal) {
return 1 - ((optimal - score) / (optimal - ref));
}
@@ -418,7 +423,7 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
*/
public static class Parameterizer extends AbstractApplication.Parameterizer {
/**
- * Data source
+ * Data source.
*/
InputStep inputstep;
@@ -431,7 +436,7 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
@Override
protected AbstractApplication makeInstance() {
- return new GreedyEnsembleExperiment(verbose, inputstep);
+ return new GreedyEnsembleExperiment(inputstep);
}
}