diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java | 161 |
1 files changed, 83 insertions, 78 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java index 6b2f9e13..f3f9c101 100644 --- a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java +++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java @@ -34,11 +34,14 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair; import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.correlation.WeightedPearsonCorrelationDistanceFunction; import de.lmu.ifi.dbs.elki.evaluation.roc.ROC; @@ -52,7 +55,6 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair; -import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; import de.lmu.ifi.dbs.elki.workflow.InputStep; /** @@ -84,9 +86,9 @@ import de.lmu.ifi.dbs.elki.workflow.InputStep; @Reference(authors = "E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel", title = "On Evaluation of Outlier Rankings and Outlier Scores", booktitle = "Proc. 12th SIAM International Conference on Data Mining (SDM), Anaheim, CA, 2012.") public class GreedyEnsembleExperiment extends AbstractApplication { /** - * Get static logger + * Get static logger. */ - private static final Logging logger = Logging.getLogger(GreedyEnsembleExperiment.class); + private static final Logging LOG = Logging.getLogger(GreedyEnsembleExperiment.class); /** * The data input part. @@ -101,11 +103,10 @@ public class GreedyEnsembleExperiment extends AbstractApplication { /** * Constructor. * - * @param verbose Verbosity * @param inputstep Input step */ - public GreedyEnsembleExperiment(boolean verbose, InputStep inputstep) { - super(verbose); + public GreedyEnsembleExperiment(InputStep inputstep) { + super(); this.inputstep = inputstep; } @@ -114,22 +115,23 @@ public class GreedyEnsembleExperiment extends AbstractApplication { // Note: the database contains the *result vectors*, not the original data // points. final Database database = inputstep.getDatabase(); - final Relation<NumberVector<?, ?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD); + final Relation<NumberVector<?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD); + final NumberVector.Factory<NumberVector<?>, ?> factory = RelationUtil.getNumberVectorFactory(relation); final Relation<String> labels = DatabaseUtil.guessLabelRepresentation(database); - final DBID firstid = labels.iterDBIDs().getDBID(); + final DBID firstid = DBIDUtil.deref(labels.iterDBIDs()); final String firstlabel = labels.get(firstid); if(!firstlabel.matches("bylabel")) { throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!"); } // Dimensionality and reference vector - final int dim = DatabaseUtil.dimensionality(relation); - final NumberVector<?, ?> refvec = relation.get(firstid); + final int dim = RelationUtil.dimensionality(relation); + final NumberVector<?> refvec = relation.get(firstid); // Build the positive index set for ROC AUC. Set<Integer> positive = new TreeSet<Integer>(); for(int d = 0; d < dim; d++) { - if(refvec.doubleValue(d + 1) > 0) { + if(refvec.doubleValue(d) > 0) { positive.add(d); } } @@ -140,18 +142,17 @@ public class GreedyEnsembleExperiment extends AbstractApplication { // Find the top-k for each ensemble member { for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - DBID id = iditer.getDBID(); // Skip "by label", obviously - if(firstid.sameDBID(id)) { + if(DBIDUtil.equal(firstid, iditer)) { continue; } - final NumberVector<?, ?> vec = relation.get(id); + final NumberVector<?> vec = relation.get(iditer); TiedTopBoundedHeap<DoubleIntPair> heap = new TiedTopBoundedHeap<DoubleIntPair>(estimated_outliers, Collections.reverseOrder()); for(int i = 0; i < dim; i++) { - heap.add(new DoubleIntPair(vec.doubleValue(i + 1), i)); + heap.add(new DoubleIntPair(vec.doubleValue(i), i)); } if(heap.size() >= 2 * estimated_outliers) { - logger.warning("Too many ties. Expected: " + estimated_outliers + " got: " + heap.size()); + LOG.warning("Too many ties. Expected: " + estimated_outliers + " got: " + heap.size()); } for(DoubleIntPair pair : heap) { if(outliers_seen[pair.second] == 0) { @@ -164,34 +165,33 @@ public class GreedyEnsembleExperiment extends AbstractApplication { } } } - logger.verbose("Merged top " + estimated_outliers + " outliers to: " + union_outliers + " outliers"); + LOG.verbose("Merged top " + estimated_outliers + " outliers to: " + union_outliers + " outliers"); // Build the final weight vector. final double[] estimated_weights = new double[dim]; final double[] estimated_truth = new double[dim]; updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth); - NumberVector<?, ?> estimated_truth_vec = refvec.newNumberVector(estimated_truth); + NumberVector<?> estimated_truth_vec = factory.newNumberVector(estimated_truth); - PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> wdist = getDistanceFunction(estimated_weights); - PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> tdist = wdist; + PrimitiveDoubleDistanceFunction<NumberVector<?>> wdist = getDistanceFunction(estimated_weights); + PrimitiveDoubleDistanceFunction<NumberVector<?>> tdist = wdist; // Build the naive ensemble: final double[] naiveensemble = new double[dim]; { for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - DBID id = iditer.getDBID(); - if(firstid.equals(id)) { + if(DBIDUtil.equal(firstid, iditer)) { continue; } - final NumberVector<?, ?> vec = relation.get(id); + final NumberVector<?> vec = relation.get(iditer); for(int d = 0; d < dim; d++) { - naiveensemble[d] += vec.doubleValue(d + 1); + naiveensemble[d] += vec.doubleValue(d); } } for(int d = 0; d < dim; d++) { naiveensemble[d] /= (relation.size() - 1); } } - NumberVector<?, ?> naivevec = refvec.newNumberVector(naiveensemble); + NumberVector<?> naivevec = factory.newNumberVector(naiveensemble); // Compute single AUC scores and estimations. // Remember the method most similar to the estimation @@ -204,72 +204,70 @@ public class GreedyEnsembleExperiment extends AbstractApplication { { // Compute individual scores for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - DBID id = iditer.getDBID(); - if(firstid.equals(id)) { + if(DBIDUtil.equal(firstid, iditer)) { continue; } // fout.append(labels.get(id)); - final NumberVector<?, ?> vec = relation.get(id); + final NumberVector<?> vec = relation.get(iditer); double auc = computeROCAUC(vec, positive, dim); double estimated = wdist.doubleDistance(vec, estimated_truth_vec); double cost = tdist.doubleDistance(vec, refvec); - logger.verbose("ROC AUC: " + auc + " estimated " + estimated + " cost " + cost + " " + labels.get(id)); + LOG.verbose("ROC AUC: " + auc + " estimated " + estimated + " cost " + cost + " " + labels.get(iditer)); if(auc > bestauc) { bestauc = auc; - bestaucstr = labels.get(id); + bestaucstr = labels.get(iditer); } if(cost < bestcost) { bestcost = cost; - bestcoststr = labels.get(id); + bestcoststr = labels.get(iditer); } if(estimated < bestest) { bestest = estimated; - bestid = id; + bestid = DBIDUtil.deref(iditer); } } } // Initialize ensemble with "best" method - logger.verbose("Distance function: " + wdist); - logger.verbose("Initial estimation of outliers: " + union_outliers); - logger.verbose("Initializing ensemble with: " + labels.get(bestid)); + LOG.verbose("Distance function: " + wdist); + LOG.verbose("Initial estimation of outliers: " + union_outliers); + LOG.verbose("Initializing ensemble with: " + labels.get(bestid)); ModifiableDBIDs ensemble = DBIDUtil.newArray(bestid); ModifiableDBIDs enscands = DBIDUtil.newHashSet(relation.getDBIDs()); enscands.remove(bestid); enscands.remove(firstid); final double[] greedyensemble = new double[dim]; { - final NumberVector<?, ?> vec = relation.get(bestid); + final NumberVector<?> vec = relation.get(bestid); for(int i = 0; i < dim; i++) { - greedyensemble[i] = vec.doubleValue(i + 1); + greedyensemble[i] = vec.doubleValue(i); } } // Greedily grow the ensemble final double[] testensemble = new double[dim]; while(enscands.size() > 0) { - NumberVector<?, ?> greedyvec = refvec.newNumberVector(greedyensemble); + NumberVector<?> greedyvec = factory.newNumberVector(greedyensemble); // Weighting factors for combining: double s1 = ensemble.size() / (ensemble.size() + 1.); double s2 = 1. / (ensemble.size() + 1.); final int heapsize = enscands.size(); - TopBoundedHeap<DoubleObjPair<DBID>> heap = new TopBoundedHeap<DoubleObjPair<DBID>>(heapsize, Collections.reverseOrder()); + TopBoundedHeap<DoubleDBIDPair> heap = new TopBoundedHeap<DoubleDBIDPair>(heapsize, Collections.reverseOrder()); for (DBIDIter iter = enscands.iter(); iter.valid(); iter.advance()) { - DBID id = iter.getDBID(); - final NumberVector<?, ?> vec = relation.get(id); + final NumberVector<?> vec = relation.get(iter); double diversity = wdist.doubleDistance(vec, greedyvec); - heap.add(new DoubleObjPair<DBID>(diversity, id)); + heap.add(DBIDUtil.newPair(diversity, iter)); } while(heap.size() > 0) { - DBID bestadd = heap.poll().second; + DBIDRef bestadd = heap.poll(); enscands.remove(bestadd); // Update ensemble: - final NumberVector<?, ?> vec = relation.get(bestadd); + final NumberVector<?> vec = relation.get(bestadd); for(int i = 0; i < dim; i++) { - testensemble[i] = greedyensemble[i] * s1 + vec.doubleValue(i + 1) * s2; + testensemble[i] = greedyensemble[i] * s1 + vec.doubleValue(i) * s2; } - NumberVector<?, ?> testvec = refvec.newNumberVector(testensemble); + NumberVector<?> testvec = factory.newNumberVector(testensemble); double oldd = wdist.doubleDistance(estimated_truth_vec, greedyvec); double newd = wdist.doubleDistance(estimated_truth_vec, testvec); // logger.verbose("Distances: " + oldd + " vs. " + newd); @@ -286,7 +284,7 @@ public class GreedyEnsembleExperiment extends AbstractApplication { // Update target vectors and weights TiedTopBoundedHeap<DoubleIntPair> oheap = new TiedTopBoundedHeap<DoubleIntPair>(estimated_outliers, Collections.reverseOrder()); for(int i = 0; i < dim; i++) { - oheap.add(new DoubleIntPair(vec.doubleValue(i + 1), i)); + oheap.add(new DoubleIntPair(vec.doubleValue(i), i)); } for(DoubleIntPair pair : oheap) { assert (outliers_seen[pair.second] > 0); @@ -298,14 +296,14 @@ public class GreedyEnsembleExperiment extends AbstractApplication { } if(refresh) { updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth); - estimated_truth_vec = refvec.newNumberVector(estimated_truth); + estimated_truth_vec = factory.newNumberVector(estimated_truth); } } } } } // Build the improved ensemble: - StringBuffer greedylbl = new StringBuffer(); + StringBuilder greedylbl = new StringBuilder(); { for (DBIDIter iter = ensemble.iter(); iter.valid(); iter.advance()) { if(greedylbl.length() > 0) { @@ -314,27 +312,27 @@ public class GreedyEnsembleExperiment extends AbstractApplication { greedylbl.append(labels.get(iter)); } } - NumberVector<?, ?> greedyvec = refvec.newNumberVector(greedyensemble); - logger.verbose("Estimated outliers remaining: " + union_outliers); - logger.verbose("Greedy ensemble: " + greedylbl.toString()); + NumberVector<?> greedyvec = factory.newNumberVector(greedyensemble); + LOG.verbose("Estimated outliers remaining: " + union_outliers); + LOG.verbose("Greedy ensemble: " + greedylbl.toString()); - logger.verbose("Best single ROC AUC: " + bestauc + " (" + bestaucstr + ")"); - logger.verbose("Best single cost: " + bestcost + " (" + bestcoststr + ")"); + LOG.verbose("Best single ROC AUC: " + bestauc + " (" + bestaucstr + ")"); + LOG.verbose("Best single cost: " + bestcost + " (" + bestcoststr + ")"); // Evaluate the naive ensemble and the "shrunk" ensemble double naiveauc, naivecost; { naiveauc = computeROCAUC(naivevec, positive, dim); naivecost = tdist.doubleDistance(naivevec, refvec); - logger.verbose("Naive ensemble AUC: " + naiveauc + " cost: " + naivecost); - logger.verbose("Naive ensemble Gain: " + gain(naiveauc, bestauc, 1) + " cost gain: " + gain(naivecost, bestcost, 0)); + LOG.verbose("Naive ensemble AUC: " + naiveauc + " cost: " + naivecost); + LOG.verbose("Naive ensemble Gain: " + gain(naiveauc, bestauc, 1) + " cost gain: " + gain(naivecost, bestcost, 0)); } double greedyauc, greedycost; { greedyauc = computeROCAUC(greedyvec, positive, dim); greedycost = tdist.doubleDistance(greedyvec, refvec); - logger.verbose("Greedy ensemble AUC: " + greedyauc + " cost: " + greedycost); - logger.verbose("Greedy ensemble Gain to best: " + gain(greedyauc, bestauc, 1) + " cost gain: " + gain(greedycost, bestcost, 0)); - logger.verbose("Greedy ensemble Gain to naive: " + gain(greedyauc, naiveauc, 1) + " cost gain: " + gain(greedycost, naivecost, 0)); + LOG.verbose("Greedy ensemble AUC: " + greedyauc + " cost: " + greedycost); + LOG.verbose("Greedy ensemble Gain to best: " + gain(greedyauc, bestauc, 1) + " cost gain: " + gain(greedycost, bestcost, 0)); + LOG.verbose("Greedy ensemble Gain to naive: " + gain(greedyauc, naiveauc, 1) + " cost gain: " + gain(greedycost, naivecost, 0)); } { MeanVariance meanauc = new MeanVariance(); @@ -347,33 +345,32 @@ public class GreedyEnsembleExperiment extends AbstractApplication { { DBIDs random = DBIDUtil.randomSample(candidates, ensemble.size(), (long)i); for (DBIDIter iter = random.iter(); iter.valid(); iter.advance()) { - DBID id = iter.getDBID(); - assert (!firstid.equals(id)); + assert (!DBIDUtil.equal(firstid, iter)); // logger.verbose("Using: "+labels.get(id)); - final NumberVector<?, ?> vec = relation.get(id); + final NumberVector<?> vec = relation.get(iter); for(int d = 0; d < dim; d++) { - randomensemble[d] += vec.doubleValue(d + 1); + randomensemble[d] += vec.doubleValue(d); } } for(int d = 0; d < dim; d++) { randomensemble[d] /= ensemble.size(); } } - NumberVector<?, ?> randomvec = refvec.newNumberVector(randomensemble); + NumberVector<?> randomvec = factory.newNumberVector(randomensemble); double auc = computeROCAUC(randomvec, positive, dim); meanauc.put(auc); double cost = tdist.doubleDistance(randomvec, refvec); meancost.put(cost); } - logger.verbose("Random ensemble AUC: " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev())); - logger.verbose("Random ensemble Gain: " + gain(meanauc.getMean(), bestauc, 1)); - logger.verbose("Greedy improvement: " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations."); - logger.verbose("Random ensemble Cost: " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meanauc.getSampleStddev())); - logger.verbose("Random ensemble Gain: " + gain(meancost.getMean(), bestcost, 0)); - logger.verbose("Greedy improvement: " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations."); - logger.verbose("Naive ensemble Gain to random: " + gain(naiveauc, meanauc.getMean(), 1) + " cost gain: " + gain(naivecost, meancost.getMean(), 0)); - logger.verbose("Random ensemble Gain to naive: " + gain(meanauc.getMean(), naiveauc, 1) + " cost gain: " + gain(meancost.getMean(), naivecost, 0)); - logger.verbose("Greedy ensemble Gain to random: " + gain(greedyauc, meanauc.getMean(), 1) + " cost gain: " + gain(greedycost, meancost.getMean(), 0)); + LOG.verbose("Random ensemble AUC: " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev())); + LOG.verbose("Random ensemble Gain: " + gain(meanauc.getMean(), bestauc, 1)); + LOG.verbose("Greedy improvement: " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations."); + LOG.verbose("Random ensemble Cost: " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meanauc.getSampleStddev())); + LOG.verbose("Random ensemble Gain: " + gain(meancost.getMean(), bestcost, 0)); + LOG.verbose("Greedy improvement: " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations."); + LOG.verbose("Naive ensemble Gain to random: " + gain(naiveauc, meanauc.getMean(), 1) + " cost gain: " + gain(naivecost, meancost.getMean(), 0)); + LOG.verbose("Random ensemble Gain to naive: " + gain(meanauc.getMean(), naiveauc, 1) + " cost gain: " + gain(meancost.getMean(), naivecost, 0)); + LOG.verbose("Greedy ensemble Gain to random: " + gain(greedyauc, meanauc.getMean(), 1) + " cost gain: " + gain(greedycost, meancost.getMean(), 0)); } } @@ -390,21 +387,29 @@ public class GreedyEnsembleExperiment extends AbstractApplication { } } - private PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> getDistanceFunction(double[] estimated_weights) { + private PrimitiveDoubleDistanceFunction<NumberVector<?>> getDistanceFunction(double[] estimated_weights) { // return new WeightedSquaredEuclideanDistanceFunction(estimated_weights); // return new WeightedLPNormDistanceFunction(1.0, estimated_weights); return new WeightedPearsonCorrelationDistanceFunction(estimated_weights); } - private double computeROCAUC(NumberVector<?, ?> vec, Set<Integer> positive, int dim) { + private double computeROCAUC(NumberVector<?> vec, Set<Integer> positive, int dim) { final DoubleIntPair[] scores = new DoubleIntPair[dim]; for(int d = 0; d < dim; d++) { - scores[d] = new DoubleIntPair(vec.doubleValue(d + 1), d); + scores[d] = new DoubleIntPair(vec.doubleValue(d), d); } Arrays.sort(scores, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR)); return XYCurve.areaUnderCurve(ROC.materializeROC(dim, positive, Arrays.asList(scores).iterator())); } + /** + * Compute the gain coefficient. + * + * @param score New score + * @param ref Reference score + * @param optimal Maximum score possible + * @return Gain + */ double gain(double score, double ref, double optimal) { return 1 - ((optimal - score) / (optimal - ref)); } @@ -418,7 +423,7 @@ public class GreedyEnsembleExperiment extends AbstractApplication { */ public static class Parameterizer extends AbstractApplication.Parameterizer { /** - * Data source + * Data source. */ InputStep inputstep; @@ -431,7 +436,7 @@ public class GreedyEnsembleExperiment extends AbstractApplication { @Override protected AbstractApplication makeInstance() { - return new GreedyEnsembleExperiment(verbose, inputstep); + return new GreedyEnsembleExperiment(inputstep); } } |