diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/application/greedyensemble')
3 files changed, 163 insertions, 148 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java index de1ddbec..5f5074a2 100644 --- a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java +++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java @@ -43,8 +43,8 @@ import de.lmu.ifi.dbs.elki.application.AbstractApplication; import de.lmu.ifi.dbs.elki.data.DoubleVector; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.QueryUtil; -import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery; @@ -94,7 +94,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends /** * Our logger class. */ - private static final Logging logger = Logging.getLogger(ComputeKNNOutlierScores.class); + private static final Logging LOG = Logging.getLogger(ComputeKNNOutlierScores.class); /** * Input step @@ -139,7 +139,6 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends /** * Constructor. * - * @param verbose Verbose flag * @param inputstep Input step * @param distf Distance function * @param startk Starting value of k @@ -148,8 +147,8 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends * @param bylabel By label outlier (reference) * @param outfile Output file */ - public ComputeKNNOutlierScores(boolean verbose, InputStep inputstep, DistanceFunction<? super O, D> distf, int startk, int stepk, int maxk, ByLabelOutlier bylabel, File outfile) { - super(verbose); + public ComputeKNNOutlierScores(InputStep inputstep, DistanceFunction<? super O, D> distf, int startk, int stepk, int maxk, ByLabelOutlier bylabel, File outfile) { + super(); this.distf = distf; this.startk = startk; this.stepk = stepk; @@ -163,14 +162,14 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends public void run() { final Database database = inputstep.getDatabase(); final Relation<O> relation = database.getRelation(distf.getInputTypeRestriction()); - logger.verbose("Running preprocessor ..."); + LOG.verbose("Running preprocessor ..."); MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<O, D>(relation, distf, maxk + 2); database.addIndex(preproc); // Test that we did get a proper index query KNNQuery<O, D> knnq = QueryUtil.getKNNQuery(relation, distf); if(!(knnq instanceof PreprocessorKNNQuery)) { - logger.warning("Not using preprocessor knn query -- KNN queries using class: " + knnq.getClass()); + LOG.warning("Not using preprocessor knn query -- KNN queries using class: " + knnq.getClass()); } final DBIDs ids = relation.getDBIDs(); @@ -187,9 +186,8 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends try { MessageDigest md = MessageDigest.getInstance("MD5"); for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { - DBID id = iter.getDBID(); md.update(" ".getBytes()); - md.update(id.toString().getBytes()); + md.update(DBIDUtil.toString(iter).getBytes()); } fout.append("# DBID-series MD5:"); fout.append(Base64.encodeBase64(md.digest())); @@ -214,7 +212,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends } // KNN - logger.verbose("Running KNN"); + LOG.verbose("Running KNN"); runForEachK(new AlgRunner() { @Override public void run(int k, String kstr) { @@ -228,7 +226,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends } }); // KNN Weight - logger.verbose("Running KNNweight"); + LOG.verbose("Running KNNweight"); runForEachK(new AlgRunner() { @Override public void run(int k, String kstr) { @@ -242,7 +240,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends } }); // Run LOF - logger.verbose("Running LOF"); + LOG.verbose("Running LOF"); runForEachK(new AlgRunner() { @Override public void run(int k, String kstr) { @@ -256,7 +254,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends } }); // LoOP - logger.verbose("Running LoOP"); + LOG.verbose("Running LoOP"); runForEachK(new AlgRunner() { @Override public void run(int k, String kstr) { @@ -267,7 +265,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends } }); // LDOF - logger.verbose("Running LDOF"); + LOG.verbose("Running LDOF"); runForEachK(new AlgRunner() { @Override public void run(int k, String kstr) { @@ -286,7 +284,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends final PolynomialKernelFunction poly = new PolynomialKernelFunction(PolynomialKernelFunction.DEFAULT_DEGREE); @SuppressWarnings("unchecked") final DistanceFunction<DoubleVector, DoubleDistance> df = DistanceFunction.class.cast(distf); - logger.verbose("Running ABOD"); + LOG.verbose("Running ABOD"); runForEachK(new AlgRunner() { @Override public void run(int k, String kstr) { @@ -302,7 +300,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends } catch(ClassCastException e) { // ABOD might just be not appropriate. - logger.warning("Running ABOD failed - probably not appropriate to this data type / distance?", e); + LOG.warning("Running ABOD failed - probably not appropriate to this data type / distance?", e); } } } @@ -375,17 +373,17 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends /** * Option ID for k step size. */ - public static final OptionID STEPK_ID = OptionID.getOrCreateOptionID("stepk", "Step size for k."); + public static final OptionID STEPK_ID = new OptionID("stepk", "Step size for k."); /** * Option ID for k start size. */ - public static final OptionID STARTK_ID = OptionID.getOrCreateOptionID("startk", "Minimum value for k."); + public static final OptionID STARTK_ID = new OptionID("startk", "Minimum value for k."); /** * Option ID for k step size. */ - public static final OptionID MAXK_ID = OptionID.getOrCreateOptionID("maxk", "Maximum value for k."); + public static final OptionID MAXK_ID = new OptionID("maxk", "Maximum value for k."); /** * k step size @@ -433,18 +431,21 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends distf = distP.instantiateClass(config); } // k parameters - IntParameter stepkP = new IntParameter(STEPK_ID, new GreaterConstraint(0)); + IntParameter stepkP = new IntParameter(STEPK_ID); + stepkP.addConstraint(new GreaterConstraint(0)); if(config.grab(stepkP)) { stepk = stepkP.getValue(); } - IntParameter startkP = new IntParameter(STARTK_ID, true); + IntParameter startkP = new IntParameter(STARTK_ID); + startkP.setOptional(true); if(config.grab(startkP)) { startk = startkP.getValue(); } else { startk = stepk; } - IntParameter maxkP = new IntParameter(MAXK_ID, new GreaterConstraint(0)); + IntParameter maxkP = new IntParameter(MAXK_ID); + maxkP.addConstraint(new GreaterConstraint(0)); if(config.grab(maxkP)) { maxk = maxkP.getValue(); } @@ -455,7 +456,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends @Override protected AbstractApplication makeInstance() { - return new ComputeKNNOutlierScores<O, D>(verbose, inputstep, distf, startk, stepk, maxk, bylabel, outfile); + return new ComputeKNNOutlierScores<O, D>(inputstep, distf, startk, stepk, maxk, bylabel, outfile); } } diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java index 6b2f9e13..f3f9c101 100644 --- a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java +++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java @@ -34,11 +34,14 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair; import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.correlation.WeightedPearsonCorrelationDistanceFunction; import de.lmu.ifi.dbs.elki.evaluation.roc.ROC; @@ -52,7 +55,6 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair; -import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; import de.lmu.ifi.dbs.elki.workflow.InputStep; /** @@ -84,9 +86,9 @@ import de.lmu.ifi.dbs.elki.workflow.InputStep; @Reference(authors = "E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel", title = "On Evaluation of Outlier Rankings and Outlier Scores", booktitle = "Proc. 12th SIAM International Conference on Data Mining (SDM), Anaheim, CA, 2012.") public class GreedyEnsembleExperiment extends AbstractApplication { /** - * Get static logger + * Get static logger. */ - private static final Logging logger = Logging.getLogger(GreedyEnsembleExperiment.class); + private static final Logging LOG = Logging.getLogger(GreedyEnsembleExperiment.class); /** * The data input part. @@ -101,11 +103,10 @@ public class GreedyEnsembleExperiment extends AbstractApplication { /** * Constructor. * - * @param verbose Verbosity * @param inputstep Input step */ - public GreedyEnsembleExperiment(boolean verbose, InputStep inputstep) { - super(verbose); + public GreedyEnsembleExperiment(InputStep inputstep) { + super(); this.inputstep = inputstep; } @@ -114,22 +115,23 @@ public class GreedyEnsembleExperiment extends AbstractApplication { // Note: the database contains the *result vectors*, not the original data // points. final Database database = inputstep.getDatabase(); - final Relation<NumberVector<?, ?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD); + final Relation<NumberVector<?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD); + final NumberVector.Factory<NumberVector<?>, ?> factory = RelationUtil.getNumberVectorFactory(relation); final Relation<String> labels = DatabaseUtil.guessLabelRepresentation(database); - final DBID firstid = labels.iterDBIDs().getDBID(); + final DBID firstid = DBIDUtil.deref(labels.iterDBIDs()); final String firstlabel = labels.get(firstid); if(!firstlabel.matches("bylabel")) { throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!"); } // Dimensionality and reference vector - final int dim = DatabaseUtil.dimensionality(relation); - final NumberVector<?, ?> refvec = relation.get(firstid); + final int dim = RelationUtil.dimensionality(relation); + final NumberVector<?> refvec = relation.get(firstid); // Build the positive index set for ROC AUC. Set<Integer> positive = new TreeSet<Integer>(); for(int d = 0; d < dim; d++) { - if(refvec.doubleValue(d + 1) > 0) { + if(refvec.doubleValue(d) > 0) { positive.add(d); } } @@ -140,18 +142,17 @@ public class GreedyEnsembleExperiment extends AbstractApplication { // Find the top-k for each ensemble member { for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - DBID id = iditer.getDBID(); // Skip "by label", obviously - if(firstid.sameDBID(id)) { + if(DBIDUtil.equal(firstid, iditer)) { continue; } - final NumberVector<?, ?> vec = relation.get(id); + final NumberVector<?> vec = relation.get(iditer); TiedTopBoundedHeap<DoubleIntPair> heap = new TiedTopBoundedHeap<DoubleIntPair>(estimated_outliers, Collections.reverseOrder()); for(int i = 0; i < dim; i++) { - heap.add(new DoubleIntPair(vec.doubleValue(i + 1), i)); + heap.add(new DoubleIntPair(vec.doubleValue(i), i)); } if(heap.size() >= 2 * estimated_outliers) { - logger.warning("Too many ties. Expected: " + estimated_outliers + " got: " + heap.size()); + LOG.warning("Too many ties. Expected: " + estimated_outliers + " got: " + heap.size()); } for(DoubleIntPair pair : heap) { if(outliers_seen[pair.second] == 0) { @@ -164,34 +165,33 @@ public class GreedyEnsembleExperiment extends AbstractApplication { } } } - logger.verbose("Merged top " + estimated_outliers + " outliers to: " + union_outliers + " outliers"); + LOG.verbose("Merged top " + estimated_outliers + " outliers to: " + union_outliers + " outliers"); // Build the final weight vector. final double[] estimated_weights = new double[dim]; final double[] estimated_truth = new double[dim]; updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth); - NumberVector<?, ?> estimated_truth_vec = refvec.newNumberVector(estimated_truth); + NumberVector<?> estimated_truth_vec = factory.newNumberVector(estimated_truth); - PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> wdist = getDistanceFunction(estimated_weights); - PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> tdist = wdist; + PrimitiveDoubleDistanceFunction<NumberVector<?>> wdist = getDistanceFunction(estimated_weights); + PrimitiveDoubleDistanceFunction<NumberVector<?>> tdist = wdist; // Build the naive ensemble: final double[] naiveensemble = new double[dim]; { for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - DBID id = iditer.getDBID(); - if(firstid.equals(id)) { + if(DBIDUtil.equal(firstid, iditer)) { continue; } - final NumberVector<?, ?> vec = relation.get(id); + final NumberVector<?> vec = relation.get(iditer); for(int d = 0; d < dim; d++) { - naiveensemble[d] += vec.doubleValue(d + 1); + naiveensemble[d] += vec.doubleValue(d); } } for(int d = 0; d < dim; d++) { naiveensemble[d] /= (relation.size() - 1); } } - NumberVector<?, ?> naivevec = refvec.newNumberVector(naiveensemble); + NumberVector<?> naivevec = factory.newNumberVector(naiveensemble); // Compute single AUC scores and estimations. // Remember the method most similar to the estimation @@ -204,72 +204,70 @@ public class GreedyEnsembleExperiment extends AbstractApplication { { // Compute individual scores for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { - DBID id = iditer.getDBID(); - if(firstid.equals(id)) { + if(DBIDUtil.equal(firstid, iditer)) { continue; } // fout.append(labels.get(id)); - final NumberVector<?, ?> vec = relation.get(id); + final NumberVector<?> vec = relation.get(iditer); double auc = computeROCAUC(vec, positive, dim); double estimated = wdist.doubleDistance(vec, estimated_truth_vec); double cost = tdist.doubleDistance(vec, refvec); - logger.verbose("ROC AUC: " + auc + " estimated " + estimated + " cost " + cost + " " + labels.get(id)); + LOG.verbose("ROC AUC: " + auc + " estimated " + estimated + " cost " + cost + " " + labels.get(iditer)); if(auc > bestauc) { bestauc = auc; - bestaucstr = labels.get(id); + bestaucstr = labels.get(iditer); } if(cost < bestcost) { bestcost = cost; - bestcoststr = labels.get(id); + bestcoststr = labels.get(iditer); } if(estimated < bestest) { bestest = estimated; - bestid = id; + bestid = DBIDUtil.deref(iditer); } } } // Initialize ensemble with "best" method - logger.verbose("Distance function: " + wdist); - logger.verbose("Initial estimation of outliers: " + union_outliers); - logger.verbose("Initializing ensemble with: " + labels.get(bestid)); + LOG.verbose("Distance function: " + wdist); + LOG.verbose("Initial estimation of outliers: " + union_outliers); + LOG.verbose("Initializing ensemble with: " + labels.get(bestid)); ModifiableDBIDs ensemble = DBIDUtil.newArray(bestid); ModifiableDBIDs enscands = DBIDUtil.newHashSet(relation.getDBIDs()); enscands.remove(bestid); enscands.remove(firstid); final double[] greedyensemble = new double[dim]; { - final NumberVector<?, ?> vec = relation.get(bestid); + final NumberVector<?> vec = relation.get(bestid); for(int i = 0; i < dim; i++) { - greedyensemble[i] = vec.doubleValue(i + 1); + greedyensemble[i] = vec.doubleValue(i); } } // Greedily grow the ensemble final double[] testensemble = new double[dim]; while(enscands.size() > 0) { - NumberVector<?, ?> greedyvec = refvec.newNumberVector(greedyensemble); + NumberVector<?> greedyvec = factory.newNumberVector(greedyensemble); // Weighting factors for combining: double s1 = ensemble.size() / (ensemble.size() + 1.); double s2 = 1. / (ensemble.size() + 1.); final int heapsize = enscands.size(); - TopBoundedHeap<DoubleObjPair<DBID>> heap = new TopBoundedHeap<DoubleObjPair<DBID>>(heapsize, Collections.reverseOrder()); + TopBoundedHeap<DoubleDBIDPair> heap = new TopBoundedHeap<DoubleDBIDPair>(heapsize, Collections.reverseOrder()); for (DBIDIter iter = enscands.iter(); iter.valid(); iter.advance()) { - DBID id = iter.getDBID(); - final NumberVector<?, ?> vec = relation.get(id); + final NumberVector<?> vec = relation.get(iter); double diversity = wdist.doubleDistance(vec, greedyvec); - heap.add(new DoubleObjPair<DBID>(diversity, id)); + heap.add(DBIDUtil.newPair(diversity, iter)); } while(heap.size() > 0) { - DBID bestadd = heap.poll().second; + DBIDRef bestadd = heap.poll(); enscands.remove(bestadd); // Update ensemble: - final NumberVector<?, ?> vec = relation.get(bestadd); + final NumberVector<?> vec = relation.get(bestadd); for(int i = 0; i < dim; i++) { - testensemble[i] = greedyensemble[i] * s1 + vec.doubleValue(i + 1) * s2; + testensemble[i] = greedyensemble[i] * s1 + vec.doubleValue(i) * s2; } - NumberVector<?, ?> testvec = refvec.newNumberVector(testensemble); + NumberVector<?> testvec = factory.newNumberVector(testensemble); double oldd = wdist.doubleDistance(estimated_truth_vec, greedyvec); double newd = wdist.doubleDistance(estimated_truth_vec, testvec); // logger.verbose("Distances: " + oldd + " vs. " + newd); @@ -286,7 +284,7 @@ public class GreedyEnsembleExperiment extends AbstractApplication { // Update target vectors and weights TiedTopBoundedHeap<DoubleIntPair> oheap = new TiedTopBoundedHeap<DoubleIntPair>(estimated_outliers, Collections.reverseOrder()); for(int i = 0; i < dim; i++) { - oheap.add(new DoubleIntPair(vec.doubleValue(i + 1), i)); + oheap.add(new DoubleIntPair(vec.doubleValue(i), i)); } for(DoubleIntPair pair : oheap) { assert (outliers_seen[pair.second] > 0); @@ -298,14 +296,14 @@ public class GreedyEnsembleExperiment extends AbstractApplication { } if(refresh) { updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth); - estimated_truth_vec = refvec.newNumberVector(estimated_truth); + estimated_truth_vec = factory.newNumberVector(estimated_truth); } } } } } // Build the improved ensemble: - StringBuffer greedylbl = new StringBuffer(); + StringBuilder greedylbl = new StringBuilder(); { for (DBIDIter iter = ensemble.iter(); iter.valid(); iter.advance()) { if(greedylbl.length() > 0) { @@ -314,27 +312,27 @@ public class GreedyEnsembleExperiment extends AbstractApplication { greedylbl.append(labels.get(iter)); } } - NumberVector<?, ?> greedyvec = refvec.newNumberVector(greedyensemble); - logger.verbose("Estimated outliers remaining: " + union_outliers); - logger.verbose("Greedy ensemble: " + greedylbl.toString()); + NumberVector<?> greedyvec = factory.newNumberVector(greedyensemble); + LOG.verbose("Estimated outliers remaining: " + union_outliers); + LOG.verbose("Greedy ensemble: " + greedylbl.toString()); - logger.verbose("Best single ROC AUC: " + bestauc + " (" + bestaucstr + ")"); - logger.verbose("Best single cost: " + bestcost + " (" + bestcoststr + ")"); + LOG.verbose("Best single ROC AUC: " + bestauc + " (" + bestaucstr + ")"); + LOG.verbose("Best single cost: " + bestcost + " (" + bestcoststr + ")"); // Evaluate the naive ensemble and the "shrunk" ensemble double naiveauc, naivecost; { naiveauc = computeROCAUC(naivevec, positive, dim); naivecost = tdist.doubleDistance(naivevec, refvec); - logger.verbose("Naive ensemble AUC: " + naiveauc + " cost: " + naivecost); - logger.verbose("Naive ensemble Gain: " + gain(naiveauc, bestauc, 1) + " cost gain: " + gain(naivecost, bestcost, 0)); + LOG.verbose("Naive ensemble AUC: " + naiveauc + " cost: " + naivecost); + LOG.verbose("Naive ensemble Gain: " + gain(naiveauc, bestauc, 1) + " cost gain: " + gain(naivecost, bestcost, 0)); } double greedyauc, greedycost; { greedyauc = computeROCAUC(greedyvec, positive, dim); greedycost = tdist.doubleDistance(greedyvec, refvec); - logger.verbose("Greedy ensemble AUC: " + greedyauc + " cost: " + greedycost); - logger.verbose("Greedy ensemble Gain to best: " + gain(greedyauc, bestauc, 1) + " cost gain: " + gain(greedycost, bestcost, 0)); - logger.verbose("Greedy ensemble Gain to naive: " + gain(greedyauc, naiveauc, 1) + " cost gain: " + gain(greedycost, naivecost, 0)); + LOG.verbose("Greedy ensemble AUC: " + greedyauc + " cost: " + greedycost); + LOG.verbose("Greedy ensemble Gain to best: " + gain(greedyauc, bestauc, 1) + " cost gain: " + gain(greedycost, bestcost, 0)); + LOG.verbose("Greedy ensemble Gain to naive: " + gain(greedyauc, naiveauc, 1) + " cost gain: " + gain(greedycost, naivecost, 0)); } { MeanVariance meanauc = new MeanVariance(); @@ -347,33 +345,32 @@ public class GreedyEnsembleExperiment extends AbstractApplication { { DBIDs random = DBIDUtil.randomSample(candidates, ensemble.size(), (long)i); for (DBIDIter iter = random.iter(); iter.valid(); iter.advance()) { - DBID id = iter.getDBID(); - assert (!firstid.equals(id)); + assert (!DBIDUtil.equal(firstid, iter)); // logger.verbose("Using: "+labels.get(id)); - final NumberVector<?, ?> vec = relation.get(id); + final NumberVector<?> vec = relation.get(iter); for(int d = 0; d < dim; d++) { - randomensemble[d] += vec.doubleValue(d + 1); + randomensemble[d] += vec.doubleValue(d); } } for(int d = 0; d < dim; d++) { randomensemble[d] /= ensemble.size(); } } - NumberVector<?, ?> randomvec = refvec.newNumberVector(randomensemble); + NumberVector<?> randomvec = factory.newNumberVector(randomensemble); double auc = computeROCAUC(randomvec, positive, dim); meanauc.put(auc); double cost = tdist.doubleDistance(randomvec, refvec); meancost.put(cost); } - logger.verbose("Random ensemble AUC: " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev())); - logger.verbose("Random ensemble Gain: " + gain(meanauc.getMean(), bestauc, 1)); - logger.verbose("Greedy improvement: " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations."); - logger.verbose("Random ensemble Cost: " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meanauc.getSampleStddev())); - logger.verbose("Random ensemble Gain: " + gain(meancost.getMean(), bestcost, 0)); - logger.verbose("Greedy improvement: " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations."); - logger.verbose("Naive ensemble Gain to random: " + gain(naiveauc, meanauc.getMean(), 1) + " cost gain: " + gain(naivecost, meancost.getMean(), 0)); - logger.verbose("Random ensemble Gain to naive: " + gain(meanauc.getMean(), naiveauc, 1) + " cost gain: " + gain(meancost.getMean(), naivecost, 0)); - logger.verbose("Greedy ensemble Gain to random: " + gain(greedyauc, meanauc.getMean(), 1) + " cost gain: " + gain(greedycost, meancost.getMean(), 0)); + LOG.verbose("Random ensemble AUC: " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev())); + LOG.verbose("Random ensemble Gain: " + gain(meanauc.getMean(), bestauc, 1)); + LOG.verbose("Greedy improvement: " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations."); + LOG.verbose("Random ensemble Cost: " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meanauc.getSampleStddev())); + LOG.verbose("Random ensemble Gain: " + gain(meancost.getMean(), bestcost, 0)); + LOG.verbose("Greedy improvement: " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations."); + LOG.verbose("Naive ensemble Gain to random: " + gain(naiveauc, meanauc.getMean(), 1) + " cost gain: " + gain(naivecost, meancost.getMean(), 0)); + LOG.verbose("Random ensemble Gain to naive: " + gain(meanauc.getMean(), naiveauc, 1) + " cost gain: " + gain(meancost.getMean(), naivecost, 0)); + LOG.verbose("Greedy ensemble Gain to random: " + gain(greedyauc, meanauc.getMean(), 1) + " cost gain: " + gain(greedycost, meancost.getMean(), 0)); } } @@ -390,21 +387,29 @@ public class GreedyEnsembleExperiment extends AbstractApplication { } } - private PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> getDistanceFunction(double[] estimated_weights) { + private PrimitiveDoubleDistanceFunction<NumberVector<?>> getDistanceFunction(double[] estimated_weights) { // return new WeightedSquaredEuclideanDistanceFunction(estimated_weights); // return new WeightedLPNormDistanceFunction(1.0, estimated_weights); return new WeightedPearsonCorrelationDistanceFunction(estimated_weights); } - private double computeROCAUC(NumberVector<?, ?> vec, Set<Integer> positive, int dim) { + private double computeROCAUC(NumberVector<?> vec, Set<Integer> positive, int dim) { final DoubleIntPair[] scores = new DoubleIntPair[dim]; for(int d = 0; d < dim; d++) { - scores[d] = new DoubleIntPair(vec.doubleValue(d + 1), d); + scores[d] = new DoubleIntPair(vec.doubleValue(d), d); } Arrays.sort(scores, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR)); return XYCurve.areaUnderCurve(ROC.materializeROC(dim, positive, Arrays.asList(scores).iterator())); } + /** + * Compute the gain coefficient. + * + * @param score New score + * @param ref Reference score + * @param optimal Maximum score possible + * @return Gain + */ double gain(double score, double ref, double optimal) { return 1 - ((optimal - score) / (optimal - ref)); } @@ -418,7 +423,7 @@ public class GreedyEnsembleExperiment extends AbstractApplication { */ public static class Parameterizer extends AbstractApplication.Parameterizer { /** - * Data source + * Data source. */ InputStep inputstep; @@ -431,7 +436,7 @@ public class GreedyEnsembleExperiment extends AbstractApplication { @Override protected AbstractApplication makeInstance() { - return new GreedyEnsembleExperiment(verbose, inputstep); + return new GreedyEnsembleExperiment(inputstep); } } diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java index 105eeabc..afb9c122 100644 --- a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java +++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java @@ -38,8 +38,11 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.evaluation.roc.ROC; import de.lmu.ifi.dbs.elki.evaluation.similaritymatrix.ComputeSimilarityMatrixImage; import de.lmu.ifi.dbs.elki.evaluation.similaritymatrix.ComputeSimilarityMatrixImage.SimilarityMatrix; @@ -80,13 +83,16 @@ import de.lmu.ifi.dbs.elki.workflow.InputStep; * </p> * * @author Erich Schubert + * + * @apiviz.composedOf VisualizerParameterizer + * @apiviz.composedOf SimilarityMatrixVisualizer */ @Reference(authors = "E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel", title = "On Evaluation of Outlier Rankings and Outlier Scores", booktitle = "Proc. 12th SIAM International Conference on Data Mining (SDM), Anaheim, CA, 2012.") public class VisualizePairwiseGainMatrix extends AbstractApplication { /** - * Get static logger + * Get static logger. */ - private static final Logging logger = Logging.getLogger(VisualizePairwiseGainMatrix.class); + private static final Logging LOG = Logging.getLogger(VisualizePairwiseGainMatrix.class); /** * The data input part. @@ -94,19 +100,18 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication { private InputStep inputstep; /** - * Parameterizer for visualizers + * Parameterizer for visualizers. */ private VisualizerParameterizer vispar; /** * Constructor. * - * @param verbose Verbosity * @param inputstep Input step * @param vispar Visualizer parameterizer */ - public VisualizePairwiseGainMatrix(boolean verbose, InputStep inputstep, VisualizerParameterizer vispar) { - super(verbose); + public VisualizePairwiseGainMatrix(InputStep inputstep, VisualizerParameterizer vispar) { + super(); this.inputstep = inputstep; this.vispar = vispar; } @@ -114,17 +119,17 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication { @Override public void run() { final Database database = inputstep.getDatabase(); - final Relation<NumberVector<?, ?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD); + final Relation<NumberVector<?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD); final Relation<String> labels = DatabaseUtil.guessLabelRepresentation(database); - final DBID firstid = labels.iterDBIDs().getDBID(); + final DBID firstid = DBIDUtil.deref(labels.iterDBIDs()); final String firstlabel = labels.get(firstid); if(!firstlabel.matches("bylabel")) { throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!"); } // Dimensionality and reference vector - final int dim = DatabaseUtil.dimensionality(relation); - final NumberVector<?, ?> refvec = relation.get(firstid); + final int dim = RelationUtil.dimensionality(relation); + final NumberVector<?> refvec = relation.get(firstid); // Build the truth vector Set<Integer> pos = new TreeSet<Integer>(); @@ -132,7 +137,7 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication { { for(int d = 0; d < dim; d++) { combined[d] = new DoubleIntPair(0, d); - if(refvec.doubleValue(d + 1) > 0) { + if(refvec.doubleValue(d) > 0) { pos.add(d); } } @@ -145,34 +150,39 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication { double[][] data = new double[size][size]; DoubleMinMax minmax = new DoubleMinMax(); - for(int a = 0; a < size; a++) { - final NumberVector<?, ?> veca = relation.get(ids.get(a)); - // Direct AUC score: - { - for(int d = 0; d < dim; d++) { - combined[d].first = veca.doubleValue(d + 1); - combined[d].second = d; + { + int a = 0; + for(DBIDIter id = ids.iter(); id.valid(); id.advance(), a++) { + final NumberVector<?> veca = relation.get(id); + // Direct AUC score: + { + for(int d = 0; d < dim; d++) { + combined[d].first = veca.doubleValue(d); + combined[d].second = d; + } + Arrays.sort(combined, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR)); + double auc = XYCurve.areaUnderCurve(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator())); + data[a][a] = auc; + // minmax.put(auc); + // logger.verbose(auc + " " + labels.get(ids.get(a))); } - Arrays.sort(combined, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR)); - double auc = XYCurve.areaUnderCurve(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator())); - data[a][a] = auc; - // minmax.put(auc); - // logger.verbose(auc + " " + labels.get(ids.get(a))); - } - // Compare to others, exploiting symmetry - for(int b = a + 1; b < size; b++) { - final NumberVector<?, ?> vecb = relation.get(ids.get(b)); - for(int d = 0; d < dim; d++) { - combined[d].first = veca.doubleValue(d + 1) + vecb.doubleValue(d + 1); - combined[d].second = d; + // Compare to others, exploiting symmetry + DBIDArrayIter id2 = ids.iter(); + id2.seek(a + 1); + for(int b = a + 1; b < size; b++, id2.advance()) { + final NumberVector<?> vecb = relation.get(id2); + for(int d = 0; d < dim; d++) { + combined[d].first = veca.doubleValue(d) + vecb.doubleValue(d); + combined[d].second = d; + } + Arrays.sort(combined, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR)); + double auc = XYCurve.areaUnderCurve(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator())); + // logger.verbose(auc + " " + labels.get(ids.get(a)) + " " + + // labels.get(ids.get(b))); + data[a][b] = auc; + data[b][a] = auc; + // minmax.put(auc); } - Arrays.sort(combined, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR)); - double auc = XYCurve.areaUnderCurve(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator())); - // logger.verbose(auc + " " + labels.get(ids.get(a)) + " " + - // labels.get(ids.get(b))); - data[a][b] = auc; - data[b][a] = auc; - // minmax.put(auc); } } for(int a = 0; a < size; a++) { @@ -189,7 +199,7 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication { data[a][a] = 0; } - logger.verbose(minmax.toString()); + LOG.verbose(minmax.toString()); boolean hasneg = (minmax.getMin() < -1E-3); LinearScaling scale; @@ -233,7 +243,7 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication { VisualizerContext context = vispar.newContext(database); // Attach visualizers to results - SimilarityMatrixVisualizer.Factory factory = new SimilarityMatrixVisualizer.Factory(); + SimilarityMatrixVisualizer factory = new SimilarityMatrixVisualizer(); factory.processNewResult(database, database); List<VisualizationTask> tasks = ResultUtil.filterResults(database, VisualizationTask.class); @@ -247,12 +257,11 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication { /** * Show a single visualization. * - * @param context - * - * @param factory - * @param task + * @param context Visualization context + * @param factory Visualizer factory + * @param task Visualization task */ - private void showVisualization(VisualizerContext context, SimilarityMatrixVisualizer.Factory factory, VisualizationTask task) { + private void showVisualization(VisualizerContext context, SimilarityMatrixVisualizer factory, VisualizationTask task) { SVGPlot plot = new SVGPlot(); Visualization vis = factory.makeVisualization(task.clone(plot, context, null, 1.0, 1.0)); plot.getRoot().appendChild(vis.getLayer()); @@ -273,12 +282,12 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication { */ public static class Parameterizer extends AbstractApplication.Parameterizer { /** - * Data source + * Data source. */ InputStep inputstep; /** - * Parameterizer for visualizers + * Parameterizer for visualizers. */ private VisualizerParameterizer vispar; @@ -293,7 +302,7 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication { @Override protected AbstractApplication makeInstance() { - return new VisualizePairwiseGainMatrix(verbose, inputstep, vispar); + return new VisualizePairwiseGainMatrix(inputstep, vispar); } } |