summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/application/greedyensemble
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/application/greedyensemble')
-rw-r--r--src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java47
-rw-r--r--src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java161
-rw-r--r--src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java103
3 files changed, 163 insertions, 148 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java
index de1ddbec..5f5074a2 100644
--- a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java
+++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/ComputeKNNOutlierScores.java
@@ -43,8 +43,8 @@ import de.lmu.ifi.dbs.elki.application.AbstractApplication;
import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.QueryUtil;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery;
@@ -94,7 +94,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
/**
* Our logger class.
*/
- private static final Logging logger = Logging.getLogger(ComputeKNNOutlierScores.class);
+ private static final Logging LOG = Logging.getLogger(ComputeKNNOutlierScores.class);
/**
* Input step
@@ -139,7 +139,6 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
/**
* Constructor.
*
- * @param verbose Verbose flag
* @param inputstep Input step
* @param distf Distance function
* @param startk Starting value of k
@@ -148,8 +147,8 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
* @param bylabel By label outlier (reference)
* @param outfile Output file
*/
- public ComputeKNNOutlierScores(boolean verbose, InputStep inputstep, DistanceFunction<? super O, D> distf, int startk, int stepk, int maxk, ByLabelOutlier bylabel, File outfile) {
- super(verbose);
+ public ComputeKNNOutlierScores(InputStep inputstep, DistanceFunction<? super O, D> distf, int startk, int stepk, int maxk, ByLabelOutlier bylabel, File outfile) {
+ super();
this.distf = distf;
this.startk = startk;
this.stepk = stepk;
@@ -163,14 +162,14 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
public void run() {
final Database database = inputstep.getDatabase();
final Relation<O> relation = database.getRelation(distf.getInputTypeRestriction());
- logger.verbose("Running preprocessor ...");
+ LOG.verbose("Running preprocessor ...");
MaterializeKNNPreprocessor<O, D> preproc = new MaterializeKNNPreprocessor<O, D>(relation, distf, maxk + 2);
database.addIndex(preproc);
// Test that we did get a proper index query
KNNQuery<O, D> knnq = QueryUtil.getKNNQuery(relation, distf);
if(!(knnq instanceof PreprocessorKNNQuery)) {
- logger.warning("Not using preprocessor knn query -- KNN queries using class: " + knnq.getClass());
+ LOG.warning("Not using preprocessor knn query -- KNN queries using class: " + knnq.getClass());
}
final DBIDs ids = relation.getDBIDs();
@@ -187,9 +186,8 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
try {
MessageDigest md = MessageDigest.getInstance("MD5");
for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
- DBID id = iter.getDBID();
md.update(" ".getBytes());
- md.update(id.toString().getBytes());
+ md.update(DBIDUtil.toString(iter).getBytes());
}
fout.append("# DBID-series MD5:");
fout.append(Base64.encodeBase64(md.digest()));
@@ -214,7 +212,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
}
// KNN
- logger.verbose("Running KNN");
+ LOG.verbose("Running KNN");
runForEachK(new AlgRunner() {
@Override
public void run(int k, String kstr) {
@@ -228,7 +226,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
}
});
// KNN Weight
- logger.verbose("Running KNNweight");
+ LOG.verbose("Running KNNweight");
runForEachK(new AlgRunner() {
@Override
public void run(int k, String kstr) {
@@ -242,7 +240,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
}
});
// Run LOF
- logger.verbose("Running LOF");
+ LOG.verbose("Running LOF");
runForEachK(new AlgRunner() {
@Override
public void run(int k, String kstr) {
@@ -256,7 +254,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
}
});
// LoOP
- logger.verbose("Running LoOP");
+ LOG.verbose("Running LoOP");
runForEachK(new AlgRunner() {
@Override
public void run(int k, String kstr) {
@@ -267,7 +265,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
}
});
// LDOF
- logger.verbose("Running LDOF");
+ LOG.verbose("Running LDOF");
runForEachK(new AlgRunner() {
@Override
public void run(int k, String kstr) {
@@ -286,7 +284,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
final PolynomialKernelFunction poly = new PolynomialKernelFunction(PolynomialKernelFunction.DEFAULT_DEGREE);
@SuppressWarnings("unchecked")
final DistanceFunction<DoubleVector, DoubleDistance> df = DistanceFunction.class.cast(distf);
- logger.verbose("Running ABOD");
+ LOG.verbose("Running ABOD");
runForEachK(new AlgRunner() {
@Override
public void run(int k, String kstr) {
@@ -302,7 +300,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
}
catch(ClassCastException e) {
// ABOD might just be not appropriate.
- logger.warning("Running ABOD failed - probably not appropriate to this data type / distance?", e);
+ LOG.warning("Running ABOD failed - probably not appropriate to this data type / distance?", e);
}
}
}
@@ -375,17 +373,17 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
/**
* Option ID for k step size.
*/
- public static final OptionID STEPK_ID = OptionID.getOrCreateOptionID("stepk", "Step size for k.");
+ public static final OptionID STEPK_ID = new OptionID("stepk", "Step size for k.");
/**
* Option ID for k start size.
*/
- public static final OptionID STARTK_ID = OptionID.getOrCreateOptionID("startk", "Minimum value for k.");
+ public static final OptionID STARTK_ID = new OptionID("startk", "Minimum value for k.");
/**
* Option ID for k step size.
*/
- public static final OptionID MAXK_ID = OptionID.getOrCreateOptionID("maxk", "Maximum value for k.");
+ public static final OptionID MAXK_ID = new OptionID("maxk", "Maximum value for k.");
/**
* k step size
@@ -433,18 +431,21 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
distf = distP.instantiateClass(config);
}
// k parameters
- IntParameter stepkP = new IntParameter(STEPK_ID, new GreaterConstraint(0));
+ IntParameter stepkP = new IntParameter(STEPK_ID);
+ stepkP.addConstraint(new GreaterConstraint(0));
if(config.grab(stepkP)) {
stepk = stepkP.getValue();
}
- IntParameter startkP = new IntParameter(STARTK_ID, true);
+ IntParameter startkP = new IntParameter(STARTK_ID);
+ startkP.setOptional(true);
if(config.grab(startkP)) {
startk = startkP.getValue();
}
else {
startk = stepk;
}
- IntParameter maxkP = new IntParameter(MAXK_ID, new GreaterConstraint(0));
+ IntParameter maxkP = new IntParameter(MAXK_ID);
+ maxkP.addConstraint(new GreaterConstraint(0));
if(config.grab(maxkP)) {
maxk = maxkP.getValue();
}
@@ -455,7 +456,7 @@ public class ComputeKNNOutlierScores<O, D extends NumberDistance<D, ?>> extends
@Override
protected AbstractApplication makeInstance() {
- return new ComputeKNNOutlierScores<O, D>(verbose, inputstep, distf, startk, stepk, maxk, bylabel, outfile);
+ return new ComputeKNNOutlierScores<O, D>(inputstep, distf, startk, stepk, maxk, bylabel, outfile);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java
index 6b2f9e13..f3f9c101 100644
--- a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java
+++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/GreedyEnsembleExperiment.java
@@ -34,11 +34,14 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair;
import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.correlation.WeightedPearsonCorrelationDistanceFunction;
import de.lmu.ifi.dbs.elki.evaluation.roc.ROC;
@@ -52,7 +55,6 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleIntPair;
-import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair;
import de.lmu.ifi.dbs.elki.workflow.InputStep;
/**
@@ -84,9 +86,9 @@ import de.lmu.ifi.dbs.elki.workflow.InputStep;
@Reference(authors = "E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel", title = "On Evaluation of Outlier Rankings and Outlier Scores", booktitle = "Proc. 12th SIAM International Conference on Data Mining (SDM), Anaheim, CA, 2012.")
public class GreedyEnsembleExperiment extends AbstractApplication {
/**
- * Get static logger
+ * Get static logger.
*/
- private static final Logging logger = Logging.getLogger(GreedyEnsembleExperiment.class);
+ private static final Logging LOG = Logging.getLogger(GreedyEnsembleExperiment.class);
/**
* The data input part.
@@ -101,11 +103,10 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
/**
* Constructor.
*
- * @param verbose Verbosity
* @param inputstep Input step
*/
- public GreedyEnsembleExperiment(boolean verbose, InputStep inputstep) {
- super(verbose);
+ public GreedyEnsembleExperiment(InputStep inputstep) {
+ super();
this.inputstep = inputstep;
}
@@ -114,22 +115,23 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
// Note: the database contains the *result vectors*, not the original data
// points.
final Database database = inputstep.getDatabase();
- final Relation<NumberVector<?, ?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
+ final Relation<NumberVector<?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
+ final NumberVector.Factory<NumberVector<?>, ?> factory = RelationUtil.getNumberVectorFactory(relation);
final Relation<String> labels = DatabaseUtil.guessLabelRepresentation(database);
- final DBID firstid = labels.iterDBIDs().getDBID();
+ final DBID firstid = DBIDUtil.deref(labels.iterDBIDs());
final String firstlabel = labels.get(firstid);
if(!firstlabel.matches("bylabel")) {
throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!");
}
// Dimensionality and reference vector
- final int dim = DatabaseUtil.dimensionality(relation);
- final NumberVector<?, ?> refvec = relation.get(firstid);
+ final int dim = RelationUtil.dimensionality(relation);
+ final NumberVector<?> refvec = relation.get(firstid);
// Build the positive index set for ROC AUC.
Set<Integer> positive = new TreeSet<Integer>();
for(int d = 0; d < dim; d++) {
- if(refvec.doubleValue(d + 1) > 0) {
+ if(refvec.doubleValue(d) > 0) {
positive.add(d);
}
}
@@ -140,18 +142,17 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
// Find the top-k for each ensemble member
{
for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- DBID id = iditer.getDBID();
// Skip "by label", obviously
- if(firstid.sameDBID(id)) {
+ if(DBIDUtil.equal(firstid, iditer)) {
continue;
}
- final NumberVector<?, ?> vec = relation.get(id);
+ final NumberVector<?> vec = relation.get(iditer);
TiedTopBoundedHeap<DoubleIntPair> heap = new TiedTopBoundedHeap<DoubleIntPair>(estimated_outliers, Collections.reverseOrder());
for(int i = 0; i < dim; i++) {
- heap.add(new DoubleIntPair(vec.doubleValue(i + 1), i));
+ heap.add(new DoubleIntPair(vec.doubleValue(i), i));
}
if(heap.size() >= 2 * estimated_outliers) {
- logger.warning("Too many ties. Expected: " + estimated_outliers + " got: " + heap.size());
+ LOG.warning("Too many ties. Expected: " + estimated_outliers + " got: " + heap.size());
}
for(DoubleIntPair pair : heap) {
if(outliers_seen[pair.second] == 0) {
@@ -164,34 +165,33 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
}
}
}
- logger.verbose("Merged top " + estimated_outliers + " outliers to: " + union_outliers + " outliers");
+ LOG.verbose("Merged top " + estimated_outliers + " outliers to: " + union_outliers + " outliers");
// Build the final weight vector.
final double[] estimated_weights = new double[dim];
final double[] estimated_truth = new double[dim];
updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
- NumberVector<?, ?> estimated_truth_vec = refvec.newNumberVector(estimated_truth);
+ NumberVector<?> estimated_truth_vec = factory.newNumberVector(estimated_truth);
- PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> wdist = getDistanceFunction(estimated_weights);
- PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> tdist = wdist;
+ PrimitiveDoubleDistanceFunction<NumberVector<?>> wdist = getDistanceFunction(estimated_weights);
+ PrimitiveDoubleDistanceFunction<NumberVector<?>> tdist = wdist;
// Build the naive ensemble:
final double[] naiveensemble = new double[dim];
{
for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- DBID id = iditer.getDBID();
- if(firstid.equals(id)) {
+ if(DBIDUtil.equal(firstid, iditer)) {
continue;
}
- final NumberVector<?, ?> vec = relation.get(id);
+ final NumberVector<?> vec = relation.get(iditer);
for(int d = 0; d < dim; d++) {
- naiveensemble[d] += vec.doubleValue(d + 1);
+ naiveensemble[d] += vec.doubleValue(d);
}
}
for(int d = 0; d < dim; d++) {
naiveensemble[d] /= (relation.size() - 1);
}
}
- NumberVector<?, ?> naivevec = refvec.newNumberVector(naiveensemble);
+ NumberVector<?> naivevec = factory.newNumberVector(naiveensemble);
// Compute single AUC scores and estimations.
// Remember the method most similar to the estimation
@@ -204,72 +204,70 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
{
// Compute individual scores
for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- DBID id = iditer.getDBID();
- if(firstid.equals(id)) {
+ if(DBIDUtil.equal(firstid, iditer)) {
continue;
}
// fout.append(labels.get(id));
- final NumberVector<?, ?> vec = relation.get(id);
+ final NumberVector<?> vec = relation.get(iditer);
double auc = computeROCAUC(vec, positive, dim);
double estimated = wdist.doubleDistance(vec, estimated_truth_vec);
double cost = tdist.doubleDistance(vec, refvec);
- logger.verbose("ROC AUC: " + auc + " estimated " + estimated + " cost " + cost + " " + labels.get(id));
+ LOG.verbose("ROC AUC: " + auc + " estimated " + estimated + " cost " + cost + " " + labels.get(iditer));
if(auc > bestauc) {
bestauc = auc;
- bestaucstr = labels.get(id);
+ bestaucstr = labels.get(iditer);
}
if(cost < bestcost) {
bestcost = cost;
- bestcoststr = labels.get(id);
+ bestcoststr = labels.get(iditer);
}
if(estimated < bestest) {
bestest = estimated;
- bestid = id;
+ bestid = DBIDUtil.deref(iditer);
}
}
}
// Initialize ensemble with "best" method
- logger.verbose("Distance function: " + wdist);
- logger.verbose("Initial estimation of outliers: " + union_outliers);
- logger.verbose("Initializing ensemble with: " + labels.get(bestid));
+ LOG.verbose("Distance function: " + wdist);
+ LOG.verbose("Initial estimation of outliers: " + union_outliers);
+ LOG.verbose("Initializing ensemble with: " + labels.get(bestid));
ModifiableDBIDs ensemble = DBIDUtil.newArray(bestid);
ModifiableDBIDs enscands = DBIDUtil.newHashSet(relation.getDBIDs());
enscands.remove(bestid);
enscands.remove(firstid);
final double[] greedyensemble = new double[dim];
{
- final NumberVector<?, ?> vec = relation.get(bestid);
+ final NumberVector<?> vec = relation.get(bestid);
for(int i = 0; i < dim; i++) {
- greedyensemble[i] = vec.doubleValue(i + 1);
+ greedyensemble[i] = vec.doubleValue(i);
}
}
// Greedily grow the ensemble
final double[] testensemble = new double[dim];
while(enscands.size() > 0) {
- NumberVector<?, ?> greedyvec = refvec.newNumberVector(greedyensemble);
+ NumberVector<?> greedyvec = factory.newNumberVector(greedyensemble);
// Weighting factors for combining:
double s1 = ensemble.size() / (ensemble.size() + 1.);
double s2 = 1. / (ensemble.size() + 1.);
final int heapsize = enscands.size();
- TopBoundedHeap<DoubleObjPair<DBID>> heap = new TopBoundedHeap<DoubleObjPair<DBID>>(heapsize, Collections.reverseOrder());
+ TopBoundedHeap<DoubleDBIDPair> heap = new TopBoundedHeap<DoubleDBIDPair>(heapsize, Collections.reverseOrder());
for (DBIDIter iter = enscands.iter(); iter.valid(); iter.advance()) {
- DBID id = iter.getDBID();
- final NumberVector<?, ?> vec = relation.get(id);
+ final NumberVector<?> vec = relation.get(iter);
double diversity = wdist.doubleDistance(vec, greedyvec);
- heap.add(new DoubleObjPair<DBID>(diversity, id));
+ heap.add(DBIDUtil.newPair(diversity, iter));
}
while(heap.size() > 0) {
- DBID bestadd = heap.poll().second;
+ DBIDRef bestadd = heap.poll();
enscands.remove(bestadd);
// Update ensemble:
- final NumberVector<?, ?> vec = relation.get(bestadd);
+ final NumberVector<?> vec = relation.get(bestadd);
for(int i = 0; i < dim; i++) {
- testensemble[i] = greedyensemble[i] * s1 + vec.doubleValue(i + 1) * s2;
+ testensemble[i] = greedyensemble[i] * s1 + vec.doubleValue(i) * s2;
}
- NumberVector<?, ?> testvec = refvec.newNumberVector(testensemble);
+ NumberVector<?> testvec = factory.newNumberVector(testensemble);
double oldd = wdist.doubleDistance(estimated_truth_vec, greedyvec);
double newd = wdist.doubleDistance(estimated_truth_vec, testvec);
// logger.verbose("Distances: " + oldd + " vs. " + newd);
@@ -286,7 +284,7 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
// Update target vectors and weights
TiedTopBoundedHeap<DoubleIntPair> oheap = new TiedTopBoundedHeap<DoubleIntPair>(estimated_outliers, Collections.reverseOrder());
for(int i = 0; i < dim; i++) {
- oheap.add(new DoubleIntPair(vec.doubleValue(i + 1), i));
+ oheap.add(new DoubleIntPair(vec.doubleValue(i), i));
}
for(DoubleIntPair pair : oheap) {
assert (outliers_seen[pair.second] > 0);
@@ -298,14 +296,14 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
}
if(refresh) {
updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
- estimated_truth_vec = refvec.newNumberVector(estimated_truth);
+ estimated_truth_vec = factory.newNumberVector(estimated_truth);
}
}
}
}
}
// Build the improved ensemble:
- StringBuffer greedylbl = new StringBuffer();
+ StringBuilder greedylbl = new StringBuilder();
{
for (DBIDIter iter = ensemble.iter(); iter.valid(); iter.advance()) {
if(greedylbl.length() > 0) {
@@ -314,27 +312,27 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
greedylbl.append(labels.get(iter));
}
}
- NumberVector<?, ?> greedyvec = refvec.newNumberVector(greedyensemble);
- logger.verbose("Estimated outliers remaining: " + union_outliers);
- logger.verbose("Greedy ensemble: " + greedylbl.toString());
+ NumberVector<?> greedyvec = factory.newNumberVector(greedyensemble);
+ LOG.verbose("Estimated outliers remaining: " + union_outliers);
+ LOG.verbose("Greedy ensemble: " + greedylbl.toString());
- logger.verbose("Best single ROC AUC: " + bestauc + " (" + bestaucstr + ")");
- logger.verbose("Best single cost: " + bestcost + " (" + bestcoststr + ")");
+ LOG.verbose("Best single ROC AUC: " + bestauc + " (" + bestaucstr + ")");
+ LOG.verbose("Best single cost: " + bestcost + " (" + bestcoststr + ")");
// Evaluate the naive ensemble and the "shrunk" ensemble
double naiveauc, naivecost;
{
naiveauc = computeROCAUC(naivevec, positive, dim);
naivecost = tdist.doubleDistance(naivevec, refvec);
- logger.verbose("Naive ensemble AUC: " + naiveauc + " cost: " + naivecost);
- logger.verbose("Naive ensemble Gain: " + gain(naiveauc, bestauc, 1) + " cost gain: " + gain(naivecost, bestcost, 0));
+ LOG.verbose("Naive ensemble AUC: " + naiveauc + " cost: " + naivecost);
+ LOG.verbose("Naive ensemble Gain: " + gain(naiveauc, bestauc, 1) + " cost gain: " + gain(naivecost, bestcost, 0));
}
double greedyauc, greedycost;
{
greedyauc = computeROCAUC(greedyvec, positive, dim);
greedycost = tdist.doubleDistance(greedyvec, refvec);
- logger.verbose("Greedy ensemble AUC: " + greedyauc + " cost: " + greedycost);
- logger.verbose("Greedy ensemble Gain to best: " + gain(greedyauc, bestauc, 1) + " cost gain: " + gain(greedycost, bestcost, 0));
- logger.verbose("Greedy ensemble Gain to naive: " + gain(greedyauc, naiveauc, 1) + " cost gain: " + gain(greedycost, naivecost, 0));
+ LOG.verbose("Greedy ensemble AUC: " + greedyauc + " cost: " + greedycost);
+ LOG.verbose("Greedy ensemble Gain to best: " + gain(greedyauc, bestauc, 1) + " cost gain: " + gain(greedycost, bestcost, 0));
+ LOG.verbose("Greedy ensemble Gain to naive: " + gain(greedyauc, naiveauc, 1) + " cost gain: " + gain(greedycost, naivecost, 0));
}
{
MeanVariance meanauc = new MeanVariance();
@@ -347,33 +345,32 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
{
DBIDs random = DBIDUtil.randomSample(candidates, ensemble.size(), (long)i);
for (DBIDIter iter = random.iter(); iter.valid(); iter.advance()) {
- DBID id = iter.getDBID();
- assert (!firstid.equals(id));
+ assert (!DBIDUtil.equal(firstid, iter));
// logger.verbose("Using: "+labels.get(id));
- final NumberVector<?, ?> vec = relation.get(id);
+ final NumberVector<?> vec = relation.get(iter);
for(int d = 0; d < dim; d++) {
- randomensemble[d] += vec.doubleValue(d + 1);
+ randomensemble[d] += vec.doubleValue(d);
}
}
for(int d = 0; d < dim; d++) {
randomensemble[d] /= ensemble.size();
}
}
- NumberVector<?, ?> randomvec = refvec.newNumberVector(randomensemble);
+ NumberVector<?> randomvec = factory.newNumberVector(randomensemble);
double auc = computeROCAUC(randomvec, positive, dim);
meanauc.put(auc);
double cost = tdist.doubleDistance(randomvec, refvec);
meancost.put(cost);
}
- logger.verbose("Random ensemble AUC: " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev()));
- logger.verbose("Random ensemble Gain: " + gain(meanauc.getMean(), bestauc, 1));
- logger.verbose("Greedy improvement: " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations.");
- logger.verbose("Random ensemble Cost: " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meanauc.getSampleStddev()));
- logger.verbose("Random ensemble Gain: " + gain(meancost.getMean(), bestcost, 0));
- logger.verbose("Greedy improvement: " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations.");
- logger.verbose("Naive ensemble Gain to random: " + gain(naiveauc, meanauc.getMean(), 1) + " cost gain: " + gain(naivecost, meancost.getMean(), 0));
- logger.verbose("Random ensemble Gain to naive: " + gain(meanauc.getMean(), naiveauc, 1) + " cost gain: " + gain(meancost.getMean(), naivecost, 0));
- logger.verbose("Greedy ensemble Gain to random: " + gain(greedyauc, meanauc.getMean(), 1) + " cost gain: " + gain(greedycost, meancost.getMean(), 0));
+ LOG.verbose("Random ensemble AUC: " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev()));
+ LOG.verbose("Random ensemble Gain: " + gain(meanauc.getMean(), bestauc, 1));
+ LOG.verbose("Greedy improvement: " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations.");
+ LOG.verbose("Random ensemble Cost: " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meanauc.getSampleStddev()));
+ LOG.verbose("Random ensemble Gain: " + gain(meancost.getMean(), bestcost, 0));
+ LOG.verbose("Greedy improvement: " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations.");
+ LOG.verbose("Naive ensemble Gain to random: " + gain(naiveauc, meanauc.getMean(), 1) + " cost gain: " + gain(naivecost, meancost.getMean(), 0));
+ LOG.verbose("Random ensemble Gain to naive: " + gain(meanauc.getMean(), naiveauc, 1) + " cost gain: " + gain(meancost.getMean(), naivecost, 0));
+ LOG.verbose("Greedy ensemble Gain to random: " + gain(greedyauc, meanauc.getMean(), 1) + " cost gain: " + gain(greedycost, meancost.getMean(), 0));
}
}
@@ -390,21 +387,29 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
}
}
- private PrimitiveDoubleDistanceFunction<NumberVector<?, ?>> getDistanceFunction(double[] estimated_weights) {
+ private PrimitiveDoubleDistanceFunction<NumberVector<?>> getDistanceFunction(double[] estimated_weights) {
// return new WeightedSquaredEuclideanDistanceFunction(estimated_weights);
// return new WeightedLPNormDistanceFunction(1.0, estimated_weights);
return new WeightedPearsonCorrelationDistanceFunction(estimated_weights);
}
- private double computeROCAUC(NumberVector<?, ?> vec, Set<Integer> positive, int dim) {
+ private double computeROCAUC(NumberVector<?> vec, Set<Integer> positive, int dim) {
final DoubleIntPair[] scores = new DoubleIntPair[dim];
for(int d = 0; d < dim; d++) {
- scores[d] = new DoubleIntPair(vec.doubleValue(d + 1), d);
+ scores[d] = new DoubleIntPair(vec.doubleValue(d), d);
}
Arrays.sort(scores, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR));
return XYCurve.areaUnderCurve(ROC.materializeROC(dim, positive, Arrays.asList(scores).iterator()));
}
+ /**
+ * Compute the gain coefficient.
+ *
+ * @param score New score
+ * @param ref Reference score
+ * @param optimal Maximum score possible
+ * @return Gain
+ */
double gain(double score, double ref, double optimal) {
return 1 - ((optimal - score) / (optimal - ref));
}
@@ -418,7 +423,7 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
*/
public static class Parameterizer extends AbstractApplication.Parameterizer {
/**
- * Data source
+ * Data source.
*/
InputStep inputstep;
@@ -431,7 +436,7 @@ public class GreedyEnsembleExperiment extends AbstractApplication {
@Override
protected AbstractApplication makeInstance() {
- return new GreedyEnsembleExperiment(verbose, inputstep);
+ return new GreedyEnsembleExperiment(inputstep);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java
index 105eeabc..afb9c122 100644
--- a/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java
+++ b/src/de/lmu/ifi/dbs/elki/application/greedyensemble/VisualizePairwiseGainMatrix.java
@@ -38,8 +38,11 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.evaluation.roc.ROC;
import de.lmu.ifi.dbs.elki.evaluation.similaritymatrix.ComputeSimilarityMatrixImage;
import de.lmu.ifi.dbs.elki.evaluation.similaritymatrix.ComputeSimilarityMatrixImage.SimilarityMatrix;
@@ -80,13 +83,16 @@ import de.lmu.ifi.dbs.elki.workflow.InputStep;
* </p>
*
* @author Erich Schubert
+ *
+ * @apiviz.composedOf VisualizerParameterizer
+ * @apiviz.composedOf SimilarityMatrixVisualizer
*/
@Reference(authors = "E. Schubert, R. Wojdanowski, A. Zimek, H.-P. Kriegel", title = "On Evaluation of Outlier Rankings and Outlier Scores", booktitle = "Proc. 12th SIAM International Conference on Data Mining (SDM), Anaheim, CA, 2012.")
public class VisualizePairwiseGainMatrix extends AbstractApplication {
/**
- * Get static logger
+ * Get static logger.
*/
- private static final Logging logger = Logging.getLogger(VisualizePairwiseGainMatrix.class);
+ private static final Logging LOG = Logging.getLogger(VisualizePairwiseGainMatrix.class);
/**
* The data input part.
@@ -94,19 +100,18 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication {
private InputStep inputstep;
/**
- * Parameterizer for visualizers
+ * Parameterizer for visualizers.
*/
private VisualizerParameterizer vispar;
/**
* Constructor.
*
- * @param verbose Verbosity
* @param inputstep Input step
* @param vispar Visualizer parameterizer
*/
- public VisualizePairwiseGainMatrix(boolean verbose, InputStep inputstep, VisualizerParameterizer vispar) {
- super(verbose);
+ public VisualizePairwiseGainMatrix(InputStep inputstep, VisualizerParameterizer vispar) {
+ super();
this.inputstep = inputstep;
this.vispar = vispar;
}
@@ -114,17 +119,17 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication {
@Override
public void run() {
final Database database = inputstep.getDatabase();
- final Relation<NumberVector<?, ?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
+ final Relation<NumberVector<?>> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
final Relation<String> labels = DatabaseUtil.guessLabelRepresentation(database);
- final DBID firstid = labels.iterDBIDs().getDBID();
+ final DBID firstid = DBIDUtil.deref(labels.iterDBIDs());
final String firstlabel = labels.get(firstid);
if(!firstlabel.matches("bylabel")) {
throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!");
}
// Dimensionality and reference vector
- final int dim = DatabaseUtil.dimensionality(relation);
- final NumberVector<?, ?> refvec = relation.get(firstid);
+ final int dim = RelationUtil.dimensionality(relation);
+ final NumberVector<?> refvec = relation.get(firstid);
// Build the truth vector
Set<Integer> pos = new TreeSet<Integer>();
@@ -132,7 +137,7 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication {
{
for(int d = 0; d < dim; d++) {
combined[d] = new DoubleIntPair(0, d);
- if(refvec.doubleValue(d + 1) > 0) {
+ if(refvec.doubleValue(d) > 0) {
pos.add(d);
}
}
@@ -145,34 +150,39 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication {
double[][] data = new double[size][size];
DoubleMinMax minmax = new DoubleMinMax();
- for(int a = 0; a < size; a++) {
- final NumberVector<?, ?> veca = relation.get(ids.get(a));
- // Direct AUC score:
- {
- for(int d = 0; d < dim; d++) {
- combined[d].first = veca.doubleValue(d + 1);
- combined[d].second = d;
+ {
+ int a = 0;
+ for(DBIDIter id = ids.iter(); id.valid(); id.advance(), a++) {
+ final NumberVector<?> veca = relation.get(id);
+ // Direct AUC score:
+ {
+ for(int d = 0; d < dim; d++) {
+ combined[d].first = veca.doubleValue(d);
+ combined[d].second = d;
+ }
+ Arrays.sort(combined, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR));
+ double auc = XYCurve.areaUnderCurve(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator()));
+ data[a][a] = auc;
+ // minmax.put(auc);
+ // logger.verbose(auc + " " + labels.get(ids.get(a)));
}
- Arrays.sort(combined, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR));
- double auc = XYCurve.areaUnderCurve(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator()));
- data[a][a] = auc;
- // minmax.put(auc);
- // logger.verbose(auc + " " + labels.get(ids.get(a)));
- }
- // Compare to others, exploiting symmetry
- for(int b = a + 1; b < size; b++) {
- final NumberVector<?, ?> vecb = relation.get(ids.get(b));
- for(int d = 0; d < dim; d++) {
- combined[d].first = veca.doubleValue(d + 1) + vecb.doubleValue(d + 1);
- combined[d].second = d;
+ // Compare to others, exploiting symmetry
+ DBIDArrayIter id2 = ids.iter();
+ id2.seek(a + 1);
+ for(int b = a + 1; b < size; b++, id2.advance()) {
+ final NumberVector<?> vecb = relation.get(id2);
+ for(int d = 0; d < dim; d++) {
+ combined[d].first = veca.doubleValue(d) + vecb.doubleValue(d);
+ combined[d].second = d;
+ }
+ Arrays.sort(combined, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR));
+ double auc = XYCurve.areaUnderCurve(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator()));
+ // logger.verbose(auc + " " + labels.get(ids.get(a)) + " " +
+ // labels.get(ids.get(b)));
+ data[a][b] = auc;
+ data[b][a] = auc;
+ // minmax.put(auc);
}
- Arrays.sort(combined, Collections.reverseOrder(DoubleIntPair.BYFIRST_COMPARATOR));
- double auc = XYCurve.areaUnderCurve(ROC.materializeROC(dim, pos, Arrays.asList(combined).iterator()));
- // logger.verbose(auc + " " + labels.get(ids.get(a)) + " " +
- // labels.get(ids.get(b)));
- data[a][b] = auc;
- data[b][a] = auc;
- // minmax.put(auc);
}
}
for(int a = 0; a < size; a++) {
@@ -189,7 +199,7 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication {
data[a][a] = 0;
}
- logger.verbose(minmax.toString());
+ LOG.verbose(minmax.toString());
boolean hasneg = (minmax.getMin() < -1E-3);
LinearScaling scale;
@@ -233,7 +243,7 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication {
VisualizerContext context = vispar.newContext(database);
// Attach visualizers to results
- SimilarityMatrixVisualizer.Factory factory = new SimilarityMatrixVisualizer.Factory();
+ SimilarityMatrixVisualizer factory = new SimilarityMatrixVisualizer();
factory.processNewResult(database, database);
List<VisualizationTask> tasks = ResultUtil.filterResults(database, VisualizationTask.class);
@@ -247,12 +257,11 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication {
/**
* Show a single visualization.
*
- * @param context
- *
- * @param factory
- * @param task
+ * @param context Visualization context
+ * @param factory Visualizer factory
+ * @param task Visualization task
*/
- private void showVisualization(VisualizerContext context, SimilarityMatrixVisualizer.Factory factory, VisualizationTask task) {
+ private void showVisualization(VisualizerContext context, SimilarityMatrixVisualizer factory, VisualizationTask task) {
SVGPlot plot = new SVGPlot();
Visualization vis = factory.makeVisualization(task.clone(plot, context, null, 1.0, 1.0));
plot.getRoot().appendChild(vis.getLayer());
@@ -273,12 +282,12 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication {
*/
public static class Parameterizer extends AbstractApplication.Parameterizer {
/**
- * Data source
+ * Data source.
*/
InputStep inputstep;
/**
- * Parameterizer for visualizers
+ * Parameterizer for visualizers.
*/
private VisualizerParameterizer vispar;
@@ -293,7 +302,7 @@ public class VisualizePairwiseGainMatrix extends AbstractApplication {
@Override
protected AbstractApplication makeInstance() {
- return new VisualizePairwiseGainMatrix(verbose, inputstep, vispar);
+ return new VisualizePairwiseGainMatrix(inputstep, vispar);
}
}