package de.lmu.ifi.dbs.elki.algorithm.outlier;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2013
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
*/
import java.util.ArrayList;
import java.util.Collections;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair;
import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.pairs.IntIntPair;
/**
* Abstract base class for the sparse-grid-cell based outlier detection of
* Aggarwal and Yu.
*
*
* Reference:
* Outlier detection for high dimensional data
* C.C. Aggarwal, P. S. Yu
* International Conference on Management of Data Proceedings of the 2001 ACM
* SIGMOD international conference on Management of data 2001, Santa Barbara,
* California, United States
*
*
* @author Ahmed Hettab
* @author Erich Schubert
*
* @param Vector type
*/
@Reference(authors = "C.C. Aggarwal, P. S. Yu", title = "Outlier detection for high dimensional data", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD 2001), Santa Barbara, CA, 2001", url = "http://dx.doi.org/10.1145/375663.375668")
public abstract class AbstractAggarwalYuOutlier> extends AbstractAlgorithm implements OutlierAlgorithm {
/**
* Symbolic value for subspaces not in use.
*
* Note: in some places, the implementations may rely on this having the value
* 0 currently!
*/
public static final int DONT_CARE = 0;
/**
* The number of partitions for each dimension.
*/
protected int phi;
/**
* The target dimensionality.
*/
protected int k;
/**
* Constructor.
*
* @param k K parameter
* @param phi Phi parameter
*/
public AbstractAggarwalYuOutlier(int k, int phi) {
super();
this.k = k;
this.phi = phi;
}
/**
* Grid discretization of the data:
* Each attribute of data is divided into phi equi-depth ranges.
* Each range contains a fraction f=1/phi of the records.
*
* @param relation Relation to process
* @return range map
*/
protected ArrayList> buildRanges(Relation relation) {
final int dim = RelationUtil.dimensionality(relation);
final int size = relation.size();
final DBIDs allids = relation.getDBIDs();
final ArrayList> ranges = new ArrayList<>();
// Temporary projection storage of the database
final ArrayList> dbAxis = new ArrayList<>(dim);
for(int i = 0; i < dim; i++) {
ArrayList axis = new ArrayList<>(size);
dbAxis.add(i, axis);
}
// Project
for(DBIDIter iter = allids.iter(); iter.valid(); iter.advance()) {
final V obj = relation.get(iter);
for(int d = 0; d < dim; d++) {
dbAxis.get(d).add(DBIDUtil.newPair(obj.doubleValue(d), iter));
}
}
// Split into cells
final double part = size * 1.0 / phi;
for(int d = 0; d < dim; d++) {
ArrayList axis = dbAxis.get(d);
Collections.sort(axis);
ArrayList dimranges = new ArrayList<>(phi + 1);
dimranges.add(allids);
int start = 0;
for(int r = 0; r < phi; r++) {
int end = (int) (part * r);
if(r == phi - 1) {
end = size;
}
ArrayModifiableDBIDs currange = DBIDUtil.newArray(phi + 1);
for(int i = start; i < end; i++) {
currange.add(axis.get(i));
}
start = end;
dimranges.add(currange);
}
ranges.add(dimranges);
}
return ranges;
}
/**
* Method to calculate the sparsity coefficient of.
*
* @param setsize Size of subset
* @param dbsize Size of database
* @param k Dimensionality
* @param phi Phi parameter
* @return sparsity coefficient
*/
protected static double sparsity(final int setsize, final int dbsize, final int k, final double phi) {
// calculate sparsity c
final double f = 1. / phi;
final double fK = Math.pow(f, k);
final double sC = (setsize - (dbsize * fK)) / Math.sqrt(dbsize * fK * (1 - fK));
return sC;
}
/**
* Method to get the ids in the given subspace.
*
* @param subspace Subspace to process
* @param ranges List of DBID ranges
* @return ids
*/
protected DBIDs computeSubspace(ArrayList subspace, ArrayList> ranges) {
HashSetModifiableDBIDs ids = DBIDUtil.newHashSet(ranges.get(subspace.get(0).first).get(subspace.get(0).second));
// intersect all selected dimensions
for(int i = 1; i < subspace.size(); i++) {
DBIDs current = ranges.get(subspace.get(i).first).get(subspace.get(i).second);
ids.retainAll(current);
if(ids.size() == 0) {
break;
}
}
return ids;
}
/**
* Get the DBIDs in the current subspace.
*
* @param gene gene data
* @param ranges Database ranges
* @return resulting DBIDs
*/
protected DBIDs computeSubspaceForGene(int[] gene, ArrayList> ranges) {
HashSetModifiableDBIDs m = DBIDUtil.newHashSet(ranges.get(0).get(gene[0]));
// intersect
for(int i = 1; i < gene.length; i++) {
if(gene[i] != DONT_CARE) {
DBIDs current = ranges.get(i).get(gene[i]);
m.retainAll(current);
}
}
return m;
}
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
}
/**
* Parameterization class.
*
* @author Erich Schubert
*
* @apiviz.exclude
*/
public abstract static class Parameterizer extends AbstractParameterizer {
/**
* OptionID for the grid size.
*/
public static final OptionID PHI_ID = new OptionID("ay.phi", "The number of equi-depth grid ranges to use in each dimension.");
/**
* OptionID for the target dimensionality.
*/
public static final OptionID K_ID = new OptionID("ay.k", "Subspace dimensionality to search for.");
/**
* Phi parameter.
*/
protected int phi;
/**
* k Parameter.
*/
protected int k;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
final IntParameter kP = new IntParameter(K_ID);
kP.addConstraint(new GreaterEqualConstraint(2));
if(config.grab(kP)) {
k = kP.getValue();
}
final IntParameter phiP = new IntParameter(PHI_ID);
phiP.addConstraint(new GreaterEqualConstraint(2));
if(config.grab(phiP)) {
phi = phiP.getValue();
}
}
}
}