package de.lmu.ifi.dbs.elki.utilities; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2011 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.AbstractCollection; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.SortedSet; import java.util.TreeSet; import java.util.regex.Pattern; import de.lmu.ifi.dbs.elki.data.ClassLabel; import de.lmu.ifi.dbs.elki.data.FeatureVector; import de.lmu.ifi.dbs.elki.data.LabelList; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.NoSupportedDataTypeException; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.ConvertToStringView; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid; import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix; import de.lmu.ifi.dbs.elki.math.linearalgebra.ProjectedCentroid; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** * Class with Database-related utility functions such as centroid computation, * covariances etc. * * @author Erich Schubert * * @apiviz.landmark * @apiviz.uses Centroid * @apiviz.uses ProjectedCentroid * @apiviz.uses CovarianceMatrix */ public final class DatabaseUtil { /** * Get the dimensionality of a database * * @param relation relation * @return Vector field type information */ public static > VectorFieldTypeInformation assumeVectorField(Relation relation) { try { return ((VectorFieldTypeInformation) relation.getDataTypeInformation()); } catch(Exception e) { throw new UnsupportedOperationException("Expected a vector field, got type information: " + relation.getDataTypeInformation().toString()); } } /** * Get the dimensionality of a database * * @param relation relation * @return Database dimensionality */ public static int dimensionality(Relation> relation) { try { return ((VectorFieldTypeInformation>) relation.getDataTypeInformation()).dimensionality(); } catch(Exception e) { return -1; } } /** * Returns the centroid as a NumberVector object of the specified database. * The objects must be instance of NumberVector. * * @param Vector type * @param relation the Relation storing the objects * @return the centroid of the specified objects stored in the given database * @throws IllegalArgumentException if the database is empty */ public static > V centroid(Relation relation) { return Centroid.make(relation).toVector(relation); } /** * Returns the centroid as a NumberVector object of the specified objects * stored in the given database. The objects belonging to the specified ids * must be instance of NumberVector. * * @param Vector type * @param relation the relation * @param ids the ids of the objects * @return the centroid of the specified objects stored in the given database * @throws IllegalArgumentException if the id list is empty */ public static > V centroid(Relation relation, DBIDs ids) { return Centroid.make(relation, ids).toVector(relation); } /** * Returns the centroid w.r.t. the dimensions specified by the given BitSet as * a NumberVector object of the specified objects stored in the given * database. The objects belonging to the specified IDs must be instance of * NumberVector. * * @param Vector type * @param relation the database storing the objects * @param ids the identifiable objects * @param dimensions the BitSet representing the dimensions to be considered * @return the centroid of the specified objects stored in the given database * w.r.t. the specified subspace * @throws IllegalArgumentException if the id list is empty */ public static > V centroid(Relation relation, DBIDs ids, BitSet dimensions) { return ProjectedCentroid.make(dimensions, relation, ids).toVector(relation); } /** * Determines the covariance matrix of the objects stored in the given * database. * * @param Vector type * @param database the database storing the objects * @param ids the ids of the objects * @return the covariance matrix of the specified objects */ public static > Matrix covarianceMatrix(Relation database, DBIDs ids) { return CovarianceMatrix.make(database, ids).destroyToNaiveMatrix(); } /** * Determines the d x d covariance matrix of the given n x d data matrix. * * @param data the database storing the objects * @return the covariance matrix of the given data matrix. */ public static Matrix covarianceMatrix(Matrix data) { return CovarianceMatrix.make(data).destroyToNaiveMatrix(); } /** * Determines the variances in each dimension of all objects stored in the * given database. * * @param database the database storing the objects * @return the variances in each dimension of all objects stored in the given * database */ public static > double[] variances(Relation database) { NumberVector centroid = centroid(database); double[] variances = new double[centroid.getDimensionality()]; for(int d = 1; d <= centroid.getDimensionality(); d++) { double mu = centroid.doubleValue(d); for(Iterator it = database.iterDBIDs(); it.hasNext();) { NumberVector o = database.get(it.next()); double diff = o.doubleValue(d) - mu; variances[d - 1] += diff * diff; } variances[d - 1] /= database.size(); } return variances; } /** * Determines the variances in each dimension of the specified objects stored * in the given database. Returns * variances(database, centroid(database, ids), ids) * * @param database the database storing the objects * @param ids the ids of the objects * @return the variances in each dimension of the specified objects */ public static > double[] variances(Relation database, DBIDs ids) { return variances(database, centroid(database, ids), ids); } /** * Determines the variances in each dimension of the specified objects stored * in the given database. * * @param database the database storing the objects * @param ids the ids of the objects * @param centroid the centroid or reference vector of the ids * @return the variances in each dimension of the specified objects */ public static double[] variances(Relation> database, NumberVector centroid, DBIDs ids) { double[] variances = new double[centroid.getDimensionality()]; for(int d = 1; d <= centroid.getDimensionality(); d++) { double mu = centroid.doubleValue(d); for(DBID id : ids) { NumberVector o = database.get(id); double diff = o.doubleValue(d) - mu; variances[d - 1] += diff * diff; } variances[d - 1] /= ids.size(); } return variances; } /** * Determines the minimum and maximum values in each dimension of all objects * stored in the given database. * * @param vector type * @param database the database storing the objects * @return Minimum and Maximum vector for the hyperrectangle */ public static > Pair computeMinMax(Relation database) { int dim = dimensionality(database); double[] mins = new double[dim]; double[] maxs = new double[dim]; for(int i = 0; i < dim; i++) { mins[i] = Double.MAX_VALUE; maxs[i] = -Double.MAX_VALUE; } for(DBID it : database.iterDBIDs()) { NV o = database.get(it); for(int d = 0; d < dim; d++) { double v = o.doubleValue(d + 1); mins[d] = Math.min(mins[d], v); maxs[d] = Math.max(maxs[d], v); } } NV factory = assumeVectorField(database).getFactory(); NV min = factory.newInstance(mins); NV max = factory.newInstance(maxs); return new Pair(min, max); } /** * Returns the median of a data set in the given dimension by using a sampling * method. * * @param relation Relation to process * @param ids DBIDs to process * @param dimension Dimensionality * @param numberOfSamples Number of samples to draw * @return Median value */ public static > double quickMedian(Relation relation, ArrayDBIDs ids, int dimension, int numberOfSamples) { final int everyNthItem = (int) Math.max(1, Math.floor(ids.size() / (double) numberOfSamples)); final double[] vals = new double[numberOfSamples]; for(int i = 0; i < numberOfSamples; i++) { final DBID id = ids.get(i * everyNthItem); vals[i] = relation.get(id).doubleValue(dimension); } Arrays.sort(vals); if(vals.length % 2 == 1) { return vals[((vals.length + 1) / 2) - 1]; } else { final double v1 = vals[vals.length / 2]; final double v2 = vals[(vals.length / 2) - 1]; return (v1 + v2) / 2.0; } } /** * Returns the median of a data set in the given dimension. * * @param relation Relation to process * @param ids DBIDs to process * @param dimension Dimensionality * @return Median value */ public static > double exactMedian(Relation relation, DBIDs ids, int dimension) { final double[] vals = new double[ids.size()]; int i = 0; for(DBID id : ids) { vals[i] = relation.get(id).doubleValue(dimension); i++; } Arrays.sort(vals); if(vals.length % 2 == 1) { return vals[((vals.length + 1) / 2) - 1]; } else { final double v1 = vals[vals.length / 2]; final double v2 = vals[(vals.length / 2) - 1]; return (v1 + v2) / 2.0; } } /** * Guess a potentially label-like representation. * * @param database * @return string representation */ public static Relation guessLabelRepresentation(Database database) throws NoSupportedDataTypeException { try { Relation classrep = database.getRelation(TypeUtil.CLASSLABEL); if(classrep != null) { return new ConvertToStringView(classrep); } } catch(NoSupportedDataTypeException e) { // retry. } try { Relation labelsrep = database.getRelation(TypeUtil.LABELLIST); if(labelsrep != null) { return new ConvertToStringView(labelsrep); } } catch(NoSupportedDataTypeException e) { // retry. } try { Relation stringrep = database.getRelation(TypeUtil.STRING); if(stringrep != null) { return stringrep; } } catch(NoSupportedDataTypeException e) { // retry. } throw new NoSupportedDataTypeException("No label-like representation was found."); } /** * Guess a potentially object label-like representation. * * @param database * @return string representation */ public static Relation guessObjectLabelRepresentation(Database database) throws NoSupportedDataTypeException { try { Relation labelsrep = database.getRelation(TypeUtil.LABELLIST); if(labelsrep != null) { return new ConvertToStringView(labelsrep); } } catch(NoSupportedDataTypeException e) { // retry. } try { Relation stringrep = database.getRelation(TypeUtil.STRING); if(stringrep != null) { return stringrep; } } catch(NoSupportedDataTypeException e) { // retry. } try { Relation classrep = database.getRelation(TypeUtil.CLASSLABEL); if(classrep != null) { return new ConvertToStringView(classrep); } } catch(NoSupportedDataTypeException e) { // retry. } throw new NoSupportedDataTypeException("No label-like representation was found."); } /** * Retrieves all class labels within the database. * * @param database the database to be scanned for class labels * @return a set comprising all class labels that are currently set in the * database */ public static SortedSet getClassLabels(Relation database) { SortedSet labels = new TreeSet(); for(Iterator iter = database.iterDBIDs(); iter.hasNext();) { labels.add(database.get(iter.next())); } return labels; } /** * Retrieves all class labels within the database. * * @param database the database to be scanned for class labels * @return a set comprising all class labels that are currently set in the * database */ public static SortedSet getClassLabels(Database database) { final Relation relation = database.getRelation(TypeUtil.CLASSLABEL); return getClassLabels(relation); } /** * Do a cheap guess at the databases object class. * * @param Restriction type * @param database Database * @return Class of first object in the Database. */ @SuppressWarnings("unchecked") public static Class guessObjectClass(Relation database) { for(DBID id : database.iterDBIDs()) { return (Class) database.get(id).getClass(); } return null; } /** * Do a full inspection of the database to find the base object class. * * Note: this can be an abstract class or interface! * * TODO: Implement a full search for shared superclasses. But since currently * the databases will always use only once class, this is not yet implemented. * * @param Restriction type * @param database Database * @return Superclass of all objects in the database */ public static Class getBaseObjectClassExpensive(Relation database) { List> candidates = new ArrayList>(); Iterator iditer = database.iterDBIDs(); // empty database?! if(!iditer.hasNext()) { return null; } // put first class into result set. candidates.add(database.get(iditer.next()).getClass()); // other objects while(iditer.hasNext()) { Class newcls = database.get(iditer.next()).getClass(); // validate all candidates Iterator> ci = candidates.iterator(); while(ci.hasNext()) { Class cand = ci.next(); if(cand.isAssignableFrom(newcls)) { continue; } // TODO: resolve conflicts by finding all superclasses! // Does this code here work? for(Class interf : cand.getInterfaces()) { candidates.add(interf); } candidates.add(cand.getSuperclass()); ci.remove(); } } // if we have any candidates left ... if(candidates != null && candidates.size() > 0) { // remove subclasses Iterator> ci = candidates.iterator(); while(ci.hasNext()) { Class cand = ci.next(); for(Class oc : candidates) { if(oc != cand && cand.isAssignableFrom(oc)) { ci.remove(); break; } } } assert (candidates.size() > 0); try { return candidates.get(0); } catch(ClassCastException e) { // ignore, and retry with next } } // no resulting class. return null; } /** * Find object by matching their labels. * * @param database Database to search in * @param name_pattern Name to match against class or object label * @return found cluster or it throws an exception. */ public static ArrayModifiableDBIDs getObjectsByLabelMatch(Database database, Pattern name_pattern) { Relation relation = guessLabelRepresentation(database); if(name_pattern == null) { return DBIDUtil.newArray(); } ArrayModifiableDBIDs ret = DBIDUtil.newArray(); for(DBID objid : relation.iterDBIDs()) { if(name_pattern.matcher(relation.get(objid)).matches()) { ret.add(objid); } } return ret; } /** * An ugly vector type cast unavoidable in some situations due to Generics. * * @param Base vector type * @param Derived vector type (is actually V, too) * @param database Database * @return Database */ @SuppressWarnings("unchecked") public static , T extends NumberVector> Relation relationUglyVectorCast(Relation database) { return (Relation) database; } /** * Get the column name or produce a generic label "Column XY". * * @param rel Relation * @param col Column * @return Label */ public static > String getColumnLabel(Relation rel, int col) { String lbl = assumeVectorField(rel).getLabel(col); if(lbl != null) { return lbl; } else { return "Column " + col; } } /** * Iterator class that retrieves the given objects from the database. * * @author Erich Schubert */ public static class RelationObjectIterator implements Iterator { /** * The real iterator. */ final Iterator iter; /** * The database we use */ final Relation database; /** * Full Constructor. * * @param iter Original iterator. * @param database Database */ public RelationObjectIterator(Iterator iter, Relation database) { super(); this.iter = iter; this.database = database; } /** * Simplified constructor. * * @param database Database */ public RelationObjectIterator(Relation database) { super(); this.database = database; this.iter = database.iterDBIDs(); } @Override public boolean hasNext() { return iter.hasNext(); } @Override public O next() { DBID id = iter.next(); return database.get(id); } @Override public void remove() { iter.remove(); } } /** * Collection view on a database that retrieves the objects when needed. * * @author Erich Schubert */ public static class CollectionFromRelation extends AbstractCollection implements Collection { /** * The database we query */ Relation db; /** * Constructor. * * @param db Database */ public CollectionFromRelation(Relation db) { super(); this.db = db; } @Override public Iterator iterator() { return new DatabaseUtil.RelationObjectIterator(db); } @Override public int size() { return db.size(); } } }