package de.lmu.ifi.dbs.elki.utilities; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.AbstractCollection; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.SortedSet; import java.util.TreeSet; import java.util.regex.Pattern; import de.lmu.ifi.dbs.elki.data.ClassLabel; import de.lmu.ifi.dbs.elki.data.FeatureVector; import de.lmu.ifi.dbs.elki.data.LabelList; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.NoSupportedDataTypeException; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.relation.ConvertToStringView; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** * Class with Database-related utility functions such as centroid computation, * covariances etc. * * @author Erich Schubert * * @apiviz.landmark * * @apiviz.has RelationObjectIterator * @apiviz.has CollectionFromRelation */ public final class DatabaseUtil { /** * Fake constructor: Do not instantiate! */ private DatabaseUtil() { // Do not instantiate! } /** * Get the dimensionality of a relation. * * @param relation Relation * @return Dimensionality * * @deprecated Use {@link RelationUtil#dimensionality(Relation)} instead! */ @Deprecated public static > int dimensionality(Relation relation) { return RelationUtil.dimensionality(relation); } /** * Determines the variances in each dimension of the specified objects stored * in the given database. * * @param database the database storing the objects * @param ids the ids of the objects * @param centroid the centroid or reference vector of the ids * @return the variances in each dimension of the specified objects */ public static double[] variances(Relation> database, NumberVector centroid, DBIDs ids) { final int size = ids.size(); double[] variances = new double[centroid.getDimensionality()]; for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { NumberVector o = database.get(iter); for (int d = 0; d < centroid.getDimensionality(); d++) { final double diff = o.doubleValue(d) - centroid.doubleValue(d); variances[d ] += diff * diff / size; } } return variances; } /** * Determines the minimum and maximum values in each dimension of all objects * stored in the given database. * * @param vector type * @param relation the database storing the objects * @return Minimum and Maximum vector for the hyperrectangle */ public static > Pair computeMinMax(Relation relation) { int dim = RelationUtil.dimensionality(relation); double[] mins = new double[dim]; double[] maxs = new double[dim]; for (int i = 0; i < dim; i++) { mins[i] = Double.MAX_VALUE; maxs[i] = -Double.MAX_VALUE; } for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { final NV o = relation.get(iditer); for (int d = 0; d < dim; d++) { final double v = o.doubleValue(d); mins[d] = Math.min(mins[d], v); maxs[d] = Math.max(maxs[d], v); } } NumberVector.Factory factory = RelationUtil.getNumberVectorFactory(relation); NV min = factory.newNumberVector(mins); NV max = factory.newNumberVector(maxs); return new Pair<>(min, max); } /** * Returns the median of a data set in the given dimension by using a sampling * method. * * @param relation Relation to process * @param ids DBIDs to process * @param dimension Dimensionality * @param numberOfSamples Number of samples to draw * @return Median value */ public static > double quickMedian(Relation relation, ArrayDBIDs ids, int dimension, int numberOfSamples) { final int everyNthItem = (int) Math.max(1, Math.floor(ids.size() / (double) numberOfSamples)); final double[] vals = new double[numberOfSamples]; for (int i = 0; i < numberOfSamples; i++) { final DBID id = ids.get(i * everyNthItem); vals[i] = relation.get(id).doubleValue(dimension); } Arrays.sort(vals); if (vals.length % 2 == 1) { return vals[((vals.length + 1) >> 1) - 1]; } else { final double v1 = vals[vals.length >> 1]; final double v2 = vals[(vals.length >> 1) - 1]; return (v1 + v2) / 2.0; } } /** * Returns the median of a data set in the given dimension. * * @param relation Relation to process * @param ids DBIDs to process * @param dimension Dimensionality * @return Median value */ public static > double exactMedian(Relation relation, DBIDs ids, int dimension) { final double[] vals = new double[ids.size()]; int i = 0; for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) { vals[i] = relation.get(iter).doubleValue(dimension); i++; } Arrays.sort(vals); if (vals.length % 2 == 1) { return vals[((vals.length + 1) >> 1) - 1]; } else { final double v1 = vals[vals.length >> 1]; final double v2 = vals[(vals.length >> 1) - 1]; return (v1 + v2) / 2.0; } } /** * Guess a potentially label-like representation, preferring class labels. * * @param database * @return string representation */ public static Relation guessLabelRepresentation(Database database) throws NoSupportedDataTypeException { try { Relation classrep = database.getRelation(TypeUtil.CLASSLABEL); if (classrep != null) { return new ConvertToStringView(classrep); } } catch (NoSupportedDataTypeException e) { // retry. } try { Relation labelsrep = database.getRelation(TypeUtil.LABELLIST); if (labelsrep != null) { return new ConvertToStringView(labelsrep); } } catch (NoSupportedDataTypeException e) { // retry. } try { Relation stringrep = database.getRelation(TypeUtil.STRING); if (stringrep != null) { return stringrep; } } catch (NoSupportedDataTypeException e) { // retry. } throw new NoSupportedDataTypeException("No label-like representation was found."); } /** * Guess a potentially object label-like representation. * * @param database * @return string representation */ public static Relation guessObjectLabelRepresentation(Database database) throws NoSupportedDataTypeException { try { Relation labelsrep = database.getRelation(TypeUtil.LABELLIST); if (labelsrep != null) { return new ConvertToStringView(labelsrep); } } catch (NoSupportedDataTypeException e) { // retry. } try { Relation stringrep = database.getRelation(TypeUtil.STRING); if (stringrep != null) { return stringrep; } } catch (NoSupportedDataTypeException e) { // retry. } try { Relation classrep = database.getRelation(TypeUtil.CLASSLABEL); if (classrep != null) { return new ConvertToStringView(classrep); } } catch (NoSupportedDataTypeException e) { // retry. } throw new NoSupportedDataTypeException("No label-like representation was found."); } /** * Retrieves all class labels within the database. * * @param database the database to be scanned for class labels * @return a set comprising all class labels that are currently set in the * database */ public static SortedSet getClassLabels(Relation database) { SortedSet labels = new TreeSet<>(); for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) { labels.add(database.get(it)); } return labels; } /** * Retrieves all class labels within the database. * * @param database the database to be scanned for class labels * @return a set comprising all class labels that are currently set in the * database */ public static SortedSet getClassLabels(Database database) { final Relation relation = database.getRelation(TypeUtil.CLASSLABEL); return getClassLabels(relation); } /** * Do a cheap guess at the databases object class. * * @param Restriction type * @param database Database * @return Class of first object in the Database. */ @SuppressWarnings("unchecked") public static Class guessObjectClass(Relation database) { return (Class) database.get(database.iterDBIDs()).getClass(); } /** * Do a full inspection of the database to find the base object class. * * Note: this can be an abstract class or interface! * * TODO: Implement a full search for shared superclasses. But since currently * the databases will always use only once class, this is not yet implemented. * * @param Restriction type * @param database Database * @return Superclass of all objects in the database */ public static Class getBaseObjectClassExpensive(Relation database) { List> candidates = new ArrayList<>(); DBIDIter iditer = database.iterDBIDs(); // empty database?! if (!iditer.valid()) { return null; } // put first class into result set. candidates.add(database.get(iditer).getClass()); iditer.advance(); // other objects for (; iditer.valid(); iditer.advance()) { Class newcls = database.get(iditer).getClass(); // validate all candidates Iterator> ci = candidates.iterator(); while (ci.hasNext()) { Class cand = ci.next(); if (cand.isAssignableFrom(newcls)) { continue; } // TODO: resolve conflicts by finding all superclasses! // Does this code here work? for (Class interf : cand.getInterfaces()) { candidates.add(interf); } candidates.add(cand.getSuperclass()); ci.remove(); } } // if we have any candidates left ... if (candidates.size() > 0) { // remove subclasses Iterator> ci = candidates.iterator(); while (ci.hasNext()) { Class cand = ci.next(); for (Class oc : candidates) { if (!oc.equals(cand) && cand.isAssignableFrom(oc)) { ci.remove(); break; } } } assert (candidates.size() > 0); try { return candidates.get(0); } catch (ClassCastException e) { // ignore, and retry with next } } // no resulting class. return null; } /** * Find object by matching their labels. * * @param database Database to search in * @param name_pattern Name to match against class or object label * @return found cluster or it throws an exception. */ public static ArrayModifiableDBIDs getObjectsByLabelMatch(Database database, Pattern name_pattern) { Relation relation = guessLabelRepresentation(database); if (name_pattern == null) { return DBIDUtil.newArray(); } ArrayModifiableDBIDs ret = DBIDUtil.newArray(); for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { if (name_pattern.matcher(relation.get(iditer)).find()) { ret.add(iditer); } } return ret; } /** * An ugly vector type cast unavoidable in some situations due to Generics. * * @param Base vector type * @param Derived vector type (is actually V, too) * @param database Database * @return Database */ @SuppressWarnings("unchecked") public static , T extends NumberVector> Relation relationUglyVectorCast(Relation database) { return (Relation) database; } /** * Iterator class that retrieves the given objects from the database. * * @author Erich Schubert */ public static class RelationObjectIterator implements Iterator { /** * The real iterator. */ final DBIDIter iter; /** * The database we use. */ final Relation database; /** * Full Constructor. * * @param iter Original iterator. * @param database Database */ public RelationObjectIterator(DBIDIter iter, Relation database) { super(); this.iter = iter; this.database = database; } /** * Simplified constructor. * * @param database Database */ public RelationObjectIterator(Relation database) { super(); this.database = database; this.iter = database.iterDBIDs(); } @Override public boolean hasNext() { return iter.valid(); } @Override public O next() { O ret = database.get(iter); iter.advance(); return ret; } @Override public void remove() { throw new UnsupportedOperationException(); } } /** * Collection view on a database that retrieves the objects when needed. * * @author Erich Schubert */ public static class CollectionFromRelation extends AbstractCollection implements Collection { /** * The database we query. */ Relation db; /** * Constructor. * * @param db Database */ public CollectionFromRelation(Relation db) { super(); this.db = db; } @Override public Iterator iterator() { return new DatabaseUtil.RelationObjectIterator<>(db); } @Override public int size() { return db.size(); } } }