summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/utilities/DatabaseUtil.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/utilities/DatabaseUtil.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/utilities/DatabaseUtil.java382
1 files changed, 61 insertions, 321 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/utilities/DatabaseUtil.java b/src/de/lmu/ifi/dbs/elki/utilities/DatabaseUtil.java
index baa90829..110b5bad 100644
--- a/src/de/lmu/ifi/dbs/elki/utilities/DatabaseUtil.java
+++ b/src/de/lmu/ifi/dbs/elki/utilities/DatabaseUtil.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.utilities;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -23,44 +23,32 @@ package de.lmu.ifi.dbs.elki.utilities;
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-import java.util.AbstractCollection;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Pattern;
import de.lmu.ifi.dbs.elki.data.ClassLabel;
-import de.lmu.ifi.dbs.elki.data.FeatureVector;
import de.lmu.ifi.dbs.elki.data.LabelList;
-import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.NoSupportedDataTypeException;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
-import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
-import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.PreprocessorKNNQuery;
import de.lmu.ifi.dbs.elki.database.relation.ConvertToStringView;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
-import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.index.preprocessed.knn.MaterializeKNNPreprocessor;
/**
* Class with Database-related utility functions such as centroid computation,
* covariances etc.
*
* @author Erich Schubert
- *
- * @apiviz.landmark
- *
- * @apiviz.has RelationObjectIterator
- * @apiviz.has CollectionFromRelation
*/
public final class DatabaseUtil {
/**
@@ -69,124 +57,6 @@ public final class DatabaseUtil {
private DatabaseUtil() {
// Do not instantiate!
}
-
- /**
- * Get the dimensionality of a relation.
- *
- * @param relation Relation
- * @return Dimensionality
- *
- * @deprecated Use {@link RelationUtil#dimensionality(Relation)} instead!
- */
- @Deprecated
- public static <V extends FeatureVector<?>> int dimensionality(Relation<V> relation) {
- return RelationUtil.dimensionality(relation);
- }
-
- /**
- * Determines the variances in each dimension of the specified objects stored
- * in the given database.
- *
- * @param database the database storing the objects
- * @param ids the ids of the objects
- * @param centroid the centroid or reference vector of the ids
- * @return the variances in each dimension of the specified objects
- */
- public static double[] variances(Relation<? extends NumberVector<?>> database, NumberVector<?> centroid, DBIDs ids) {
- final int size = ids.size();
- double[] variances = new double[centroid.getDimensionality()];
-
- for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
- NumberVector<?> o = database.get(iter);
- for (int d = 0; d < centroid.getDimensionality(); d++) {
- final double diff = o.doubleValue(d) - centroid.doubleValue(d);
- variances[d ] += diff * diff / size;
- }
- }
- return variances;
- }
-
- /**
- * Determines the minimum and maximum values in each dimension of all objects
- * stored in the given database.
- *
- * @param <NV> vector type
- * @param relation the database storing the objects
- * @return Minimum and Maximum vector for the hyperrectangle
- */
- public static <NV extends NumberVector<?>> Pair<NV, NV> computeMinMax(Relation<NV> relation) {
- int dim = RelationUtil.dimensionality(relation);
- double[] mins = new double[dim];
- double[] maxs = new double[dim];
- for (int i = 0; i < dim; i++) {
- mins[i] = Double.MAX_VALUE;
- maxs[i] = -Double.MAX_VALUE;
- }
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- final NV o = relation.get(iditer);
- for (int d = 0; d < dim; d++) {
- final double v = o.doubleValue(d);
- mins[d] = Math.min(mins[d], v);
- maxs[d] = Math.max(maxs[d], v);
- }
- }
- NumberVector.Factory<NV, ?> factory = RelationUtil.getNumberVectorFactory(relation);
- NV min = factory.newNumberVector(mins);
- NV max = factory.newNumberVector(maxs);
- return new Pair<>(min, max);
- }
-
- /**
- * Returns the median of a data set in the given dimension by using a sampling
- * method.
- *
- * @param relation Relation to process
- * @param ids DBIDs to process
- * @param dimension Dimensionality
- * @param numberOfSamples Number of samples to draw
- * @return Median value
- */
- public static <V extends NumberVector<?>> double quickMedian(Relation<V> relation, ArrayDBIDs ids, int dimension, int numberOfSamples) {
- final int everyNthItem = (int) Math.max(1, Math.floor(ids.size() / (double) numberOfSamples));
- final double[] vals = new double[numberOfSamples];
- for (int i = 0; i < numberOfSamples; i++) {
- final DBID id = ids.get(i * everyNthItem);
- vals[i] = relation.get(id).doubleValue(dimension);
- }
- Arrays.sort(vals);
- if (vals.length % 2 == 1) {
- return vals[((vals.length + 1) >> 1) - 1];
- } else {
- final double v1 = vals[vals.length >> 1];
- final double v2 = vals[(vals.length >> 1) - 1];
- return (v1 + v2) / 2.0;
- }
- }
-
- /**
- * Returns the median of a data set in the given dimension.
- *
- * @param relation Relation to process
- * @param ids DBIDs to process
- * @param dimension Dimensionality
- * @return Median value
- */
- public static <V extends NumberVector<?>> double exactMedian(Relation<V> relation, DBIDs ids, int dimension) {
- final double[] vals = new double[ids.size()];
- int i = 0;
- for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
- vals[i] = relation.get(iter).doubleValue(dimension);
- i++;
- }
- Arrays.sort(vals);
- if (vals.length % 2 == 1) {
- return vals[((vals.length + 1) >> 1) - 1];
- } else {
- final double v1 = vals[vals.length >> 1];
- final double v2 = vals[(vals.length >> 1) - 1];
- return (v1 + v2) / 2.0;
- }
- }
/**
* Guess a potentially label-like representation, preferring class labels.
@@ -197,26 +67,29 @@ public final class DatabaseUtil {
public static Relation<String> guessLabelRepresentation(Database database) throws NoSupportedDataTypeException {
try {
Relation<? extends ClassLabel> classrep = database.getRelation(TypeUtil.CLASSLABEL);
- if (classrep != null) {
+ if(classrep != null) {
return new ConvertToStringView(classrep);
}
- } catch (NoSupportedDataTypeException e) {
+ }
+ catch(NoSupportedDataTypeException e) {
// retry.
}
try {
Relation<? extends LabelList> labelsrep = database.getRelation(TypeUtil.LABELLIST);
- if (labelsrep != null) {
+ if(labelsrep != null) {
return new ConvertToStringView(labelsrep);
}
- } catch (NoSupportedDataTypeException e) {
+ }
+ catch(NoSupportedDataTypeException e) {
// retry.
}
try {
Relation<String> stringrep = database.getRelation(TypeUtil.STRING);
- if (stringrep != null) {
+ if(stringrep != null) {
return stringrep;
}
- } catch (NoSupportedDataTypeException e) {
+ }
+ catch(NoSupportedDataTypeException e) {
// retry.
}
throw new NoSupportedDataTypeException("No label-like representation was found.");
@@ -231,26 +104,29 @@ public final class DatabaseUtil {
public static Relation<String> guessObjectLabelRepresentation(Database database) throws NoSupportedDataTypeException {
try {
Relation<? extends LabelList> labelsrep = database.getRelation(TypeUtil.LABELLIST);
- if (labelsrep != null) {
+ if(labelsrep != null) {
return new ConvertToStringView(labelsrep);
}
- } catch (NoSupportedDataTypeException e) {
+ }
+ catch(NoSupportedDataTypeException e) {
// retry.
}
try {
Relation<String> stringrep = database.getRelation(TypeUtil.STRING);
- if (stringrep != null) {
+ if(stringrep != null) {
return stringrep;
}
- } catch (NoSupportedDataTypeException e) {
+ }
+ catch(NoSupportedDataTypeException e) {
// retry.
}
try {
Relation<? extends ClassLabel> classrep = database.getRelation(TypeUtil.CLASSLABEL);
- if (classrep != null) {
+ if(classrep != null) {
return new ConvertToStringView(classrep);
}
- } catch (NoSupportedDataTypeException e) {
+ }
+ catch(NoSupportedDataTypeException e) {
// retry.
}
throw new NoSupportedDataTypeException("No label-like representation was found.");
@@ -265,7 +141,7 @@ public final class DatabaseUtil {
*/
public static SortedSet<ClassLabel> getClassLabels(Relation<? extends ClassLabel> database) {
SortedSet<ClassLabel> labels = new TreeSet<>();
- for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
+ for(DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
labels.add(database.get(it));
}
return labels;
@@ -284,83 +160,6 @@ public final class DatabaseUtil {
}
/**
- * Do a cheap guess at the databases object class.
- *
- * @param <O> Restriction type
- * @param database Database
- * @return Class of first object in the Database.
- */
- @SuppressWarnings("unchecked")
- public static <O> Class<? extends O> guessObjectClass(Relation<O> database) {
- return (Class<? extends O>) database.get(database.iterDBIDs()).getClass();
- }
-
- /**
- * Do a full inspection of the database to find the base object class.
- *
- * Note: this can be an abstract class or interface!
- *
- * TODO: Implement a full search for shared superclasses. But since currently
- * the databases will always use only once class, this is not yet implemented.
- *
- * @param <O> Restriction type
- * @param database Database
- * @return Superclass of all objects in the database
- */
- public static <O> Class<?> getBaseObjectClassExpensive(Relation<O> database) {
- List<Class<?>> candidates = new ArrayList<>();
- DBIDIter iditer = database.iterDBIDs();
- // empty database?!
- if (!iditer.valid()) {
- return null;
- }
- // put first class into result set.
- candidates.add(database.get(iditer).getClass());
- iditer.advance();
- // other objects
- for (; iditer.valid(); iditer.advance()) {
- Class<?> newcls = database.get(iditer).getClass();
- // validate all candidates
- Iterator<Class<?>> ci = candidates.iterator();
- while (ci.hasNext()) {
- Class<?> cand = ci.next();
- if (cand.isAssignableFrom(newcls)) {
- continue;
- }
- // TODO: resolve conflicts by finding all superclasses!
- // Does this code here work?
- for (Class<?> interf : cand.getInterfaces()) {
- candidates.add(interf);
- }
- candidates.add(cand.getSuperclass());
- ci.remove();
- }
- }
- // if we have any candidates left ...
- if (candidates.size() > 0) {
- // remove subclasses
- Iterator<Class<?>> ci = candidates.iterator();
- while (ci.hasNext()) {
- Class<?> cand = ci.next();
- for (Class<?> oc : candidates) {
- if (!oc.equals(cand) && cand.isAssignableFrom(oc)) {
- ci.remove();
- break;
- }
- }
- }
- assert (candidates.size() > 0);
- try {
- return candidates.get(0);
- } catch (ClassCastException e) {
- // ignore, and retry with next
- }
- }
- // no resulting class.
- return null;
- }
-
- /**
* Find object by matching their labels.
*
* @param database Database to search in
@@ -369,12 +168,12 @@ public final class DatabaseUtil {
*/
public static ArrayModifiableDBIDs getObjectsByLabelMatch(Database database, Pattern name_pattern) {
Relation<String> relation = guessLabelRepresentation(database);
- if (name_pattern == null) {
+ if(name_pattern == null) {
return DBIDUtil.newArray();
}
ArrayModifiableDBIDs ret = DBIDUtil.newArray();
- for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
- if (name_pattern.matcher(relation.get(iditer)).find()) {
+ for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
+ if(name_pattern.matcher(relation.get(iditer)).find()) {
ret.add(iditer);
}
}
@@ -382,104 +181,45 @@ public final class DatabaseUtil {
}
/**
- * An ugly vector type cast unavoidable in some situations due to Generics.
+ * Get (or create) a precomputed kNN query for the database.
*
- * @param <V> Base vector type
- * @param <T> Derived vector type (is actually V, too)
* @param database Database
- * @return Database
- */
- @SuppressWarnings("unchecked")
- public static <V extends NumberVector<?>, T extends NumberVector<?>> Relation<V> relationUglyVectorCast(Relation<T> database) {
- return (Relation<V>) database;
- }
-
- /**
- * Iterator class that retrieves the given objects from the database.
- *
- * @author Erich Schubert
+ * @param relation Relation
+ * @param dq Distance query
+ * @param k required number of neighbors
+ * @return KNNQuery for the given relation, that is precomputed.
*/
- public static class RelationObjectIterator<O> implements Iterator<O> {
- /**
- * The real iterator.
- */
- final DBIDIter iter;
-
- /**
- * The database we use.
- */
- final Relation<? extends O> database;
-
- /**
- * Full Constructor.
- *
- * @param iter Original iterator.
- * @param database Database
- */
- public RelationObjectIterator(DBIDIter iter, Relation<? extends O> database) {
- super();
- this.iter = iter;
- this.database = database;
- }
-
- /**
- * Simplified constructor.
- *
- * @param database Database
- */
- public RelationObjectIterator(Relation<? extends O> database) {
- super();
- this.database = database;
- this.iter = database.iterDBIDs();
- }
-
- @Override
- public boolean hasNext() {
- return iter.valid();
- }
-
- @Override
- public O next() {
- O ret = database.get(iter);
- iter.advance();
- return ret;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
+ public static <O> KNNQuery<O> precomputedKNNQuery(Database database, Relation<O> relation, DistanceQuery<O> dq, int k) {
+ // "HEAVY" flag for knn query since it is used more than once
+ KNNQuery<O> knnq = database.getKNNQuery(dq, k, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE);
+ // No optimized kNN query - use a preprocessor!
+ if(knnq instanceof PreprocessorKNNQuery) {
+ return knnq;
}
+ MaterializeKNNPreprocessor<O> preproc = new MaterializeKNNPreprocessor<>(relation, dq.getDistanceFunction(), k);
+ preproc.initialize();
+ return preproc.getKNNQuery(dq, k);
}
/**
- * Collection view on a database that retrieves the objects when needed.
+ * Get (or create) a precomputed kNN query for the database.
*
- * @author Erich Schubert
- */
- public static class CollectionFromRelation<O> extends AbstractCollection<O> implements Collection<O> {
- /**
- * The database we query.
- */
- Relation<? extends O> db;
-
- /**
- * Constructor.
- *
- * @param db Database
- */
- public CollectionFromRelation(Relation<? extends O> db) {
- super();
- this.db = db;
- }
-
- @Override
- public Iterator<O> iterator() {
- return new DatabaseUtil.RelationObjectIterator<>(db);
- }
-
- @Override
- public int size() {
- return db.size();
- }
+ * @param database Database
+ * @param relation Relation
+ * @param distf Distance function
+ * @param k required number of neighbors
+ * @return KNNQuery for the given relation, that is precomputed.
+ */
+ public static <O> KNNQuery<O> precomputedKNNQuery(Database database, Relation<O> relation, DistanceFunction<? super O> distf, int k) {
+ DistanceQuery<O> dq = database.getDistanceQuery(relation, distf);
+ // "HEAVY" flag for knn query since it is used more than once
+ KNNQuery<O> knnq = database.getKNNQuery(dq, k, DatabaseQuery.HINT_HEAVY_USE, DatabaseQuery.HINT_OPTIMIZED_ONLY, DatabaseQuery.HINT_NO_CACHE);
+ // No optimized kNN query - use a preprocessor!
+ if(knnq instanceof PreprocessorKNNQuery) {
+ return knnq;
+ }
+ MaterializeKNNPreprocessor<O> preproc = new MaterializeKNNPreprocessor<>(relation, dq.getDistanceFunction(), k);
+ preproc.initialize();
+ return preproc.getKNNQuery(dq, k);
}
}