diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/itemsetmining/APRIORI.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/algorithm/itemsetmining/APRIORI.java | 619 |
1 files changed, 619 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/itemsetmining/APRIORI.java b/src/de/lmu/ifi/dbs/elki/algorithm/itemsetmining/APRIORI.java new file mode 100644 index 00000000..31279b9b --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/itemsetmining/APRIORI.java @@ -0,0 +1,619 @@ +package de.lmu.ifi.dbs.elki.algorithm.itemsetmining; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import gnu.trove.iterator.TLongIntIterator; +import gnu.trove.map.hash.TLongIntHashMap; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.data.BitVector; +import de.lmu.ifi.dbs.elki.data.SparseFeatureVector; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; +import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDs; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.statistics.Duration; +import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; +import de.lmu.ifi.dbs.elki.result.AprioriResult; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; + +/** + * The APRIORI algorithm for Mining Association Rules. + * + * Reference: + * <p> + * R. Agrawal, R. Srikant:<br /> + * Fast Algorithms for Mining Association Rules<br /> + * In Proc. 20th Int. Conf. on Very Large Data Bases (VLDB '94), Santiago de + * Chile, Chile 1994. + * </p> + * + * This implementation uses some simple optimizations for 1- and 2-itemsets. + * + * Note: this algorithm scales well to a large number of transactions, but not + * so well to a large number of frequent itemsets (items). For best results, use + * domain-specific preprocessing to aggregate items into groups. Use statistics + * logging to keep track of candidate set sizes. + * + * @author Arthur Zimek + * @author Erich Schubert + * + * @apiviz.has Itemset + * @apiviz.uses BitVector + */ +@Title("APRIORI: Algorithm for Mining Association Rules") +@Description("Searches for frequent itemsets") +@Reference(authors = "R. Agrawal, R. Srikant", // +title = "Fast Algorithms for Mining Association Rules", // +booktitle = "Proc. 20th Int. Conf. on Very Large Data Bases (VLDB '94), Santiago de Chile, Chile 1994", // +url = "http://www.vldb.org/conf/1994/P487.PDF") +public class APRIORI extends AbstractAlgorithm<AprioriResult> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(APRIORI.class); + + /** + * Statistics logging prefix. + */ + private final String STAT = this.getClass().getName() + "."; + + /** + * Minimum support. If less than 1, considered to be a relative frequency, + * otherwise an absolute count. + */ + private double minfreq; + + /** + * Constructor with minimum frequency. + * + * @param minfreq Minimum frequency + */ + public APRIORI(double minfreq) { + super(); + this.minfreq = minfreq; + } + + /** + * Performs the APRIORI algorithm on the given database. + * + * @param relation the Relation to process + * @return the AprioriResult learned by this APRIORI + */ + public AprioriResult run(Relation<BitVector> relation) { + DBIDs ids = relation.getDBIDs(); + List<Itemset> solution = new ArrayList<>(); + final int size = ids.size(); + final int needed = (int) ((minfreq < 1.) ? Math.ceil(minfreq * size) : minfreq); + + // TODO: we don't strictly require a vector field. + // We could work with knowing just the maximum dimensionality beforehand. + VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation); + if(size > 0) { + final int dim = meta.getDimensionality(); + Duration timeone = LOG.newDuration(STAT + "1-items.time").begin(); + List<OneItemset> oneitems = buildFrequentOneItemsets(relation, dim, needed); + LOG.statistics(timeone.end()); + if(LOG.isStatistics()) { + LOG.statistics(new LongStatistic(STAT + "1-items.frequent", oneitems.size())); + LOG.statistics(new LongStatistic(STAT + "1-items.transactions", ids.size())); + } + if(LOG.isDebuggingFine()) { + LOG.debugFine(debugDumpCandidates(new StringBuilder(), oneitems, meta)); + } + solution.addAll(oneitems); + if(oneitems.size() >= 2) { + Duration timetwo = LOG.newDuration(STAT + "2-items.time").begin(); + ArrayModifiableDBIDs survivors = DBIDUtil.newArray(ids.size()); + List<? extends Itemset> candidates = buildFrequentTwoItemsets(oneitems, relation, dim, needed, ids, survivors); + ids = survivors; // Continue with reduced set of transactions. + LOG.statistics(timetwo.end()); + if(LOG.isStatistics()) { + LOG.statistics(new LongStatistic(STAT + "2-items.frequent", candidates.size())); + LOG.statistics(new LongStatistic(STAT + "2-items.transactions", ids.size())); + } + if(LOG.isDebuggingFine()) { + LOG.debugFine(debugDumpCandidates(new StringBuilder(), candidates, meta)); + } + solution.addAll(candidates); + for(int length = 3; candidates.size() >= length; length++) { + Duration timel = LOG.newDuration(STAT + length + "-items.time").begin(); + // Join to get the new candidates + candidates = aprioriGenerate(candidates, length, dim); + if(LOG.isDebuggingFinest()) { + LOG.debugFinest(debugDumpCandidates(new StringBuilder().append("Before pruning: "), candidates, meta)); + } + survivors = DBIDUtil.newArray(ids.size()); + candidates = frequentItemsets(candidates, relation, needed, ids, survivors, length); + ids = survivors; // Continue with reduced set of transactions. + LOG.statistics(timel.end()); + if(LOG.isStatistics()) { + LOG.statistics(new LongStatistic(STAT + length + "-items.frequent", candidates.size())); + LOG.statistics(new LongStatistic(STAT + length + "-items.transactions", ids.size())); + } + if(LOG.isDebuggingFine()) { + LOG.debugFine(debugDumpCandidates(new StringBuilder(), candidates, meta)); + } + solution.addAll(candidates); + } + } + } + return new AprioriResult("APRIORI", "apriori", solution, meta); + } + + /** + * Build the 1-itemsets. + * + * @param relation Data relation + * @param dim Maximum dimensionality + * @param needed Minimum support needed + * @return 1-itemsets + */ + protected List<OneItemset> buildFrequentOneItemsets(final Relation<? extends SparseFeatureVector<?>> relation, final int dim, final int needed) { + // TODO: use TIntList and prefill appropriately to avoid knowing "dim" + // beforehand? + int[] counts = new int[dim]; + for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + SparseFeatureVector<?> bv = relation.get(iditer); + for(int it = bv.iter(); bv.iterValid(it); it = bv.iterAdvance(it)) { + counts[bv.iterDim(it)]++; + } + } + if(LOG.isStatistics()) { + LOG.statistics(new LongStatistic(STAT + "1-items.candidates", dim)); + } + // Generate initial candidates of length 1. + List<OneItemset> frequent = new ArrayList<>(dim); + for(int i = 0; i < dim; i++) { + if(counts[i] >= needed) { + frequent.add(new OneItemset(i, counts[i])); + } + } + return frequent; + } + + /** + * Build the 2-itemsets. + * + * @param oneitems Frequent 1-itemsets + * @param relation Data relation + * @param dim Maximum dimensionality + * @param needed Minimum support needed + * @param ids Objects to process + * @param survivors Output: objects that had at least two 1-frequent items. + * @return Frequent 2-itemsets + */ + protected List<SparseItemset> buildFrequentTwoItemsets(List<OneItemset> oneitems, final Relation<BitVector> relation, final int dim, final int needed, DBIDs ids, ArrayModifiableDBIDs survivors) { + int f1 = 0; + long[] mask = BitsUtil.zero(dim); + for(OneItemset supported : oneitems) { + BitsUtil.setI(mask, supported.item); + f1++; + } + if(LOG.isStatistics()) { + LOG.statistics(new LongStatistic(STAT + "2-items.candidates", f1 * (long) (f1 - 1))); + } + // We quite aggressively size the map, assuming that almost each combination + // is present somewhere. If this won't fit into memory, we're likely running + // OOM somewhere later anyway! + TLongIntHashMap map = new TLongIntHashMap((f1 * (f1 - 1)) >>> 1); + final long[] scratch = BitsUtil.zero(dim); + for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { + BitsUtil.setI(scratch, mask); + relation.get(iditer).andOnto(scratch); + int lives = 0; + for(int i = BitsUtil.nextSetBit(scratch, 0); i >= 0; i = BitsUtil.nextSetBit(scratch, i + 1)) { + for(int j = BitsUtil.nextSetBit(scratch, i + 1); j >= 0; j = BitsUtil.nextSetBit(scratch, j + 1)) { + long key = (((long) i) << 32) | j; + map.put(key, 1 + map.get(key)); + ++lives; + } + } + if(lives > 2) { + survivors.add(iditer); + } + } + // Generate candidates of length 2. + List<SparseItemset> frequent = new ArrayList<>(f1 * (int) Math.sqrt(f1)); + for(TLongIntIterator iter = map.iterator(); iter.hasNext();) { + iter.advance(); // Trove style iterator - advance first. + if(iter.value() >= needed) { + int ii = (int) (iter.key() >>> 32); + int ij = (int) (iter.key() & -1L); + frequent.add(new SparseItemset(new int[] { ii, ij }, iter.value())); + } + } + // The hashmap may produce them out of order. + Collections.sort(frequent); + if(LOG.isStatistics()) { + LOG.statistics(new LongStatistic(STAT + "2-items.frequent", frequent.size())); + } + return frequent; + } + + /** + * Prunes a given set of candidates to keep only those BitSets where all + * subsets of bits flipping one bit are frequent already. + * + * @param supported Support map + * @param length Itemset length + * @param dim Dimensionality + * @return itemsets that cannot be pruned by apriori + */ + protected List<Itemset> aprioriGenerate(List<? extends Itemset> supported, int length, int dim) { + if(supported.size() < length) { + return Collections.emptyList(); + } + long joined = 0L; + final int ssize = supported.size(); + List<Itemset> candidateList = new ArrayList<>(); + + Itemset ref = supported.get(0); + if(ref instanceof SparseItemset) { + // TODO: we currently never switch to DenseItemSet. This may however be + // beneficial when we have few dimensions and many candidates. + // E.g. when length > 32 and dim < 100. But this needs benchmarking! + // For length < 5 and dim > 3000, SparseItemset unsurprisingly was faster + + // Scratch item to use for searching. + SparseItemset scratch = new SparseItemset(new int[length - 1]); + + for(int i = 0; i < ssize; i++) { + SparseItemset ii = (SparseItemset) supported.get(i); + prefix: for(int j = i + 1; j < ssize; j++) { + SparseItemset ij = (SparseItemset) supported.get(j); + if(!ii.prefixTest(ij)) { + break prefix; // Prefix doesn't match + } + joined++; + // Test subsets (re-) using scratch object + System.arraycopy(ii.indices, 1, scratch.indices, 0, length - 2); + scratch.indices[length - 2] = ij.indices[length - 2]; + for(int k = length - 3; k >= 0; k--) { + scratch.indices[k] = ii.indices[k + 1]; + int pos = Collections.binarySearch(supported, scratch); + if(pos < 0) { + // Prefix was okay, but one other subset was not frequent + continue prefix; + } + } + int[] items = new int[length]; + System.arraycopy(ii.indices, 0, items, 0, length - 1); + items[length - 1] = ij.indices[length - 2]; + candidateList.add(new SparseItemset(items)); + } + } + } + else if(ref instanceof DenseItemset) { + // Scratch item to use for searching. + DenseItemset scratch = new DenseItemset(BitsUtil.zero(dim), length - 1); + + for(int i = 0; i < ssize; i++) { + DenseItemset ii = (DenseItemset) supported.get(i); + prefix: for(int j = i + 1; j < ssize; j++) { + DenseItemset ij = (DenseItemset) supported.get(j); + // Prefix test via "|i1 ^ i2| = 2" + System.arraycopy(ii.items, 0, scratch.items, 0, ii.items.length); + BitsUtil.xorI(scratch.items, ij.items); + if(BitsUtil.cardinality(scratch.items) != 2) { + break prefix; // No prefix match; since sorted, no more can follow! + } + ++joined; + // Ensure that the first difference is the last item in ii: + int first = BitsUtil.nextSetBit(scratch.items, 0); + if(BitsUtil.nextSetBit(ii.items, first + 1) > -1) { + break prefix; // Different overlap by chance? + } + BitsUtil.orI(scratch.items, ij.items); + + // Test subsets. + for(int l = length, b = BitsUtil.nextSetBit(scratch.items, 0); l > 2; l--, b = BitsUtil.nextSetBit(scratch.items, b + 1)) { + BitsUtil.clearI(scratch.items, b); + int pos = Collections.binarySearch(supported, scratch); + if(pos < 0) { + continue prefix; + } + BitsUtil.setI(scratch.items, b); + } + candidateList.add(new DenseItemset(scratch.items.clone(), length)); + } + } + } + else { + throw new AbortException("Unexpected itemset type " + ref.getClass()); + } + if(LOG.isStatistics()) { + // Naive pairwise approach + LOG.statistics(new LongStatistic(STAT + length + "-items.pairwise", (ssize * ((long) ssize - 1)))); + LOG.statistics(new LongStatistic(STAT + length + "-items.joined", joined)); + LOG.statistics(new LongStatistic(STAT + length + "-items.candidates", candidateList.size())); + } + // Note: candidates should have been generated in strictly ascending order + // So we do not need to sort here. + return candidateList; + } + + /** + * Returns the frequent BitSets out of the given BitSets with respect to the + * given database. + * + * @param candidates the candidates to be evaluated + * @param relation the database to evaluate the candidates on + * @param needed Minimum support needed + * @param ids Objects to process + * @param survivors Output: objects that had at least two 1-frequent items. + * @param length Itemset length + * @return Itemsets with sufficient support + */ + protected List<? extends Itemset> frequentItemsets(List<? extends Itemset> candidates, Relation<BitVector> relation, int needed, DBIDs ids, ArrayModifiableDBIDs survivors, int length) { + if(candidates.size() < 1) { + return Collections.emptyList(); + } + Itemset first = candidates.get(0); + // We have an optimized codepath for large and sparse itemsets. + // It probably pays off when #cands >> (avlen choose length) but we do not + // currently have the average number of items. These thresholds yield + // 2700, 6400, 12500, ... and thus will almost always be met until the + // number of frequent itemsets is about to break down to 0. + if(candidates.size() > length * length * length * 100 && first instanceof SparseItemset) { + // Assume that all itemsets are sparse itemsets! + @SuppressWarnings("unchecked") + List<SparseItemset> sparsecand = (List<SparseItemset>) candidates; + return frequentItemsetsSparse(sparsecand, relation, needed, ids, survivors, length); + } + for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { + BitVector bv = relation.get(iditer); + // TODO: exploit that the candidate set it sorted? + int lives = 0; + for(Itemset candidate : candidates) { + if(candidate.containedIn(bv)) { + candidate.increaseSupport(); + ++lives; + } + } + if(lives > length) { + survivors.add(iditer); + } + } + // Retain only those with minimum support: + List<Itemset> frequent = new ArrayList<>(candidates.size()); + for(Iterator<? extends Itemset> iter = candidates.iterator(); iter.hasNext();) { + final Itemset candidate = iter.next(); + if(candidate.getSupport() >= needed) { + frequent.add(candidate); + } + } + return frequent; + } + + /** + * Returns the frequent BitSets out of the given BitSets with respect to the + * given database. Optimized implementation for SparseItemset. + * + * @param candidates the candidates to be evaluated + * @param relation the database to evaluate the candidates on + * @param needed Minimum support needed + * @param ids Objects to process + * @param survivors Output: objects that had at least two 1-frequent items. + * @param length Itemset length + * @return Itemsets with sufficient support + */ + protected List<SparseItemset> frequentItemsetsSparse(List<SparseItemset> candidates, Relation<BitVector> relation, int needed, DBIDs ids, ArrayModifiableDBIDs survivors, int length) { + // Current search interval: + int begin = 0, end = candidates.size(); + int[] scratchi = new int[length], iters = new int[length]; + SparseItemset scratch = new SparseItemset(scratchi); + for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { + BitVector bv = relation.get(iditer); + if(!initializeSearchItemset(bv, scratchi, iters)) { + continue; + } + int lives = 0; + while(begin < end) { + begin = binarySearch(candidates, scratch, begin, end); + if(begin > 0) { + candidates.get(begin).increaseSupport(); + ++lives; + } + else { + begin = (-begin) - 1; + } + if(begin >= end || !nextSearchItemset(bv, scratchi, iters)) { + break; + } + } + for(Itemset candidate : candidates) { + if(candidate.containedIn(bv)) { + candidate.increaseSupport(); + ++lives; + } + } + if(lives > length) { + survivors.add(iditer); + } + } + // Retain only those with minimum support: + List<SparseItemset> frequent = new ArrayList<>(candidates.size()); + for(Iterator<SparseItemset> iter = candidates.iterator(); iter.hasNext();) { + final SparseItemset candidate = iter.next(); + if(candidate.getSupport() >= needed) { + frequent.add(candidate); + } + } + return frequent; + } + + /** + * Initialize the scratch itemset. + * + * @param bv Bit vector data source + * @param scratchi Scratch itemset + * @param iters Iterator array + * @return {@code true} if the itemset had minimum length + */ + private boolean initializeSearchItemset(BitVector bv, int[] scratchi, int[] iters) { + for(int i = 0; i < scratchi.length; i++) { + iters[i] = (i == 0) ? bv.iter() : bv.iterAdvance(iters[i - 1]); + if(iters[i] < 0) { + return false; + } + scratchi[i] = bv.iterDim(iters[i]); + } + return true; + } + + /** + * Advance scratch itemset to the next. + * + * @param bv Bit vector data source + * @param scratchi Scratch itemset + * @param iters Iterator array + * @return {@code true} if the itemset had minimum length + */ + private boolean nextSearchItemset(BitVector bv, int[] scratchi, int[] iters) { + final int last = scratchi.length - 1; + for(int j = last; j >= 0; j--) { + int n = bv.iterAdvance(iters[j]); + if(n >= 0 && (j == last || n != iters[j + 1])) { + iters[j] = n; + scratchi[j] = bv.iterDim(n); + return true; // Success + } + } + return false; + } + + /** + * Binary-search for the next-larger element. + * + * @param candidates Candidates to search for + * @param scratch Scratch space + * @param begin Search interval begin + * @param end Search interval end + * @return Position of first equal-or-larger element + */ + private int binarySearch(List<SparseItemset> candidates, SparseItemset scratch, int begin, int end) { + --end; + while(begin < end) { + final int mid = (begin + end) >>> 1; + SparseItemset midVal = candidates.get(mid); + int cmp = midVal.compareTo(scratch); + + if(cmp < 0) { + begin = mid + 1; + } + else if(cmp > 0) { + end = mid - 1; + } + else { + return mid; // key found + } + } + return -(begin + 1); // key not found, return next + } + + /** + * Debug method: output all itemsets. + * + * @param msg Output buffer + * @param candidates Itemsets to dump + * @param meta Metadata for item labels + * @return Output buffer + */ + private StringBuilder debugDumpCandidates(StringBuilder msg, List<? extends Itemset> candidates, VectorFieldTypeInformation<BitVector> meta) { + msg.append(':'); + for(Itemset itemset : candidates) { + msg.append(" ["); + itemset.appendTo(msg, meta); + msg.append(']'); + } + return msg; + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.BIT_VECTOR_FIELD); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Parameter to specify the minimum support, in absolute or relative terms. + */ + public static final OptionID MINSUPP_ID = new OptionID("apriori.minsupp", // + "Threshold for minimum support as minimally required number of transactions (if > 1) " // + + "or the minimum frequency (if <= 1)."); + + /** + * Parameter for minimum support. + */ + protected double minsupp; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + DoubleParameter minsuppP = new DoubleParameter(MINSUPP_ID); + minsuppP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + if(config.grab(minsuppP)) { + minsupp = minsuppP.getValue(); + } + } + + @Override + protected APRIORI makeInstance() { + return new APRIORI(minsupp); + } + } +}
\ No newline at end of file |