From 84cdfa55fc81c233a308c82c5fa6d482b8661ca0 Mon Sep 17 00:00:00 2001 From: Clifford Wolf Date: Sat, 2 Mar 2013 13:53:59 +0100 Subject: Added frequent subcircuit miner to subcircuit library --- libs/subcircuit/.gitignore | 2 + libs/subcircuit/Makefile | 1 + libs/subcircuit/README | 43 ++++++-- libs/subcircuit/scshell.cc | 34 +++++- libs/subcircuit/subcircuit.cc | 251 ++++++++++++++++++++++++++++++++++++++++++ libs/subcircuit/subcircuit.h | 15 +++ libs/subcircuit/test_mine.txt | 35 ++++++ 7 files changed, 368 insertions(+), 13 deletions(-) create mode 100644 libs/subcircuit/.gitignore create mode 100644 libs/subcircuit/test_mine.txt diff --git a/libs/subcircuit/.gitignore b/libs/subcircuit/.gitignore new file mode 100644 index 00000000..9f1eb4e8 --- /dev/null +++ b/libs/subcircuit/.gitignore @@ -0,0 +1,2 @@ +demo +scshell diff --git a/libs/subcircuit/Makefile b/libs/subcircuit/Makefile index af745b4b..f81085b5 100644 --- a/libs/subcircuit/Makefile +++ b/libs/subcircuit/Makefile @@ -39,6 +39,7 @@ scshell: scshell.o subcircuit.o test: scshell ./scshell < test_macc22.txt + ./scshell < test_mine.txt perl test_perm.pl | ./scshell splrun test_shorts.spl | ./scshell splrun test_large.spl | ./scshell diff --git a/libs/subcircuit/README b/libs/subcircuit/README index 5c8a8a9e..d1bdb1f6 100644 --- a/libs/subcircuit/README +++ b/libs/subcircuit/README @@ -14,20 +14,12 @@ Introduction This is a library that implements a modified Ullmann Subgraph Isomorphism Algorithm with additional features aimed at working with coarse grain logic -networks. +networks. It also contains a simple frequent subcircuit mining algorithm. A simple command line tool that exposes the features of the library is also included. -Under-Construction Warning --------------------------- - -This work is under constructions. It is likely that they are bugs in the -library that need fixing. Feel free to contact me at clifford@clifford.at -if you have found a bug. - - C++11 Warning ------------- @@ -97,6 +89,9 @@ Algorithm are provided by the library. * Support for finding only non-overlapping matches. + * A simple miner for frequent subcircuts that operates on the same circuit + description format. + * The public API of the library is using std::string identifiers for nodes, node types and ports. Internally the costly part of the algorithm is only using integer values, thus speeding up the @@ -328,6 +323,32 @@ bool userCheckSolution(result): ignored. The default implementation always returns true. +Mining for frequent SubCircuits +------------------------------- + +The solver also contains a miner for frequent subcircuits. The following code +fragment will find all frequent subcircuits with at least minNodes nodes and +at most maxNodes nodes that occurs at least minMatches times: + + std::vector results; + mySolver.mine(results, minNodes, maxNodes, minMatches); + +The miner works by finding frequent pairs of nodes and then combining them +to larger subcircuits. Because of this incremental strategy the miner only +works as expected on graphs with markAllExtern() set. + +The mine() method has an optional fifth parameter that limits the number +of matches counted in one graph. This can be useful when mining for circuits +that are found in at least a number of graphs. E.g. the following call +would find all subcircuits with 5 nodes that are found in at least 7 of +the registered graphs: + + mySolver.mine(results, 5, 5, 7, 1); + +Note that this miner is not very efficient and therefore its use is not +recommended for large circuits. + + Debugging --------- @@ -420,6 +441,10 @@ The following commands can be used in scshell outside a graph ... endgraph block Call Solver::solve(). The must be "1" or "true" for true and "0" or "false" for false. + mine [] + + Call Solver::mine(). + expect Print all results so far since the last call to expect. Expect diff --git a/libs/subcircuit/scshell.cc b/libs/subcircuit/scshell.cc index 70afcfd4..c4b37a4d 100644 --- a/libs/subcircuit/scshell.cc +++ b/libs/subcircuit/scshell.cc @@ -26,6 +26,7 @@ int main() SubCircuit::Solver solver; std::map> initialMappings; std::vector results; + std::vector mineResults; std::vector cmdBuffer; bool lastCommandExpect = false; @@ -162,6 +163,12 @@ int main() continue; } + if (cmdBuffer[0] == "mine" && 4 <= cmdBuffer.size() && cmdBuffer.size() <= 5) { + solver.mine(mineResults, atoi(cmdBuffer[1].c_str()), atoi(cmdBuffer[2].c_str()), + atoi(cmdBuffer[3].c_str()), cmdBuffer.size() == 5 ? atoi(cmdBuffer[4].c_str()) : -1); + continue; + } + if (cmdBuffer[0] == "clearoverlap" && cmdBuffer.size() == 1) { solver.clearOverlapHistory(); continue; @@ -179,7 +186,7 @@ int main() if (cmdBuffer[0] == "expect" && cmdBuffer.size() == 2) { int expected = atoi(cmdBuffer[1].c_str()); - printf("\n-- Expected %d, Got %d --\n", expected, int(results.size())); + printf("\n-- Expected %d, Got %d --\n", expected, int(results.size()) + int(mineResults.size())); for (int i = 0; i < int(results.size()); i++) { printf("\nMatch #%d: (%s in %s)\n", i, results[i].needleGraphId.c_str(), results[i].haystackGraphId.c_str()); for (const auto &it : results[i].mappings) { @@ -189,9 +196,18 @@ int main() printf("\n"); } } + for (auto &result : mineResults) { + printf("\nFrequent SubCircuit with %d nodes and %d matches:\n", int(result.nodes.size()), result.totalMatchesAfterLimits); + printf(" primary match in %s:", result.graphId.c_str()); + for (auto &node : result.nodes) + printf(" %s", node.nodeId.c_str()); + printf("\n"); + for (auto &it : result.matchesPerGraph) + printf(" matches in %s: %d\n", it.first.c_str(), it.second); + } printf("\n"); - if (expected != int(results.size())) { - printf("^^ expected %d, Got %d ^^\n\n", expected, int(results.size())); + if (expected != int(results.size()) + int(mineResults.size())) { + printf("^^ expected %d, Got %d ^^\n\n", expected, int(results.size()) + int(mineResults.size())); printf(" +----------------+\n"); printf(" | \\|/ ____ \\|/ |\n"); printf(" | \"@'/ ,. \\`@\" |\n"); @@ -202,6 +218,7 @@ int main() return 1; } results.clear(); + mineResults.clear(); lastCommandExpect = true; continue; } @@ -215,7 +232,7 @@ int main() delete graph; if (!lastCommandExpect) { - printf("\n-- Got %d --\n", int(results.size())); + printf("\n-- Got %d --\n", int(results.size()) + int(mineResults.size())); for (int i = 0; i < int(results.size()); i++) { printf("\nMatch #%d: (%s in %s)\n", i, results[i].needleGraphId.c_str(), results[i].haystackGraphId.c_str()); for (const auto &it : results[i].mappings) { @@ -225,6 +242,15 @@ int main() printf("\n"); } } + for (auto &result : mineResults) { + printf("\nFrequent SubCircuit with %d nodes and %d matches:\n", int(result.nodes.size()), result.totalMatchesAfterLimits); + printf(" primary match in %s:", result.graphId.c_str()); + for (auto &node : result.nodes) + printf(" %s", node.nodeId.c_str()); + printf("\n"); + for (auto &it : result.matchesPerGraph) + printf(" matches in %s: %d\n", it.first.c_str(), it.second); + } } else printf("PASSED.\n"); diff --git a/libs/subcircuit/subcircuit.cc b/libs/subcircuit/subcircuit.cc index b49fa97c..a55b97ab 100644 --- a/libs/subcircuit/subcircuit.cc +++ b/libs/subcircuit/subcircuit.cc @@ -46,6 +46,42 @@ static std::string stringf(const char *fmt, ...) return string; } +SubCircuit::Graph::Graph(const Graph &other, const std::vector &otherNodes) +{ + allExtern = other.allExtern; + + std::map other2this; + for (int i = 0; i < int(otherNodes.size()); i++) { + assert(other.nodeMap.count(otherNodes[i]) > 0); + other2this[other.nodeMap.at(otherNodes[i])] = i; + nodeMap[otherNodes[i]] = i; + } + + std::map edges2this; + for (auto &i1 : other2this) + for (auto &i2 : other.nodes[i1.first].ports) + for (auto &i3 : i2.bits) + if (edges2this.count(i3.edgeIdx) == 0) + edges2this[i3.edgeIdx] = edges2this.size(); + + edges.resize(edges2this.size()); + for (auto &it : edges2this) { + for (auto &bit : other.edges[it.first].portBits) + if (other2this.count(bit.nodeIdx) > 0) + edges[it.second].portBits.insert(BitRef(other2this[bit.nodeIdx], bit.portIdx, bit.bitIdx)); + edges[it.second].constValue = other.edges[it.first].constValue; + edges[it.second].isExtern = other.edges[it.first].isExtern; + } + + nodes.resize(other2this.size()); + for (auto &it : other2this) { + nodes[it.second] = other.nodes[it.first]; + for (auto &i2 : nodes[it.second].ports) + for (auto &i3 : i2.bits) + i3.edgeIdx = edges2this.at(i3.edgeIdx); + } +} + bool SubCircuit::Graph::BitRef::operator < (const BitRef &other) const { if (nodeIdx != other.nodeIdx) @@ -1072,6 +1108,197 @@ class SubCircuit::SolverWorker } } + // additional data structes and functions for mining + + struct NodeSet { + std::string graphId; + std::set nodes; + NodeSet(std::string graphId, int node1, int node2) { + this->graphId = graphId; + nodes.insert(node1); + nodes.insert(node2); + } + NodeSet(std::string graphId, const std::vector &nodes) { + this->graphId = graphId; + for (int node : nodes) + this->nodes.insert(node); + } + void extend(const NodeSet &other) { + assert(this->graphId == other.graphId); + for (int node : other.nodes) + nodes.insert(node); + } + int extendCandidate(const NodeSet &other) const { + if (graphId != other.graphId) + return 0; + int newNodes = 0; + bool intersect = false; + for (int node : other.nodes) + if (nodes.count(node) > 0) + intersect = true; + else + newNodes++; + return intersect ? newNodes : 0; + } + bool operator <(const NodeSet &other) const { + if (graphId != other.graphId) + return graphId < other.graphId; + return nodes < other.nodes; + } + }; + + void solveForMining(std::vector &results, const GraphData &needle) + { + bool backupVerbose = verbose; + verbose = false; + + for (auto &it : graphData) + { + GraphData &haystack = it.second; + assert(haystack.graph.allExtern); + + std::vector> enumerationMatrix; + std::map> initialMappings; + generateEnumerationMatrix(enumerationMatrix, needle, haystack, initialMappings); + + haystack.usedNodes.resize(haystack.graph.nodes.size()); + ullmannRecursion(results, enumerationMatrix, 0, needle, haystack, true, -1); + } + + verbose = backupVerbose; + } + + int testForMining(std::vector &results, std::set &usedSets, std::vector> &nextPool, NodeSet &testSet, + const std::string &graphId, const Graph &graph, int minNodes, int minMatches, int limitMatchesPerGraph) + { + GraphData needle; + std::vector needle_nodes; + for (int nodeIdx : testSet.nodes) + needle_nodes.push_back(graph.nodes[nodeIdx].nodeId); + needle.graph = Graph(graph, needle_nodes); + diCache.add(needle.graph, needle.adjMatrix, graphId, userSolver); + + std::vector ullmannResults; + solveForMining(ullmannResults, needle); + + int matches = 0; + std::map matchesPerGraph; + std::set thisNodeSetSet; + + for (auto &it : ullmannResults) + { + std::vector resultNodes; + for (auto &i2 : it.mappings) + resultNodes.push_back(graphData[it.haystackGraphId].graph.nodeMap[i2.second.haystackNodeId]); + NodeSet resultSet(it.haystackGraphId, resultNodes); + + if (usedSets.count(resultSet) > 0) { + assert(thisNodeSetSet.count(resultSet) > 0); + continue; + } + usedSets.insert(resultSet); + thisNodeSetSet.insert(resultSet); + + matchesPerGraph[it.haystackGraphId]++; + if (limitMatchesPerGraph < 0 || matchesPerGraph[it.haystackGraphId] < limitMatchesPerGraph) + matches++; + } + + if (matches < minMatches) + return 0; + + if (minNodes <= int(testSet.nodes.size())) + { + Solver::MineResult result; + result.graphId = graphId; + result.totalMatchesAfterLimits = matches; + result.matchesPerGraph = matchesPerGraph; + for (int nodeIdx : testSet.nodes) { + Solver::MineResultNode resultNode; + resultNode.nodeId = graph.nodes[nodeIdx].nodeId; + resultNode.userData = graph.nodes[nodeIdx].userData; + result.nodes.push_back(resultNode); + } + results.push_back(result); + } + + nextPool.push_back(thisNodeSetSet); + return matches; + } + + void findNodePairs(std::vector &results, std::vector> &nodePairs, int minNodes, int minMatches, int limitMatchesPerGraph) + { + std::set usedPairs; + + if (verbose) + printf("\nFind frequent node pairs:\n"); + + for (auto &graph_it : graphData) + for (int node1 = 0; node1 < int(graph_it.second.graph.nodes.size()); node1++) + for (auto &adj_it : graph_it.second.adjMatrix.at(node1)) + { + const std::string &graphId = graph_it.first; + const auto &graph = graph_it.second.graph; + int node2 = adj_it.first; + NodeSet pair(graphId, node1, node2); + + if (usedPairs.count(pair) > 0) + continue; + + int matches = testForMining(results, usedPairs, nodePairs, pair, graphId, graph, minNodes, minMatches, limitMatchesPerGraph); + + if (verbose && matches > 0) + printf("Pair %s[%s,%s] -> %d\n", graphId.c_str(), graph.nodes[node1].nodeId.c_str(), + graph.nodes[node2].nodeId.c_str(), matches); + } + } + + void findNextPool(std::vector &results, std::vector> &pool, + int oldSetSize, int increment, int minNodes, int minMatches, int limitMatchesPerGraph) + { + std::vector> nextPool; + std::map> poolPerGraph; + + for (auto &i1 : pool) + for (auto &i2 : i1) + poolPerGraph[i2.graphId].push_back(&i2); + + if (verbose) + printf("\nFind frequent subcircuits of size %d using increment %d:\n", oldSetSize+increment, increment); + + std::set usedSets; + for (auto &it : poolPerGraph) + for (int idx1 = 0; idx1 < int(it.second.size()); idx1++) + for (int idx2 = idx1; idx2 < int(it.second.size()); idx2++) + { + if (it.second[idx1]->extendCandidate(*it.second[idx2]) != increment) + continue; + + NodeSet mergedSet = *it.second[idx1]; + mergedSet.extend(*it.second[idx2]); + + if (usedSets.count(mergedSet) > 0) + continue; + + const std::string &graphId = it.first; + const auto &graph = graphData[it.first].graph; + + int matches = testForMining(results, usedSets, nextPool, mergedSet, graphId, graph, minNodes, minMatches, limitMatchesPerGraph); + + if (verbose) { + printf("Set %s[", graphId.c_str()); + bool first = true; + for (int nodeIdx : mergedSet.nodes) { + printf("%s%s", first ? "" : ",", graph.nodes[nodeIdx].nodeId.c_str()); + first = false; + } + printf("] -> %d\n", matches); + } + } + + pool.swap(nextPool); + } + // interface to the public Solver class protected: @@ -1151,6 +1378,25 @@ protected: ullmannRecursion(results, enumerationMatrix, 0, needle, haystack, allowOverlap, maxSolutions > 0 ? results.size() + maxSolutions : -1); } + void mine(std::vector &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph) + { + int nodeSetSize = 2; + std::vector> pool; + findNodePairs(results, pool, minNodes, minMatches, limitMatchesPerGraph); + + while (nodeSetSize < maxNodes) + { + int increment = nodeSetSize - 1; + if (nodeSetSize + increment >= minNodes) + increment = minNodes - nodeSetSize; + if (nodeSetSize >= minNodes) + increment = 1; + + findNextPool(results, pool, nodeSetSize, increment, minNodes, minMatches, limitMatchesPerGraph); + nodeSetSize += increment; + } + } + void clearOverlapHistory() { for (auto &it : graphData) @@ -1252,6 +1498,11 @@ void SubCircuit::Solver::solve(std::vector &results, std::string needleG worker->solve(results, needleGraphId, haystackGraphId, initialMappings, allowOverlap, maxSolutions); } +void SubCircuit::Solver::mine(std::vector &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph) +{ + worker->mine(results, minNodes, maxNodes, minMatches, limitMatchesPerGraph); +} + void SubCircuit::Solver::clearOverlapHistory() { worker->clearOverlapHistory(); diff --git a/libs/subcircuit/subcircuit.h b/libs/subcircuit/subcircuit.h index da536ba0..b9399a99 100644 --- a/libs/subcircuit/subcircuit.h +++ b/libs/subcircuit/subcircuit.h @@ -73,6 +73,7 @@ namespace SubCircuit public: Graph() : allExtern(false) { }; + Graph(const Graph &other, const std::vector &otherNodes); void createNode(std::string nodeId, std::string typeId, void *userData = NULL); void createPort(std::string nodeId, std::string portId, int width = 1, int minWidth = -1); @@ -100,6 +101,17 @@ namespace SubCircuit std::map mappings; }; + struct MineResultNode { + std::string nodeId; + void *userData; + }; + struct MineResult { + std::string graphId; + int totalMatchesAfterLimits; + std::map matchesPerGraph; + std::vector nodes; + }; + private: SolverWorker *worker; @@ -131,6 +143,9 @@ namespace SubCircuit void solve(std::vector &results, std::string needleGraphId, std::string haystackGraphId, bool allowOverlap = true, int maxSolutions = -1); void solve(std::vector &results, std::string needleGraphId, std::string haystackGraphId, const std::map> &initialMapping, bool allowOverlap = true, int maxSolutions = -1); + + void mine(std::vector &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph = -1); + void clearOverlapHistory(); void clearConfig(); }; diff --git a/libs/subcircuit/test_mine.txt b/libs/subcircuit/test_mine.txt new file mode 100644 index 00000000..e3b9170b --- /dev/null +++ b/libs/subcircuit/test_mine.txt @@ -0,0 +1,35 @@ + +# verbose + +graph macc22 + node mul_1 mul A 32 B 32 Y 32 + node mul_2 mul A 32 B 32 Y 32 + node add_1 add A 32 B 32 Y 32 + connect mul_1 Y add_1 A + connect mul_2 Y add_1 B + allextern +endgraph + +graph macc4x2 + node mul_1 mul A 32 B 32 Y 32 + node mul_2 mul A 32 B 32 Y 32 + node mul_3 mul A 32 B 32 Y 32 + node mul_4 mul A 32 B 32 Y 32 + node add_1 add A 32 B 32 Y 32 + node add_2 add A 32 B 32 Y 32 + node add_3 add A 32 B 32 Y 32 + connect mul_1 Y add_1 A + connect mul_2 Y add_1 B + connect mul_3 Y add_2 A + connect mul_4 Y add_2 B + connect add_1 Y add_3 A + connect add_2 Y add_3 B + allextern +endgraph + +swapgroup mul A B +swapgroup add A B + +mine 2 10 2 +expect 5 + -- cgit v1.2.3