summaryrefslogtreecommitdiff
path: root/libs/subcircuit
diff options
context:
space:
mode:
authorClifford Wolf <clifford@clifford.at>2013-03-02 13:53:59 +0100
committerClifford Wolf <clifford@clifford.at>2013-03-02 13:53:59 +0100
commit84cdfa55fc81c233a308c82c5fa6d482b8661ca0 (patch)
tree20c4635bdda03b64503a8ea0808f5e18b48ec7b2 /libs/subcircuit
parenta338d1a082726d84210912318a9ac49977dc380c (diff)
Added frequent subcircuit miner to subcircuit library
Diffstat (limited to 'libs/subcircuit')
-rw-r--r--libs/subcircuit/.gitignore2
-rw-r--r--libs/subcircuit/Makefile1
-rw-r--r--libs/subcircuit/README43
-rw-r--r--libs/subcircuit/scshell.cc34
-rw-r--r--libs/subcircuit/subcircuit.cc251
-rw-r--r--libs/subcircuit/subcircuit.h15
-rw-r--r--libs/subcircuit/test_mine.txt35
7 files changed, 368 insertions, 13 deletions
diff --git a/libs/subcircuit/.gitignore b/libs/subcircuit/.gitignore
new file mode 100644
index 00000000..9f1eb4e8
--- /dev/null
+++ b/libs/subcircuit/.gitignore
@@ -0,0 +1,2 @@
+demo
+scshell
diff --git a/libs/subcircuit/Makefile b/libs/subcircuit/Makefile
index af745b4b..f81085b5 100644
--- a/libs/subcircuit/Makefile
+++ b/libs/subcircuit/Makefile
@@ -39,6 +39,7 @@ scshell: scshell.o subcircuit.o
test: scshell
./scshell < test_macc22.txt
+ ./scshell < test_mine.txt
perl test_perm.pl | ./scshell
splrun test_shorts.spl | ./scshell
splrun test_large.spl | ./scshell
diff --git a/libs/subcircuit/README b/libs/subcircuit/README
index 5c8a8a9e..d1bdb1f6 100644
--- a/libs/subcircuit/README
+++ b/libs/subcircuit/README
@@ -14,20 +14,12 @@ Introduction
This is a library that implements a modified Ullmann Subgraph Isomorphism
Algorithm with additional features aimed at working with coarse grain logic
-networks.
+networks. It also contains a simple frequent subcircuit mining algorithm.
A simple command line tool that exposes the features of the library is also
included.
-Under-Construction Warning
---------------------------
-
-This work is under constructions. It is likely that they are bugs in the
-library that need fixing. Feel free to contact me at clifford@clifford.at
-if you have found a bug.
-
-
C++11 Warning
-------------
@@ -97,6 +89,9 @@ Algorithm are provided by the library.
* Support for finding only non-overlapping matches.
+ * A simple miner for frequent subcircuits that operates on the same circuit
+ description format.
+
* The public API of the library is using std::string identifiers for
nodes, node types and ports. Internally the costly part of the
algorithm is only using integer values, thus speeding up the
@@ -328,6 +323,32 @@ bool userCheckSolution(result):
ignored. The default implementation always returns true.
+Mining for frequent SubCircuits
+-------------------------------
+
+The solver also contains a miner for frequent subcircuits. The following code
+fragment will find all frequent subcircuits with at least minNodes nodes and
+at most maxNodes nodes that occur at least minMatches times:
+
+ std::vector<SubCircuit::Solver::MineResult> results;
+ mySolver.mine(results, minNodes, maxNodes, minMatches);
+
+The miner works by finding frequent pairs of nodes and then combining them
+to larger subcircuits. Because of this incremental strategy the miner only
+works as expected on graphs with markAllExtern() set.
+
+The mine() method has an optional fifth parameter that limits the number
+of matches counted in one graph. This can be useful when mining for circuits
+that are found in at least a number of graphs. E.g. the following call
+would find all subcircuits with 5 nodes that are found in at least 7 of
+the registered graphs:
+
+ mySolver.mine(results, 5, 5, 7, 1);
+
+Note that this miner is not very efficient and therefore its use is not
+recommended for large circuits.
+
+
Debugging
---------
@@ -420,6 +441,10 @@ The following commands can be used in scshell outside a graph ... endgraph block
Call Solver::solve(). The <allow_overlap> must be "1" or "true"
for true and "0" or "false" for false.
+ mine <min_nodes> <max_nodes> <min_matches> [<limit_matches_per_graph>]
+
+ Call Solver::mine().
+
expect <number>
Print all results so far since the last call to expect. Expect
diff --git a/libs/subcircuit/scshell.cc b/libs/subcircuit/scshell.cc
index 70afcfd4..c4b37a4d 100644
--- a/libs/subcircuit/scshell.cc
+++ b/libs/subcircuit/scshell.cc
@@ -26,6 +26,7 @@ int main()
SubCircuit::Solver solver;
std::map<std::string, std::set<std::string>> initialMappings;
std::vector<SubCircuit::Solver::Result> results;
+ std::vector<SubCircuit::Solver::MineResult> mineResults;
std::vector<std::string> cmdBuffer;
bool lastCommandExpect = false;
@@ -162,6 +163,12 @@ int main()
continue;
}
+ if (cmdBuffer[0] == "mine" && 4 <= cmdBuffer.size() && cmdBuffer.size() <= 5) {
+ solver.mine(mineResults, atoi(cmdBuffer[1].c_str()), atoi(cmdBuffer[2].c_str()),
+ atoi(cmdBuffer[3].c_str()), cmdBuffer.size() == 5 ? atoi(cmdBuffer[4].c_str()) : -1);
+ continue;
+ }
+
if (cmdBuffer[0] == "clearoverlap" && cmdBuffer.size() == 1) {
solver.clearOverlapHistory();
continue;
@@ -179,7 +186,7 @@ int main()
if (cmdBuffer[0] == "expect" && cmdBuffer.size() == 2) {
int expected = atoi(cmdBuffer[1].c_str());
- printf("\n-- Expected %d, Got %d --\n", expected, int(results.size()));
+ printf("\n-- Expected %d, Got %d --\n", expected, int(results.size()) + int(mineResults.size()));
for (int i = 0; i < int(results.size()); i++) {
printf("\nMatch #%d: (%s in %s)\n", i, results[i].needleGraphId.c_str(), results[i].haystackGraphId.c_str());
for (const auto &it : results[i].mappings) {
@@ -189,9 +196,18 @@ int main()
printf("\n");
}
}
+ for (auto &result : mineResults) {
+ printf("\nFrequent SubCircuit with %d nodes and %d matches:\n", int(result.nodes.size()), result.totalMatchesAfterLimits);
+ printf(" primary match in %s:", result.graphId.c_str());
+ for (auto &node : result.nodes)
+ printf(" %s", node.nodeId.c_str());
+ printf("\n");
+ for (auto &it : result.matchesPerGraph)
+ printf(" matches in %s: %d\n", it.first.c_str(), it.second);
+ }
printf("\n");
- if (expected != int(results.size())) {
- printf("^^ expected %d, Got %d ^^\n\n", expected, int(results.size()));
+ if (expected != int(results.size()) + int(mineResults.size())) {
+ printf("^^ expected %d, Got %d ^^\n\n", expected, int(results.size()) + int(mineResults.size()));
printf(" +----------------+\n");
printf(" | \\|/ ____ \\|/ |\n");
printf(" | \"@'/ ,. \\`@\" |\n");
@@ -202,6 +218,7 @@ int main()
return 1;
}
results.clear();
+ mineResults.clear();
lastCommandExpect = true;
continue;
}
@@ -215,7 +232,7 @@ int main()
delete graph;
if (!lastCommandExpect) {
- printf("\n-- Got %d --\n", int(results.size()));
+ printf("\n-- Got %d --\n", int(results.size()) + int(mineResults.size()));
for (int i = 0; i < int(results.size()); i++) {
printf("\nMatch #%d: (%s in %s)\n", i, results[i].needleGraphId.c_str(), results[i].haystackGraphId.c_str());
for (const auto &it : results[i].mappings) {
@@ -225,6 +242,15 @@ int main()
printf("\n");
}
}
+ for (auto &result : mineResults) {
+ printf("\nFrequent SubCircuit with %d nodes and %d matches:\n", int(result.nodes.size()), result.totalMatchesAfterLimits);
+ printf(" primary match in %s:", result.graphId.c_str());
+ for (auto &node : result.nodes)
+ printf(" %s", node.nodeId.c_str());
+ printf("\n");
+ for (auto &it : result.matchesPerGraph)
+ printf(" matches in %s: %d\n", it.first.c_str(), it.second);
+ }
} else
printf("PASSED.\n");
diff --git a/libs/subcircuit/subcircuit.cc b/libs/subcircuit/subcircuit.cc
index b49fa97c..a55b97ab 100644
--- a/libs/subcircuit/subcircuit.cc
+++ b/libs/subcircuit/subcircuit.cc
@@ -46,6 +46,42 @@ static std::string stringf(const char *fmt, ...)
return string;
}
+/**
+ * Construct a sub-graph of `other` restricted to the nodes named in
+ * `otherNodes`.
+ *
+ * The i-th id in `otherNodes` becomes node index i of the new graph. Every
+ * edge that is referenced by a port bit of a selected node is copied
+ * (including its constValue and isExtern settings), but only the port bits
+ * that belong to selected nodes are kept on the copied edge. The allExtern
+ * flag is inherited from `other`.
+ *
+ * Every id in `otherNodes` must name a node of `other` (checked by assert).
+ */
+SubCircuit::Graph::Graph(const Graph &other, const std::vector<std::string> &otherNodes)
+{
+	allExtern = other.allExtern;
+
+	// node index in `other` -> node index in this graph
+	std::map<int, int> other2this;
+	for (int i = 0; i < int(otherNodes.size()); i++) {
+		assert(other.nodeMap.count(otherNodes[i]) > 0);
+		other2this[other.nodeMap.at(otherNodes[i])] = i;
+		nodeMap[otherNodes[i]] = i;
+	}
+
+	// edge index in `other` -> dense edge index in this graph
+	std::map<int, int> edges2this;
+	for (auto &i1 : other2this)
+		for (auto &i2 : other.nodes[i1.first].ports)
+			for (auto &i3 : i2.bits)
+				if (edges2this.count(i3.edgeIdx) == 0) {
+					// Read size() into a local *before* operator[] inserts the
+					// new key. Writing "m[k] = m.size()" leaves the order of the
+					// two operands unspecified before C++17, so the stored index
+					// could be off by one and later index past the end of `edges`.
+					int nextEdgeIdx = int(edges2this.size());
+					edges2this[i3.edgeIdx] = nextEdgeIdx;
+				}
+
+	// copy the edges, dropping port bits of nodes that were not selected
+	edges.resize(edges2this.size());
+	for (auto &it : edges2this) {
+		for (auto &bit : other.edges[it.first].portBits)
+			if (other2this.count(bit.nodeIdx) > 0)
+				edges[it.second].portBits.insert(BitRef(other2this.at(bit.nodeIdx), bit.portIdx, bit.bitIdx));
+		edges[it.second].constValue = other.edges[it.first].constValue;
+		edges[it.second].isExtern = other.edges[it.first].isExtern;
+	}
+
+	// copy the nodes and re-target their port bits to the new edge indices
+	nodes.resize(other2this.size());
+	for (auto &it : other2this) {
+		nodes[it.second] = other.nodes[it.first];
+		for (auto &i2 : nodes[it.second].ports)
+			for (auto &i3 : i2.bits)
+				i3.edgeIdx = edges2this.at(i3.edgeIdx);
+	}
+}
+
bool SubCircuit::Graph::BitRef::operator < (const BitRef &other) const
{
if (nodeIdx != other.nodeIdx)
@@ -1072,6 +1108,197 @@ class SubCircuit::SolverWorker
}
}
+ // additional data structures and functions for mining
+
+	// A set of node indices that all live in the graph named by graphId.
+	// Used by the miner as the unit that gets tested and merged.
+	struct NodeSet {
+		std::string graphId;
+		std::set<int> nodes;
+
+		// Build a set holding the two given node indices.
+		NodeSet(std::string graphId, int node1, int node2) : graphId(graphId) {
+			nodes.insert(node1);
+			nodes.insert(node2);
+		}
+
+		// Build a set from an arbitrary list of node indices.
+		NodeSet(std::string graphId, const std::vector<int> &nodes) :
+				graphId(graphId), nodes(nodes.begin(), nodes.end()) {
+		}
+
+		// Add all nodes of `other` to this set. Both sets must refer to
+		// the same graph.
+		void extend(const NodeSet &other) {
+			assert(this->graphId == other.graphId);
+			nodes.insert(other.nodes.begin(), other.nodes.end());
+		}
+
+		// Number of nodes that merging `other` into this set would add.
+		// Returns 0 when the sets belong to different graphs or do not
+		// share at least one node.
+		int extendCandidate(const NodeSet &other) const {
+			if (graphId != other.graphId)
+				return 0;
+			int sharedNodes = 0;
+			int freshNodes = 0;
+			for (int node : other.nodes) {
+				if (nodes.find(node) != nodes.end())
+					sharedNodes++;
+				else
+					freshNodes++;
+			}
+			return sharedNodes > 0 ? freshNodes : 0;
+		}
+
+		// Order by graph id first, then by node set.
+		bool operator <(const NodeSet &other) const {
+			if (graphId == other.graphId)
+				return nodes < other.nodes;
+			return graphId < other.graphId;
+		}
+	};
+
+	// Run the matcher for `needle` against every registered graph and append
+	// all matches to `results`. Verbose output is suppressed for the duration
+	// of the call and restored afterwards. Every haystack graph must have
+	// allExtern set (checked by assert).
+	void solveForMining(std::vector<Solver::Result> &results, const GraphData &needle)
+	{
+		const bool savedVerbose = verbose;
+		verbose = false;
+
+		for (auto &entry : graphData) {
+			GraphData &haystack = entry.second;
+			assert(haystack.graph.allExtern);
+
+			// no user supplied initial mappings are used while mining
+			std::vector<std::set<int>> enumerationMatrix;
+			std::map<std::string, std::set<std::string>> initialMappings;
+			generateEnumerationMatrix(enumerationMatrix, needle, haystack, initialMappings);
+
+			haystack.usedNodes.resize(haystack.graph.nodes.size());
+
+			// same argument positions as in solve(): overlapping matches
+			// are allowed (true) and no solution limit is applied (-1)
+			ullmannRecursion(results, enumerationMatrix, 0, needle, haystack, true, -1);
+		}
+
+		verbose = savedVerbose;
+	}
+
+	/**
+	 * Test one candidate node set of `graph` for being a frequent subcircuit.
+	 *
+	 * The nodes in `testSet` are turned into a needle graph which is then
+	 * searched for in all registered graphs. Every match whose node set is not
+	 * already in `usedSets` is recorded there and counted.
+	 *
+	 * results              receives a MineResult when the candidate is frequent
+	 *                      and has at least `minNodes` nodes
+	 * usedSets             node sets already claimed by a match; updated
+	 * nextPool             receives the set of newly found node sets when the
+	 *                      candidate is frequent
+	 * testSet              candidate node indices into `graph`
+	 * graphId, graph       the graph the candidate was taken from
+	 * minMatches           minimum number of counted matches to be "frequent"
+	 * limitMatchesPerGraph maximum number of matches counted per haystack
+	 *                      graph; a negative value disables the limit
+	 *
+	 * Returns the number of counted matches, or 0 when it is below
+	 * `minMatches`.
+	 */
+	int testForMining(std::vector<Solver::MineResult> &results, std::set<NodeSet> &usedSets, std::vector<std::set<NodeSet>> &nextPool, NodeSet &testSet,
+			const std::string &graphId, const Graph &graph, int minNodes, int minMatches, int limitMatchesPerGraph)
+	{
+		// build the needle graph from the candidate nodes
+		GraphData needle;
+		std::vector<std::string> needle_nodes;
+		for (int nodeIdx : testSet.nodes)
+			needle_nodes.push_back(graph.nodes[nodeIdx].nodeId);
+		needle.graph = Graph(graph, needle_nodes);
+		diCache.add(needle.graph, needle.adjMatrix, graphId, userSolver);
+
+		std::vector<Solver::Result> ullmannResults;
+		solveForMining(ullmannResults, needle);
+
+		int matches = 0;
+		std::map<std::string, int> matchesPerGraph;
+		std::set<NodeSet> thisNodeSetSet;
+
+		for (auto &it : ullmannResults)
+		{
+			// translate the matched haystack node ids back to node indices
+			std::vector<int> resultNodes;
+			for (auto &i2 : it.mappings)
+				resultNodes.push_back(graphData[it.haystackGraphId].graph.nodeMap[i2.second.haystackNodeId]);
+			NodeSet resultSet(it.haystackGraphId, resultNodes);
+
+			// skip node sets that were already counted; such a set is
+			// expected to have been seen during this very test
+			if (usedSets.count(resultSet) > 0) {
+				assert(thisNodeSetSet.count(resultSet) > 0);
+				continue;
+			}
+			usedSets.insert(resultSet);
+			thisNodeSetSet.insert(resultSet);
+
+			// matchesPerGraph holds the unlimited per-graph count; only the
+			// first limitMatchesPerGraph matches of a graph add to the total.
+			// The comparison must be "<=" because the counter has already
+			// been incremented: with "<" a limit of 1 would never count any
+			// match at all.
+			int &matchesInGraph = matchesPerGraph[it.haystackGraphId];
+			matchesInGraph++;
+			if (limitMatchesPerGraph < 0 || matchesInGraph <= limitMatchesPerGraph)
+				matches++;
+		}
+
+		if (matches < minMatches)
+			return 0;
+
+		// report the candidate once it has reached the requested size
+		if (minNodes <= int(testSet.nodes.size()))
+		{
+			Solver::MineResult result;
+			result.graphId = graphId;
+			result.totalMatchesAfterLimits = matches;
+			result.matchesPerGraph = matchesPerGraph;
+			for (int nodeIdx : testSet.nodes) {
+				Solver::MineResultNode resultNode;
+				resultNode.nodeId = graph.nodes[nodeIdx].nodeId;
+				resultNode.userData = graph.nodes[nodeIdx].userData;
+				result.nodes.push_back(resultNode);
+			}
+			results.push_back(result);
+		}
+
+		// the matched node sets seed the next, larger round
+		nextPool.push_back(thisNodeSetSet);
+		return matches;
+	}
+
+	// Seed the miner: test every pair of adjacent nodes in every registered
+	// graph with testForMining(). Frequent pairs end up in `nodePairs` (one
+	// set of node sets per tested pair) and, if they are large enough, in
+	// `results`.
+	void findNodePairs(std::vector<Solver::MineResult> &results, std::vector<std::set<NodeSet>> &nodePairs, int minNodes, int minMatches, int limitMatchesPerGraph)
+	{
+		std::set<NodeSet> usedPairs;
+
+		if (verbose)
+			printf("\nFind frequent node pairs:\n");
+
+		for (auto &entry : graphData)
+		{
+			const std::string &graphId = entry.first;
+			const auto &graph = entry.second.graph;
+
+			for (int node1 = 0; node1 < int(graph.nodes.size()); node1++)
+			{
+				for (auto &neighbour : entry.second.adjMatrix.at(node1))
+				{
+					const int node2 = neighbour.first;
+					NodeSet candidate(graphId, node1, node2);
+
+					// already covered by a match of an earlier pair
+					if (usedPairs.find(candidate) != usedPairs.end())
+						continue;
+
+					const int matches = testForMining(results, usedPairs, nodePairs, candidate, graphId, graph, minNodes, minMatches, limitMatchesPerGraph);
+
+					if (!verbose || matches <= 0)
+						continue;
+
+					printf("Pair %s[%s,%s] -> %d\n", graphId.c_str(), graph.nodes[node1].nodeId.c_str(),
+							graph.nodes[node2].nodeId.c_str(), matches);
+				}
+			}
+		}
+	}
+
+	// Grow the current pool by `increment` nodes: every two node sets of the
+	// same graph whose union adds exactly `increment` nodes are merged and the
+	// merged set is tested with testForMining(). On return `pool` holds the
+	// node sets found for the frequent merged candidates.
+	void findNextPool(std::vector<Solver::MineResult> &results, std::vector<std::set<NodeSet>> &pool,
+			int oldSetSize, int increment, int minNodes, int minMatches, int limitMatchesPerGraph)
+	{
+		std::vector<std::set<NodeSet>> nextPool;
+		std::map<std::string, std::vector<const NodeSet*>> poolPerGraph;
+
+		// group the pooled node sets by the graph they belong to
+		for (auto &group : pool)
+			for (auto &nodeSet : group)
+				poolPerGraph[nodeSet.graphId].push_back(&nodeSet);
+
+		if (verbose)
+			printf("\nFind frequent subcircuits of size %d using increment %d:\n", oldSetSize+increment, increment);
+
+		std::set<NodeSet> usedSets;
+		for (auto &perGraph : poolPerGraph)
+		{
+			const std::string &graphId = perGraph.first;
+			const std::vector<const NodeSet*> &candidates = perGraph.second;
+			const int numCandidates = int(candidates.size());
+
+			for (int idx1 = 0; idx1 < numCandidates; idx1++)
+			for (int idx2 = idx1; idx2 < numCandidates; idx2++)
+			{
+				const NodeSet &first = *candidates[idx1];
+				const NodeSet &second = *candidates[idx2];
+
+				// only overlapping sets that add exactly `increment` nodes qualify
+				if (first.extendCandidate(second) != increment)
+					continue;
+
+				NodeSet mergedSet(first);
+				mergedSet.extend(second);
+
+				if (usedSets.find(mergedSet) != usedSets.end())
+					continue;
+
+				const auto &graph = graphData[graphId].graph;
+
+				const int matches = testForMining(results, usedSets, nextPool, mergedSet, graphId, graph, minNodes, minMatches, limitMatchesPerGraph);
+
+				if (!verbose)
+					continue;
+
+				printf("Set %s[", graphId.c_str());
+				const char *separator = "";
+				for (int nodeIdx : mergedSet.nodes) {
+					printf("%s%s", separator, graph.nodes[nodeIdx].nodeId.c_str());
+					separator = ",";
+				}
+				printf("] -> %d\n", matches);
+			}
+		}
+
+		pool.swap(nextPool);
+	}
+
// interface to the public Solver class
protected:
@@ -1151,6 +1378,25 @@ protected:
ullmannRecursion(results, enumerationMatrix, 0, needle, haystack, allowOverlap, maxSolutions > 0 ? results.size() + maxSolutions : -1);
}
+	// Mine for frequent subcircuits: start with frequent node pairs and keep
+	// growing the pool of node sets until sets of maxNodes nodes are reached.
+	// Below minNodes the set size grows as fast as merging two overlapping
+	// sets allows (size - 1 new nodes) without stepping past minNodes; from
+	// minNodes on it grows one node at a time.
+	void mine(std::vector<Solver::MineResult> &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph)
+	{
+		std::vector<std::set<NodeSet>> pool;
+		findNodePairs(results, pool, minNodes, minMatches, limitMatchesPerGraph);
+
+		for (int nodeSetSize = 2; nodeSetSize < maxNodes; )
+		{
+			int increment;
+			if (nodeSetSize >= minNodes)
+				increment = 1;
+			else if (nodeSetSize + (nodeSetSize - 1) >= minNodes)
+				increment = minNodes - nodeSetSize;
+			else
+				increment = nodeSetSize - 1;
+
+			findNextPool(results, pool, nodeSetSize, increment, minNodes, minMatches, limitMatchesPerGraph);
+			nodeSetSize += increment;
+		}
+	}
+
void clearOverlapHistory()
{
for (auto &it : graphData)
@@ -1252,6 +1498,11 @@ void SubCircuit::Solver::solve(std::vector<Result> &results, std::string needleG
worker->solve(results, needleGraphId, haystackGraphId, initialMappings, allowOverlap, maxSolutions);
}
+// Public entry point of the frequent subcircuit miner. Appends the mined
+// subcircuits to `results`; all work is forwarded to the SolverWorker.
+// minNodes/maxNodes bound the size of the reported subcircuits, minMatches is
+// the number of matches needed to be reported, and limitMatchesPerGraph caps
+// the matches counted per graph (a negative value means no cap).
+void SubCircuit::Solver::mine(std::vector<MineResult> &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph)
+{
+ worker->mine(results, minNodes, maxNodes, minMatches, limitMatchesPerGraph);
+}
+
void SubCircuit::Solver::clearOverlapHistory()
{
worker->clearOverlapHistory();
diff --git a/libs/subcircuit/subcircuit.h b/libs/subcircuit/subcircuit.h
index da536ba0..b9399a99 100644
--- a/libs/subcircuit/subcircuit.h
+++ b/libs/subcircuit/subcircuit.h
@@ -73,6 +73,7 @@ namespace SubCircuit
public:
Graph() : allExtern(false) { };
+ Graph(const Graph &other, const std::vector<std::string> &otherNodes);
void createNode(std::string nodeId, std::string typeId, void *userData = NULL);
void createPort(std::string nodeId, std::string portId, int width = 1, int minWidth = -1);
@@ -100,6 +101,17 @@ namespace SubCircuit
std::map<std::string, ResultNodeMapping> mappings;
};
+ // One node of a mined subcircuit.
+ struct MineResultNode {
+ std::string nodeId; // id of the node within the graph MineResult::graphId
+ void *userData; // the userData pointer stored on that graph node
+ };
+ // One frequent subcircuit reported by Solver::mine().
+ struct MineResult {
+ std::string graphId; // graph the listed nodes were taken from
+ int totalMatchesAfterLimits; // matches counted with the per-graph limit applied
+ std::map<std::string, int> matchesPerGraph; // matches found per graph id, counted without the limit
+ std::vector<MineResultNode> nodes; // the nodes forming the subcircuit in graphId
+ };
+
private:
SolverWorker *worker;
@@ -131,6 +143,9 @@ namespace SubCircuit
void solve(std::vector<Result> &results, std::string needleGraphId, std::string haystackGraphId, bool allowOverlap = true, int maxSolutions = -1);
void solve(std::vector<Result> &results, std::string needleGraphId, std::string haystackGraphId,
const std::map<std::string, std::set<std::string>> &initialMapping, bool allowOverlap = true, int maxSolutions = -1);
+
+ void mine(std::vector<MineResult> &results, int minNodes, int maxNodes, int minMatches, int limitMatchesPerGraph = -1);
+
void clearOverlapHistory();
void clearConfig();
};
diff --git a/libs/subcircuit/test_mine.txt b/libs/subcircuit/test_mine.txt
new file mode 100644
index 00000000..e3b9170b
--- /dev/null
+++ b/libs/subcircuit/test_mine.txt
@@ -0,0 +1,35 @@
+
+# verbose
+
+graph macc22
+ node mul_1 mul A 32 B 32 Y 32
+ node mul_2 mul A 32 B 32 Y 32
+ node add_1 add A 32 B 32 Y 32
+ connect mul_1 Y add_1 A
+ connect mul_2 Y add_1 B
+ allextern
+endgraph
+
+graph macc4x2
+ node mul_1 mul A 32 B 32 Y 32
+ node mul_2 mul A 32 B 32 Y 32
+ node mul_3 mul A 32 B 32 Y 32
+ node mul_4 mul A 32 B 32 Y 32
+ node add_1 add A 32 B 32 Y 32
+ node add_2 add A 32 B 32 Y 32
+ node add_3 add A 32 B 32 Y 32
+ connect mul_1 Y add_1 A
+ connect mul_2 Y add_1 B
+ connect mul_3 Y add_2 A
+ connect mul_4 Y add_2 B
+ connect add_1 Y add_3 A
+ connect add_2 Y add_3 B
+ allextern
+endgraph
+
+swapgroup mul A B
+swapgroup add A B
+
+mine 2 10 2
+expect 5
+