summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2007-02-02 09:18:22 -0500
committerDavid Woodhouse <dwmw2@hera.kernel.org>2007-02-02 09:18:22 -0500
commit215ad20a9b077d44ea1cf58b1bac64a7f42d058a (patch)
tree6f1ed85ebf6bd133cb8c7089100963f45cdf341c
parent5622872d31e6f19bab5e5896ed6272872e0a980c (diff)
Add backing store, memory management
-rw-r--r--Makefile11
-rw-r--r--ctree.c432
-rw-r--r--ctree.h62
-rw-r--r--disk-io.c174
-rw-r--r--disk-io.h21
-rw-r--r--kerncompat.h1
-rw-r--r--radix-tree.c836
-rw-r--r--radix-tree.h73
8 files changed, 1447 insertions, 163 deletions
diff --git a/Makefile b/Makefile
index 9f84c08b..63360212 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,12 @@
-ctree: ctree.o
- gcc -g -O2 -Wall -o ctree ctree.c
+CFLAGS= -g -Wall
+
+.c.o:
+ $(CC) $(CFLAGS) -c $<
+
+ctree: ctree.o disk-io.h ctree.h disk-io.o radix-tree.o radix-tree.h
+ gcc $(CFLAGS) -o ctree ctree.o disk-io.o radix-tree.o
clean:
- rm ctree ctree.o
+ rm ctree *.o
diff --git a/ctree.c b/ctree.c
index 4bf5e925..6f0522f2 100644
--- a/ctree.c
+++ b/ctree.c
@@ -1,68 +1,25 @@
#include <stdio.h>
#include <stdlib.h>
#include "kerncompat.h"
-
-#define BLOCKSIZE 4096
-
-struct key {
- u64 objectid;
- u32 flags;
- u64 offset;
-} __attribute__ ((__packed__));
-
-struct header {
- u64 fsid[2]; /* FS specific uuid */
- u64 blocknum;
- u64 parentid;
- u32 csum;
- u32 ham;
- u16 nritems;
- u16 flags;
-} __attribute__ ((__packed__));
-
-#define NODEPTRS_PER_BLOCK ((BLOCKSIZE - sizeof(struct header)) / \
- (sizeof(struct key) + sizeof(u64)))
-
-#define LEVEL_BITS 3
-#define MAX_LEVEL (1 << LEVEL_BITS)
-#define node_level(f) ((f) & (MAX_LEVEL-1))
-#define is_leaf(f) (node_level(f) == 0)
-
-struct ctree_root {
- struct node *node;
-};
-
-struct item {
- struct key key;
- u16 offset;
- u16 size;
-} __attribute__ ((__packed__));
-
-#define LEAF_DATA_SIZE (BLOCKSIZE - sizeof(struct header))
-struct leaf {
- struct header header;
- union {
- struct item items[LEAF_DATA_SIZE/sizeof(struct item)];
- u8 data[BLOCKSIZE-sizeof(struct header)];
- };
-} __attribute__ ((__packed__));
-
-struct node {
- struct header header;
- struct key keys[NODEPTRS_PER_BLOCK];
- u64 blockptrs[NODEPTRS_PER_BLOCK];
-} __attribute__ ((__packed__));
-
-struct ctree_path {
- struct node *nodes[MAX_LEVEL];
- int slots[MAX_LEVEL];
-};
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
static inline void init_path(struct ctree_path *p)
{
memset(p, 0, sizeof(*p));
}
+static void release_path(struct ctree_root *root, struct ctree_path *p)
+{
+ int i;
+ for (i = 0; i < MAX_LEVEL; i++) {
+ if (!p->nodes[i])
+ break;
+ tree_block_release(root, p->nodes[i]);
+ }
+}
+
static inline unsigned int leaf_data_end(struct leaf *leaf)
{
unsigned int nr = leaf->header.nritems;
@@ -135,26 +92,25 @@ int bin_search(struct node *c, struct key *key, int *slot)
return -1;
}
-void *read_block(u64 blocknum)
-{
- return (void *)blocknum;
-}
-
int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
{
- struct node *c = root->node;
+ struct tree_buffer *b = root->node;
+ struct node *c;
+
int slot;
int ret;
int level;
- while (c) {
+ b->count++;
+ while (b) {
+ c = &b->node;
level = node_level(c->header.flags);
- p->nodes[level] = c;
+ p->nodes[level] = b;
ret = bin_search(c, key, &slot);
if (!is_leaf(c->header.flags)) {
if (ret && slot > 0)
slot -= 1;
p->slots[level] = slot;
- c = read_block(c->blockptrs[slot]);
+ b = read_tree_block(root, c->blockptrs[slot]);
continue;
} else {
p->slots[level] = slot;
@@ -164,17 +120,20 @@ int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
return -1;
}
-static void fixup_low_keys(struct ctree_path *path, struct key *key,
- int level)
+static void fixup_low_keys(struct ctree_root *root,
+ struct ctree_path *path, struct key *key,
+ int level)
{
int i;
/* adjust the pointers going up the tree */
for (i = level; i < MAX_LEVEL; i++) {
- struct node *t = path->nodes[i];
+ struct node *t;
int tslot = path->slots[i];
- if (!t)
+ if (!path->nodes[i])
break;
+ t = &path->nodes[i]->node;
memcpy(t->keys + tslot, key, sizeof(*key));
+ write_tree_block(root, path->nodes[i]);
if (tslot != 0)
break;
}
@@ -190,27 +149,34 @@ int __insert_ptr(struct ctree_root *root,
int nritems;
/* need a new root */
if (!path->nodes[level]) {
- c = malloc(sizeof(struct node));
+ struct tree_buffer *t;
+ t = alloc_free_block(root);
+ c = &t->node;
memset(c, 0, sizeof(c));
c->header.nritems = 2;
c->header.flags = node_level(level);
- lower = path->nodes[level-1];
+ c->header.blocknr = t->blocknr;
+ lower = &path->nodes[level-1]->node;
if (is_leaf(lower->header.flags))
lower_key = &((struct leaf *)lower)->items[0].key;
else
lower_key = lower->keys;
memcpy(c->keys, lower_key, sizeof(struct key));
memcpy(c->keys + 1, key, sizeof(struct key));
- c->blockptrs[0] = (u64)lower;
+ c->blockptrs[0] = path->nodes[level-1]->blocknr;
c->blockptrs[1] = blocknr;
- root->node = c;
- path->nodes[level] = c;
+ /* the path has an extra ref to root->node */
+ tree_block_release(root, root->node);
+ root->node = t;
+ t->count++;
+ write_tree_block(root, t);
+ path->nodes[level] = t;
path->slots[level] = 0;
if (c->keys[1].objectid == 0)
BUG();
return 0;
}
- lower = path->nodes[level];
+ lower = &path->nodes[level]->node;
nritems = lower->header.nritems;
if (slot > nritems)
BUG();
@@ -227,6 +193,7 @@ int __insert_ptr(struct ctree_root *root,
lower->header.nritems++;
if (lower->keys[1].objectid == 0)
BUG();
+ write_tree_block(root, path->nodes[level]);
return 0;
}
@@ -238,6 +205,8 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
int push_items = 0;
int left_nritems;
int right_nritems;
+ struct tree_buffer *t;
+ struct tree_buffer *right_buf;
if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
return 1;
@@ -245,13 +214,18 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
if (slot == 0)
return 1;
- left = read_block(path->nodes[level + 1]->blockptrs[slot - 1]);
- right = path->nodes[level];
+ t = read_tree_block(root,
+ path->nodes[level + 1]->node.blockptrs[slot - 1]);
+ left = &t->node;
+ right_buf = path->nodes[level];
+ right = &right_buf->node;
left_nritems = left->header.nritems;
right_nritems = right->header.nritems;
push_items = NODEPTRS_PER_BLOCK - (left_nritems + 1);
- if (push_items <= 0)
+ if (push_items <= 0) {
+ tree_block_release(root, t);
return 1;
+ }
if (right_nritems < push_items)
push_items = right_nritems;
@@ -267,15 +241,20 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
left->header.nritems += push_items;
/* adjust the pointers going up the tree */
- fixup_low_keys(path, right->keys, level + 1);
+ fixup_low_keys(root, path, right->keys, level + 1);
+
+ write_tree_block(root, t);
+ write_tree_block(root, right_buf);
/* then fixup the leaf pointer in the path */
if (path->slots[level] < push_items) {
path->slots[level] += left_nritems;
- path->nodes[level] = (struct node*)left;
+ tree_block_release(root, path->nodes[level]);
+ path->nodes[level] = t;
path->slots[level + 1] -= 1;
} else {
path->slots[level] -= push_items;
+ tree_block_release(root, t);
}
return 0;
}
@@ -283,6 +262,8 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
{
int slot;
+ struct tree_buffer *t;
+ struct tree_buffer *src_buffer;
struct node *dst;
struct node *src;
int push_items = 0;
@@ -295,16 +276,21 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
if (slot == NODEPTRS_PER_BLOCK - 1)
return 1;
- if (slot >= path->nodes[level + 1]->header.nritems -1)
+ if (slot >= path->nodes[level + 1]->node.header.nritems -1)
return 1;
- dst = read_block(path->nodes[level + 1]->blockptrs[slot + 1]);
- src = path->nodes[level];
+ t = read_tree_block(root,
+ path->nodes[level + 1]->node.blockptrs[slot + 1]);
+ dst = &t->node;
+ src_buffer = path->nodes[level];
+ src = &src_buffer->node;
dst_nritems = dst->header.nritems;
src_nritems = src->header.nritems;
push_items = NODEPTRS_PER_BLOCK - (dst_nritems + 1);
- if (push_items <= 0)
+ if (push_items <= 0) {
+ tree_block_release(root, t);
return 1;
+ }
if (src_nritems < push_items)
push_items = src_nritems;
@@ -322,13 +308,21 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
dst->header.nritems += push_items;
/* adjust the pointers going up the tree */
- memcpy(path->nodes[level + 1]->keys + path->slots[level + 1] + 1,
+ memcpy(path->nodes[level + 1]->node.keys + path->slots[level + 1] + 1,
dst->keys, sizeof(struct key));
+
+ write_tree_block(root, path->nodes[level + 1]);
+ write_tree_block(root, t);
+ write_tree_block(root, src_buffer);
+
/* then fixup the leaf pointer in the path */
if (path->slots[level] >= src->header.nritems) {
path->slots[level] -= src->header.nritems;
- path->nodes[level] = (struct node*)dst;
+ tree_block_release(root, path->nodes[level]);
+ path->nodes[level] = t;
path->slots[level + 1] += 1;
+ } else {
+ tree_block_release(root, t);
}
return 0;
}
@@ -337,15 +331,18 @@ int insert_ptr(struct ctree_root *root,
struct ctree_path *path, struct key *key,
u64 blocknr, int level)
{
- struct node *c = path->nodes[level];
+ struct tree_buffer *t = path->nodes[level];
+ struct node *c = &path->nodes[level]->node;
struct node *b;
- struct node *bal[MAX_LEVEL];
+ struct tree_buffer *b_buffer;
+ struct tree_buffer *bal[MAX_LEVEL];
int bal_level = level;
int mid;
int bal_start = -1;
memset(bal, 0, ARRAY_SIZE(bal));
- while(c && c->header.nritems == NODEPTRS_PER_BLOCK) {
+ while(t && t->node.header.nritems == NODEPTRS_PER_BLOCK) {
+ c = &t->node;
if (push_node_left(root, path,
node_level(c->header.flags)) == 0)
break;
@@ -355,8 +352,10 @@ int insert_ptr(struct ctree_root *root,
bal_start = bal_level;
if (bal_level == MAX_LEVEL - 1)
BUG();
- b = malloc(sizeof(struct node));
+ b_buffer = alloc_free_block(root);
+ b = &b_buffer->node;
b->header.flags = c->header.flags;
+ b->header.blocknr = b_buffer->blocknr;
mid = (c->header.nritems + 1) / 2;
memcpy(b->keys, c->keys + mid,
(c->header.nritems - mid) * sizeof(struct key));
@@ -364,21 +363,28 @@ int insert_ptr(struct ctree_root *root,
(c->header.nritems - mid) * sizeof(u64));
b->header.nritems = c->header.nritems - mid;
c->header.nritems = mid;
- bal[bal_level] = b;
+
+ write_tree_block(root, t);
+ write_tree_block(root, b_buffer);
+
+ bal[bal_level] = b_buffer;
if (bal_level == MAX_LEVEL - 1)
break;
bal_level += 1;
- c = path->nodes[bal_level];
+ t = path->nodes[bal_level];
}
while(bal_start > 0) {
- b = bal[bal_start];
- c = path->nodes[bal_start];
- __insert_ptr(root, path, b->keys, (u64)b,
+ b_buffer = bal[bal_start];
+ c = &path->nodes[bal_start]->node;
+ __insert_ptr(root, path, b_buffer->node.keys, b_buffer->blocknr,
path->slots[bal_start + 1] + 1, bal_start + 1);
if (path->slots[bal_start] >= c->header.nritems) {
path->slots[bal_start] -= c->header.nritems;
- path->nodes[bal_start] = b;
+ tree_block_release(root, path->nodes[bal_start]);
+ path->nodes[bal_start] = b_buffer;
path->slots[bal_start + 1] += 1;
+ } else {
+ tree_block_release(root, b_buffer);
}
bal_start--;
if (!bal[bal_start])
@@ -404,7 +410,9 @@ int leaf_space_used(struct leaf *l, int start, int nr)
int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
int data_size)
{
- struct leaf *right = (struct leaf *)path->nodes[0];
+ struct tree_buffer *right_buf = path->nodes[0];
+ struct leaf *right = &right_buf->leaf;
+ struct tree_buffer *t;
struct leaf *left;
int slot;
int i;
@@ -421,9 +429,11 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
if (!path->nodes[1]) {
return 1;
}
- left = read_block(path->nodes[1]->blockptrs[slot - 1]);
+ t = read_tree_block(root, path->nodes[1]->node.blockptrs[slot - 1]);
+ left = &t->leaf;
free_space = leaf_free_space(left);
if (free_space < data_size + sizeof(struct item)) {
+ tree_block_release(root, t);
return 1;
}
for (i = 0; i < right->header.nritems; i++) {
@@ -436,6 +446,7 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
push_space += item->size + sizeof(*item);
}
if (push_items == 0) {
+ tree_block_release(root, t);
return 1;
}
/* push data from right to left */
@@ -446,6 +457,8 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
right->data + right->items[push_items - 1].offset,
push_space);
old_left_nritems = left->header.nritems;
+ BUG_ON(old_left_nritems < 0);
+
for(i = old_left_nritems; i < old_left_nritems + push_items; i++) {
left->items[i].offset -= LEAF_DATA_SIZE -
left->items[old_left_nritems -1].offset;
@@ -460,30 +473,40 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
(right->header.nritems - push_items) * sizeof(struct item));
right->header.nritems -= push_items;
push_space = LEAF_DATA_SIZE;
+
for (i = 0; i < right->header.nritems; i++) {
right->items[i].offset = push_space - right->items[i].size;
push_space = right->items[i].offset;
}
- fixup_low_keys(path, &right->items[0].key, 1);
+
+ write_tree_block(root, t);
+ write_tree_block(root, right_buf);
+
+ fixup_low_keys(root, path, &right->items[0].key, 1);
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
path->slots[0] += old_left_nritems;
- path->nodes[0] = (struct node*)left;
+ tree_block_release(root, path->nodes[0]);
+ path->nodes[0] = t;
path->slots[1] -= 1;
} else {
+ tree_block_release(root, t);
path->slots[0] -= push_items;
}
+ BUG_ON(path->slots[0] < 0);
return 0;
}
int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
{
- struct leaf *l = (struct leaf *)path->nodes[0];
- int nritems = l->header.nritems;
- int mid = (nritems + 1)/ 2;
- int slot = path->slots[0];
+ struct tree_buffer *l_buf = path->nodes[0];
+ struct leaf *l = &l_buf->leaf;
+ int nritems;
+ int mid;
+ int slot;
struct leaf *right;
+ struct tree_buffer *right_buffer;
int space_needed = data_size + sizeof(struct item);
int data_copy_size;
int rt_data_off;
@@ -491,9 +514,19 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
int ret;
if (push_leaf_left(root, path, data_size) == 0) {
- return 0;
+ l_buf = path->nodes[0];
+ l = &l_buf->leaf;
+ if (leaf_free_space(l) >= sizeof(struct item) + data_size)
+ return 0;
}
- right = malloc(sizeof(struct leaf));
+ slot = path->slots[0];
+ nritems = l->header.nritems;
+ mid = (nritems + 1)/ 2;
+
+ right_buffer = alloc_free_block(root);
+ BUG_ON(!right_buffer);
+ BUG_ON(mid == nritems);
+ right = &right_buffer->leaf;
memset(right, 0, sizeof(*right));
if (mid <= slot) {
if (leaf_space_used(l, mid, nritems - mid) + space_needed >
@@ -505,6 +538,8 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
BUG();
}
right->header.nritems = nritems - mid;
+ right->header.blocknr = right_buffer->blocknr;
+ right->header.flags = node_level(0);
data_copy_size = l->items[mid].offset + l->items[mid].size -
leaf_data_end(l);
memcpy(right->items, l->items + mid,
@@ -518,12 +553,20 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
}
l->header.nritems = mid;
ret = insert_ptr(root, path, &right->items[0].key,
- (u64)right, 1);
+ right_buffer->blocknr, 1);
+
+ write_tree_block(root, right_buffer);
+ write_tree_block(root, l_buf);
+
+ BUG_ON(path->slots[0] != slot);
if (mid <= slot) {
- path->nodes[0] = (struct node *)right;
+ tree_block_release(root, path->nodes[0]);
+ path->nodes[0] = right_buffer;
path->slots[0] -= mid;
path->slots[1] += 1;
- }
+ } else
+ tree_block_release(root, right_buffer);
+ BUG_ON(path->slots[0] < 0);
return ret;
}
@@ -532,28 +575,48 @@ int insert_item(struct ctree_root *root, struct key *key,
{
int ret;
int slot;
+ int slot_orig;
struct leaf *leaf;
+ struct tree_buffer *leaf_buf;
unsigned int nritems;
unsigned int data_end;
struct ctree_path path;
+ if (!root->node) {
+ struct tree_buffer *t;
+ t = alloc_free_block(root);
+ BUG_ON(!t);
+ t->node.header.nritems = 0;
+ t->node.header.flags = node_level(0);
+ t->node.header.blocknr = t->blocknr;
+ root->node = t;
+ write_tree_block(root, t);
+ }
init_path(&path);
ret = search_slot(root, key, &path);
- if (ret == 0)
+ if (ret == 0) {
+ release_path(root, &path);
return -EEXIST;
+ }
- leaf = (struct leaf *)path.nodes[0];
- if (leaf_free_space(leaf) < sizeof(struct item) + data_size)
+ slot_orig = path.slots[0];
+ leaf_buf = path.nodes[0];
+ leaf = &leaf_buf->leaf;
+ if (leaf_free_space(leaf) < sizeof(struct item) + data_size) {
split_leaf(root, &path, data_size);
- leaf = (struct leaf *)path.nodes[0];
+ leaf_buf = path.nodes[0];
+ leaf = &path.nodes[0]->leaf;
+ }
nritems = leaf->header.nritems;
data_end = leaf_data_end(leaf);
+
if (leaf_free_space(leaf) < sizeof(struct item) + data_size)
BUG();
slot = path.slots[0];
+ BUG_ON(slot < 0);
if (slot == 0)
- fixup_low_keys(&path, key, 1);
+ fixup_low_keys(root, &path, key, 1);
if (slot != nritems) {
int i;
unsigned int old_data = leaf->items[slot].offset +
@@ -580,21 +643,25 @@ int insert_item(struct ctree_root *root, struct key *key,
leaf->items[slot].size = data_size;
memcpy(leaf->data + data_end - data_size, data, data_size);
leaf->header.nritems += 1;
+ write_tree_block(root, leaf_buf);
if (leaf_free_space(leaf) < 0)
BUG();
+ release_path(root, &path);
return 0;
}
int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
{
int slot;
+ struct tree_buffer *t;
struct node *node;
int nritems;
while(1) {
- node = path->nodes[level];
- if (!node)
+ t = path->nodes[level];
+ if (!t)
break;
+ node = &t->node;
slot = path->slots[level];
nritems = node->header.nritems;
@@ -606,28 +673,34 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
sizeof(u64) * (nritems - slot - 1));
}
node->header.nritems--;
+ write_tree_block(root, t);
if (node->header.nritems != 0) {
int tslot;
if (slot == 0)
- fixup_low_keys(path, node->keys, level + 1);
+ fixup_low_keys(root, path, node->keys,
+ level + 1);
tslot = path->slots[level+1];
+ t->count++;
push_node_left(root, path, level);
if (node->header.nritems) {
push_node_right(root, path, level);
}
- if (node->header.nritems)
+ if (node->header.nritems) {
+ tree_block_release(root, t);
break;
+ }
+ tree_block_release(root, t);
path->slots[level+1] = tslot;
}
- if (node == root->node) {
- printf("root is now null!\n");
- root->node = NULL;
+ if (t == root->node) {
+ /* just turn the root into a leaf and break */
+ root->node->node.header.flags = node_level(0);
+ write_tree_block(root, t);
break;
}
level++;
if (!path->nodes[level])
BUG();
- free(node);
}
return 0;
}
@@ -636,10 +709,12 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
{
int slot;
struct leaf *leaf;
+ struct tree_buffer *leaf_buf;
int doff;
int dsize;
- leaf = (struct leaf *)path->nodes[0];
+ leaf_buf = path->nodes[0];
+ leaf = &leaf_buf->leaf;
slot = path->slots[0];
doff = leaf->items[slot].offset;
dsize = leaf->items[slot].size;
@@ -658,14 +733,15 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
}
leaf->header.nritems -= 1;
if (leaf->header.nritems == 0) {
- if (leaf == (struct leaf *)root->node)
- root->node = NULL;
- else
+ if (leaf_buf == root->node) {
+ leaf->header.flags = node_level(0);
+ write_tree_block(root, leaf_buf);
+ } else
del_ptr(root, path, 1);
- free(leaf);
} else {
if (slot == 0)
- fixup_low_keys(path, &leaf->items[0].key, 1);
+ fixup_low_keys(root, path, &leaf->items[0].key, 1);
+ write_tree_block(root, leaf_buf);
if (leaf_space_used(leaf, 0, leaf->header.nritems) <
LEAF_DATA_SIZE / 4) {
/* push_leaf_left fixes the path.
@@ -673,12 +749,13 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
* for possible call to del_ptr below
*/
slot = path->slots[1];
+ leaf_buf->count++;
push_leaf_left(root, path, 1);
if (leaf->header.nritems == 0) {
- free(leaf);
path->slots[1] = slot;
del_ptr(root, path, 1);
}
+ tree_block_release(root, leaf_buf);
}
}
return 0;
@@ -689,7 +766,7 @@ void print_leaf(struct leaf *l)
int i;
int nr = l->header.nritems;
struct item *item;
- printf("leaf %p total ptrs %d free space %d\n", l, nr,
+ printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr,
leaf_free_space(l));
fflush(stdout);
for (i = 0 ; i < nr ; i++) {
@@ -703,38 +780,45 @@ void print_leaf(struct leaf *l)
fflush(stdout);
}
}
-void print_tree(struct node *c)
+void print_tree(struct ctree_root *root, struct tree_buffer *t)
{
int i;
int nr;
+ struct node *c;
- if (!c)
+ if (!t)
return;
+ c = &t->node;
nr = c->header.nritems;
+ if (c->header.blocknr != t->blocknr)
+ BUG();
if (is_leaf(c->header.flags)) {
print_leaf((struct leaf *)c);
return;
}
- printf("node %p level %d total ptrs %d free spc %lu\n", c,
+ printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr,
node_level(c->header.flags), c->header.nritems,
NODEPTRS_PER_BLOCK - c->header.nritems);
fflush(stdout);
for (i = 0; i < nr; i++) {
- printf("\tkey %d (%lu %u %lu) block %lx\n",
+ printf("\tkey %d (%lu %u %lu) block %lu\n",
i,
c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset,
c->blockptrs[i]);
fflush(stdout);
}
for (i = 0; i < nr; i++) {
- struct node *next = read_block(c->blockptrs[i]);
+ struct tree_buffer *next_buf = read_tree_block(root,
+ c->blockptrs[i]);
+ struct node *next = &next_buf->node;
if (is_leaf(next->header.flags) &&
node_level(c->header.flags) != 1)
BUG();
if (node_level(next->header.flags) !=
node_level(c->header.flags) - 1)
BUG();
- print_tree(next);
+ print_tree(root, next_buf);
+ tree_block_release(root, next_buf);
}
}
@@ -746,23 +830,24 @@ int next_key(int i, int max_key) {
}
int main() {
- struct leaf *first_node = malloc(sizeof(struct leaf));
- struct ctree_root root;
+ struct ctree_root *root;
struct key ins;
struct key last = { (u64)-1, 0, 0};
char *buf;
int i;
int num;
int ret;
- int run_size = 100000;
+ int run_size = 1000000;
int max_key = 100000000;
int tree_size = 0;
struct ctree_path path;
+ radix_tree_init();
+
+
+ root = open_ctree("dbfile");
srand(55);
- root.node = (struct node *)first_node;
- memset(first_node, 0, sizeof(*first_node));
for (i = 0; i < run_size; i++) {
buf = malloc(64);
num = next_key(i, max_key);
@@ -772,39 +857,46 @@ int main() {
ins.objectid = num;
ins.offset = 0;
ins.flags = 0;
- ret = insert_item(&root, &ins, buf, strlen(buf));
+ ret = insert_item(root, &ins, buf, strlen(buf));
if (!ret)
tree_size++;
}
+ close_ctree(root);
+ root = open_ctree("dbfile");
+ printf("starting search\n");
srand(55);
for (i = 0; i < run_size; i++) {
num = next_key(i, max_key);
ins.objectid = num;
init_path(&path);
- ret = search_slot(&root, &ins, &path);
+ ret = search_slot(root, &ins, &path);
if (ret) {
- print_tree(root.node);
+ print_tree(root, root->node);
printf("unable to find %d\n", num);
exit(1);
}
- }
- printf("node %p level %d total ptrs %d free spc %lu\n", root.node,
- node_level(root.node->header.flags), root.node->header.nritems,
- NODEPTRS_PER_BLOCK - root.node->header.nritems);
- // print_tree(root.node);
- printf("all searches good\n");
+ release_path(root, &path);
+ }
+ close_ctree(root);
+ root = open_ctree("dbfile");
+ printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
+ node_level(root->node->node.header.flags),
+ root->node->node.header.nritems,
+ NODEPTRS_PER_BLOCK - root->node->node.header.nritems);
+ printf("all searches good, deleting some items\n");
i = 0;
srand(55);
for (i = 0 ; i < run_size/4; i++) {
num = next_key(i, max_key);
ins.objectid = num;
init_path(&path);
- ret = search_slot(&root, &ins, &path);
+ ret = search_slot(root, &ins, &path);
if (ret)
continue;
- ret = del_item(&root, &path);
+ ret = del_item(root, &path);
if (ret != 0)
BUG();
+ release_path(root, &path);
tree_size--;
}
srand(128);
@@ -813,38 +905,58 @@ int main() {
num = next_key(i, max_key);
sprintf(buf, "string-%d", num);
ins.objectid = num;
- ret = insert_item(&root, &ins, buf, strlen(buf));
+ ret = insert_item(root, &ins, buf, strlen(buf));
if (!ret)
tree_size++;
}
- while(root.node) {
+ close_ctree(root);
+ root = open_ctree("dbfile");
+ printf("starting search2\n");
+ srand(128);
+ for (i = 0; i < run_size; i++) {
+ num = next_key(i, max_key);
+ ins.objectid = num;
+ init_path(&path);
+ ret = search_slot(root, &ins, &path);
+ if (ret) {
+ print_tree(root, root->node);
+ printf("unable to find %d\n", num);
+ exit(1);
+ }
+ release_path(root, &path);
+ }
+ printf("starting big long delete run\n");
+ while(root->node && root->node->node.header.nritems > 0) {
struct leaf *leaf;
int slot;
ins.objectid = (u64)-1;
init_path(&path);
- ret = search_slot(&root, &ins, &path);
+ ret = search_slot(root, &ins, &path);
if (ret == 0)
BUG();
- leaf = (struct leaf *)(path.nodes[0]);
+ leaf = &path.nodes[0]->leaf;
slot = path.slots[0];
if (slot != leaf->header.nritems)
BUG();
while(path.slots[0] > 0) {
path.slots[0] -= 1;
slot = path.slots[0];
- leaf = (struct leaf *)(path.nodes[0]);
+ leaf = &path.nodes[0]->leaf;
if (comp_keys(&last, &leaf->items[slot].key) <= 0)
BUG();
memcpy(&last, &leaf->items[slot].key, sizeof(last));
- ret = del_item(&root, &path);
- if (ret != 0)
+ ret = del_item(root, &path);
+ if (ret != 0) {
+ printf("del_item returned %d\n", ret);
BUG();
+ }
tree_size--;
}
+ release_path(root, &path);
}
- print_tree(root.node);
+ close_ctree(root);
printf("tree size is now %d\n", tree_size);
return 0;
}
diff --git a/ctree.h b/ctree.h
new file mode 100644
index 00000000..586bf186
--- /dev/null
+++ b/ctree.h
@@ -0,0 +1,62 @@
+#ifndef __CTREE__
+#define __CTREE__
+
+#define CTREE_BLOCKSIZE 4096
+
+struct key {
+ u64 objectid;
+ u32 flags;
+ u64 offset;
+} __attribute__ ((__packed__));
+
+struct header {
+ u64 fsid[2]; /* FS specific uuid */
+ u64 blocknr;
+ u64 parentid;
+ u32 csum;
+ u32 ham;
+ u16 nritems;
+ u16 flags;
+} __attribute__ ((__packed__));
+
+#define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct header)) / \
+ (sizeof(struct key) + sizeof(u64)))
+
+#define LEVEL_BITS 3
+#define MAX_LEVEL (1 << LEVEL_BITS)
+#define node_level(f) ((f) & (MAX_LEVEL-1))
+#define is_leaf(f) (node_level(f) == 0)
+
+struct tree_buffer;
+struct ctree_root {
+ struct tree_buffer *node;
+ int fp;
+ struct radix_tree_root cache_radix;
+};
+
+struct item {
+ struct key key;
+ u16 offset;
+ u16 size;
+} __attribute__ ((__packed__));
+
+#define LEAF_DATA_SIZE (CTREE_BLOCKSIZE - sizeof(struct header))
+struct leaf {
+ struct header header;
+ union {
+ struct item items[LEAF_DATA_SIZE/sizeof(struct item)];
+ u8 data[CTREE_BLOCKSIZE-sizeof(struct header)];
+ };
+} __attribute__ ((__packed__));
+
+struct node {
+ struct header header;
+ struct key keys[NODEPTRS_PER_BLOCK];
+ u64 blockptrs[NODEPTRS_PER_BLOCK];
+} __attribute__ ((__packed__));
+
+struct ctree_path {
+ struct tree_buffer *nodes[MAX_LEVEL];
+ int slots[MAX_LEVEL];
+};
+#endif
diff --git a/disk-io.c b/disk-io.c
new file mode 100644
index 00000000..8d51a070
--- /dev/null
+++ b/disk-io.c
@@ -0,0 +1,174 @@
+#define _XOPEN_SOURCE 500
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+
+static int allocated_blocks = 0;
+
+struct ctree_header {
+ u64 root_block;
+} __attribute__ ((__packed__));
+
+static int get_free_block(struct ctree_root *root, u64 *block)
+{
+ struct stat st;
+ int ret;
+
+ st.st_size = 0;
+ ret = fstat(root->fp, &st);
+ if (st.st_size > sizeof(struct ctree_header)) {
+ *block = (st.st_size -
+ sizeof(struct ctree_header)) / CTREE_BLOCKSIZE;
+ } else {
+ *block = 0;
+ }
+ ret = ftruncate(root->fp, sizeof(struct ctree_header) + (*block + 1) *
+ CTREE_BLOCKSIZE);
+ return ret;
+}
+
+struct tree_buffer *alloc_tree_block(struct ctree_root *root, u64 blocknr)
+{
+ struct tree_buffer *buf;
+ int ret;
+ buf = malloc(sizeof(struct tree_buffer));
+ if (!buf)
+ return buf;
+ allocated_blocks++;
+ buf->blocknr = blocknr;
+ buf->count = 1;
+ radix_tree_preload(GFP_KERNEL);
+ ret = radix_tree_insert(&root->cache_radix, blocknr, buf);
+ radix_tree_preload_end();
+ if (ret) {
+ free(buf);
+ return NULL;
+ }
+ return buf;
+}
+
+struct tree_buffer *alloc_free_block(struct ctree_root *root)
+{
+ u64 free_block;
+ int ret;
+ struct tree_buffer * buf;
+ ret = get_free_block(root, &free_block);
+ if (ret) {
+ BUG();
+ return NULL;
+ }
+ buf = alloc_tree_block(root, free_block);
+ if (!buf)
+ BUG();
+ return buf;
+}
+
+struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
+{
+ loff_t offset = blocknr * CTREE_BLOCKSIZE + sizeof(struct ctree_header);
+ struct tree_buffer *buf;
+ int ret;
+
+ buf = radix_tree_lookup(&root->cache_radix, blocknr);
+ if (buf) {
+ buf->count++;
+ if (buf->blocknr != blocknr)
+ BUG();
+ if (buf->blocknr != buf->node.header.blocknr)
+ BUG();
+ return buf;
+ }
+ buf = alloc_tree_block(root, blocknr);
+ if (!buf)
+ return NULL;
+ ret = pread(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
+ if (ret != CTREE_BLOCKSIZE) {
+ free(buf);
+ return NULL;
+ }
+ if (buf->blocknr != buf->node.header.blocknr)
+ BUG();
+ return buf;
+}
+
+int write_tree_block(struct ctree_root *root, struct tree_buffer *buf)
+{
+ u64 blocknr = buf->blocknr;
+ loff_t offset = blocknr * CTREE_BLOCKSIZE + sizeof(struct ctree_header);
+ int ret;
+
+ if (buf->blocknr != buf->node.header.blocknr)
+ BUG();
+ ret = pwrite(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
+ if (ret != CTREE_BLOCKSIZE)
+ return ret;
+ if (buf == root->node)
+ return update_root_block(root);
+ return 0;
+}
+
+struct ctree_root *open_ctree(char *filename)
+{
+ struct ctree_root *root = malloc(sizeof(struct ctree_root));
+ int fp;
+ u64 root_block;
+ int ret;
+
+ fp = open(filename, O_CREAT | O_RDWR);
+ if (fp < 0) {
+ free(root);
+ return NULL;
+ }
+ root->fp = fp;
+ INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
+ ret = pread(fp, &root_block, sizeof(u64), 0);
+ if (ret == sizeof(u64)) {
+ printf("reading root node at block %lu\n", root_block);
+ root->node = read_tree_block(root, root_block);
+ } else
+ root->node = NULL;
+ return root;
+}
+
+int close_ctree(struct ctree_root *root)
+{
+ close(root->fp);
+ if (root->node)
+ tree_block_release(root, root->node);
+ free(root);
+ printf("on close %d blocks are allocated\n", allocated_blocks);
+ return 0;
+}
+
+int update_root_block(struct ctree_root *root)
+{
+ int ret;
+ u64 root_block = root->node->blocknr;
+
+ ret = pwrite(root->fp, &root_block, sizeof(u64), 0);
+ if (ret != sizeof(u64))
+ return ret;
+ return 0;
+}
+
+void tree_block_release(struct ctree_root *root, struct tree_buffer *buf)
+{
+ buf->count--;
+ if (buf->count == 0) {
+ if (!radix_tree_lookup(&root->cache_radix, buf->blocknr))
+ BUG();
+ radix_tree_delete(&root->cache_radix, buf->blocknr);
+ memset(buf, 0, sizeof(*buf));
+ free(buf);
+ BUG_ON(allocated_blocks == 0);
+ allocated_blocks--;
+ }
+}
+
diff --git a/disk-io.h b/disk-io.h
new file mode 100644
index 00000000..ee95fa05
--- /dev/null
+++ b/disk-io.h
@@ -0,0 +1,21 @@
+#ifndef __DISKIO__
+#define __DISKIO__
+
+struct tree_buffer {
+ u64 blocknr;
+ int count;
+ union {
+ struct node node;
+ struct leaf leaf;
+ };
+};
+
+struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr);
+int write_tree_block(struct ctree_root *root, struct tree_buffer *buf);
+struct ctree_root *open_ctree(char *filename);
+int close_ctree(struct ctree_root *root);
+void tree_block_release(struct ctree_root *root, struct tree_buffer *buf);
+struct tree_buffer *alloc_free_block(struct ctree_root *root);
+int update_root_block(struct ctree_root *root);
+
+#endif
diff --git a/kerncompat.h b/kerncompat.h
index 3a4bb4d6..347ca062 100644
--- a/kerncompat.h
+++ b/kerncompat.h
@@ -6,6 +6,7 @@
#define BITS_PER_LONG 64
#define __GFP_BITS_SHIFT 20
#define __GFP_BITS_MASK ((int)((1 << __GFP_BITS_SHIFT) - 1))
+#define GFP_KERNEL 0
#define __read_mostly
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#define __force
diff --git a/radix-tree.c b/radix-tree.c
new file mode 100644
index 00000000..baa25ca1
--- /dev/null
+++ b/radix-tree.c
@@ -0,0 +1,836 @@
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "kerncompat.h"
+#include "radix-tree.h"
+#ifdef __KERNEL__
+#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
+#else
+#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */
+#endif
+
+#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
+#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
+
+#define RADIX_TREE_TAG_LONGS \
+ ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
+
+struct radix_tree_node {
+ unsigned int count;
+ void *slots[RADIX_TREE_MAP_SIZE];
+ unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
+};
+
+struct radix_tree_path {
+ struct radix_tree_node *node;
+ int offset;
+};
+
+#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
+
+static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly;
+
+/*
+ * Per-cpu pool of preloaded nodes
+ */
+struct radix_tree_preload {
+ int nr;
+ struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
+};
+struct radix_tree_preload radix_tree_preloads = { 0, };
+
+static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
+{
+ return root->gfp_mask & __GFP_BITS_MASK;
+}
+
+static int internal_nodes = 0;
+/*
+ * This assumes that the caller has performed appropriate preallocation, and
+ * that the caller has pinned this thread of control to the current CPU.
+ */
+static struct radix_tree_node *
+radix_tree_node_alloc(struct radix_tree_root *root)
+{
+ struct radix_tree_node *ret;
+ ret = malloc(sizeof(struct radix_tree_node));
+ if (ret) {
+ memset(ret, 0, sizeof(struct radix_tree_node));
+ internal_nodes++;
+ }
+ return ret;
+}
+
+static inline void
+radix_tree_node_free(struct radix_tree_node *node)
+{
+ internal_nodes--;
+ free(node);
+}
+
+/*
+ * Load up this CPU's radix_tree_node buffer with sufficient objects to
+ * ensure that the addition of a single element in the tree cannot fail. On
+ * success, return zero, with preemption disabled. On error, return -ENOMEM
+ * with preemption not disabled.
+ */
+int radix_tree_preload(gfp_t gfp_mask)
+{
+ struct radix_tree_preload *rtp;
+ struct radix_tree_node *node;
+ int ret = -ENOMEM;
+
+ preempt_disable();
+ rtp = &__get_cpu_var(radix_tree_preloads);
+ while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
+ preempt_enable();
+ node = radix_tree_node_alloc(NULL);
+ if (node == NULL)
+ goto out;
+ preempt_disable();
+ rtp = &__get_cpu_var(radix_tree_preloads);
+ if (rtp->nr < ARRAY_SIZE(rtp->nodes))
+ rtp->nodes[rtp->nr++] = node;
+ else
+ radix_tree_node_free(node);
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
+ int offset)
+{
+ __set_bit(offset, node->tags[tag]);
+}
+
+static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
+ int offset)
+{
+ __clear_bit(offset, node->tags[tag]);
+}
+
+static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
+ int offset)
+{
+ return test_bit(offset, node->tags[tag]);
+}
+
+static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
+{
+ root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
+}
+
+
+static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
+{
+ root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
+}
+
+static inline void root_tag_clear_all(struct radix_tree_root *root)
+{
+ root->gfp_mask &= __GFP_BITS_MASK;
+}
+
+static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
+{
+ return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+}
+
+/*
+ * Returns 1 if any slot in the node has this tag set.
+ * Otherwise returns 0.
+ */
+static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
+{
+ int idx;
+ for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+ if (node->tags[tag][idx])
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Return the maximum key which can be store into a
+ * radix tree with height HEIGHT.
+ */
+static inline unsigned long radix_tree_maxindex(unsigned int height)
+{
+ return height_to_maxindex[height];
+}
+
+/*
+ * Extend a radix tree so it can store key @index.
+ */
+static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
+{
+ struct radix_tree_node *node;
+ unsigned int height;
+ int tag;
+
+ /* Figure out what the height should be. */
+ height = root->height + 1;
+ while (index > radix_tree_maxindex(height))
+ height++;
+
+ if (root->rnode == NULL) {
+ root->height = height;
+ goto out;
+ }
+
+ do {
+ if (!(node = radix_tree_node_alloc(root)))
+ return -ENOMEM;
+
+ /* Increase the height. */
+ node->slots[0] = root->rnode;
+
+ /* Propagate the aggregated tag info into the new root */
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+ if (root_tag_get(root, tag))
+ tag_set(node, tag, 0);
+ }
+
+ node->count = 1;
+ root->rnode = node;
+ root->height++;
+ } while (height > root->height);
+out:
+ return 0;
+}
+
+/**
+ * radix_tree_insert - insert into a radix tree
+ * @root: radix tree root
+ * @index: index key
+ * @item: item to insert
+ *
+ * Insert an item into the radix tree at position @index.
+ */
+int radix_tree_insert(struct radix_tree_root *root,
+ unsigned long index, void *item)
+{
+ struct radix_tree_node *node = NULL, *slot;
+ unsigned int height, shift;
+ int offset;
+ int error;
+
+ /* Make sure the tree is high enough. */
+ if (index > radix_tree_maxindex(root->height)) {
+ error = radix_tree_extend(root, index);
+ if (error)
+ return error;
+ }
+
+ slot = root->rnode;
+ height = root->height;
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+
+ offset = 0; /* uninitialised var warning */
+ while (height > 0) {
+ if (slot == NULL) {
+ /* Have to add a child node. */
+ if (!(slot = radix_tree_node_alloc(root)))
+ return -ENOMEM;
+ if (node) {
+ node->slots[offset] = slot;
+ node->count++;
+ } else
+ root->rnode = slot;
+ }
+
+ /* Go a level down */
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+ node = slot;
+ slot = node->slots[offset];
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ }
+
+ if (slot != NULL)
+ return -EEXIST;
+
+ if (node) {
+ node->count++;
+ node->slots[offset] = item;
+ BUG_ON(tag_get(node, 0, offset));
+ BUG_ON(tag_get(node, 1, offset));
+ } else {
+ root->rnode = item;
+ BUG_ON(root_tag_get(root, 0));
+ BUG_ON(root_tag_get(root, 1));
+ }
+
+ return 0;
+}
+
+static inline void **__lookup_slot(struct radix_tree_root *root,
+ unsigned long index)
+{
+ unsigned int height, shift;
+ struct radix_tree_node **slot;
+
+ height = root->height;
+
+ if (index > radix_tree_maxindex(height))
+ return NULL;
+
+ if (height == 0 && root->rnode)
+ return (void **)&root->rnode;
+
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+ slot = &root->rnode;
+
+ while (height > 0) {
+ if (*slot == NULL)
+ return NULL;
+
+ slot = (struct radix_tree_node **)
+ ((*slot)->slots +
+ ((index >> shift) & RADIX_TREE_MAP_MASK));
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ }
+
+ return (void **)slot;
+}
+
+/**
+ * radix_tree_lookup_slot - lookup a slot in a radix tree
+ * @root: radix tree root
+ * @index: index key
+ *
+ * Lookup the slot corresponding to the position @index in the radix tree
+ * @root. This is useful for update-if-exists operations.
+ */
+void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
+{
+ return __lookup_slot(root, index);
+}
+
+/**
+ * radix_tree_lookup - perform lookup operation on a radix tree
+ * @root: radix tree root
+ * @index: index key
+ *
+ * Lookup the item at the position @index in the radix tree @root.
+ */
+void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
+{
+ void **slot;
+
+ slot = __lookup_slot(root, index);
+ return slot != NULL ? *slot : NULL;
+}
+
+/**
+ * radix_tree_tag_set - set a tag on a radix tree node
+ * @root: radix tree root
+ * @index: index key
+ * @tag: tag index
+ *
+ * Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
+ * corresponding to @index in the radix tree. From
+ * the root all the way down to the leaf node.
+ *
+ * Returns the address of the tagged item. Setting a tag on a not-present
+ * item is a bug.
+ */
+void *radix_tree_tag_set(struct radix_tree_root *root,
+ unsigned long index, unsigned int tag)
+{
+ unsigned int height, shift;
+ struct radix_tree_node *slot;
+
+ height = root->height;
+ BUG_ON(index > radix_tree_maxindex(height));
+
+ slot = root->rnode;
+ shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+
+ while (height > 0) {
+ int offset;
+
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+ if (!tag_get(slot, tag, offset))
+ tag_set(slot, tag, offset);
+ slot = slot->slots[offset];
+ BUG_ON(slot == NULL);
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ }
+
+ /* set the root's tag bit */
+ if (slot && !root_tag_get(root, tag))
+ root_tag_set(root, tag);
+
+ return slot;
+}
+
+/**
+ * radix_tree_tag_clear - clear a tag on a radix tree node
+ * @root: radix tree root
+ * @index: index key
+ * @tag: tag index
+ *
+ * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
+ * corresponding to @index in the radix tree. If
+ * this causes the leaf node to have no tags set then clear the tag in the
+ * next-to-leaf node, etc.
+ *
+ * Returns the address of the tagged item on success, else NULL. ie:
+ * has the same return value and semantics as radix_tree_lookup().
+ */
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+ unsigned long index, unsigned int tag)
+{
+ struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+ struct radix_tree_node *slot = NULL;
+ unsigned int height, shift;
+
+ height = root->height;
+ if (index > radix_tree_maxindex(height))
+ goto out;
+
+ shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ pathp->node = NULL;
+ slot = root->rnode;
+
+ while (height > 0) {
+ int offset;
+
+ if (slot == NULL)
+ goto out;
+
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+ pathp[1].offset = offset;
+ pathp[1].node = slot;
+ slot = slot->slots[offset];
+ pathp++;
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ }
+
+ if (slot == NULL)
+ goto out;
+
+ while (pathp->node) {
+ if (!tag_get(pathp->node, tag, pathp->offset))
+ goto out;
+ tag_clear(pathp->node, tag, pathp->offset);
+ if (any_tag_set(pathp->node, tag))
+ goto out;
+ pathp--;
+ }
+
+ /* clear the root's tag bit */
+ if (root_tag_get(root, tag))
+ root_tag_clear(root, tag);
+
+out:
+ return slot;
+}
+
+#ifndef __KERNEL__ /* Only the test harness uses this at present */
+/**
+ * radix_tree_tag_get - get a tag on a radix tree node
+ * @root: radix tree root
+ * @index: index key
+ * @tag: tag index (< RADIX_TREE_MAX_TAGS)
+ *
+ * Return values:
+ *
+ * 0: tag not present or not set
+ * 1: tag set
+ */
+int radix_tree_tag_get(struct radix_tree_root *root,
+ unsigned long index, unsigned int tag)
+{
+ unsigned int height, shift;
+ struct radix_tree_node *slot;
+ int saw_unset_tag = 0;
+
+ height = root->height;
+ if (index > radix_tree_maxindex(height))
+ return 0;
+
+ /* check the root's tag bit */
+ if (!root_tag_get(root, tag))
+ return 0;
+
+ if (height == 0)
+ return 1;
+
+ shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ slot = root->rnode;
+
+ for ( ; ; ) {
+ int offset;
+
+ if (slot == NULL)
+ return 0;
+
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+
+ /*
+ * This is just a debug check. Later, we can bale as soon as
+ * we see an unset tag.
+ */
+ if (!tag_get(slot, tag, offset))
+ saw_unset_tag = 1;
+ if (height == 1) {
+ int ret = tag_get(slot, tag, offset);
+
+ BUG_ON(ret && saw_unset_tag);
+ return !!ret;
+ }
+ slot = slot->slots[offset];
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ }
+}
+#endif
+
+static unsigned int
+__lookup(struct radix_tree_root *root, void **results, unsigned long index,
+ unsigned int max_items, unsigned long *next_index)
+{
+ unsigned int nr_found = 0;
+ unsigned int shift, height;
+ struct radix_tree_node *slot;
+ unsigned long i;
+
+ height = root->height;
+ if (height == 0) {
+ if (root->rnode && index == 0)
+ results[nr_found++] = root->rnode;
+ goto out;
+ }
+
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+ slot = root->rnode;
+
+ for ( ; height > 1; height--) {
+
+ for (i = (index >> shift) & RADIX_TREE_MAP_MASK ;
+ i < RADIX_TREE_MAP_SIZE; i++) {
+ if (slot->slots[i] != NULL)
+ break;
+ index &= ~((1UL << shift) - 1);
+ index += 1UL << shift;
+ if (index == 0)
+ goto out; /* 32-bit wraparound */
+ }
+ if (i == RADIX_TREE_MAP_SIZE)
+ goto out;
+
+ shift -= RADIX_TREE_MAP_SHIFT;
+ slot = slot->slots[i];
+ }
+
+ /* Bottom level: grab some items */
+ for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
+ index++;
+ if (slot->slots[i]) {
+ results[nr_found++] = slot->slots[i];
+ if (nr_found == max_items)
+ goto out;
+ }
+ }
+out:
+ *next_index = index;
+ return nr_found;
+}
+
+/**
+ * radix_tree_gang_lookup - perform multiple lookup on a radix tree
+ * @root: radix tree root
+ * @results: where the results of the lookup are placed
+ * @first_index: start the lookup from this key
+ * @max_items: place up to this many items at *results
+ *
+ * Performs an index-ascending scan of the tree for present items. Places
+ * them at *@results and returns the number of items which were placed at
+ * *@results.
+ *
+ * The implementation is naive.
+ */
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned int max_items)
+{
+ const unsigned long max_index = radix_tree_maxindex(root->height);
+ unsigned long cur_index = first_index;
+ unsigned int ret = 0;
+
+ while (ret < max_items) {
+ unsigned int nr_found;
+ unsigned long next_index; /* Index of next search */
+
+ if (cur_index > max_index)
+ break;
+ nr_found = __lookup(root, results + ret, cur_index,
+ max_items - ret, &next_index);
+ ret += nr_found;
+ if (next_index == 0)
+ break;
+ cur_index = next_index;
+ }
+ return ret;
+}
+
+/*
+ * FIXME: the two tag_get()s here should use find_next_bit() instead of
+ * open-coding the search.
+ */
+static unsigned int
+__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
+ unsigned int max_items, unsigned long *next_index, unsigned int tag)
+{
+ unsigned int nr_found = 0;
+ unsigned int shift;
+ unsigned int height = root->height;
+ struct radix_tree_node *slot;
+
+ if (height == 0) {
+ if (root->rnode && index == 0)
+ results[nr_found++] = root->rnode;
+ goto out;
+ }
+
+ shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ slot = root->rnode;
+
+ do {
+ unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
+
+ for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
+ if (tag_get(slot, tag, i)) {
+ BUG_ON(slot->slots[i] == NULL);
+ break;
+ }
+ index &= ~((1UL << shift) - 1);
+ index += 1UL << shift;
+ if (index == 0)
+ goto out; /* 32-bit wraparound */
+ }
+ if (i == RADIX_TREE_MAP_SIZE)
+ goto out;
+ height--;
+ if (height == 0) { /* Bottom level: grab some items */
+ unsigned long j = index & RADIX_TREE_MAP_MASK;
+
+ for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
+ index++;
+ if (tag_get(slot, tag, j)) {
+ BUG_ON(slot->slots[j] == NULL);
+ results[nr_found++] = slot->slots[j];
+ if (nr_found == max_items)
+ goto out;
+ }
+ }
+ }
+ shift -= RADIX_TREE_MAP_SHIFT;
+ slot = slot->slots[i];
+ } while (height > 0);
+out:
+ *next_index = index;
+ return nr_found;
+}
+
+/**
+ * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
+ * based on a tag
+ * @root: radix tree root
+ * @results: where the results of the lookup are placed
+ * @first_index: start the lookup from this key
+ * @max_items: place up to this many items at *results
+ * @tag: the tag index (< RADIX_TREE_MAX_TAGS)
+ *
+ * Performs an index-ascending scan of the tree for present items which
+ * have the tag indexed by @tag set. Places the items at *@results and
+ * returns the number of items which were placed at *@results.
+ */
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned int max_items,
+ unsigned int tag)
+{
+ const unsigned long max_index = radix_tree_maxindex(root->height);
+ unsigned long cur_index = first_index;
+ unsigned int ret = 0;
+
+ /* check the root's tag bit */
+ if (!root_tag_get(root, tag))
+ return 0;
+
+ while (ret < max_items) {
+ unsigned int nr_found;
+ unsigned long next_index; /* Index of next search */
+
+ if (cur_index > max_index)
+ break;
+ nr_found = __lookup_tag(root, results + ret, cur_index,
+ max_items - ret, &next_index, tag);
+ ret += nr_found;
+ if (next_index == 0)
+ break;
+ cur_index = next_index;
+ }
+ return ret;
+}
+
+/**
+ * radix_tree_shrink - shrink height of a radix tree to minimal
+ * @root radix tree root
+ */
+static inline void radix_tree_shrink(struct radix_tree_root *root)
+{
+ /* try to shrink tree height */
+ while (root->height > 0 &&
+ root->rnode->count == 1 &&
+ root->rnode->slots[0]) {
+ struct radix_tree_node *to_free = root->rnode;
+
+ root->rnode = to_free->slots[0];
+ root->height--;
+ /* must only free zeroed nodes into the slab */
+ tag_clear(to_free, 0, 0);
+ tag_clear(to_free, 1, 0);
+ to_free->slots[0] = NULL;
+ to_free->count = 0;
+ radix_tree_node_free(to_free);
+ }
+}
+
+/**
+ * radix_tree_delete - delete an item from a radix tree
+ * @root: radix tree root
+ * @index: index key
+ *
+ * Remove the item at @index from the radix tree rooted at @root.
+ *
+ * Returns the address of the deleted item, or NULL if it was not present.
+ */
+void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+{
+ struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+ struct radix_tree_node *slot = NULL;
+ unsigned int height, shift;
+ int tag;
+ int offset;
+
+ height = root->height;
+ if (index > radix_tree_maxindex(height))
+ goto out;
+
+ slot = root->rnode;
+ if (height == 0 && root->rnode) {
+ root_tag_clear_all(root);
+ root->rnode = NULL;
+ goto out;
+ }
+
+ shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ pathp->node = NULL;
+
+ do {
+ if (slot == NULL)
+ goto out;
+
+ pathp++;
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+ pathp->offset = offset;
+ pathp->node = slot;
+ slot = slot->slots[offset];
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ } while (height > 0);
+
+ if (slot == NULL)
+ goto out;
+
+ /*
+ * Clear all tags associated with the just-deleted item
+ */
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+ if (tag_get(pathp->node, tag, pathp->offset))
+ radix_tree_tag_clear(root, index, tag);
+ }
+
+ /* Now free the nodes we do not need anymore */
+ while (pathp->node) {
+ pathp->node->slots[pathp->offset] = NULL;
+ pathp->node->count--;
+
+ if (pathp->node->count) {
+ if (pathp->node == root->rnode)
+ radix_tree_shrink(root);
+ goto out;
+ }
+
+ /* Node with zero slots in use so free it */
+ radix_tree_node_free(pathp->node);
+
+ pathp--;
+ }
+ root_tag_clear_all(root);
+ root->height = 0;
+ root->rnode = NULL;
+
+out:
+ return slot;
+}
+
+/**
+ * radix_tree_tagged - test whether any items in the tree are tagged
+ * @root: radix tree root
+ * @tag: tag to test
+ */
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
+{
+ return root_tag_get(root, tag);
+}
+
+static unsigned long __maxindex(unsigned int height)
+{
+ unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
+ unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
+
+ if (tmp >= RADIX_TREE_INDEX_BITS)
+ index = ~0UL;
+ return index;
+}
+
+static void radix_tree_init_maxindex(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
+ height_to_maxindex[i] = __maxindex(i);
+}
+
+void radix_tree_init(void)
+{
+ radix_tree_init_maxindex();
+}
diff --git a/radix-tree.h b/radix-tree.h
new file mode 100644
index 00000000..c3ce8813
--- /dev/null
+++ b/radix-tree.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _LINUX_RADIX_TREE_H
+#define _LINUX_RADIX_TREE_H
+
+#define RADIX_TREE_MAX_TAGS 2
+
+/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
+struct radix_tree_root {
+ unsigned int height;
+ gfp_t gfp_mask;
+ struct radix_tree_node *rnode;
+};
+
+#define RADIX_TREE_INIT(mask) { \
+ .height = 0, \
+ .gfp_mask = (mask), \
+ .rnode = NULL, \
+}
+
+#define RADIX_TREE(name, mask) \
+ struct radix_tree_root name = RADIX_TREE_INIT(mask)
+
+#define INIT_RADIX_TREE(root, mask) \
+do { \
+ (root)->height = 0; \
+ (root)->gfp_mask = (mask); \
+ (root)->rnode = NULL; \
+} while (0)
+
+int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
+void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+void *radix_tree_delete(struct radix_tree_root *, unsigned long);
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned int max_items);
+int radix_tree_preload(gfp_t gfp_mask);
+void radix_tree_init(void);
+void *radix_tree_tag_set(struct radix_tree_root *root,
+ unsigned long index, unsigned int tag);
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+ unsigned long index, unsigned int tag);
+int radix_tree_tag_get(struct radix_tree_root *root,
+ unsigned long index, unsigned int tag);
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned int max_items,
+ unsigned int tag);
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
+
+static inline void radix_tree_preload_end(void)
+{
+ preempt_enable();
+}
+
+#endif /* _LINUX_RADIX_TREE_H */