diff --git a/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp b/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp
new file mode 100644
index 0000000000..ed651db33e
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp
@@ -0,0 +1,2053 @@
+/* BPlusTree - BFS B+Tree implementation
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** Roughly based on 'btlib' written by Marcus J. Ranum
+**
+** Copyright (c) 2001-2002 pinc Software. All Rights Reserved.
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include "Debug.h"
+#include "cpp.h"
+#include "BPlusTree.h"
+#include "Inode.h"
+#include "Utility.h"
+#include "Stack.h"
+
+#include <TypeConstants.h>
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+// Node Caching for the BPlusTree class
+//
+// With write support, there is the need for a function that allocates new
+// nodes by either returning empty nodes, or by growing the file's data stream
+//
+// !! The CachedNode class assumes that you have properly locked the stream
+// !! before asking for nodes.
+//
+// Note: This code will fail if the block size is smaller than the node size!
+// Since BFS supports block sizes of 1024 bytes or greater, and the node size
+// is hard-coded to 1024 bytes, that's not an issue now.
+
+/**	Releases the block currently held by this object (if any) and
+ *	clears the block and node pointers.
+ *	Does nothing as long as the tree or its stream is not set.
+ */
+
+void
+CachedNode::Unset()
+{
+	const bool usable = fTree != NULL && fTree->fStream != NULL;
+	if (!usable || fBlock == NULL)
+		return;
+
+	release_block(fTree->fStream->GetVolume()->Device(),fBlockNumber);
+
+	fNode = NULL;
+	fBlock = NULL;
+}
+
+
+/**	Makes this object refer to the node at the given \a offset within
+ *	the tree's stream, releasing any node it held before.
+ *	If \a check is true, the links and the key count of the node are
+ *	validated as well.
+ *	Returns the node, or NULL if the offset is not a valid node position,
+ *	the block could not be retrieved, or the node failed validation. In
+ *	all failure cases, the object no longer refers to any node.
+ */
+
+bplustree_node *
+CachedNode::SetTo(off_t offset,bool check)
+{
+	if (fTree == NULL || fTree->fStream == NULL || fTree->fHeader == NULL) {
+		REPORT_ERROR(B_BAD_VALUE);
+		return NULL;
+	}
+
+	Unset();
+
+	// a node size of zero (or less) would make the modulo below trap
+	if (fTree->fNodeSize <= 0)
+		return NULL;
+
+	// You can only ask for nodes at valid positions - you can't
+	// even access the b+tree header with this method (use SetToHeader()
+	// instead)
+	if (offset > fTree->fHeader->maximum_size - fTree->fNodeSize
+		|| offset <= 0
+		|| (offset % fTree->fNodeSize) != 0)
+		return NULL;
+
+	if (InternalSetTo(offset) != NULL && check) {
+		// sanity checks (links, all_key_count)
+		bplustree_header *header = fTree->fHeader;
+		if (!header->IsValidLink(fNode->left_link)
+			|| !header->IsValidLink(fNode->right_link)
+			|| !header->IsValidLink(fNode->overflow_link)
+			|| (int8 *)fNode->Values() + fNode->all_key_count * sizeof(off_t) >
+					(int8 *)fNode + fTree->fNodeSize) {
+			FATAL(("invalid node read from offset %Ld, inode at %Ld\n",
+					offset,fTree->fStream->ID()));
+
+			// Don't keep pointing at a node that failed validation -
+			// otherwise fNode would still be non-NULL, and a later
+			// WriteBack() would accept the rejected node.
+			Unset();
+			return NULL;
+		}
+	}
+	return fNode;
+}
+
+
+/**	Makes this object refer to the b+tree header, which is located at
+ *	offset 0 of the stream, releasing any node it held before.
+ *	Returns NULL if the header could not be retrieved.
+ */
+
+bplustree_header *
+CachedNode::SetToHeader()
+{
+	if (fTree != NULL && fTree->fStream != NULL) {
+		Unset();
+
+		// the header occupies the very first node of the stream
+		return (bplustree_header *)InternalSetTo(0LL);
+	}
+
+	REPORT_ERROR(B_BAD_VALUE);
+	return NULL;
+}
+
+
+/**	Looks up the disk block that contains the stream position \a offset,
+ *	gets it via get_block(), and lets fNode point to that position within
+ *	the block.
+ *	No validation of the offset is done here besides comparing it with
+ *	the stream size; the previous block is not released either - the
+ *	callers in this file call Unset() first.
+ *	Returns the node, or NULL on failure.
+ */
+
+bplustree_node *
+CachedNode::InternalSetTo(off_t offset)
+{
+	fNode = NULL;
+
+	Inode *stream = fTree->fStream;
+	if (offset >= stream->Size())
+		return NULL;
+
+	off_t runStart;
+	block_run run;
+	if (stream->FindBlockRun(offset,run,runStart) != B_OK)
+		return NULL;
+
+	Volume *volume = stream->GetVolume();
+
+	// index of the block within the run that contains "offset"
+	int32 blockIndex = (offset - runStart) / volume->BlockSize();
+	fBlockNumber = volume->ToBlock(run) + blockIndex;
+
+	fBlock = (uint8 *)get_block(volume->Device(),fBlockNumber,volume->BlockSize());
+	if (fBlock == NULL) {
+		REPORT_ERROR(B_IO_ERROR);
+		return NULL;
+	}
+
+	// the node starts at whatever remains of "offset" after subtracting
+	// the stream position at which this block begins
+	fNode = (bplustree_node *)(fBlock + offset -
+				(runStart + blockIndex * volume->BlockSize()));
+	return fNode;
+}
+
+
+/**	Frees the node at \a offset.
+ *	If it is the last node of the tree, the tree and the stream are
+ *	shrunk by one node. Otherwise, the node this object currently refers
+ *	to is marked free and put at the head of the header's free node list;
+ *	in that case the object must have been set to the node at \a offset
+ *	before (this is not verified here).
+ *	Returns B_BAD_VALUE for missing arguments/state, or the error of the
+ *	failed operation.
+ */
+
+status_t
+CachedNode::Free(Transaction *transaction,off_t offset)
+{
+	if (transaction == NULL || fTree == NULL || fTree->fStream == NULL
+		|| fTree->fHeader == NULL || offset == BPLUSTREE_NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	// ToDo: scan the free nodes list and remove all nodes at the end
+	// of the tree - perhaps that shouldn't be done everytime that
+	// function is called, perhaps it should be done when the directory
+	// inode is closed or based on some calculation or whatever...
+
+	bplustree_header *header = fTree->fHeader;
+
+	// if the node is the last one in the tree, we shrink
+	// the tree and file size by one node
+	off_t lastOffset = header->maximum_size - fTree->fNodeSize;
+	if (offset == lastOffset) {
+		off_t previousSize = header->maximum_size;
+		header->maximum_size = lastOffset;
+
+		status_t status = fTree->fStream->SetFileSize(transaction,lastOffset);
+		if (status < B_OK) {
+			// the stream still has its old size, so the in-memory
+			// header must not claim otherwise
+			header->maximum_size = previousSize;
+			return status;
+		}
+
+		return fTree->fCachedHeader.WriteBack(transaction);
+	}
+
+	// the free list is built from the node we are currently set to
+	if (fNode == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	// add the node to the free nodes list
+	fNode->left_link = header->free_node_pointer;
+	fNode->overflow_link = BPLUSTREE_FREE;
+
+	status_t status = WriteBack(transaction);
+	if (status != B_OK)
+		return status < B_OK ? status : B_ERROR;
+
+	header->free_node_pointer = offset;
+	return fTree->fCachedHeader.WriteBack(transaction);
+}
+
+
+/**	Provides an initialized, empty node and makes this object refer
+ *	to it.
+ *	The head of the header's free node list is reused if there is one;
+ *	otherwise the stream is grown by one node. In both cases the header
+ *	is written back, but the new node itself is not.
+ *	On success, \a _node and \a _offset are set to the node and its
+ *	offset in the stream.
+ */
+
+status_t
+CachedNode::Allocate(Transaction *transaction, bplustree_node **_node, off_t *_offset)
+{
+	if (transaction == NULL || fTree == NULL || fTree->fHeader == NULL
+		|| fTree->fStream == NULL) {
+		RETURN_ERROR(B_BAD_VALUE);
+	}
+
+	// First choice: recycle a node from the free list. SetTo() rejects
+	// offsets <= 0, so this branch is skipped if the list is empty.
+	if (SetTo(fTree->fHeader->free_node_pointer,false) != NULL) {
+		*_offset = fTree->fHeader->free_node_pointer;
+
+		// unlink the node from the list
+		fTree->fHeader->free_node_pointer = fNode->left_link;
+
+		status_t status = fTree->fCachedHeader.WriteBack(transaction);
+		if (status != B_OK)
+			return status;
+
+		fNode->Initialize();
+		*_node = fNode;
+		return B_OK;
+	}
+
+	// Second choice: append a new node to the stream
+	status_t status = fTree->fStream->Append(transaction,fTree->fNodeSize);
+	if (status < B_OK)
+		return status;
+
+	// maximum_size must be raised before SetTo() is called, because
+	// SetTo() refuses offsets beyond the last node of the tree
+	off_t offset = fTree->fHeader->maximum_size;
+	fTree->fHeader->maximum_size += fTree->fNodeSize;
+
+	if (SetTo(offset,false) == NULL)
+		RETURN_ERROR(B_ERROR);
+
+	*_offset = offset;
+
+	if (fTree->fCachedHeader.WriteBack(transaction) < B_OK)
+		RETURN_ERROR(B_ERROR);
+
+	fNode->Initialize();
+	*_node = fNode;
+	return B_OK;
+}
+
+
+/**	Hands the block that contains the current node over to the
+ *	\a transaction for writing.
+ *	Returns B_BAD_VALUE if there is no transaction, tree, stream,
+ *	or current node.
+ */
+
+status_t
+CachedNode::WriteBack(Transaction *transaction)
+{
+	const bool ready = transaction != NULL && fTree != NULL
+		&& fTree->fStream != NULL && fNode != NULL;
+	if (!ready)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	return transaction->WriteBlocks(fBlockNumber,fBlock);
+}
+
+
+//	#pragma mark -
+
+
+/**	Creates a new, empty tree in \a stream with the given \a nodeSize;
+ *	see SetTo(Transaction *, Inode *, int32). The result of the
+ *	initialization can be retrieved via InitCheck().
+ */
+
+BPlusTree::BPlusTree(Transaction *transaction,Inode *stream,int32 nodeSize)
+	:
+	fStream(NULL),
+	fHeader(NULL),
+	fCachedHeader(this),
+	// defined defaults in case SetTo() bails out early
+	fNodeSize(BPLUSTREE_NODE_SIZE),
+	fAllowDuplicates(true),
+	fStatus(B_NO_INIT)
+{
+	// pass the node size on - it was silently dropped before
+	SetTo(transaction,stream,nodeSize);
+}
+
+
+/**	Opens the existing tree stored in \a stream; see SetTo(Inode *).
+ *	The result of the initialization can be retrieved via InitCheck().
+ */
+
+BPlusTree::BPlusTree(Inode *stream)
+	:
+	fStream(NULL),
+	fHeader(NULL),
+	fCachedHeader(this),
+	// defined defaults in case SetTo() bails out before setting them
+	fNodeSize(BPLUSTREE_NODE_SIZE),
+	fAllowDuplicates(true),
+	fStatus(B_NO_INIT)
+{
+	SetTo(stream);
+}
+
+
+/**	Creates a tree object that is not attached to any stream yet.
+ *	InitCheck() returns B_NO_INIT until one of the SetTo() methods
+ *	has changed the status.
+ */
+
+BPlusTree::BPlusTree()
+	: fStream(NULL), fHeader(NULL), fCachedHeader(this),
+	fNodeSize(BPLUSTREE_NODE_SIZE), fAllowDuplicates(true),
+	fStatus(B_NO_INIT)
+{
+}
+
+
+/**	Stops all TreeIterators that are still registered with this tree.
+ *	That can happen when the tree's inode gets deleted while the tree
+ *	is being traversed, since a TreeIterator doesn't lock the inode.
+ */
+
+BPlusTree::~BPlusTree()
+{
+	if (fIteratorLock.Lock() >= B_OK) {
+		TreeIterator *current = NULL;
+		for (;;) {
+			current = fIterators.Next(current);
+			if (current == NULL)
+				break;
+
+			current->Stop();
+		}
+
+		fIteratorLock.Unlock();
+	}
+}
+
+
+/**	Creates a new, empty B+tree in \a stream: the stream is resized to
+ *	hold the header and one root node (if the header cannot be read
+ *	yet), the header is initialized and written back, followed by an
+ *	empty root node.
+ *	The result is also stored in fStatus (see InitCheck()).
+ */
+
+status_t
+BPlusTree::SetTo(Transaction *transaction,Inode *stream,int32 nodeSize)
+{
+	// "stream" is dereferenced below, and a node size <= 0 cannot
+	// describe a valid tree
+	if (transaction == NULL || stream == NULL || nodeSize <= 0)
+		RETURN_ERROR(fStatus = B_BAD_VALUE);
+
+	// initializes in-memory B+Tree
+
+	// the old header block has to be released before fStream changes
+	fCachedHeader.Unset();
+	fStream = stream;
+
+	fHeader = fCachedHeader.SetToHeader();
+	if (fHeader == NULL) {
+		// allocate space for new header + node!
+		fStatus = stream->SetFileSize(transaction,nodeSize * 2);
+		if (fStatus < B_OK)
+			RETURN_ERROR(fStatus);
+
+		fHeader = fCachedHeader.SetToHeader();
+		if (fHeader == NULL)
+			RETURN_ERROR(fStatus = B_ERROR);
+	}
+
+	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
+						&& stream->BlockRun() != stream->Parent())
+						|| (stream->Mode() & S_ALLOW_DUPS) != 0;
+
+	fNodeSize = nodeSize;
+
+	// initialize b+tree header
+	fHeader->magic = BPLUSTREE_MAGIC;
+	fHeader->node_size = fNodeSize;
+	fHeader->max_number_of_levels = 1;
+	fHeader->data_type = ModeToKeyType(stream->Mode());
+	fHeader->root_node_pointer = nodeSize;
+	fHeader->free_node_pointer = BPLUSTREE_NULL;
+	fHeader->maximum_size = nodeSize * 2;
+
+	if (fCachedHeader.WriteBack(transaction) < B_OK)
+		RETURN_ERROR(fStatus = B_ERROR);
+
+	// initialize b+tree root node
+	CachedNode cached(this,fHeader->root_node_pointer,false);
+	if (cached.Node() == NULL) {
+		// keep InitCheck() in sync with the value returned here
+		RETURN_ERROR(fStatus = B_ERROR);
+	}
+
+	cached.Node()->Initialize();
+	return fStatus = cached.WriteBack(transaction);
+}
+
+
+/**	Attaches the tree object to the existing B+tree stored in \a stream.
+ *	The on-disk header is validated (magic, size, node size, root and
+ *	free node links), the key type is compared against the mode of the
+ *	inode, and the root node is read once to make sure it is accessible.
+ *	The result is also stored in fStatus (see InitCheck()).
+ */
+
+status_t
+BPlusTree::SetTo(Inode *stream)
+{
+	if (stream == NULL || stream->Node() == NULL)
+		RETURN_ERROR(fStatus = B_BAD_VALUE);
+
+	// get on-disk B+Tree header
+
+	fCachedHeader.Unset();
+	fStream = stream;
+
+	fHeader = fCachedHeader.SetToHeader();
+	if (fHeader == NULL)
+		RETURN_ERROR(fStatus = B_NO_INIT);
+
+	// is header valid?
+	// (node_size comes from disk and is used as divisor right below, so
+	// it has to be checked for zero first)
+
+	if (fHeader->magic != BPLUSTREE_MAGIC
+		|| fHeader->node_size == 0
+		|| fHeader->maximum_size != stream->Size()
+		|| (fHeader->root_node_pointer % fHeader->node_size) != 0
+		|| !fHeader->IsValidLink(fHeader->root_node_pointer)
+		|| !fHeader->IsValidLink(fHeader->free_node_pointer))
+		RETURN_ERROR(fStatus = B_BAD_DATA);
+
+	fNodeSize = fHeader->node_size;
+
+	{
+		// maps the key type stored in the header to the inode mode flag
+		// of the matching index type
+		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX,
+						   S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX};
+		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
+						   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX);
+
+		// the range check has to come first, it guards the array access
+		if (fHeader->data_type > BPLUSTREE_DOUBLE_TYPE
+			|| ((stream->Mode() & S_INDEX_DIR) && toMode[fHeader->data_type] != mode)
+			|| !stream->IsDirectory()) {
+			D(	dump_bplustree_header(fHeader);
+				dump_inode(stream->Node());
+			);
+			RETURN_ERROR(fStatus = B_BAD_TYPE);
+		}
+
+		 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
+		 // in the original BFS code - we will honour it nevertheless
+		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
+							&& stream->BlockRun() != stream->Parent())
+							|| (stream->Mode() & S_ALLOW_DUPS) != 0;
+	}
+
+	// try to read the root node (including the sanity checks)
+	CachedNode cached(this,fHeader->root_node_pointer);
+	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
+}
+
+
+/**	Returns the status stored by the constructor or the last call
+ *	to one of the SetTo() methods.
+ */
+
+status_t
+BPlusTree::InitCheck()
+{
+	const status_t status = fStatus;
+	return status;
+}
+
+
+/**	Translates a type_code (B_STRING_TYPE, B_INT32_TYPE, ...) into the
+ *	corresponding BPLUSTREE_*_TYPE key type.
+ *	Returns -1 for any type code that has no key type counterpart.
+ */
+
+int32
+BPlusTree::TypeCodeToKeyType(type_code code)
+{
+	if (code == B_STRING_TYPE)
+		return BPLUSTREE_STRING_TYPE;
+	if (code == B_INT32_TYPE)
+		return BPLUSTREE_INT32_TYPE;
+	if (code == B_UINT32_TYPE)
+		return BPLUSTREE_UINT32_TYPE;
+	if (code == B_INT64_TYPE)
+		return BPLUSTREE_INT64_TYPE;
+	if (code == B_UINT64_TYPE)
+		return BPLUSTREE_UINT64_TYPE;
+	if (code == B_FLOAT_TYPE)
+		return BPLUSTREE_FLOAT_TYPE;
+	if (code == B_DOUBLE_TYPE)
+		return BPLUSTREE_DOUBLE_TYPE;
+
+	// not a type that can be used as key
+	return -1;
+}
+
+
+/**	Translates the index type flags of an inode \a mode into the
+ *	corresponding BPLUSTREE_*_TYPE key type.
+ *	Everything that is not one of the numeric index types (which
+ *	includes standard directories) yields BPLUSTREE_STRING_TYPE.
+ */
+
+int32
+BPlusTree::ModeToKeyType(mode_t mode)
+{
+	// only the index type bits are of interest here
+	const uint32 indexType = mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX
+		| S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX);
+
+	if (indexType == S_INT_INDEX)
+		return BPLUSTREE_INT32_TYPE;
+	if (indexType == S_UINT_INDEX)
+		return BPLUSTREE_UINT32_TYPE;
+	if (indexType == S_LONG_LONG_INDEX)
+		return BPLUSTREE_INT64_TYPE;
+	if (indexType == S_ULONG_LONG_INDEX)
+		return BPLUSTREE_UINT64_TYPE;
+	if (indexType == S_FLOAT_INDEX)
+		return BPLUSTREE_FLOAT_TYPE;
+	if (indexType == S_DOUBLE_INDEX)
+		return BPLUSTREE_DOUBLE_TYPE;
+
+	// S_STR_INDEX, and the default for standard directories
+	return BPLUSTREE_STRING_TYPE;
+}
+
+
+//	#pragma mark -
+
+
+/**	Forwards a change in the tree to every registered TreeIterator by
+ *	calling its Update() method with the given arguments.
+ *	The iterator list is locked meanwhile, because iterators can be
+ *	added or removed at any time; if the lock cannot be acquired,
+ *	nothing is done.
+ */
+
+void
+BPlusTree::UpdateIterators(off_t offset,off_t nextOffset,uint16 keyIndex,uint16 splitAt,int8 change)
+{
+	if (fIteratorLock.Lock() >= B_OK) {
+		TreeIterator *current = NULL;
+		for (;;) {
+			current = fIterators.Next(current);
+			if (current == NULL)
+				break;
+
+			current->Update(offset,nextOffset,keyIndex,splitAt,change);
+		}
+
+		fIteratorLock.Unlock();
+	}
+}
+
+
+/**	Registers \a iterator with this tree while holding the iterator
+ *	lock. Nothing is done if the lock cannot be acquired.
+ */
+
+void
+BPlusTree::AddIterator(TreeIterator *iterator)
+{
+	if (fIteratorLock.Lock() >= B_OK) {
+		fIterators.Add(iterator);
+		fIteratorLock.Unlock();
+	}
+}
+
+
+/**	Removes \a iterator from the list of registered iterators while
+ *	holding the iterator lock. Nothing is done if the lock cannot be
+ *	acquired.
+ */
+
+void
+BPlusTree::RemoveIterator(TreeIterator *iterator)
+{
+	if (fIteratorLock.Lock() >= B_OK) {
+		fIterators.Remove(iterator);
+		fIteratorLock.Unlock();
+	}
+}
+
+
+/**	Compares two keys according to the key type stored in the tree's
+ *	header, by translating that type into a type_code and calling
+ *	compareKeys(). An unknown key type is passed on as type code 0.
+ */
+
+int32
+BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2)
+{
+	switch (fHeader->data_type) {
+		case BPLUSTREE_STRING_TYPE:
+			return compareKeys(B_STRING_TYPE,key1,keyLength1,key2,keyLength2);
+		case BPLUSTREE_INT32_TYPE:
+			return compareKeys(B_INT32_TYPE,key1,keyLength1,key2,keyLength2);
+		case BPLUSTREE_UINT32_TYPE:
+			return compareKeys(B_UINT32_TYPE,key1,keyLength1,key2,keyLength2);
+		case BPLUSTREE_INT64_TYPE:
+			return compareKeys(B_INT64_TYPE,key1,keyLength1,key2,keyLength2);
+		case BPLUSTREE_UINT64_TYPE:
+			return compareKeys(B_UINT64_TYPE,key1,keyLength1,key2,keyLength2);
+		case BPLUSTREE_FLOAT_TYPE:
+			return compareKeys(B_FLOAT_TYPE,key1,keyLength1,key2,keyLength2);
+		case BPLUSTREE_DOUBLE_TYPE:
+			return compareKeys(B_DOUBLE_TYPE,key1,keyLength1,key2,keyLength2);
+	}
+
+	// the header contains a key type we don't know
+	return compareKeys(type_code(0),key1,keyLength1,key2,keyLength2);
+}
+
+
+/**	Searches \a key in \a node using a binary search over its keys.
+ *	If the key is found, B_OK is returned, \a index is set to its
+ *	position and \a next to its value.
+ *	If not, B_ENTRY_NOT_FOUND is returned, \a index is set to the
+ *	position where the key would have to be inserted, and \a next to
+ *	the value at that position (or to the overflow link if that
+ *	position is behind the last key).
+ *	Both \a index and \a next may be NULL. If a key of the node does not
+ *	fit into the node or is too long, the volume's Panic() is called and
+ *	B_BAD_DATA is returned; \a index and \a next are left untouched then.
+ */
+
+status_t
+BPlusTree::FindKey(bplustree_node *node,const uint8 *key,uint16 keyLength,uint16 *index,off_t *next)
+{
+	if (node->all_key_count == 0) {
+		// nothing to search in
+		if (index != NULL)
+			*index = 0;
+		if (next != NULL)
+			*next = node->overflow_link;
+		return B_ENTRY_NOT_FOUND;
+	}
+
+	off_t *values = node->Values();
+
+	// always assigned in the loop, which runs at least once here
+	int16 insertAt = 0;
+
+	// binary search in the key array
+	int16 low = 0;
+	int16 high = node->all_key_count - 1;
+	while (low <= high) {
+		uint16 middle = (low + high) >> 1;
+
+		uint16 middleLength;
+		uint8 *middleKey = node->KeyAt(middle,&middleLength);
+
+		// never compare against a key that reaches beyond the node
+		if (middleKey + middleLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
+			|| middleLength > BPLUSTREE_MAX_KEY_LENGTH) {
+			fStream->GetVolume()->Panic();
+			RETURN_ERROR(B_BAD_DATA);
+		}
+
+		int32 result = CompareKeys(key,keyLength,middleKey,middleLength);
+		if (result == 0) {
+			if (index != NULL)
+				*index = middle;
+			if (next != NULL)
+				*next = values[middle];
+			return B_OK;
+		}
+
+		if (result < 0) {
+			// continue in the lower half
+			high = middle - 1;
+			insertAt = middle;
+		} else {
+			// continue in the upper half
+			low = middle + 1;
+			insertAt = low;
+		}
+	}
+
+	if (index != NULL)
+		*index = insertAt;
+	if (next != NULL) {
+		if (insertAt == node->all_key_count)
+			*next = node->overflow_link;
+		else
+			*next = values[insertAt];
+	}
+	return B_ENTRY_NOT_FOUND;
+}
+
+
+/**	Prepares the stack to contain all nodes that were passed while
+ *	following the key, from the root node to the leaf node that could
+ *	or should contain that key.
+ *	Returns B_OK once the leaf level is reached. An error is returned
+ *	if a node cannot be read, if a key lookup fails, or if a node links
+ *	to itself.
+ */
+
+status_t
+BPlusTree::SeekDown(Stack<node_and_key> &stack,const uint8 *key,uint16 keyLength)
+{
+	// set the root node to begin with
+	node_and_key nodeAndKey;
+	nodeAndKey.nodeOffset = fHeader->root_node_pointer;
+
+	CachedNode cached(this);
+	bplustree_node *node;
+	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
+		// if we are already on leaf level, we're done
+		if (node->overflow_link == BPLUSTREE_NULL) {
+			// note that the keyIndex is not properly set here (but it's not
+			// needed in the calling functions anyway)!
+			nodeAndKey.keyIndex = 0;
+			stack.Push(nodeAndKey);
+			return B_OK;
+		}
+
+		off_t nextOffset;
+		status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex,&nextOffset);
+
+		// FindKey() only sets "nextOffset" for these two results; on any
+		// other error it would be used uninitialized below
+		if (status != B_OK && status != B_ENTRY_NOT_FOUND)
+			RETURN_ERROR(status);
+
+		// a node that links to itself would let us loop forever
+		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
+			RETURN_ERROR(B_ERROR);
+
+		// put the node offset & the correct keyIndex on the stack
+		stack.Push(nodeAndKey);
+
+		nodeAndKey.nodeOffset = nextOffset;
+	}
+	RETURN_ERROR(B_ERROR);
+}
+
+
+/**	Walks over all values of \a node that link to a duplicate fragment,
+ *	and looks for an unused (count == 0) duplicate array in one of
+ *	those fragment nodes.
+ *	On success, \a cached refers to the fragment node, and \a _offset,
+ *	\a _fragment, and \a _index describe the free array that was found.
+ *	Returns B_ENTRY_NOT_FOUND if there is no free array.
+ */
+
+status_t
+BPlusTree::FindFreeDuplicateFragment(bplustree_node *node,CachedNode *cached,off_t *_offset,bplustree_node **_fragment,uint32 *_index)
+{
+	// number of duplicate arrays in a fragment node
+	const int32 arrayCount = (fNodeSize >> 3) / (NUM_FRAGMENT_VALUES + 1);
+
+	off_t *values = node->Values();
+	for (int32 keyIndex = 0;keyIndex < node->all_key_count;keyIndex++) {
+		// only values that link to a duplicate fragment are of interest
+		if (bplustree_node::LinkType(values[keyIndex]) == BPLUSTREE_DUPLICATE_FRAGMENT) {
+			const off_t fragmentOffset = bplustree_node::FragmentOffset(values[keyIndex]);
+
+			bplustree_node *fragment = cached->SetTo(fragmentOffset,false);
+			if (fragment == NULL) {
+				FATAL(("Could not get duplicate fragment at %Ld\n",values[keyIndex]));
+				continue;
+			}
+
+			// see if there is some space left for us
+			for (int32 arrayIndex = 0;arrayIndex < arrayCount;arrayIndex++) {
+				if (fragment->FragmentAt(arrayIndex)->count != 0)
+					continue;
+
+				*_offset = bplustree_node::FragmentOffset(values[keyIndex]);
+				*_fragment = fragment;
+				*_index = arrayIndex;
+				return B_OK;
+			}
+		}
+	}
+	return B_ENTRY_NOT_FOUND;
+}
+
+
+/**	Adds \a value to the key at \a index of the leaf \a node, which
+ *	already has at least one value.
+ *	There are three cases, depending on the current value of that key:
+ *	- it links to a duplicate fragment: the value is added to the
+ *	  fragment's array; if that array is full, its contents are moved
+ *	  into a duplicate node (reusing the fragment node if nothing else
+ *	  uses it, allocating a new node otherwise)
+ *	- it links to a duplicate node: the value is added to the first
+ *	  node in the chain that has space left; a new node is appended to
+ *	  the chain if all of them are full
+ *	- it is a plain value: the old and the new value are put into a free
+ *	  duplicate fragment (a new fragment node is allocated if needed)
+ *	\a cached must be the CachedNode that holds \a node; it is used to
+ *	write the node back when its value has been changed.
+ */
+
+status_t
+BPlusTree::InsertDuplicate(Transaction *transaction,CachedNode *cached,bplustree_node *node,uint16 index,off_t value)
+{
+	CachedNode cachedDuplicate(this);
+	off_t *values = node->Values();
+	off_t oldValue = values[index];
+	status_t status;
+	off_t offset;
+
+	if (bplustree_node::IsDuplicate(oldValue)) {
+		//
+		// If it's a duplicate fragment, try to insert it into that, or if it
+		// doesn't fit anymore, create a new duplicate node
+		//
+		if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
+			bplustree_node *duplicate = cachedDuplicate.SetTo(bplustree_node::FragmentOffset(oldValue),false);
+			if (duplicate == NULL)
+				return B_IO_ERROR;
+
+			duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue));
+			if (array->count > NUM_FRAGMENT_VALUES
+				|| array->count < 1) {
+				FATAL(("insertDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",bplustree_node::FragmentIndex(oldValue),bplustree_node::FragmentOffset(oldValue),array->count));
+				return B_BAD_DATA;
+			}
+
+			if (array->count < NUM_FRAGMENT_VALUES) {
+				array->Insert(value);
+			} else {
+				// test if the fragment will be empty if we remove this key's values
+				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
+					// the node will be empty without our values, so let us
+					// reuse it as a duplicate node
+					offset = bplustree_node::FragmentOffset(oldValue);
+
+					// source and target may overlap, hence memmove()
+					memmove(duplicate->DuplicateArray(),array,(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
+					duplicate->left_link = duplicate->right_link = BPLUSTREE_NULL;
+
+					array = duplicate->DuplicateArray();
+					array->Insert(value);
+				} else {
+					// create a new duplicate node
+					CachedNode cachedNewDuplicate(this);
+					bplustree_node *newDuplicate;
+					status = cachedNewDuplicate.Allocate(transaction,&newDuplicate,&offset);
+					if (status < B_OK)
+						return status;
+
+					// copy the array from the fragment node to the duplicate node
+					// and free the old entry (by zero'ing all values)
+					newDuplicate->overflow_link = array->count;
+					memcpy(&newDuplicate->all_key_count,&array->values[0],array->count * sizeof(off_t));
+					memset(array,0,(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
+
+					array = newDuplicate->DuplicateArray();
+					array->Insert(value);
+
+					// if this fails, the old fragments node will contain wrong
+					// data... (but since it couldn't be written, it shouldn't
+					// be fatal)
+					if ((status = cachedNewDuplicate.WriteBack(transaction)) < B_OK)
+						return status;
+				}
+
+				// update the main pointer to link to a duplicate node
+				values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_NODE,offset);
+				if ((status = cached->WriteBack(transaction)) < B_OK)
+					return status;
+			}
+
+			return cachedDuplicate.WriteBack(transaction);
+		}
+
+		//
+		// Put the value into a dedicated duplicate node
+		//
+
+		// search for free space in the duplicate nodes of that key
+		duplicate_array *array;
+		bplustree_node *duplicate;
+		off_t duplicateOffset;
+		do {
+			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
+			duplicate = cachedDuplicate.SetTo(duplicateOffset,false);
+			if (duplicate == NULL)
+				return B_IO_ERROR;
+
+			array = duplicate->DuplicateArray();
+			if (array->count > NUM_DUPLICATE_VALUES
+				|| array->count < 0) {
+				// (this message used to be labelled "removeDuplicate",
+				// which pointed to the wrong function)
+				FATAL(("insertDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",duplicateOffset,array->count));
+				return B_BAD_DATA;
+			}
+		} while (array->count >= NUM_DUPLICATE_VALUES && (oldValue = duplicate->right_link) != BPLUSTREE_NULL);
+
+		if (array->count < NUM_DUPLICATE_VALUES) {
+			array->Insert(value);
+		} else {
+			// no space left - add a new duplicate node
+
+			CachedNode cachedNewDuplicate(this);
+			bplustree_node *newDuplicate;
+			status = cachedNewDuplicate.Allocate(transaction,&newDuplicate,&offset);
+			if (status < B_OK)
+				return status;
+
+			// link the two nodes together
+			duplicate->right_link = offset;
+			newDuplicate->left_link = duplicateOffset;
+
+			array = newDuplicate->DuplicateArray();
+			array->count = 0;
+			array->Insert(value);
+
+			status = cachedNewDuplicate.WriteBack(transaction);
+			if (status < B_OK)
+				return status;
+		}
+		return cachedDuplicate.WriteBack(transaction);
+	}
+
+	//
+	// Search for a free duplicate fragment or create a new one
+	// to insert the duplicate value into
+	//
+
+	uint32 fragmentIndex = 0;
+	bplustree_node *fragment;
+	if (FindFreeDuplicateFragment(node,&cachedDuplicate,&offset,&fragment,&fragmentIndex) < B_OK) {
+		// allocate a new duplicate fragment node
+		if ((status = cachedDuplicate.Allocate(transaction,&fragment,&offset)) < B_OK)
+			return status;
+
+		// all arrays of a new fragment node are unused (count == 0)
+		memset(fragment,0,fNodeSize);
+	}
+	duplicate_array *array = fragment->FragmentAt(fragmentIndex);
+	array->Insert(oldValue);
+	array->Insert(value);
+
+	// write the fragment first, then let the key link to it
+	if ((status = cachedDuplicate.WriteBack(transaction)) < B_OK)
+		return status;
+
+	values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_FRAGMENT,offset,fragmentIndex);
+
+	return cached->WriteBack(transaction);
+}
+
+
+/**	Inserts the \a key/\a value pair at position \a index into \a node,
+ *	moving the key data, the key length index, and the values of all
+ *	following entries.
+ *	It is not checked whether the node has enough space left; the
+ *	callers in this file check that before. An \a index beyond the
+ *	number of keys is silently ignored.
+ */
+
+void
+BPlusTree::InsertKey(bplustree_node *node,uint16 index,uint8 *key,uint16 keyLength,off_t value)
+{
+	// should never happen, but who knows?
+	if (index > node->all_key_count)
+		return;
+
+	// remember where the arrays are located before the node grows
+	uint8 *keyData = node->Keys();
+	uint16 *oldLengths = node->KeyLengths();
+	off_t *oldValues = node->Values();
+
+	node->all_key_length += keyLength;
+	node->all_key_count++;
+
+	// ... and where they have to be afterwards
+	uint16 *lengths = node->KeyLengths();
+	off_t *newValues = node->Values();
+
+	// move the values: first those behind the new entry, then those
+	// in front of it
+	const int32 entriesBehind = node->all_key_count - 1 - index;
+	memmove(newValues + index + 1,oldValues + index,sizeof(off_t) * entriesBehind);
+	memmove(newValues,oldValues,sizeof(off_t) * index);
+
+	newValues[index] = value;
+
+	// move the key length index in the same order; the entries behind
+	// the new key end "keyLength" bytes later than before
+	for (int32 i = node->all_key_count - 1;i > index;i--)
+		lengths[i] = oldLengths[i - 1] + keyLength;
+	memmove(lengths,oldLengths,sizeof(uint16) * index);
+
+	// the new key starts where its predecessor ends
+	int32 keyStart = 0;
+	if (index > 0)
+		keyStart = lengths[index - 1];
+	lengths[index] = keyLength + keyStart;
+
+	// make room for the new key, and copy it into place
+	const uint16 keyEnd = lengths[index];
+	int32 bytesBehind = node->all_key_length - keyEnd;
+	if (bytesBehind > 0)
+		memmove(keyData + keyEnd,keyData + keyEnd - keyLength,bytesBehind);
+
+	memcpy(keyData + keyStart,key,keyLength);
+}
+
+
+/**	Splits the full \a node into two nodes while inserting the new
+ *	key/value pair given by \a _keyIndex, \a key, \a _keyLength, and
+ *	\a _value.
+ *	The first keys of \a node (about as many as fit into half a node)
+ *	are moved into \a other, which becomes the left sibling of \a node.
+ *	If \a node is an index node, the first key of the remaining ones is
+ *	dropped; its value becomes the overflow link of \a other.
+ *	On success, \a key, \a _keyLength, and \a _value are overwritten with
+ *	the key that has to be inserted into the parent node (the dropped
+ *	key, or else the last key of \a other), and the offset of \a other.
+ *	The \a key buffer therefore has to be large enough for any key of
+ *	the node (NOTE(review): Insert() passes a buffer of
+ *	BPLUSTREE_MAX_KEY_LENGTH + 1 bytes).
+ *	The links of the former left neighbour of \a node are not updated
+ *	here, and none of the nodes is written back.
+ */
+
+status_t
+BPlusTree::SplitNode(bplustree_node *node,off_t nodeOffset,bplustree_node *other,off_t otherOffset,uint16 *_keyIndex,uint8 *key,uint16 *_keyLength,off_t *_value)
+{
+	if (*_keyIndex > node->all_key_count + 1)
+		return B_BAD_VALUE;
+
+	uint16 *inKeyLengths = node->KeyLengths();
+	off_t *inKeyValues = node->Values();
+	uint8 *inKeys = node->Keys();
+	uint8 *outKeys = other->Keys();
+	int32 keyIndex = *_keyIndex;	// can become less than zero!
+
+	// how many keys will fit in one (half) page?
+	// that loop will find the answer to this question and
+	// change the key lengths indices for their new home
+
+	// "bytes" is the number of bytes written for the new key,
+	// "bytesBefore" are the bytes before that key
+	// "bytesAfter" are the bytes after the new key, if any
+	int32 bytes = 0,bytesBefore = 0,bytesAfter = 0;
+
+	size_t size = fNodeSize >> 1;
+	int32 out,in;
+	for (in = out = 0;in < node->all_key_count + 1;) {
+		if (!bytes)
+			bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0;
+
+		if (in == keyIndex && !bytes) {
+			bytes = *_keyLength;
+		} else {
+			if (keyIndex < out)
+				bytesAfter = inKeyLengths[in] - bytesBefore;
+
+			in++;
+		}
+		out++;
+
+		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) +
+						out * (sizeof(uint16) + sizeof(off_t)) >= size) {
+			// we have found the number of keys in the new node!
+			break;
+		}
+	}
+
+	// if the new key was not inserted, set the length of the keys
+	// that can be copied directly
+	if (keyIndex >= out && in > 0)
+		bytesBefore = inKeyLengths[in - 1];
+
+	if (bytesBefore < 0 || bytesAfter < 0)
+		return B_BAD_DATA;
+
+	other->left_link = node->left_link;
+	other->right_link = nodeOffset;
+	other->all_key_length = bytes + bytesBefore + bytesAfter;
+	other->all_key_count = out;
+
+	uint16 *outKeyLengths = other->KeyLengths();
+	off_t *outKeyValues = other->Values();
+	int32 keys = out > keyIndex ? keyIndex : out;
+
+	if (bytesBefore) {
+		// copy the keys
+		memcpy(outKeys,inKeys,bytesBefore);
+		memcpy(outKeyLengths,inKeyLengths,keys * sizeof(uint16));
+		memcpy(outKeyValues,inKeyValues,keys * sizeof(off_t));
+	}
+	if (bytes) {
+		// copy the newly inserted key
+		memcpy(outKeys + bytesBefore,key,bytes);
+		outKeyLengths[keyIndex] = bytes + bytesBefore;
+		outKeyValues[keyIndex] = *_value;
+
+		if (bytesAfter) {
+			// copy the keys after the new key
+			memcpy(outKeys + bytesBefore + bytes,inKeys + bytesBefore,bytesAfter);
+			keys = out - keyIndex - 1;
+			for (int32 i = 0;i < keys;i++)
+				outKeyLengths[keyIndex + i + 1] = inKeyLengths[keyIndex + i] + bytes;
+			memcpy(outKeyValues + keyIndex + 1,inKeyValues + keyIndex,keys * sizeof(off_t));
+		}
+	}
+
+	// if the new key was already inserted, we shouldn't use it again
+	if (in != out)
+		keyIndex--;
+
+	int32 total = bytesBefore + bytesAfter;
+
+	// these variables are for the key that will be returned
+	// to the parent node
+	uint8 *newKey = NULL;
+	uint16 newLength;
+	bool newAllocated = false;
+
+	// If we have split an index node, we have to drop the first key
+	// of the next node (which can also be the new key to insert).
+	// The dropped key is also the one which has to be inserted in
+	// the parent node, so we will set the "newKey" already here.
+	if (node->overflow_link != BPLUSTREE_NULL) {
+		if (in == keyIndex) {
+			newKey = key;
+			newLength = *_keyLength;
+
+			other->overflow_link = *_value;
+			keyIndex--;
+		} else {
+			// If a key is dropped (is not the new key), we have to copy
+			// it, because it would be lost if not.
+			uint8 *droppedKey = node->KeyAt(in,&newLength);
+			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
+				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
+				fStream->GetVolume()->Panic();
+				RETURN_ERROR(B_BAD_DATA);
+			}
+			newKey = (uint8 *)malloc(newLength);
+			if (newKey == NULL)
+				return B_NO_MEMORY;
+			memcpy(newKey,droppedKey,newLength);
+
+			// remember that the copy has to be freed again - without
+			// this, the buffer was leaked on every split of an index node
+			newAllocated = true;
+
+			other->overflow_link = inKeyValues[in];
+			total = inKeyLengths[in++];
+		}
+	}
+
+	// and now the same game for the other page and the rest of the keys
+	// (but with memmove() instead of memcpy(), because they may overlap)
+
+	bytesBefore = bytesAfter = bytes = 0;
+	out = 0;
+	int32 skip = in;
+	while (in < node->all_key_count + 1) {
+		if (in == keyIndex && !bytes) {
+			// it's enough to set bytesBefore once here, because we do
+			// not need to know the exact length of all keys in this
+			// loop
+			bytesBefore = in > skip ? inKeyLengths[in - 1] : 0;
+			bytes = *_keyLength;
+		} else {
+			if (in < node->all_key_count) {
+				inKeyLengths[in] -= total;
+				if (bytes) {
+					inKeyLengths[in] += bytes;
+					bytesAfter = inKeyLengths[in] - bytesBefore - bytes;
+				}
+			}
+			in++;
+		}
+
+		out++;
+
+		// break out when all keys are done
+		if (in > node->all_key_count && keyIndex < in)
+			break;
+	}
+
+	// adjust the byte counts (since we were a bit lazy in the loop)
+	if (keyIndex >= in && keyIndex - skip < out)
+		bytesAfter = inKeyLengths[in] - bytesBefore - total;
+	else if (keyIndex < skip)
+		bytesBefore = node->all_key_length - total;
+
+	if (bytesBefore < 0 || bytesAfter < 0) {
+		// don't leak the copy of the dropped key on this error path
+		if (newAllocated)
+			free(newKey);
+		return B_BAD_DATA;
+	}
+
+	node->left_link = otherOffset;
+		// right link, and overflow link can stay the same
+	node->all_key_length = bytes + bytesBefore + bytesAfter;
+	node->all_key_count = out - 1;
+
+	// array positions have changed
+	outKeyLengths = node->KeyLengths();
+	outKeyValues = node->Values();
+
+	// move the keys in the old node: the order is important here,
+	// because we don't want to overwrite any contents
+
+	keys = keyIndex <= skip ? out : keyIndex - skip;
+	keyIndex -= skip;
+
+	if (bytesBefore)
+		memmove(inKeys,inKeys + total,bytesBefore);
+	if (bytesAfter)
+		memmove(inKeys + bytesBefore + bytes,inKeys + total + bytesBefore,bytesAfter);
+
+	if (bytesBefore)
+		memmove(outKeyLengths,inKeyLengths + skip,keys * sizeof(uint16));
+	in = out - keyIndex - 1;
+	if (bytesAfter)
+		memmove(outKeyLengths + keyIndex + 1,inKeyLengths + skip + keyIndex,in * sizeof(uint16));
+
+	if (bytesBefore)
+		memmove(outKeyValues,inKeyValues + skip,keys * sizeof(off_t));
+	if (bytesAfter)
+		memmove(outKeyValues + keyIndex + 1,inKeyValues + skip + keyIndex,in * sizeof(off_t));
+
+	if (bytes) {
+		// finally, copy the newly inserted key (don't overwrite anything)
+		memcpy(inKeys + bytesBefore,key,bytes);
+		outKeyLengths[keyIndex] = bytes + bytesBefore;
+		outKeyValues[keyIndex] = *_value;
+	}
+
+	// Prepare the key that will be inserted in the parent node which
+	// is either the dropped key or the last of the other node.
+	// If it's the dropped key, "newKey" was already set earlier.
+
+	if (newKey == NULL)
+		newKey = other->KeyAt(other->all_key_count - 1,&newLength);
+
+	memcpy(key,newKey,newLength);
+	*_keyLength = newLength;
+	*_value = otherOffset;
+
+	if (newAllocated)
+		free(newKey);
+
+	return B_OK;
+}
+
+
+/**	Inserts the \a key/\a value pair into the tree.
+ *	The stream is write locked for the duration of the call.
+ *	If the key already exists, the value is either added as a duplicate
+ *	(if the tree allows duplicates), or B_NAME_IN_USE is returned.
+ *	If the target node is full, it is split, and the key returned by
+ *	SplitNode() is inserted into the parent node in the next round of
+ *	the loop; if the root node had to be split, a new root node is
+ *	created.
+ *	Returns B_BAD_VALUE if the key length is out of range.
+ */
+
+status_t
+BPlusTree::Insert(Transaction *transaction,const uint8 *key,uint16 keyLength,off_t value)
+{
+	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	// lock access to stream
+	WriteLocked locked(fStream->Lock());
+
+	Stack<node_and_key> stack;
+	if (SeekDown(stack,key,keyLength) != B_OK)
+		RETURN_ERROR(B_ERROR);
+
+	// SplitNode() replaces the key with the one for the parent node,
+	// so we need to work on a copy
+	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
+
+	memcpy(keyBuffer,key,keyLength);
+	keyBuffer[keyLength] = 0;
+
+	node_and_key nodeAndKey;
+	bplustree_node *node;
+
+	CachedNode cached(this);
+	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
+		if (node->IsLeaf())	{
+			// first round, check for duplicate entries
+			status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex);
+
+			// is this a duplicate entry?
+			if (status == B_OK) {
+				if (fAllowDuplicates)
+					return InsertDuplicate(transaction,&cached,node,nodeAndKey.keyIndex,value);
+				else
+					RETURN_ERROR(B_NAME_IN_USE);
+			}
+
+			// Any other error than "not found" means that the key index
+			// has not been set - don't insert at a stale position then
+			if (status != B_ENTRY_NOT_FOUND)
+				RETURN_ERROR(status);
+		}
+
+		// is the node big enough to hold the pair?
+		if (int32(round_up(sizeof(bplustree_node) + node->all_key_length + keyLength)
+			+ (node->all_key_count + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize)
+		{
+			InsertKey(node,nodeAndKey.keyIndex,keyBuffer,keyLength,value);
+			UpdateIterators(nodeAndKey.nodeOffset,BPLUSTREE_NULL,nodeAndKey.keyIndex,0,1);
+
+			return cached.WriteBack(transaction);
+		} else {
+			CachedNode cachedNewRoot(this);
+			CachedNode cachedOther(this);
+
+			// do we need to allocate a new root node? if so, then do
+			// it now
+			off_t newRoot = BPLUSTREE_NULL;
+			if (nodeAndKey.nodeOffset == fHeader->root_node_pointer) {
+				bplustree_node *root;
+				status_t status = cachedNewRoot.Allocate(transaction,&root,&newRoot);
+				if (status < B_OK) {
+					// The tree is most likely corrupted!
+					// But it's still sane at leaf level - we could set
+					// a flag in the header that forces the tree to be
+					// rebuilt next time...
+					// But since we will have journaling, that's not a big
+					// problem anyway.
+					RETURN_ERROR(status);
+				}
+			}
+
+			// reserve space for the other node
+			bplustree_node *other;
+			off_t otherOffset;
+			status_t status = cachedOther.Allocate(transaction,&other,&otherOffset);
+			if (status < B_OK) {
+				// only free the root node if we have allocated one
+				if (newRoot != BPLUSTREE_NULL)
+					cachedNewRoot.Free(transaction,newRoot);
+				RETURN_ERROR(status);
+			}
+
+			if (SplitNode(node,nodeAndKey.nodeOffset,other,otherOffset,&nodeAndKey.keyIndex,keyBuffer,&keyLength,&value) < B_OK) {
+				// free root node & other node here
+				if (newRoot != BPLUSTREE_NULL)
+					cachedNewRoot.Free(transaction,newRoot);
+				cachedOther.Free(transaction,otherOffset);
+
+				RETURN_ERROR(B_ERROR);
+			}
+
+			// write the updated nodes back
+
+			if (cached.WriteBack(transaction) < B_OK
+				|| cachedOther.WriteBack(transaction) < B_OK)
+				RETURN_ERROR(B_ERROR);
+
+			UpdateIterators(nodeAndKey.nodeOffset,otherOffset,nodeAndKey.keyIndex,node->all_key_count,1);
+
+			// update the right link of the node in the left of the new node
+			if ((other = cachedOther.SetTo(other->left_link)) != NULL) {
+				other->right_link = otherOffset;
+				if (cachedOther.WriteBack(transaction) < B_OK)
+					RETURN_ERROR(B_ERROR);
+			}
+
+			// create a new root if necessary
+			if (newRoot != BPLUSTREE_NULL) {
+				bplustree_node *root = cachedNewRoot.Node();
+
+				// the new root has one key that links to the new (left)
+				// node, and the old node as its overflow link
+				InsertKey(root,0,keyBuffer,keyLength,node->left_link);
+				root->overflow_link = nodeAndKey.nodeOffset;
+
+				if (cachedNewRoot.WriteBack(transaction) < B_OK)
+					RETURN_ERROR(B_ERROR);
+
+				// finally, update header to point to the new root
+				fHeader->root_node_pointer = newRoot;
+				fHeader->max_number_of_levels++;
+
+				return fCachedHeader.WriteBack(transaction);
+			}
+
+			// otherwise, the key/value pair set by SplitNode() is
+			// inserted into the parent node in the next round
+		}
+	}
+	RETURN_ERROR(B_ERROR);
+}
+
+
+/**	Removes "value" from the duplicates that belong to the key at "index"
+ *	of the leaf "node". The value stored for that key is a link to either
+ *	a duplicate fragment or a chain of duplicate nodes.
+ *	If only a single value remains afterwards, the link in the leaf is
+ *	replaced by that value again. If the remaining values of a duplicate
+ *	node fit into a fragment, they are moved into one.
+ *	"cached" must be the CachedNode that holds "node"; it is written back
+ *	whenever the link in the leaf has been changed.
+ *	Returns B_IO_ERROR if a duplicate node could not be read, B_BAD_DATA
+ *	if the duplicate structures look invalid, B_ENTRY_NOT_FOUND if no
+ *	duplicate node contains the value, or the result of the last write.
+ */
+
+status_t
+BPlusTree::RemoveDuplicate(Transaction *transaction,bplustree_node *node,CachedNode *cached,uint16 index,off_t value)
+{
+	CachedNode cachedDuplicate(this);
+	off_t *values = node->Values();
+	off_t oldValue = values[index];
+	status_t status;
+
+	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
+	bplustree_node *duplicate = cachedDuplicate.SetTo(duplicateOffset,false);
+	if (duplicate == NULL)
+		return B_IO_ERROR;
+
+	// if it's a duplicate fragment, remove the entry from there
+	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
+		duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue));
+
+		if (array->count > NUM_FRAGMENT_VALUES
+			|| array->count < 1) {
+			FATAL(("removeDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",bplustree_node::FragmentIndex(oldValue),duplicateOffset,array->count));
+			return B_BAD_DATA;
+		}
+		// a missing value is only reported, the function carries on
+		if (!array->Remove(value))
+			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",value,duplicateOffset));
+
+		// remove the array from the fragment node if it is empty
+		if (array->count == 1) {
+			// set the link to the remaining value
+			values[index] = array->values[0];
+
+			// Remove the whole fragment node, if this was the only array,
+			// otherwise free the array and write the changes back
+			if (duplicate->FragmentsUsed(fNodeSize) == 1)
+				status = cachedDuplicate.Free(transaction,duplicateOffset);
+			else {
+				array->count = 0;
+				status = cachedDuplicate.WriteBack(transaction);
+			}
+			if (status < B_OK)
+				return status;
+
+			return cached->WriteBack(transaction);
+		}
+		return cachedDuplicate.WriteBack(transaction);
+	}
+
+	//
+	// Remove value from a duplicate node!
+	//
+
+	duplicate_array *array;
+
+	if (duplicate->left_link != BPLUSTREE_NULL) {
+		FATAL(("invalid duplicate node: first left link points to %Ld!\n",duplicate->left_link));
+		return B_BAD_DATA;
+	}
+
+	// Search the duplicate nodes until the entry could be found (and removed)
+	while (duplicate != NULL) {
+		array = duplicate->DuplicateArray();
+		if (array->count > NUM_DUPLICATE_VALUES
+			|| array->count < 0) {
+			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",duplicateOffset,array->count));
+			return B_BAD_DATA;
+		}
+
+		if (array->Remove(value))
+			break;
+
+		if ((duplicateOffset = duplicate->right_link) == BPLUSTREE_NULL)
+			RETURN_ERROR(B_ENTRY_NOT_FOUND);
+		
+		duplicate = cachedDuplicate.SetTo(duplicateOffset,false);
+	}
+	if (duplicate == NULL)
+		RETURN_ERROR(B_IO_ERROR);
+
+	// "duplicate"/"array" now refer to the node the value was removed from;
+	// the loop only repeats (via "continue") when a sibling became the last
+	// remaining node and has to be shrunk as well
+	while (true) {
+		off_t left = duplicate->left_link;
+		off_t right = duplicate->right_link;
+		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
+	
+		if (isLast && array->count == 1 || array->count == 0) {
+			// Free empty duplicate page, link their siblings together, and
+			// update the duplicate link if needed (which should not be, if
+			// we are the only one working on that tree...)
+	
+			if (duplicateOffset == bplustree_node::FragmentOffset(oldValue)
+				|| array->count == 1) {
+				if (array->count == 1 && isLast)
+					values[index] = array->values[0];
+				else if (isLast) {
+					FATAL(("removed last value from duplicate!\n"));
+				} else
+					values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_NODE,right);
+	
+				if ((status = cached->WriteBack(transaction)) < B_OK)
+					return status;
+			}
+	
+			if ((status = cachedDuplicate.Free(transaction,duplicateOffset)) < B_OK)
+				return status;
+	
+			if (left != BPLUSTREE_NULL
+				&& (duplicate = cachedDuplicate.SetTo(left,false)) != NULL) {
+				duplicate->right_link = right;
+				
+				// "array" still points into the node that has just been
+				// freed - it must follow "duplicate" before it is looked at
+				// again (here, and at the top of the loop after "continue")
+				array = duplicate->DuplicateArray();
+
+				// If the next node is the last node, we need to free that node
+				// and convert the duplicate entry back into a normal entry
+				if (right == BPLUSTREE_NULL && duplicate->left_link == BPLUSTREE_NULL
+					&& duplicate->DuplicateArray()->count <= NUM_FRAGMENT_VALUES) {
+					duplicateOffset = left;
+					continue;
+				}
+
+				status = cachedDuplicate.WriteBack(transaction);
+				if (status < B_OK)
+					return status;
+			}
+			if (right != BPLUSTREE_NULL
+				&& (duplicate = cachedDuplicate.SetTo(right,false)) != NULL) {
+				duplicate->left_link = left;
+	
+				// Again, we may need to turn the duplicate entry back into a normal entry
+				array = duplicate->DuplicateArray();
+				if (left == BPLUSTREE_NULL && duplicate->right_link == BPLUSTREE_NULL
+					&& duplicate->DuplicateArray()->count <= NUM_FRAGMENT_VALUES) {
+					duplicateOffset = right;
+					continue;
+				}
+
+				return cachedDuplicate.WriteBack(transaction);
+			}
+			return status;
+		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
+			// If the number of entries fits in a duplicate fragment, then
+			// either find a free fragment node, or convert this node to a
+			// fragment node.
+			CachedNode cachedOther(this);
+	
+			bplustree_node *fragment = NULL;
+			uint32 fragmentIndex = 0;
+			off_t offset;
+			if (FindFreeDuplicateFragment(node,&cachedOther,&offset,&fragment,&fragmentIndex) < B_OK) {
+				// convert node: the array moves to the start of the node and
+				// becomes fragment 0, the rest of the node is cleared
+				memmove(duplicate,array,(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
+				memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1,0,fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
+			} else {
+				// move to other node
+				duplicate_array *target = fragment->FragmentAt(fragmentIndex);
+				memcpy(target,array,(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
+	
+				cachedDuplicate.Free(transaction,duplicateOffset);
+				duplicateOffset = offset;
+			}
+			values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_FRAGMENT,duplicateOffset,fragmentIndex);
+	
+			if ((status = cached->WriteBack(transaction)) < B_OK)
+				return status;
+	
+			if (fragment != NULL)
+				return cachedOther.WriteBack(transaction);
+		}
+		return cachedDuplicate.WriteBack(transaction);
+	}
+}
+
+
+/** Removes the key with the given index from the specified node.
+ *	Since it has to get the key from the node anyway (to obtain its
+ *	pointer), it's not needed to pass the key & its length, although
+ *	the calling method (BPlusTree::Remove()) has this data.
+ *	The key data, the key length array and the value array are all
+ *	compacted in place; the node is not written back here.
+ *	For index nodes, asking to remove the slot behind the last key (the
+ *	overflow link) removes the last key instead and turns its value into
+ *	the new overflow link.
+ */
+
+void
+BPlusTree::RemoveKey(bplustree_node *node,uint16 index)
+{
+	// should never happen, but who knows?
+	// NOTE(review): this check lets any index pass when the node has no
+	// keys at all - presumably callers never do that; confirm
+	if (index > node->all_key_count && node->all_key_count > 0) {
+		FATAL(("Asked me to remove key outer limits: %u\n",index));
+		return;
+	}
+
+	// must be taken before the counters below are changed, as the position
+	// of the value array is obtained again afterwards ("newValues")
+	off_t *values = node->Values();
+
+	// if we would have to drop the overflow link, drop
+	// the last key instead and update the overflow link
+	// to the value of that one
+	if (!node->IsLeaf() && index == node->all_key_count)
+		node->overflow_link = values[--index];
+
+	uint16 length;
+	uint8 *key = node->KeyAt(index,&length);
+	// NOTE(review): KeyAt() returns NULL for an out-of-range index, which is
+	// not checked here - verify that this cannot happen
+	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
+		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
+		FATAL(("Key length to long: %s, %u (inode at %ld,%u [%s])\n",key,length,fStream->BlockRun().allocation_group,fStream->BlockRun().start,fStream->Name()));
+		fStream->GetVolume()->Panic();
+		return;
+	}
+
+	// the old positions of the key lengths and the key data
+	uint16 *keyLengths = node->KeyLengths();
+	uint8 *keys = node->Keys();
+
+	node->all_key_count--;
+	node->all_key_length -= length;
+
+	// the positions of both arrays for the updated counters
+	off_t *newValues = node->Values();
+	uint16 *newKeyLengths = node->KeyLengths();
+
+	// move key data
+	memmove(key,key + length,node->all_key_length - (key - keys));
+
+	// move and update key lengths
+	// (the entries following the removed key are reduced by its length)
+	if (index > 0 && newKeyLengths != keyLengths)
+		memmove(newKeyLengths,keyLengths,index * sizeof(uint16));
+	for (uint16 i = index;i < node->all_key_count;i++)
+		newKeyLengths[i] = keyLengths[i + 1] - length;
+
+	// move values
+	// (first the ones in front of the removed entry, then the ones behind)
+	if (index > 0)
+		memmove(newValues,values,index * sizeof(off_t));
+	if (node->all_key_count > index)
+		memmove(newValues + index,values + index + 1,(node->all_key_count - index) * sizeof(off_t));
+}
+
+
+/**	Removes the specified key from the tree. The "value" parameter is only used
+ *	for trees which allow duplicates, so you may safely ignore it.
+ *	It's not an optional parameter, so at least you have to think about it.
+ *	The stream is write locked for the duration of the call.
+ *	Returns B_BAD_VALUE for an invalid key length, the error of FindKey()
+ *	if the key is not in its leaf, B_IO_ERROR if a node could not be written
+ *	back, and B_ERROR if the tree could not be walked.
+ */
+
+status_t
+BPlusTree::Remove(Transaction *transaction,const uint8 *key,uint16 keyLength,off_t value)
+{
+	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	// lock access to stream
+	WriteLocked locked(fStream->Lock());
+
+	Stack<node_and_key> stack;
+	if (SeekDown(stack,key,keyLength) != B_OK)
+		RETURN_ERROR(B_ERROR);
+
+	node_and_key nodeAndKey;
+	bplustree_node *node;
+
+	// walk back up from the leaf; the loop only gets to the next node
+	// when the current one has been freed
+	CachedNode cached(this);
+	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL)
+	{
+		if (node->IsLeaf())	// first round, check for duplicate entries
+		{
+			status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex);
+			if (status < B_OK)
+				RETURN_ERROR(status); 
+
+			// If we will remove the last key, the iterator will be set
+			// to the next node after the current - if there aren't any
+			// more nodes, we need a way to prevent the TreeIterators to
+			// touch the old node again, we use BPLUSTREE_FREE for this
+			off_t next = node->right_link == BPLUSTREE_NULL ? BPLUSTREE_FREE : node->right_link;
+			UpdateIterators(nodeAndKey.nodeOffset,node->all_key_count == 1 ?
+								next : BPLUSTREE_NULL,nodeAndKey.keyIndex,0,-1);
+
+			// is this a duplicate entry?
+			if (bplustree_node::IsDuplicate(node->Values()[nodeAndKey.keyIndex])) {
+				if (fAllowDuplicates)
+					return RemoveDuplicate(transaction,node,&cached,nodeAndKey.keyIndex,value);
+				else
+					RETURN_ERROR(B_NAME_IN_USE);
+			}
+		}
+
+		// if it's an empty root node, we have to convert it
+		// to a leaf node by dropping the overflow link, or,
+		// if it's a leaf node, just empty it
+		// (both cases apply to the root node only - any other leaf with
+		// a single key has to be unlinked and freed below instead)
+		if (nodeAndKey.nodeOffset == fHeader->root_node_pointer
+			&& (node->all_key_count == 0
+				|| (node->all_key_count == 1 && node->IsLeaf()))) {
+			node->overflow_link = BPLUSTREE_NULL;
+			node->all_key_count = 0;
+			node->all_key_length = 0;
+
+			if (cached.WriteBack(transaction) < B_OK)
+				return B_IO_ERROR;
+
+			fHeader->max_number_of_levels = 1;
+			return fCachedHeader.WriteBack(transaction);
+		}
+
+		// if there is only one key left, we don't have to remove
+		// it, we can just dump the node (index nodes still have
+		// the overflow link, so we have to drop the last key)
+		if (node->all_key_count > 1
+			|| (!node->IsLeaf() && node->all_key_count == 1)) {
+			RemoveKey(node,nodeAndKey.keyIndex);
+			return cached.WriteBack(transaction);
+		}
+
+		// when we are here, we can just free the node, but
+		// we have to update the right/left link of the
+		// siblings first
+		CachedNode otherCached(this);
+		bplustree_node *other = otherCached.SetTo(node->left_link);
+		if (other != NULL) {
+			other->right_link = node->right_link;
+			if (otherCached.WriteBack(transaction) < B_OK)
+				return B_IO_ERROR;
+		}
+
+		if ((other = otherCached.SetTo(node->right_link)) != NULL) {
+			other->left_link = node->left_link;
+			if (otherCached.WriteBack(transaction) < B_OK)
+				return B_IO_ERROR;
+		}
+
+		// the reference to this node is removed from its parent in the
+		// next round
+		cached.Free(transaction,nodeAndKey.nodeOffset);
+	}
+	RETURN_ERROR(B_ERROR);
+}
+
+
+/**	Stores a new value for a key that is already part of the tree.
+ *	The tree is descended from the root to the leaf that should contain
+ *	the key; if the key is found there, its value is overwritten and the
+ *	leaf is written back.
+ *	Returns B_OK on success, B_BAD_VALUE for a NULL key or an invalid key
+ *	length, the result of FindKey() if the key is not in the leaf (the
+ *	original documentation names B_ENTRY_NOT_FOUND), and B_ERROR if the
+ *	tree could not be walked.
+ *	This doesn't work with duplicates: B_BAD_TYPE is returned for trees
+ *	that allow them.
+ */
+
+status_t
+BPlusTree::Replace(Transaction *transaction,const uint8 *key,uint16 keyLength,off_t value)
+{
+	if (key == NULL
+		|| keyLength < BPLUSTREE_MIN_KEY_LENGTH
+		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	if (fAllowDuplicates)
+		RETURN_ERROR(B_BAD_TYPE);
+
+	// lock access to stream (a read lock is okay for this purpose)
+	ReadLocked locked(fStream->Lock());
+
+	off_t currentOffset = fHeader->root_node_pointer;
+	CachedNode cached(this);
+
+	for (;;) {
+		bplustree_node *current = cached.SetTo(currentOffset);
+		if (current == NULL)
+			break;
+
+		uint16 index = 0;
+		off_t childOffset;
+		status_t status = FindKey(current,key,keyLength,&index,&childOffset);
+
+		// a node without overflow link is a leaf - the search ends here
+		if (current->overflow_link == BPLUSTREE_NULL) {
+			if (status != B_OK)
+				return status;
+
+			current->Values()[index] = value;
+			return cached.WriteBack(transaction);
+		}
+
+		// an index node that points to itself would keep us here forever
+		if (childOffset == currentOffset)
+			RETURN_ERROR(B_ERROR);
+
+		currentOffset = childOffset;
+	}
+	RETURN_ERROR(B_ERROR);
+}
+
+
+/**	Looks up a key and, if it is found, copies its value to "_value"
+ *	("_value" may be NULL if only the existence of the key matters).
+ *	It's very similar to BPlusTree::SeekDown(), but doesn't fill
+ *	a stack while it descends the tree.
+ *	Returns the result of FindKey() for the leaf that should contain the
+ *	key (the original documentation names B_OK and B_ENTRY_NOT_FOUND),
+ *	B_BAD_VALUE for a NULL key or an invalid key length, and B_ERROR if
+ *	the tree could not be walked.
+ *	This doesn't work with duplicates: B_BAD_TYPE is returned for trees
+ *	that allow them.
+ */
+
+status_t
+BPlusTree::Find(const uint8 *key,uint16 keyLength,off_t *_value)
+{
+	if (key == NULL
+		|| keyLength < BPLUSTREE_MIN_KEY_LENGTH
+		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	if (fAllowDuplicates)
+		RETURN_ERROR(B_BAD_TYPE);
+
+	// lock access to stream
+	ReadLocked locked(fStream->Lock());
+
+	off_t currentOffset = fHeader->root_node_pointer;
+	CachedNode cached(this);
+
+	for (;;) {
+		bplustree_node *current = cached.SetTo(currentOffset);
+		if (current == NULL)
+			break;
+
+		uint16 index = 0;
+		off_t childOffset;
+		status_t status = FindKey(current,key,keyLength,&index,&childOffset);
+
+		// a node without overflow link is a leaf - the search ends here
+		if (current->overflow_link == BPLUSTREE_NULL) {
+			const bool wantsValue = _value != NULL;
+			if (status == B_OK && wantsValue)
+				*_value = current->Values()[index];
+
+			return status;
+		}
+
+		// an index node that points to itself would keep us here forever
+		if (childOffset == currentOffset)
+			RETURN_ERROR(B_ERROR);
+
+		currentOffset = childOffset;
+	}
+	RETURN_ERROR(B_ERROR);
+}
+
+
+//	#pragma mark -
+
+
+/**	Creates an iterator for the given tree and registers it with the tree
+ *	via BPlusTree::AddIterator().
+ *	The iterator starts without a position (fCurrentNodeOffset is
+ *	BPLUSTREE_NULL); Traverse() picks one on its first call.
+ *	"tree" is dereferenced without a check, so it must not be NULL.
+ */
+
+TreeIterator::TreeIterator(BPlusTree *tree)
+	:
+	fTree(tree),
+	fCurrentNodeOffset(BPLUSTREE_NULL),
+	fNext(NULL)
+{
+	tree->AddIterator(this);
+}
+
+
+/**	Unregisters the iterator from its tree - unless it has been detached
+ *	from the tree before (see Stop()), in which case there is nothing to do.
+ */
+
+TreeIterator::~TreeIterator()
+{
+	if (fTree == NULL)
+		return;
+
+	fTree->RemoveIterator(this);
+}
+
+
+/**	Positions the iterator in front of the first key (BPLUSTREE_BEGIN) or
+ *	behind the last key (every other value of "to", usually BPLUSTREE_END)
+ *	of the tree, so that the next Traverse() call returns that key.
+ *	For this, it descends from the root along the first values or the
+ *	overflow links respectively, until it reaches a leaf.
+ *	Returns B_OK on success, B_BAD_VALUE if the iterator has no usable
+ *	tree, and B_ERROR if the tree could not be walked.
+ */
+
+status_t
+TreeIterator::Goto(int8 to)
+{
+	if (fTree == NULL || fTree->fHeader == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	// lock access to stream
+	ReadLocked locked(fTree->fStream->Lock());
+
+	off_t nodeOffset = fTree->fHeader->root_node_pointer;
+	CachedNode cached(fTree);
+	bplustree_node *node;
+
+	while ((node = cached.SetTo(nodeOffset)) != NULL) {
+		// is the node a leaf node?
+		if (node->overflow_link == BPLUSTREE_NULL) {
+			fCurrentNodeOffset = nodeOffset;
+			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->all_key_count;
+			fDuplicateNode = BPLUSTREE_NULL;
+
+			return B_OK;
+		}
+
+		// get the next node offset depending on the direction (and if there
+		// are any keys in that node at all)
+		off_t nextOffset;
+		if (to == BPLUSTREE_END || node->all_key_count == 0)
+			nextOffset = node->overflow_link;
+		else {
+			// The value array has to end within the node. The check is done
+			// with offsets relative to the start of the node, so that no
+			// pointer needs to be squeezed into a 32 bit integer.
+			if (node->all_key_length > fTree->fNodeSize
+				|| (uint32)((uint8 *)node->Values() - (uint8 *)node)
+					+ sizeof(off_t) * node->all_key_count > (uint32)fTree->fNodeSize)
+				RETURN_ERROR(B_ERROR);
+
+			nextOffset = node->Values()[0];
+		}
+		// a node that points to itself would keep us here forever
+		if (nextOffset == nodeOffset)
+			break;
+
+		nodeOffset = nextOffset;
+	}
+	FATAL(("%s fails\n",__PRETTY_FUNCTION__));
+	RETURN_ERROR(B_ERROR);
+}
+
+
+/**	Iterates through the tree in the specified direction.
+ *	When it iterates through duplicates, the "key" is only updated for the
+ *	first entry - if you need to know when this happens, use the "duplicate"
+ *	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
+ *	the other duplicates.
+ *	That's not too nice, but saves the 256 bytes that would be needed to
+ *	store the last key - if this will ever become an issue, it will be
+ *	easy to change.
+ *	The other advantage of this is, that the queries can skip all duplicates
+ *	at once when they are not relevant to them.
+ *
+ *	"key" receives at most "maxLength" bytes of the key (string keys are
+ *	always null terminated, and shortened by one byte for this if needed),
+ *	"keyLength" the number of bytes copied, and "value" the value of the
+ *	entry. "duplicate" may be NULL, the other pointers may not.
+ *	Returns B_OK if an entry has been returned, B_ENTRY_NOT_FOUND if there
+ *	are no more entries in that direction, B_INTERRUPTED if the iterator
+ *	has been detached from its tree (see Stop()), B_BAD_DATA if the key
+ *	doesn't fit into its node, and B_ERROR if a node could not be read.
+ */
+
+status_t
+TreeIterator::Traverse(int8 direction,void *key,uint16 *keyLength,uint16 maxLength,off_t *value,uint16 *duplicate)
+{
+	if (fTree == NULL)
+		return B_INTERRUPTED;
+	if (fCurrentNodeOffset == BPLUSTREE_NULL
+		&& Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK) 
+		RETURN_ERROR(B_ERROR);
+
+	// if the tree was emptied since the last call
+	if (fCurrentNodeOffset == BPLUSTREE_FREE)
+		return B_ENTRY_NOT_FOUND;
+
+	// lock access to stream
+	ReadLocked locked(fTree->fStream->Lock());
+
+	CachedNode cached(fTree);
+	bplustree_node *node;
+
+	// are we in the middle of the duplicates of the current key?
+	if (fDuplicateNode != BPLUSTREE_NULL)
+	{
+		// regardless of traverse direction the duplicates are always presented in
+		// the same order; since they are all considered as equal, this shouldn't
+		// cause any problems
+
+		if (!fIsFragment || fDuplicate < fNumDuplicates)
+			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),false);
+		else
+			node = NULL;
+
+		if (node != NULL)
+		{
+			if (!fIsFragment && fDuplicate >= fNumDuplicates)
+			{
+				// if the node is out of duplicates, we go directly to the next one
+				fDuplicateNode = node->right_link;
+				if (fDuplicateNode != BPLUSTREE_NULL
+					&& (node = cached.SetTo(fDuplicateNode,false)) != NULL)
+				{
+					fNumDuplicates = node->CountDuplicates(fDuplicateNode,false);
+					fDuplicate = 0;
+				}
+			}
+			if (fDuplicate < fNumDuplicates)
+			{
+				*value = node->DuplicateAt(fDuplicateNode,fIsFragment,fDuplicate++);
+				if (duplicate)
+					*duplicate = 2;
+				return B_OK;
+			}
+		}
+		// no duplicates left, continue with the next key
+		fDuplicateNode = BPLUSTREE_NULL;
+	}
+
+	off_t savedNodeOffset = fCurrentNodeOffset;
+	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
+		RETURN_ERROR(B_ERROR);
+
+	if (duplicate)
+		*duplicate = 0;
+
+	fCurrentKey += direction;
+	
+	// is the current key in the current node?
+	while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->all_key_count)
+		   || (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0))
+	{
+		fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->right_link : node->left_link;
+
+		// are there any more nodes?
+		if (fCurrentNodeOffset != BPLUSTREE_NULL)
+		{
+			node = cached.SetTo(fCurrentNodeOffset);
+			if (!node)
+				RETURN_ERROR(B_ERROR);
+
+			// reset current key - when going backwards, that's the last key
+			// of the node (which is at index "all_key_count - 1"); if the
+			// node is empty, the loop simply moves on to the next one
+			fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->all_key_count - 1;
+		}
+		else
+		{
+			// there are no nodes left, so turn back to the last key
+			fCurrentNodeOffset = savedNodeOffset;
+			fCurrentKey = direction == BPLUSTREE_FORWARD ? node->all_key_count : -1;
+
+			return B_ENTRY_NOT_FOUND;
+		}
+	}
+
+	if (node->all_key_count == 0)
+		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
+
+	// make sure the key (and a value/key length pair) fits into the node
+	uint16 length;
+	uint8 *keyStart = node->KeyAt(fCurrentKey,&length);
+	if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize
+		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
+		fTree->fStream->GetVolume()->Panic();
+		RETURN_ERROR(B_BAD_DATA);
+	}
+
+	length = min_c(length,maxLength);
+	memcpy(key,keyStart,length);
+	
+	if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE)	// terminate string type
+	{
+		// the terminating null byte has to fit into the buffer as well
+		if (length == maxLength)
+			length--;
+		((char *)key)[length] = '\0';
+	}
+	*keyLength = length;
+
+	off_t offset = node->Values()[fCurrentKey];
+
+	// duplicate fragments?
+	uint8 type = bplustree_node::LinkType(offset);
+	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE)
+	{
+		// remember the link, the following calls will walk through the
+		// duplicates before they go on with the next key
+		fDuplicateNode = offset;
+
+		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),false);
+		if (node == NULL)
+			RETURN_ERROR(B_ERROR);
+
+		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
+
+		fNumDuplicates = node->CountDuplicates(offset,fIsFragment);
+		if (fNumDuplicates)
+		{
+			offset = node->DuplicateAt(offset,fIsFragment,0);
+			fDuplicate = 1;
+			if (duplicate)
+				*duplicate = 1;
+		}
+		else
+		{
+			// shouldn't happen, but we're dealing here with potentially corrupt disks...
+			fDuplicateNode = BPLUSTREE_NULL;
+			offset = 0;
+		}
+	}
+	*value = offset;
+
+	return B_OK;
+}
+
+
+/**	Moves the iterator to the position of the given key, regardless of
+ *	whether the key is actually part of the tree. It works much like
+ *	BPlusTree::Find(), but records a position instead of a value: the
+ *	iterator is put directly in front of the index FindKey() reports for
+ *	the leaf.
+ *	Returns the result of FindKey() for that leaf, B_INTERRUPTED if the
+ *	iterator has been detached from its tree (see Stop()), B_BAD_VALUE
+ *	for a NULL key or an invalid key length, and B_ERROR if the tree could
+ *	not be walked.
+ */
+
+status_t 
+TreeIterator::Find(const uint8 *key, uint16 keyLength)
+{
+	if (fTree == NULL)
+		return B_INTERRUPTED;
+	if (key == NULL
+		|| keyLength < BPLUSTREE_MIN_KEY_LENGTH
+		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	// lock access to stream
+	ReadLocked locked(fTree->fStream->Lock());
+
+	off_t currentOffset = fTree->fHeader->root_node_pointer;
+	CachedNode cached(fTree);
+
+	for (;;) {
+		bplustree_node *current = cached.SetTo(currentOffset);
+		if (current == NULL)
+			break;
+
+		uint16 index = 0;
+		off_t childOffset;
+		status_t status = fTree->FindKey(current,key,keyLength,&index,&childOffset);
+
+		// a node without overflow link is a leaf - the search ends here
+		if (current->overflow_link == BPLUSTREE_NULL) {
+			fDuplicateNode = BPLUSTREE_NULL;
+			fCurrentKey = index - 1;
+			fCurrentNodeOffset = currentOffset;
+
+			return status;
+		}
+
+		// an index node that points to itself would keep us here forever
+		if (childOffset == currentOffset)
+			RETURN_ERROR(B_ERROR);
+
+		currentOffset = childOffset;
+	}
+	RETURN_ERROR(B_ERROR);
+}
+
+
+/**	Makes the iterator forget about the duplicates of the current key:
+ *	since Traverse() only walks through duplicates while fDuplicateNode
+ *	is set, its next call continues with the next key.
+ */
+
+void 
+TreeIterator::SkipDuplicates()
+{
+	fDuplicateNode = BPLUSTREE_NULL;
+}
+
+
+/**	Adjusts the position of the iterator after the node at "offset" has
+ *	been changed; iterators that are located in other nodes are left alone.
+ *	If "nextOffset" is not BPLUSTREE_NULL, the iterator is moved to that
+ *	node, and positions at or behind "splitAt" are made relative to it.
+ *	"keyIndex" is the index of the changed key, and "change" is added to
+ *	the current position if the changed key is not located behind it.
+ */
+
+void 
+TreeIterator::Update(off_t offset,off_t nextOffset,uint16 keyIndex,uint16 splitAt,int8 change)
+{
+	// changes to other nodes don't concern this iterator
+	if (fCurrentNodeOffset != offset)
+		return;
+
+	const bool hasNextNode = nextOffset != BPLUSTREE_NULL;
+	if (hasNextNode) {
+		fCurrentNodeOffset = nextOffset;
+
+		if (fCurrentKey >= splitAt) {
+			fCurrentKey -= splitAt;
+			keyIndex -= splitAt;
+		}
+	}
+
+	// Keep fCurrentKey pointing to the same key as before.
+	// Note, that a key which is inserted at the current position
+	// won't be included in this tree transition.
+	if (fCurrentKey >= keyIndex)
+		fCurrentKey += change;
+
+	// ToDo: duplicate handling!
+}
+
+
+/**	Detaches the iterator from its tree. Afterwards, Traverse() and Find()
+ *	return B_INTERRUPTED, and the destructor no longer touches the tree.
+ */
+
+void 
+TreeIterator::Stop()
+{
+	fTree = NULL;
+}
+
+
+#ifdef DEBUG
+/**	Prints the state of the iterator using __out(); only available in
+ *	DEBUG builds.
+ *	fDuplicateNode is printed three times: as the offset extracted by
+ *	bplustree_node::FragmentOffset(), and as the raw link value in decimal
+ *	and hexadecimal notation.
+ */
+
+void 
+TreeIterator::Dump()
+{
+	__out("TreeIterator at %p:\n",this);
+	__out("\tfTree = %p\n",fTree);
+	__out("\tfCurrentNodeOffset = %Ld\n",fCurrentNodeOffset);
+	__out("\tfCurrentKey = %ld\n",fCurrentKey);
+	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n",bplustree_node::FragmentOffset(fDuplicateNode),fDuplicateNode,fDuplicateNode);
+	__out("\tfDuplicate = %u\n",fDuplicate);
+	__out("\tfNumDuplicates = %u\n",fNumDuplicates);
+	__out("\tfIsFragment = %s\n",fIsFragment ? "true" : "false");
+}
+#endif
+
+
+//	#pragma mark -
+
+
+/**	Resets the node to an empty one: it contains no keys, and none of
+ *	its three links points anywhere.
+ */
+
+void 
+bplustree_node::Initialize()
+{
+	all_key_count = 0;
+	all_key_length = 0;
+
+	overflow_link = BPLUSTREE_NULL;
+	right_link = BPLUSTREE_NULL;
+	left_link = BPLUSTREE_NULL;
+}
+
+
+/**	Returns a pointer to the key with the given index, and stores its
+ *	length in "keyLength".
+ *	The key length array contains the end offset of each key within the
+ *	key data, so a key starts where its predecessor ends.
+ *	Returns NULL (and leaves "keyLength" alone) if the index is negative
+ *	or greater than the number of keys; an index equal to the number of
+ *	keys is accepted, though.
+ */
+
+uint8 *
+bplustree_node::KeyAt(int32 index,uint16 *keyLength) const
+{
+	if (index < 0 || index > all_key_count)
+		return NULL;
+
+	uint16 *endOffsets = KeyLengths();
+
+	// the first key starts right at the beginning of the key data
+	uint16 start = 0;
+	if (index > 0)
+		start = endOffsets[index - 1];
+
+	*keyLength = endOffsets[index] - start;
+	return Keys() + start;
+}
+
+
+/**	Returns the number of values that are stored for a duplicate.
+ *	For a duplicate node, this is the content of its overflow_link field.
+ *	For a duplicate fragment, the lower 10 bits of "offset" select the
+ *	fragment within this node, and the count is the first off_t of it.
+ */
+
+uint8
+bplustree_node::CountDuplicates(off_t offset,bool isFragment) const
+{
+	if (!isFragment)
+		return overflow_link;
+
+	// the duplicate fragment handling is currently hard-coded to a node size
+	// of 1024 bytes - with future versions of BFS, this may be a problem
+
+	uint32 countIndex = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
+	off_t *slots = (off_t *)this;
+
+	return slots[countIndex];
+}
+
+
+/**	Returns the value with the given index of a duplicate.
+ *	For a duplicate fragment, the lower 10 bits of "offset" select the
+ *	fragment within this node; each fragment consists of a count and
+ *	NUM_FRAGMENT_VALUES values (the same layout CountDuplicates() relies
+ *	on). For a duplicate node, the array starts at the third off_t of the
+ *	node.
+ *	In both cases, the values directly follow the count; "index" is not
+ *	checked against it.
+ */
+
+off_t
+bplustree_node::DuplicateAt(off_t offset,bool isFragment,int8 index) const
+{
+	uint32 start;
+	if (isFragment)
+		start = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
+	else
+		start = 2;
+
+	// "start" is the position of the count, the values come right after it
+	return ((off_t *)this)[start + 1 + index];
+}
+
+
+/**	Despite its name, this function doesn't return the exact number of
+ *	fragments in use - it stops counting at two: it returns 0 if no
+ *	fragment is used, 1 if exactly one is used, and 2 if two or more
+ *	fragments are used.
+ *	"nodeSize" determines how many fragments the node can hold.
+ */
+
+int32
+bplustree_node::FragmentsUsed(uint32 nodeSize)
+{
+	const uint32 fragmentCount = nodeSize / ((NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
+	uint32 used = 0;
+
+	for (uint32 i = 0;i < fragmentCount;i++) {
+		// fragments without any values are unused
+		if (FragmentAt(i)->count <= 0)
+			continue;
+
+		// there is no need to look any further after the second one
+		if (++used > 1)
+			break;
+	}
+	return used;
+}
+
+
+//	#pragma mark -
+
+
+/**	Compares two keys of the given type.
+ *	Returns a negative value if the first key is smaller than the second
+ *	one, a positive value if it is greater, and 0 if they are equal.
+ *	If either key is NULL, -1 is returned; keys of an unknown type always
+ *	compare as equal.
+ *	The key lengths are only used for string keys.
+ */
+
+int32
+compareKeys(type_code type,const void *key1, int keyLength1, const void *key2, int keyLength2)
+{
+	// if one of the keys is NULL, bail out gracefully
+	if (key1 == NULL || key2 == NULL)
+		return -1;
+
+	switch (type)
+	{
+		case B_STRING_TYPE:
+		{
+			const char *string1 = (const char *)key1;
+			const char *string2 = (const char *)key2;
+
+			int common = min_c(keyLength1,keyLength2);
+			int result = strncmp(string1,string2,common);
+			if (result != 0)
+				return result;
+
+			// both strings end after the common part
+			if (string1[common] == '\0' && string2[common] == '\0')
+				return 0;
+
+			// otherwise, the shorter key is the smaller one
+			return keyLength1 - keyLength2;
+		}
+
+		case B_INT32_TYPE:
+		{
+			// NOTE(review): the difference overflows for values that are
+			// more than 2^31 apart - left untouched, since the key order
+			// depends on it; confirm before changing
+			const int32 first = *(int32 *)key1;
+			const int32 second = *(int32 *)key2;
+
+			return first - second;
+		}
+
+		case B_UINT32_TYPE:
+		{
+			const uint32 first = *(uint32 *)key1;
+			const uint32 second = *(uint32 *)key2;
+
+			if (first > second)
+				return 1;
+
+			return first == second ? 0 : -1;
+		}
+
+		case B_INT64_TYPE:
+		{
+			const int64 first = *(int64 *)key1;
+			const int64 second = *(int64 *)key2;
+
+			if (first > second)
+				return 1;
+
+			return first == second ? 0 : -1;
+		}
+
+		case B_UINT64_TYPE:
+		{
+			const uint64 first = *(uint64 *)key1;
+			const uint64 second = *(uint64 *)key2;
+
+			if (first > second)
+				return 1;
+
+			return first == second ? 0 : -1;
+		}
+
+		case B_FLOAT_TYPE:
+		{
+			// only the sign of the difference matters
+			float difference = *(float *)key1 - *(float *)key2;
+			if (difference == 0.0f)
+				return 0;
+
+			if (difference < 0.0f)
+				return -1;
+
+			return 1;
+		}
+
+		case B_DOUBLE_TYPE:
+		{
+			// only the sign of the difference matters
+			double difference = *(double *)key1 - *(double *)key2;
+			if (difference == 0.0)
+				return 0;
+
+			if (difference < 0.0)
+				return -1;
+
+			return 1;
+		}
+	}
+	return 0;
+}
+
+
diff --git a/src/add-ons/kernel/file_systems/bfs/BPlusTree.h b/src/add-ons/kernel/file_systems/bfs/BPlusTree.h
new file mode 100644
index 0000000000..402db41f84
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/BPlusTree.h
@@ -0,0 +1,436 @@
+#ifndef B_PLUS_TREE_H
+#define B_PLUS_TREE_H
+/* BPlusTree - BFS B+Tree implementation
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** Roughly based on 'btlib' written by Marcus J. Ranum
+** 
+** Copyright (c) 2001-2002 pinc Software. All Rights Reserved.
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include "bfs.h"
+#include "Journal.h"
+#include "Chain.h"
+
+
+//****************** on-disk structures ********************
+
+#define BPLUSTREE_NULL			-1LL
+#define BPLUSTREE_FREE			-2LL
+
+// Header of a B+tree as it is stored on disk (see "on-disk structures"
+// above) - the order and size of the fields must not be changed.
+struct bplustree_header {
+	// presumably BPLUSTREE_MAGIC for a valid tree - TODO confirm against SetTo()
+	uint32		magic;
+	// size of a single node in bytes; used by IsValidLink() as the distance
+	// between the last valid node offset and maximum_size
+	uint32		node_size;
+	uint32		max_number_of_levels;
+	// one of the bplustree_types values; checked by the typed
+	// BPlusTree::Insert() overloads
+	uint32		data_type;
+	off_t		root_node_pointer;
+	off_t		free_node_pointer;
+	// upper bound for node offsets (see IsValidLink())
+	off_t		maximum_size;
+	
+	// returns true if "link" is BPLUSTREE_NULL or lies within the tree
+	inline bool IsValidLink(off_t link);
+};
+
+#define BPLUSTREE_MAGIC 			0x69f6c2e8
+#define BPLUSTREE_NODE_SIZE 		1024
+#define BPLUSTREE_MAX_KEY_LENGTH	256
+#define BPLUSTREE_MIN_KEY_LENGTH	1
+
+// Key types of a tree; the typed BPlusTree::Insert() overloads compare
+// bplustree_header::data_type against these values and return B_BAD_TYPE
+// if the key type doesn't match.
+enum bplustree_types {
+	BPLUSTREE_STRING_TYPE	= 0,
+	BPLUSTREE_INT32_TYPE	= 1,
+	BPLUSTREE_UINT32_TYPE	= 2,
+	BPLUSTREE_INT64_TYPE	= 3,
+	BPLUSTREE_UINT64_TYPE	= 4,
+	BPLUSTREE_FLOAT_TYPE	= 5,
+	BPLUSTREE_DOUBLE_TYPE	= 6
+};
+
+struct sorted_array;
+typedef sorted_array duplicate_array;
+
+// A single node of the tree. The struct only describes the fixed header of
+// the node; the accessors below compute where the variable sized parts are
+// located behind it:
+//	- Keys() starts directly after this header,
+//	- KeyLengths() follows the keys (rounded up via round_up()),
+//	- Values() follows the all_key_count key length entries.
+struct bplustree_node {
+	off_t	left_link;
+	off_t	right_link;
+	// BPLUSTREE_NULL for leaf nodes (see IsLeaf())
+	off_t	overflow_link;
+	// number of keys stored in this node
+	uint16	all_key_count;
+	// number of bytes occupied by the keys (before rounding up)
+	uint16	all_key_length;
+	
+	inline uint16 *KeyLengths() const;
+	inline off_t *Values() const;
+	inline uint8 *Keys() const;
+	// number of bytes in use: header, keys, key lengths, and values
+	inline int32 Used() const;
+	uint8 *KeyAt(int32 index,uint16 *keyLength) const;
+	
+	inline bool IsLeaf() const;
+
+	void Initialize();
+	uint8 CountDuplicates(off_t offset,bool isFragment) const;
+	off_t DuplicateAt(off_t offset,bool isFragment,int8 index) const;
+	int32 FragmentsUsed(uint32 nodeSize);
+	// the node memory reinterpreted as an array of duplicate fragments, each
+	// (NUM_FRAGMENT_VALUES + 1) off_t values wide
+	inline duplicate_array *FragmentAt(int8 index);
+	// the duplicate array that starts at the overflow_link field
+	inline duplicate_array *DuplicateArray();
+
+	// Helpers for values that are links to duplicate nodes/fragments:
+	// the upper two bits contain the link type, the lowest 10 bits the
+	// fragment index, and the bits in between the offset (see MakeLink())
+	static inline uint8 LinkType(off_t link);
+	static inline off_t MakeLink(uint8 type, off_t link, uint32 fragmentIndex = 0);
+	static inline bool IsDuplicate(off_t link);
+	static inline off_t FragmentOffset(off_t link);
+	static inline uint32 FragmentIndex(off_t link);
+};
+
+//#define BPLUSTREE_NODE 0
+#define BPLUSTREE_DUPLICATE_NODE 2
+#define BPLUSTREE_DUPLICATE_FRAGMENT 3
+
+#define NUM_FRAGMENT_VALUES 7
+#define NUM_DUPLICATE_VALUES 125
+
+//**************************************
+
+// Arguments for the TreeIterator: the first pair is a direction passed to
+// TreeIterator::Traverse() (see GetNextEntry()/GetPreviousEntry()), the
+// second pair is a position passed to TreeIterator::Goto() (see Rewind()).
+// Note that BPLUSTREE_FORWARD and BPLUSTREE_END share the value 1.
+enum bplustree_traversing {
+	BPLUSTREE_FORWARD = 1,
+	BPLUSTREE_BACKWARD = -1,
+	
+	BPLUSTREE_BEGIN = 0,
+	BPLUSTREE_END = 1
+};
+
+
+//****************** in-memory structures ********************
+
+template<class T> class Stack;
+class BPlusTree;
+class TreeIterator;
+class CachedNode;
+class Inode;
+
+// needed for searching (utilizing a stack)
+struct node_and_key {
+	// offset of a node, and the index of a key within that node;
+	// BPlusTree::SeekDown() fills a Stack of these entries
+	off_t	nodeOffset;
+	uint16	keyIndex;
+};
+
+
+//***** Cache handling *****
+
+// Gives access to a single node (or the header) of a BPlusTree. The
+// destructor calls Unset(), so the node is let go automatically when the
+// object goes out of scope.
+class CachedNode {
+	public:
+		// creates an unset object; fBlockNumber is left uninitialized
+		// until a node has been set
+		CachedNode(BPlusTree *tree)
+			:
+			fTree(tree),
+			fNode(NULL),
+			fBlock(NULL)
+		{
+		}
+
+		// same as above, but directly tries to set the node at "offset";
+		// Node() returns NULL afterwards if that failed
+		CachedNode(BPlusTree *tree,off_t offset,bool check = true)
+			:
+			fTree(tree),
+			fNode(NULL),
+			fBlock(NULL)
+		{
+			SetTo(offset,check);
+		}
+
+		~CachedNode()
+		{
+			Unset();
+		}
+
+		// returns NULL if the tree is not set or "offset" is not a valid
+		// node position
+		bplustree_node *SetTo(off_t offset,bool check = true);
+		bplustree_header *SetToHeader();
+		// releases the current block, if any
+		void Unset();
+
+		status_t Free(Transaction *transaction, off_t offset);
+		status_t Allocate(Transaction *transaction,bplustree_node **node,off_t *offset);
+		status_t WriteBack(Transaction *transaction);
+
+		// the node that is currently set, or NULL
+		bplustree_node *Node() const { return fNode; }
+
+	protected:
+		bplustree_node	*InternalSetTo(off_t offset);
+
+		BPlusTree		*fTree;
+		bplustree_node	*fNode;
+		// the block that contains fNode; NULL if nothing is set
+		uint8			*fBlock;
+		// only valid while fBlock is set
+		off_t			fBlockNumber;
+};
+
+
+//******** B+tree class *********
+
+// The B+tree itself; it operates on the data of the Inode "stream".
+// TreeIterator and CachedNode are friends and access its private members
+// directly.
+class BPlusTree {
+	public:
+		// the versions with a Transaction presumably create a new, empty
+		// tree in the stream, the others open an existing one - TODO confirm
+		BPlusTree(Transaction *transaction,Inode *stream,int32 nodeSize = BPLUSTREE_NODE_SIZE);
+		BPlusTree(Inode *stream);
+		BPlusTree();
+		~BPlusTree();
+
+		status_t	SetTo(Transaction *transaction,Inode *stream,int32 nodeSize = BPLUSTREE_NODE_SIZE);
+		status_t	SetTo(Inode *stream);
+		status_t	SetStream(Inode *stream);
+
+		status_t	InitCheck();
+		status_t	Validate();
+
+		status_t	Remove(Transaction *transaction,const uint8 *key, uint16 keyLength, off_t value);
+		status_t	Insert(Transaction *transaction,const uint8 *key, uint16 keyLength, off_t value);
+
+		// typed convenience versions of Insert(); they return B_BAD_TYPE if
+		// the key type doesn't match the data_type of the tree's header
+		status_t	Insert(Transaction *transaction,const char *key, off_t value);
+		status_t	Insert(Transaction *transaction,int32 key, off_t value);
+		status_t	Insert(Transaction *transaction,uint32 key, off_t value);
+		status_t	Insert(Transaction *transaction,int64 key, off_t value);
+		status_t	Insert(Transaction *transaction,uint64 key, off_t value);
+		status_t	Insert(Transaction *transaction,float key, off_t value);
+		status_t	Insert(Transaction *transaction,double key, off_t value);
+
+		status_t	Replace(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value);
+		status_t	Find(const uint8 *key, uint16 keyLength, off_t *value);
+
+		static int32 TypeCodeToKeyType(type_code code);
+		static int32 ModeToKeyType(mode_t mode);
+
+	private:
+		int32		CompareKeys(const void *key1, int keylength1, const void *key2, int keylength2);
+		status_t	FindKey(bplustree_node *node, const uint8 *key, uint16 keyLength, uint16 *index = NULL, off_t *next = NULL);
+		status_t	SeekDown(Stack<node_and_key> &stack, const uint8 *key, uint16 keyLength);
+
+		status_t	FindFreeDuplicateFragment(bplustree_node *node, CachedNode *cached, off_t *_offset, bplustree_node **_fragment,uint32 *_index);
+		status_t	InsertDuplicate(Transaction *transaction,CachedNode *cached,bplustree_node *node,uint16 index,off_t value);
+		void		InsertKey(bplustree_node *node, uint16 index, uint8 *key, uint16 keyLength, off_t value);
+		status_t	SplitNode(bplustree_node *node, off_t nodeOffset, bplustree_node *other, off_t otherOffset, uint16 *_keyIndex, uint8 *key, uint16 *_keyLength, off_t *_value);
+
+		status_t	RemoveDuplicate(Transaction *transaction,bplustree_node *node,CachedNode *cached,uint16 keyIndex, off_t value);
+		void		RemoveKey(bplustree_node *node, uint16 index);
+
+		// management of the iterators that are attached to this tree
+		void		UpdateIterators(off_t offset,off_t nextOffset,uint16 keyIndex,uint16 splitAt,int8 change);
+		void		AddIterator(TreeIterator *iterator);
+		void		RemoveIterator(TreeIterator *iterator);
+
+	private:
+		friend TreeIterator;
+		friend CachedNode;
+
+		// the inode whose data contains the tree
+		Inode		*fStream;
+		bplustree_header *fHeader;
+		CachedNode	fCachedHeader;
+		int32		fNodeSize;
+		bool		fAllowDuplicates;
+		status_t	fStatus;
+		// presumably guards fIterators - TODO confirm in BPlusTree.cpp
+		SimpleLock	fIteratorLock;
+		Chain<TreeIterator> fIterators;
+};
+
+
+//***** helper classes/functions *****
+
+extern int32 compareKeys(type_code type,const void *key1, int keyLength1, const void *key2, int keyLength2);
+
+// Iterates over the entries of a BPlusTree. The iterators of a tree are
+// linked in a Chain (via fNext), and BPlusTree is a friend so that it can
+// call Update() and Stop() on them.
+class TreeIterator {
+	public:
+		TreeIterator(BPlusTree *tree);
+		~TreeIterator();
+
+		// "to" is BPLUSTREE_BEGIN or BPLUSTREE_END
+		status_t	Goto(int8 to);
+		// "direction" is BPLUSTREE_FORWARD or BPLUSTREE_BACKWARD
+		status_t	Traverse(int8 direction, void *key, uint16 *keyLength, uint16 maxLength, off_t *value,uint16 *duplicate = NULL);
+		status_t	Find(const uint8 *key, uint16 keyLength);
+
+		// shortcuts for Goto() and Traverse(), see the inline functions below
+		status_t	Rewind();
+		status_t	GetNextEntry(void *key,uint16 *keyLength,uint16 maxLength,off_t *value,uint16 *duplicate = NULL);
+		status_t	GetPreviousEntry(void *key,uint16 *keyLength,uint16 maxLength,off_t *value,uint16 *duplicate = NULL);
+		void		SkipDuplicates();
+
+#ifdef DEBUG
+		void		Dump();
+#endif
+
+	private:
+		BPlusTree	*fTree;
+
+		off_t		fCurrentNodeOffset;	// traverse position
+		int32		fCurrentKey;
+		off_t		fDuplicateNode;
+		uint16		fDuplicate, fNumDuplicates;
+		bool		fIsFragment;
+
+	private:
+		friend Chain<TreeIterator>;
+		friend BPlusTree;
+
+		void Update(off_t offset,off_t nextOffset,uint16 keyIndex,uint16 splitAt,int8 change);
+		void Stop();
+		// link to the next iterator, required by the Chain class
+		TreeIterator *fNext;
+};
+
+// BPlusTree's inline functions (most of them may not be needed)
+
+// Typed Insert() versions: each one makes sure that the tree really stores
+// keys of the given type, and then passes the raw bytes of the key on to the
+// generic Insert(). B_BAD_TYPE is returned if the types don't match.
+
+inline status_t
+BPlusTree::Insert(Transaction *transaction,const char *key,off_t value)
+{
+	// the key length is the string length without the terminating null byte
+	if (fHeader->data_type == BPLUSTREE_STRING_TYPE)
+		return Insert(transaction,(uint8 *)key, strlen(key), value);
+
+	return B_BAD_TYPE;
+}
+
+inline status_t
+BPlusTree::Insert(Transaction *transaction,int32 key, off_t value)
+{
+	if (fHeader->data_type == BPLUSTREE_INT32_TYPE)
+		return Insert(transaction,(uint8 *)&key, sizeof(key), value);
+
+	return B_BAD_TYPE;
+}
+
+inline status_t
+BPlusTree::Insert(Transaction *transaction,uint32 key, off_t value)
+{
+	if (fHeader->data_type == BPLUSTREE_UINT32_TYPE)
+		return Insert(transaction,(uint8 *)&key, sizeof(key), value);
+
+	return B_BAD_TYPE;
+}
+
+inline status_t
+BPlusTree::Insert(Transaction *transaction,int64 key, off_t value)
+{
+	if (fHeader->data_type == BPLUSTREE_INT64_TYPE)
+		return Insert(transaction,(uint8 *)&key, sizeof(key), value);
+
+	return B_BAD_TYPE;
+}
+
+inline status_t
+BPlusTree::Insert(Transaction *transaction,uint64 key, off_t value)
+{
+	if (fHeader->data_type == BPLUSTREE_UINT64_TYPE)
+		return Insert(transaction,(uint8 *)&key, sizeof(key), value);
+
+	return B_BAD_TYPE;
+}
+
+inline status_t
+BPlusTree::Insert(Transaction *transaction,float key, off_t value)
+{
+	if (fHeader->data_type == BPLUSTREE_FLOAT_TYPE)
+		return Insert(transaction,(uint8 *)&key, sizeof(key), value);
+
+	return B_BAD_TYPE;
+}
+
+inline status_t
+BPlusTree::Insert(Transaction *transaction,double key, off_t value)
+{
+	if (fHeader->data_type == BPLUSTREE_DOUBLE_TYPE)
+		return Insert(transaction,(uint8 *)&key, sizeof(key), value);
+
+	return B_BAD_TYPE;
+}
+
+
+/************************ TreeIterator inline functions ************************/
+//	#pragma mark -
+
+// Moves the iterator to the beginning of the tree; shortcut for
+// Goto(BPLUSTREE_BEGIN).
+inline status_t
+TreeIterator::Rewind()
+{
+	return Goto(BPLUSTREE_BEGIN);
+}
+
+// Shortcut for Traverse() in forward direction; all arguments are passed
+// on unchanged.
+inline status_t
+TreeIterator::GetNextEntry(void *key,uint16 *keyLength,uint16 maxLength,off_t *value,uint16 *duplicate)
+{
+	return Traverse(BPLUSTREE_FORWARD,key,keyLength,maxLength,value,duplicate);
+}
+
+// Shortcut for Traverse() in backward direction; all arguments are passed
+// on unchanged.
+inline status_t
+TreeIterator::GetPreviousEntry(void *key,uint16 *keyLength,uint16 maxLength,off_t *value,uint16 *duplicate)
+{
+	return Traverse(BPLUSTREE_BACKWARD,key,keyLength,maxLength,value,duplicate);
+}
+
+/************************ bplustree_header inline functions ************************/
+//	#pragma mark -
+
+
+// A link is valid if it is either the special BPLUSTREE_NULL value, or a
+// positive offset that leaves room for one node before maximum_size.
+inline bool
+bplustree_header::IsValidLink(off_t link)
+{
+	if (link == BPLUSTREE_NULL)
+		return true;
+
+	return link > 0 && link <= maximum_size - node_size;
+}
+
+
+/************************ bplustree_node inline functions ************************/
+//	#pragma mark -
+
+
+// The key length array starts behind the node header and the keys, at the
+// position computed by round_up().
+inline uint16 *
+bplustree_node::KeyLengths() const
+{
+	char *base = (char *)this;
+	return (uint16 *)(base + round_up(sizeof(bplustree_node) + all_key_length));
+}
+
+// The values directly follow the all_key_count entries of the key length
+// array.
+inline off_t *
+bplustree_node::Values() const
+{
+	uint16 *keyLengths = KeyLengths();
+	return (off_t *)(keyLengths + all_key_count);
+}
+
+// The key data starts directly behind the fixed node header.
+inline uint8 *
+bplustree_node::Keys() const
+{
+	return (uint8 *)this + sizeof(bplustree_node);
+}
+
+// Number of bytes that are in use in this node: the header plus the keys
+// (rounded up), and one key length and one value for every key.
+inline int32
+bplustree_node::Used() const
+{
+	int32 headerAndKeys = round_up(sizeof(bplustree_node) + all_key_length);
+	int32 perKey = sizeof(uint16) + sizeof(off_t);
+
+	return headerAndKeys + all_key_count * perKey;
+}
+
+// A node is regarded as a leaf if its overflow link is BPLUSTREE_NULL.
+inline bool 
+bplustree_node::IsLeaf() const
+{
+	return overflow_link == BPLUSTREE_NULL;
+}
+
+
+// Returns the duplicate fragment with the given index; the node memory is
+// treated as an array of fragments, each (NUM_FRAGMENT_VALUES + 1) off_t
+// values wide. No range check is done on "index".
+// NOTE(review): "index" is an int8, while FragmentsUsed() passes in an
+// int32 loop counter - confirm that the number of fragments per node can
+// never exceed 127.
+inline duplicate_array *
+bplustree_node::FragmentAt(int8 index)
+{
+	return (duplicate_array *)((off_t *)this + index * (NUM_FRAGMENT_VALUES + 1));
+}
+
+
+// Returns the duplicate array of this node, which starts at the position
+// of the overflow_link field.
+inline duplicate_array *
+bplustree_node::DuplicateArray()
+{
+	return (duplicate_array *)&this->overflow_link;
+}
+
+
+// The type of a link is stored in its two most significant bits.
+inline uint8
+bplustree_node::LinkType(off_t link)
+{
+	// shift the unsigned value, so that no sign bits are shifted in
+	uint64 bits = (uint64)link;
+	return bits >> 62;
+}
+
+// Builds a link out of its three parts: the type goes into the upper two
+// bits, the fragment index into the lowest 10 bits, and the offset into the
+// bits in between (its lowest 10 bits are dropped).
+inline off_t
+bplustree_node::MakeLink(uint8 type,off_t link,uint32 fragmentIndex)
+{
+	off_t typeBits = (off_t)type << 62;
+	off_t offsetBits = link & 0x3ffffffffffffc00LL;
+	off_t indexBits = fragmentIndex & 0x3ff;
+
+	return typeBits | offsetBits | indexBits;
+}
+
+// Returns true if the link type has any of the bits of the duplicate
+// node/fragment types set.
+inline bool 
+bplustree_node::IsDuplicate(off_t link)
+{
+	const int duplicateTypes = BPLUSTREE_DUPLICATE_NODE | BPLUSTREE_DUPLICATE_FRAGMENT;
+	return (LinkType(link) & duplicateTypes) != 0;
+}
+
+// Returns the offset part of a link: the two type bits at the top and the
+// 10 fragment index bits at the bottom are masked out.
+inline off_t
+bplustree_node::FragmentOffset(off_t link)
+{
+	return link & 0x3ffffffffffffc00LL;
+}
+
+// Returns the fragment index of a link, which is stored in its lowest
+// 10 bits.
+inline uint32
+bplustree_node::FragmentIndex(off_t link)
+{
+	return (uint32)(link & 0x3ff);
+}
+
+#endif	/* B_PLUS_TREE_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/BlockAllocator.cpp b/src/add-ons/kernel/file_systems/bfs/BlockAllocator.cpp
new file mode 100644
index 0000000000..027f2fb1d5
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/BlockAllocator.cpp
@@ -0,0 +1,599 @@
+/* BlockAllocator - block bitmap handling and allocation policies
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include "cpp.h"
+#include "Debug.h"
+#include "BlockAllocator.h"
+#include "Volume.h"
+#include "Inode.h"
+
+#ifdef USER
+#	define spawn_kernel_thread spawn_thread
+#endif
+
+// Things the BlockAllocator should do:
+
+// - find a range of blocks of a certain size nearby a specific position
+// - allocate an unsharp (not exactly sized) range of blocks for pre-allocation
+// - free blocks
+// - know how to deal with each allocation, special handling for directories,
+//   files, symlinks, etc. (type sensitive allocation policies)
+
+// What makes the code complicated is the fact that we are not just reading
+// in the whole bitmap and operate on that in memory - e.g. a 13 GB partition
+// with a block size of 2048 bytes already has a 800kB bitmap, and the size
+// of partitions will grow even more - so that's not an option.
+// Instead we are reading in every block when it's used - since an allocation
+// group can span several blocks in the block bitmap, the AllocationBlock
+// class is there to make handling those easier.
+
+// The current implementation is very basic and will be heavily optimized
+// in the future.
+// Furthermore, the allocation policies used here (when they will be in place)
+// should have some real world tests.
+
+
+// A single block of the block bitmap, which belongs to an allocation group.
+// All bit positions are relative to the beginning of this block.
+class AllocationBlock : public CachedBlock {
+	public:
+		AllocationBlock(Volume *volume);
+		
+		// mark "numBlocks" bits beginning with "start" as used/free; the
+		// default of 0xffff stands for all bits from "start" to the end of
+		// this block
+		void Allocate(uint16 start,uint16 numBlocks = 0xffff);
+		void Free(uint16 start,uint16 numBlocks = 0xffff);
+		inline bool IsUsed(uint16 block);
+
+		// selects the bitmap block "block" of the specified group
+		status_t SetTo(AllocationGroup &group,uint16 block);
+
+		// number of bits in this block, as computed by SetTo()
+		int32 NumBlockBits() const { return fNumBits; }
+
+	private:
+		int32 fNumBits;
+};
+
+
+// In-memory information about one allocation group; the members are public
+// and are filled in by BlockAllocator::initialize().
+class AllocationGroup {
+	public:
+		AllocationGroup();
+
+		// records a range of free bits that was found while scanning the bitmap
+		void AddFreeRange(int32 start,int32 blocks);
+		bool IsFull() const { return fFreeBits == 0; }
+
+		// number of bits (blocks) in this group
+		int32 fNumBits;
+		// number of the first block of this group's part of the block bitmap
+		int32 fStart;
+		// first free bit, size and start of the largest free range found by
+		// the initial scan (-1 if there is none); the allocator notes that
+		// fLargest is not maintained after the first allocation
+		int32 fFirstFree,fLargest,fLargestFirst;
+		// number of free bits in this group
+		int32 fFreeBits;
+};
+
+
+// Only passes the volume on to CachedBlock; fNumBits is not valid before
+// SetTo() has been called.
+AllocationBlock::AllocationBlock(Volume *volume)
+	: CachedBlock(volume)
+{
+}
+
+
+/**	Selects the bitmap block with the index "block" of the specified
+ *	allocation group, and computes how many bits of it are in use.
+ *	Returns B_OK on success, and B_ERROR if the block could not be retrieved.
+ */
+
+status_t 
+AllocationBlock::SetTo(AllocationGroup &group, uint16 block)
+{
+	// 8 blocks per byte
+	int32 bitsPerBlock = fVolume->BlockSize() << 3;
+	fNumBits = bitsPerBlock;
+
+	// Only the last bitmap block of a group may contain fewer bits than the
+	// others - all blocks before it are always completely used.
+	// (the group size alone must not be used to decide this, or else every
+	// block of such a group would be shortened)
+	int32 remaining = group.fNumBits - block * bitsPerBlock;
+	if (remaining > 0 && remaining < bitsPerBlock)
+		fNumBits = remaining;
+
+	return CachedBlock::SetTo(group.fStart + block) != NULL ? B_OK : B_ERROR;
+}
+
+
+/**	Returns whether or not the bit "block" is set in this bitmap block.
+ *	Positions beyond the end of the block are reported as used, so that
+ *	they are never handed out.
+ */
+
+bool 
+AllocationBlock::IsUsed(uint16 block)
+{
+	// valid positions are 0 to fNumBits - 1
+	if (block >= fNumBits)
+		return true;
+
+	// the bitmap is accessed in 32 bit words
+	return ((uint32 *)fBlock)[block >> 5] & (1UL << (block % 32));
+}
+
+
+/**	Marks "numBlocks" bits beginning with "start" as used. If "numBlocks"
+ *	is 0xffff, all bits from "start" up to the end of the block are set.
+ *	A range that exceeds the block is reported and clipped.
+ */
+
+void
+AllocationBlock::Allocate(uint16 start,uint16 numBlocks)
+{
+	start = start % fNumBits;
+
+	if (numBlocks != 0xffff && start + numBlocks > fNumBits) {
+		FATAL(("should allocate more blocks than there are in a block!\n"));
+		numBlocks = fNumBits - start;
+	} else if (numBlocks == 0xffff) {
+		// allocate all blocks after "start"
+		numBlocks = fNumBits - start;
+	}
+
+	// the bitmap is updated one 32 bit word at a time
+	uint32 *bitmap = (uint32 *)fBlock;
+	int32 word = start >> 5;
+	int32 bit = start % 32;
+
+	while (numBlocks > 0) {
+		// collect all bits that have to be set in the current word
+		uint32 mask = 0;
+		while (bit < 32 && numBlocks != 0) {
+			mask |= 1UL << bit;
+			bit++;
+			numBlocks--;
+		}
+
+		bitmap[word++] |= mask;
+
+		// all following words are filled from their first bit on
+		bit = 0;
+	}
+}
+
+
+/**	Marks "numBlocks" bits beginning with "start" as free. If "numBlocks"
+ *	is 0xffff, all bits from "start" up to the end of the block are cleared.
+ *	A range that exceeds the block is reported and clipped.
+ */
+
+void
+AllocationBlock::Free(uint16 start,uint16 numBlocks)
+{
+	start = start % fNumBits;
+
+	if (numBlocks != 0xffff && start + numBlocks > fNumBits) {
+		FATAL(("should free more blocks than there are in a block!\n"));
+		numBlocks = fNumBits - start;
+	} else if (numBlocks == 0xffff) {
+		// free all blocks after "start"
+		numBlocks = fNumBits - start;
+	}
+
+	// the bitmap is updated one 32 bit word at a time
+	uint32 *bitmap = (uint32 *)fBlock;
+	int32 word = start >> 5;
+	int32 bit = start % 32;
+
+	while (numBlocks > 0) {
+		// collect all bits that have to be cleared in the current word
+		uint32 mask = 0;
+		while (bit < 32 && numBlocks != 0) {
+			mask |= 1UL << bit;
+			bit++;
+			numBlocks--;
+		}
+
+		bitmap[word++] &= ~mask;
+
+		// all following words are handled from their first bit on
+		bit = 0;
+	}
+}
+
+
+//	#pragma mark -
+
+
+// Creates an empty group: it contains no bits at all, and therefore no
+// free ranges either.
+// fNumBits and fStart are initialized as well, because the allocator reads
+// them for every group, even if the bitmap scan in
+// BlockAllocator::initialize() stopped early after a read error.
+AllocationGroup::AllocationGroup()
+	:
+	fNumBits(0),
+	fStart(0),
+	fFirstFree(-1),
+	fLargest(-1),
+	fLargestFirst(-1),
+	fFreeBits(0)
+{
+}
+
+
+/**	Adds a range of "blocks" free bits starting at "start" to the statistics
+ *	of this group: the number of free bits, the largest free range, and the
+ *	first free bit (which is set by the first range that is added only).
+ */
+
+void 
+AllocationGroup::AddFreeRange(int32 start, int32 blocks)
+{
+	D(if (blocks > 512)
+		PRINT(("range of %ld blocks starting at %ld\n",blocks,start)));
+
+	fFreeBits += blocks;
+
+	// remember the largest range seen so far
+	if (blocks > fLargest) {
+		fLargestFirst = start;
+		fLargest = blocks;
+	}
+
+	if (fFirstFree == -1)
+		fFirstFree = start;
+}
+
+
+//	#pragma mark -
+
+
+// Only stores the volume; the group array is allocated in Initialize().
+// fNumGroups and fBlocksPerGroup are not set here.
+BlockAllocator::BlockAllocator(Volume *volume)
+	:
+	fVolume(volume),
+	fGroups(NULL)
+{
+}
+
+
+// Frees the group array allocated by Initialize().
+// NOTE(review): Initialize() may have started a thread that still accesses
+// fGroups - there is no visible synchronization with it here; confirm
+// that the allocator cannot be deleted while that thread is running.
+BlockAllocator::~BlockAllocator()
+{
+	delete[] fGroups;
+}
+
+
+/**	Allocates the allocation group array, and starts scanning the block
+ *	bitmap. The scan is done in its own thread if possible; if the thread
+ *	could not be created, the bitmap is scanned directly.
+ *	Returns B_ERROR if the lock is not usable, and B_NO_MEMORY if the
+ *	group array could not be allocated.
+ */
+
+status_t 
+BlockAllocator::Initialize()
+{
+	if (fLock.InitCheck() < B_OK)
+		return B_ERROR;
+
+	fNumGroups = fVolume->AllocationGroups();
+	fBlocksPerGroup = fVolume->SuperBlock().blocks_per_ag;
+
+	fGroups = new AllocationGroup[fNumGroups];
+	if (fGroups == NULL)
+		return B_NO_MEMORY;
+
+	thread_id id = spawn_kernel_thread((thread_func)BlockAllocator::initialize,"bfs block allocator",B_LOW_PRIORITY,(void *)this);
+	if (id >= B_OK)
+		return resume_thread(id);
+
+	// there is no thread available, so the work has to be done right here
+	return initialize(this);
+}
+
+
+/**	Scans the whole block bitmap and fills in the allocation group array
+ *	of the specified allocator (size, start, and free ranges of each group).
+ *	The allocator's lock is held during the scan.
+ *	If the number of used blocks found differs from the one stored in the
+ *	super block, the super block value is corrected in memory.
+ *	Returns B_NO_MEMORY if the bitmap buffer could not be allocated; a read
+ *	error only ends the scan early, B_OK is returned nevertheless.
+ */
+
+status_t 
+BlockAllocator::initialize(BlockAllocator *allocator)
+{
+	Locker lock(allocator->fLock);
+
+	Volume *volume = allocator->fVolume;
+	uint32 blocks = allocator->fBlocksPerGroup;
+	uint32 numBits = 8 * blocks * volume->BlockSize();
+	off_t freeBlocks = 0;
+
+	// the buffer holds the bitmap of one complete allocation group
+	uint32 *buffer = (uint32 *)malloc(numBits >> 3);
+	if (buffer == NULL)
+		RETURN_ERROR(B_NO_MEMORY);
+
+	AllocationGroup *groups = allocator->fGroups;
+	off_t offset = 1;
+		// the first bitmap block read is block 1
+	int32 num = allocator->fNumGroups;
+
+	for (int32 i = 0;i < num;i++) {
+		if (cached_read(volume->Device(),offset,buffer,blocks,volume->BlockSize()) < B_OK)
+			break;
+
+		// the last allocation group may contain less blocks than the others
+		groups[i].fNumBits = i == num - 1 ? allocator->fVolume->NumBlocks() - i * numBits : numBits;
+		groups[i].fStart = offset;
+
+		// finds all free ranges in this allocation group
+		int32 start = 0,range = 0;
+		int32 size = groups[i].fNumBits,bit = 0;
+
+		// every iteration of the outer loop handles one 32 bit word of the
+		// bitmap, so there are (size / 32, rounded up) words to look at
+		for (int32 k = 0;k < ((size + 31) >> 5);k++) {
+			for (int32 j = 0;j < 32 && bit < size;j++,bit++) {
+				if (buffer[k] & (1UL << j)) {
+					// a used block ends the current free range, if any
+					if (range > 0) {
+						groups[i].AddFreeRange(start,range);
+						range = 0;
+					}
+				} else if (range++ == 0)
+					start = bit;
+			}
+		}
+		// the group may end with a free range
+		if (range)
+			groups[i].AddFreeRange(start,range);
+
+		freeBlocks += groups[i].fFreeBits;
+
+		offset += blocks;
+	}
+	free(buffer);
+
+	off_t usedBlocks = volume->NumBlocks() - freeBlocks;
+	if (volume->UsedBlocks() != usedBlocks) {
+		// If the disk is in a dirty state at mount time, it's
+		// normal that the values don't match
+		INFORM(("volume reports %Ld used blocks, correct is %Ld\n",volume->UsedBlocks(),usedBlocks));
+		volume->SuperBlock().used_blocks = usedBlocks;
+	}
+
+	return B_OK;
+}
+
+
+/**	Tries to allocate between "minimum" and "maximum" blocks, and stores the
+ *	result in "run". The search starts at bit "start" of the allocation
+ *	group "group", and continues with the following groups (wrapping around
+ *	at the last one).
+ *	All groups are scanned twice: the first pass only accepts a range of
+ *	"maximum" blocks, the second pass is satisfied with "minimum" blocks.
+ *	Returns B_DEVICE_FULL if no suitable range could be found.
+ */
+
+status_t
+BlockAllocator::AllocateBlocks(Transaction *transaction,int32 group,uint16 start,uint16 maximum,uint16 minimum, block_run &run)
+{
+	AllocationBlock cached(fVolume);
+	Locker lock(fLock);
+
+	// the first scan through all allocation groups will look for the
+	// wanted maximum of blocks, the second scan will just look to
+	// satisfy the minimal requirement
+	uint16 numBlocks = maximum;
+
+	for (int32 i = 0;i < fNumGroups * 2;i++,group++,start = 0) {
+		group = group % fNumGroups;
+
+		if (start >= fGroups[group].fNumBits || fGroups[group].IsFull())
+			continue;
+
+		if (i >= fNumGroups) {
+			// if the minimum is the same as the maximum, it's not necessary to
+			// search the allocation groups a second time
+			if (maximum == minimum)
+				return B_DEVICE_FULL;
+
+			numBlocks = minimum;
+		}
+
+		// The wanted maximum is smaller than the largest free block in the group
+		// or already smaller than the minimum
+		// ToDo: disabled because it's currently not maintained after the first allocation
+		//if (numBlocks > fGroups[group].fLargest)
+		//	continue;
+
+		if (start < fGroups[group].fFirstFree)
+			start = fGroups[group].fFirstFree;
+
+		// there may be more than one block per allocation group - and
+		// we iterate through it to find a place for the allocation.
+		// (one allocation can't exceed one allocation group)
+
+		uint32 block = start / (fVolume->BlockSize() << 3);
+		int32 range = 0, rangeStart = 0,rangeBlock = 0;
+
+		for (;block < fBlocksPerGroup;block++) {
+			if (cached.SetTo(fGroups[group],block) < B_OK)
+				RETURN_ERROR(B_ERROR);
+
+			// find a block large enough to hold the allocation
+			for (int32 bit = start % cached.NumBlockBits();bit < cached.NumBlockBits();bit++) {
+				if (!cached.IsUsed(bit)) {
+					if (range == 0) {
+						// start new range
+						rangeStart = block * cached.NumBlockBits() + bit;
+						rangeBlock = block;
+					}
+
+					// have we found a range large enough to hold numBlocks?
+					if (++range >= maximum)
+						break;
+				} else if (i >= fNumGroups && range >= minimum) {
+					// we have found a block larger than the required minimum (second pass)
+					break;
+				} else {
+					// end of a range
+					range = 0;
+				}
+			}
+
+			// if we found a suitable block, mark the blocks as in use, and write
+			// the updated block bitmap back to disk
+			if (range >= numBlocks) {
+				// adjust allocation size
+				if (numBlocks < maximum)
+					numBlocks = range;
+
+				// Update the allocation group info
+				// Note, the fFirstFree block doesn't have to be really free
+				if (rangeStart == fGroups[group].fFirstFree)
+					fGroups[group].fFirstFree = rangeStart + numBlocks;
+				fGroups[group].fFreeBits -= numBlocks;
+
+				if (block != rangeBlock) {
+					// the range started in a previous bitmap block
+					// NOTE(review): only the directly preceding block is
+					// updated below - confirm that a range can never span
+					// more than two bitmap blocks
+
+					// allocate the part that's in the current block
+					cached.Allocate(0,(rangeStart + numBlocks) % cached.NumBlockBits());
+					if (cached.WriteBack(transaction) < B_OK)
+						RETURN_ERROR(B_ERROR);
+
+					// set the blocks in the previous block - this must only
+					// be done if that block could be retrieved
+					if (cached.SetTo(fGroups[group],block - 1) < B_OK)
+						RETURN_ERROR(B_ERROR);
+
+					cached.Allocate(rangeStart);
+				} else {
+					// just allocate the bits in the current block
+					cached.Allocate(rangeStart,numBlocks);
+				}
+				run.allocation_group = group;
+				run.start = rangeStart;
+				run.length = numBlocks;
+
+				fVolume->SuperBlock().used_blocks += numBlocks;
+					// We are not writing back the disk's super block - it's
+					// either done by the journaling code, or when the disk
+					// is unmounted.
+					// If the value is not correct at mount time, it will be
+					// fixed anyway.
+
+				return cached.WriteBack(transaction);
+			}
+
+			// start from the beginning of the next block
+			start = 0;
+		}
+	}
+	return B_DEVICE_FULL;
+}
+
+
+/**	Allocates the single block for a new inode of the given "type" that is
+ *	created in the directory with the block_run "parent".
+ */
+
+status_t 
+BlockAllocator::AllocateForInode(Transaction *transaction,const block_run *parent, mode_t type, block_run &run)
+{
+	// The allocation policies applied here are the ones described in
+	// Dominic Giampaolo's "Practical File System Design" - AllocateBlocks()
+	// will break them if necessary; we'll have to see how good they work.
+
+	// only real directories qualify, not index or attribute directories
+	const bool isDirectory = (type & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR)) == S_DIRECTORY;
+
+	// files are put into the allocation group of their parent, while
+	// sub-directories are placed 8 allocation groups after it
+	uint16 group = parent->allocation_group;
+	if (isDirectory)
+		group += 8;
+
+	return AllocateBlocks(transaction,group,0,1,1,run);
+}
+
+
+/**	Allocates up to "numBlocks" (but at least "minimum") blocks for the
+ *	data of the specified inode, and stores the result in "run".
+ *	This method only chooses the allocation group and the position where
+ *	the search should start - the actual work is done by AllocateBlocks().
+ *	Returns B_ERROR if "numBlocks" is not positive.
+ */
+
+status_t 
+BlockAllocator::Allocate(Transaction *transaction,const Inode *inode, off_t numBlocks, block_run &run, uint16 minimum)
+{
+	if (numBlocks <= 0)
+		return B_ERROR;
+
+	// one block_run can't hold more data than it is in one allocation group
+	// (the size of the first group is used as the limit)
+	if (numBlocks > fGroups[0].fNumBits)
+		numBlocks = fGroups[0].fNumBits;
+
+	// apply some allocation policies here (AllocateBlocks() will break them
+	// if necessary)
+	uint16 group = inode->BlockRun().allocation_group;
+	uint16 start = 0;
+
+	// are there already allocated blocks? (then just allocate near the last)
+	if (inode->Size() > 0) {
+		data_stream *data = &inode->Node()->data;
+		// we currently don't care for when the data stream is
+		// already grown into the indirect ranges
+		if (data->max_double_indirect_range == 0
+			&& data->max_indirect_range == 0) {
+			// find the last direct run in use, ie. the one in front of the
+			// first empty run (or the last one if all of them are in use)
+			int32 last = 0;
+			for (;last < NUM_DIRECT_BLOCKS - 1;last++)
+				if (data->direct[last + 1].IsZero())
+					break;
+			
+			// continue directly after that run
+			group = data->direct[last].allocation_group;
+			start = data->direct[last].start + data->direct[last].length;
+		}
+	} else if (inode->IsDirectory()) {
+		// directory data will go in the same allocation group as the inode is in
+		// but after the inode data
+		start = inode->BlockRun().start;
+	} else {
+		// file data will start in the next allocation group
+		group = inode->BlockRun().allocation_group + 1;
+	}
+
+	// NOTE(review): numBlocks is an off_t, but is passed as an uint16 here -
+	// confirm that an allocation group never has more than 65535 blocks
+	return AllocateBlocks(transaction,group,start,numBlocks,minimum,run);
+}
+
+
+/**	Frees the blocks of the specified block_run in the block bitmap, and
+ *	updates the allocation group info and the used blocks count of the
+ *	super block accordingly.
+ *	Returns B_BAD_VALUE if the run is not valid or lies in the reserved
+ *	area at the beginning of the volume, and B_IO_ERROR if the bitmap could
+ *	not be read or written.
+ */
+
+status_t 
+BlockAllocator::Free(Transaction *transaction,block_run &run)
+{
+	Locker lock(fLock);
+
+	int32 group = run.allocation_group;
+	uint16 start = run.start;
+	uint16 length = run.length;
+
+	// doesn't use Volume::IsValidBlockRun() here because it can check better
+	// against the group size (the last group may have a different length)
+	if (group < 0 || group >= fNumGroups
+		|| start > fGroups[group].fNumBits
+		|| start + length > fGroups[group].fNumBits
+		|| length == 0) {
+		FATAL(("someone tried to free an invalid block_run (%ld, %u, %u)\n",group,start,length));
+		return B_BAD_VALUE;
+	}
+	// check if someone tries to free reserved areas at the beginning of the drive
+	if (group == 0 && start < fVolume->Log().start + fVolume->Log().length) {
+		FATAL(("someone tried to free a reserved block_run (%ld, %u, %u)\n",group,start,length));
+		return B_BAD_VALUE;
+	}
+#ifdef DEBUG	
+	if (CheckBlockRun(run) < B_OK)
+		return B_BAD_DATA;
+#endif
+
+	AllocationBlock cached(fVolume);
+
+	uint32 bitsPerBlock = fVolume->BlockSize() << 3;
+	uint32 block = run.start / bitsPerBlock;
+
+	// the group info is maintained relative to the start of the group
+	if (fGroups[group].fFirstFree > start)
+		fGroups[group].fFirstFree = start;
+	fGroups[group].fFreeBits += length;
+
+	// From here on, "start" is relative to the current bitmap block -
+	// otherwise, the length computation below would be wrong for every run
+	// that doesn't begin in the first bitmap block of the group
+	start = start % bitsPerBlock;
+
+	for (;block < fBlocksPerGroup;block++) {
+		if (cached.SetTo(fGroups[group],block) < B_OK)
+			RETURN_ERROR(B_IO_ERROR);
+
+		// only free the part of the run that lies within the current block
+		uint16 freeLength = length;
+		if (start + length > cached.NumBlockBits())
+			freeLength = cached.NumBlockBits() - start;
+
+		cached.Free(start,freeLength);
+
+		if (cached.WriteBack(transaction) < B_OK)
+			return B_IO_ERROR;
+
+		length -= freeLength;
+		if (length <= 0)
+			break;
+
+		// the rest of the run begins at the start of the next block
+		start = 0;
+	}
+
+	fVolume->SuperBlock().used_blocks -= run.length;
+	return B_OK;
+}
+
+#ifdef DEBUG
+#include "BPlusTree.h"
+
+/**	Checks if all blocks of the specified block_run are marked as used in
+ *	the block bitmap. If one of them is not, the volume's Panic() method is
+ *	called and B_BAD_DATA is returned; B_IO_ERROR is returned if a bitmap
+ *	block could not be retrieved.
+ */
+
+status_t
+BlockAllocator::CheckBlockRun(block_run run)
+{
+	uint32 block = run.start / (fVolume->BlockSize() << 3);
+	uint32 bit = run.start;
+		// position within the current bitmap block
+	uint32 checked = 0;
+		// number of blocks of the run that have been checked already
+
+	AllocationBlock cached(fVolume);
+
+	// The position and the number of checked blocks have to be maintained
+	// separately: when the run continues in the next bitmap block, the
+	// position starts at zero again, while the count goes on
+	for (;block < fBlocksPerGroup && checked < run.length;block++) {
+		if (cached.SetTo(fGroups[run.allocation_group],block) < B_OK)
+			RETURN_ERROR(B_IO_ERROR);
+
+		bit = bit % cached.NumBlockBits();
+		while (checked < run.length && bit < cached.NumBlockBits()) {
+			if (!cached.IsUsed(bit)) {
+				PRINT(("block_run(%ld,%u,%u) is only partially allocated!\n",run.allocation_group,run.start,run.length));
+				fVolume->Panic();
+				return B_BAD_DATA;
+			}
+			bit++;
+			checked++;
+		}
+		bit = 0;
+	}
+	return B_OK;
+}
+
+
+/**	Checks the allocation of the inode's own block_run, and of all runs in
+ *	the direct range of its data stream. Returns the first error that
+ *	CheckBlockRun() reports.
+ */
+
+status_t
+BlockAllocator::CheckInode(Inode *inode)
+{
+	status_t status = CheckBlockRun(inode->BlockRun());
+
+	// only checks the direct range for now...
+
+	data_stream *data = &inode->Node()->data;
+	int32 index = 0;
+
+	// the first empty run marks the end of the runs that are in use
+	while (status >= B_OK && index < NUM_DIRECT_BLOCKS
+		&& !data->direct[index].IsZero()) {
+		status = CheckBlockRun(data->direct[index]);
+		index++;
+	}
+
+	return status < B_OK ? status : B_OK;
+}
+
+
+/**	Checks the allocations of all inodes that are listed in the directory
+ *	"inode" by iterating over the directory's B+tree. Sub-directories are
+ *	not descended into.
+ *	Returns B_BAD_VALUE if "inode" is NULL or not a directory, or the first
+ *	error that CheckInode() reports.
+ */
+
+status_t
+BlockAllocator::Check(Inode *inode)
+{
+	if (!inode || !inode->IsDirectory())
+		return B_BAD_VALUE;
+
+	BPlusTree *tree;
+	status_t status = inode->GetTree(&tree);
+	if (status < B_OK)
+		return status;
+
+	TreeIterator iterator(tree);
+	char key[BPLUSTREE_MAX_KEY_LENGTH];
+	uint16 length;
+	off_t offset;
+	while (iterator.GetNextEntry(key,&length,BPLUSTREE_MAX_KEY_LENGTH,&offset) == B_OK) {
+		// the value of a directory entry is passed to Vnode to get its inode
+		Vnode vnode(fVolume,offset);
+		Inode *entry;
+		if (vnode.Get(&entry) < B_OK) {
+			// entries that cannot be retrieved are reported and skipped
+			FATAL(("could not get inode in tree at: %Ld\n",offset));
+			continue;
+		}
+		block_run run = entry->BlockRun();
+		// NOTE(review): "key" is printed with %s - confirm that the iterator
+		// null-terminates the key it copies into the buffer
+		PRINT(("check allocations of inode \"%s\" (%ld,%u,%u)\n",key,run.allocation_group,run.start,run.length));
+		status = CheckInode(entry);
+		if (status < B_OK)
+			return status;
+	}
+	return B_OK;
+}
+#endif	/* DEBUG */
diff --git a/src/add-ons/kernel/file_systems/bfs/BlockAllocator.h b/src/add-ons/kernel/file_systems/bfs/BlockAllocator.h
new file mode 100644
index 0000000000..8f2ea0ba26
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/BlockAllocator.h
@@ -0,0 +1,49 @@
+#ifndef BLOCK_ALLOCATOR_H
+#define BLOCK_ALLOCATOR_H
+/* BlockAllocator - block bitmap handling and allocation policies
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include <Lock.h>
+
+
+class AllocationGroup;
+class Transaction;
+class Volume;
+class Inode;
+struct disk_super_block;
+struct block_run;
+
+
+// Manages the block bitmap of a volume: it keeps track of the free space
+// in each allocation group, and allocates and frees block_runs.
+class BlockAllocator {
+	public:
+		BlockAllocator(Volume *volume);
+		~BlockAllocator();
+
+		// allocates the group array and starts scanning the block bitmap
+		status_t Initialize();
+
+		// allocates the block of a new inode, or the blocks for the data of
+		// an existing inode, respectively - both choose a position according
+		// to the allocation policies, and then call AllocateBlocks()
+		status_t AllocateForInode(Transaction *transaction,const block_run *parent,mode_t type,block_run &run);
+		status_t Allocate(Transaction *transaction,const Inode *inode,off_t numBlocks,block_run &run,uint16 minimum = 1);
+		status_t Free(Transaction *transaction,block_run &run);
+
+		// searches for "minimum" to "numBlocks" free blocks beginning at bit
+		// "start" of the allocation group "group"
+		status_t AllocateBlocks(Transaction *transaction,int32 group, uint16 start, uint16 numBlocks, uint16 minimum, block_run &run);
+
+#ifdef DEBUG
+		// these verify that the blocks in question are marked as used in the
+		// block bitmap
+		status_t CheckBlockRun(block_run run);
+		status_t CheckInode(Inode *inode);
+		status_t Check(Inode *inode);
+#endif
+
+	private:
+		// scans the block bitmap; also used as thread function by Initialize()
+		static status_t initialize(BlockAllocator *);
+
+		Volume			*fVolume;
+		// held while the bitmap is scanned, and while blocks are allocated
+		// or freed
+		Benaphore		fLock;
+		// array of fNumGroups allocation groups
+		AllocationGroup	*fGroups;
+		int32			fNumGroups,fBlocksPerGroup;
+};
+
+#endif	/* BLOCK_ALLOCATOR_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/Chain.h b/src/add-ons/kernel/file_systems/bfs/Chain.h
new file mode 100644
index 0000000000..7d7e3e87ef
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Chain.h
@@ -0,0 +1,55 @@
+#ifndef CHAIN_H
+#define CHAIN_H
+/* Chain - a chain implementation; it's used for the callback management
+**		throughout the code (currently TreeIterator, and AttributeIterator).
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+/** The Link class you want to use with the Chain class needs to have
+ *	a "fNext" member which is accessible from within the Chain class.
+ */
+
+/** A simple singly linked list that stores its links directly in the
+ *	objects it manages (in their "fNext" member). The chain does not own
+ *	its entries, and does no locking of its own.
+ */
+
+template<class Link> class Chain {
+	public:
+		Chain()
+			:
+			fFirst(NULL)
+		{
+		}
+
+		/** Adds the link to the beginning of the chain. */
+		void Add(Link *link)
+		{
+			link->fNext = fFirst;
+			fFirst = link;
+		}
+
+		/** Removes the link from the chain. Nothing happens if the link
+		 *	is NULL, or if it is not part of this chain.
+		 */
+		void Remove(Link *link)
+		{
+			// search list for the correct callback to remove - the search
+			// has to stop at the end of the chain, or else a link that is
+			// not part of it would cause a NULL pointer to be dereferenced
+			Link *last = NULL,*entry;
+			for (entry = fFirst;entry != NULL && entry != link;entry = entry->fNext)
+				last = entry;
+
+			// not found (this also covers a NULL link)
+			if (entry == NULL)
+				return;
+
+			if (last)
+				last->fNext = link->fNext;
+			else
+				fFirst = link->fNext;
+		}
+
+		/** Returns the entry that follows "last", or the first entry of
+		 *	the chain if "last" is NULL. NULL is returned at the end of
+		 *	the chain.
+		 */
+		Link *Next(Link *last)
+		{
+			if (last == NULL)
+				return fFirst;
+
+			return last->fNext;
+		}
+
+	private:
+		Link	*fFirst;
+};
+
+#endif	/* CHAIN_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/Debug.cpp b/src/add-ons/kernel/file_systems/bfs/Debug.cpp
new file mode 100644
index 0000000000..89bed31f9a
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Debug.cpp
@@ -0,0 +1,241 @@
+/* Debug - debug stuff
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** Some code is based on work previously done by Marcus Overhagen
+**
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include "Debug.h"
+#include "BPlusTree.h"
+
+#include <KernelExport.h>
+
+#include <time.h>
+
+#define Print __out
+
+
+/**	Converts a 32 bit identifier into a four character string, most
+ *	significant byte first. Bytes outside the printable ASCII range are
+ *	replaced by '.'.
+ *	The result is stored in a static buffer which is overwritten by the
+ *	next call.
+ */
+
+char *
+get_tupel(uint32 id)
+{
+	static unsigned char tupel[5];
+
+	tupel[0] = 0xff & (id >> 24);
+	tupel[1] = 0xff & (id >> 16);
+	tupel[2] = 0xff & (id >> 8);
+	tupel[3] = 0xff & (id);
+	tupel[4] = 0;
+	for (int16 i = 0;i < 4;i++) {
+		// 0x7f (DEL) and everything above it is not printable either
+		if (tupel[i] < ' ' || tupel[i] >= 127)
+			tupel[i] = '.';
+	}
+
+	return (char *)tupel;
+}
+
+
+/**	Prints "prefix" followed by the block run in the form
+ *	"(allocation_group, start, length)" and a newline.
+ */
+
+void
+dump_block_run(const char *prefix,block_run &run)
+{
+	Print("%s(%ld, %d, %d)\n",prefix,run.allocation_group,run.start,run.length);
+}
+
+
+/**	Prints all fields of the given super block, one per line. The three
+ *	magic values are additionally shown as text (via get_tupel()) and
+ *	marked "valid" or "INVALID" depending on whether they match the
+ *	SUPER_BLOCK_MAGIC* constants.
+ */
+
+void
+dump_super_block(disk_super_block *superBlock)
+{
+	Print("disk_super_block:\n");
+	Print("  name           = %s\n",superBlock->name);
+	Print("  magic1         = %#08lx (%s) %s\n",superBlock->magic1, get_tupel(superBlock->magic1), (superBlock->magic1 == SUPER_BLOCK_MAGIC1 ? "valid" : "INVALID"));
+	Print("  fs_byte_order  = %#08lx (%s)\n",superBlock->fs_byte_order, get_tupel(superBlock->fs_byte_order));
+	Print("  block_size     = %lu\n",superBlock->block_size);
+	Print("  block_shift    = %lu\n",superBlock->block_shift);
+	Print("  num_blocks     = %Lu\n",superBlock->num_blocks);
+	Print("  used_blocks    = %Lu\n",superBlock->used_blocks);
+	Print("  inode_size     = %lu\n",superBlock->inode_size);
+	Print("  magic2         = %#08lx (%s) %s\n",superBlock->magic2, get_tupel(superBlock->magic2), (superBlock->magic2 == (int)SUPER_BLOCK_MAGIC2 ? "valid" : "INVALID"));
+	Print("  blocks_per_ag  = %lu\n",superBlock->blocks_per_ag);
+	// the shifted value is a 64 bit quantity (1LL), so it has to be printed
+	// with the 64 bit length modifier like the other 64 bit fields here
+	Print("  ag_shift       = %lu (%Ld bytes)\n",superBlock->ag_shift, 1LL << superBlock->ag_shift);
+	Print("  num_ags        = %lu\n",superBlock->num_ags);
+	Print("  flags          = %#08lx (%s)\n",superBlock->flags, get_tupel(superBlock->flags));
+	dump_block_run("  log_blocks     = ",superBlock->log_blocks);
+	Print("  log_start      = %Lu\n",superBlock->log_start);
+	Print("  log_end        = %Lu\n",superBlock->log_end);
+	Print("  magic3         = %#08lx (%s) %s\n",superBlock->magic3, get_tupel(superBlock->magic3), (superBlock->magic3 == SUPER_BLOCK_MAGIC3 ? "valid" : "INVALID"));
+	dump_block_run("  root_dir       = ",superBlock->root_dir);
+	dump_block_run("  indices        = ",superBlock->indices);
+}
+
+
+/**	Prints the given data stream: all direct block runs that are not
+ *	zero, the indirect and double indirect runs (again only if not zero),
+ *	the three maximum ranges, and the stream size.
+ */
+
+void
+dump_data_stream(data_stream *stream)
+{
+	Print("data_stream:\n");
+	// unused (zero) direct runs are left out to keep the output short
+	for (int i = 0; i < NUM_DIRECT_BLOCKS; i++) {
+		if (!stream->direct[i].IsZero()) {
+			Print("  direct[%02d]                = ",i);
+			dump_block_run("",stream->direct[i]);
+		}
+	}
+	Print("  max_direct_range          = %Lu\n",stream->max_direct_range);
+
+	if (!stream->indirect.IsZero())
+		dump_block_run("  indirect                  = ",stream->indirect);
+
+	Print("  max_indirect_range        = %Lu\n",stream->max_indirect_range);
+
+	if (!stream->double_indirect.IsZero())
+		dump_block_run("  double_indirect           = ",stream->double_indirect);
+
+	Print("  max_double_indirect_range = %Lu\n",stream->max_double_indirect_range);
+	Print("  size                      = %Lu\n",stream->size);
+}	
+
+
+/**	Prints the fields of the given on-disk inode, including its data
+ *	stream (via dump_data_stream()) and the four pad values.
+ *	The short_symlink field is only printed as a string if the mode says
+ *	the inode is a symlink and INODE_LONG_SYMLINK is not set; otherwise
+ *	"-" is printed instead.
+ */
+
+void
+dump_inode(bfs_inode *inode)
+{
+	Print("inode:\n");
+	Print("  magic1             = %08lx (%s) %s\n",inode->magic1,
+			get_tupel(inode->magic1), (inode->magic1 == INODE_MAGIC1 ? "valid" : "INVALID"));
+	dump_block_run(	"  inode_num          = ",inode->inode_num);
+	Print("  uid                = %lu\n",inode->uid);
+	Print("  gid                = %lu\n",inode->gid);
+	Print("  mode               = %08lx\n",inode->mode);
+	Print("  flags              = %08lx\n",inode->flags);
+	// the times are printed raw and shifted right by INODE_TIME_SHIFT
+	Print("  create_time        = %Ld (%Ld)\n",inode->create_time,inode->create_time >> INODE_TIME_SHIFT);
+	Print("  last_modified_time = %Ld (%Ld)\n",inode->last_modified_time,inode->last_modified_time >> INODE_TIME_SHIFT);
+	dump_block_run(	"  parent             = ",inode->parent);
+	dump_block_run(	"  attributes         = ",inode->attributes);
+	Print("  type               = %lu\n",inode->type);
+	Print("  inode_size         = %lu\n",inode->inode_size);
+	Print("  etc                = %#08lx\n",inode->etc);
+	Print("  short_symlink      = %s\n",
+			S_ISLNK(inode->mode) && (inode->flags & INODE_LONG_SYMLINK) == 0? inode->short_symlink : "-");
+	dump_data_stream(&(inode->data));
+	Print("  --\n  pad[0]             = %08lx\n",inode->pad[0]);
+	Print("  pad[1]             = %08lx\n",inode->pad[1]);
+	Print("  pad[2]             = %08lx\n",inode->pad[2]);
+	Print("  pad[3]             = %08lx\n",inode->pad[3]);
+}
+
+
+/**	Prints all fields of the given B+tree header. The magic value is
+ *	also shown as text and marked "valid" or "INVALID" depending on
+ *	whether it equals BPLUSTREE_MAGIC.
+ */
+
+void
+dump_bplustree_header(bplustree_header *header)
+{
+	Print("bplustree_header:\n");
+	Print("  magic                = %#08lx (%s) %s\n",header->magic,
+			get_tupel(header->magic), (header->magic == BPLUSTREE_MAGIC ? "valid" : "INVALID"));
+	Print("  node_size            = %lu\n",header->node_size);
+	Print("  max_number_of_levels = %lu\n",header->max_number_of_levels);
+	Print("  data_type            = %lu\n",header->data_type);
+	Print("  root_node_pointer    = %Ld\n",header->root_node_pointer);
+	Print("  free_node_pointer    = %Ld\n",header->free_node_pointer);
+	Print("  maximum_size         = %Lu\n",header->maximum_size);
+}
+
+
+#define DUMPED_BLOCK_SIZE 16
+
+/**	Prints "size" bytes starting at "buffer" as a hex dump with
+ *	DUMPED_BLOCK_SIZE bytes per line: first the bytes as two digit hex
+ *	numbers in groups of four, then the same bytes as characters.
+ *	Bytes that are not printable ASCII are shown as '.' in the character
+ *	column.
+ */
+
+void
+dump_block(const char *buffer,int size)
+{
+	for(int i = 0;i < size;) {
+		int start = i;
+
+		// hex column; an incomplete last line is padded with blanks so
+		// that the character column stays aligned
+		for(;i < start+DUMPED_BLOCK_SIZE;i++) {
+			if (!(i % 4))
+				Print(" ");
+
+			if (i >= size)
+				Print("  ");
+			else
+				Print("%02x",*(unsigned char *)(buffer+i));
+		}
+		Print("  ");
+
+		// character column
+		for(i = start;i < start + DUMPED_BLOCK_SIZE;i++) {
+			if (i >= size)
+				break;
+
+			// read the byte as unsigned, so that the result does not
+			// depend on the signedness of plain char
+			unsigned char c = *(const unsigned char *)(buffer+i);
+
+			if (c < ' ' || c >= 127)
+				Print(".");
+			else
+				Print("%c",c);
+		}
+		Print("\n");
+	}
+}
+
+
+/**	Prints the given B+tree node.
+ *	The link fields and key counters are always printed. If "header" is
+ *	given, the keys and their values are printed as well, interpreted
+ *	according to header->data_type; if the counters look implausible, or
+ *	a key has an invalid length, the raw node is hex dumped instead.
+ *	If "volume" is given, each value is additionally shown converted with
+ *	Volume::ToBlockRun().
+ */
+
+void
+dump_bplustree_node(bplustree_node *node,bplustree_header *header,Volume *volume)
+{
+	Print("bplustree_node:\n");
+	Print("  left_link      = %Ld\n",node->left_link);
+	Print("  right_link     = %Ld\n",node->right_link);
+	Print("  overflow_link  = %Ld\n",node->overflow_link);
+	Print("  all_key_count  = %u\n",node->all_key_count);
+	Print("  all_key_length = %u\n",node->all_key_length);
+	
+	// without the header, node size and key type are unknown
+	if (header == NULL)
+		return;
+
+	// sanity check of the counters - fall back to a plain hex dump if
+	// they cannot be right
+	if (node->all_key_count > node->all_key_length
+		|| uint32(node->all_key_count * 10) > (uint32)header->node_size
+		|| node->all_key_count == 0) {
+		Print("\n");
+		dump_block((char *)node,header->node_size/*,sizeof(off_t)*/);
+		return;
+	}
+
+	Print("\n");
+	for (int32 i = 0;i < node->all_key_count;i++) {
+		uint16 length;
+		char buffer[256],*key = (char *)node->KeyAt(i,&length);
+		if (length > 255 || length == 0) {
+			Print("  %2ld. Invalid length (%u)!!\n",i,length);
+			dump_block((char *)node,header->node_size/*,sizeof(off_t)*/);
+			break;
+		}
+		// work on a null terminated copy of the key
+		memcpy(buffer,key,length);
+		buffer[length] = '\0';
+
+		off_t *value = node->Values() + i;
+		// NOTE(review): the casts to uint32 assume that a pointer fits
+		// into 32 bits - confirm before building this for other targets
+		if ((uint32)value < (uint32)node || (uint32)value > (uint32)node + header->node_size)
+			Print("  %2ld. Invalid Offset!!\n",i);
+		else {
+			Print("  %2ld. ",i);
+			if (header->data_type == BPLUSTREE_STRING_TYPE)
+				Print("\"%s\"",buffer);
+			else if (header->data_type == BPLUSTREE_INT32_TYPE)
+				Print("int32 = %ld (0x%lx)",*(int32 *)&buffer,*(int32 *)&buffer);
+			else if (header->data_type == BPLUSTREE_UINT32_TYPE)
+				Print("uint32 = %lu (0x%lx)",*(uint32 *)&buffer,*(uint32 *)&buffer);
+			else if (header->data_type == BPLUSTREE_INT64_TYPE)
+				Print("int64 = %Ld (0x%Lx)",*(int64 *)&buffer,*(int64 *)&buffer);
+			else
+				Print("???");
+
+			// the two topmost bits are masked out of the printed offset;
+			// the link type is evaluated separately below
+			off_t offset = *value & 0x3fffffffffffffffLL;
+			Print(" (%d bytes) -> %Ld",length,offset);
+			if (volume != NULL)
+			{
+				block_run run = volume->ToBlockRun(offset);
+				Print(" (%ld, %d)",run.allocation_group,run.start);
+			}
+			if (bplustree_node::LinkType(*value) == BPLUSTREE_DUPLICATE_FRAGMENT)
+				Print(" (duplicate fragment %Ld)\n",*value & 0x3ff);
+			else if (bplustree_node::LinkType(*value) == BPLUSTREE_DUPLICATE_NODE)
+				Print(" (duplicate node)\n");
+			else
+				Print("\n");
+		}
+	}
+}
+
+
diff --git a/src/add-ons/kernel/file_systems/bfs/Debug.h b/src/add-ons/kernel/file_systems/bfs/Debug.h
new file mode 100644
index 0000000000..dfde5cdc92
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Debug.h
@@ -0,0 +1,74 @@
+#ifndef DEBUG_H
+#define DEBUG_H
+/* Debug - debug stuff
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include <KernelExport.h>
+#ifdef USER
+#	include <stdio.h>
+#	define __out printf
+#else
+#	include <null.h>
+#	define __out dprintf
+#endif
+
+// Short overview of the debug output macros:
+//	PRINT()
+//		is for general messages that should not appear in a release build
+//	FATAL()
+//		this is for fatal messages, when something has really gone wrong
+//	INFORM()
+//		general information, such as the disk size, etc.
+//	REPORT_ERROR(status_t)
+//		prints out error information
+//	RETURN_ERROR(status_t)
+//		calls REPORT_ERROR() and returns the value
+//	D()
+//		the statements in D() are only included if DEBUG is defined
+
+#ifdef DEBUG
+	// Debug build: everything is printed through __out, prefixed by "bfs: ".
+	// Note that these macros expand to a braced block or already end with
+	// a semicolon - they are statements, not expressions.
+	#define PRINT(x) { __out("bfs: "); __out x; }
+	// prints function name, line number, and the strerror() text of "status"
+	#define REPORT_ERROR(status) __out("bfs: %s:%ld: %s\n",__FUNCTION__,__LINE__,strerror(status));
+	// evaluates "err" once, reports it if it is below B_OK, and returns it
+	#define RETURN_ERROR(err) { status_t _status = err; if (_status < B_OK) REPORT_ERROR(_status); return _status;}
+	#define FATAL(x) { __out("bfs: "); __out x; }
+	#define INFORM(x) { __out("bfs: "); __out x; }
+//	#define FUNCTION() __out("bfs: %s()\n",__FUNCTION__);
+	#define FUNCTION_START(x) { __out("bfs: %s() ",__FUNCTION__); __out x; }
+	#define FUNCTION() ;
+//	#define FUNCTION_START(x) ;
+	#define D(x) {x;};
+#else
+	// Release build: only FATAL() and INFORM() still produce output,
+	// RETURN_ERROR() is a plain return, the rest expands to nothing
+	#define PRINT(x) ;
+	#define REPORT_ERROR(status) ;
+	#define RETURN_ERROR(status) return status;
+	#define FATAL(x) { __out("bfs: "); __out x; }
+	#define INFORM(x) { __out("bfs: "); __out x; }
+	#define FUNCTION() ;
+	#define FUNCTION_START(x) ;
+	#define D(x) ;
+#endif
+
+#ifdef DEBUG
+	// forward declarations, so that this header does not need to pull in
+	// the headers which define these types
+	struct block_run;
+	struct bplustree_header;
+	struct bplustree_node;
+	struct data_stream;
+	struct bfs_inode;
+	struct disk_super_block;
+	class Volume;
+	
+	// some structure dump functions; they are only declared if DEBUG
+	// is defined
+	extern void dump_block_run(const char *prefix, block_run &run);
+	extern void dump_super_block(disk_super_block *superBlock);
+	extern void dump_data_stream(data_stream *stream);
+	extern void dump_inode(bfs_inode *inode);
+	extern void dump_bplustree_header(bplustree_header *header);
+	// "header" and "volume" are optional and default to NULL
+	extern void dump_bplustree_node(bplustree_node *node,bplustree_header *header = NULL,Volume *volume = NULL);
+	extern void dump_block(const char *buffer, int size);
+#endif
+
+#endif	/* DEBUG_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/Index.cpp b/src/add-ons/kernel/file_systems/bfs/Index.cpp
new file mode 100644
index 0000000000..48a0d07727
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Index.cpp
@@ -0,0 +1,335 @@
+/* Index - index access functions
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include "Debug.h"
+#include "cpp.h"
+#include "Index.h"
+#include "Volume.h"
+#include "Inode.h"
+#include "BPlusTree.h"
+
+#include <TypeConstants.h>
+
+
+/**	Creates an index object for the given volume that is not yet bound
+ *	to any index; use SetTo() or Create() to bind it.
+ */
+
+Index::Index(Volume *volume)
+	:
+	fVolume(volume),
+	fNode(NULL),
+	fName(NULL)
+		// Update() compares against fName before SetTo() was ever called,
+		// so it must not be left uninitialized
+{
+}
+
+
+/**	Gives back the vnode of the index, if one is currently set. */
+
+Index::~Index()
+{
+	if (fNode != NULL)
+		put_vnode(fVolume->ID(),fNode->ID());
+}
+
+
+/**	Releases the vnode of the current index (if any), and returns the
+ *	object to its unset state.
+ */
+
+void
+Index::Unset()
+{
+	// fName only caches the name belonging to fNode; forget it as well,
+	// or Update() could take a stale name for a valid index and skip
+	// SetTo() although there is no node anymore
+	fName = NULL;
+
+	if (fNode == NULL)
+		return;
+
+	put_vnode(fVolume->ID(),fNode->ID());
+	fNode = NULL;
+}
+
+
+/**	Binds the object to the index with the given name, which is looked
+ *	up in the B+tree of the volume's indices directory.
+ *	Returns B_ENTRY_NOT_FOUND if the volume has no indices directory or
+ *	the vnode could not be acquired, B_BAD_VALUE if the directory has no
+ *	tree, or the error returned by the tree lookup.
+ */
+
+status_t 
+Index::SetTo(const char *name)
+{
+	// drop the index that may have been set before
+	Unset();
+
+	Inode *indexDirectory = fVolume->IndicesNode();
+	if (indexDirectory == NULL)
+		return B_ENTRY_NOT_FOUND;
+
+	BPlusTree *directoryTree;
+	if (indexDirectory->GetTree(&directoryTree) != B_OK)
+		return B_BAD_VALUE;
+
+	vnode_id indexID;
+	status_t result = directoryTree->Find((uint8 *)name,(uint16)strlen(name),&indexID);
+	if (result != B_OK)
+		return result;
+
+	if (get_vnode(fVolume->ID(),indexID,(void **)&fNode) != B_OK)
+		return B_ENTRY_NOT_FOUND;
+
+	if (fNode != NULL) {
+		// Only the pointer is stored, so the name is assumed to stay
+		// constant for later comparisons (currently only used in
+		// Index::Update())
+		fName = name;
+		return B_OK;
+	}
+
+	FATAL(("fatal error at Index::InitCheck(), get_vnode() returned NULL pointer\n"));
+	put_vnode(fVolume->ID(),indexID);
+	return B_ERROR;
+}
+
+
+/**	Returns the type constant (B_INT32_TYPE, B_STRING_TYPE, ...) that
+ *	corresponds to the index flags in the mode of the index node.
+ *	Returns 0 if no index is set, or if the flags are not recognized.
+ */
+
+uint32 
+Index::Type()
+{
+	if (fNode == NULL)
+		return 0;
+
+	uint32 type = 0;
+
+	switch (fNode->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX |
+							 S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
+		case S_STR_INDEX:
+			type = B_STRING_TYPE;
+			break;
+		case S_INT_INDEX:
+			type = B_INT32_TYPE;
+			break;
+		case S_UINT_INDEX:
+			type = B_UINT32_TYPE;
+			break;
+		case S_LONG_LONG_INDEX:
+			type = B_INT64_TYPE;
+			break;
+		case S_ULONG_LONG_INDEX:
+			type = B_UINT64_TYPE;
+			break;
+		case S_FLOAT_INDEX:
+			type = B_FLOAT_TYPE;
+			break;
+		case S_DOUBLE_INDEX:
+			type = B_DOUBLE_TYPE;
+			break;
+		default:
+			FATAL(("index has unknown type!\n"));
+			break;
+	}
+
+	return type;
+}
+
+
+/**	Returns the size of the keys of this index in bytes.
+ *	Returns 0 if no index is set, for string indices (their keys have no
+ *	fixed size), and if the index flags are not recognized.
+ */
+
+size_t
+Index::KeySize()
+{
+	if (fNode == NULL)
+		return 0;
+
+	int32 indexMode = fNode->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX |
+									   S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX);
+
+	switch (indexMode) {
+		case S_STR_INDEX:
+			// string indices don't have a fixed key size
+			return 0;
+		case S_INT_INDEX:
+		case S_UINT_INDEX:
+			return sizeof(int32);
+		case S_LONG_LONG_INDEX:
+		case S_ULONG_LONG_INDEX:
+			return sizeof(int64);
+		case S_FLOAT_INDEX:
+			return sizeof(float);
+		case S_DOUBLE_INDEX:
+			return sizeof(double);
+		default:
+			break;
+	}
+
+	FATAL(("index has unknown type!\n"));
+	return 0;
+}
+
+
+/**	Creates a new index with the given name and type in the indices
+ *	directory of the volume, and binds the object to it. The indices
+ *	directory itself is created first if the volume does not have one.
+ *	Returns B_BAD_TYPE if "type" is not one of the supported type
+ *	constants.
+ */
+
+status_t
+Index::Create(Transaction *transaction,const char *name,uint32 type)
+{
+	Unset();
+
+	// translate the type constant into the matching index mode flag
+	int32 indexFlag = 0;
+	switch (type) {
+		case B_STRING_TYPE:
+			indexFlag = S_STR_INDEX;
+			break;
+		case B_INT32_TYPE:
+			indexFlag = S_INT_INDEX;
+			break;
+		case B_UINT32_TYPE:
+			indexFlag = S_UINT_INDEX;
+			break;
+		case B_INT64_TYPE:
+			indexFlag = S_LONG_LONG_INDEX;
+			break;
+		case B_UINT64_TYPE:
+			indexFlag = S_ULONG_LONG_INDEX;
+			break;
+		case B_FLOAT_TYPE:
+			indexFlag = S_FLOAT_INDEX;
+			break;
+		case B_DOUBLE_TYPE:
+			indexFlag = S_DOUBLE_INDEX;
+			break;
+		default:
+			return B_BAD_TYPE;
+	}
+
+	// do we need to create the index directory first?
+	if (fVolume->IndicesNode() == NULL) {
+		status_t rootStatus = fVolume->CreateIndicesRoot(transaction);
+		if (rootStatus < B_OK)
+			RETURN_ERROR(rootStatus);
+	}
+
+	vnode_id id;
+	status_t status = Inode::Create(transaction,fVolume->IndicesNode(),name,S_INDEX_DIR | S_DIRECTORY | indexFlag,0,type,&id);
+	if (status != B_OK)
+		return status;
+
+	// since Inode::Create() lets the created inode open if "id" is specified,
+	// we don't need to call Vnode::Keep() here
+	Vnode vnode(fVolume,id);
+	return vnode.Get(&fNode);
+}
+
+
+/**	Updates the specified index: the oldKey will be removed from, the
+ *	newKey inserted into the tree. Either key may be NULL (but not both),
+ *	in which case only the other operation is performed.
+ *	If "type" is 0, the type of the index itself is used to compare the
+ *	keys.
+ *	If the method returns B_BAD_INDEX, it means the index couldn't be found -
+ *	the most common reason will be that the index doesn't exist.
+ *	You may not want to let the whole transaction fail because of that.
+ *	Returns B_BAD_VALUE if the name is NULL, both keys are NULL, or a key
+ *	is given with a length of 0.
+ */
+
+status_t
+Index::Update(Transaction *transaction,const char *name,int32 type,const uint8 *oldKey,uint16 oldLength,const uint8 *newKey,uint16 newLength,Inode *inode)
+{
+	if (name == NULL
+		|| (oldKey == NULL && newKey == NULL)
+		|| (oldKey != NULL && oldLength == 0)
+		|| (newKey != NULL && newLength == 0))
+		return B_BAD_VALUE;
+
+	// if the two keys are identical, don't do anything
+	if (type != 0 && !compareKeys(type,oldKey,oldLength,newKey,newLength))
+		return B_OK;
+
+	// update all live queries about the change, if they have an index or not
+	fVolume->UpdateLiveQueries(inode,name,type,oldKey,oldLength,newKey,newLength);
+
+	// The index only has to be looked up again if it is not the one that
+	// is currently set. A matching name pointer alone is not enough for
+	// that: there also has to be a node, or it would be dereferenced below.
+	status_t status;
+	if ((fNode == NULL || name != fName) && (status = SetTo(name)) < B_OK)
+		return B_BAD_INDEX;
+
+	// now that we have the type, check again for equality
+	if (type == 0 && !compareKeys(Type(),oldKey,oldLength,newKey,newLength))
+		return B_OK;
+
+	BPlusTree *tree;
+	if ((status = Node()->GetTree(&tree)) < B_OK)
+		return status;
+
+	// remove the old key from the tree
+
+	if (oldKey != NULL) {
+		status = tree->Remove(transaction,(const uint8 *)oldKey,oldLength,inode->ID());
+		if (status == B_ENTRY_NOT_FOUND) {
+			// That's not nice, but should be no reason to let the whole thing fail
+			// NOTE(review): if there is no new key to insert, this status
+			// is still what gets returned below - confirm that is intended
+			FATAL(("Could not find value in index \"%s\"!\n",name));
+		} else if (status < B_OK)
+			return status;
+	}
+
+	// add the new key to the tree
+
+	if (newKey != NULL)
+		status = tree->Insert(transaction,(const uint8 *)newKey,newLength,inode->ID());
+
+	return status;
+}
+
+
+/**	Adds "name" for the given inode to the "name" index; this is
+ *	UpdateName() without an old name.
+ */
+
+status_t 
+Index::InsertName(Transaction *transaction,const char *name,Inode *inode)
+{
+	return UpdateName(transaction,NULL,name,inode);
+}
+
+
+/**	Removes "name" for the given inode from the "name" index; this is
+ *	UpdateName() without a new name.
+ */
+
+status_t 
+Index::RemoveName(Transaction *transaction,const char *name,Inode *inode)
+{
+	return UpdateName(transaction,name,NULL,inode);
+}
+
+
+/**	Replaces "oldName" by "newName" for the given inode in the "name"
+ *	index. Either name may be NULL; its length is then passed as 0.
+ */
+
+status_t 
+Index::UpdateName(Transaction *transaction,const char *oldName, const char *newName,Inode *inode)
+{
+	uint16 oldLength = 0,newLength = 0;
+
+	if (oldName != NULL)
+		oldLength = strlen(oldName);
+	if (newName != NULL)
+		newLength = strlen(newName);
+
+	return Update(transaction,"name",B_STRING_TYPE,(uint8 *)oldName,oldLength,(uint8 *)newName,newLength,inode);
+}
+
+
+/**	Adds the current size of the inode (Inode::Size()) to the "size"
+ *	index.
+ */
+
+status_t 
+Index::InsertSize(Transaction *transaction, Inode *inode)
+{
+	off_t size = inode->Size();
+	return Update(transaction,"size",B_INT64_TYPE,NULL,0,(uint8 *)&size,sizeof(int64),inode);
+}
+
+
+/**	Removes the inode from the "size" index, using the size under which
+ *	it was indexed (Inode::OldSize()).
+ */
+
+status_t 
+Index::RemoveSize(Transaction *transaction, Inode *inode)
+{
+	// Inode::OldSize() is the size that's in the index
+	off_t size = inode->OldSize();
+	return Update(transaction,"size",B_INT64_TYPE,(uint8 *)&size,sizeof(int64),NULL,0,inode);
+}
+
+
+/**	Moves the inode in the "size" index from the size it was indexed
+ *	with to its current size. Only if that succeeds, the inode is told
+ *	to take over the new size as the indexed one.
+ */
+
+status_t
+Index::UpdateSize(Transaction *transaction,Inode *inode)
+{
+	off_t indexedSize = inode->OldSize();
+	off_t currentSize = inode->Size();
+
+	status_t status = Update(transaction,"size",B_INT64_TYPE,
+		(uint8 *)&indexedSize,sizeof(int64),(uint8 *)&currentSize,sizeof(int64),inode);
+	if (status != B_OK)
+		return status;
+
+	inode->UpdateOldSize();
+	return B_OK;
+}
+
+
+/**	Adds the last_modified_time of the inode to the "last_modified"
+ *	index.
+ */
+
+status_t 
+Index::InsertLastModified(Transaction *transaction, Inode *inode)
+{
+	off_t modified = inode->Node()->last_modified_time;
+	return Update(transaction,"last_modified",B_INT64_TYPE,NULL,0,(uint8 *)&modified,sizeof(int64),inode);
+}
+
+
+/**	Removes the inode from the "last_modified" index, using the time
+ *	under which it was indexed (Inode::OldLastModified()).
+ */
+
+status_t 
+Index::RemoveLastModified(Transaction *transaction, Inode *inode)
+{
+	// Inode::OldLastModified() is the value which is in the index
+	off_t modified = inode->OldLastModified();
+	return Update(transaction,"last_modified",B_INT64_TYPE,(uint8 *)&modified,sizeof(int64),NULL,0,inode);
+}
+
+
+/**	Sets the modification time of the inode, and moves it accordingly
+ *	in the "last_modified" index.
+ *	If "modified" is -1, the current time (shifted by INODE_TIME_SHIFT)
+ *	is used. In both cases, bits of the volume's unique ID (masked with
+ *	INODE_TIME_MASK) are or'ed into the value.
+ *	The time in the inode is set even if the index update fails.
+ */
+
+status_t 
+Index::UpdateLastModified(Transaction *transaction, Inode *inode, off_t modified)
+{
+	off_t indexedTime = inode->OldLastModified();
+
+	off_t newTime = modified;
+	if (newTime == -1)
+		newTime = (bigtime_t)time(NULL) << INODE_TIME_SHIFT;
+	newTime |= fVolume->GetUniqueID() & INODE_TIME_MASK;
+
+	status_t status = Update(transaction,"last_modified",B_INT64_TYPE,
+		(uint8 *)&indexedTime,sizeof(int64),(uint8 *)&newTime,sizeof(int64),inode);
+
+	inode->Node()->last_modified_time = newTime;
+
+	// the indexed value may only follow if the index really has it now
+	if (status == B_OK)
+		inode->UpdateOldLastModified();
+
+	return status;
+}
+
diff --git a/src/add-ons/kernel/file_systems/bfs/Index.h b/src/add-ons/kernel/file_systems/bfs/Index.h
new file mode 100644
index 0000000000..5e65953614
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Index.h
@@ -0,0 +1,51 @@
+#ifndef INDEX_H
+#define INDEX_H
+/* Index - index access functions
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include <KernelExport.h>
+
+class Transaction;
+class Volume;
+class Inode;
+
+
+// Gives access to a single index of a volume, and offers convenience
+// methods to maintain the "name", "size", and "last_modified" indices.
+class Index {
+	public:
+		Index(Volume *volume);
+		~Index();
+
+		// binds the object to the index with the given name
+		status_t SetTo(const char *name);
+		// releases the current index, if any
+		void Unset();
+
+		// the inode of the current index, or NULL if none is set
+		Inode *Node() const { return fNode; };
+		uint32 Type();
+		size_t KeySize();
+
+		status_t Create(Transaction *transaction, const char *name, uint32 type);
+
+		// removes oldKey from, and inserts newKey into the index "name"
+		status_t Update(Transaction *transaction, const char *name, int32 type, const uint8 *oldKey, uint16 oldLength, const uint8 *newKey, uint16 newLength, Inode *inode);
+
+		// the following methods are all wrappers around Update()
+		status_t InsertName(Transaction *transaction,const char *name,Inode *inode);
+		status_t RemoveName(Transaction *transaction,const char *name,Inode *inode);
+		status_t UpdateName(Transaction *transaction,const char *oldName,const char *newName,Inode *inode);
+
+		status_t InsertSize(Transaction *transaction, Inode *inode);
+		status_t RemoveSize(Transaction *transaction, Inode *inode);
+		status_t UpdateSize(Transaction *transaction, Inode *inode);
+
+		status_t InsertLastModified(Transaction *transaction, Inode *inode);
+		status_t RemoveLastModified(Transaction *transaction, Inode *inode);
+		// "modified" defaults to -1
+		status_t UpdateLastModified(Transaction *transaction, Inode *inode,off_t modified = -1);
+
+	private:
+		Volume		*fVolume;
+		Inode		*fNode;
+		// name of the current index
+		const char	*fName;
+};
+
+#endif	/* INDEX_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/Inode.cpp b/src/add-ons/kernel/file_systems/bfs/Inode.cpp
new file mode 100644
index 0000000000..dc6212f639
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Inode.cpp
@@ -0,0 +1,2107 @@
+/* Inode - inode access functions
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include "Debug.h"
+#include "cpp.h"
+#include "Inode.h"
+#include "BPlusTree.h"
+#include "Index.h"
+
+#include <string.h>
+
+
+// Helper that allocates the disk space and the Inode object for a new
+// inode. Unless Keep() is called, the destructor deletes the Inode object
+// and frees the allocated block run again.
+class InodeAllocator {
+	public:
+		InodeAllocator(Transaction *transaction);
+		~InodeAllocator();
+
+		// allocates space for an inode, and creates the Inode object for it
+		status_t New(block_run *parentRun,mode_t mode,block_run &run,Inode **inode);
+		// keeps the destructor from undoing the allocation
+		void Keep();
+
+	private:
+		Transaction *fTransaction;
+		block_run fRun;
+		Inode *fInode;
+};
+
+
+// Remembers the transaction; nothing is allocated before New() is called.
+// NOTE(review): fRun is not initialized here, but the destructor frees it
+// whenever fTransaction is set - looks like this relies on New() always
+// being called; confirm.
+InodeAllocator::InodeAllocator(Transaction *transaction)
+	:
+	fTransaction(transaction),
+	fInode(NULL)
+{
+}
+
+
+// Deletes the Inode object and frees the block run again, unless both
+// members have been reset by Keep() (or, for the transaction, by a failed
+// allocation in New()).
+InodeAllocator::~InodeAllocator()
+{
+	delete fInode;
+	
+	if (fTransaction)
+		fTransaction->GetVolume()->Free(fTransaction,fRun);
+}
+
+
+/**	Allocates the space for a new inode via Volume::AllocateForInode(),
+ *	and creates an Inode object for it.
+ *	On success, "run" is set to the allocated block run, and "inode" to
+ *	the new object. "run" is also set if only the Inode object could not
+ *	be created, in which case B_NO_MEMORY is returned.
+ */
+
+status_t 
+InodeAllocator::New(block_run *parentRun, mode_t mode, block_run &run, Inode **inode)
+{
+	Volume *volume = fTransaction->GetVolume();
+
+	status_t allocated = volume->AllocateForInode(fTransaction,parentRun,mode,fRun);
+	if (allocated < B_OK) {
+		// the allocation failed, so there is nothing the destructor
+		// would have to free
+		fTransaction = NULL;
+		RETURN_ERROR(allocated);
+	}
+
+	run = fRun;
+	fInode = new Inode(volume,volume->ToVnode(run),true);
+	if (fInode != NULL) {
+		*inode = fInode;
+		return B_OK;
+	}
+
+	RETURN_ERROR(B_NO_MEMORY);
+}
+
+
+// Resets transaction and inode, so that the destructor neither frees the
+// allocated space nor deletes the Inode object.
+void InodeAllocator::Keep()
+{
+	fTransaction = NULL;
+	fInode = NULL;
+}
+
+
+//	#pragma mark -
+
+
+/**	Creates the in-memory inode for the given vnode ID.
+ *	Only the flags contained in INODE_PERMANENT_FLAGS are kept, and the
+ *	current size and modification time are remembered to help maintain
+ *	the indices.
+ *	The "reenter" argument is currently not used.
+ *	Call InitCheck() to find out whether the inode is usable.
+ */
+
+Inode::Inode(Volume *volume,vnode_id id,bool empty,uint8 reenter)
+	: CachedBlock(volume,volume->VnodeToBlock(id),empty),
+	fTree(NULL),
+	fLock("bfs inode")
+{
+	if (Node() == NULL) {
+		// There is no inode data; InitCheck() tests for that and will
+		// report it - here we just must not touch the data
+		fOldSize = 0;
+		fOldLastModified = 0;
+		return;
+	}
+
+	Node()->flags &= INODE_PERMANENT_FLAGS;
+
+	// these two will help to maintain the indices
+	fOldSize = Size();
+	fOldLastModified = Node()->last_modified_time;
+}
+
+
+// Deletes the B+tree object of the inode (fTree may be NULL).
+Inode::~Inode()
+{
+	delete fTree;
+}
+
+
+/**	Checks whether the inode is usable.
+ *	Returns B_IO_ERROR if there is no inode data at all, B_BAD_DATA if
+ *	the magic, the flags, the inode size, or the inode_num, parent, or
+ *	attributes block runs are not plausible for this volume, and the
+ *	status of the inode's lock otherwise.
+ */
+
+status_t 
+Inode::InitCheck()
+{
+	if (!Node())
+		RETURN_ERROR(B_IO_ERROR);
+
+	// test inode magic and flags
+	// NOTE(review): the allocation group and start comparisons below use
+	// ">", i.e. a value equal to the limit still passes - confirm whether
+	// ">=" is what is wanted
+	if (Node()->magic1 != INODE_MAGIC1
+		|| !(Node()->flags & INODE_IN_USE)
+		|| Node()->inode_num.length != 1
+		// matches inode size?
+		|| Node()->inode_size != fVolume->InodeSize()
+		// parent resides on disk?
+		|| Node()->parent.allocation_group > fVolume->AllocationGroups()
+		|| Node()->parent.allocation_group < 0
+		|| Node()->parent.start > (1L << fVolume->AllocationGroupShift())
+		|| Node()->parent.length != 1
+		// attributes, too?
+		|| Node()->attributes.allocation_group > fVolume->AllocationGroups()
+		|| Node()->attributes.allocation_group < 0
+		|| Node()->attributes.start > (1L << fVolume->AllocationGroupShift())) {
+		FATAL(("inode at block %Ld corrupt!\n",fBlockNumber));
+		RETURN_ERROR(B_BAD_DATA);
+	}
+	
+	// ToDo: Add some tests to check the integrity of the other stuff here,
+	// especially for the data_stream!
+
+	// it's more important to know that the inode is corrupt
+	// so we check for the lock not until here
+	return fLock.InitCheck();
+}
+
+
+/**	Checks whether the calling user (effective user and group ID) may
+ *	access the inode in the way described by "accessMode" (a combination
+ *	of R_OK, W_OK, and X_OK).
+ *	Returns B_READ_ONLY_DEVICE for write access on a read-only volume,
+ *	B_NOT_ALLOWED if the mode bits don't permit the access, and B_OK
+ *	otherwise.
+ */
+
+status_t 
+Inode::CheckPermissions(int accessMode) const
+{
+	uid_t user = geteuid();
+	gid_t group = getegid();
+
+	// you never have write access to a read-only volume
+	if ((accessMode & W_OK) != 0 && fVolume->IsReadOnly())
+		return B_READ_ONLY_DEVICE;
+
+	// root users always have full access - with the exception of
+	// executing something that doesn't have the user execute bit set
+	if (user == 0
+		&& ((accessMode & X_OK) == 0 || (Mode() & S_IXUSR) != 0))
+		return B_OK;
+
+	// move the permission bits that apply to the caller down to the
+	// lowest three bits, so that they line up with accessMode
+	mode_t permissions = Mode();
+	if (user == Node()->uid)
+		permissions >>= 6;
+	else if (group == Node()->gid)
+		permissions >>= 3;
+
+	// any requested bit that is not granted means no access
+	if ((accessMode & ~(permissions & S_IRWXO)) != 0)
+		return B_NOT_ALLOWED;
+
+	return B_OK;
+}
+
+
+//	#pragma mark -
+
+
+/**	Registers the iterator with this inode while holding the
+ *	fSmallDataLock. If the lock cannot be acquired, nothing is done.
+ */
+
+void 
+Inode::AddIterator(AttributeIterator *iterator)
+{
+	if (fSmallDataLock.Lock() >= B_OK) {
+		fIterators.Add(iterator);
+		fSmallDataLock.Unlock();
+	}
+}
+
+
+/**	Unregisters the iterator from this inode while holding the
+ *	fSmallDataLock. If the lock cannot be acquired, nothing is done.
+ */
+
+void 
+Inode::RemoveIterator(AttributeIterator *iterator)
+{
+	if (fSmallDataLock.Lock() >= B_OK) {
+		fIterators.Remove(iterator);
+		fSmallDataLock.Unlock();
+	}
+}
+
+
+/**	Tries to free up "bytes" space in the small_data section by moving
+ *	attributes to real files. Used for system attributes like the name.
+ *	The attribute called "name", and items whose name starts with
+ *	FILE_NAME_NAME are never moved.
+ *	Returns B_ERROR if no single item larger than the remaining amount
+ *	of bytes could be found, or the error of creating/writing the
+ *	attribute file.
+ *	You need to hold the fSmallDataLock when you call this method
+ */
+
+status_t
+Inode::MakeSpaceForSmallData(Transaction *transaction,const char *name,int32 bytes)
+{
+	while (bytes > 0) {
+		small_data *item = Node()->small_data_start,*max = NULL;
+		int32 index = 0,maxIndex = 0;
+		for (;!item->IsLast(Node());item = item->Next(),index++) {
+			// should not remove those
+			if (*item->Name() == FILE_NAME_NAME || !strcmp(name,item->Name()))
+				continue;
+
+			// keep track of the largest item seen so far
+			if (max == NULL || max->Size() < item->Size()) {
+				maxIndex = index;
+				max = item;
+			}
+
+			// remove the first one large enough to free the needed amount of bytes
+			if (bytes < item->Size())
+				break;
+		}
+
+		// the loop ran to the end without finding a large enough item
+		if (item->IsLast(Node()) || item->Size() < bytes)
+			return B_ERROR;
+
+		// NOTE(review): from here on, "max" and "item" are used side by
+		// side (size/removal vs. name/data); they appear to always be the
+		// same item when this point is reached - confirm
+		bytes -= max->Size();
+
+		// Move the attribute to a real attribute file
+		// Luckily, this doesn't cause any index updates
+
+		Inode *attribute;
+		status_t status = CreateAttribute(transaction,item->Name(),item->type,&attribute);
+		if (status < B_OK)
+			RETURN_ERROR(status);
+
+		size_t length = item->data_size;
+		status = attribute->WriteAt(transaction,0,item->Data(),&length);
+
+		ReleaseAttribute(attribute);
+
+		if (status < B_OK) {
+			// writing failed, try to get rid of the attribute file again
+			// NOTE(review): this removes "name", while the file was
+			// created as item->Name() - verify which one is meant
+			Vnode vnode(fVolume,Attributes());
+			Inode *attributes;
+			if (vnode.Get(&attributes) < B_OK
+				|| attributes->Remove(transaction,name) < B_OK) {
+				FATAL(("Could not remove newly created attribute!\n"));
+			}
+
+			RETURN_ERROR(status);
+		}
+
+		RemoveSmallData(max,maxIndex);
+	}
+	return B_OK;
+}
+
+
+/**	Private function which removes the given attribute from the small_data
+ *	section: the items following it are moved down, and the space that
+ *	became free is cleared. All registered attribute iterators are
+ *	notified via Update(index,-1).
+ *	Returns B_BAD_DATA if the size of the following items is not
+ *	plausible, B_OK otherwise.
+ *	You need to hold the fSmallDataLock when you call this method
+ */
+
+status_t 
+Inode::RemoveSmallData(small_data *item,int32 index)
+{
+	small_data *next = item->Next();
+	if (!next->IsLast(Node())) {
+		// find the last attribute
+		small_data *last = next;
+		while (!last->IsLast(Node()))
+			last = last->Next();
+
+		// number of bytes used by the items after "item"; must be
+		// non-negative and must not extend beyond the block
+		int32 size = (uint8 *)last - (uint8 *)next;
+		if (size < 0 || size > (uint8 *)Node() + fVolume->BlockSize() - (uint8 *)next)
+			return B_BAD_DATA;
+
+		memmove(item,next,size);
+
+		// Move the "last" one to its new location and
+		// correctly terminate the small_data section
+		last = (small_data *)((uint8 *)last - ((uint8 *)next - (uint8 *)item));
+		memset(last,0,(uint8 *)Node() + fVolume->BlockSize() - (uint8 *)last);
+	} else
+		memset(item,0,item->Size());
+			// "item" is the only one left after it, just clear it
+
+	// update all current iterators
+	AttributeIterator *iterator = NULL;
+	while ((iterator = fIterators.Next(iterator)) != NULL)
+		iterator->Update(index,-1);
+
+	return B_OK;
+}
+
+
+/**	Removes the attribute with the given name from the small_data
+ *	section; the fSmallDataLock is acquired for the duration of the call.
+ *	Returns B_BAD_VALUE if "name" is NULL, and B_ENTRY_NOT_FOUND if there
+ *	is no such attribute in the small_data section.
+ *	Note that you need to write back the inode yourself after having
+ *	called this method.
+ */
+
+status_t
+Inode::RemoveSmallData(Transaction *transaction,const char *name)
+{
+	if (name == NULL)
+		return B_BAD_VALUE;
+
+	SimpleLocker locker(fSmallDataLock);
+
+	// walk the small_data items until the name matches
+	int32 position = 0;
+	for (small_data *entry = Node()->small_data_start;!entry->IsLast(Node());
+			entry = entry->Next(),position++) {
+		if (!strcmp(entry->Name(),name))
+			return RemoveSmallData(entry,position);
+	}
+
+	return B_ENTRY_NOT_FOUND;
+}
+
+
+/**	Try to place the given attribute in the small_data section - if the
+ *	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
+ *	In that case, the attribute should be written to a real attribute file;
+ *	if the attribute was already part of the small_data section, but the new
+ *	one wouldn't fit, the old one is automatically removed from the small_data
+ *	section.
+ *	If "force" is true, MakeSpaceForSmallData() is called to make room for
+ *	the attribute when it would not fit otherwise; B_ERROR is returned if
+ *	that fails.
+ *	Returns B_BAD_VALUE if "name" or "data" is NULL, or "type" is 0.
+ *	The fSmallDataLock is acquired for the duration of the call.
+ *	Note that you need to write back the inode yourself after having called that
+ *	method - it's a bad API decision that it needs a transaction but enforces you
+ *	to write back the inode all by yourself, but it's just more efficient in most
+ *	cases...
+ */
+
+status_t
+Inode::AddSmallData(Transaction *transaction,const char *name,uint32 type,const uint8 *data,size_t length,bool force)
+{
+	if (name == NULL || data == NULL || type == 0)
+		return B_BAD_VALUE;
+
+	// reject any requests that can't fit into the small_data section
+	uint32 nameLength = strlen(name);
+	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + length + 1;
+	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
+		return B_DEVICE_FULL;
+
+	SimpleLocker locker(fSmallDataLock);
+
+	// look for an item with that name; "index" counts the items passed
+	small_data *item = Node()->small_data_start;
+	int32 index = 0;
+	while (!item->IsLast(Node()) && strcmp(item->Name(),name)) {
+		item = item->Next();
+		index++;
+	}
+
+	// is the attribute already in the small_data section?
+	// then just replace the data part of that one
+	if (!item->IsLast(Node())) {
+		// find last attribute
+		small_data *last = item;
+		while (!last->IsLast(Node()))
+			last = last->Next();
+
+		// try to change the attributes value
+		if (item->data_size > length
+			|| force
+			|| ((uint8 *)last + length - item->data_size) <= ((uint8 *)Node() + fVolume->InodeSize())) {
+			// make room for the new attribute if needed (and we are forced to do so)
+			if (force
+				&& ((uint8 *)last + length - item->data_size) > ((uint8 *)Node() + fVolume->InodeSize())) {
+				// We also take the free space at the end of the small_data section
+				// into account, and request only what's really needed
+				uint32 needed = length - item->data_size -
+						(uint32)((uint8 *)Node() + fVolume->InodeSize() - (uint8 *)last);
+
+				if (MakeSpaceForSmallData(transaction,name,needed) < B_OK)
+					return B_ERROR;
+				
+				// reset our pointers
+				item = Node()->small_data_start;
+				index = 0;
+				while (!item->IsLast(Node()) && strcmp(item->Name(),name)) {
+					item = item->Next();
+					index++;
+				}
+
+				last = item;
+				while (!last->IsLast(Node()))
+					last = last->Next();
+			}
+
+			// move the attributes after the current one
+			small_data *next = item->Next();
+			if (!next->IsLast(Node()))
+				memmove((uint8 *)item + spaceNeeded,next,(uint8 *)last - (uint8 *)next);
+
+			// Move the "last" one to its new location and
+			// correctly terminate the small_data section
+			last = (small_data *)((uint8 *)last - ((uint8 *)next - ((uint8 *)item + spaceNeeded)));
+			if ((uint8 *)last < (uint8 *)Node() + fVolume->BlockSize())
+				memset(last,0,(uint8 *)Node() + fVolume->BlockSize() - (uint8 *)last);
+
+			item->type = type;
+			item->data_size = length;
+			memcpy(item->Data(),data,length);
+			item->Data()[length] = '\0';
+
+			// the number of items did not change, so the iterators are
+			// not notified on this path
+			return B_OK;
+		}
+
+		// Could not replace the old attribute, so remove it to let
+		// let the calling function create an attribute file for it
+		if (RemoveSmallData(item,index) < B_OK)
+			return B_ERROR;
+
+		return B_DEVICE_FULL;
+	}
+
+	// try to add the new attribute!
+	// ("item" points to the terminating entry of the section here)
+
+	if ((uint8 *)item + spaceNeeded > (uint8 *)Node() + fVolume->InodeSize()) {
+		// there is not enough space for it!
+		if (!force)
+			return B_DEVICE_FULL;
+
+		// make room for the new attribute
+		if (MakeSpaceForSmallData(transaction,name,spaceNeeded) < B_OK)
+			return B_ERROR;
+
+		// get new last item!
+		item = Node()->small_data_start;
+		index = 0;
+		while (!item->IsLast(Node())) {
+			item = item->Next();
+			index++;
+		}
+	}
+
+	memset(item,0,spaceNeeded);
+	item->type = type;
+	item->name_size = nameLength;
+	item->data_size = length;
+	strcpy(item->Name(),name);
+	memcpy(item->Data(),data,length);
+
+	// correctly terminate the small_data section
+	item = item->Next();
+	if (!item->IsLast(Node()))
+		memset(item,0,(uint8 *)Node() + fVolume->InodeSize() - (uint8 *)item);
+
+	// update all current iterators
+	AttributeIterator *iterator = NULL;
+	while ((iterator = fIterators.Next(iterator)) != NULL)
+		iterator->Update(index,1);
+
+	return B_OK;
+}
+
+
+/**	Iterates through the small_data section of an inode.
+ *	To start at the beginning of the section, let "smallData" point to
+ *	a NULL pointer, like:
+ *		small_data *data = NULL;
+ *		while (inode->GetNextSmallData(&data) == B_OK) { ... }
+ *
+ *	Returns B_OK and stores the next item in "smallData", or
+ *	B_ENTRY_NOT_FOUND (leaving "smallData" untouched) once the
+ *	terminating item has been reached.
+ *	This function is reentrant and doesn't allocate any memory;
+ *	you can safely stop calling it at any point (you don't need
+ *	to iterate through the whole list).
+ *	You need to hold the fSmallDataLock when you call this method
+ */
+
+status_t
+Inode::GetNextSmallData(small_data **smallData) const
+{
+	if (Node() == NULL)
+		RETURN_ERROR(B_ERROR);
+
+	// a NULL cursor starts the iteration, anything else is advanced by one
+	small_data *next;
+	if (*smallData != NULL)
+		next = (*smallData)->Next();
+	else
+		next = Node()->small_data_start;
+
+	// the terminating item is never handed out to the caller
+	if (next->IsLast(Node()))
+		return B_ENTRY_NOT_FOUND;
+
+	*smallData = next;
+	return B_OK;
+}
+
+
+/**	Looks up the attribute "name" in the small_data section by walking
+ *	it with GetNextSmallData(). Returns a pointer to the matching item,
+ *	or NULL if there is no item with that name.
+ *	You need to hold the fSmallDataLock when you call this method
+ */
+
+small_data *
+Inode::FindSmallData(const char *name) const
+{
+	for (small_data *item = NULL; GetNextSmallData(&item) == B_OK;) {
+		if (strcmp(item->Name(),name) == 0)
+			return item;
+	}
+
+	// ran off the end of the section without a match
+	return NULL;
+}
+
+
+/**	Returns the file name stored in the small_data section, or NULL if
+ *	the section has no name item. The name item is recognized by its
+ *	name length (FILE_NAME_NAME_LENGTH) and the first character of its
+ *	name (FILE_NAME_NAME).
+ *	The returned pointer refers directly into the small_data section;
+ *	fSmallDataLock is only held for the duration of the search.
+ */
+
+const char *
+Inode::Name() const
+{
+	SimpleLocker locker(fSmallDataLock);
+
+	for (small_data *item = NULL; GetNextSmallData(&item) == B_OK;) {
+		const bool isNameItem = item->name_size == FILE_NAME_NAME_LENGTH
+			&& *item->Name() == FILE_NAME_NAME;
+		if (isNameItem)
+			return (const char *)item->Data();
+	}
+	return NULL;
+}
+
+
+/**	Changes or sets the name of a file - but only in the small_data
+ *	section of the inode; the entry in the parent directory's b+tree is
+ *	not touched.
+ *	Returns B_BAD_VALUE if "name" is NULL or empty, otherwise whatever
+ *	AddSmallData() returns.
+ *	Note that you need to write back the inode yourself after having called
+ *	that method. It suffers from the same API decision as AddSmallData() does
+ *	(and for the same reason).
+ */
+
+status_t
+Inode::SetName(Transaction *transaction,const char *name)
+{
+	if (name == NULL || name[0] == '\0')
+		return B_BAD_VALUE;
+
+	// the file name is kept under a special one-character attribute name
+	const char nameTag[2] = {FILE_NAME_NAME, 0};
+	const size_t nameLength = strlen(name);
+
+	// NOTE(review): the trailing "true" is presumably AddSmallData()'s
+	// "force" flag (make room for the item if needed) - confirm
+	return AddSmallData(transaction,nameTag,FILE_NAME_TYPE,(uint8 *)name,nameLength,true);
+}
+
+
+/**	Reads data from the specified attribute.
+ *	This is a high-level attribute function that understands attributes
+ *	in the small_data section as well as real attribute files.
+ *
+ *	The small_data section is searched first; if the attribute is not
+ *	found there, the attribute directory is consulted.
+ *	A negative "pos" is treated as 0. On entry, "_length" holds the size
+ *	of "buffer", on return the number of bytes copied into it. The "type"
+ *	argument is currently not evaluated.
+ */
+
+status_t
+Inode::ReadAttribute(const char *name,int32 type,off_t pos,uint8 *buffer,size_t *_length)
+{
+	if (pos < 0)
+		pos = 0;
+
+	// first try the small_data section (which has to be locked for this)
+	{
+		SimpleLocker locker(fSmallDataLock);
+
+		small_data *item = FindSmallData(name);
+		if (item != NULL) {
+			// reading at or behind the end yields zero bytes, but no error
+			size_t bytes = 0;
+			if (pos < item->data_size) {
+				bytes = *_length;
+				if (bytes + pos > item->data_size)
+					bytes = item->data_size - pos;
+
+				memcpy(buffer,item->Data() + pos,bytes);
+			}
+			*_length = bytes;
+			return B_OK;
+		}
+	}
+
+	// not in the small_data section - look for an attribute file
+	Inode *attribute;
+	status_t status = GetAttribute(name,&attribute);
+	if (status != B_OK)
+		RETURN_ERROR(status);
+
+	if (attribute->Lock().Lock() == B_OK) {
+		status = attribute->ReadAt(pos,(uint8 *)buffer,_length);
+		attribute->Lock().Unlock();
+	} else
+		status = B_ERROR;
+
+	ReleaseAttribute(attribute);
+
+	RETURN_ERROR(status);
+}
+
+
+/**	Writes data to the specified attribute.
+ *	This is a high-level attribute function that understands attributes
+ *	in the small_data section as well as real attribute files.
+ *
+ *	If there is no attribute file called "name", the data is put into the
+ *	small_data section; only if AddSmallData() reports B_DEVICE_FULL, an
+ *	attribute file is created and written to instead.
+ *	Note that "pos" is only passed on to attribute files - the small_data
+ *	path always stores the buffer as the complete value.
+ *	If there is an index for "name" and "pos" is 0, the index is updated
+ *	with (at most) the first BPLUSTREE_MAX_KEY_LENGTH bytes of the old and
+ *	the new data.
+ */
+
+status_t
+Inode::WriteAttribute(Transaction *transaction,const char *name,int32 type,off_t pos,const uint8 *buffer,size_t *_length)
+{
+	// needed to maintain the index
+	uint8 oldBuffer[BPLUSTREE_MAX_KEY_LENGTH],*oldData = NULL;
+	size_t oldLength = 0;
+
+	Index index(fVolume);
+	bool hasIndex = index.SetTo(name) == B_OK;
+
+	Inode *attribute = NULL;
+	status_t status;
+	if (GetAttribute(name,&attribute) < B_OK) {
+		// save the old attribute data
+		if (hasIndex) {
+			fSmallDataLock.Lock();
+
+			small_data *smallData = FindSmallData(name);
+			if (smallData != NULL) {
+				oldLength = smallData->data_size;
+				if (oldLength > BPLUSTREE_MAX_KEY_LENGTH)
+					oldLength = BPLUSTREE_MAX_KEY_LENGTH;
+				memcpy(oldData = oldBuffer,smallData->Data(),oldLength);
+			}
+			fSmallDataLock.Unlock();
+		}
+
+		// if the attribute doesn't exist yet (as a file), try to put it in the
+		// small_data section first - if that fails (due to insufficient space),
+		// create a real attribute file
+		status = AddSmallData(transaction,name,type,buffer,*_length);
+		if (status == B_DEVICE_FULL) {
+			status = CreateAttribute(transaction,name,type,&attribute);
+			if (status < B_OK)
+				RETURN_ERROR(status);
+		} else if (status == B_OK)
+			status = WriteBack(transaction);
+	}
+
+	if (attribute != NULL) {
+		if (attribute->Lock().LockWrite() == B_OK) {
+			// save the old attribute data (if this fails, oldLength will reflect it)
+			if (hasIndex) {
+				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
+				if (attribute->ReadAt(0,oldBuffer,&oldLength) == B_OK)
+					oldData = oldBuffer;
+			}
+			status = attribute->WriteAt(transaction,pos,buffer,_length);
+
+			attribute->Lock().UnlockWrite();
+		} else
+			status = B_ERROR;
+
+		ReleaseAttribute(attribute);
+	}
+
+	if (status == B_OK) {
+		// ToDo: find a better way for that "pos" thing...
+		// Update index
+		if (hasIndex && pos == 0) {
+			// Index only the first BPLUSTREE_MAX_KEY_LENGTH bytes.
+			// The clamp has to happen before the value is narrowed to 16
+			// bits: otherwise lengths of 64 kB and more would wrap around
+			// and a wrong key length would end up in the index.
+			size_t length = *_length;
+			if (length > BPLUSTREE_MAX_KEY_LENGTH)
+				length = BPLUSTREE_MAX_KEY_LENGTH;
+
+			index.Update(transaction,name,0,oldData,oldLength,buffer,(uint16)length,this);
+		}
+	}
+	return status;
+}
+
+
+/**	Removes the specified attribute from the inode.
+ *	This is a high-level attribute function that understands attributes
+ *	in the small_data section as well as real attribute files.
+ *
+ *	If there is an index for "name", the attribute's current value (at
+ *	most BPLUSTREE_MAX_KEY_LENGTH bytes of it) is taken out of the index.
+ *	When the last attribute file is removed, the attribute directory is
+ *	removed as well; a failure to do so is not reported.
+ */
+
+status_t
+Inode::RemoveAttribute(Transaction *transaction,const char *name)
+{
+	Index index(fVolume);
+	const bool hasIndex = index.SetTo(name) == B_OK;
+
+	// update index for attributes in the small_data section
+	if (hasIndex) {
+		fSmallDataLock.Lock();
+
+		small_data *item = FindSmallData(name);
+		if (item != NULL) {
+			uint32 keyLength = item->data_size;
+			if (keyLength > BPLUSTREE_MAX_KEY_LENGTH)
+				keyLength = BPLUSTREE_MAX_KEY_LENGTH;
+			index.Update(transaction,name,0,item->Data(),keyLength,NULL,0,this);
+		}
+		fSmallDataLock.Unlock();
+	}
+
+	status_t status = RemoveSmallData(transaction,name);
+	if (status == B_OK)
+		return WriteBack(transaction);
+
+	// anything but "not found" is a real error, and without an attribute
+	// directory there is nowhere else to look
+	if (status != B_ENTRY_NOT_FOUND || Attributes().IsZero())
+		return status;
+
+	// remove the attribute file if it exists
+	Vnode vnode(fVolume,Attributes());
+	Inode *attributes;
+	status = vnode.Get(&attributes);
+	if (status < B_OK)
+		return status;
+
+	// update index
+	Inode *attribute;
+	if (hasIndex && GetAttribute(name,&attribute) == B_OK) {
+		uint8 key[BPLUSTREE_MAX_KEY_LENGTH];
+		size_t keyLength = BPLUSTREE_MAX_KEY_LENGTH;
+		if (attribute->ReadAt(0,key,&keyLength) == B_OK)
+			index.Update(transaction,name,0,key,keyLength,NULL,0,this);
+
+		ReleaseAttribute(attribute);
+	}
+
+	status = attributes->Remove(transaction,name);
+	if (status < B_OK)
+		return status;
+
+	// remove attribute directory (don't fail if that can't be done)
+	if (!attributes->IsEmpty())
+		return status;
+	if (remove_vnode(fVolume->ID(),attributes->ID()) != B_OK)
+		return status;
+
+	// update the inode, so that no one will ever doubt it's deleted :-)
+	attributes->Node()->flags |= INODE_DELETED;
+	if (attributes->WriteBack(transaction) == B_OK) {
+		Attributes().SetTo(0,0,0);
+		WriteBack(transaction);
+	} else
+		unremove_vnode(fVolume->ID(),attributes->ID());
+
+	return status;
+}
+
+
+/**	Looks up the attribute file "name" in the attribute directory of this
+ *	inode, and acquires its vnode via get_vnode().
+ *	On success, "attribute" points to the attribute's inode, which has to
+ *	be given back with ReleaseAttribute().
+ *	Returns B_ENTRY_NOT_FOUND if the inode has no attribute directory.
+ */
+
+status_t
+Inode::GetAttribute(const char *name,Inode **attribute)
+{
+	// does this inode even have attributes?
+	if (Attributes().IsZero())
+		return B_ENTRY_NOT_FOUND;
+
+	Vnode vnode(fVolume,Attributes());
+	Inode *directory;
+	if (vnode.Get(&directory) < B_OK) {
+		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",name));
+		return B_ERROR;
+	}
+
+	BPlusTree *tree;
+	status_t status = directory->GetTree(&tree);
+	if (status != B_OK)
+		return status;
+
+	// the b+tree maps the attribute's name to its vnode ID
+	vnode_id id;
+	status = tree->Find((uint8 *)name,(uint16)strlen(name),&id);
+	if (status != B_OK)
+		return status;
+
+	return get_vnode(fVolume->ID(),id,(void **)attribute);
+}
+
+
+/**	Gives back an attribute inode via put_vnode(); passing NULL is
+ *	allowed and does nothing.
+ */
+
+void
+Inode::ReleaseAttribute(Inode *attribute)
+{
+	if (attribute != NULL)
+		put_vnode(fVolume->ID(),attribute->ID());
+}
+
+
+/**	Creates the attribute file "name" with the given "type" in the
+ *	attribute directory of this inode; the directory itself is created
+ *	first if the inode doesn't have one yet.
+ *	On success, "attribute" is set by Inode::Create().
+ */
+
+status_t
+Inode::CreateAttribute(Transaction *transaction,const char *name,uint32 type,Inode **attribute)
+{
+	// the attribute directory only comes into existence on demand
+	if (Attributes().IsZero()) {
+		status_t status = Inode::Create(transaction,this,NULL,S_ATTR_DIR | 0666,0,0,NULL);
+		if (status < B_OK)
+			RETURN_ERROR(status);
+	}
+
+	Vnode directoryVnode(fVolume,Attributes());
+	Inode *directory;
+	if (directoryVnode.Get(&directory) < B_OK)
+		return B_ERROR;
+
+	// Inode::Create() locks the inode if we provide the "id" parameter
+	vnode_id attributeID;
+	status_t status = Inode::Create(transaction,directory,name,S_ATTR | 0666,0,type,&attributeID,attribute);
+	return status;
+}
+
+
+//	#pragma mark -
+
+
+/**	Gives the caller direct access to the b+tree for a given directory.
+ *	The tree is created on demand, but lasts until the inode is
+ *	deleted.
+ *
+ *	Returns B_BAD_VALUE if the inode is not a directory, B_NO_MEMORY if
+ *	the tree object could not be allocated, or the error InitCheck()
+ *	reported for a freshly created tree. If InitCheck() fails, the tree
+ *	is deleted again, and "tree" is set to NULL.
+ */
+
+status_t
+Inode::GetTree(BPlusTree **tree)
+{
+	if (fTree) {
+		*tree = fTree;
+		return B_OK;
+	}
+
+	if (!IsDirectory())
+		RETURN_ERROR(B_BAD_VALUE);
+
+	fTree = new BPlusTree(this);
+	if (!fTree)
+		RETURN_ERROR(B_NO_MEMORY);
+
+	status_t status = fTree->InitCheck();
+	if (status < B_OK) {
+		delete fTree;
+		fTree = NULL;
+	}
+
+	// Only publish the tree after the check: this way, the caller never
+	// ends up with a pointer to the tree that has just been deleted
+	*tree = fTree;
+
+	RETURN_ERROR(status);
+}
+
+
+/**	Tells whether the directory's b+tree contains any entries worth
+ *	mentioning: in attribute and index directories every entry counts,
+ *	while in all other directories the entries "." and ".." are ignored.
+ */
+
+bool
+Inode::IsEmpty()
+{
+	BPlusTree *tree;
+	if (GetTree(&tree) < B_OK) {
+		// NOTE(review): a (non-zero) error code returned from a bool
+		// function converts to "true", which is what is spelled out here;
+		// confirm that "empty" is the intended answer if there is no tree
+		return true;
+	}
+
+	TreeIterator iterator(tree);
+
+	// index and attribute directories are really empty when they are
+	// empty - directories for standard files always contain ".", and
+	// "..", so we need to ignore those two
+
+	char name[BPLUSTREE_MAX_KEY_LENGTH];
+	uint16 length;
+	vnode_id id;
+	uint32 entries = 0;
+
+	while (iterator.GetNextEntry(name,&length,B_FILE_NAME_LENGTH,&id) == B_OK) {
+		if ((Mode() & (S_ATTR_DIR | S_INDEX_DIR)) != 0)
+			return false;
+
+		// more than two entries cannot be just "." and ".."
+		if (++entries > 2)
+			return false;
+
+		if (strcmp(".",name) != 0 && strcmp("..",name) != 0)
+			return false;
+	}
+	return true;
+}
+
+
+/** Finds the block_run where "pos" is located in the data_stream of
+ *	the inode.
+ *	If successful, "offset" will then be set to the file offset
+ *	of the block_run returned; so "pos - offset" is for the block_run
+ *	what "pos" is for the whole stream.
+ *
+ *	The direct, indirect, and double indirect ranges of the stream are
+ *	searched depending on where "pos" lies. If a run was found, the
+ *	result of Volume::IsValidBlockRun() for it is returned.
+ *	B_ENTRY_NOT_FOUND is returned if "pos" is not covered by the direct
+ *	runs, B_ERROR or B_IO_ERROR if it could not be located in the
+ *	indirect ranges (or a block could not be retrieved).
+ */
+
+status_t
+Inode::FindBlockRun(off_t pos,block_run &run,off_t &offset)
+{
+	data_stream *data = &Node()->data;
+
+	// Inode::ReadAt() already does this
+	//if (pos > data->size)
+	//	return B_ENTRY_NOT_FOUND;
+
+	// find matching block run
+
+	if (data->max_direct_range > 0 && pos >= data->max_direct_range) {
+		if (data->max_double_indirect_range > 0 && pos >= data->max_indirect_range) {
+			// access to double indirect blocks
+
+			CachedBlock cached(fVolume);
+
+			// "indirectSize" is the number of bytes covered by one entry of
+			// the double indirect array, "directSize" the number of bytes
+			// covered by one entry of the arrays those entries point to
+			off_t start = pos - data->max_indirect_range;
+			int32 indirectSize = (16 << fVolume->BlockShift()) * (fVolume->BlockSize() / sizeof(block_run));
+			int32 directSize = 4 << fVolume->BlockShift();
+			int32 index = start / indirectSize;
+			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
+
+			// first level: the block of the double indirect run that
+			// contains entry "index"
+			block_run *indirect = (block_run *)cached.SetTo(
+					fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
+			if (indirect == NULL)
+				RETURN_ERROR(B_ERROR);
+
+			int32 current = (start % indirectSize) / directSize;
+
+			// second level: the block of that array which contains
+			// entry "current"
+			indirect = (block_run *)cached.SetTo(
+					fVolume->ToBlock(indirect[index % runsPerBlock]) + current / runsPerBlock);
+			if (indirect == NULL)
+				RETURN_ERROR(B_ERROR);
+
+			run = indirect[current % runsPerBlock];
+
+			// The products have to be computed as off_t: in 32 bits,
+			// "index * indirectSize" overflows as soon as the position is
+			// 2 GB or more into the double indirect range
+			offset = data->max_indirect_range + ((off_t)index * indirectSize)
+				+ ((off_t)current * directSize);
+		} else {
+			// access to indirect blocks
+
+			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
+			off_t runBlockEnd = data->max_direct_range;
+
+			CachedBlock cached(fVolume);
+			off_t block = fVolume->ToBlock(data->indirect);
+
+			for (int32 i = 0;i < data->indirect.length;i++) {
+				block_run *indirect = (block_run *)cached.SetTo(block + i);
+				if (indirect == NULL)
+					RETURN_ERROR(B_IO_ERROR);
+
+				// add up the sizes of the runs until the one containing
+				// "pos" is reached
+				int32 current = -1;
+				while (++current < runsPerBlock) {
+					if (indirect[current].IsZero())
+						break;
+
+					runBlockEnd += indirect[current].length << fVolume->BlockShift();
+					if (runBlockEnd > pos) {
+						run = indirect[current];
+						offset = runBlockEnd - (run.length << fVolume->BlockShift());
+						return fVolume->IsValidBlockRun(run);
+					}
+				}
+			}
+			RETURN_ERROR(B_ERROR);
+		}
+	} else {
+		// access from direct blocks
+
+		off_t runBlockEnd = 0LL;
+		int32 current = -1;
+
+		while (++current < NUM_DIRECT_BLOCKS) {
+			if (data->direct[current].IsZero())
+				break;
+
+			runBlockEnd += data->direct[current].length << fVolume->BlockShift();
+			if (runBlockEnd > pos) {
+				run = data->direct[current];
+				offset = runBlockEnd - (run.length << fVolume->BlockShift());
+				return fVolume->IsValidBlockRun(run);
+			}
+		}
+		//PRINT(("FindBlockRun() failed in direct range: size = %Ld, pos = %Ld\n",data->size,pos));
+		return B_ENTRY_NOT_FOUND;
+	}
+	return fVolume->IsValidBlockRun(run);
+}
+
+
+/**	Reads up to "*_length" bytes from the data stream of the inode into
+ *	"buffer", starting at file position "pos".
+ *	A negative "pos" is treated as 0; reading at or behind the end of the
+ *	stream succeeds with a length of zero, and reads crossing the end are
+ *	shortened accordingly.
+ *	On return, "_length" contains the number of bytes that have actually
+ *	been read. Failures to find or retrieve a block are reported as
+ *	B_BAD_VALUE.
+ */
+
+status_t
+Inode::ReadAt(off_t pos, uint8 *buffer, size_t *_length)
+{
+	// set/check boundaries for pos/length
+
+	if (pos < 0)
+		pos = 0;
+
+	// This must not be an "else" branch of the check above: a clamped
+	// position has to be tested against the end of the stream as well
+	// (or else reading from an empty stream would fail below)
+	if (pos >= Node()->data.size) {
+		*_length = 0;
+		return B_NO_ERROR;
+	}
+
+	size_t length = *_length;
+
+	if (pos + length > Node()->data.size)
+		length = Node()->data.size - pos;
+
+	block_run run;
+	off_t offset;
+	if (FindBlockRun(pos,run,offset) < B_OK) {
+		*_length = 0;
+		RETURN_ERROR(B_BAD_VALUE);
+	}
+
+	uint32 bytesRead = 0;
+	uint32 blockSize = fVolume->BlockSize();
+	uint32 blockShift = fVolume->BlockShift();
+	uint8 *block;
+
+	// the first block_run we read could not be aligned to the block_size boundary
+	// (read partial block at the beginning)
+
+	// pos % block_size == (pos - offset) % block_size, offset % block_size == 0
+	if (pos % blockSize != 0) {
+		// let the run start at the block that contains "pos"
+		run.start += (pos - offset) / blockSize;
+		run.length -= (pos - offset) / blockSize;
+
+		CachedBlock cached(fVolume,run);
+		if ((block = cached.Block()) == NULL) {
+			*_length = 0;
+			RETURN_ERROR(B_BAD_VALUE);
+		}
+
+		bytesRead = blockSize - (pos % blockSize);
+		if (length < bytesRead)
+			bytesRead = length;
+
+		memcpy(buffer,block + (pos % blockSize),bytesRead);
+		pos += bytesRead;
+
+		length -= bytesRead;
+		if (length == 0) {
+			*_length = bytesRead;
+			return B_OK;
+		}
+
+		if (FindBlockRun(pos,run,offset) < B_OK) {
+			*_length = bytesRead;
+			RETURN_ERROR(B_BAD_VALUE);
+		}
+	}
+
+	// the first block_run is already filled in at this point
+	// read the following complete blocks using cached_read(),
+	// the last partial block is read using the CachedBlock class
+
+	bool partial = false;
+
+	while (length > 0) {
+		// offset is the offset to the current pos in the block_run
+		run.start += (pos - offset) >> blockShift;
+		run.length -= (pos - offset) >> blockShift;
+
+		if ((run.length << blockShift) > length) {
+			if (length < blockSize) {
+				// less than a block is left: copy it out of the cache
+				CachedBlock cached(fVolume,run);
+				if ((block = cached.Block()) == NULL) {
+					*_length = bytesRead;
+					RETURN_ERROR(B_BAD_VALUE);
+				}
+				memcpy(buffer + bytesRead,block,length);
+				bytesRead += length;
+				break;
+			}
+			// only read the complete blocks now, the remainder is
+			// handled by the next iteration
+			run.length = length >> blockShift;
+			partial = true;
+		}
+
+		if (cached_read(fVolume->Device(),fVolume->ToBlock(run),buffer + bytesRead,
+						run.length,blockSize) != B_OK) {
+			*_length = bytesRead;
+			RETURN_ERROR(B_BAD_VALUE);
+		}
+
+		int32 bytes = run.length << blockShift;
+		length -= bytes;
+		bytesRead += bytes;
+		if (length == 0)
+			break;
+
+		pos += bytes;
+
+		if (partial) {
+			// if the last block was read only partially, point block_run
+			// to the remaining part
+			run.start += run.length;
+			run.length = 1;
+			offset = pos;
+		} else if (FindBlockRun(pos,run,offset) < B_OK) {
+			*_length = bytesRead;
+			RETURN_ERROR(B_BAD_VALUE);
+		}
+	}
+
+	*_length = bytesRead;
+	return B_NO_ERROR;
+}
+
+
+/**	Writes "*_length" bytes from "buffer" to the data stream of the inode,
+ *	starting at file position "pos" (a negative "pos" is treated as 0).
+ *	If the write ends behind the current size of the stream, the stream is
+ *	grown using SetFileSize() first.
+ *	If the inode has the INODE_LOGGED flag set, the blocks are written
+ *	through the transaction, otherwise directly via Volume::WriteBlocks().
+ *	On return, "_length" contains the number of bytes that have been
+ *	processed. Failures to find, retrieve, or write a block are reported
+ *	as B_BAD_VALUE.
+ */
+
+status_t
+Inode::WriteAt(Transaction *transaction,off_t pos,const uint8 *buffer,size_t *_length)
+{
+	size_t length = *_length;
+
+	// set/check boundaries for pos/length
+	if (pos < 0)
+		pos = 0;
+
+	// This must not be an "else" branch of the check above: a write at a
+	// clamped position may have to grow the stream like any other write
+	if (pos + length > Node()->data.size) {
+		off_t oldSize = Size();
+
+		// the transaction doesn't have to be started already
+		if ((Flags() & INODE_NO_TRANSACTION) == 0)
+			transaction->Start(fVolume,BlockNumber());
+
+		// let's grow the data stream to the size needed
+		status_t status = SetFileSize(transaction,pos + length);
+		if (status < B_OK) {
+			*_length = 0;
+			RETURN_ERROR(status);
+		}
+		// If the position of the write was beyond the file size, we
+		// have to fill the gap between that position and the old file
+		// size with zeros.
+		FillGapWithZeros(oldSize,pos);
+	}
+
+	block_run run;
+	off_t offset;
+	if (FindBlockRun(pos,run,offset) < B_OK) {
+		*_length = 0;
+		RETURN_ERROR(B_BAD_VALUE);
+	}
+
+	bool logStream = (Flags() & INODE_LOGGED) == INODE_LOGGED;
+	if (logStream)
+		transaction->Start(fVolume,BlockNumber());
+
+	uint32 bytesWritten = 0;
+	uint32 blockSize = fVolume->BlockSize();
+	uint32 blockShift = fVolume->BlockShift();
+	uint8 *block;
+
+	// the first block_run we write could not be aligned to the block_size boundary
+	// (write partial block at the beginning)
+
+	// pos % block_size == (pos - offset) % block_size, offset % block_size == 0
+	if (pos % blockSize != 0) {
+		// let the run start at the block that contains "pos"
+		run.start += (pos - offset) / blockSize;
+		run.length -= (pos - offset) / blockSize;
+
+		CachedBlock cached(fVolume,run);
+		if ((block = cached.Block()) == NULL) {
+			*_length = 0;
+			RETURN_ERROR(B_BAD_VALUE);
+		}
+
+		bytesWritten = blockSize - (pos % blockSize);
+		if (length < bytesWritten)
+			bytesWritten = length;
+
+		memcpy(block + (pos % blockSize),buffer,bytesWritten);
+
+		// either log the stream or write it directly to disk
+		if (logStream)
+			cached.WriteBack(transaction);
+		else
+			fVolume->WriteBlocks(cached.BlockNumber(),block,1);
+
+		pos += bytesWritten;
+
+		length -= bytesWritten;
+		if (length == 0) {
+			*_length = bytesWritten;
+			return B_OK;
+		}
+
+		if (FindBlockRun(pos,run,offset) < B_OK) {
+			*_length = bytesWritten;
+			RETURN_ERROR(B_BAD_VALUE);
+		}
+	}
+
+	// the first block_run is already filled in at this point
+	// write the following complete blocks using Volume::WriteBlocks(),
+	// the last partial block is written using the CachedBlock class
+
+	bool partial = false;
+
+	while (length > 0) {
+		// offset is the offset to the current pos in the block_run
+		run.start += (pos - offset) >> blockShift;
+		run.length -= (pos - offset) >> blockShift;
+
+		if ((run.length << blockShift) > length) {
+			if (length < blockSize) {
+				// less than a block is left: patch it in the cached block
+				CachedBlock cached(fVolume,run);
+				if ((block = cached.Block()) == NULL) {
+					*_length = bytesWritten;
+					RETURN_ERROR(B_BAD_VALUE);
+				}
+				memcpy(block,buffer + bytesWritten,length);
+
+				if (logStream)
+					cached.WriteBack(transaction);
+				else
+					fVolume->WriteBlocks(cached.BlockNumber(),block,1);
+
+				bytesWritten += length;
+				break;
+			}
+			// only write the complete blocks now, the remainder is
+			// handled by the next iteration
+			run.length = length >> blockShift;
+			partial = true;
+		}
+
+		status_t status;
+		if (logStream) {
+			status = transaction->WriteBlocks(fVolume->ToBlock(run),
+						buffer + bytesWritten,run.length);
+		} else {
+			status = fVolume->WriteBlocks(fVolume->ToBlock(run),
+						buffer + bytesWritten,run.length);
+		}
+		if (status != B_OK) {
+			*_length = bytesWritten;
+			RETURN_ERROR(B_BAD_VALUE);
+		}
+
+		int32 bytes = run.length << blockShift;
+		length -= bytes;
+		bytesWritten += bytes;
+		if (length == 0)
+			break;
+
+		pos += bytes;
+
+		if (partial) {
+			// if the last block was written only partially, point block_run
+			// to the remaining part
+			run.start += run.length;
+			run.length = 1;
+			offset = pos;
+		} else if (FindBlockRun(pos,run,offset) < B_OK) {
+			*_length = bytesWritten;
+			RETURN_ERROR(B_BAD_VALUE);
+		}
+	}
+
+	*_length = bytesWritten;
+
+	return B_NO_ERROR;
+}
+
+
+/**	Fills the gap between the old file size and the new file size
+ *	with zeros.
+ *	It's more or less a copy of Inode::WriteAt() but it can handle
+ *	length differences of more than just 4 GB, and it never uses
+ *	the log, even if the INODE_LOGGED flag is set.
+ *
+ *	NOTE(review): as it stands, this function does nothing - the
+ *	condition guarding the first "return B_OK" is commented out, so the
+ *	function always returns B_OK right away, and all of the code below
+ *	is unreachable. Presumably, it has been disabled on purpose; confirm
+ *	before re-enabling the check.
+ */
+
+status_t
+Inode::FillGapWithZeros(off_t pos,off_t newSize)
+{
+	// with the condition commented out, the return is unconditional
+	//if (pos >= newSize)
+		return B_OK;
+
+	// (unreachable from here on, see above)
+
+	block_run run;
+	off_t offset;
+	if (FindBlockRun(pos,run,offset) < B_OK)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	off_t length = newSize - pos;
+	uint32 bytesWritten = 0;
+	uint32 blockSize = fVolume->BlockSize();
+	uint32 blockShift = fVolume->BlockShift();
+	uint8 *block;
+
+	// the first block_run we write could not be aligned to the block_size boundary
+	// (write partial block at the beginning)
+
+	// pos % block_size == (pos - offset) % block_size, offset % block_size == 0
+	if (pos % blockSize != 0) {
+		run.start += (pos - offset) / blockSize;
+		run.length -= (pos - offset) / blockSize;
+
+		CachedBlock cached(fVolume,run);
+		if ((block = cached.Block()) == NULL)
+			RETURN_ERROR(B_BAD_VALUE);
+
+		bytesWritten = blockSize - (pos % blockSize);
+		if (length < bytesWritten)
+			bytesWritten = length;
+
+		// clear the rest of the block, and write it directly to disk
+		memset(block + (pos % blockSize),0,bytesWritten);
+		fVolume->WriteBlocks(cached.BlockNumber(),block,1);
+
+		pos += bytesWritten;
+		
+		length -= bytesWritten;
+		if (length == 0)
+			return B_OK;
+
+		if (FindBlockRun(pos,run,offset) < B_OK)
+			RETURN_ERROR(B_BAD_VALUE);
+	}
+
+	while (length > 0) {
+		// offset is the offset to the current pos in the block_run
+		run.start += (pos - offset) >> blockShift;
+		run.length -= (pos - offset) >> blockShift;
+
+		// write out every block of the run, one by one
+		// NOTE(review): the "true" passed to SetTo() presumably asks for
+		// an emptied block - confirm against CachedBlock
+		CachedBlock cached(fVolume);
+		off_t blockNumber = fVolume->ToBlock(run);
+		for (int32 i = 0;i < run.length;i++) {
+			if ((block = cached.SetTo(blockNumber + i,true)) == NULL)
+				RETURN_ERROR(B_IO_ERROR);
+
+			if (fVolume->WriteBlocks(cached.BlockNumber(),block,1) < B_OK)
+				RETURN_ERROR(B_IO_ERROR);
+		}
+
+		int32 bytes = run.length << blockShift;
+		length -= bytes;
+		bytesWritten += bytes;
+		
+		// since we don't respect a last partial block, length can be lower
+		if (length <= 0)
+			break;
+
+		pos += bytes;
+
+		if (FindBlockRun(pos,run,offset) < B_OK)
+			RETURN_ERROR(B_BAD_VALUE);
+	}
+	return B_OK;
+}
+
+
+/**	Grows the data stream of the inode to "size" bytes.
+ *	If the ranges already allocated cover the new size, only the size
+ *	field is updated. Otherwise, blocks are allocated (possibly in more
+ *	than one block_run), and inserted into the direct range or, if that
+ *	is not possible, into the indirect range of the stream.
+ *	Returns B_DEVICE_FULL if the volume doesn't have enough free blocks,
+ *	and EFBIG if the stream would have to grow into the double indirect
+ *	range, which is not implemented.
+ *	The inode is only changed in memory here - this function itself does
+ *	not write it back.
+ */
+
+status_t 
+Inode::GrowStream(Transaction *transaction, off_t size)
+{
+	data_stream *data = &Node()->data;
+
+	// is the data stream already large enough to hold the new size?
+	// (can be the case with preallocated blocks)
+	if (size < data->max_direct_range
+		|| size < data->max_indirect_range
+		|| size < data->max_double_indirect_range) {
+		data->size = size;
+		return B_OK;
+	}
+
+	// how many bytes are still needed? (unused ranges are always zero)
+	off_t bytes;		
+	if (data->size < data->max_double_indirect_range)
+		bytes = size - data->max_double_indirect_range;
+	else if (data->size < data->max_indirect_range)
+		bytes = size - data->max_indirect_range;
+	else if (data->size < data->max_direct_range)
+		bytes = size - data->max_direct_range;
+	else
+		bytes = size - data->size;
+
+	// do we have enough free blocks on the disk?
+	off_t blocks = (bytes + fVolume->BlockSize() - 1) / fVolume->BlockSize();
+	if (blocks > fVolume->FreeBlocks())
+		return B_DEVICE_FULL;
+
+	// should we preallocate some blocks (currently, always 64k)?
+	// ("blocksNeeded" keeps what is really required, "blocks" is what
+	// we ask the allocator for)
+	off_t blocksNeeded = blocks;
+	if (blocks < 65536 / fVolume->BlockSize() && fVolume->FreeBlocks() > 128)
+		blocks = 65536 / fVolume->BlockSize();
+
+	while (blocksNeeded > 0) {
+		// the requested blocks do not need to be returned with a
+		// single allocation, so we need to iterate until we have
+		// enough blocks allocated
+		block_run run;
+		status_t status = fVolume->Allocate(transaction,this,blocks,run);
+		if (status < B_OK)
+			return status;
+
+		// okay, we have the needed blocks, so just distribute them to the
+		// different ranges of the stream (direct, indirect & double indirect)
+		
+		blocksNeeded -= run.length;
+		// don't preallocate if the first allocation was already too small
+		blocks = blocksNeeded;
+		
+		if (data->size <= data->max_direct_range) {
+			// let's try to put them into the direct block range
+			// (look for the first unused slot of the direct array)
+			int32 free = 0;
+			for (;free < NUM_DIRECT_BLOCKS;free++)
+				if (data->direct[free].IsZero())
+					break;
+
+			if (free < NUM_DIRECT_BLOCKS) {
+				// can we merge the last allocated run with the new one?
+				int32 last = free - 1;
+				if (free > 0
+					&& data->direct[last].allocation_group == run.allocation_group
+					&& data->direct[last].start + data->direct[last].length == run.start) {
+					data->direct[last].length += run.length;
+				} else {
+					data->direct[free] = run;
+				}
+				data->max_direct_range += run.length * fVolume->BlockSize();
+				// as long as blocks are missing, the size follows the range
+				data->size = blocksNeeded > 0 ? data->max_direct_range : size;
+				continue;
+			}
+		}
+
+		if (data->size <= data->max_indirect_range || !data->max_indirect_range) {
+			CachedBlock cached(fVolume);
+			block_run *runs = NULL;
+			int32 free = 0;
+			off_t block;
+
+			// if there is no indirect block yet, create one
+			if (data->indirect.IsZero()) {
+				status = fVolume->Allocate(transaction,this,4,data->indirect,4);
+				if (status < B_OK)
+					return status;
+
+				// make sure those blocks are empty
+				// (the first block is taken care of below)
+				block = fVolume->ToBlock(data->indirect);					
+				for (int32 i = 1;i < data->indirect.length;i++) {
+					// note: this "runs" shadows the outer variable
+					block_run *runs = (block_run *)cached.SetTo(block + i,true);
+					if (runs == NULL)
+						return B_IO_ERROR;
+
+					cached.WriteBack(transaction);
+				}
+				data->max_indirect_range = data->max_direct_range;
+				// insert the block_run in the first block
+				runs = (block_run *)cached.SetTo(block,true);
+			} else {
+				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
+				block = fVolume->ToBlock(data->indirect);
+
+				// search first empty entry
+				int32 i = 0;
+				for (;i < data->indirect.length;i++) {
+					if ((runs = (block_run *)cached.SetTo(block + i)) == NULL)
+						return B_IO_ERROR;
+
+					for (free = 0;free < numberOfRuns;free++)
+						if (runs[free].IsZero())
+							break;
+
+					if (free < numberOfRuns)
+						break;
+				}
+				// all blocks of the indirect run are completely in use
+				if (i == data->indirect.length)
+					runs = NULL;
+			}
+
+			if (runs != NULL) {
+				// try to insert the run to the last one - note that this doesn't
+				// take block borders into account, so it could be further optimized
+				int32 last = free - 1;
+				if (free > 0
+					&& runs[last].allocation_group == run.allocation_group
+					&& runs[last].start + runs[last].length == run.start) {
+					runs[last].length += run.length;
+				} else {
+					runs[free] = run;
+				}
+				data->max_indirect_range += run.length * fVolume->BlockSize();
+				data->size = blocksNeeded > 0 ? data->max_indirect_range : size;
+
+				cached.WriteBack(transaction);
+				continue;
+			}
+		}
+
+		// when we are here, we need to grow into the double indirect
+		// range - but that's not yet implemented, so bail out!
+		// NOTE(review): "run" has been allocated at this point, but is not
+		// part of the stream - presumably, this is left to be sorted out
+		// by the transaction; confirm
+
+		if (data->size <= data->max_double_indirect_range || !data->max_double_indirect_range) {
+			FATAL(("growing in the double indirect range is not yet implemented!\n"));
+			// ToDo: implement growing into the double indirect range, please!
+		}
+
+		RETURN_ERROR(EFBIG);
+	}
+	// update the size of the data stream
+	data->size = size;
+
+	return B_OK;
+}
+
+
+/**	Frees the block_runs of the double indirect range that lie completely
+ *	behind "size".
+ *	"level" 0 walks the array stored in "run", and calls this function
+ *	with "level" 1 for each of its entries; "level" 1 frees the entries
+ *	of the array that start at or behind "size", and sets "max" to the
+ *	end of every entry it keeps. Any other "level" is rejected with
+ *	B_BAD_VALUE.
+ *	"offset" is the file offset at which the array in "run" starts.
+ *	Entries starting at or behind "size" are cleared in the array, and
+ *	the array blocks are written back through the transaction.
+ */
+
+status_t
+Inode::FreeStaticStreamArray(Transaction *transaction,int32 level,block_run run,off_t size,off_t offset,off_t &max)
+{
+	// the number of bytes a single entry of the array covers
+	int32 indirectSize;
+	if (level == 0)
+		indirectSize = (16 << fVolume->BlockShift()) * (fVolume->BlockSize() / sizeof(block_run));
+	else if (level == 1)
+		indirectSize = 4 << fVolume->BlockShift();
+	else {
+		// there are only two levels - don't go on with an
+		// uninitialized "indirectSize"
+		RETURN_ERROR(B_BAD_VALUE);
+	}
+
+	off_t start;
+	if (size > offset)
+		start = size - offset;
+	else
+		start = 0;
+
+	// the first entry of the array that could be affected
+	int32 index = start / indirectSize;
+	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
+
+	CachedBlock cached(fVolume);
+	off_t blockNumber = fVolume->ToBlock(run);
+
+	// set the file offset to the current block run
+	offset += (off_t)index * indirectSize;
+
+	for (int32 i = index / runsPerBlock;i < run.length;i++) {
+		block_run *array = (block_run *)cached.SetTo(blockNumber + i);
+		if (array == NULL)
+			RETURN_ERROR(B_ERROR);
+
+		for (index = index % runsPerBlock;index < runsPerBlock;index++) {
+			if (array[index].IsZero()) {
+				// we also want to break out of the outer loop
+				i = run.length;
+				break;
+			}
+
+			status_t status = B_OK;
+			if (level == 0)
+				status = FreeStaticStreamArray(transaction,1,array[index],size,offset,max);
+			else if (offset >= size)
+				status = fVolume->Free(transaction,array[index]);
+			else
+				max = offset + indirectSize;
+
+			if (status < B_OK)
+				RETURN_ERROR(status);
+
+			// entries completely behind the new size are not needed anymore
+			if (offset >= size)
+				array[index].SetTo(0,0,0);
+
+			offset += indirectSize;
+		}
+		// all following blocks are processed from their first entry on
+		index = 0;
+
+		cached.WriteBack(transaction);
+	}
+	return B_OK;
+}
+
+
+/** Frees all block_runs in the array which come after the specified size.
+ *	It also trims the last block_run that contains the size, so that it
+ *	only keeps the blocks needed to hold "size" bytes.
+ *	"offset" and "max" are maintained until the last block_run that doesn't
+ *	have to be freed - after this, the values won't be correct anymore, but
+ *	will still assure correct function for all subsequent calls.
+ *	Returns B_IO_ERROR if a block_run could not be freed.
+ */
+
+status_t
+Inode::FreeStreamArray(Transaction *transaction,block_run *array,uint32 arrayLength,off_t size,off_t &offset,off_t &max)
+{
+	// "offset" is the file offset of the current run, "newOffset" the
+	// one of the run following it
+	off_t newOffset = offset;
+	uint32 i = 0;
+	for (;i < arrayLength;i++,offset = newOffset) {
+		if (array[i].IsZero())
+			break;
+
+		newOffset += (off_t)array[i].length << fVolume->BlockShift();
+		if (newOffset <= size)
+			continue;
+
+		block_run run = array[i];
+
+		// determine the block_run to be freed
+		if (newOffset > size && offset < size) {
+			// free partial block_run (and update the original block_run)
+
+			// The number of blocks to keep has to be rounded up to the
+			// next block; always adding one to the rounded down value
+			// would keep a block too many whenever the part of "size"
+			// within this run is a multiple of the block size.
+			run.start = array[i].start + ((size - offset + fVolume->BlockSize() - 1)
+				>> fVolume->BlockShift());
+			array[i].length = run.start - array[i].start;
+			run.length -= array[i].length;
+
+			// all blocks of the run are still needed
+			if (run.length == 0)
+				continue;
+
+			// update maximum range
+			max = offset + ((off_t)array[i].length << fVolume->BlockShift());
+		} else {
+			// free the whole block_run
+			array[i].SetTo(0,0,0);
+
+			if (max > offset)
+				max = offset;
+		}
+
+		if (fVolume->Free(transaction,run) < B_OK)
+			return B_IO_ERROR;
+	}
+	return B_OK;
+}
+
+
+/**	Shrinks the data stream of this inode to "size" bytes by walking the
+ *	double indirect, indirect, and direct ranges (in that order) and
+ *	releasing what lies beyond the new size.
+ *	The status codes of the helpers called here are not propagated; the
+ *	method always stores the new size and returns B_OK.
+ */
+
+status_t 
+Inode::ShrinkStream(Transaction *transaction, off_t size)
+{
+	data_stream *data = &Node()->data;
+
+	if (data->max_double_indirect_range > size) {
+		// the double indirect range starts where the indirect range ends
+		FreeStaticStreamArray(transaction,0,data->double_indirect,size,data->max_indirect_range,data->max_double_indirect_range);
+		
+		// if the new size fits into the indirect range, the double
+		// indirect array itself is no longer needed
+		if (size <= data->max_indirect_range) {
+			fVolume->Free(transaction,data->double_indirect);
+			data->double_indirect.SetTo(0,0,0);
+			data->max_double_indirect_range = 0;
+		}
+	}
+	if (data->max_indirect_range > size) {
+		CachedBlock cached(fVolume);
+		off_t block = fVolume->ToBlock(data->indirect);
+		// the indirect range starts where the direct range ends
+		off_t offset = data->max_direct_range;
+
+		for (int32 i = 0;i < data->indirect.length;i++) {
+			block_run *array = (block_run *)cached.SetTo(block + i);
+			if (array == NULL)
+				break;
+
+			// the array block is only written back if freeing succeeded
+			if (FreeStreamArray(transaction,array,fVolume->BlockSize() / sizeof(block_run),size,offset,data->max_indirect_range) == B_OK)
+				cached.WriteBack(transaction);
+		}
+		// nothing is left in the indirect range, so free the array as well
+		if (data->max_direct_range == data->max_indirect_range) {
+			fVolume->Free(transaction,data->indirect);
+			data->indirect.SetTo(0,0,0);
+			data->max_indirect_range = 0;
+		}
+	}
+	if (data->max_direct_range > size) {
+		off_t offset = 0;
+		FreeStreamArray(transaction,data->direct,NUM_DIRECT_BLOCKS,size,offset,data->max_direct_range);
+	}
+
+	// NOTE(review): the size is updated even if one of the steps above
+	// failed - confirm that this best-effort behaviour is intended
+	data->size = size;
+	return B_OK;
+}
+
+
+/**	Changes the size of the data stream to "size" bytes, growing or
+ *	shrinking it as needed, and writes the inode back on success.
+ *	Returns B_BAD_VALUE for negative sizes, B_OK if the size is unchanged,
+ *	or the status of the grow/shrink/write-back operation.
+ */
+
+status_t 
+Inode::SetFileSize(Transaction *transaction, off_t size)
+{
+	if (size < 0)
+		return B_BAD_VALUE;
+
+	off_t currentSize = Node()->data.size;
+	if (currentSize == size)
+		return B_OK;
+
+	if (size < currentSize) {
+		// the stream gets smaller
+		status_t result = ShrinkStream(transaction,size);
+		if (result < B_OK)
+			return result;
+
+		return WriteBack(transaction);
+	}
+
+	// the stream gets larger
+	status_t result = GrowStream(transaction,size);
+	if (result < B_OK) {
+		// growing failed, so the whole operation fails; bring the
+		// stream back to its former size before reporting the error
+		ShrinkStream(transaction,currentSize);
+		return result;
+	}
+
+	return WriteBack(transaction);
+}
+
+
+/**	Enlarges (or, for negative values, reduces) the data stream by "bytes"
+ *	relative to its current size; see SetFileSize() for the return value.
+ */
+
+status_t 
+Inode::Append(Transaction *transaction,off_t bytes)
+{
+	off_t newSize = Size() + bytes;
+	return SetFileSize(transaction,newSize);
+}
+
+
+/**	Runs ShrinkStream() with the current size of the stream, which releases
+ *	everything the stream holds beyond that size.
+ */
+
+status_t 
+Inode::Trim(Transaction *transaction)
+{
+	off_t currentSize = Size();
+	return ShrinkStream(transaction,currentSize);
+}
+
+
+/**	Calls flush_blocks() for every block_run of the data stream: first the
+ *	direct runs, then the runs of the indirect range, and finally those of
+ *	the double indirect range. The first zeroed block_run ends the walk.
+ *	Returns B_OK, the error returned by flush_blocks(), or B_FILE_ERROR if
+ *	an array block of the double indirect range could not be read.
+ */
+
+status_t 
+Inode::Sync()
+{
+	// We may also want to flush the attribute's data stream to
+	// disk here... (do we?)
+
+	data_stream *stream = &Node()->data;
+
+	// first: the direct range
+
+	for (int32 directIndex = 0;directIndex < NUM_DIRECT_BLOCKS;directIndex++) {
+		if (stream->direct[directIndex].IsZero())
+			return B_OK;
+
+		status_t status = flush_blocks(fVolume->Device(),fVolume->ToBlock(stream->direct[directIndex]),stream->direct[directIndex].length);
+		if (status != B_OK)
+			return status;
+	}
+
+	// second: the indirect range
+
+	if (stream->max_indirect_range == 0)
+		return B_OK;
+
+	CachedBlock arrayCache(fVolume);
+	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
+	off_t indirectStart = fVolume->ToBlock(stream->indirect);
+
+	for (int32 indirectIndex = 0;indirectIndex < stream->indirect.length;indirectIndex++) {
+		block_run *runs = (block_run *)arrayCache.SetTo(indirectStart + indirectIndex);
+		if (runs == NULL) {
+			// an unreadable array block only ends the indirect walk
+			break;
+		}
+
+		for (int32 slot = 0;slot < runsPerBlock;slot++) {
+			if (runs[slot].IsZero())
+				return B_OK;
+
+			status_t status = flush_blocks(fVolume->Device(),fVolume->ToBlock(runs[slot]),runs[slot].length);
+			if (status != B_OK)
+				return status;
+		}
+	}
+
+	// third: the double indirect range
+
+	if (stream->max_double_indirect_range == 0)
+		return B_OK;
+
+	off_t doubleStart = fVolume->ToBlock(stream->double_indirect);
+
+	for (int32 doubleIndex = 0;doubleIndex < stream->double_indirect.length;doubleIndex++) {
+		block_run *indirectRuns = (block_run *)arrayCache.SetTo(doubleStart + doubleIndex);
+		if (indirectRuns == NULL)
+			return B_FILE_ERROR;
+
+		CachedBlock runCache(fVolume);
+
+		for (int32 entry = 0;entry < runsPerBlock;entry++) {
+			if (indirectRuns[entry].IsZero())
+				return B_OK;
+
+			off_t arrayStart = fVolume->ToBlock(indirectRuns[entry]);
+			for (int32 arrayIndex = 0;arrayIndex < indirectRuns[entry].length;arrayIndex++) {
+				block_run *runs = (block_run *)runCache.SetTo(arrayStart + arrayIndex);
+				if (runs == NULL)
+					return B_FILE_ERROR;
+
+				for (int32 slot = 0;slot < runsPerBlock;slot++) {
+					if (runs[slot].IsZero())
+						return B_OK;
+
+					// ToDo: combine single block_runs to bigger ones when
+					// they are adjacent
+					status_t status = flush_blocks(fVolume->Device(),fVolume->ToBlock(runs[slot]),runs[slot].length);
+					if (status != B_OK)
+						return status;
+				}
+			}
+		}
+	}
+
+	return B_OK;
+}
+
+
+/**	Removes the entry "name" from this directory: the entry is looked up
+ *	in the directory's B+tree, its vnode is marked for removal, the tree
+ *	entry is deleted, and the inode is flagged INODE_DELETED and written
+ *	back. The main indices are updated as well.
+ *	If "_id" is given, the ID of the found inode is stored there (even if
+ *	a later step fails). "isDirectory" states which kind of entry the
+ *	caller expects; indices are exempt from that check.
+ */
+
+status_t
+Inode::Remove(Transaction *transaction,const char *name,off_t *_id,bool isDirectory)
+{
+	BPlusTree *tree;
+	if (GetTree(&tree) != B_OK)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	// does the file even exist?
+	off_t id;
+	if (tree->Find((uint8 *)name,(uint16)strlen(name),&id) < B_OK)
+		return B_ENTRY_NOT_FOUND;
+
+	if (_id)
+		*_id = id;
+
+	// the Vnode object puts the vnode again when it goes out of scope
+	Vnode vnode(fVolume,id);
+	Inode *inode;
+	status_t status = vnode.Get(&inode);
+	if (status < B_OK) {
+		REPORT_ERROR(status);
+		return B_ENTRY_NOT_FOUND;
+	}
+
+	// It's a bit stupid, but indices are regarded as directories
+	// in BFS - so a test for a directory always succeeds, but you
+	// should really be able to do whatever you want with your indices
+	// without having to remove all files first :)
+	if (!inode->IsIndex()) {
+		// if it's not of the correct type, don't delete it!
+		if (inode->IsDirectory() != isDirectory)
+			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
+
+		// only delete empty directories
+		if (isDirectory && !inode->IsEmpty())
+			return B_DIRECTORY_NOT_EMPTY;
+	}
+
+	// remove_vnode() allows the inode to be accessed until the last put_vnode()
+	if (remove_vnode(fVolume->ID(),id) != B_OK)
+		return B_ERROR;
+
+	// if the tree entry can't be removed, the vnode removal is undone
+	if (tree->Remove(transaction,(uint8 *)name,(uint16)strlen(name),id) < B_OK) {
+		unremove_vnode(fVolume->ID(),id);
+		RETURN_ERROR(B_ERROR);
+	}
+
+	// update the inode, so that no one will ever doubt it's deleted :-)
+	inode->Node()->flags |= INODE_DELETED;
+
+	// In balance to the Inode::Create() method, the main indices
+	// are updated here (name, size, & last_modified)
+
+	Index index(fVolume);
+	if ((inode->Mode() & (S_ATTR_DIR | S_ATTR | S_INDEX_DIR)) == 0) {
+		index.RemoveName(transaction,name,inode);
+			// If removing from the index fails, it is not regarded as a
+			// fatal error and will not be reported back!
+			// Deleted inodes won't be visible in queries anyway.
+	}
+	
+	// only files and symlinks are removed from the "size" and
+	// "last_modified" indices; failures are ignored here, too
+	if ((inode->Mode() & (S_FILE | S_SYMLINK)) != 0) {
+		index.RemoveSize(transaction,inode);
+		index.RemoveLastModified(transaction,inode);
+	}
+
+	if (inode->WriteBack(transaction) < B_OK)
+		return B_ERROR;
+
+	return B_OK;
+}
+
+
+/**	Creates the inode with the specified parent directory, and automatically
+ *	adds the created inode to that parent directory. If an attribute directory
+ *	is created, it will also be added automatically to the parent inode as
+ *	such. However, the indices root node, and the regular root node won't be
+ *	added to the super block.
+ *	It will also create the initial B+tree for the inode if it's a directory
+ *	of any kind.
+ *	If the entry already exists in the parent directory, it is reused (and
+ *	truncated if O_TRUNC is set) unless a directory was requested or O_EXCL
+ *	is set, in which case B_FILE_EXISTS is returned.
+ *	If the "id" variable is given to store the inode's ID, the inode stays
+ *	locked - you have to call put_vnode() if you don't use it anymore.
+ */
+
+status_t 
+Inode::Create(Transaction *transaction,Inode *parent, const char *name, int32 mode, int omode, uint32 type, off_t *_id, Inode **_inode)
+{
+	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0,0,0);
+	Volume *volume = transaction->GetVolume();
+	BPlusTree *tree = NULL;
+		// stays NULL if the new inode doesn't get an entry in a parent tree
+
+	if (parent && (mode & S_ATTR_DIR) == 0 && parent->IsDirectory()) {
+		// check if the file already exists in the directory
+		if (parent->GetTree(&tree) != B_OK)
+			RETURN_ERROR(B_BAD_VALUE);
+
+		// does the file already exist?
+		off_t offset;
+		if (tree->Find((uint8 *)name,(uint16)strlen(name),&offset) == B_OK) {
+			// return if the file should be a directory or opened in exclusive mode
+			if (mode & S_DIRECTORY || omode & O_EXCL)
+				return B_FILE_EXISTS;
+
+			Vnode vnode(volume,offset);
+			Inode *inode;
+			status_t status = vnode.Get(&inode);
+			if (status < B_OK) {
+				REPORT_ERROR(status);
+				return B_ENTRY_NOT_FOUND;
+			}
+
+			// if it's a directory, bail out!
+			if (inode->IsDirectory())
+				return B_IS_A_DIRECTORY;
+
+			// if omode & O_TRUNC, truncate the existing file
+			if (omode & O_TRUNC) {
+				WriteLocked locked(inode->Lock());
+
+				status_t status = inode->SetFileSize(transaction,0);
+				if (status < B_OK)
+					return status;
+			}
+
+			// only keep the vnode in memory if the vnode_id pointer is provided
+			if (_id) {
+				*_id = offset;
+				vnode.Keep();
+			}
+			// NOTE(review): if only "_inode" is given, the vnode is put again
+			// when "vnode" goes out of scope - confirm callers always pass "_id"
+			if (_inode)
+				*_inode = inode;
+
+			return B_OK;
+		}
+	} else if (parent && (mode & S_ATTR_DIR) == 0)
+		return B_BAD_VALUE;
+
+	// allocate space for the new inode
+	InodeAllocator allocator(transaction);
+	block_run run;
+	Inode *inode;
+	status_t status = allocator.New(&parentRun,mode,run,&inode);
+	if (status < B_OK)
+		return status;
+
+	// initialize the on-disk bfs_inode structure 
+
+	bfs_inode *node = inode->Node();
+
+	node->magic1 = INODE_MAGIC1;
+	node->inode_num = run;
+	node->parent = parentRun;
+
+	node->uid = geteuid();
+	node->gid = parent ? parent->Node()->gid : getegid();
+		// the group ID is inherited from the parent, if available
+	node->mode = mode;
+	node->flags = INODE_IN_USE;
+	node->type = type;
+
+	node->create_time = (bigtime_t)time(NULL) << INODE_TIME_SHIFT;
+	node->last_modified_time = node->create_time | (volume->GetUniqueID() & INODE_TIME_MASK);
+		// we use Volume::GetUniqueID() to avoid having too many duplicates in the
+		// last_modified index
+
+	node->inode_size = volume->InodeSize();
+
+	// only add the name to regular files, directories, or symlinks
+	// don't add it to attributes, or indices
+	if (tree && (mode & (S_INDEX_DIR | S_ATTR_DIR | S_ATTR)) == 0
+		&& inode->SetName(transaction,name) < B_OK)
+		return B_ERROR;
+
+	// initialize b+tree if it's a directory (and add "." & ".." if it's
+	// a standard directory for files - not for attributes or indices)
+	if (mode & (S_DIRECTORY | S_ATTR_DIR | S_INDEX_DIR)) {
+		// note: this "tree" is the new inode's own tree and shadows the
+		// parent's tree declared at the top of the method
+		BPlusTree *tree = inode->fTree = new BPlusTree(transaction,inode);
+		if (tree == NULL || tree->InitCheck() < B_OK)
+			return B_ERROR;
+
+		if ((mode & (S_INDEX_DIR | S_ATTR_DIR)) == 0) {
+			if (tree->Insert(transaction,".",inode->BlockNumber()) < B_OK
+				|| tree->Insert(transaction,"..",volume->ToBlock(inode->Parent())) < B_OK)
+				return B_ERROR;
+		}
+	}
+
+	// update the main indices (name, size & last_modified)
+	Index index(volume);
+	if ((mode & (S_ATTR_DIR | S_ATTR | S_INDEX_DIR)) == 0) {
+		// B_BAD_INDEX is tolerated here, any other error aborts the creation
+		status = index.InsertName(transaction,name,inode);
+		if (status < B_OK && status != B_BAD_INDEX)
+			return status;
+	}
+
+	inode->UpdateOldLastModified();
+
+	// The "size" & "last_modified" indices don't contain directories
+	if ((mode & (S_FILE | S_SYMLINK)) != 0) {
+		// if adding to these indices fails, the inode creation will not be harmed
+		index.InsertSize(transaction,inode);
+		index.InsertLastModified(transaction,inode);
+	}
+
+	if ((status = inode->WriteBack(transaction)) < B_OK)
+		return status;
+
+	if (new_vnode(volume->ID(),inode->ID(),inode) != B_OK)
+		return B_ERROR;
+
+	// add a link to the inode from the parent, depending on its type
+	if (tree && tree->Insert(transaction,name,volume->ToBlock(run)) < B_OK) {
+		put_vnode(volume->ID(),inode->ID());
+		RETURN_ERROR(B_ERROR);
+	} else if (parent && mode & S_ATTR_DIR) {
+		parent->Attributes() = run;
+		parent->WriteBack(transaction);
+	}
+
+	// everything went fine; tell the allocator about it
+	allocator.Keep();
+
+	// without an "_id" pointer, the vnode published above is put again
+	if (_id != NULL)
+		*_id = inode->ID();
+	else
+		put_vnode(volume->ID(),inode->ID());
+
+	if (_inode != NULL)
+		*_inode = inode;
+
+	return B_OK;
+}
+
+
+//	#pragma mark -
+
+
+/**	Creates an iterator over the attributes of "inode", starting with the
+ *	first small_data item, and registers itself with that inode.
+ */
+
+AttributeIterator::AttributeIterator(Inode *inode)
+	:
+	fCurrentSmallData(0),
+	fInode(inode),
+	fAttributes(NULL),
+	fIterator(NULL),
+	fBuffer(NULL)
+{
+	// the inode keeps track of all iterators working on it
+	fInode->AddIterator(this);
+}
+
+
+/**	Puts the attribute directory vnode (if it was acquired), deletes the
+ *	tree iterator, and unregisters the iterator from its inode.
+ */
+
+AttributeIterator::~AttributeIterator()
+{
+	if (fAttributes != NULL) {
+		Volume *volume = fAttributes->GetVolume();
+		put_vnode(volume->ID(),fAttributes->ID());
+	}
+
+	delete fIterator;
+	fInode->RemoveIterator(this);
+}
+
+
+/**	Resets the iterator, so that the next GetNext() call starts again with
+ *	the first small_data item. Always returns B_OK.
+ */
+
+status_t 
+AttributeIterator::Rewind()
+{
+	// the tree iterator only exists once the attribute directory was reached
+	if (fIterator != NULL)
+		fIterator->Rewind();
+
+	fCurrentSmallData = 0;
+	return B_OK;
+}
+
+
+/**	Retrieves the next attribute of the inode. The attributes in the
+ *	small_data section are returned first (the file name item is skipped),
+ *	followed by the entries of the attribute directory, if there is one.
+ *	"name" receives the attribute name, "_length" the length of that name,
+ *	"_type" the attribute type, and "_id" either the index of the small_data
+ *	item or the vnode ID of the attribute inode.
+ *	Returns B_ENTRY_NOT_FOUND if there are no more attributes (or the
+ *	attribute directory could not be opened), or the error of the tree
+ *	iterator / get_vnode().
+ */
+
+status_t 
+AttributeIterator::GetNext(char *name, size_t *_length, uint32 *_type, vnode_id *_id)
+{
+	// read attributes out of the small data section
+
+	if (fCurrentSmallData >= 0) {
+		small_data *item = fInode->Node()->small_data_start;
+
+		fInode->SmallDataLock().Lock();
+
+		// "i" counts the items that are visible as attributes
+		int32 i = 0;
+		for (;;item = item->Next()) {
+			if (item->IsLast(fInode->Node()))
+				break;
+
+			// the file name is stored as small_data item, too, but it is
+			// not reported as attribute
+			if (item->name_size == FILE_NAME_NAME_LENGTH
+				&& *item->Name() == FILE_NAME_NAME)
+				continue;
+
+			if (i++ == fCurrentSmallData)
+				break;
+		}
+
+		if (!item->IsLast(fInode->Node())) {
+			strncpy(name,item->Name(),B_FILE_NAME_LENGTH);
+			*_type = item->type;
+			*_length = item->name_size;
+			*_id = (vnode_id)fCurrentSmallData;
+
+			fCurrentSmallData = i;
+		}
+		else {
+			// stop traversing the small_data section
+			fCurrentSmallData = -1;
+		}
+
+		fInode->SmallDataLock().Unlock();
+
+		if (fCurrentSmallData != -1)
+			return B_OK;
+	}
+
+	// read attributes out of the attribute directory
+
+	if (fInode->Attributes().IsZero())
+		return B_ENTRY_NOT_FOUND;
+
+	Volume *volume = fInode->GetVolume();
+
+	// if you haven't yet access to the attributes directory, get it
+	if (fAttributes == NULL) {
+		if (get_vnode(volume->ID(),volume->ToVnode(fInode->Attributes()),(void **)&fAttributes) != 0
+			|| fAttributes == NULL) {
+			FATAL(("get_vnode() failed in AttributeIterator::GetNext(vnode_id = %Ld,name = \"%s\")\n",fInode->ID(),name));
+			// we don't own a vnode reference, so make sure neither a later
+			// call nor the destructor will use the pointer
+			fAttributes = NULL;
+			return B_ENTRY_NOT_FOUND;
+		}
+
+		BPlusTree *tree;
+		if (fAttributes->GetTree(&tree) < B_OK
+			|| (fIterator = new TreeIterator(tree)) == NULL) {
+			FATAL(("could not get tree in AttributeIterator::GetNext(vnode_id = %Ld,name = \"%s\")\n",fInode->ID(),name));
+			// release the attribute directory again; otherwise the next
+			// call would skip this setup and dereference a NULL fIterator
+			put_vnode(fAttributes->GetVolume()->ID(),fAttributes->ID());
+			fAttributes = NULL;
+			return B_ENTRY_NOT_FOUND;
+		}
+	}
+
+	uint16 length;
+	vnode_id id;
+	status_t status = fIterator->GetNextEntry(name,&length,B_FILE_NAME_LENGTH,&id);
+	if (status < B_OK)
+		return status;
+
+	// the attribute's inode is only needed to retrieve its type
+	Vnode vnode(volume,id);
+	Inode *attribute;
+	if ((status = vnode.Get(&attribute)) == B_OK) {
+		*_type = attribute->Node()->type;
+		*_length = length;
+			// the length of the name, like for the small_data items above
+		*_id = id;
+	}
+
+	return status;
+}
+
+
+/**	Adjusts the small_data position of the iterator by "change" if the
+ *	item at "index" lies before the iterator's current position.
+ */
+
+void 
+AttributeIterator::Update(uint16 index, int8 change)
+{
+	// fCurrentSmallData points already to the next item
+	if (fCurrentSmallData > index)
+		fCurrentSmallData = fCurrentSmallData + change;
+}
+
diff --git a/src/add-ons/kernel/file_systems/bfs/Inode.h b/src/add-ons/kernel/file_systems/bfs/Inode.h
new file mode 100644
index 0000000000..5148270997
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Inode.h
@@ -0,0 +1,309 @@
+#ifndef INODE_H
+#define INODE_H
+/* Inode - inode access functions
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include <KernelExport.h>
+#ifdef USER
+#	include "myfs.h"
+#	include <stdio.h>
+#endif
+
+#ifndef _IMPEXP_KERNEL
+#	define _IMPEXP_KERNEL
+#endif
+
+extern "C" {
+	#include <lock.h>
+	#include <cache.h>
+}
+
+#include <string.h>
+
+#include "Volume.h"
+#include "Journal.h"
+#include "Lock.h"
+#include "Chain.h"
+#include "Debug.h"
+
+
+class BPlusTree;
+class TreeIterator;
+class AttributeIterator;
+
+
+// Short names for the POSIX file type bits, as used in the mode field
+// of an inode throughout the file system code.
+enum inode_type {
+	S_DIRECTORY	= S_IFDIR,	// directory
+	S_FILE		= S_IFREG,	// regular file
+	S_SYMLINK	= S_IFLNK	// symbolic link
+};
+
+
+// The CachedBlock class is completely implemented as inlines.
+// It should be used when caching single blocks to make sure they
+// will be properly released after use (and it's also very
+// convenient to use them).
+// An object holds at most one block at a time; setting it to another
+// block, calling Unset(), or destroying it releases the current one.
+
+class CachedBlock {
+	public:
+		// Creates an object that doesn't hold a block yet.
+		CachedBlock(Volume *volume)
+			:
+			fVolume(volume),
+			fBlockNumber(0),
+			fBlock(NULL)
+		{
+		}
+
+		// Creates an object and directly gets the specified block;
+		// see SetTo() for the meaning of "empty".
+		CachedBlock(Volume *volume,off_t block,bool empty = false)
+			:
+			fVolume(volume),
+			fBlockNumber(0),
+			fBlock(NULL)
+		{
+			SetTo(block,empty);
+		}
+
+		// Same as above, but for the first block of the given block_run.
+		CachedBlock(Volume *volume,block_run run,bool empty = false)
+			:
+			fVolume(volume),
+			fBlockNumber(0),
+			fBlock(NULL)
+		{
+			SetTo(volume->ToBlock(run),empty);
+		}
+
+		~CachedBlock()
+		{
+			Unset();
+		}
+
+		// Releases the block, if any. The pointer is reset, so that
+		// calling Unset() more than once (or having the destructor run
+		// afterwards) doesn't release the block a second time.
+		void Unset()
+		{
+			if (fBlock != NULL) {
+				release_block(fVolume->Device(),fBlockNumber);
+				fBlock = NULL;
+			}
+		}
+
+		// Releases the current block and gets the specified one, either
+		// via get_block(), or via get_empty_block() if "empty" is true.
+		// Returns the block data, or NULL if the block could not be got.
+		uint8 *SetTo(off_t block,bool empty = false)
+		{
+			Unset();
+			fBlockNumber = block;
+			return fBlock = empty ? (uint8 *)get_empty_block(fVolume->Device(),block,fVolume->BlockSize())
+								  : (uint8 *)get_block(fVolume->Device(),block,fVolume->BlockSize());
+		}
+
+		uint8 *SetTo(block_run run,bool empty = false)
+		{
+			return SetTo(fVolume->ToBlock(run),empty);
+		}
+
+		// Hands the block over to the transaction to have it written;
+		// fails with B_BAD_VALUE if there is no transaction or no block.
+		status_t WriteBack(Transaction *transaction)
+		{
+			if (transaction == NULL || fBlock == NULL)
+				RETURN_ERROR(B_BAD_VALUE);
+
+			return transaction->WriteBlocks(fBlockNumber,fBlock);
+		}
+
+		uint8 *Block() const { return fBlock; }
+		off_t BlockNumber() const { return fBlockNumber; }
+
+	protected:
+		Volume	*fVolume;
+		off_t	fBlockNumber;	// number of the block last passed to SetTo()
+		uint8	*fBlock;		// the block data, NULL if no block is held
+};
+
+
+// The Inode class wraps the cached block that contains the on-disk
+// bfs_inode structure (it is a CachedBlock itself), and provides the
+// methods to work with the inode's small_data section, attributes,
+// directory tree, and data stream.
+
+class Inode : public CachedBlock {
+	public:
+		Inode(Volume *volume,vnode_id id,bool empty = false,uint8 reenter = 0);
+		~Inode();
+
+		// the on-disk structure lives directly in the cached block
+		bfs_inode *Node() const { return (bfs_inode *)fBlock; }
+		vnode_id ID() const { return fVolume->ToVnode(fBlockNumber); }
+
+		ReadWriteLock &Lock() { return fLock; }
+		SimpleLock &SmallDataLock() { return fSmallDataLock; }
+
+		mode_t Mode() const { return Node()->mode; }
+		int32 Flags() const { return Node()->flags; }
+		bool IsDirectory() const { return Mode() & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR); }
+			// note, that this test will also be true for S_IFBLK (not that it's used in the fs :)
+		bool IsIndex() const { return (Mode() & (S_INDEX_DIR | 0777)) == S_INDEX_DIR; }
+			// that's a stupid check, but AFAIK the only possible method...
+
+		bool IsSymLink() const { return S_ISLNK(Mode()); }
+		bool HasUserAccessableStream() const { return S_ISREG(Mode()); }
+			// currently only files can be accessed with bfs_read()/bfs_write()
+
+		off_t Size() const { return Node()->data.size; }
+
+		// these return references into the on-disk structure, so the
+		// values can be changed through them
+		block_run &BlockRun() const { return Node()->inode_num; }
+		block_run &Parent() const { return Node()->parent; }
+		block_run &Attributes() const { return Node()->attributes; }
+		Volume *GetVolume() const { return fVolume; }
+
+		status_t InitCheck();
+
+		status_t CheckPermissions(int accessMode) const;
+
+		// small_data access methods
+		status_t MakeSpaceForSmallData(Transaction *transaction,const char *name, int32 length);
+		status_t RemoveSmallData(Transaction *transaction,const char *name);
+		status_t AddSmallData(Transaction *transaction,const char *name,uint32 type,const uint8 *data,size_t length,bool force = false);
+		status_t GetNextSmallData(small_data **smallData) const;
+		small_data *FindSmallData(const char *name) const;
+		const char *Name() const;
+		status_t SetName(Transaction *transaction,const char *name);
+
+		// high-level attribute methods
+		status_t ReadAttribute(const char *name, int32 type, off_t pos, uint8 *buffer, size_t *_length);
+		status_t WriteAttribute(Transaction *transaction, const char *name, int32 type, off_t pos, const uint8 *buffer, size_t *_length);
+		status_t RemoveAttribute(Transaction *transaction, const char *name);
+
+		// attribute methods
+		status_t GetAttribute(const char *name,Inode **attribute);
+		void ReleaseAttribute(Inode *attribute);
+		status_t CreateAttribute(Transaction *transaction,const char *name,uint32 type,Inode **attribute);
+
+		// for directories only:
+		status_t GetTree(BPlusTree **);
+		bool IsEmpty();
+
+		// manipulating the data stream
+		status_t FindBlockRun(off_t pos,block_run &run,off_t &offset);
+
+		status_t ReadAt(off_t pos,uint8 *buffer,size_t *length);
+		status_t WriteAt(Transaction *transaction,off_t pos,const uint8 *buffer,size_t *length);
+		status_t FillGapWithZeros(off_t oldSize,off_t newSize);
+
+		status_t SetFileSize(Transaction *transaction,off_t size);
+		status_t Append(Transaction *transaction,off_t bytes);
+		status_t Trim(Transaction *transaction);
+
+		status_t Sync();
+
+		// create/remove inodes
+		status_t Remove(Transaction *transaction,const char *name,off_t *_id = NULL,bool isDirectory = false);
+		static status_t Create(Transaction *transaction,Inode *parent,const char *name,int32 mode,int omode,uint32 type,off_t *_id = NULL,Inode **_inode = NULL);
+
+		// index maintaining helper
+		void UpdateOldSize() { fOldSize = Size(); }
+		void UpdateOldLastModified() { fOldLastModified = Node()->last_modified_time; }
+		off_t OldSize() { return fOldSize; }
+		off_t OldLastModified() { return fOldLastModified; }
+
+	private:
+		// the elaborated form ("friend class") is valid in every C++
+		// dialect; the plain "friend X;" form is only allowed since C++11
+		friend class AttributeIterator;
+
+		status_t RemoveSmallData(small_data *item,int32 index);
+
+		// the iterators register themselves to get notified via
+		// AttributeIterator::Update()
+		void AddIterator(AttributeIterator *iterator);
+		void RemoveIterator(AttributeIterator *iterator);
+
+		status_t FreeStaticStreamArray(Transaction *transaction,int32 level,block_run run,off_t size,off_t offset,off_t &max);
+		status_t FreeStreamArray(Transaction *transaction, block_run *array, uint32 arrayLength, off_t size, off_t &offset, off_t &max);
+		status_t GrowStream(Transaction *transaction,off_t size);
+		status_t ShrinkStream(Transaction *transaction,off_t size);
+
+		BPlusTree		*fTree;
+		Inode			*fAttributes;
+		ReadWriteLock	fLock;
+		off_t			fOldSize;			// we need those values to ensure we will remove
+		off_t			fOldLastModified;	// the correct keys from the indices
+
+		mutable SimpleLock	fSmallDataLock;
+		Chain<AttributeIterator> fIterators;
+};
+
+
+// The Vnode class is a small convenience wrapper around get_vnode():
+// the destructor calls put_vnode() for you, unless Keep() was called
+// before, which may make code more readable in some cases.
+// NOTE(review): Put() (and therefore the destructor) calls put_vnode()
+// whether or not Get() was called or succeeded - confirm that all users
+// are fine with that.
+
+class Vnode {
+	public:
+		Vnode(Volume *volume,vnode_id id)
+			:
+			fVolume(volume),
+			fID(id)
+		{
+		}
+
+		Vnode(Volume *volume,block_run run)
+			:
+			fVolume(volume),
+			fID(volume->ToVnode(run))
+		{
+		}
+
+		~Vnode()
+		{
+			Put();
+		}
+
+		// Gets the vnode and stores its private node in "inode";
+		// returns the status of get_vnode().
+		status_t Get(Inode **inode)
+		{
+			// should we check inode against NULL here? it should not be necessary
+			void **privateNode = (void **)inode;
+			return get_vnode(fVolume->ID(),fID,privateNode);
+		}
+
+		// Puts the vnode; any further call is a no-op.
+		void Put()
+		{
+			if (fVolume != NULL) {
+				put_vnode(fVolume->ID(),fID);
+				fVolume = NULL;
+			}
+		}
+
+		// Prevents the object from putting the vnode later on.
+		void Keep()
+		{
+			fVolume = NULL;
+		}
+
+	private:
+		Volume		*fVolume;	// NULL once the vnode was put or kept
+		vnode_id	fID;
+};
+
+
+// Iterates over all attributes of an inode: first those stored in the
+// small_data section of the inode, then those of its attribute directory.
+// The iterator registers itself with the inode on construction, and
+// unregisters on destruction.
+
+class AttributeIterator {
+	public:
+		AttributeIterator(Inode *inode);
+		~AttributeIterator();
+		
+		status_t Rewind();
+		status_t GetNext(char *name,size_t *length,uint32 *type,vnode_id *id);
+
+	private:
+		int32		fCurrentSmallData;
+			// index of the next small_data item, -1 if that section is done
+		Inode		*fInode, *fAttributes;
+			// the inode itself, and its attribute directory (NULL until needed)
+		TreeIterator *fIterator;
+		void		*fBuffer;
+
+	private:
+		// the elaborated form ("friend class") is valid in every C++
+		// dialect; the plain "friend X;" form is only allowed since C++11
+		friend class Chain<AttributeIterator>;
+		friend class Inode;
+
+		void Update(uint16 index,int8 change);
+		AttributeIterator *fNext;
+			// presumably the link used by Chain<> - confirm in Chain.h
+};
+
+
+/**	Translates the "omode" open flags that are passed to bfs_open() into
+ *	the access modes needed for them: O_RDONLY becomes R_OK, O_WRONLY
+ *	becomes W_OK, and everything else requires both, R_OK | W_OK.
+ */
+
+inline int oModeToAccess(int omode)
+{
+	switch (omode & O_RWMASK) {
+		case O_RDONLY:
+			return R_OK;
+		case O_WRONLY:
+			return W_OK;
+		default:
+			return R_OK | W_OK;
+	}
+}
+
+#endif	/* INODE_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/Jamfile b/src/add-ons/kernel/file_systems/bfs/Jamfile
new file mode 100644
index 0000000000..c7ec61aac5
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Jamfile
@@ -0,0 +1,23 @@
+# Build rules for the BFS file system add-on.
+SubDir OBOS_TOP src add-ons kernel file_systems bfs ;
+
+# Compile the C and C++ sources of this directory with -g and the
+# flags FDefines produces for DEBUG.
+{
+	local debug = -g ;
+	local defines = [ FDefines DEBUG ] ;
+	SubDirCcFlags $(defines) $(debug) ;
+	SubDirC++Flags $(defines) $(debug) ;
+}
+
+# The add-on "obfs" consists of the sources listed below.
+# NOTE(review): the second argument is presumably the install location
+# relative to the add-ons directory - confirm against the R5KernelAddon rule.
+R5KernelAddon obfs : [ FDirName kernel file_systems bfs ] :
+	BlockAllocator.cpp
+	BPlusTree.cpp
+	cpp.cpp
+	Debug.cpp
+	Index.cpp
+	Inode.cpp
+	Journal.cpp
+	kernel_interface.cpp
+	Query.cpp
+	Utility.cpp
+	Volume.cpp
+;
+
diff --git a/src/add-ons/kernel/file_systems/bfs/Journal.cpp b/src/add-ons/kernel/file_systems/bfs/Journal.cpp
new file mode 100644
index 0000000000..a60b7c8b88
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Journal.cpp
@@ -0,0 +1,433 @@
+/* Journal - transaction and logging
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include "Journal.h"
+#include "Inode.h"
+#include "Debug.h"
+#include "cpp.h"
+
+
+/**	Sets up the journal for the given volume; the log size is taken from
+ *	the length of the volume's log block_run, and the maximum transaction
+ *	size is derived from it.
+ */
+
+Journal::Journal(Volume *volume)
+	:
+	fVolume(volume),
+	fLock("bfs journal"),
+	fOwner(NULL),
+	fOwningThread(-1),
+	fArray(volume->BlockSize()),
+	fLogSize(volume->Log().length),
+	// NOTE(review): this relies on fLogSize being declared before
+	// fMaxTransactionSize in the class - confirm in Journal.h
+	fMaxTransactionSize(fLogSize / 4 - 5),
+	fUsed(0),
+	fTransactionsInEntry(0)
+{
+	// NOTE(review): for non-negative values, fLogSize / 4 - 5 can't be
+	// larger than fLogSize / 2, so this presumably only catches an unsigned
+	// wrap-around for very small logs - confirm the member types
+	if (fMaxTransactionSize > fLogSize / 2)
+		fMaxTransactionSize = fLogSize / 2 - 5;
+}
+
+
+/**	Writes out the current log entry and flushes the device before the
+ *	journal goes away; the status of that operation is ignored.
+ */
+
+Journal::~Journal()
+{
+	FlushLogAndBlocks();
+}
+
+
+/**	Checks whether the log contains entries (log start and log end differ)
+ *	and replays them in that case; returns B_OK for an empty log, or the
+ *	status of ReplayLog() otherwise.
+ */
+
+status_t
+Journal::InitCheck()
+{
+	// an empty log means there is nothing to do
+	if (fVolume->LogStart() == fVolume->LogEnd())
+		return B_OK;
+
+	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY)
+		FATAL(("log_start and log_end differ, but disk is marked clean - trying to replay log...\n"));
+
+	return ReplayLog();
+}
+
+
+/**	Is meant to verify the integrity of a log entry, but currently only
+ *	prints the number of blocks in the entry and accepts it.
+ *	"count" is the number of entries, "array" the block numbers.
+ */
+
+status_t
+Journal::CheckLogEntry(int32 count,off_t *array)
+{
+	// ToDo: check log entry integrity (block numbers and entry size)
+	// (the format string must not have more conversions than arguments)
+	PRINT(("Log entry has %ld entries\n",count));
+	return B_OK;
+}
+
+
+/**	Replays the log entry that starts at the log position "*_start": the
+ *	entry begins with one or more array blocks listing the target block
+ *	numbers (the very first value is the number of blocks in the entry),
+ *	followed by the logged blocks themselves, which are written back to
+ *	their target positions on the device.
+ *	"*_start" is advanced past the entry (without wrapping it at the log
+ *	size - the caller does that).
+ *	Returns B_OK, B_IO_ERROR if a block could not be read or written, or
+ *	B_BAD_DATA if the entry doesn't look valid.
+ */
+
+status_t
+Journal::ReplayLogEntry(int32 *_start)
+{
+	PRINT(("ReplayLogEntry(start = %u)\n",*_start));
+
+	off_t logOffset = fVolume->ToBlock(fVolume->Log());
+	off_t arrayBlock = (*_start % fLogSize) + fVolume->ToBlock(fVolume->Log());
+	int32 blockSize = fVolume->BlockSize();
+	int32 count = 1,valuesInBlock = blockSize / sizeof(off_t);
+	int32 numArrayBlocks;
+	off_t blockNumber;
+	bool first = true;
+
+	CachedBlock cached(fVolume);
+	while (count > 0) {
+		off_t *array = (off_t *)cached.SetTo(arrayBlock);
+		if (array == NULL)
+			return B_IO_ERROR;
+
+		int32 index = 0;
+		if (first) {
+			count = array[0];
+			if (count < 1 || count >= fLogSize)
+				return B_BAD_DATA;
+
+			first = false;
+			
+			numArrayBlocks = ((count + 1) * sizeof(off_t) + blockSize - 1) / blockSize;
+			blockNumber = (*_start + numArrayBlocks) % fLogSize;
+				// first real block in this log entry
+			*_start += count;
+			index++;
+				// the first entry in the first block is the number
+				// of blocks in that log entry
+		}
+		(*_start)++;
+			// accounts for the array block itself
+
+		if (CheckLogEntry(count,array + 1) < B_OK)
+			return B_BAD_DATA;
+
+		CachedBlock cachedCopy(fVolume);
+		for (;index < valuesInBlock && count-- > 0;index++) {
+			PRINT(("replay block %Ld in log at %Ld!\n",array[index],blockNumber));
+
+			uint8 *copy = cachedCopy.SetTo(logOffset + blockNumber);
+			if (copy == NULL)
+				RETURN_ERROR(B_IO_ERROR);
+
+			ssize_t written = write_pos(fVolume->Device(),array[index] << fVolume->BlockShift(),copy,blockSize);
+			if (written != blockSize)
+				RETURN_ERROR(B_IO_ERROR);
+
+			blockNumber = (blockNumber + 1) % fLogSize;
+		}
+
+		// The log is a circular buffer of fLogSize blocks, so the last
+		// valid block is "log start + fLogSize - 1"; wrap around as soon
+		// as the next array block would lie behind it.
+		arrayBlock++;
+		if (arrayBlock >= fVolume->ToBlock(fVolume->Log()) + fLogSize)
+			arrayBlock = fVolume->ToBlock(fVolume->Log());
+	}
+	return B_OK;
+}
+
+
+/**	Replays every entry in the log, one after the other, which brings a
+ *	disk that was not correctly unmounted back into a consistent and
+ *	clean state.
+ *	Journal::InitCheck() calls this method when the log start and the
+ *	log end pointer differ.
+ *	Returns B_ERROR if an entry could not be replayed (or replaying made
+ *	no progress), or the status of writing the super block.
+ */
+
+status_t
+Journal::ReplayLog()
+{
+	INFORM(("Replay log, disk was not correctly unmounted...\n"));
+
+	int32 position = fVolume->LogStart();
+	int32 previousPosition = -1;
+
+	// go on until the log is completely flushed
+	while (position != fVolume->LogEnd()) {
+		if (position == previousPosition) {
+			// strange, flushing the log hasn't changed the log_start pointer
+			return B_ERROR;
+		}
+		previousPosition = position;
+
+		status_t status = ReplayLogEntry(&position);
+		if (status < B_OK) {
+			FATAL(("replaying log entry from %u failed: %s\n",position,strerror(status)));
+			return B_ERROR;
+		}
+
+		// the log is a circular buffer
+		position %= fLogSize;
+	}
+	
+	PRINT(("replaying worked fine!\n"));
+
+	// the log is empty now, and the disk is clean again
+	disk_super_block &superBlock = fVolume->SuperBlock();
+	superBlock.log_start = fVolume->LogEnd();
+	fVolume->LogStart() = fVolume->LogEnd();
+	superBlock.flags = SUPER_BLOCK_DISK_CLEAN;
+
+	return fVolume->WriteSuperBlock();
+}
+
+
+/**	This is a callback function that is called by the cache, whenever
+ *	a block is flushed to disk that was updated as part of a transaction.
+ *	This is necessary to keep track of completed transactions, to be
+ *	able to update the log start pointer.
+ *	"arg" is the log_entry that was registered together with the blocks;
+ *	once all of its blocks have been flushed, the entry is removed from
+ *	the journal's list and freed.
+ */
+
+void
+Journal::blockNotify(off_t blockNumber,size_t numBlocks,void *arg)
+{
+	log_entry *logEntry = (log_entry *)arg;
+
+	// count down the blocks of this entry that are still in the cache
+	logEntry->cached_blocks -= numBlocks;
+	if (logEntry->cached_blocks > 0) {
+		// nothing to do yet...
+		return;
+	}
+
+	Journal *journal = logEntry->journal;
+	disk_super_block &superBlock = journal->fVolume->SuperBlock();
+	bool update = false;
+
+	// Set log_start pointer if possible...
+
+	// the log start can only be moved if the oldest entry is done
+	// NOTE(review): fEntries.head is read here without holding fEntriesLock
+	// - confirm that this is safe
+	if (logEntry == journal->fEntries.head) {
+		if (logEntry->Next() != NULL) {
+			// advance to the start of the following entry
+			int32 length = logEntry->next->start - logEntry->start;
+			superBlock.log_start = (superBlock.log_start + length) % journal->fLogSize;
+		} else
+			superBlock.log_start = journal->fVolume->LogEnd();
+
+		update = true;
+	}
+	journal->fUsed -= logEntry->length;
+
+	journal->fEntriesLock.Lock();
+	logEntry->Remove();
+	journal->fEntriesLock.Unlock();
+
+	// the entry was allocated with malloc() in WriteLogEntry()
+	free(logEntry);
+
+	// update the super block, and change the disk's state, if necessary
+
+	if (update) {
+		journal->fVolume->LogStart() = superBlock.log_start;
+
+		// an empty log means the disk is clean
+		if (superBlock.log_start == superBlock.log_end)
+			superBlock.flags = SUPER_BLOCK_DISK_CLEAN;
+
+		journal->fVolume->WriteSuperBlock();
+	}
+}
+
+/**	Writes the blocks collected in the current transaction(s) into the log:
+ *	first the array with the block numbers, then the blocks themselves.
+ *	Afterwards, the log end pointer in the super block is updated, and the
+ *	super block is written.
+ *	Returns B_OK if there was nothing to write, B_BAD_DATA if there is not
+ *	enough space in the log (the volume panics in this case), B_IO_ERROR if
+ *	a logged block could not be read, or the status of writing the super
+ *	block.
+ */
+
+status_t
+Journal::WriteLogEntry()
+{
+	fTransactionsInEntry = 0;
+	fHasChangedBlocks = false;
+
+	sorted_array *array = fArray.Array();
+	if (array == NULL || array->count == 0)
+		return B_OK;
+
+	// Make sure there is enough space in the log.
+	// If that fails for whatever reason, panic!
+	force_cache_flush(fVolume->Device(),false);
+	int32 tries = fLogSize / 2 + 1;
+	while (TransactionSize() > FreeLogBlocks() && tries-- > 0)
+		force_cache_flush(fVolume->Device(),true);
+
+	// NOTE(review): if the very last flush frees enough space, "tries" is
+	// exactly 0 here and the volume panics anyway - confirm if intended
+	if (tries <= 0) {
+		fVolume->Panic();
+		return B_BAD_DATA;
+	}
+
+	int32 blockShift = fVolume->BlockShift();
+	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
+	off_t logStart = fVolume->LogEnd();
+	off_t logPosition = logStart % fLogSize;
+
+	// Write disk block array
+
+	uint8 *arrayBlock = (uint8 *)array;
+
+	for (int32 size = fArray.BlocksUsed();size-- > 0;) {
+		write_pos(fVolume->Device(),logOffset + (logPosition << blockShift),arrayBlock,fVolume->BlockSize());
+
+		logPosition = (logPosition + 1) % fLogSize;
+		arrayBlock += fVolume->BlockSize();
+	}
+
+	// Write logged blocks into the log
+
+	CachedBlock cached(fVolume);
+	for (int32 i = 0;i < array->count;i++) {
+		uint8 *block = cached.SetTo(array->values[i]);
+		if (block == NULL)
+			return B_IO_ERROR;
+
+		write_pos(fVolume->Device(),logOffset + (logPosition << blockShift),block,fVolume->BlockSize());
+		logPosition = (logPosition + 1) % fLogSize;
+	}
+
+	// The log entry is tracked in memory until the cache has written all
+	// of its blocks to their final location (see blockNotify()).
+	log_entry *logEntry = (log_entry *)malloc(sizeof(log_entry));
+	if (logEntry != NULL) {
+		logEntry->start = logStart;
+		logEntry->length = TransactionSize();
+		logEntry->cached_blocks = array->count;
+		logEntry->journal = this;
+
+		fEntriesLock.Lock();
+		fEntries.Add(logEntry);
+		fEntriesLock.Unlock();
+
+		fCurrent = logEntry;
+		fUsed += logEntry->length;
+
+		set_blocks_info(fVolume->Device(),&array->values[0],array->count,blockNotify,logEntry);
+	}
+
+	// If the log goes to the next round (the log is written as a
+	// circular buffer), all blocks will be flushed out which is
+	// possible because we don't have any locked blocks at this
+	// point.
+	if (logPosition < logStart)
+		fVolume->FlushDevice();
+
+	// We need to flush the drives own cache here to ensure
+	// disk consistency.
+	// If that call fails, we can't do anything about it anyway
+	ioctl(fVolume->Device(),B_FLUSH_DRIVE_CACHE);
+
+	fArray.MakeEmpty();
+
+	// Update the log end pointer in the super block
+	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
+	fVolume->SuperBlock().log_end = logPosition;
+	fVolume->LogEnd() = logPosition;
+
+	// every path of a non-void function has to return a value; report
+	// whether the updated super block made it to disk
+	return fVolume->WriteSuperBlock();
+}
+
+
+/**	Locks the journal, writes the pending log entry (if any) to disk, and
+ *	flushes the device afterwards.
+ *	Returns the error from Lock() if the journal could not be locked,
+ *	otherwise the result of Volume::FlushDevice().
+ */
+
+status_t 
+Journal::FlushLogAndBlocks()
+{
+	status_t status = Lock((Transaction *)this);
+	if (status != B_OK)
+		return status;
+
+	// write the current log entry to disk
+	
+	if (TransactionSize() != 0) {
+		status = WriteLogEntry();
+		if (status < B_OK) {
+			// the status is a number, not a string - printing it with "%s"
+			// would dereference the error code as a pointer
+			FATAL(("writing current log entry failed: %ld\n",(long)status));
+		}
+	}
+	status = fVolume->FlushDevice();
+
+	Unlock((Transaction *)this,true);
+	return status;
+}
+
+
+/**	Acquires the journal for the given \a owner. If \a owner already holds
+ *	the journal, the call returns immediately with B_OK.
+ *	Returns the error of the underlying lock if it could not be acquired;
+ *	in that case ownership is left untouched.
+ */
+
+status_t
+Journal::Lock(Transaction *owner)
+{
+	if (owner == fOwner)
+		return B_OK;
+
+	status_t status = fLock.Lock();
+	if (status != B_OK) {
+		// we do not hold the lock, so we must neither claim ownership
+		// nor touch the log
+		return status;
+	}
+
+	fOwner = owner;
+	fOwningThread = find_thread(NULL);
+
+	// if the last transaction is older than 2 secs, start a new one
+	if (fTransactionsInEntry != 0 && system_time() - fTimestamp > 2000000L)
+		WriteLogEntry();
+
+	return B_OK;
+}
+
+
+/**	Ends the transaction of \a owner and releases the journal again.
+ *	Calls from anyone but the current owner are ignored.
+ */
+
+void
+Journal::Unlock(Transaction *owner,bool success)
+{
+	if (owner == fOwner) {
+		TransactionDone(success);
+
+		// remember when the last transaction ended, then give up ownership
+		fTimestamp = system_time();
+		fOwner = NULL;
+		fOwningThread = -1;
+		fLock.Unlock();
+	}
+}
+
+
+/**	Called when a transaction has ended. Small transactions are only
+ *	counted and stay batched in the current entry; once the batch has
+ *	reached fMaxTransactionSize, the log entry is written out.
+ */
+
+status_t
+Journal::TransactionDone(bool success)
+{
+	if (fTransactionsInEntry == 0 && !success) {
+		// we can safely abort the transaction
+		// ToDo: abort the transaction
+		PRINT(("should abort transaction...\n"));
+	}
+
+	// the batch is full, write it to the log
+	if (TransactionSize() >= fMaxTransactionSize)
+		return WriteLogEntry();
+
+	// otherwise just batch this transaction with the previous ones
+	// to improve speed
+	fTransactionsInEntry++;
+	fHasChangedBlocks = false;
+
+	return B_OK;
+}
+
+
+/**	Adds \a numBlocks blocks, starting at \a blockNumber, to the current
+ *	transaction and writes their contents from \a buffer into the locked
+ *	cache. Blocks that are already part of the transaction are skipped.
+ *	Returns B_DEVICE_FULL if the transaction would not fit into the log.
+ */
+
+status_t
+Journal::LogBlocks(off_t blockNumber,const uint8 *buffer,size_t numBlocks)
+{
+	// ToDo: that's for now - we should change the log file size here
+	if (TransactionSize() + numBlocks + 1 > fLogSize)
+		return B_DEVICE_FULL;
+
+	fHasChangedBlocks = true;
+	int32 blockSize = fVolume->BlockSize();
+
+	while (numBlocks-- > 0) {
+		if (fArray.Find(blockNumber) < 0) {
+			// The block is new to this transaction: insert it into the
+			// array, and write the changes back into the locked cache buffer
+			fArray.Insert(blockNumber);
+			status_t status = cached_write_locked(fVolume->Device(),blockNumber,buffer,1,blockSize);
+			if (status < B_OK)
+				return status;
+		}
+
+		blockNumber++;
+		buffer += blockSize;
+	}
+
+	// If necessary, flush the log, so that we have enough space for this transaction
+	if (TransactionSize() > FreeLogBlocks())
+		force_cache_flush(fVolume->Device(),true);
+
+	return B_OK;
+}
+
+
+//	#pragma mark -
+
+
+/**	Starts the transaction by locking the journal responsible for
+ *	\a refBlock. Starting an already running transaction is a no-op.
+ *	Returns B_ERROR if there is no journal or it could not be locked.
+ */
+
+status_t 
+Transaction::Start(Volume *volume,off_t refBlock)
+{
+	// has it already been started?
+	if (fJournal != NULL)
+		return B_OK;
+
+	fJournal = volume->GetJournal(refBlock);
+	if (fJournal == NULL || fJournal->Lock(this) != B_OK) {
+		// no journal, or we couldn't get hold of it
+		fJournal = NULL;
+		return B_ERROR;
+	}
+
+	return B_OK;
+}
+
diff --git a/src/add-ons/kernel/file_systems/bfs/Journal.h b/src/add-ons/kernel/file_systems/bfs/Journal.h
new file mode 100644
index 0000000000..f9195d780e
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Journal.h
@@ -0,0 +1,152 @@
+#ifndef JOURNAL_H
+#define JOURNAL_H
+/* Journal - transaction and logging
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include <KernelExport.h>
+
+#ifdef USER
+#	include "myfs.h"
+#	include <stdio.h>
+#endif
+
+#ifndef _IMPEXP_KERNEL
+#	define _IMPEXP_KERNEL
+#endif
+
+extern "C" {
+	#include <lock.h>
+	#include <cache.h>
+}
+
+#include "Volume.h"
+#include "Chain.h"
+#include "Utility.h"
+
+
+// Bookkeeping record for one entry that has been written to the log.
+// Journal::WriteLogEntry() allocates and fills these and adds them to
+// the journal's fEntries list.
+struct log_entry : node<log_entry> {
+	uint16		start;			// log position the entry starts at (the volume's LogEnd() when it was written)
+	uint16		length;			// Journal::TransactionSize() at the time the entry was written
+	uint32		cached_blocks;	// number of block numbers in the transaction array
+	Journal		*journal;		// the journal that wrote this entry
+};
+
+
+// The journal of a volume: it collects the blocks changed by transactions
+// and writes them as entries into the on-disk log.
+class Journal {
+	public:
+		Journal(Volume *);
+		~Journal();
+		
+		status_t InitCheck();
+
+		// Lock() hands the journal to "owner"; Unlock() ends the owner's
+		// transaction and releases the journal again
+		status_t Lock(Transaction *owner);
+		void Unlock(Transaction *owner,bool success);
+
+		status_t CheckLogEntry(int32 count, off_t *array);
+		status_t ReplayLogEntry(int32 *start);
+		status_t ReplayLog();
+
+		// WriteLogEntry() writes the collected blocks into the log,
+		// LogBlocks() adds blocks to the current transaction
+		status_t WriteLogEntry();
+		status_t LogBlocks(off_t blockNumber,const uint8 *buffer, size_t numBlocks);
+
+		thread_id CurrentThread() const { return fOwningThread; }
+		Transaction *CurrentTransaction() const { return fOwner; }
+		// number of items in the block array plus the blocks the array itself uses
+		uint32 TransactionSize() const { return fArray.CountItems() + fArray.BlocksUsed(); }
+
+		// writes the pending log entry and flushes the device
+		status_t FlushLogAndBlocks();
+		Volume *GetVolume() const { return fVolume; }
+
+		inline int32 FreeLogBlocks() const;
+
+	private:
+		friend log_entry;
+
+		// passed to set_blocks_info() together with the log_entry in WriteLogEntry()
+		static void blockNotify(off_t blockNumber, size_t numBlocks, void *arg);
+		status_t TransactionDone(bool success);
+
+		Volume		*fVolume;
+		Benaphore	fLock;				// held by the current owner
+		Transaction *fOwner;			// NULL when the journal is not locked
+		thread_id	fOwningThread;		// -1 when the journal is not locked
+		BlockArray	fArray;				// block numbers of the current transaction
+		uint32		fLogSize,fMaxTransactionSize,fUsed;
+		int32		fTransactionsInEntry;	// transactions batched in the pending entry
+		SimpleLock		fEntriesLock;	// guards fEntries
+		list<log_entry>	fEntries;		// entries that have been written to the log
+		log_entry	*fCurrent;			// the entry written last
+		bool		fHasChangedBlocks;
+		bigtime_t	fTimestamp;			// system_time() of the last Unlock()
+};
+
+
+// Returns the number of blocks between the end and the start of the
+// circular log; if both are equal, the whole log size is returned.
+
+inline int32 
+Journal::FreeLogBlocks() const
+{
+	// the used part wraps around the end of the log
+	if (fVolume->LogStart() > fVolume->LogEnd())
+		return fVolume->LogStart() - fVolume->LogEnd();
+
+	return fLogSize - fVolume->LogEnd() + fVolume->LogStart();
+}
+
+
+// A Transaction locks the journal of a volume for its lifetime and passes
+// all block writes on to it (see WriteBlocks(), which calls
+// Journal::LogBlocks()).
+// If the object is destroyed without Done() having been called, the
+// journal is unlocked with "success" set to false.
+// NOTE(review): an earlier version of this comment said that the class does
+// not yet use logging; that no longer matches WriteBlocks() - confirm.
+
+class Transaction {
+	public:
+		// starts the transaction right away; whether that worked can be
+		// told from GetVolume() returning non-NULL afterwards
+		Transaction(Volume *volume,off_t refBlock)
+			:
+			fJournal(NULL)
+		{
+			Start(volume,refBlock);
+		}
+
+		// same as above, the reference block is given as block_run
+		Transaction(Volume *volume,block_run refRun)
+			:
+			fJournal(NULL)
+		{
+			Start(volume,volume->ToBlock(refRun));
+		}
+
+		// creates an idle transaction, Start() has to be called explicitly
+		Transaction()
+			:
+			fJournal(NULL)
+		{
+		}
+
+		// a transaction that is still running at this point has not
+		// been completed with Done(), so it is reported as failed
+		~Transaction()
+		{
+			if (fJournal)
+				fJournal->Unlock(this,false);
+		}
+
+		status_t Start(Volume *volume,off_t refBlock);
+
+		// ends the transaction successfully and releases the journal
+		void Done()
+		{
+			if (fJournal != NULL)
+				fJournal->Unlock(this,true);
+			fJournal = NULL;
+		}
+
+		// hands the blocks over to the journal; returns B_NO_INIT if the
+		// transaction has not been started
+		status_t WriteBlocks(off_t blockNumber,const uint8 *buffer,size_t numBlocks = 1)
+		{
+			if (fJournal == NULL)
+				return B_NO_INIT;
+
+			return fJournal->LogBlocks(blockNumber,buffer,numBlocks);
+			//status_t status = cached_write/*_locked*/(fVolume->Device(),blockNumber,buffer,numBlocks,fVolume->BlockSize());
+			//return status;
+		}
+
+		// NULL as long as the transaction has not been started
+		Volume	*GetVolume() { return fJournal != NULL ? fJournal->GetVolume() : NULL; }
+
+	protected:
+		Journal	*fJournal;
+};
+
+#endif	/* JOURNAL_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/Lock.h b/src/add-ons/kernel/file_systems/bfs/Lock.h
new file mode 100644
index 0000000000..8ffd9dc133
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Lock.h
@@ -0,0 +1,337 @@
+#ifndef LOCK_H
+#define LOCK_H
+/* Lock - benaphores, read/write lock implementation
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** Roughly based on a Be sample code written by Nathan Schrenk.
+**
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include <KernelExport.h>
+
+
+// A lock that only falls back to its semaphore when there is contention:
+// fCount starts at 1, and the semaphore is only acquired/released when
+// the counter shows that another thread holds or waits for the lock.
+class Benaphore {
+	public:
+		Benaphore(const char *name = "bfs benaphore")
+			:
+			fSemaphore(create_sem(0, name)),
+			fCount(1)
+		{
+		}
+
+		~Benaphore()
+		{
+			delete_sem(fSemaphore);
+		}
+
+		// returns the error of create_sem() if the semaphore could not be created
+		status_t InitCheck()
+		{
+			if (fSemaphore < B_OK)
+				return fSemaphore;
+			
+			return B_OK;
+		}
+
+		// atomic_add() returns the previous value: if that was not
+		// positive, someone else holds the lock and we have to wait
+		status_t Lock()
+		{
+			if (atomic_add(&fCount, -1) <= 0)
+				return acquire_sem(fSemaphore);
+
+			return B_OK;
+		}
+	
+		// a negative previous value means that at least one thread is
+		// waiting in Lock(), so one of them is woken up
+		void Unlock()
+		{
+			if (atomic_add(&fCount, 1) < 0)
+				release_sem(fSemaphore);
+		}
+
+	private:
+		sem_id	fSemaphore;
+		vint32	fCount;
+};
+
+// a convenience class to lock the benaphore
+
+// Locks the given benaphore on construction, and unlocks it again on
+// destruction - but only if locking had succeeded.
+class Locker {
+	public:
+		Locker(Benaphore &lock)
+			:
+			fLock(lock),
+			fStatus(lock.Lock())
+		{
+		}
+
+		~Locker()
+		{
+			if (fStatus != B_OK)
+				return;
+
+			fLock.Unlock();
+		}
+
+	private:
+		Benaphore	&fLock;
+		status_t	fStatus;
+};
+
+
+//**** Many Reader/Single Writer Lock
+
+// This is a "fast" implementation of a single writer/many reader
+// locking scheme. It's fast because it uses the benaphore idea
+// to do lazy semaphore locking - in most cases it will only have
+// to do some simple integer arithmetic.
+// The second semaphore (fWriteLock) is needed to prevent the situation
+// that a second writer can acquire the lock when there are still readers
+// holding it.
+
+#define MAX_READERS 100000
+
+// Note: this code will break if you actually have 100000 readers
+// at once. With the current thread/... limits in BeOS you can't
+// touch that value, but it might be possible in the future.
+// Also, you can only have about 20000 concurrent writers until
+// the semaphore count exceeds the int32 bounds
+
+// Timeouts:
+// It may be a good idea to have timeouts for the WriteLocked class,
+// in case something went wrong - we'll see if this is necessary,
+// but it would be a somewhat poor work-around for a deadlock...
+// But the only real problem with timeouts could be for things like
+// "chkbfs" - because such a tool may need to lock for some more time
+
+
+// define if you want to have fast locks as the foundation for the
+// ReadWriteLock class - the benefit is that acquire_sem() doesn't
+// have to be called when there is no one waiting.
+// The disadvantage is the use of 2 real semaphores which is quite
+// expensive regarding that BeOS only allows for a total of 64k
+// semaphores.
+
+//#define FAST_LOCK
+#ifdef FAST_LOCK
+// Variant that is only compiled when FAST_LOCK is defined: readers and
+// writers share the counter fCount (starting at MAX_READERS), and the
+// semaphore is only touched when the counter indicates contention.
+class ReadWriteLock {
+	public:
+		ReadWriteLock(const char *name = "bfs r/w lock")
+			:
+			fSemaphore(create_sem(0, name)),
+			fCount(MAX_READERS),
+			fWriteLock()
+		{
+		}
+
+		~ReadWriteLock()
+		{
+			delete_sem(fSemaphore);
+		}
+
+		// returns the error of create_sem() if the semaphore could not be created
+		status_t InitCheck()
+		{
+			if (fSemaphore < B_OK)
+				return fSemaphore;
+			
+			return B_OK;
+		}
+
+		// read lock: takes one unit off the counter, and waits on the
+		// semaphore if the previous value was not positive
+		status_t Lock()
+		{
+			if (atomic_add(&fCount, -1) <= 0)
+				return acquire_sem(fSemaphore);
+			
+			return B_OK;
+		}
+		
+		// read unlock: gives the unit back, and releases the semaphore
+		// once if the previous value was negative
+		void Unlock()
+		{
+			if (atomic_add(&fCount, 1) < 0)
+				release_sem(fSemaphore);
+		}
+		
+		// write lock: takes MAX_READERS units off the counter at once;
+		// fWriteLock serializes the writers while they do that
+		status_t LockWrite()
+		{
+			if (fWriteLock.Lock() < B_OK)
+				return B_ERROR;
+
+			// "readers" is the counter value before our subtraction
+			int32 readers = atomic_add(&fCount, -MAX_READERS);
+			status_t status = B_OK;
+
+			if (readers < MAX_READERS) {
+				// Acquire sem for all readers currently not using a semaphore.
+				// But if we are not the only write lock in the queue, just get
+				// the one for us
+				status = acquire_sem_etc(fSemaphore,readers <= 0 ? 1 : MAX_READERS - readers,0,0);
+			}
+			fWriteLock.Unlock();
+
+			return status;
+		}
+		
+		// write unlock: gives the MAX_READERS units back
+		void UnlockWrite()
+		{
+			// "readers" is the counter value before our addition
+			int32 readers = atomic_add(&fCount,MAX_READERS);
+			if (readers < 0) {
+				// release sem for all readers only when we were the only writer
+				release_sem_etc(fSemaphore,readers <= -MAX_READERS ? 1 : -readers,0);
+			}
+		}
+
+	private:
+		friend class ReadLocked;
+		friend class WriteLocked;
+
+		sem_id		fSemaphore;
+		vint32		fCount;
+		Benaphore	fWriteLock;
+};
+#else	// FAST_LOCK
+// Default variant (FAST_LOCK not defined): a single semaphore with an
+// initial count of MAX_READERS. A reader takes one unit of it, a writer
+// takes all MAX_READERS units.
+class ReadWriteLock {
+	public:
+		ReadWriteLock(const char *name = "bfs r/w lock")
+			:
+			fSemaphore(create_sem(MAX_READERS, name))
+		{
+		}
+
+		~ReadWriteLock()
+		{
+			delete_sem(fSemaphore);
+		}
+
+		// returns the error of create_sem() if the semaphore could not be created
+		status_t InitCheck()
+		{
+			if (fSemaphore < B_OK)
+				return fSemaphore;
+			
+			return B_OK;
+		}
+
+		// read lock: acquires a single unit
+		status_t Lock()
+		{
+			return acquire_sem(fSemaphore);
+		}
+		
+		// read unlock: releases a single unit
+		void Unlock()
+		{
+			release_sem(fSemaphore);
+		}
+		
+		// write lock: acquires all MAX_READERS units
+		status_t LockWrite()
+		{
+			return acquire_sem_etc(fSemaphore,MAX_READERS,0,0);
+		}
+		
+		// write unlock: releases all MAX_READERS units
+		void UnlockWrite()
+		{
+			release_sem_etc(fSemaphore,MAX_READERS,0);
+		}
+
+	private:
+		friend class ReadLocked;
+		friend class WriteLocked;
+
+		sem_id		fSemaphore;
+};
+#endif	// FAST_LOCK
+
+
+// Holds a read lock on the given ReadWriteLock for the lifetime of the
+// object; the lock is only released if it could be acquired.
+class ReadLocked {
+	public:
+		ReadLocked(ReadWriteLock &lock)
+			:
+			fLock(lock),
+			fStatus(lock.Lock())
+		{
+		}
+
+		~ReadLocked()
+		{
+			if (fStatus != B_OK)
+				return;
+
+			fLock.Unlock();
+		}
+
+	private:
+		ReadWriteLock	&fLock;
+		status_t		fStatus;
+};
+
+
+// Holds the write lock on the given ReadWriteLock for the lifetime of
+// the object; the lock is only released if it could be acquired.
+class WriteLocked {
+	public:
+		WriteLocked(ReadWriteLock &lock)
+			:
+			fLock(lock),
+			fStatus(lock.LockWrite())
+		{
+		}
+
+		~WriteLocked()
+		{
+			if (fStatus != B_OK)
+				return;
+
+			fLock.UnlockWrite();
+		}
+
+		// Note: despite its name, this returns the status of the lock
+		// attempt, i.e. B_OK if the lock is held, and an error otherwise
+		status_t IsLocked()
+		{
+			return fStatus;
+		}
+
+	private:
+		ReadWriteLock	&fLock;
+		status_t		fStatus;
+};
+
+
+// A simple locking structure that doesn't use a semaphore - it's useful
+// if you have to protect critical parts with a short runtime.
+
+// Every caller of Lock() draws a number from fLock and then polls until
+// fUnlock has reached that number; Unlock() increments fUnlock and by
+// that lets the holder of the next number pass.
+class SimpleLock {
+	public:
+		SimpleLock()
+			:
+			fLock(0),
+			fUnlock(0)
+		{
+		}
+
+		// "time" is the interval passed to snooze() between two polls
+		status_t Lock(bigtime_t time = 500)
+		{
+			// atomic_add() returns the previous value, which is our turn
+			int32 turn = atomic_add(&fLock,1);
+			while (turn != fUnlock)
+				snooze(time);
+
+			// ToDo: the lock cannot fail currently! We may want
+			// to change this
+			return B_OK;
+		}
+
+		// lets the next waiting caller (if any) pass
+		void Unlock()
+		{
+			atomic_add(&fUnlock,1);
+		}
+
+	private:
+		vint32	fLock;		// next number to be handed out
+		vint32	fUnlock;	// number that is allowed to hold the lock
+};
+
+// A convenience class to lock the SimpleLock, note the
+// different timing compared to the direct call
+
+// Locks the given SimpleLock on construction (polling with the given
+// interval), and unlocks it on destruction.
+class SimpleLocker {
+	public:
+		SimpleLocker(SimpleLock &lock,bigtime_t time = 1000)
+			:
+			fLock(lock)
+		{
+			fLock.Lock(time);
+		}
+
+		~SimpleLocker()
+		{
+			fLock.Unlock();
+		}
+
+	private:
+		SimpleLock	&fLock;
+};
+
+#endif	/* LOCK_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/Query.cpp b/src/add-ons/kernel/file_systems/bfs/Query.cpp
new file mode 100644
index 0000000000..ce37e6122a
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Query.cpp
@@ -0,0 +1,1505 @@
+/* Query - query parsing and evaluation
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** The pattern matching is roughly based on code originally written
+** by J. Kercheval, and on code written by Kenneth Almquist, though
+** it shares no code.
+**
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include "Query.h"
+#include "cpp.h"
+#include "bfs.h"
+#include "Debug.h"
+#include "Stack.h"
+#include "Volume.h"
+#include "Inode.h"
+#include "BPlusTree.h"
+#include "Index.h"
+
+#include <SupportDefs.h>
+#include <TypeConstants.h>
+#include <AppDefs.h>
+
+#include <malloc.h>
+#include <stdio.h>
+#include <string.h>
+
+
+// The parser has a very static design, but it will do what is required.
+//
+// ParseOr(), ParseAnd(), ParseEquation() are guaranteeing the operator
+// precedence, that is =,!=,>,<,>=,<= .. && .. ||.
+// Apparently, the "!" (not) can only be used with brackets.
+//
+// If you think that there are too few NULL pointer checks in some places
+// of the code, just read the beginning of the query constructor.
+// The API is not fully available, just the Query and the Expression class
+// are.
+
+
+// The operation of a Term. The order of the comparison operators matters:
+// Equation::Complement() indexes a table with (op - OP_EQUAL).
+enum ops {
+	OP_NONE,
+
+	// operators combining two terms
+	OP_AND,
+	OP_OR,
+
+	// initial operation of an Equation until a comparison has been parsed
+	OP_EQUATION,
+
+	// comparison operators of an Equation
+	OP_EQUAL,
+	OP_UNEQUAL,
+	OP_GREATER_THAN,
+	OP_LESS_THAN,
+	OP_GREATER_THAN_OR_EQUAL,
+	OP_LESS_THAN_OR_EQUAL,
+};
+
+// Results of matchString() and the Match() methods; errors are negative
+enum match {
+	NO_MATCH = 0,
+	MATCH_OK = 1,
+	
+	MATCH_BAD_PATTERN = -2,
+	MATCH_INVALID_CHARACTER		// implicitly -1
+};
+
+// return values from isValidPattern()
+enum {
+	PATTERN_INVALID_ESCAPE = -3,	// escape character without a following character
+	PATTERN_INVALID_RANGE,			// -2: "--" within a set
+	PATTERN_INVALID_SET				// -1: empty or unterminated set
+};
+
+// Storage for the value of an equation in one of the supported types;
+// Equation::ConvertValue() fills the member matching the requested type.
+union value {
+	int64	Int64;
+	uint64	Uint64;
+	int32	Int32;
+	uint32	Uint32;
+	float	Float;
+	double	Double;
+	char	String[INODE_FILE_NAME_LENGTH];
+};
+
+// Abstract base class of the nodes of a parsed query expression; a term
+// is either an Equation or an Operator combining two other terms.
+class Term {
+	public:
+		Term(int8 op) : fOp(op), fParent(NULL) {}
+
+		// Terms are held through Term pointers (see Operator), and the
+		// sub classes own resources that are released in their
+		// destructors, so the destructor has to be virtual
+		virtual ~Term() {}
+
+		int8		Op() const { return fOp; }
+
+		void		SetParent(Term *parent) { fParent = parent; }
+		Term		*Parent() const { return fParent; }
+
+		// returns MATCH_OK, NO_MATCH, or an error code < 0
+		virtual status_t Match(Inode *inode,const char *attribute = NULL,int32 type = 0,
+				const uint8 *key = NULL,size_t size = 0) = 0;
+		// turns the term into its logical complement
+		virtual void Complement() = 0;
+
+		virtual void CalculateScore(Index &index) = 0;
+		virtual int32 Score() const = 0;
+
+		virtual status_t InitCheck() = 0;
+
+#ifdef DEBUG
+		virtual void	PrintToStream() = 0;
+#endif
+
+	protected:
+		int8	fOp;		// one of the "ops" enum values
+		Term	*fParent;
+};
+
+// Although an Equation object is quite independent from the volume on which
+// the query is run, there are some dependencies that are produced while
+// querying:
+// The type/size of the value, the score, and if it has an index or not.
+// So you could run more than one query on the same volume, but it might return
+// wrong values when it runs concurrently on another volume.
+// That's not an issue right now, because we run single-threaded and don't use
+// queries more than once.
+
+// A single comparison of the form "attribute <op> value"; the expression
+// is parsed in the constructor, InitCheck() tells if that was successful.
+class Equation : public Term {
+	public:
+		Equation(char **expr);
+		~Equation();
+
+		virtual status_t InitCheck();
+
+		// helpers used by the constructor while parsing
+		status_t	ParseQuotedString(char **_start,char **_end);
+		char		*CopyString(char *start, char *end);
+
+		virtual status_t Match(Inode *inode,const char *attribute = NULL,int32 type = 0,const uint8 *key = NULL,size_t size = 0);
+		virtual void Complement();
+
+		status_t	PrepareQuery(Volume *volume, Index &index, TreeIterator **iterator);
+		status_t	GetNextMatching(Volume *volume,TreeIterator *iterator,struct dirent *dirent,size_t bufferSize);
+
+		virtual void CalculateScore(Index &index);
+		virtual int32 Score() const { return fScore; }
+
+#ifdef DEBUG
+		virtual void PrintToStream();
+#endif
+
+	private:
+		// converts fString into fValue according to the given type
+		status_t	ConvertValue(type_code type);
+		// compares the given key with fValue using fOp
+		bool		CompareTo(const uint8 *value, uint16 size);
+		uint8		*Value() const { return (uint8 *)&fValue; }
+		status_t	MatchEmptyString();
+
+		char		*fAttribute;		// name of the attribute (malloc'ed copy)
+		char		*fString;			// value as written in the query (malloc'ed copy)
+		union value fValue;				// fString converted to fType
+		type_code	fType;				// type fValue has been converted to
+		size_t		fSize;				// size of fValue, set by ConvertValue()
+		bool		fIsPattern;			// fString contains wildcard characters
+		bool		fIsSpecialTime;		// the attribute is "last_modified"
+
+		int32		fScore;
+		bool		fHasIndex;
+};
+
+// A term that combines two other terms (fLeft and fRight); the operation
+// is stored in Term::fOp.
+class Operator : public Term {
+	public:
+		Operator(Term *,int8,Term *);
+		~Operator();
+
+		Term		*Left() const { return fLeft; }
+		Term		*Right() const { return fRight; }
+
+		virtual status_t Match(Inode *inode,const char *attribute = NULL,int32 type = 0,const uint8 *key = NULL,size_t size = 0);
+		virtual void Complement();
+		
+		virtual void CalculateScore(Index &index);
+		virtual int32 Score() const;
+		
+		virtual status_t InitCheck();
+
+		//Term		*Copy() const;
+#ifdef DEBUG
+		virtual void PrintToStream();
+#endif
+
+	protected:
+		Term		*fLeft,*fRight;
+};
+
+
+//---------------------------------
+
+
+/**	Advances \a *expr by \a skip characters, and then over all following
+ *	spaces and tabs.
+ */
+
+void 
+skipWhitespace(char **expr, int32 skip = 0)
+{
+	char *cursor = *expr + skip;
+
+	for (;*cursor == ' ' || *cursor == '\t';cursor++)
+		;
+
+	*expr = cursor;
+}
+
+
+/**	Moves \a *expr backwards over spaces and tabs, but never beyond \a stop.
+ */
+
+void 
+skipWhitespaceReverse(char **expr,char *stop)
+{
+	char *cursor = *expr;
+
+	for (;cursor > stop;cursor--) {
+		if (*cursor != ' ' && *cursor != '\t')
+			break;
+	}
+
+	*expr = cursor;
+}
+
+
+//	#pragma mark -
+
+
+/**	Decodes the UTF-8 character \a *string points to, and advances
+ *	\a *string to the character following it.
+ *	Returns the decoded character; for single byte characters as well as
+ *	for invalid or truncated sequences, the first byte is returned as is,
+ *	and \a *string is advanced by one byte only.
+ */
+
+uint32
+utf8ToUnicode(char **string)
+{
+	uint8 *bytes = (uint8 *)*string;
+	int32 length;
+	uint8 mask = 0x1f;
+
+	// the upper nibble of the first byte determines the sequence length
+	switch (bytes[0] & 0xf0) {
+		case 0xc0:
+		case 0xd0:	length = 2; break;
+		case 0xe0:	length = 3; break;
+		case 0xf0:
+			mask = 0x0f;
+			length = 4;
+			break;
+		default:
+			// valid 1-byte character
+			// and invalid characters
+			(*string)++;
+			return bytes[0];
+	}
+	uint32 c = bytes[0] & mask;
+	int32 i = 1;
+
+	// Continuation bytes have the form 10xxxxxx - just testing the highest
+	// bit would also accept the start of another multi-byte sequence.
+	// This test also stops at the terminating null byte.
+	for (;i < length && (bytes[i] & 0xc0) == 0x80;i++)
+		c = (c << 6) | (bytes[i] & 0x3f);
+
+	if (i < length) {
+		// invalid character
+		(*string)++;
+		return (uint32)bytes[0];
+	}
+	*string += length;
+	return c;
+}
+
+
+/**	Returns the index of the first wildcard character ('*', '?', or '[')
+ *	in \a string, or -1 if there is none.
+ */
+
+int32
+getFirstPatternSymbol(char *string)
+{
+	for (int32 index = 0;string[index] != '\0';index++) {
+		switch (string[index]) {
+			case '*':
+			case '?':
+			case '[':
+				return index;
+		}
+	}
+
+	return -1;
+}
+
+
+/**	Returns true if \a string contains at least one wildcard character.
+ */
+
+bool
+isPattern(char *string)
+{
+	int32 first = getFirstPatternSymbol(string);
+	return first >= 0;
+}
+
+
+/**	Checks the escapes and sets of the given wildcard pattern.
+ *	Returns B_OK if the pattern is valid, or one of the PATTERN_INVALID_*
+ *	values (all < 0) if it is not.
+ */
+
+status_t
+isValidPattern(char *pattern)
+{
+	while (*pattern) {
+		// "pattern" already points behind the character tested here
+		switch (*pattern++) {
+			case '\\':
+				// the escape character must not be at the end of the pattern
+				if (!*pattern++)
+					return PATTERN_INVALID_ESCAPE;
+				break;
+
+			case '[':
+				// a set must neither be empty nor be the end of the pattern
+				if (pattern[0] == ']' || !pattern[0])
+					return PATTERN_INVALID_SET;
+
+				while (*pattern != ']') {
+					// skip the escape character; it must be followed by
+					// the character it escapes
+					if (*pattern == '\\' && !*++pattern)
+						return PATTERN_INVALID_ESCAPE;
+
+					// the set has not been closed
+					if (!*pattern)
+						return PATTERN_INVALID_SET;
+
+					if (pattern[0] == '-' && pattern[1] == '-')
+						return PATTERN_INVALID_RANGE;
+
+					pattern++;
+				}
+				// the closing ']' is consumed by the next switch
+				break;
+		}
+	}
+	return B_OK;
+}
+
+
+/**	Matches the string against the given wildcard pattern.
+ *	Returns either MATCH_OK, or NO_MATCH when everything went fine,
+ *	or values < 0 (see enum at the top of Query.cpp) if an error
+ *	occurs
+ */
+
+// Supported pattern syntax:
+//	'?'		matches exactly one (UTF-8) character
+//	'*'		matches any number of characters, including none
+//	[...]	matches one character of the set; ranges ("a-z") are allowed,
+//			and a leading '^' or '!' inverts the set
+//	'\'		takes the following character literally
+
+status_t
+matchString(char *pattern,char *string)
+{
+	while (*pattern) {
+		// end of string == valid end of pattern?
+		if (!string[0]) {
+			while (pattern[0] == '*')
+				pattern++;
+			return !pattern[0] ? MATCH_OK : NO_MATCH;
+		}
+
+		switch (*pattern++) {
+			case '?':
+			{
+				// match exactly one UTF-8 character; we are
+				// not interested in the result
+				utf8ToUnicode(&string);
+				break;
+			}
+
+			case '*':
+			{
+				// compact pattern
+				while (true) {
+					if (pattern[0] == '?') {
+						// Every '?' consumes one character. We have to test
+						// for the end of the string before consuming, or
+						// else "*?" would not match a single character.
+						if (!string[0])
+							return NO_MATCH;
+						utf8ToUnicode(&string);
+					} else if (pattern[0] != '*')
+						break;
+
+					pattern++;
+				}
+
+				// if the pattern is done, we have matched the string
+				if (!pattern[0])
+					return MATCH_OK;
+
+				// the rest of the pattern needs at least one more character
+				// (this also keeps the loop below from stepping over the
+				// end of the string)
+				if (!string[0])
+					return NO_MATCH;
+
+				while(true) {
+					// we have removed all occurrences of '*' and '?'
+					if (pattern[0] == string[0]
+						|| pattern[0] == '['
+						|| pattern[0] == '\\') {
+						status_t status = matchString(pattern,string);
+						if (status < B_OK || status == MATCH_OK)
+							return status;
+					}
+
+					// we could be nice here and just jump to the next
+					// UTF-8 character - but we wouldn't gain that much
+					// and it'd be slower (since we're checking for
+					// equality before entering the recursion)
+					if (!*++string)
+						return NO_MATCH;
+				}
+				break;
+			}
+
+			case '[':
+			{
+				bool invert = false;
+				if (pattern[0] == '^' || pattern[0] == '!') {
+					invert = true;
+					pattern++;
+				}
+				
+				if (!pattern[0] || pattern[0] == ']')
+					return MATCH_BAD_PATTERN;
+
+				uint32 c = utf8ToUnicode(&string);
+				bool matched = false;
+
+				while (pattern[0] != ']') {
+					if (!pattern[0])
+						return MATCH_BAD_PATTERN;
+
+					if (pattern[0] == '\\')
+						pattern++;
+
+					uint32 first = utf8ToUnicode(&pattern);
+
+					// Does this character match, or is this a range?
+					if (first == c) {
+						matched = true;
+						break;
+					} else if (pattern[0] == '-' && pattern[1] != ']' && pattern[1]) {
+						pattern++;
+
+						if (pattern[0] == '\\') {
+							pattern++;
+							if (!pattern[0])
+								return MATCH_BAD_PATTERN;
+						}
+						uint32 last = utf8ToUnicode(&pattern);
+
+						if (c >= first && c <= last) {
+							matched = true;
+							break;
+						}
+					}
+				}
+
+				if (invert)
+					matched = !matched;
+
+				if (matched) {
+					// skip the rest of the set
+					while (pattern[0] != ']') {
+						if (!pattern[0])
+							return MATCH_BAD_PATTERN;
+						pattern++;
+					}
+					pattern++;
+					break;
+				}
+				return NO_MATCH;
+			}
+
+			case '\\':
+				if (!pattern[0])
+					return MATCH_BAD_PATTERN;
+				// Step over the escaped character, so that it is the one
+				// that gets compared below - without this, the backslash
+				// itself would be compared against the string
+				pattern++;
+				// supposed to fall through
+			default:
+				if (pattern[-1] != string[0])
+					return NO_MATCH;
+				string++;
+				break;
+		}
+	}
+
+	if (string[0])
+		return NO_MATCH;
+	
+	return MATCH_OK;
+}
+
+
+//	#pragma mark -
+
+
+/**	Parses an equation of the form "attribute <op> value" starting at
+ *	\a *expr. On return, \a *expr points behind the parsed part where the
+ *	parser got that far. Parse errors are not reported directly:
+ *	InitCheck() fails if the equation could not be parsed completely.
+ */
+
+Equation::Equation(char **expr)
+	: Term(OP_EQUATION),
+	fAttribute(NULL),
+	fString(NULL),
+	fType(0),
+	fSize(0),
+	fIsPattern(false),
+	fIsSpecialTime(false),
+	fScore(0),
+	fHasIndex(false)
+{
+	// All members get a defined value above, because parsing can bail
+	// out at many places below
+
+	char *string = *expr;
+	char *start = string;
+	char *end = NULL;
+
+	// Since the equation is the integral part of any query, we're just parsing
+	// the whole thing here.
+	// The whitespace at the start is already removed in Expression::ParseEquation()
+
+	if (*start == '"' || *start == '\'') {
+		// string is quoted (start has to be on the beginning of a string)
+		if (ParseQuotedString(&start,&end) < B_OK)
+			return;
+
+		// set string to a valid start of the equation symbol
+		string = end + 2;
+		skipWhitespace(&string);
+		if (*string != '=' && *string != '<' && *string != '>' && *string != '!') {
+			*expr = string;
+			return;
+		}
+	} else {
+		// search the (in)equation for the actual equation symbol (and for other operators
+		// in case the equation is malformed)
+		while (*string && *string != '=' && *string != '<' && *string != '>' && *string != '!'
+			&& *string != '&' && *string != '|')
+			string++;
+
+		// get the attribute string	(and trim whitespace), in case
+		// the string was not quoted
+		end = string - 1;
+		skipWhitespaceReverse(&end,start);
+	}
+
+	// attribute string is empty (which is not allowed)
+	if (start > end)
+		return;
+		
+	// at this point, "start" points to the beginning of the string, "end" points
+	// to the last character of the string, and "string" points to the first
+	// character of the equation symbol
+
+	// test for the right symbol (as this doesn't need any memory)
+	switch (*string) {
+		case '=':
+			fOp = OP_EQUAL;
+			break;
+		case '>':
+			fOp = *(string + 1) == '=' ? OP_GREATER_THAN_OR_EQUAL : OP_GREATER_THAN;
+			break;
+		case '<':
+			fOp = *(string + 1) == '=' ? OP_LESS_THAN_OR_EQUAL : OP_LESS_THAN;
+			break;
+		case '!':
+			if (*(string + 1) != '=')
+				return;
+			fOp = OP_UNEQUAL;
+			break;
+		
+		// any invalid characters will be rejected
+		default:
+			*expr = string;
+			return;
+	}
+	// let "string" point to the first character after the symbol
+	if (*(string + 1) == '=')
+		string++;
+	string++;
+	skipWhitespace(&string);
+
+	// allocate & copy the attribute string
+
+	fAttribute = CopyString(start,end);
+	if (fAttribute == NULL)
+		return;
+
+	start = string;
+	if (*start == '"' || *start == '\'') {
+		// string is quoted (start has to be on the beginning of a string)
+		if (ParseQuotedString(&start,&end) < B_OK)
+			return;
+		
+		string = end + 2;
+		skipWhitespace(&string);
+	} else {
+		while (*string && *string != '&' && *string != '|' && *string != ')')
+			string++;
+
+		end = string - 1;
+		skipWhitespaceReverse(&end,start);
+	}
+	
+	// at this point, "start" will point to the first character of the value,
+	// "end" will point to its last character, and "string" to the first
+	// character after the value that still has to be parsed
+
+	fString = CopyString(start,end);
+	if (fString == NULL)
+		return;
+
+	// patterns are only allowed for these operations (and strings)
+	if (fOp == OP_EQUAL || fOp == OP_UNEQUAL) {
+		fIsPattern = isPattern(fString);
+		if (fIsPattern && isValidPattern(fString) < B_OK) {
+			// we only want to have valid patterns; setting fString
+			// to NULL will cause InitCheck() to fail
+			free(fString);
+			fString = NULL;
+		}
+	}
+
+	// The special time flag is set if the time values are shifted
+	// 64-bit values to reduce the number of duplicates.
+	// We have to be able to compare them against unshifted values
+	// later. The only index which needs this is the last_modified
+	// index, but we may want to open that feature for other indices,
+	// too one day.
+	fIsSpecialTime = !strcmp(fAttribute,"last_modified");
+
+	*expr = string;
+}
+
+
+// Releases the copies of the attribute name and the value string
+
+Equation::~Equation()
+{
+	if (fString != NULL)
+		free(fString);
+
+	if (fAttribute != NULL)
+		free(fAttribute);
+}
+
+
+/**	Returns B_OK if the constructor could parse the equation, i.e. if
+ *	both strings are set and there is an operation, B_BAD_VALUE if not.
+ */
+
+status_t 
+Equation::InitCheck()
+{
+	if (fAttribute != NULL && fString != NULL && fOp != OP_NONE)
+		return B_OK;
+
+	return B_BAD_VALUE;
+}
+
+
+/**	Determines the contents of the quoted string \a *_start points to.
+ *	On success, \a *_start points to the first character after the
+ *	opening quote, and \a *_end to the last character before the closing
+ *	quote (which is one before \a *_start for an empty string).
+ *	Returns B_BAD_VALUE if the closing quote is missing.
+ */
+
+status_t 
+Equation::ParseQuotedString(char **_start, char **_end)
+{
+	char *start = *_start;
+	char quote = *start++;
+	char *end = start;
+	
+	for (;*end && *end != quote;end++) {
+		// Skip the escaped character - but not the terminating null
+		// byte, or else we would run over the end of the string
+		if (*end == '\\' && end[1] != '\0')
+			end++;
+	}
+	if (*end == '\0')
+		return B_BAD_VALUE;
+
+	*_start = start;
+	*_end = end - 1;
+
+	return B_OK;
+}
+
+
+/**	Returns a malloc'ed, null-terminated copy of the characters from
+ *	\a start up to and including \a end.
+ *	Returns NULL if there is not enough memory, or if the copy (including
+ *	the null byte) would be empty or larger than INODE_FILE_NAME_LENGTH.
+ */
+
+char *
+Equation::CopyString(char *start,char *end)
+{
+	// "end" is the last character to be copied, so this is the number
+	// of characters plus one for the null-termination
+	int32 length = end + 2 - start;
+
+	// INODE_FILE_NAME_LENGTH is the max. attribute name length as well
+	// as the max. string in an index, so there is no need for more
+	if (length <= 0 || length > INODE_FILE_NAME_LENGTH)
+		return NULL;
+
+	char *copy = (char *)malloc(length);
+	if (copy != NULL) {
+		memcpy(copy,start,length - 1);
+		copy[length - 1] = '\0';
+	}
+
+	return copy;
+}
+
+
+/**	Converts the value string of the equation into the given \a type, and
+ *	stores the result in fValue, and its size in fSize.
+ *	MIME strings ('MIMS') are treated as B_STRING_TYPE.
+ *	Returns B_ERROR if the type is not supported; in this case, the
+ *	previously converted value stays valid.
+ */
+
+status_t 
+Equation::ConvertValue(type_code type)
+{
+	// Has the type already been converted?
+	if (type == fType)
+		return B_OK;
+
+	char *string = fString;
+
+	switch (type) {
+		// B_MIME_STRING_TYPE is defined in Mime.h which I didn't want to include just for that
+		case 'MIMS':
+			type = B_STRING_TYPE;
+			// supposed to fall through
+		case B_STRING_TYPE:
+			strncpy(fValue.String,string,INODE_FILE_NAME_LENGTH);
+			fValue.String[INODE_FILE_NAME_LENGTH - 1] = '\0';
+			fSize = strlen(fValue.String);
+			break;
+		case B_INT32_TYPE:
+			fValue.Int32 = strtol(string,&string,0);
+			fSize = sizeof(int32);
+			break;
+		case B_UINT32_TYPE:
+			fValue.Uint32 = strtoul(string,&string,0);
+			fSize = sizeof(uint32);
+			break;
+		case B_INT64_TYPE:
+			fValue.Int64 = strtoll(string,&string,0);
+			fSize = sizeof(int64);
+			break;
+		case B_UINT64_TYPE:
+			fValue.Uint64 = strtoull(string,&string,0);
+			fSize = sizeof(uint64);
+			break;
+		case B_FLOAT_TYPE:
+			fValue.Float = strtod(string,&string);
+			fSize = sizeof(float);
+			break;
+		case B_DOUBLE_TYPE:
+			fValue.Double = strtod(string,&string);
+			fSize = sizeof(double);
+			break;
+		default:
+			FATAL(("query value conversion to 0x%lx requested!\n",type));
+			// should we fail here or just do a safety int32 conversion?
+			return B_ERROR;
+	}
+
+	// The type is only remembered after the conversion was successful, and
+	// after MIME strings have been mapped to B_STRING_TYPE. Otherwise, a
+	// second call with an unsupported type would report success, and
+	// patterns would be disabled for MIME strings below.
+	fType = type;
+
+	// patterns are only allowed for string types
+	if (fType != B_STRING_TYPE && fIsPattern)
+		fIsPattern = false;
+
+	return B_OK;
+}
+
+
+/**	Returns true when the key matches the equation. You have to
+ *	call ConvertValue() before this one.
+ */
+
+bool
+Equation::CompareTo(const uint8 *value,uint16 size)
+{
+	int32 result;
+
+	if (fIsPattern) {
+		// fIsPattern is only true if it's a string type, and fOp OP_EQUAL,
+		// or OP_UNEQUAL.
+		// The pattern has been validated before, so there is no need to
+		// look at errors here - if matchString() fails for whatever
+		// reason, the value just doesn't match
+		result = 1;
+		if (matchString(fValue.String,(char *)value) == MATCH_OK)
+			result = 0;
+	} else if (!fIsSpecialTime) {
+		result = compareKeys(fType,value,size,Value(),fSize);
+	} else {
+		// the key comes from a shifted int64 index (i.e. the last_modified
+		// index), but our value is unshifted
+		int64 timeValue = *(int64 *)value >> INODE_TIME_SHIFT;
+		result = compareKeys(fType,&timeValue,sizeof(int64),&fValue.Int64,sizeof(int64));
+	}
+
+	// translate the comparison result according to our operation
+	switch (fOp) {
+		case OP_EQUAL:
+			return result == 0;
+		case OP_UNEQUAL:
+			return result != 0;
+		case OP_GREATER_THAN:
+			return result > 0;
+		case OP_GREATER_THAN_OR_EQUAL:
+			return result >= 0;
+		case OP_LESS_THAN:
+			return result < 0;
+		case OP_LESS_THAN_OR_EQUAL:
+			return result <= 0;
+	}
+
+	FATAL(("Unknown/Unsupported operation: %d\n",fOp));
+	return false;
+}
+
+
+/**	Replaces the comparison operator by its logical complement,
+ *	e.g. "<" becomes ">=".
+ */
+
+void 
+Equation::Complement()
+{
+	D(if (fOp <= OP_EQUATION || fOp > OP_LESS_THAN_OR_EQUAL) {
+		FATAL(("op out of range!"));
+		return;
+	});
+
+	// the complements, in the order of the comparison operators in "ops"
+	int8 opposite[] = {
+		OP_UNEQUAL,					// OP_EQUAL
+		OP_EQUAL,					// OP_UNEQUAL
+		OP_LESS_THAN_OR_EQUAL,		// OP_GREATER_THAN
+		OP_GREATER_THAN_OR_EQUAL,	// OP_LESS_THAN
+		OP_LESS_THAN,				// OP_GREATER_THAN_OR_EQUAL
+		OP_GREATER_THAN				// OP_LESS_THAN_OR_EQUAL
+	};
+
+	int32 slot = fOp - OP_EQUAL;
+	fOp = opposite[slot];
+}
+
+
+/**	Matches the equation against an empty string; used when the
+ *	attribute in question does not exist.
+ *	Returns MATCH_OK, NO_MATCH, or the error of ConvertValue().
+ */
+
+status_t
+Equation::MatchEmptyString()
+{
+	// If we already know that our value is not of a string type, there
+	// is nothing to compare. Should we do this only for OP_UNEQUAL?
+	const bool stringOrUnset = fType == 0 || fType == B_STRING_TYPE;
+	if (!stringOrUnset)
+		return NO_MATCH;
+
+	// The value has to be available as a string before it can be compared
+	// with "" - that's why ConvertValue() is called here, but it will be
+	// a cheap call for the next time
+	status_t result = ConvertValue(B_STRING_TYPE);
+	if (result != B_OK)
+		return result;
+
+	if (CompareTo((const uint8 *)"",fSize))
+		return MATCH_OK;
+
+	return NO_MATCH;
+}
+
+
+/**	Matches the inode's attribute value with the equation.
+ *	Returns MATCH_OK if it matches, NO_MATCH if not, < 0 if something went wrong
+ *
+ *	If "attributeName" equals the equation's attribute, the passed "key"
+ *	(of "type" and "size") is used as the value to compare against;
+ *	otherwise the value is taken from the inode itself.
+ */
+
+status_t
+Equation::Match(Inode *inode,const char *attributeName,int32 type,const uint8 *key,size_t size)
+{
+	// get a pointer to the attribute in question
+	union value value;
+		// only used as read buffer for attributes that are stored in
+		// their own attribute inode
+	uint8 *buffer;
+	
+	// first, check if we are matching for a live query and use that value
+	if (attributeName != NULL && !strcmp(fAttribute,attributeName)) {
+		if (key == NULL) {
+			// no value at all: only a string equation can still match
+			// (against the empty string)
+			if (type == B_STRING_TYPE)
+				return MatchEmptyString();
+
+			return NO_MATCH;
+		}
+		buffer = const_cast<uint8 *>(key);
+	} else if (!strcmp(fAttribute,"name")) {
+		// if not, check for "fake" attributes, "name", "size", "last_modified",
+		buffer = (uint8 *)inode->Name();
+		if (buffer == NULL)
+			return B_ERROR;
+
+		type = B_STRING_TYPE;
+		size = strlen((const char *)buffer);
+	} else if (!strcmp(fAttribute,"size")) {
+		// note: "size" keeps the value passed in by the caller here
+		buffer = (uint8 *)&inode->Node()->data.size;
+		type = B_INT64_TYPE;
+	} else if (!strcmp(fAttribute,"last_modified")) {
+		// note: "size" keeps the value passed in by the caller here
+		buffer = (uint8 *)&inode->Node()->last_modified_time;
+		type = B_INT64_TYPE;
+	} else {
+		// then for attributes in the small_data section, and finally for the
+		// real attributes
+		Inode *attribute;
+		
+		inode->SmallDataLock().Lock();
+		small_data *smallData = inode->FindSmallData(fAttribute);
+		if (smallData != NULL) {
+			// NOTE(review): "buffer" still points into the small_data
+			// section after the lock has been released - confirm that the
+			// data cannot change before CompareTo() is done with it
+			buffer = smallData->Data();
+			type = smallData->type;
+			size = smallData->data_size;
+			inode->SmallDataLock().Unlock();
+		} else {
+			// needed to unlock the small_data section as fast as possible
+			inode->SmallDataLock().Unlock();
+
+			if (inode->GetAttribute(fAttribute,&attribute) == B_OK) {
+				buffer = (uint8 *)&value;
+				type = attribute->Node()->type;
+				size = attribute->Size();
+	
+				// never read more than INODE_FILE_NAME_LENGTH bytes
+				// NOTE(review): presumably this is what "union value" can
+				// hold; nothing here null-terminates a string attribute -
+				// verify against the declaration of the union
+				if (size > INODE_FILE_NAME_LENGTH)
+					size = INODE_FILE_NAME_LENGTH;
+	
+				if (attribute->ReadAt(0,buffer,&size) < B_OK) {
+					inode->ReleaseAttribute(attribute);
+					return B_IO_ERROR;
+				}
+				inode->ReleaseAttribute(attribute);
+			} else
+				return MatchEmptyString();
+		}
+	}
+	// prepare own value for use, if it is possible to convert it
+	status_t status = ConvertValue(type);
+	if (status == B_OK)
+		status = CompareTo(buffer,size) ? MATCH_OK : NO_MATCH;
+
+	RETURN_ERROR(status);
+}
+
+
+/**	Computes the score of this equation, which is used to decide
+ *	which equation of a query is the best one to iterate over. The score
+ *	depends on the operator, a possible pattern, and the size of the index.
+ */
+
+void 
+Equation::CalculateScore(Index &index)
+{
+	// As always, these values could be tuned and refined.
+	// And the code could also need some real world testing :-)
+
+	// An equation that cannot use its own index (OP_UNEQUAL, or no index
+	// for the attribute) has to operate on a "foreign" index and scores 0
+	const bool foreignIndex = fOp == OP_UNEQUAL || index.SetTo(fAttribute) < B_OK;
+	if (foreignIndex) {
+		fScore = 0;
+		return;
+	}
+
+	if (fIsPattern) {
+		// the more characters precede the first pattern symbol, the more
+		// the pattern helps our search
+		fScore = getFirstPatternSymbol(fString) << 3;
+	} else if (fOp == OP_EQUAL) {
+		// higher than pattern="255 chars+*"
+		fScore = 2048;
+	} else {
+		// the pattern search is regarded cheaper when you have at
+		// least one character to set your index to
+		fScore = 5;
+	}
+
+	// Take the index size into account (1024 is the current node size
+	// in our B+trees); the smaller the index, the better the score.
+	// 2048 * 2048 == 4194304 is the maximum score (for an empty
+	// tree, since the header + 1 node are already 2048 bytes)
+	fScore = fScore * ((2048 * 1024LL) / index.Node()->Size());
+}
+
+
+/**	Prepares the equation for iterating over its matches: "index" is set
+ *	to the index to be used, the equation's value is converted to the
+ *	type of that index, and a new TreeIterator is created and, if
+ *	possible, moved to the first position that may match.
+ *	Note that the function can return an error after "*iterator" has
+ *	already been allocated - the iterator is not deleted here.
+ */
+
+status_t
+Equation::PrepareQuery(Volume */*volume*/, Index &index, TreeIterator **iterator)
+{
+	type_code type;
+	status_t status = index.SetTo(fAttribute);
+	
+	// special case for OP_UNEQUAL - it will always operate through the whole index
+	// but we need the call to the original index to get the correct type
+	if (status < B_OK || fOp == OP_UNEQUAL) {
+		// Try to get an index that holds all files (name)
+		// Also sets the default type for all attributes without index
+		// to string.
+		type = status < B_OK ? B_STRING_TYPE : index.Type();
+
+		if (index.SetTo("name") < B_OK)
+			return B_ENTRY_NOT_FOUND;
+		
+		// GetNextMatching() has to check every inode via Match() in this case
+		fHasIndex = false;
+	} else {
+		fHasIndex = true;
+		type = index.Type();
+	}
+
+	if (ConvertValue(type) < B_OK)
+		return B_BAD_VALUE;
+
+	BPlusTree *tree;
+	if (index.Node()->GetTree(&tree) < B_OK)
+		return B_ERROR;
+
+	*iterator = new TreeIterator(tree);
+	if (*iterator == NULL)
+		return B_NO_MEMORY;
+
+	// Only these operators (and patterns) can profit from positioning the
+	// iterator; all others start at the beginning of the index
+	if ((fOp == OP_EQUAL || fOp == OP_GREATER_THAN || fOp == OP_GREATER_THAN_OR_EQUAL
+		|| fIsPattern)
+		&& fHasIndex) {
+		// set iterator to the exact position
+
+		int32 keySize = index.KeySize();
+
+		// at this point, fIsPattern is only true if it's a string type, and fOp
+		// is either OP_EQUAL or OP_UNEQUAL
+		if (fIsPattern) {
+			// let's see if we can use the beginning of the key for positioning
+			// the iterator and adjust the key size; if not, just leave the
+			// iterator at the start and return success
+			keySize = getFirstPatternSymbol(fString);
+			if (keySize <= 0)
+				return B_OK;
+		}
+
+		// a key size of 0 is only accepted for strings, where the actual
+		// length of the value is used instead
+		if (keySize == 0) {
+			if (fType == B_STRING_TYPE)
+				keySize = strlen(fValue.String);
+			else
+				RETURN_ERROR(B_ENTRY_NOT_FOUND);
+		}
+
+		if (fIsSpecialTime) {
+			// we have to find the first matching shifted value
+			off_t value = fValue.Int64 << INODE_TIME_SHIFT;
+			status = (*iterator)->Find((uint8 *)&value,keySize);
+			if (status == B_ENTRY_NOT_FOUND)
+				return B_OK;
+		} else {
+			status = (*iterator)->Find(Value(),keySize);
+			// an exact match that is not in the index cannot match at all,
+			// so the result of Find() is passed on as is; for the other
+			// cases, a missing key is not an error
+			if (fOp == OP_EQUAL && !fIsPattern)
+				return status;
+			else if (status == B_ENTRY_NOT_FOUND && (fIsPattern || fOp == OP_GREATER_THAN || fOp == OP_GREATER_THAN_OR_EQUAL))
+				return B_OK;
+		}
+
+		RETURN_ERROR(status);
+	}
+
+	return B_OK;
+}
+
+
+/**	Retrieves the next inode from the iterator that matches the whole
+ *	expression this equation belongs to, and fills "dirent" with it.
+ *	"bufferSize" is the size of the buffer "dirent" points to; entries
+ *	whose name does not fit into it are reported via FATAL() and skipped.
+ *	Returns B_OK if an entry has been put into "dirent", or an error
+ *	code (usually B_ENTRY_NOT_FOUND) if there are no more matches.
+ */
+
+status_t 
+Equation::GetNextMatching(Volume *volume, TreeIterator *iterator,
+		struct dirent *dirent, size_t bufferSize)
+{
+	while (true) {
+		union value indexValue;
+		uint16 keyLength;
+		uint16 duplicate;
+		off_t offset;
+
+		status_t status = iterator->GetNextEntry(&indexValue,&keyLength,(uint16)sizeof(indexValue),&offset,&duplicate);
+		if (status < B_OK)
+			return status;
+
+		// only compare against the index entry when this is the correct
+		// index for the equation
+		if (fHasIndex && duplicate < 2 && !CompareTo((uint8 *)&indexValue,keyLength)) {
+			// They aren't equal? let the operation decide what to do
+			// Since we always start at the beginning of the index (or the correct
+			// position), only some needs to be stopped if the entry doesn't fit.
+			if (fOp == OP_LESS_THAN
+				|| fOp == OP_LESS_THAN_OR_EQUAL
+				|| (fOp == OP_EQUAL && !fIsPattern))
+				return B_ENTRY_NOT_FOUND;
+
+			if (duplicate > 0)
+				iterator->SkipDuplicates();
+			continue;
+		}
+
+		Inode *inode;
+		if ((status = get_vnode(volume->ID(),offset,(void **)&inode)) != B_OK) {
+			REPORT_ERROR(status);
+			FATAL(("could not get inode %Ld in index \"%s\"!\n",offset,fAttribute));
+			// try with next
+			continue;
+		}
+
+		// check user permissions here - but which one?!
+		// we could filter out all those where we don't have
+		// read access... (we should check for every parent
+		// directory if the X_OK is allowed)
+		// Although it's quite expensive to open all parents,
+		// it's likely that the application that runs the
+		// query will do something similar (and we don't have
+		// to do it for root, either).
+
+		// go up in the tree until a &&-operator is found, and check if the
+		// inode matches with the rest of the expression - we don't have to
+		// check ||-operators for that
+		Term *term = this;
+		status = MATCH_OK;
+
+		// without a matching index, the inode itself has to be checked
+		if (!fHasIndex)
+			status = Match(inode);
+
+		while (term != NULL && status == MATCH_OK) {
+			Operator *parent = (Operator *)term->Parent();
+			if (parent == NULL)
+				break;
+
+			if (parent->Op() == OP_AND) {
+				// choose the other child of the parent
+				Term *other = parent->Right();
+				if (other == term)
+					other = parent->Left();
+
+				if (other == NULL) {
+					FATAL(("&&-operator has only one child... (parent = %p)\n",parent));
+					break;
+				}
+				status = other->Match(inode);
+				if (status < 0) {
+					REPORT_ERROR(status);
+					status = NO_MATCH;
+				}
+			}
+			term = (Term *)parent;
+		}
+
+		if (status == MATCH_OK) {
+			// Never write beyond the caller's buffer: sizeof(struct dirent)
+			// already accounts for at least one byte of d_name, which is
+			// used for the terminating null byte.
+			const char *name = inode->Name();
+			size_t nameLength = name != NULL ? strlen(name) : 0;
+
+			if (name == NULL
+				|| bufferSize < sizeof(struct dirent)
+				|| nameLength > bufferSize - sizeof(struct dirent)) {
+				FATAL(("name of inode %Ld is missing or does not fit into the buffer!\n",offset));
+				// skip this entry instead of overwriting foreign memory
+				status = NO_MATCH;
+			} else {
+				dirent->d_dev = volume->ID();
+				dirent->d_ino = offset;
+				dirent->d_pdev = volume->ID();
+				dirent->d_pino = volume->ToVnode(inode->Parent());
+				memcpy(dirent->d_name,name,nameLength + 1);
+				dirent->d_reclen = nameLength;
+			}
+		}
+
+		put_vnode(volume->ID(), inode->ID());
+
+		if (status == MATCH_OK)
+			return B_OK;
+	}
+	RETURN_ERROR(B_ERROR);
+}
+
+
+//	#pragma mark -
+
+
+/**	Creates an operator node with the given children, and registers
+ *	itself as the parent of every child that is not NULL.
+ */
+
+Operator::Operator(Term *left, int8 op, Term *right)
+	: Term(op),
+	fLeft(left),
+	fRight(right)
+{
+	if (left != NULL)
+		left->SetParent(this);
+
+	if (right != NULL)
+		right->SetParent(this);
+}
+
+
+/**	Deletes both children of the operator. */
+
+Operator::~Operator()
+{
+	delete fLeft;
+	delete fRight;
+}
+
+
+/**	Matches the inode against both children of the operator.
+ *	For OP_AND, both terms have to match. For OP_OR, one matching term
+ *	is enough - the term with the better score is tried first, and the
+ *	other one is only evaluated if the first one returned NO_MATCH.
+ *	Returns MATCH_OK, NO_MATCH, or an error code (< 0).
+ */
+
+status_t
+Operator::Match(Inode *inode,const char *attribute,int32 type,const uint8 *key,size_t size)
+{
+	if (fOp == OP_AND) {
+		status_t status = fLeft->Match(inode,attribute,type,key,size);
+		if (status != MATCH_OK)
+			return status;
+
+		return fRight->Match(inode,attribute,type,key,size);
+	}
+
+	// OP_OR: start with the term with the better score, but always fall
+	// back to the other one - skipping it would miss valid matches
+	Term *first = fLeft;
+	Term *second = fRight;
+	if (fRight->Score() > fLeft->Score()) {
+		first = fRight;
+		second = fLeft;
+	}
+
+	status_t status = first->Match(inode,attribute,type,key,size);
+	if (status != NO_MATCH)
+		return status;
+
+	return second->Match(inode,attribute,type,key,size);
+}
+
+
+/**	Complements the operator following De Morgan's law: OP_AND and
+ *	OP_OR are exchanged, and both children are complemented as well.
+ */
+
+void 
+Operator::Complement()
+{
+	if (fOp != OP_AND)
+		fOp = OP_AND;
+	else
+		fOp = OP_OR;
+
+	fLeft->Complement();
+	fRight->Complement();
+}
+
+
+/**	Lets both children calculate their scores; the operator's own score
+ *	is derived from them on demand in Score().
+ */
+
+void 
+Operator::CalculateScore(Index &index)
+{
+	fLeft->CalculateScore(index);
+	fRight->CalculateScore(index);
+}
+
+
+/**	Returns the score of the operator: the better score of both
+ *	children for OP_AND, and the worse one for any other operator (OP_OR).
+ */
+
+int32 
+Operator::Score() const
+{
+	const int32 rightScore = fRight->Score();
+	const int32 leftScore = fLeft->Score();
+
+	// for OP_AND, only one of the terms has to be iterated over, so
+	// the better score counts
+	if (fOp == OP_AND)
+		return rightScore > leftScore ? rightScore : leftScore;
+
+	// for OP_OR, be honest, and return the one with the worse score
+	return rightScore < leftScore ? rightScore : leftScore;
+}
+
+
+/**	Returns B_OK if the operator is either OP_AND or OP_OR, and both of
+ *	its children exist and pass their own InitCheck(); B_ERROR otherwise.
+ */
+
+status_t 
+Operator::InitCheck()
+{
+	// only the two logical operators are valid here
+	if (fOp != OP_AND && fOp != OP_OR)
+		return B_ERROR;
+
+	if (fLeft == NULL || fLeft->InitCheck() < B_OK)
+		return B_ERROR;
+
+	if (fRight == NULL || fRight->InitCheck() < B_OK)
+		return B_ERROR;
+
+	return B_OK;
+}
+
+
+// Disabled code: a deep copy of a term tree. It is never compiled, and
+// refers to members and constructors (fEquation, Term(equation),
+// Term(left,op,right)) that are not used anywhere else in the code shown
+// here - presumably a left-over from an older class layout.
+#if 0
+Term *
+Operator::Copy() const
+{
+	// a leaf: copy the equation and wrap it into a new term
+	if (fEquation != NULL) {
+		Equation *equation = new Equation(*fEquation);
+		if (equation == NULL)
+			return NULL;
+
+		Term *term = new Term(equation);
+		if (term == NULL)
+			delete equation;
+		
+		return term;
+	}
+
+	Term *left = NULL, *right = NULL;
+
+	// copy both sub trees, and clean up if one of them fails
+	if (fLeft != NULL && (left = fLeft->Copy()) == NULL)
+		return NULL;
+	if (fRight != NULL && (right = fRight->Copy()) == NULL) {
+		delete left;
+		return NULL;
+	}
+
+	Term *term = new Term(left,fOp,right);
+	if (term == NULL) {
+		delete left;
+		delete right;
+		return NULL;
+	}
+	return term;
+}
+#endif
+
+
+//	#pragma mark -
+
+#ifdef DEBUG
+/**	Debug output: prints the operator and its children in the form
+ *	"( left OP right )". Missing children are simply left out.
+ */
+
+void
+Operator::PrintToStream()
+{
+	D(__out("( "));
+	if (fLeft != NULL)
+		fLeft->PrintToStream();
+	
+	// string literals are constant - don't refer to them via "char *"
+	const char *op;
+	switch (fOp) {
+		case OP_OR: op = "OR"; break;
+		case OP_AND: op = "AND"; break;
+		default: op = "?"; break;
+	}
+	D(__out(" %s ",op));
+	
+	if (fRight != NULL)
+		fRight->PrintToStream();
+
+	D(__out(" )"));
+}
+
+
+/**	Debug output: prints the equation in the form
+ *	["attribute" op "value"]; unknown operators are printed as "???".
+ */
+
+void 
+Equation::PrintToStream()
+{
+	// string literals are constant - don't refer to them via "char *"
+	const char *symbol = "???";
+	switch (fOp) {
+		case OP_EQUAL: symbol = "=="; break;
+		case OP_UNEQUAL: symbol = "!="; break;
+		case OP_GREATER_THAN: symbol = ">"; break;
+		case OP_GREATER_THAN_OR_EQUAL: symbol = ">="; break;
+		case OP_LESS_THAN: symbol = "<"; break;
+		case OP_LESS_THAN_OR_EQUAL: symbol = "<="; break;
+	}
+	D(__out("[\"%s\" %s \"%s\"]",fAttribute,symbol,fString));
+}
+
+#endif	/* DEBUG */
+
+//	#pragma mark -
+
+
+/**	Parses the query string "expr" into a tree of terms. If parsing
+ *	fails (or "expr" is NULL), the expression has no root term, and
+ *	InitCheck() reports an error. Position() returns where parsing stopped.
+ */
+
+Expression::Expression(char *expr)
+	:
+	fPosition(NULL),
+	fTerm(NULL)
+		// both members have to be valid even if we bail out early, since
+		// the destructor deletes fTerm and InitCheck() tests it
+{
+	if (expr == NULL)
+		return;
+	
+	fTerm = ParseOr(&expr);
+	if (fTerm != NULL && fTerm->InitCheck() < B_OK) {
+		FATAL(("Corrupt tree in expression!\n"));
+		delete fTerm;
+		fTerm = NULL;
+	}
+	// debug builds only: dump the tree, and complain about left-overs
+	D(if (fTerm != NULL) {
+		fTerm->PrintToStream();
+		D(__out("\n"));
+		if (*expr != '\0')
+			PRINT(("Unexpected end of string: \"%s\"!\n",expr));
+	});
+	fPosition = expr;
+}
+
+
+/**	Deletes the root term of the expression. */
+
+Expression::~Expression()
+{
+	delete fTerm;
+}
+
+
+/**	Parses either a single equation, or a (possibly negated) term in
+ *	parentheses, and advances "*expr" accordingly.
+ *	Returns the parsed term, or NULL if the input could not be parsed.
+ */
+
+Term *
+Expression::ParseEquation(char **expr)
+{
+	skipWhitespace(expr);
+
+	// "not" is an alternative token for "!" in C++, so it must not be
+	// used as an identifier
+	bool negated = false;
+	if (**expr == '!') {
+		skipWhitespace(expr, 1);
+		// only terms in parentheses can be negated
+		if (**expr != '(')
+			return NULL;
+		
+		negated = true;
+	}
+
+	if (**expr == ')') {
+		// shouldn't be handled here
+		return NULL;
+	} else if (**expr == '(') {
+		skipWhitespace(expr, 1);
+		
+		Term *term = ParseOr(expr);
+		
+		skipWhitespace(expr);
+		
+		if (**expr != ')') {
+			delete term;
+			return NULL;
+		}
+		
+		// If the term is negated, we just complement the tree, to get
+		// rid of the not, a.k.a. DeMorgan's Law.
+		// The term is NULL if the parentheses were empty or their
+		// contents could not be parsed - there is nothing to complement then.
+		if (negated && term != NULL)
+			term->Complement();
+
+		skipWhitespace(expr, 1);
+
+		return term;
+	}
+
+	Equation *equation = new Equation(expr);
+	if (equation == NULL || equation->InitCheck() < B_OK) {
+		delete equation;
+		return NULL;
+	}
+	return equation;
+}
+
+
+/**	Parses a chain of terms that are connected with "&&".
+ *	Returns the root of the resulting tree, or NULL on failure - in
+ *	which case everything parsed so far is deleted.
+ */
+
+Term *
+Expression::ParseAnd(char **expr)
+{
+	Term *result = ParseEquation(expr);
+	if (result == NULL)
+		return NULL;
+
+	while (IsOperator(expr,'&')) {
+		Term *rightSide = ParseAnd(expr);
+
+		// only build the operator if there is something on the right
+		Term *combined = NULL;
+		if (rightSide != NULL)
+			combined = new Operator(result,OP_AND,rightSide);
+
+		if (combined == NULL) {
+			delete result;
+			delete rightSide;
+
+			return NULL;
+		}
+		result = combined;
+	}
+
+	return result;
+}
+
+
+/**	Parses a chain of terms that are connected with "||"; every
+ *	operand is parsed by ParseAnd().
+ *	Returns the root of the resulting tree, or NULL on failure - in
+ *	which case everything parsed so far is deleted.
+ */
+
+Term *
+Expression::ParseOr(char **expr)
+{
+	Term *result = ParseAnd(expr);
+	if (result == NULL)
+		return NULL;
+
+	while (IsOperator(expr,'|')) {
+		Term *rightSide = ParseAnd(expr);
+
+		// only build the operator if there is something on the right
+		Term *combined = NULL;
+		if (rightSide != NULL)
+			combined = new Operator(result,OP_OR,rightSide);
+
+		if (combined == NULL) {
+			delete result;
+			delete rightSide;
+
+			return NULL;
+		}
+		result = combined;
+	}
+
+	return result;
+}
+
+
+/**	Checks if the string at "*expr" starts with the doubled character
+ *	"op" (i.e. "&&" or "||"). If so, both characters are skipped, and
+ *	true is returned; otherwise "*expr" is left untouched.
+ */
+
+bool 
+Expression::IsOperator(char **expr, char op)
+{
+	const char *cursor = *expr;
+
+	if (cursor[0] != op || cursor[1] != op)
+		return false;
+
+	*expr += 2;
+	return true;
+}
+
+
+/**	Returns B_OK if the expression has a root term, B_BAD_VALUE if not. */
+
+status_t 
+Expression::InitCheck()
+{
+	if (fTerm != NULL)
+		return B_OK;
+
+	return B_BAD_VALUE;
+}
+
+
+//	#pragma mark -
+
+
+/**	Creates a query for the given expression: the scores of all terms
+ *	are calculated, the equations that have to be iterated over are
+ *	collected on fStack, and the query is added to the volume.
+ *	Nothing of this happens if the volume or the expression is invalid.
+ */
+
+Query::Query(Volume *volume,Expression *expression)
+	:
+	fVolume(volume),
+	fExpression(expression),
+	fCurrent(NULL),
+	fIterator(NULL),
+	fIndex(volume),
+	fPort(-1)
+{
+	// if the expression has a valid root pointer, the whole tree has
+	// already passed the sanity check, so that we don't have to check
+	// every pointer
+	if (volume == NULL || expression == NULL || expression->Root() == NULL)
+		return;
+
+	// the index is only needed temporarily for the score calculation
+	fExpression->Root()->CalculateScore(fIndex);
+	fIndex.Unset();
+
+	// walk through the tree, and collect all equations to iterate over
+	Stack<Term *> pending;
+	pending.Push(fExpression->Root());
+
+	Term *current;
+	while (pending.Pop(&current)) {
+		if (current->Op() >= OP_EQUATION) {
+			// must be an equation with an actual comparison operator
+			if (current->Op() == OP_EQUATION || fStack.Push((Equation *)current) < B_OK)
+				FATAL(("Unknown term on stack or stack error"));
+			continue;
+		}
+
+		Operator *branch = (Operator *)current;
+
+		if (branch->Op() != OP_OR) {
+			// For OP_AND, we can use the scoring system to decide which path to add
+			if (branch->Right()->Score() > branch->Left()->Score())
+				pending.Push(branch->Right());
+			else
+				pending.Push(branch->Left());
+			continue;
+		}
+
+		// for OP_OR, both paths have to be visited
+		pending.Push(branch->Left());
+		pending.Push(branch->Right());
+	}
+	
+	volume->AddQuery(this);
+}
+
+
+/**	Frees the iterator of a query that has not been read to its end,
+ *	and removes the query from its volume.
+ */
+
+Query::~Query()
+{
+	// the iterator is owned by the query; it is still around if the query
+	// is deleted before GetNextEntry() has reached the end
+	delete fIterator;
+	fIterator = NULL;
+
+	// the constructor accepts a NULL volume, so we have to cope with it, too
+	// NOTE(review): a query with an invalid expression has never been
+	// added to the volume - confirm that RemoveQuery() tolerates that
+	if (fVolume != NULL)
+		fVolume->RemoveQuery(this);
+}
+
+
+/**	Puts the next entry that matches the query into "dirent", where
+ *	"size" is the size of that buffer.
+ *	Returns B_OK if an entry has been found, B_ENTRY_NOT_FOUND if
+ *	there are no more entries.
+ */
+
+status_t 
+Query::GetNextEntry(struct dirent *dirent, size_t size)
+{
+	// If we don't have an equation to use yet/anymore, get a new one
+	// from the stack
+	while (true) {
+		if (fIterator == NULL) {
+			if (!fStack.Pop(&fCurrent)
+				|| fCurrent == NULL)
+				return B_ENTRY_NOT_FOUND;
+
+			status_t status = fCurrent->PrepareQuery(fVolume,fIndex,&fIterator);
+			if (status < B_OK) {
+				// PrepareQuery() may fail after having created the
+				// iterator - don't keep it around for the next call
+				delete fIterator;
+				fIterator = NULL;
+				fCurrent = NULL;
+
+				// This equation cannot match anything, but the other
+				// equations on the stack (from ||-operators) still can
+				if (status == B_ENTRY_NOT_FOUND)
+					continue;
+
+				// all other failures end the query, as before
+				return B_ENTRY_NOT_FOUND;
+			}
+		}
+		if (fCurrent == NULL)
+			RETURN_ERROR(B_ERROR);
+	
+		status_t status = fCurrent->GetNextMatching(fVolume,fIterator,dirent,size);
+		if (status < B_OK) {
+			// this equation is done, go on with the next one
+			delete fIterator;
+			fIterator = NULL;
+			fCurrent = NULL;
+		} else {
+			// only return if we have another entry
+			return B_OK;
+		}
+	}
+}
+
+
+/**	Turns the query into a live query: LiveUpdate() sends its
+ *	notifications to the given port, using the given token.
+ */
+
+void 
+Query::SetLiveMode(port_id port,int32 token)
+{
+	fPort = port;
+	fToken = token;
+}
+
+
+/**	Checks if changing the attribute of the inode from "oldKey" to
+ *	"newKey" changes whether the inode matches the query, and sends out
+ *	the corresponding notification. Does nothing unless the query is in
+ *	live mode (see SetLiveMode()).
+ */
+
+void 
+Query::LiveUpdate(Inode *inode,const char *attribute,int32 type,const uint8 *oldKey,size_t oldLength,const uint8 *newKey,size_t newLength)
+{
+	// an expression without a root term (i.e. one that failed to parse)
+	// cannot match anything
+	if (fPort < 0 || fExpression == NULL || fExpression->Root() == NULL
+		|| attribute == NULL)
+		return;
+
+	// ToDo: check if the attribute is part of the query at all...
+
+	status_t oldStatus = fExpression->Root()->Match(inode,attribute,type,oldKey,oldLength);
+	status_t newStatus = fExpression->Root()->Match(inode,attribute,type,newKey,newLength);
+	int32 op;
+	if (oldStatus == MATCH_OK && newStatus == MATCH_OK) {
+		// only send out a notification if the name was changed 
+		if (oldKey == NULL || strcmp(attribute,"name"))
+			return;
+
+		// a rename is reported as removal of the old name, followed by
+		// the creation of the new one (sent below)
+		send_notification(fPort,fToken,B_QUERY_UPDATE,B_ENTRY_REMOVED,fVolume->ID(),0,fVolume->ToVnode(inode->Parent()),0,inode->ID(),(const char *)oldKey);
+		op = B_ENTRY_CREATED;
+	} else if (oldStatus != MATCH_OK && newStatus != MATCH_OK) {
+		// nothing has changed
+		return;
+	} else if (oldStatus == MATCH_OK && newStatus != MATCH_OK)
+		op = B_ENTRY_REMOVED;
+	else
+		op = B_ENTRY_CREATED;
+
+	// if "value" is NULL, send_notification() crashes...
+	const char *value = (const char *)newKey;
+	if (type != B_STRING_TYPE || value == NULL)
+		value = "";
+
+	send_notification(fPort,fToken,B_QUERY_UPDATE,op,fVolume->ID(),0,fVolume->ToVnode(inode->Parent()),0,inode->ID(),value);
+}
+
diff --git a/src/add-ons/kernel/file_systems/bfs/Query.h b/src/add-ons/kernel/file_systems/bfs/Query.h
new file mode 100644
index 0000000000..50f3779063
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Query.h
@@ -0,0 +1,72 @@
+#ifndef QUERY_H
+#define QUERY_H
+/* Query - query parsing and evaluation
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include <SupportDefs.h>
+
+#include "Index.h"
+#include "Stack.h"
+#include "Chain.h"
+
+class Volume;
+class Term;
+class Equation;
+class TreeIterator;
+class Query;
+
+
+// Parses a query string into a tree of terms, and owns that tree.
+
+class Expression {
+	public:
+		Expression(char *expr);
+		~Expression();
+
+		// B_OK if the expression could be parsed, B_BAD_VALUE if not
+		status_t InitCheck();
+		// the position in the query string where the parser has stopped
+		const char *Position() const { return fPosition; }
+		// the root of the term tree, NULL if parsing failed
+		Term *Root() const { return fTerm; }
+
+	protected:
+		// the parser: "||" chains consist of "&&" chains, which consist
+		// of single equations or terms in parentheses
+		Term *ParseOr(char **expr);
+		Term *ParseAnd(char **expr);
+		Term *ParseEquation(char **expr);
+
+		// tests for (and skips) the doubled character "op"
+		bool IsOperator(char **expr,char op);
+
+	private:
+		char *fPosition;
+		Term *fTerm;
+};
+
+// Iterates over all entries of a volume that match an Expression, and
+// optionally sends live updates when the set of matching entries changes.
+
+class Query {
+	public:
+		Query(Volume *volume,Expression *expression);
+		~Query();
+
+		// retrieves the next matching entry, B_ENTRY_NOT_FOUND at the end
+		status_t GetNextEntry(struct dirent *,size_t size);
+
+		void SetLiveMode(port_id port,int32 token);
+		void LiveUpdate(Inode *inode,const char *attribute,int32 type,const uint8 *oldKey,size_t oldLength,const uint8 *newKey,size_t newLength);
+
+		Expression *GetExpression() const { return fExpression; }
+
+	private:
+		Volume			*fVolume;
+		Expression		*fExpression;
+		Equation		*fCurrent;
+			// the equation that is currently iterated over
+		TreeIterator	*fIterator;
+			// the iterator of fCurrent, created by PrepareQuery()
+		Index			fIndex;
+		Stack<Equation *> fStack;
+			// the equations that still have to be iterated over
+
+		// only valid for live queries, see SetLiveMode()
+		port_id			fPort;
+		int32			fToken;
+
+	private:
+		// a friend declaration of a class needs the "class" key in
+		// standard C++ (before C++11)
+		friend class Chain<Query>;
+		Query			*fNext;
+};
+
+#endif	/* QUERY_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/Stack.h b/src/add-ons/kernel/file_systems/bfs/Stack.h
new file mode 100644
index 0000000000..9793eb2491
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Stack.h
@@ -0,0 +1,58 @@
+#ifndef STACK_H
+#define STACK_H
+/* Stack - a template stack class
+**
+** Copyright 2001 pinc Software. All Rights Reserved.
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include <SupportDefs.h>
+
+
+// A simple stack that grows in steps of 16 elements. The elements are
+// moved around with realloc(), so T has to be a plain type (like a
+// pointer). The stack must not be copied, since both copies would free
+// the same array.
+
+template<class T> class Stack {
+	public:
+		Stack()
+			:
+			fArray(NULL),
+			fUsed(0),
+			fMax(0)
+		{
+		}
+		
+		~Stack()
+		{
+			if (fArray)
+				free(fArray);
+		}
+		
+		// Puts "value" on top of the stack; returns B_NO_MEMORY (and
+		// leaves the stack unchanged) if the array could not be enlarged
+		status_t Push(T value)
+		{
+			if (fUsed >= fMax) {
+				// Only commit the new capacity when the allocation has
+				// succeeded - otherwise the next Push() would assume
+				// free space that doesn't exist, and write beyond the array
+				int32 newMax = fMax + 16;
+				T *newArray = (T *)realloc(fArray,newMax * sizeof(T));
+				if (newArray == NULL)
+					return B_NO_MEMORY;
+
+				fArray = newArray;
+				fMax = newMax;
+			}
+			fArray[fUsed++] = value;
+			return B_OK;
+		}
+		
+		// Removes the top element and stores it in "value"; returns
+		// false if the stack is empty
+		bool Pop(T *value)
+		{
+			if (fUsed == 0)
+				return false;
+
+			*value = fArray[--fUsed];
+			return true;
+		}
+		
+	private:
+		T		*fArray;
+		int32	fUsed;
+		int32	fMax;
+};
+
+#endif	/* STACK_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/ToDo b/src/add-ons/kernel/file_systems/bfs/ToDo
new file mode 100644
index 0000000000..9badbc44cc
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/ToDo
@@ -0,0 +1,74 @@
+BFS - ToDo, June 5th, 2002
+-----
+
+BlockAllocator
+
+ - the BlockAllocator is only slightly optimized and probably slow
+ - the first free and the largest range are currently not correctly maintained (only efficiency suffers - it does work correctly)
+ - the allocation policies will have to stand against some real world tests
+ - the access to the block bitmap is currently managed using a global lock
+
+
+DataStream
+
+ - growing/shrinking the stream size is not implemented for the double indirect range
+ - only files are trimmed back (in bfs_close()), but every inode has a preallocated stream...
+ - merging of block_runs doesn't work between range/block boundaries
+
+
+Queries
+
+ - There shouldn't be any cases where you can speed up a query with reordering the query expression - test it
+ - Check permissions of the parent directories
+ - Add protection against crashing applications which had a query open - at least the original BeOS kernel does not free the cookie (which throws some memory away *and* prevents unmounting the disk)
+
+
+Journal
+
+ - Check if there are any standard and often-happening cases for a transaction to fail, and if so, start the transaction only when necessary
+ - if the system crashes between bfs_unlink() and bfs_remove_vnode(), the inode can be removed from the tree, but its memory is still allocated - this can happen if the inode is still in use by someone (and that's what the "chkbfs" utility is for, mainly).
+ - add delayed index updating (+ delete actions to solve the issue above)
+ - multiple log files, parallel transactions?
+ - variable sized log file
+ - as long as we have a fixed-sized log file, it should be possible to reserve space for a transaction to be able to decide if batching it is possible
+
+
+BPlusTree
+
+ - BPlusTree::Remove() could trigger CachedNode::Free() to go through the free nodes list and free all pages at the end of the data stream
+ - updating the TreeIterators doesn't work yet for duplicates (which may be a problem if a duplicate node will go away after a remove)
+ - BPlusTree::RemoveDuplicate() could spread the contents of duplicate node with only a few entries to save some space (right now, only empty nodes are freed)
+
+
+Inode
+
+ - sometimes the inode's last modified time seems to be wrong, and is therefore not found in the b+tree (assuming that the b+tree is working correctly, which I do)
+ - Inode::FillGapWithZeros() currently disabled; apart from being slow, it really shouldn't be executed while a transaction is running, because that stops all other threads from doing anything (which can be a long time for a 100 MB file)
+
+
+Indices
+
+
+
+Attributes
+
+ - bfs_write_attr() doesn't check if the attribute data may fit into the small_data region if there already is that attribute as an attribute file
+
+
+Volume
+
+
+kernel_interface
+
+ - missing functions, maybe they are not all needed (but most of them are): bfs_rename_attr(), bfs_rename_index(), bfs_initialize(), bfs_setflags(), bfs_link()
+ - bfs_rename() currently doesn't respect any permissions
+
+
+general stuff
+
+ - There are also some comments with a leading "ToDo:" directly in the code which may not be mentioned here.
+
+
+-----
+Axel Dörfler
+axeld@pinc-software.de
diff --git a/src/add-ons/kernel/file_systems/bfs/Utility.cpp b/src/add-ons/kernel/file_systems/bfs/Utility.cpp
new file mode 100644
index 0000000000..4e1d1b91e2
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Utility.cpp
@@ -0,0 +1,138 @@
+/* Utility - some helper classes
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include "Utility.h"
+#include "Debug.h"
+#include "cpp.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+
+/**	Binary search for "value". Returns true if the value has been
+ *	found, with "index" set to its position. If it has not been found,
+ *	"index" is the position that was probed last - it is not touched at
+ *	all if the array is empty.
+ */
+
+bool
+sorted_array::FindInternal(off_t value, int32 &index) const
+{
+	int32 low = 0, high = count-1;
+
+	while (low <= high) {
+		index = (low + high) / 2;
+
+		const off_t difference = values[index] - value;
+		if (difference == 0)
+			return true;
+
+		// continue in the half that can still contain the value
+		if (difference < 0)
+			low = index + 1;
+		else
+			high = index - 1;
+	}
+	return false;
+}
+
+
+/**	Inserts "value" at its sorted position. The caller has to make
+ *	sure that there is enough space for another entry in the array.
+ */
+
+void 
+sorted_array::Insert(off_t value)
+{
+	int32 position;
+
+	if (count <= 8) {
+		// for small arrays, just iterate linearly to find the
+		// insertion point
+		position = 0;
+		while (position < count && values[position] <= value)
+			position++;
+	} else if (!FindInternal(value,position)
+		&& values[position] <= value) {
+		// the binary search has stopped at a smaller value, so the new
+		// one has to be placed behind it
+		position++;
+	}
+
+	// make room for the new value
+	memmove(&values[position+1],&values[position],(count - position) * sizeof(off_t));
+	values[position] = value;
+	count++;
+}
+
+
+/**	Removes "value" from the array.
+ *	Returns false if the value could not be found, true otherwise.
+ */
+
+bool 
+sorted_array::Remove(off_t value)
+{
+	int32 index = Find(value);
+	if (index == -1)
+		return false;
+
+	// Close the gap: only the "count - index - 1" entries that follow
+	// the removed one have to be moved - moving one more would read
+	// beyond the last valid entry (and possibly beyond the array)
+	memmove(&values[index],&values[index + 1],(count - index - 1) * sizeof(off_t));
+	count--;
+
+	return true;
+}
+
+
+//	#pragma mark -
+
+
+/**	Creates an empty array that will grow in steps of "blockSize"
+ *	bytes. No memory is allocated before the first Insert().
+ */
+
+BlockArray::BlockArray(int32 blockSize)
+	:
+	// initializers are listed in the order the members are declared
+	fArray(NULL),
+	fBlockSize(blockSize),
+	fSize(0),
+	fMaxBlocks(0)
+{
+}
+
+
+/**	Frees the array, if it has ever been allocated. */
+
+BlockArray::~BlockArray()
+{
+	if (fArray)
+		free(fArray);
+}
+
+
+/**	Returns the index of "value" in the array, or -1 if the array does
+ *	not contain it (or has not been allocated yet).
+ */
+
+int32
+BlockArray::Find(off_t value)
+{
+	if (fArray != NULL)
+		return fArray->Find(value);
+
+	return -1;
+}
+
+
+/**	Inserts "value" into the sorted array, which is enlarged by
+ *	another block if necessary.
+ *	Returns B_NO_MEMORY if the array could not be enlarged.
+ */
+
+status_t
+BlockArray::Insert(off_t value)
+{
+	const bool needsMoreSpace = fArray == NULL || fArray->count + 1 > fMaxBlocks;
+
+	if (needsMoreSpace) {
+		sorted_array *enlarged = (sorted_array *)realloc(fArray,fSize + fBlockSize);
+		if (enlarged == NULL)
+			return B_NO_MEMORY;
+
+		// a freshly allocated array doesn't contain any values yet
+		if (fArray == NULL)
+			enlarged->count = 0;
+
+		fSize += fBlockSize;
+		// one off_t of the memory is taken by the "count" field
+		fMaxBlocks = fSize / sizeof(off_t) - 1;
+		fArray = enlarged;
+	}
+
+	fArray->Insert(value);
+	return B_OK;
+}
+
+
+/**	Removes "value" from the array.
+ *	Returns B_ENTRY_NOT_FOUND if the array does not contain the value.
+ */
+
+status_t
+BlockArray::Remove(off_t value)
+{
+	if (fArray != NULL && fArray->Remove(value))
+		return B_OK;
+
+	return B_ENTRY_NOT_FOUND;
+}
+
+
+/**	Removes all values from the array; the memory is kept for later
+ *	use. It is safe to call this on an array that has never been used.
+ */
+
+void 
+BlockArray::MakeEmpty()
+{
+	// the array is only allocated by the first Insert()
+	if (fArray == NULL)
+		return;
+
+	fArray->count = 0;
+}
+
diff --git a/src/add-ons/kernel/file_systems/bfs/Utility.h b/src/add-ons/kernel/file_systems/bfs/Utility.h
new file mode 100644
index 0000000000..f095545a16
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Utility.h
@@ -0,0 +1,110 @@
+#ifndef UTILITY_H
+#define UTILITY_H
+/* Utility - some helper classes
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include <SupportDefs.h>
+
+
+// Simple array, used for the duplicate handling in the B+Tree,
+// and for the log entries.
+// The structure is only a header: the values directly follow the
+// "count" field in memory, and whoever provides that memory is
+// responsible for its size - Insert() does not check for free space.
+
+struct sorted_array {
+	public:
+		off_t	count;
+			// number of valid entries in "values"
+		off_t	values[0];
+			// the entries, kept in ascending order
+
+		// returns the index of "value", or -1 if it could not be found
+		inline int32 Find(off_t value) const;
+		void Insert(off_t value);
+		// returns false if "value" is not part of the array
+		bool Remove(off_t value);
+
+	private:
+		// binary search, "index" is set to the last position probed
+		bool FindInternal(off_t value,int32 &index) const;
+};
+
+
+// Returns the index of "value" in the array, or -1 if the array
+// does not contain it.
+
+inline int32
+sorted_array::Find(off_t value) const
+{
+	int32 position;
+	if (!FindInternal(value,position))
+		return -1;
+
+	return position;
+}
+
+
+// The BlockArray maintains a sorted_array in memory that is
+// always a multiple of "blockSize" bytes large, and grows
+// automatically when new entries are inserted.
+// This is used for the in-memory log entries before they
+// are written to disk.
+
+class BlockArray {
+	public:
+		BlockArray(int32 blockSize);
+		~BlockArray();
+
+		// returns the index of "value", or -1 if it could not be found
+		int32 Find(off_t value);
+		status_t Insert(off_t value);
+		status_t Remove(off_t value);
+
+		void MakeEmpty();
+
+		// number of values in the array
+		int32 CountItems() const { return fArray != NULL ? fArray->count : 0; }
+		// number of blocks needed for the values plus the "count" field
+		int32 BlocksUsed() const { return fArray != NULL ? ((fArray->count + 1) * sizeof(off_t) + fBlockSize - 1) / fBlockSize : 0; }
+		// the array itself, NULL as long as nothing has been inserted
+		sorted_array *Array() const { return fArray; }
+		// size of the allocated memory in bytes
+		int32 Size() const { return fSize; }
+
+	private:
+		sorted_array *fArray;
+		int32	fBlockSize;
+		int32	fSize;
+		int32	fMaxBlocks;
+			// despite its name, this is the number of values that fit
+			// into the currently allocated memory (see Insert())
+};
+
+
+// Doubly linked list
+//
+// "node" holds the links of a list entry, to be used together with
+// the "list" template below.
+// NOTE(review): presumably "Node" is expected to derive from
+// node<Node>, with these links at the very beginning of the object, as
+// the list casts the addresses of its own fields to Node pointers -
+// confirm against the users of these templates.
+
+template<class Node> struct node {
+	Node *next,*prev;
+
+	// Unlinks the node from its list; this relies on both neighbours
+	// being valid, which the sentinels of "list" seem to guarantee
+	void
+	Remove()
+	{
+		prev->next = next;
+		next->prev = prev;
+	}
+
+	// Returns the following node, or NULL if there is none or if the
+	// follower is itself the last one in the chain (its "next" is NULL)
+	Node *
+	Next()
+	{
+		if (next && next->next != NULL)
+			return next;
+
+		return NULL;
+	}
+};
+
+// The list header: its three pointers are treated as two overlapping
+// sentinel nodes, one at the address of "head" and one at the address
+// of "tail".
+// NOTE(review): this only works if "next" and "prev" are the first two
+// members of Node, in this order - confirm.
+
+template<class Node> struct list {
+	Node *head,*tail,*last;
+
+	// creates an empty list: "head" points to the tail sentinel, and
+	// "last" to the head sentinel
+	list()
+	{
+		head = (Node *)&tail;
+		tail = NULL;
+		last = (Node *)&head;
+	}
+
+	// appends "entry" to the end of the list
+	void
+	Add(Node *entry)
+	{
+		entry->next = (Node *)&tail;
+		entry->prev = last;
+		last->next = entry;
+		last = entry;
+	}
+};
+
+
+#endif	/* UTILITY_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/Volume.cpp b/src/add-ons/kernel/file_systems/bfs/Volume.cpp
new file mode 100644
index 0000000000..f7a3a1aa2f
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Volume.cpp
@@ -0,0 +1,304 @@
+/* Volume - BFS super block, mounting, etc.
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include "Debug.h"
+#include "cpp.h"
+#include "Volume.h"
+#include "Journal.h"
+#include "Inode.h"
+#include "Query.h"
+
+#include <KernelExport.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+
+
+// Creates an unmounted volume object for the given namespace ID.
+// All pointers and the device handle are put into a defined state, so that
+// a Mount() that fails early (or an Unmount()/Sync() without a successful
+// Mount()) never sees uninitialized members.
+Volume::Volume(nspace_id id)
+	:
+	fID(id),
+	fDevice(-1),
+	fBlockAllocator(this),
+	fLock("bfs volume"),
+	fJournal(NULL),
+	fLogStart(0),
+	fLogEnd(0),
+	fRootNode(NULL),
+	fIndicesNode(NULL),
+	fDirtyCachedBlocks(0),
+	fUniqueID(0),
+	fFlags(0)
+{
+}
+
+
+// Nothing is released here; the journal, the indices node and the device
+// are torn down in Unmount().
+Volume::~Volume()
+{
+}
+
+
+// Checks the super block in fSuperBlock for plausibility.
+// Returns true only if every single check passes; the checks are done in a
+// fixed order and the first failing one ends the validation.
+bool
+Volume::IsValidSuperBlock()
+{
+	// all three magic numbers have to be in place
+	if (fSuperBlock.magic1 != (int32)SUPER_BLOCK_MAGIC1)
+		return false;
+	if (fSuperBlock.magic2 != (int32)SUPER_BLOCK_MAGIC2)
+		return false;
+	if (fSuperBlock.magic3 != (int32)SUPER_BLOCK_MAGIC3)
+		return false;
+
+	// inode size, byte order, and block shift have to fit to the block size
+	if ((int32)fSuperBlock.block_size != fSuperBlock.inode_size)
+		return false;
+	if (fSuperBlock.fs_byte_order != SUPER_BLOCK_FS_LENDIAN)
+		return false;
+	if ((1UL << fSuperBlock.block_shift) != fSuperBlock.block_size)
+		return false;
+
+	// minimal requirements for the allocation groups and the disk size
+	if (fSuperBlock.num_ags < 1
+		|| fSuperBlock.ag_shift < 1
+		|| fSuperBlock.blocks_per_ag < 1
+		|| fSuperBlock.num_blocks < 10)
+		return false;
+
+	// the allocation groups have to cover exactly all blocks of the disk
+	return fSuperBlock.num_ags == divide_roundup(fSuperBlock.num_blocks,1L << fSuperBlock.ag_shift);
+}
+
+
+// Called when the volume has run into a serious problem: logs a message and
+// switches the volume to read-only mode by setting VOLUME_READ_ONLY.
+// When built with USER defined, the debugger is entered as well.
+void 
+Volume::Panic()
+{
+	FATAL(("we have to panic... switch to read-only mode!\n"));
+	fFlags |= VOLUME_READ_ONLY;
+#ifdef USER
+	debugger("BFS panics!");
+#endif
+}
+
+
+// Opens the device "deviceName", reads and validates the super block, and
+// sets up cache, journal, block allocator, root node and indices node.
+//
+// If B_MOUNT_READ_ONLY is set in "flags", or if the device can only be
+// opened read-only, the volume is marked VOLUME_READ_ONLY.
+//
+// Returns B_OK on success. On failure, the device is closed again and an
+// error code is returned:
+//	- the (negative) result of open() if the device could not be opened
+//	- B_ERROR if fstat() failed
+//	- B_IO_ERROR if the super block could not be read or the cache could
+//	  not be initialized
+//	- B_BAD_VALUE if the super block or the root node is invalid
+//	- B_NO_MEMORY if the journal/allocator or new_vnode() failed
+status_t
+Volume::Mount(const char *deviceName,uint32 flags)
+{
+	if (flags & B_MOUNT_READ_ONLY)
+		fFlags |= VOLUME_READ_ONLY;
+
+	fDevice = open(deviceName,flags & B_MOUNT_READ_ONLY ? O_RDONLY : O_RDWR);
+	
+	// if we couldn't open the device, try read-only (don't rely on a specific error code)
+	if (fDevice < B_OK && (flags & B_MOUNT_READ_ONLY) == 0) {
+		fDevice = open(deviceName,O_RDONLY);
+		fFlags |= VOLUME_READ_ONLY;
+	}
+
+	if (fDevice < B_OK)
+		RETURN_ERROR(fDevice);
+
+	// check if it's a regular file, and if so, disable the cache for the
+	// underlying file system
+	struct stat stat;
+	if (fstat(fDevice,&stat) < 0) {
+		// don't leak the descriptor we have just opened
+		close(fDevice);
+		fDevice = -1;
+		RETURN_ERROR(B_ERROR);
+	}
+
+//#ifndef USER
+	if (stat.st_mode & S_FILE && ioctl(fDevice,IOCTL_FILE_UNCACHED_IO,NULL) < 0) {
+		// mount read-only if the cache couldn't be disabled
+#	ifdef DEBUG
+		FATAL(("couldn't disable cache for image file - system may dead-lock!\n"));
+#	else
+		FATAL(("couldn't disable cache for image file!\n"));
+		Panic();
+#	endif
+	}
+//#endif
+
+	// read the super block
+	char buffer[1024];
+	if (read_pos(fDevice,0,buffer,sizeof(buffer)) != sizeof(buffer)) {
+		// don't leak the descriptor we have just opened
+		close(fDevice);
+		fDevice = -1;
+		return B_IO_ERROR;
+	}
+
+	status_t status = B_OK;
+
+	// Note: this only works for x86; on PowerPC, the super block
+	// is located at offset 0!
+	memcpy(&fSuperBlock,buffer + 512,sizeof(disk_super_block));
+
+	if (IsValidSuperBlock()) {
+		// set the current log pointers, so that journaling will work correctly
+		fLogStart = fSuperBlock.log_start;
+		fLogEnd = fSuperBlock.log_end;
+
+		if (init_cache_for_device(fDevice, NumBlocks()) == B_OK) {
+			fJournal = new Journal(this);
+			// replaying the log is the first thing we will do on this disk
+			if (fJournal && fJournal->InitCheck() == B_OK
+				&& fBlockAllocator.Initialize() == B_OK) {
+				fRootNode = new Inode(this,ToVnode(Root()));
+
+				if (fRootNode && fRootNode->InitCheck() == B_OK) {
+					if (new_vnode(fID,ToVnode(Root()),(void *)fRootNode) == B_OK) {
+						// try to get indices root dir
+
+						// question: why doesn't get_vnode() work here??
+						// answer: we have not yet backpropagated the pointer to the
+						// volume in bfs_mount(), so bfs_read_vnode() can't get it.
+						// But it's not needed to do that anyway.
+
+						fIndicesNode = new Inode(this,ToVnode(Indices()));
+						if (fIndicesNode == NULL
+							|| fIndicesNode->InitCheck() < B_OK
+							|| !fIndicesNode->IsDirectory()) {
+							INFORM(("bfs: volume doesn't have indices!\n"));
+
+							if (fIndicesNode) {
+								// if this is the case, the index root node is gone bad, and
+								// BFS switch to read-only mode
+								fFlags |= VOLUME_READ_ONLY;
+
+								// the node was created by us and has not been
+								// handed to anyone else, so it must be freed
+								delete fIndicesNode;
+								fIndicesNode = NULL;
+							}
+						}
+
+						// all went fine
+						return B_OK;
+					} else
+						status = B_NO_MEMORY;
+				} else
+					status = B_BAD_VALUE;
+
+				FATAL(("could not create root node: new_vnode() failed!\n"));
+			} else {
+				// ToDo: improve error reporting for a bad journal
+				status = B_NO_MEMORY;
+				FATAL(("could not initialize journal/block bitmap allocator!\n"));
+			}
+
+			// NOTE(review): fJournal and fRootNode are not freed on these
+			// failure paths - check whether their destructors may safely run
+			// at this point before adding the deletes.
+			remove_cached_device_blocks(fDevice,NO_WRITES);
+		} else {
+			FATAL(("could not initialize cache!\n"));
+			status = B_IO_ERROR;
+		}
+	} else {
+		// only report an invalid super block if it actually is invalid
+		FATAL(("invalid super block!\n"));
+		status = B_BAD_VALUE;
+	}
+
+	close(fDevice);
+	fDevice = -1;
+
+	return status;
+}
+
+
+// Tears down the volume: deletes the journal and the indices node, removes
+// the blocks cached for the device, and closes it. Always returns B_OK.
+status_t
+Volume::Unmount()
+{
+	// This will also flush the log & all blocks to disk
+	delete fJournal;
+	fJournal = NULL;
+
+	// NOTE(review): fRootNode is not deleted here - presumably the vnode
+	// layer owns it since Mount() passed it to new_vnode(); confirm
+	delete fIndicesNode;
+
+	remove_cached_device_blocks(fDevice,ALLOW_WRITES);
+	close(fDevice);
+
+	return B_OK;
+}
+
+
+// Lets the journal flush the log and all blocks; returns the journal's
+// result.
+status_t 
+Volume::Sync()
+{
+	status_t status = fJournal->FlushLogAndBlocks();
+	return status;
+}
+
+
+// Checks whether "run" can be a valid block run on this volume:
+//	- its allocation group must be one of the groups 0 ... AllocationGroups() - 1
+//	- it must not be empty
+//	- it must lie completely within its allocation group
+// Returns B_OK if the run passes, otherwise the volume panics (and therefore
+// switches to read-only mode), and B_BAD_DATA is returned.
+status_t
+Volume::IsValidBlockRun(block_run run)
+{
+	// Allocation groups are counted from zero (see ToBlock()), so a group
+	// index equal to the number of groups is already beyond the disk.
+	if (run.allocation_group < 0 || (uint32)run.allocation_group >= AllocationGroups()
+		|| run.start > (1LL << AllocationGroupShift())
+		|| run.length == 0
+		|| (uint32)run.length + run.start > (1LL << AllocationGroupShift())) {
+		Panic();
+		FATAL(("*** invalid run(%ld,%d,%d)\n",run.allocation_group,run.start,run.length));
+		return B_BAD_DATA;
+	}
+	return B_OK;
+}
+
+
+// Converts an absolute block number into a block_run of length 1.
+// This is the inverse of ToBlock(): the upper bits of the block number
+// select the allocation group, the lower "ag_shift" bits are the start
+// offset within that group.
+block_run 
+Volume::ToBlockRun(off_t block) const
+{
+	block_run run;
+	run.allocation_group = block >> fSuperBlock.ag_shift;
+	// keep only the bits below the allocation group shift (the mask must
+	// not be inverted, or the group bits would end up in "start")
+	run.start = block & ((1LL << fSuperBlock.ag_shift) - 1);
+	run.length = 1;
+	return run;
+}
+
+
+// Creates the root directory of the indices, stores its block run in the
+// super block, writes the super block, and puts the new inode into
+// fIndicesNode.
+// Returns the error of Inode::Create() if that fails, otherwise the result
+// of Vnode::Get().
+status_t
+Volume::CreateIndicesRoot(Transaction *transaction)
+{
+	off_t id;
+	status_t status = Inode::Create(transaction,NULL,NULL,
+							S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700,0,0,&id);
+	if (status < B_OK)
+		RETURN_ERROR(status);
+
+	fSuperBlock.indices = ToBlockRun(id);
+	// NOTE(review): the result of WriteSuperBlock() is ignored here
+	WriteSuperBlock();
+
+	// The Vnode destructor will unlock the inode, but it has already been
+	// locked by the Inode::Create() call.
+	Vnode vnode(this,id);
+	return vnode.Get(&fIndicesNode);
+}
+
+
+// Allocates a block run for a new inode of the given type; the block run
+// of "parent" is passed on to the block allocator, so "parent" must not
+// be NULL.
+status_t 
+Volume::AllocateForInode(Transaction *transaction, const Inode *parent, mode_t type, block_run &run)
+{
+	return fBlockAllocator.AllocateForInode(transaction,&parent->BlockRun(),type,run);
+}
+
+
+// Writes the in-memory super block to offset 512 of the device.
+// Returns B_OK if the whole structure could be written, B_IO_ERROR if not.
+status_t 
+Volume::WriteSuperBlock()
+{
+	const bool complete = write_pos(fDevice,512,&fSuperBlock,sizeof(disk_super_block)) == sizeof(disk_super_block);
+
+	return complete ? B_OK : B_IO_ERROR;
+}
+
+
+// Passes a changed attribute value on to every query registered with this
+// volume. Nothing is done if the query lock cannot be acquired.
+void
+Volume::UpdateLiveQueries(Inode *inode,const char *attribute,int32 type,const uint8 *oldKey,size_t oldLength,const uint8 *newKey,size_t newLength)
+{
+	if (fQueryLock.Lock() < B_OK)
+		return;
+
+	// the iteration starts with NULL, and ends when Next() returns NULL
+	Query *query = NULL;
+	for (query = fQueries.Next(query); query != NULL; query = fQueries.Next(query)) {
+		query->LiveUpdate(inode,attribute,type,oldKey,oldLength,newKey,newLength);
+	}
+
+	fQueryLock.Unlock();
+}
+
+
+// Adds "query" to the volume's list of queries while holding the query
+// lock; nothing is done if the lock cannot be acquired.
+void 
+Volume::AddQuery(Query *query)
+{
+	if (fQueryLock.Lock() >= B_OK) {
+		fQueries.Add(query);
+		fQueryLock.Unlock();
+	}
+}
+
+
+// Removes "query" from the volume's list of queries while holding the
+// query lock; nothing is done if the lock cannot be acquired.
+void 
+Volume::RemoveQuery(Query *query)
+{
+	if (fQueryLock.Lock() >= B_OK) {
+		fQueries.Remove(query);
+		fQueryLock.Unlock();
+	}
+}
+
diff --git a/src/add-ons/kernel/file_systems/bfs/Volume.h b/src/add-ons/kernel/file_systems/bfs/Volume.h
new file mode 100644
index 0000000000..1c6a143e10
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/Volume.h
@@ -0,0 +1,176 @@
+#ifndef VOLUME_H
+#define VOLUME_H
+/* Volume - BFS super block, mounting, etc.
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include <KernelExport.h>
+
+extern "C" {
+	#ifndef _IMPEXP_KERNEL
+	#	define _IMPEXP_KERNEL
+	#endif
+	#include "fsproto.h"
+	#include "lock.h"
+	#include "cache.h"
+}
+
+#include "bfs.h"
+#include "BlockAllocator.h"
+#include "Chain.h"
+
+class Journal;
+class Inode;
+class Query;
+
+// Bits stored in Volume::fFlags
+enum volume_flags {
+	// the volume must not be written to; see Volume::IsReadOnly()
+	VOLUME_READ_ONLY	= 0x0001
+};
+
+
+// Represents one BFS volume: it holds the device handle, the super block,
+// the block allocator and the journal, and offers the conversions between
+// block runs, block numbers, byte offsets, and vnode IDs.
+class Volume {
+	public:
+		Volume(nspace_id id);
+		~Volume();
+
+		status_t			Mount(const char *device,uint32 flags);
+		status_t			Unmount();
+
+		bool				IsValidSuperBlock();
+		bool				IsReadOnly() const { return fFlags & VOLUME_READ_ONLY; }
+		void				Panic();
+		Benaphore			&Lock() { return fLock; }
+
+		// accessors for values taken from the super block
+		block_run			Root() const { return fSuperBlock.root_dir; }
+		Inode				*RootNode() const { return fRootNode; }
+		block_run			Indices() const { return fSuperBlock.indices; }
+		Inode				*IndicesNode() const { return fIndicesNode; }
+		block_run			Log() const { return fSuperBlock.log_blocks; }
+		vint32				&LogStart() { return fLogStart; }
+		vint32				&LogEnd() { return fLogEnd; }
+		int					Device() const { return fDevice; }
+
+		nspace_id			ID() const { return fID; }
+		const char			*Name() const { return fSuperBlock.name; }
+
+		off_t				NumBlocks() const { return fSuperBlock.num_blocks; }
+		off_t				UsedBlocks() const { return fSuperBlock.used_blocks; }
+		off_t				FreeBlocks() const { return fSuperBlock.num_blocks - fSuperBlock.used_blocks; }
+
+		uint32				BlockSize() const { return fSuperBlock.block_size; }
+		uint32				BlockShift() const { return fSuperBlock.block_shift; }
+		uint32				InodeSize() const { return fSuperBlock.inode_size; }
+		uint32				AllocationGroups() const { return fSuperBlock.num_ags; }
+		uint32				AllocationGroupShift() const { return fSuperBlock.ag_shift; }
+		disk_super_block	&SuperBlock() { return fSuperBlock; }
+
+		// conversions: a block number is the allocation group shifted by
+		// ag_shift, combined with the start offset of the run; the byte
+		// offset is that block number shifted by block_shift
+		off_t				ToOffset(block_run run) const { return ToBlock(run) << fSuperBlock.block_shift; }
+		off_t				ToBlock(block_run run) const { return ((((off_t)run.allocation_group) << fSuperBlock.ag_shift) | (off_t)run.start); }
+		block_run			ToBlockRun(off_t block) const;
+		status_t			IsValidBlockRun(block_run run);
+		
+		// a vnode ID is simply the block number of the inode
+		off_t				ToVnode(block_run run) const { return ToBlock(run); }
+		off_t				ToVnode(off_t block) const { return block; }
+		off_t				VnodeToBlock(vnode_id id) const { return (off_t)id; }
+
+		status_t			CreateIndicesRoot(Transaction *transaction);
+
+		// the following calls are forwarded to the block allocator
+		status_t			AllocateForInode(Transaction *transaction,const Inode *parent,mode_t type,block_run &run);
+		status_t			AllocateForInode(Transaction *transaction,const block_run *parent,mode_t type,block_run &run);
+		status_t			Allocate(Transaction *transaction,const Inode *inode,off_t numBlocks,block_run &run,uint16 minimum = 1);
+		status_t			Free(Transaction *transaction,block_run &run);
+
+#ifdef DEBUG
+		BlockAllocator		&Allocator() { return fBlockAllocator; }
+#endif
+
+		status_t			Sync();
+		// there is only one journal; the block argument is not used
+		Journal				*GetJournal(off_t /*refBlock*/) const { return fJournal; }
+
+		status_t			WriteSuperBlock();
+		status_t			WriteBlocks(off_t blockNumber,const uint8 *block,uint32 numBlocks);
+		void				WriteCachedBlocksIfNecessary();
+		status_t			FlushDevice();
+
+		void				UpdateLiveQueries(Inode *inode,const char *attribute,int32 type,const uint8 *oldKey,size_t oldLength,const uint8 *newKey,size_t newLength);
+		void				AddQuery(Query *query);
+		void				RemoveQuery(Query *query);
+
+		// returns the current ID, and increments it atomically
+		uint32				GetUniqueID() { return atomic_add(&fUniqueID,1); }
+
+
+	protected:
+		nspace_id			fID;
+		int					fDevice;
+		disk_super_block	fSuperBlock;
+		BlockAllocator		fBlockAllocator;
+		Benaphore			fLock;
+		Journal				*fJournal;
+		vint32				fLogStart,fLogEnd;
+
+		Inode				*fRootNode;
+		Inode				*fIndicesNode;
+
+		// counts the blocks passed to WriteBlocks(); see
+		// WriteCachedBlocksIfNecessary() and FlushDevice()
+		vint32				fDirtyCachedBlocks;
+
+		// fQueryLock is held whenever fQueries is accessed
+		SimpleLock			fQueryLock;
+		Chain<Query>		fQueries;
+
+		int32				fUniqueID;
+		uint32				fFlags;
+};
+
+// inline functions
+
+// Forwards the inode allocation request to the block allocator.
+inline status_t 
+Volume::AllocateForInode(Transaction *transaction, const block_run *parent, mode_t type, block_run &run)
+{
+	status_t status = fBlockAllocator.AllocateForInode(transaction,parent,type,run);
+	return status;
+}
+
+
+// Forwards the block allocation request to the block allocator.
+inline status_t 
+Volume::Allocate(Transaction *transaction, const Inode *inode, off_t numBlocks, block_run &run, uint16 minimum)
+{
+	status_t status = fBlockAllocator.Allocate(transaction,inode,numBlocks,run,minimum);
+	return status;
+}
+
+
+// Forwards the request to free "run" to the block allocator.
+inline status_t 
+Volume::Free(Transaction *transaction, block_run &run)
+{
+	status_t status = fBlockAllocator.Free(transaction,run);
+	return status;
+}
+
+
+// Writes "numBlocks" blocks starting at "blockNumber" through the block
+// cache, and adds them to the dirty block counter.
+inline status_t 
+Volume::WriteBlocks(off_t blockNumber, const uint8 *block, uint32 numBlocks)
+{
+	// the counter is updated before the blocks are handed to the cache
+	atomic_add(&fDirtyCachedBlocks,numBlocks);
+
+	const uint32 blockSize = fSuperBlock.block_size;
+	return cached_write(fDevice,blockNumber,block,numBlocks,blockSize);
+}
+
+
+// Forces a cache flush once more than 128 blocks have been counted as
+// dirty, and reduces the counter by 64 afterwards.
+inline void 
+Volume::WriteCachedBlocksIfNecessary()
+{
+	// the specific values are only valid for the current BeOS cache
+	if (fDirtyCachedBlocks <= 128)
+		return;
+
+	force_cache_flush(fDevice,false);
+	atomic_add(&fDirtyCachedBlocks,-64);
+}
+
+
+// Resets the dirty block counter, and flushes the device; the result of
+// flush_device() is returned.
+inline status_t 
+Volume::FlushDevice()
+{
+	fDirtyCachedBlocks = 0;
+
+	status_t status = flush_device(fDevice,0);
+	return status;
+}
+
+
+#endif	/* VOLUME_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/bfs.h b/src/add-ons/kernel/file_systems/bfs/bfs.h
new file mode 100644
index 0000000000..ba6488e513
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/bfs.h
@@ -0,0 +1,298 @@
+#ifndef BFS_H
+#define BFS_H
+/* bfs - BFS definitions and helper functions
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** Parts of this code are based on work previously done by Marcus Overhagen
+**
+** Copyright 2001 pinc Software. All Rights Reserved.
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include <SupportDefs.h>
+
+#ifndef B_BAD_DATA
+#	define B_BAD_DATA B_ERROR
+#endif
+
+
+// Addresses a run of blocks: "length" blocks beginning at block "start"
+// within the allocation group "allocation_group".
+// Volume::ToBlock() turns it into an absolute block number by shifting the
+// allocation group by the super block's ag_shift, and combining it with
+// "start".
+struct block_run
+{
+	int32		allocation_group;
+	uint16		start;
+	uint16		length;
+	
+	inline bool operator==(const block_run &run) const;
+	inline bool operator!=(const block_run &run) const;
+	// true if all three fields are zero
+	inline bool IsZero();
+	inline void SetTo(int32 group,uint16 start,uint16 length = 1);
+
+	// creates a block_run from the given values
+	inline static block_run Run(int32 group,uint16 start,uint16 length = 1);
+};
+
+typedef block_run inode_addr;
+
+//**************************************
+
+
+#define BFS_DISK_NAME_LENGTH	32
+
+// The BFS super block in the form it is copied from and to the device
+// (Volume reads and writes it at byte offset 512). The order and size of
+// the fields therefore must not be changed.
+struct disk_super_block
+{
+	char		name[BFS_DISK_NAME_LENGTH];
+	int32		magic1;				// SUPER_BLOCK_MAGIC1
+	int32		fs_byte_order;		// SUPER_BLOCK_FS_LENDIAN
+	uint32		block_size;			// must equal 1 << block_shift
+	uint32		block_shift;
+	off_t		num_blocks;
+	off_t		used_blocks;
+	int32		inode_size;			// must equal block_size
+	int32		magic2;				// SUPER_BLOCK_MAGIC2
+	int32		blocks_per_ag;
+	int32		ag_shift;			// block number >> ag_shift == allocation group
+	int32		num_ags;
+	int32		flags;
+	block_run	log_blocks;
+	off_t		log_start;
+	off_t		log_end;
+	int32		magic3;				// SUPER_BLOCK_MAGIC3
+	inode_addr	root_dir;
+	inode_addr	indices;
+	int32		pad[8];
+};
+
+#define SUPER_BLOCK_FS_LENDIAN		'BIGE'		/* BIGE */
+
+#define SUPER_BLOCK_MAGIC1			'BFS1'		/* BFS1 */
+#define SUPER_BLOCK_MAGIC2			0xdd121031
+#define SUPER_BLOCK_MAGIC3			0x15b6830e
+
+#define SUPER_BLOCK_DISK_CLEAN		'CLEN'		/* CLEN */
+#define SUPER_BLOCK_DISK_DIRTY		'DIRT'		/* DIRT */
+
+//**************************************
+
+#define NUM_DIRECT_BLOCKS			12
+
+// Describes where the data of an inode is stored: NUM_DIRECT_BLOCKS direct
+// block runs, one indirect, and one double indirect block run, each with a
+// "max_..._range" value, plus the size of the stream.
+// It is embedded in bfs_inode, so its layout must not be changed.
+struct data_stream
+{
+	block_run	direct[NUM_DIRECT_BLOCKS];
+	off_t		max_direct_range;
+	block_run	indirect;
+	off_t		max_indirect_range;
+	block_run	double_indirect;
+	off_t		max_double_indirect_range;
+	off_t		size;
+};
+
+//**************************************
+
+struct bfs_inode;
+
+// Header of one entry in the small data section of an inode (see
+// bfs_inode::small_data_start). The name directly follows the header, and
+// the data follows the name; Data() and Size() define the exact offsets.
+struct small_data
+{
+	uint32		type;
+	uint16		name_size;
+	uint16		data_size;
+	char		name[0];	// name_size long, followed by data
+	
+	inline char		*Name();
+	inline uint8	*Data();
+	// size of the whole entry, i.e. the distance to the next one
+	inline uint32	Size();
+	inline small_data *Next();
+	// true if this entry lies beyond the inode's small data section, or
+	// has an empty name
+	inline bool		IsLast(bfs_inode *inode);
+};
+
+// the file name is part of the small_data structure
+#define FILE_NAME_TYPE			'CSTR'
+#define FILE_NAME_NAME			0x13 
+#define FILE_NAME_NAME_LENGTH	1 
+
+//**************************************
+
+#define SHORT_SYMLINK_NAME_LENGTH	144 // length incl. terminating '\0'
+
+// Layout of a BFS inode. The fixed part is followed by the small data
+// section ("small_data_start"), which extends up to "inode_size" bytes
+// from the beginning of the inode (see small_data::IsLast()).
+struct bfs_inode
+{
+	int32		magic1;				// presumably INODE_MAGIC1 - TODO confirm
+	inode_addr	inode_num;
+	int32		uid;
+	int32		gid;
+	int32		mode;				// see sys/stat.h
+	int32		flags;				// presumably inode_flags - TODO confirm
+	bigtime_t	create_time;
+	bigtime_t	last_modified_time;
+	inode_addr	parent;
+	inode_addr	attributes;
+	uint32		type;				// attribute type
+	
+	int32		inode_size;
+	uint32		etc;				// for in-memory structures (unused in OpenBeOS' fs)
+
+	// either the data stream, or the target of a short symlink
+	union {
+		data_stream		data;
+		char 			short_symlink[SHORT_SYMLINK_NAME_LENGTH];
+	};
+	int32		pad[4];
+	small_data	small_data_start[0];
+};	
+
+#define INODE_MAGIC1			0x3bbe0ad9
+#define INODE_TIME_SHIFT		16
+#define INODE_TIME_MASK			0xffff
+#define INODE_FILE_NAME_LENGTH	256
+
+// Flag bits of an inode.
+// NOTE(review): INODE_PERMANENT_FLAGS looks like the mask of the bits that
+// are kept on disk, with the bits above it being used in memory only -
+// confirm against the Inode code.
+enum inode_flags
+{
+	INODE_IN_USE			= 0x00000001,	// always set
+	INODE_ATTR_INODE		= 0x00000004,
+	INODE_LOGGED			= 0x00000008,	// log changes to the data stream
+	INODE_DELETED			= 0x00000010,
+	INODE_EMPTY				= 0x00000020,
+	INODE_LONG_SYMLINK		= 0x00000040,	// symlink in data stream
+
+	INODE_PERMANENT_FLAGS	= 0x0000ffff,
+
+	INODE_NO_CACHE			= 0x00010000,
+	INODE_WAS_WRITTEN		= 0x00020000,
+	INODE_NO_TRANSACTION	= 0x00040000,
+};
+
+//**************************************
+
+// Per-open state of a file.
+// NOTE(review): last_notification/last_size presumably drive the size
+// change notifications (see INODE_NOTIFICATION_INTERVAL) - confirm.
+struct file_cookie {
+	bigtime_t last_notification;
+	off_t	last_size;
+	int		open_mode;
+};
+
+// notify every second if the file size has changed
+#define INODE_NOTIFICATION_INTERVAL	1000000LL
+
+//**************************************
+
+
+// Divides "num" by "divisor", and rounds the result up (for the positive
+// values this is used with); 32 bit version.
+inline int32
+divide_roundup(int32 num,int32 divisor)
+{
+	const int32 biased = num + divisor - 1;
+	return biased / divisor;
+}
+
+// Same as above, for a 64 bit "num".
+inline int64
+divide_roundup(int64 num,int32 divisor)
+{
+	const int64 biased = num + divisor - 1;
+	return biased / divisor;
+}
+
+// Returns how often "i" can be shifted right by one until it is not larger
+// than 1 anymore - for a power of two, this is its exponent.
+inline int
+get_shift(uint64 i)
+{
+	int shift = 0;
+	for (; i > 1; i >>= 1)
+		shift++;
+
+	return shift;
+}
+
+// Rounds "data" up to the next multiple of sizeof(off_t).
+inline int32
+round_up(uint32 data)
+{
+	// sizeof(off_t) is a power of two, so this masks its lower bits
+	const size_t mask = sizeof(off_t) - 1;
+	return (data + mask) & ~mask;
+}
+
+
+/************************ block_run inline functions ************************/
+//	#pragma mark -
+
+
+// Two runs are equal if group, start, and length are all the same.
+inline bool
+block_run::operator==(const block_run &run) const
+{
+	if (allocation_group != run.allocation_group)
+		return false;
+	if (start != run.start)
+		return false;
+
+	return length == run.length;
+}
+
+
+// Two runs differ if at least one of group, start, or length differs.
+inline bool
+block_run::operator!=(const block_run &run) const
+{
+	if (allocation_group != run.allocation_group)
+		return true;
+	if (start != run.start)
+		return true;
+
+	return length != run.length;
+}
+
+
+// Returns true if group, start, and length are all zero.
+inline bool
+block_run::IsZero()
+{
+	if (allocation_group != 0 || start != 0)
+		return false;
+
+	return length == 0;
+}
+
+
+// Overwrites all three fields of the run with the given values.
+inline void
+block_run::SetTo(int32 _group,uint16 _start,uint16 _length)
+{
+	length = _length;
+	start = _start;
+	allocation_group = _group;
+}
+
+
+// Convenience function that builds a block_run from its three values.
+inline block_run
+block_run::Run(int32 group, uint16 start, uint16 length)
+{
+	block_run result;
+	result.SetTo(group,start,length);
+	return result;
+}
+
+
+/************************ small_data inline functions ************************/
+//	#pragma mark -
+
+
+// Returns the name stored directly behind the header.
+inline char *
+small_data::Name()
+{
+	char *entryName = name;
+	return entryName;
+}
+
+
+// Returns the data of the entry; it starts name_size + 3 bytes behind the
+// beginning of the name.
+inline uint8 *
+small_data::Data()
+{
+	uint8 *nameStart = (uint8 *)name;
+	return nameStart + name_size + 3;
+}
+
+
+// Returns the size of the whole entry: the header, the name plus 3 bytes,
+// and the data plus 1 byte.
+inline uint32 
+small_data::Size()
+{
+	uint32 size = sizeof(small_data);
+	size += name_size + 3;
+	size += data_size + 1;
+
+	return size;
+}
+
+
+// Returns the entry that directly follows this one, i.e. the address
+// Size() bytes behind this entry.
+inline small_data *
+small_data::Next()
+{
+	uint8 *base = (uint8 *)this;
+	return (small_data *)(base + Size());
+}
+
+
+// Returns true if this entry marks the end of the small data section of
+// "inode": either because it would not fit completely into the inode
+// anymore (inode_size bytes from the start of the inode), or because its
+// name is empty.
+inline bool
+small_data::IsLast(bfs_inode *inode)
+{
+	// we need to check the location first, because if name_size is already beyond
+	// the block, we would touch invalid memory (although that can't cause wrong
+	// results)
+	// The comparison is done with byte pointers instead of casting the
+	// addresses to uint32, which would cut them off on 64 bit targets.
+	return (uint8 *)this > (uint8 *)inode + inode->inode_size - sizeof(small_data) || name_size == 0;
+}
+
+#endif	/* BFS_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/cache.h b/src/add-ons/kernel/file_systems/bfs/cache.h
new file mode 100644
index 0000000000..a0e913840e
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/cache.h
@@ -0,0 +1,108 @@
+/*
+	Copyright 1999-2001, Be Incorporated.   All Rights Reserved.
+	This file may be used under the terms of the Be Sample Code License.
+*/
+
+#ifndef _CACHE_H_
+#define _CACHE_H_
+
+#include <BeBuild.h>
+
+/* one entry of the hash table; entries are chained via "next" */
+typedef struct hash_ent {
+	int              dev;
+	off_t            bnum;
+	off_t            hash_val;
+	void            *data;
+	struct hash_ent *next;
+} hash_ent;
+
+
+/* hash table made of hash_ent entries */
+typedef struct hash_table {
+    hash_ent **table;
+    int        max;
+    int        mask;          /* == max - 1 */
+    int        num_elements;
+} hash_table;
+
+
+#define HT_DEFAULT_MAX   128
+
+
+/* one cached block; "flags" holds the CE_* values defined below */
+typedef struct cache_ent {
+	int               dev;
+	off_t             block_num;
+	int               bsize;
+	volatile int      flags;
+
+	void             *data;
+	void             *clone;         /* copy of data by set_block_info() */
+	int               lock;
+
+	/* callback with the signature that set_blocks_info() accepts */
+	void            (*func)(off_t bnum, size_t num_blocks, void *arg);
+	off_t             logged_bnum;
+	void             *arg;
+
+	struct cache_ent *next,          /* points toward mru end of list */
+	                 *prev;          /* points toward lru end of list */
+
+} cache_ent;
+
+#define CE_NORMAL    0x0000     /* a nice clean pristine page */
+#define CE_DIRTY     0x0002     /* needs to be written to disk */
+#define CE_BUSY      0x0004     /* this block has i/o happening, don't touch it */
+
+
+/* list of cache entries, known by its two ends */
+typedef struct cache_ent_list {
+	cache_ent *lru;              /* tail of the list */
+	cache_ent *mru;              /* head of the list */
+} cache_ent_list;
+
+
+/* state of the block cache: its lock, block counters, hash table and lists */
+typedef struct block_cache {
+	struct lock		lock;
+	int			 	flags;
+    int				cur_blocks;
+	int				max_blocks;
+	hash_table		ht;
+
+	cache_ent_list	normal,       /* list of "normal" blocks (clean & dirty) */
+					locked;       /* list of clean and locked blocks */
+} block_cache;
+
+#if 0   /* XXXdbg -- need to deal with write through caches */
+#define DC_WRITE_THROUGH    0x0001  /* cache is write-through (for floppies) */
+#endif
+
+#define ALLOW_WRITES  1
+#define NO_WRITES     0
+
+/* block cache functions imported from the kernel (_IMPEXP_KERNEL) */
+extern _IMPEXP_KERNEL int   init_block_cache(int max_blocks, int flags);
+extern _IMPEXP_KERNEL void  shutdown_block_cache(void);
+
+extern _IMPEXP_KERNEL void  force_cache_flush(int dev, int prefer_log_blocks);
+extern _IMPEXP_KERNEL int   flush_blocks(int dev, off_t bnum, int nblocks);
+extern _IMPEXP_KERNEL int   flush_device(int dev, int warn_locked);
+
+/* "allow_write" takes ALLOW_WRITES or NO_WRITES */
+extern _IMPEXP_KERNEL int   init_cache_for_device(int fd, off_t max_blocks);
+extern _IMPEXP_KERNEL int   remove_cached_device_blocks(int dev, int allow_write);
+
+extern _IMPEXP_KERNEL void *get_block(int dev, off_t bnum, int bsize);
+extern _IMPEXP_KERNEL void *get_empty_block(int dev, off_t bnum, int bsize);
+extern _IMPEXP_KERNEL int   release_block(int dev, off_t bnum);
+extern _IMPEXP_KERNEL int   mark_blocks_dirty(int dev, off_t bnum, int nblocks);
+
+
+extern _IMPEXP_KERNEL int  cached_read(int dev, off_t bnum, void *data, off_t num_blocks, int bsize);
+extern _IMPEXP_KERNEL int  cached_write(int dev, off_t bnum, const void *data,
+				  off_t num_blocks, int bsize);
+extern _IMPEXP_KERNEL int  cached_write_locked(int dev, off_t bnum, const void *data,
+						 off_t num_blocks, int bsize);
+extern _IMPEXP_KERNEL int  set_blocks_info(int dev, off_t *blocks, int nblocks,
+					 void (*func)(off_t bnum, size_t nblocks, void *arg),
+					 void *arg);
+
+
+extern _IMPEXP_KERNEL size_t read_phys_blocks (int fd, off_t bnum, void *data, uint num_blocks, int bsize);
+extern _IMPEXP_KERNEL size_t write_phys_blocks(int fd, off_t bnum, void *data, uint num_blocks, int bsize);
+
+#endif /* _CACHE_H_ */
diff --git a/src/add-ons/kernel/file_systems/bfs/cpp.cpp b/src/add-ons/kernel/file_systems/bfs/cpp.cpp
new file mode 100644
index 0000000000..47d5ca110b
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/cpp.cpp
@@ -0,0 +1,17 @@
+/* cpp - C++ in the kernel
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include "cpp.h"
+
+
+nothrow_t _dontthrow;
+
+// Defines the __pure_virtual symbol with C linkage; it deliberately does
+// nothing (the diagnostic output is commented out).
+// NOTE(review): presumably this is what the compiler references for calls
+// of pure virtual functions - confirm for the toolchain in use.
+extern "C" void __pure_virtual()
+{
+	//printf("pure virtual function call");
+}
+
diff --git a/src/add-ons/kernel/file_systems/bfs/cpp.h b/src/add-ons/kernel/file_systems/bfs/cpp.h
new file mode 100644
index 0000000000..c9fd65fdf3
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/cpp.h
@@ -0,0 +1,52 @@
+#ifndef CPP_H
+#define CPP_H
+/* cpp - C++ in the kernel
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include <new>
+#include <stdlib.h>
+
+
+// Oh no! C++ in the kernel! Are you nuts?
+//
+//	- no exceptions
+//	- (almost) no virtuals (well, the Query code now uses them)
+//	- it's basically only the C++ syntax, and type checking
+//	- since one tends to encapsulate everything in classes, it has a slightly
+//	  higher memory overhead
+//	- nicer code
+//	- easier to maintain
+
+
+// Non-throwing allocation operators on top of malloc()/free(). Together
+// with the "#define new new (_dontthrow)" at the end of this header, every
+// "new" in the file system returns NULL if malloc() fails, so the result
+// has to be checked by the caller.
+inline void *operator new(size_t size, const nothrow_t&) throw()
+{
+	return malloc(size);
+} 
+
+inline void *operator new[](size_t size, const nothrow_t&) throw()
+{
+	return malloc(size);
+}
+ 
+inline void operator delete(void *ptr)
+{
+	free(ptr);
+} 
+
+inline void operator delete[](void *ptr)
+{
+	free(ptr);
+}
+
+// now we're using virtuals
+extern "C" void __pure_virtual();
+
+extern nothrow_t _dontthrow;
+#define new new (_dontthrow)
+
+
+#endif	/* CPP_H */
diff --git a/src/add-ons/kernel/file_systems/bfs/fsproto.h b/src/add-ons/kernel/file_systems/bfs/fsproto.h
new file mode 100644
index 0000000000..1fc15ddc7c
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/fsproto.h
@@ -0,0 +1,249 @@
+/*
+	Copyright 1999-2001, Be Incorporated.   All Rights Reserved.
+	This file may be used under the terms of the Be Sample Code License.
+*/
+
+#ifndef _FSPROTO_H
+#define _FSPROTO_H
+
+#include <sys/dirent.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <iovec.h>
+
+#include <OS.h>
+#include <fs_attr.h>
+#include <fs_info.h>
+#include <BeBuild.h>
+#include <Drivers.h>
+
+typedef dev_t		nspace_id;
+typedef ino_t		vnode_id;
+
+/*
+ * PUBLIC PART OF THE FILE SYSTEM PROTOCOL
+ */
+
+#define		WSTAT_MODE		0x0001
+#define		WSTAT_UID		0x0002
+#define		WSTAT_GID		0x0004
+#define		WSTAT_SIZE		0x0008
+#define		WSTAT_ATIME		0x0010
+#define		WSTAT_MTIME		0x0020
+#define		WSTAT_CRTIME	0x0040
+
+#define		WFSSTAT_NAME	0x0001
+
+#define		B_ENTRY_CREATED		1
+#define		B_ENTRY_REMOVED		2
+#define		B_ENTRY_MOVED		3
+#define		B_STAT_CHANGED		4
+#define		B_ATTR_CHANGED		5
+#define		B_DEVICE_MOUNTED	6
+#define		B_DEVICE_UNMOUNTED	7
+
+#define		B_STOP_WATCHING     0x0000
+#define		B_WATCH_NAME		0x0001
+#define		B_WATCH_STAT		0x0002
+#define		B_WATCH_ATTR		0x0004
+#define		B_WATCH_DIRECTORY	0x0008
+
+#define		SELECT_READ			1
+#define		SELECT_WRITE		2
+#define 	SELECT_EXCEPTION	3
+
+// missing ioctl() call added
+#define		IOCTL_FILE_UNCACHED_IO	10000
+
+#define		B_CUR_FS_API_VERSION	2
+
+struct attr_info;
+struct index_info;
+
+/*
+ * Function types of the hooks a file system provides; struct vnode_ops
+ * below contains one pointer for each of them. All of them return an int.
+ */
+typedef int	op_read_vnode(void *ns, vnode_id vnid, char r, void **node);
+typedef int	op_write_vnode(void *ns, void *node, char r);
+typedef int	op_remove_vnode(void *ns, void *node, char r);
+typedef int	op_secure_vnode(void *ns, void *node);
+
+typedef int	op_walk(void *ns, void *base, const char *file, char **newpath,
+					vnode_id *vnid);
+
+typedef int	op_access(void *ns, void *node, int mode);
+
+typedef int	op_create(void *ns, void *dir, const char *name,
+					int omode, int perms, vnode_id *vnid, void **cookie);
+typedef int	op_mkdir(void *ns, void *dir, const char *name,	int perms);
+typedef int	op_symlink(void *ns, void *dir, const char *name,
+					const char *path);
+typedef int op_link(void *ns, void *dir, const char *name, void *node);
+
+typedef int	op_rename(void *ns, void *olddir, const char *oldname,
+					void *newdir, const char *newname);
+typedef int	op_unlink(void *ns, void *dir, const char *name);
+typedef int	op_rmdir(void *ns, void *dir, const char *name);
+
+typedef int	op_readlink(void *ns, void *node, char *buf, size_t *bufsize);
+
+typedef int op_opendir(void *ns, void *node, void **cookie);
+typedef int	op_closedir(void *ns, void *node, void *cookie);
+typedef int	op_rewinddir(void *ns, void *node, void *cookie);
+typedef int	op_readdir(void *ns, void *node, void *cookie, long *num,
+					struct dirent *buf, size_t bufsize);
+
+typedef int	op_open(void *ns, void *node, int omode, void **cookie);
+typedef int	op_close(void *ns, void *node, void *cookie);
+typedef int op_free_cookie(void *ns, void *node, void *cookie);
+typedef int op_read(void *ns, void *node, void *cookie, off_t pos, void *buf,
+					size_t *len);
+typedef int op_write(void *ns, void *node, void *cookie, off_t pos,
+					const void *buf, size_t *len);
+typedef int op_readv(void *ns, void *node, void *cookie, off_t pos, const iovec *vec,
+					size_t count, size_t *len);
+typedef int op_writev(void *ns, void *node, void *cookie, off_t pos, const iovec *vec,
+					size_t count, size_t *len);
+typedef int	op_ioctl(void *ns, void *node, void *cookie, int cmd, void *buf,
+					size_t len);
+typedef	int	op_setflags(void *ns, void *node, void *cookie, int flags);
+
+typedef int	op_rstat(void *ns, void *node, struct stat *);
+typedef int op_wstat(void *ns, void *node, struct stat *, long mask);
+typedef int	op_fsync(void *ns, void *node);
+
+typedef int	op_select(void *ns, void *node, void *cookie, uint8 event,
+				uint32 ref, selectsync *sync);
+typedef int	op_deselect(void *ns, void *node, void *cookie, uint8 event,
+				selectsync *sync);
+
+typedef int	op_initialize(const char *devname, void *parms, size_t len);
+typedef int	op_mount(nspace_id nsid, const char *devname, ulong flags,
+					void *parms, size_t len, void **data, vnode_id *vnid);
+typedef int	op_unmount(void *ns);
+typedef int	op_sync(void *ns);
+typedef int op_rfsstat(void *ns, struct fs_info *);
+typedef int op_wfsstat(void *ns, struct fs_info *, long mask);
+
+
+typedef int	op_open_attrdir(void *ns, void *node, void **cookie);
+typedef int	op_close_attrdir(void *ns, void *node, void *cookie);
+typedef int	op_rewind_attrdir(void *ns, void *node, void *cookie);
+typedef int	op_read_attrdir(void *ns, void *node, void *cookie, long *num,
+					struct dirent *buf, size_t bufsize);
+typedef int	op_remove_attr(void *ns, void *node, const char *name);
+typedef	int	op_rename_attr(void *ns, void *node, const char *oldname,
+					const char *newname);
+typedef int	op_stat_attr(void *ns, void *node, const char *name,
+					struct attr_info *buf);
+
+typedef int	op_write_attr(void *ns, void *node, const char *name, int type,
+					const void *buf, size_t *len, off_t pos);
+typedef int	op_read_attr(void *ns, void *node, const char *name, int type,
+					void *buf, size_t *len, off_t pos);
+
+typedef int	op_open_indexdir(void *ns, void **cookie);
+typedef int	op_close_indexdir(void *ns, void *cookie);
+typedef int	op_rewind_indexdir(void *ns, void *cookie);
+typedef int	op_read_indexdir(void *ns, void *cookie, long *num,
+					struct dirent *buf, size_t bufsize);
+typedef int	op_create_index(void *ns, const char *name, int type, int flags);
+typedef int	op_remove_index(void *ns, const char *name);
+typedef	int	op_rename_index(void *ns, const char *oldname, 
+					const char *newname);
+typedef int	op_stat_index(void *ns, const char *name, struct index_info *buf);
+
+typedef int	op_open_query(void *ns, const char *query, ulong flags,
+					port_id port, long token, void **cookie);
+typedef int	op_close_query(void *ns, void *cookie);
+typedef int	op_read_query(void *ns, void *cookie, long *num,
+					struct dirent *buf, size_t bufsize);
+
+/*
+ * Table of the file system hooks: one function pointer per operation. A
+ * file system exports its table as "fs_entry" (declared at the end of
+ * this header). The order of the members must not be changed.
+ */
+typedef struct vnode_ops {
+	op_read_vnode			(*read_vnode);
+	op_write_vnode			(*write_vnode);
+	op_remove_vnode			(*remove_vnode);
+	op_secure_vnode			(*secure_vnode);
+	op_walk					(*walk);
+	op_access				(*access);
+	op_create				(*create);
+	op_mkdir				(*mkdir);
+	op_symlink				(*symlink);
+	op_link					(*link);
+	op_rename				(*rename);
+	op_unlink				(*unlink);
+	op_rmdir				(*rmdir);
+	op_readlink				(*readlink);
+	op_opendir				(*opendir);
+	op_closedir				(*closedir);
+	op_free_cookie			(*free_dircookie);
+	op_rewinddir			(*rewinddir);
+	op_readdir				(*readdir);
+	op_open					(*open);
+	op_close				(*close);
+	op_free_cookie			(*free_cookie);
+	op_read					(*read);
+	op_write				(*write);
+	op_readv				(*readv);
+	op_writev				(*writev);
+	op_ioctl				(*ioctl);
+	op_setflags				(*setflags);
+	op_rstat				(*rstat);
+	op_wstat				(*wstat);
+	op_fsync				(*fsync);
+	op_initialize			(*initialize);
+	op_mount				(*mount);
+	op_unmount				(*unmount);
+	op_sync					(*sync);
+	op_rfsstat				(*rfsstat);
+	op_wfsstat				(*wfsstat);
+	op_select				(*select);
+	op_deselect				(*deselect);
+	op_open_indexdir		(*open_indexdir);
+	op_close_indexdir		(*close_indexdir);
+	op_free_cookie			(*free_indexdircookie);
+	op_rewind_indexdir		(*rewind_indexdir);
+	op_read_indexdir		(*read_indexdir);
+	op_create_index			(*create_index);
+	op_remove_index			(*remove_index);
+	op_rename_index			(*rename_index);
+	op_stat_index			(*stat_index);
+	op_open_attrdir			(*open_attrdir);
+	op_close_attrdir		(*close_attrdir);
+	op_free_cookie			(*free_attrdircookie);
+	op_rewind_attrdir		(*rewind_attrdir);
+	op_read_attrdir			(*read_attrdir);
+	op_write_attr			(*write_attr);
+	op_read_attr			(*read_attr);
+	op_remove_attr			(*remove_attr);
+	op_rename_attr			(*rename_attr);
+	op_stat_attr			(*stat_attr);
+	op_open_query			(*open_query);
+	op_close_query			(*close_query);
+	op_free_cookie			(*free_querycookie);
+	op_read_query			(*read_query);
+} vnode_ops;
+
+/* functions imported from the kernel (_IMPEXP_KERNEL) */
+extern _IMPEXP_KERNEL int	new_path(const char *path, char **copy);
+extern _IMPEXP_KERNEL void	free_path(char *p);
+
+extern _IMPEXP_KERNEL int	notify_listener(int op, nspace_id nsid,
+									vnode_id vnida,	vnode_id vnidb,
+									vnode_id vnidc, const char *name);
+extern _IMPEXP_KERNEL void	notify_select_event(selectsync *sync, uint32 ref);
+extern _IMPEXP_KERNEL int	send_notification(port_id port, long token,
+									ulong what, long op, nspace_id nsida,
+									nspace_id nsidb, vnode_id vnida,
+									vnode_id vnidb, vnode_id vnidc,
+									const char *name);
+extern _IMPEXP_KERNEL int	get_vnode(nspace_id nsid, vnode_id vnid, void **data);
+extern _IMPEXP_KERNEL int	put_vnode(nspace_id nsid, vnode_id vnid);
+extern _IMPEXP_KERNEL int	new_vnode(nspace_id nsid, vnode_id vnid, void *data);
+extern _IMPEXP_KERNEL int	remove_vnode(nspace_id nsid, vnode_id vnid);
+extern _IMPEXP_KERNEL int	unremove_vnode(nspace_id nsid, vnode_id vnid);
+extern _IMPEXP_KERNEL int	is_vnode_removed(nspace_id nsid, vnode_id vnid);
+
+
+/* symbols the file system add-on itself has to export (_EXPORT) */
+extern _EXPORT vnode_ops	fs_entry;
+extern _EXPORT int32		api_version;
+
+#endif
diff --git a/src/add-ons/kernel/file_systems/bfs/kernel_interface.cpp b/src/add-ons/kernel/file_systems/bfs/kernel_interface.cpp
new file mode 100644
index 0000000000..5dcaab7628
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/kernel_interface.cpp
@@ -0,0 +1,1880 @@
+/* kernel_interface - file system interface to BeOS' vnode layer
+**
+** Initial version by Axel Dörfler, axeld@pinc-software.de
+** This file may be used under the terms of the OpenBeOS License.
+*/
+
+
+#include "Debug.h"
+#include "cpp.h"
+#include "Volume.h"
+#include "Inode.h"
+#include "Index.h"
+#include "BPlusTree.h"
+#include "Query.h"
+
+#include <string.h>
+#include <stdio.h>
+
+// BeOS vnode layer stuff
+#include <KernelExport.h>
+#ifndef _IMPEXP_KERNEL
+#	define _IMPEXP_KERNEL
+#endif
+
+extern "C" {
+	#include <fsproto.h>
+	#include <lock.h>
+	#include <cache.h>
+}
+#include <fs_index.h>
+#include <fs_query.h>
+
+
+#ifdef USER
+#	define dprintf printf
+#endif
+
+
+// Forward declarations of all hooks referenced by the fs_entry table below.
+// They are declared static; the definitions further down in this file
+// inherit that linkage even where they omit the keyword.
+extern "C" {
+	// volume level
+	static int bfs_mount(nspace_id nsid, const char *device, ulong flags,
+					void *parms, size_t len, void **data, vnode_id *vnid);
+	static int bfs_unmount(void *_ns);
+	static int bfs_read_fs_stat(void *_ns, struct fs_info *);
+	static int bfs_write_fs_stat(void *ns, struct fs_info *, long mode);
+	static int bfs_initialize(const char *devname, void *parms, size_t len);
+
+	static int bfs_sync(void *ns);
+
+	// vnode life cycle
+	static int bfs_read_vnode(void *_ns, vnode_id vnid, char r, void **node);
+	static int bfs_release_vnode(void *_ns, void *_node, char r);
+	static int bfs_remove_vnode(void *ns, void *node, char r);
+
+	static int bfs_walk(void *_ns, void *_base, const char *file,
+					char **newpath, vnode_id *vnid);
+
+	static int bfs_ioctl(void *ns, void *node, void *cookie, int cmd, void *buf,size_t len);
+	static int bfs_setflags(void *ns, void *node, void *cookie, int flags);
+
+	static int bfs_select(void *ns, void *node, void *cookie, uint8 event,
+					uint32 ref, selectsync *sync);
+	static int bfs_deselect(void *ns, void *node, void *cookie, uint8 event,
+					selectsync *sync);
+	static int bfs_fsync(void *ns,void *node);
+
+	// entry creation/removal
+	// (the open mode comes first, then the permission bits - the parameter
+	// names now match the definition of bfs_create())
+	static int bfs_create(void *ns, void *dir, const char *name,
+					int omode, int perms, vnode_id *vnid, void **cookie);
+	static int bfs_symlink(void *ns, void *dir, const char *name,
+					const char *path);
+	static int bfs_link(void *ns, void *dir, const char *name, void *node);
+	static int bfs_unlink(void *ns, void *dir, const char *name);
+	static int bfs_rename(void *ns, void *oldDir, const char *oldName, void *newDir, const char *newName);
+
+	static int bfs_read_stat(void *_ns, void *_node, struct stat *st);
+	static int bfs_write_stat(void *ns, void *node, struct stat *st, long mask);
+
+	// file access
+	static int bfs_open(void *_ns, void *_node, int omode, void **cookie);
+	static int bfs_read(void *_ns, void *_node, void *cookie, off_t pos,
+					void *buf, size_t *len);
+	static int bfs_write(void *ns, void *node, void *cookie, off_t pos,
+					const void *buf, size_t *len);
+	static int bfs_free_cookie(void *ns, void *node, void *cookie);
+	static int bfs_close(void *ns, void *node, void *cookie);
+
+	static int bfs_access(void *_ns, void *_node, int mode);
+	static int bfs_read_link(void *_ns, void *_node, char *buffer, size_t *bufferSize);
+
+	// directory functions
+	static int bfs_mkdir(void *ns, void *dir, const char *name, int perms);
+	static int bfs_rmdir(void *ns, void *dir, const char *name);
+	static int bfs_open_dir(void *_ns, void *_node, void **cookie);
+	static int bfs_read_dir(void *_ns, void *_node, void *cookie,
+					long *num, struct dirent *dirent, size_t bufferSize);
+	static int bfs_rewind_dir(void *_ns, void *_node, void *cookie);
+	static int bfs_close_dir(void *_ns, void *_node, void *cookie);
+	static int bfs_free_dir_cookie(void *_ns, void *_node, void *cookie);
+
+	// attribute support
+	static int bfs_open_attrdir(void *ns, void *node, void **cookie);
+	static int bfs_close_attrdir(void *ns, void *node, void *cookie);
+	static int bfs_free_attrdir_cookie(void *ns, void *node, void *cookie);
+	static int bfs_rewind_attrdir(void *ns, void *node, void *cookie);
+	static int bfs_read_attrdir(void *ns, void *node, void *cookie, long *num,
+					struct dirent *buf, size_t bufferSize);
+	static int bfs_remove_attr(void *ns, void *node, const char *name);
+	static int bfs_rename_attr(void *ns, void *node, const char *oldname,
+					const char *newname);
+	static int bfs_stat_attr(void *ns, void *node, const char *name,
+					struct attr_info *buf);
+	static int bfs_write_attr(void *ns, void *node, const char *name, int type,
+					const void *buf, size_t *len, off_t pos);
+	static int bfs_read_attr(void *ns, void *node, const char *name, int type,
+					void *buf, size_t *len, off_t pos);
+
+	// index support
+	static int bfs_open_indexdir(void *ns, void **cookie);
+	static int bfs_close_indexdir(void *ns, void *cookie);
+	static int bfs_free_indexdir_cookie(void *ns, void *node, void *cookie);
+	static int bfs_rewind_indexdir(void *ns, void *cookie);
+	static int bfs_read_indexdir(void *ns, void *cookie, long *num,struct dirent *dirent,
+					size_t bufferSize);
+	static int bfs_create_index(void *ns, const char *name, int type, int flags);
+	static int bfs_remove_index(void *ns, const char *name);
+	static int bfs_rename_index(void *ns, const char *oldname, const char *newname);
+	static int bfs_stat_index(void *ns, const char *name, struct index_info *indexInfo);
+
+	// query support
+	static int bfs_open_query(void *ns, const char *query, ulong flags,
+					port_id port, long token, void **cookie);
+	static int bfs_close_query(void *ns, void *cookie);
+	static int bfs_free_query_cookie(void *ns, void *node, void *cookie);
+	static int bfs_read_query(void *ns, void *cookie, long *num,
+					struct dirent *buf, size_t bufsize);
+}	// extern "C"
+
+
+/* vnode_ops struct. Fill this in to tell the kernel how to call
+	functions in your driver.
+
+	The initializers are positional: each entry has to sit at the position
+	of the corresponding member of the vnode_ops structure declared in
+	fsproto.h, so entries must never be reordered. Hooks this file does
+	not implement (secure_vnode, readv, writev) are left NULL.
+*/
+
+vnode_ops fs_entry =  {
+	&bfs_read_vnode,			// read_vnode
+	&bfs_release_vnode,			// write_vnode
+	&bfs_remove_vnode,			// remove_vnode
+	NULL,						// secure_vnode (not needed)
+	&bfs_walk,					// walk
+	&bfs_access,				// access
+	&bfs_create,				// create
+	&bfs_mkdir,					// mkdir
+	&bfs_symlink,				// symlink
+	&bfs_link,					// link
+	&bfs_rename,				// rename
+	&bfs_unlink,				// unlink
+	&bfs_rmdir,					// rmdir
+	&bfs_read_link,				// readlink
+	&bfs_open_dir,				// opendir
+	&bfs_close_dir,				// closedir
+	&bfs_free_dir_cookie,		// free_dircookie
+	&bfs_rewind_dir,			// rewinddir
+	&bfs_read_dir,				// readdir
+	&bfs_open,					// open file
+	&bfs_close,					// close file
+	&bfs_free_cookie,			// free cookie
+	&bfs_read,					// read file
+	&bfs_write,					// write file
+	NULL,						// readv
+	NULL,						// writev
+	&bfs_ioctl,					// ioctl
+	&bfs_setflags,				// setflags file
+	&bfs_read_stat,				// read stat
+	&bfs_write_stat,			// write stat
+	&bfs_fsync,					// fsync
+	&bfs_initialize,			// initialize
+	&bfs_mount,					// mount
+	&bfs_unmount,				// unmount
+	&bfs_sync,					// sync
+	&bfs_read_fs_stat,			// read fs stat
+	&bfs_write_fs_stat,			// write fs stat
+	&bfs_select,				// select
+	&bfs_deselect,				// deselect
+
+	// index directory and index hooks
+	&bfs_open_indexdir,			// open index dir
+	&bfs_close_indexdir,		// close index dir
+	&bfs_free_indexdir_cookie,	// free index dir cookie
+	&bfs_rewind_indexdir,		// rewind index dir
+	&bfs_read_indexdir,			// read index dir
+	&bfs_create_index,			// create index
+	&bfs_remove_index,			// remove index
+	&bfs_rename_index,			// rename index
+	&bfs_stat_index,			// stat index
+
+	// attribute directory and attribute hooks
+	&bfs_open_attrdir,			// open attr dir
+	&bfs_close_attrdir,			// close attr dir
+	&bfs_free_attrdir_cookie,	// free attr dir cookie
+	&bfs_rewind_attrdir,		// rewind attr dir
+	&bfs_read_attrdir,			// read attr dir
+	&bfs_write_attr,			// write attr
+	&bfs_read_attr,				// read attr
+	&bfs_remove_attr,			// remove attr
+	&bfs_rename_attr,			// rename attr
+	&bfs_stat_attr,				// stat attr
+
+	// query hooks
+	&bfs_open_query,			// open query
+	&bfs_close_query,			// close query
+	&bfs_free_query_cookie,		// free query cookie
+	&bfs_read_query				// read query
+};
+
+// I/O size reported to callers: bfs_read_fs_stat() stores it in
+// fs_info::io_size and bfs_read_stat() in stat::st_blksize.
+#define BFS_IO_SIZE 65536
+// Exported alongside fs_entry; set to the B_CUR_FS_API_VERSION constant.
+int32	api_version = B_CUR_FS_API_VERSION;
+
+
+/**	Mounts the volume found on \a device. A Volume object is created for
+ *	the name space \a nsid; if mounting succeeds it is handed back in
+ *	\a data together with the vnode ID of the root directory in \a rootID,
+ *	otherwise it is destroyed again and the error is returned.
+ */
+
+static int
+bfs_mount(nspace_id nsid, const char *device, ulong flags, void *parms,
+		size_t len, void **data, vnode_id *rootID)
+{
+	FUNCTION();
+
+#ifndef USER
+	// This call is used for debugging purposes only. If the file system
+	// doesn't build because of it, either add the prototype
+	//	extern int load_driver_symbols(const char *driver_name);
+	// to your KernelExport.h (it is missing there in some releases of
+	// BeOS R5), or simply comment the line out - it won't do any harm.
+	load_driver_symbols("obfs");
+#endif
+
+	Volume *volume = new Volume(nsid);
+	if (volume == NULL)
+		return B_NO_MEMORY;
+
+	status_t status = volume->Mount(device,flags);
+	if (status != B_OK) {
+		// nobody else knows about the volume yet, so just get rid of it
+		delete volume;
+		RETURN_ERROR(status);
+	}
+
+	*data = volume;
+	*rootID = volume->ToVnode(volume->Root());
+	INFORM(("mounted \"%s\" (root node at %Ld, device = %s)\n",volume->Name(),*rootID,device));
+
+	RETURN_ERROR(status);
+}
+
+
+/**	Unmounts the volume passed in \a ns and deletes the Volume object,
+ *	no matter whether Volume::Unmount() succeeded. Its result is returned.
+ */
+
+static int
+bfs_unmount(void *ns)
+{
+	FUNCTION();
+
+	Volume *volume = static_cast<Volume *>(ns);
+	status_t result = volume->Unmount();
+
+	delete volume;
+
+	RETURN_ERROR(result);
+}
+
+
+/**	Fills in the fs_info structure \a info for the volume \a _ns:
+ *	capability flags, I/O and block sizes, block counts, the volume
+ *	name, and the name of the file system.
+ *	Returns B_BAD_VALUE if one of the arguments is NULL.
+ */
+
+static int
+bfs_read_fs_stat(void *_ns, struct fs_info *info)
+{
+	FUNCTION();
+
+	Volume *volume = (Volume *)_ns;
+	if (volume == NULL || info == NULL)
+		return B_BAD_VALUE;
+
+	// capabilities of the file system; read-only is a per volume property
+	info->flags = B_FS_IS_PERSISTENT | B_FS_HAS_ATTR | B_FS_HAS_MIME | B_FS_HAS_QUERY;
+	if (volume->IsReadOnly())
+		info->flags |= B_FS_IS_READONLY;
+
+	// whatever is appropriate here? Just use the same value as BFS
+	// (and iso9660) for now
+	info->io_size = BFS_IO_SIZE;
+
+	info->block_size = volume->BlockSize();
+	info->total_blocks = volume->NumBlocks();
+	info->free_blocks = volume->FreeBlocks();
+
+	// copy the volume name, and make sure it's always terminated
+	const size_t lastIndex = sizeof(info->volume_name) - 1;
+	strncpy(info->volume_name, volume->Name(), lastIndex);
+	info->volume_name[lastIndex] = '\0';
+
+	// File system name (ToDo: has to change to "bfs" later)
+	strcpy(info->fsh_name,"obfs");
+
+	return B_NO_ERROR;
+}
+
+
+/**	Changes volume wide settings. Only the volume name (WFSSTAT_NAME in
+ *	\a mask) is handled: it is copied into the super block, which is then
+ *	written back. For any other mask B_BAD_VALUE is returned.
+ */
+
+static int
+bfs_write_fs_stat(void *_ns, struct fs_info *info, long mask)
+{
+	FUNCTION_START(("mask = %ld\n",mask));
+
+	Volume *volume = (Volume *)_ns;
+	disk_super_block &superBlock = volume->SuperBlock();
+
+	Locker locker(volume->Lock());
+
+	// the name is the only thing that can be changed here
+	if ((mask & WFSSTAT_NAME) == 0)
+		return B_BAD_VALUE;
+
+	const size_t lastIndex = sizeof(superBlock.name) - 1;
+	strncpy(superBlock.name,info->volume_name,lastIndex);
+	superBlock.name[lastIndex] = '\0';
+
+	return volume->WriteSuperBlock();
+}
+
+
+/**	Is supposed to initialize the device \a deviceName with an empty
+ *	file system. This is not implemented yet, so it always fails
+ *	with B_ERROR.
+ */
+
+static int
+bfs_initialize(const char *deviceName, void *parms, size_t len)
+{
+	FUNCTION_START(("deviceName = %s, parameter len = %ld\n",deviceName,len));
+
+	// ToDo: implement bfs_initialize()!
+	status_t status = B_ERROR;
+
+	return status;
+}
+
+
+/**	Forwards the sync request to Volume::Sync() and returns its result.
+ *	Returns B_BAD_VALUE if no volume was passed in.
+ */
+
+static int
+bfs_sync(void *_ns)
+{
+	FUNCTION();
+
+	Volume *volume = (Volume *)_ns;
+	if (volume == NULL)
+		return B_BAD_VALUE;
+
+	return volume->Sync();
+}
+
+
+//	#pragma mark -
+
+
+/**	Creates the in-memory Inode object for the vnode \a id and returns
+ *	it in \a node. The \a reenter flag tells you if this function is
+ *	being called via some other fs routine, so that things like
+ *	double-locking can be avoided; it is passed on to the Inode
+ *	constructor.
+ *	IDs outside of the volume are rejected with B_ERROR, as are inodes
+ *	that fail their InitCheck().
+ */
+
+static int
+bfs_read_vnode(void *_ns, vnode_id id, char reenter, void **node)
+{
+	FUNCTION_START(("vnode_id = %Ld\n",id));
+	Volume *volume = (Volume *)_ns;
+
+	const bool outOfRange = id < 0 || id > volume->NumBlocks();
+	if (outOfRange) {
+		FATAL(("inode at %Ld requested!\n",id));
+		return B_ERROR;
+	}
+
+	Inode *inode = new Inode(volume,id,false,reenter);
+	if (inode == NULL)
+		return B_NO_MEMORY;
+
+	if (inode->InitCheck() != B_OK) {
+		// the inode could not be initialized, so it's of no use to anyone
+		delete inode;
+		RETURN_ERROR(B_ERROR);
+	}
+
+	*node = (void *)inode;
+	return B_OK;
+}
+
+
+/**	Counterpart of bfs_read_vnode(): deletes the Inode object that was
+ *	created there. Always returns B_NO_ERROR.
+ */
+
+static int
+bfs_release_vnode(void *ns, void *_node, char reenter)
+{
+	//FUNCTION_START(("node = %p\n",_node));
+
+	delete static_cast<Inode *>(_node);
+
+	return B_NO_ERROR;
+}
+
+
+/**	Frees everything that belongs to a removed inode: its data stream,
+ *	its attributes, and finally the inode's own block run. The Inode
+ *	object is deleted once the blocks could be freed.
+ *	Returns B_BAD_VALUE on NULL arguments; any error from shrinking the
+ *	stream or from freeing the inode's blocks is reported to the caller.
+ */
+
+int 
+bfs_remove_vnode(void *_ns, void *_node, char reenter)
+{
+	FUNCTION();
+
+	if (_ns == NULL || _node == NULL)
+		return B_BAD_VALUE;
+
+	Volume *volume = (Volume *)_ns;
+	Inode *inode = (Inode *)_node;
+
+	// If the inode isn't in use anymore, we were called before
+	// bfs_unlink() returns - in this case, we can just use the
+	// transaction which has already deleted the inode.
+	Transaction localTransaction,*transaction = &localTransaction;
+	Journal *journal = volume->GetJournal(volume->ToBlock(inode->Parent()));
+
+	if (journal != NULL && journal->CurrentThread() == find_thread(NULL))
+		transaction = journal->CurrentTransaction();
+	else
+		localTransaction.Start(volume,inode->BlockNumber());
+
+	// Perhaps there should be an implementation of Inode::ShrinkStream() that
+	// just frees the data_stream, but doesn't change the inode (since it is
+	// freed anyway) - that would make an undelete command possible
+	status_t status = inode->SetFileSize(transaction,0);
+	if (status < B_OK)
+		return status;
+
+	// Free all attributes, and remove their indices
+	{
+		// We have to limit the scope of AttributeIterator, so that its
+		// destructor is not called after the inode is deleted
+		AttributeIterator iterator(inode);
+
+		char name[B_FILE_NAME_LENGTH];
+		uint32 type;
+		size_t length;
+		vnode_id id;
+		// the loop ends with the first error GetNext() reports; that
+		// status is intentionally dropped, it only terminates the iteration
+		while ((status = iterator.GetNext(name,&length,&type,&id)) == B_OK)
+			inode->RemoveAttribute(transaction,name);
+	}
+
+	status = volume->Free(transaction,inode->BlockRun());
+	if (status == B_OK) {
+		// only a transaction we started ourselves has to be finished here
+		if (transaction == &localTransaction)
+			localTransaction.Done();
+
+		delete inode;
+	}
+
+	// Report a failed Volume::Free() instead of pretending success - in
+	// that case the inode has neither been freed on disk nor in memory.
+	return status;
+}
+
+
+//	#pragma mark -
+
+
+/**	The walk function just "walks" through a directory looking for the
+ *	specified file. It calls get_vnode() on its vnode-id to init it
+ *	for the kernel.
+ *	If the entry is a symbolic link and \a _resolvedPath is not NULL,
+ *	the link target is returned there as a new_path() copy, and the
+ *	reference to the link's vnode is released again.
+ *	Returns B_BAD_VALUE on NULL arguments or if the directory has no
+ *	tree, the error of the permission check or the lookup, and
+ *	B_ENTRY_NOT_FOUND if the vnode could not be acquired.
+ */
+
+static int
+bfs_walk(void *_ns, void *_directory, const char *file, char **_resolvedPath, vnode_id *vnid)
+{
+	FUNCTION_START(("file = %s\n",file));
+
+	if (_ns == NULL || _directory == NULL || file == NULL)
+		return B_BAD_VALUE;
+
+	Volume *volume = (Volume *)_ns;
+	Inode *directory = (Inode *)_directory;
+
+	// check access permissions
+	status_t status = directory->CheckPermissions(X_OK);
+	if (status < B_OK)
+		RETURN_ERROR(status);
+
+	BPlusTree *tree;
+	if (directory->GetTree(&tree) != B_OK)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	if ((status = tree->Find((uint8 *)file,(uint16)strlen(file),vnid)) < B_OK)
+		RETURN_ERROR(status);
+
+	Inode *inode;
+	if ((status = get_vnode(volume->ID(),*vnid,(void **)&inode)) != B_OK) {
+		REPORT_ERROR(status);
+		return B_ENTRY_NOT_FOUND;
+	}
+
+	// Is inode a symlink? Then resolve it, if we should
+
+	if (inode->IsSymLink() && _resolvedPath != NULL) {
+		// "status" is B_OK at this point
+		char *newPath = NULL;
+
+		// Symbolic links can store their target in the data stream (for links
+		// that take more than 144 bytes of storage [the size of the data_stream
+		// structure]), or directly instead of the data_stream class
+		// So we have to deal with both cases here.
+
+		// Note: we would naturally call bfs_read_link() here, but the API of the
+		// vnode layer would require us to always reserve a large chunk of memory
+		// for the path, so we're not going to do that
+
+		if (inode->Flags() & INODE_LONG_SYMLINK) {
+			const size_t linkLength = inode->Node()->data.size;
+			size_t readBytes = linkLength;
+
+			// bfs_symlink() stores only strlen(path) bytes in the stream,
+			// so reserve one extra byte and terminate the string ourselves
+			// before handing it to new_path()
+			char *data = (char *)malloc(linkLength + 1);
+			if (data != NULL) {
+				status = inode->ReadAt(0, (uint8 *)data, &readBytes);
+				if (status == B_OK && readBytes != linkLength) {
+					// a truncated target must not be reported as success
+					status = B_IO_ERROR;
+				}
+				if (status == B_OK) {
+					data[linkLength] = '\0';
+					status = new_path(data, &newPath);
+				}
+
+				free(data);
+			} else
+				status = B_NO_MEMORY;
+		} else
+			status = new_path((char *)&inode->Node()->short_symlink, &newPath);
+
+		// the kernel continues with the resolved path, not with this vnode
+		put_vnode(volume->ID(), inode->ID());
+		if (status == B_OK)
+			*_resolvedPath = newPath;
+
+		RETURN_ERROR(status);
+	}
+
+	return B_OK;
+}
+
+
+/**	Handles file system specific ioctl() commands.
+ *	IOCTL_FILE_UNCACHED_IO is recognized, but not implemented, and fails
+ *	with B_ERROR. In DEBUG builds, a couple of undocumented commands
+ *	(56742 - 56745) exercise the block allocator and dump on-disk
+ *	structures. Everything else is rejected with B_BAD_VALUE.
+ */
+
+int 
+bfs_ioctl(void *_ns, void *_node, void *_cookie, int cmd, void *buffer, size_t bufferLength)
+{
+	FUNCTION_START(("node = %p, cmd = %d, buf = %p, len = %ld\n",_node,cmd,buffer,bufferLength));
+
+	if (_ns == NULL)
+		return B_BAD_VALUE;
+
+	Volume *volume = (Volume *)_ns;
+	Inode *inode = (Inode *)_node;
+
+	switch (cmd) {
+		case IOCTL_FILE_UNCACHED_IO:
+			// the vnode ID is a 64 bit value, and is printed with %Ld
+			// like everywhere else in this file
+			if (inode != NULL)
+				PRINT(("trying to make access to inode %Ld uncached. Not yet implemented!\n",inode->ID()));
+			return B_ERROR;
+#ifdef DEBUG
+		case 56742:
+		{
+			// allocate all free blocks and zero them out (a test for the BlockAllocator)!
+			BlockAllocator &allocator = volume->Allocator();
+			Transaction transaction(volume,0);
+			CachedBlock cached(volume);
+			block_run run;
+			while (allocator.AllocateBlocks(&transaction,8,0,64,1,run) == B_OK) {
+				PRINT(("write block_run(%ld, %d, %d)\n",run.allocation_group,run.start,run.length));
+				// NOTE(review): the loop index is not used - every iteration
+				// maps and clears the block SetTo(run) returns; this looks
+				// like only the first block of the run is ever zeroed.
+				for (int32 i = 0;i < run.length;i++) {
+					uint8 *block = cached.SetTo(run);
+					if (block != NULL) {
+						memset(block,0,volume->BlockSize());
+						cached.WriteBack(&transaction);
+					}
+				}
+			}
+			// NOTE(review): transaction.Done() is never called here
+			return B_OK;
+		}
+		case 56743:
+			dump_super_block(&volume->SuperBlock());
+			return B_OK;
+		case 56744:
+			if (inode != NULL)
+				dump_inode(inode->Node());
+			return B_OK;
+		case 56745:
+			if (inode != NULL)
+				dump_block((const char *)inode->Node(),volume->BlockSize());
+			return B_OK;
+#endif
+	}
+	return B_BAD_VALUE;
+}
+
+
+/**	Is supposed to change the flags of an open file. Not implemented
+ *	yet: the request is only logged, and B_OK is returned.
+ */
+
+int 
+bfs_setflags(void *ns, void *node, void *cookie, int flags)
+{
+	// terminate the trace line like every other FUNCTION_START() in this file
+	FUNCTION_START(("node = %p, flags = %d\n",node,flags));
+
+	// ToDo: implement bfs_setflags()!
+	INFORM(("setflags not yet implemented...\n"));
+
+	return B_OK;
+}
+
+
+/**	select() hook: the event is reported right away by calling
+ *	notify_select_event() with the \a sync and \a ref arguments.
+ *	Always returns B_OK.
+ */
+
+static int
+bfs_select(void *ns, void *node, void *cookie, uint8 event, uint32 ref, selectsync *sync)
+{
+	FUNCTION_START(("event = %d, ref = %lu, sync = %p\n",event,ref,sync));
+
+	// there is nothing to wait for, so notify the caller immediately
+	notify_select_event(sync, ref);
+	return B_OK;
+}
+
+
+/**	Counterpart of bfs_select(). Nothing was registered there, so
+ *	there is nothing to undo; always returns B_OK.
+ */
+
+static int
+bfs_deselect(void *ns, void *node, void *cookie, uint8 event, selectsync *sync)
+{
+	FUNCTION();
+
+	return B_OK;
+}
+
+
+/**	Forwards the request to Inode::Sync() and returns its result.
+ *	Returns B_BAD_VALUE if no node was passed in.
+ */
+
+static int
+bfs_fsync(void *_ns, void *_node)
+{
+	FUNCTION();
+
+	Inode *inode = (Inode *)_node;
+	if (inode == NULL)
+		return B_BAD_VALUE;
+
+	return inode->Sync();
+}
+
+
+/**	Fills in the stat structure \a st for the node \a _node.
+ *	The link count is always 1, the access time is the current time,
+ *	and both the modification and the change time are taken from the
+ *	inode's last_modified_time.
+ */
+
+static int
+bfs_read_stat(void *_ns, void *_node, struct stat *st)
+{
+	FUNCTION();
+
+	Inode *inode = (Inode *)_node;
+	Volume *volume = (Volume *)_ns;
+	bfs_inode *raw = inode->Node();
+
+	// where the node lives
+	st->st_dev = volume->ID();
+	st->st_ino = inode->ID();
+	st->st_nlink = 1;
+	st->st_blksize = BFS_IO_SIZE;
+
+	// ownership, type/permissions, and size are taken over as is
+	st->st_mode = raw->mode;
+	st->st_uid = raw->uid;
+	st->st_gid = raw->gid;
+	st->st_size = raw->data.size;
+
+	// the on-disk times have to be shifted down to get seconds
+	const time_t modified = (time_t)(raw->last_modified_time >> INODE_TIME_SHIFT);
+	st->st_ctime = modified;
+	st->st_mtime = modified;
+	st->st_crtime = (time_t)(raw->create_time >> INODE_TIME_SHIFT);
+	st->st_atime = time(NULL);
+
+	return B_NO_ERROR;
+}
+
+
+/**	Changes the stat fields selected by \a mask (mode, uid, gid, size,
+ *	modification and creation time) of the node \a _node, and writes
+ *	the inode back.
+ *	Returns B_BAD_VALUE on NULL arguments, B_IS_A_DIRECTORY when asked
+ *	to resize a directory (nothing is changed in that case), or the
+ *	error of the permission check, of resizing the file, or of writing
+ *	the inode back.
+ */
+
+int 
+bfs_write_stat(void *_ns, void *_node, struct stat *stat, long mask)
+{
+	FUNCTION();
+
+	if (_ns == NULL || _node == NULL || stat == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	Volume *volume = (Volume *)_ns;
+	Inode *inode = (Inode *)_node;
+
+	// that may be incorrect here - I don't think we need write access to
+	// change most of the stat...
+	// we should definitely check a bit more if the new stats are correct and valid...
+
+	status_t status = inode->CheckPermissions(W_OK);
+	if (status < B_OK)
+		RETURN_ERROR(status);
+
+	WriteLocked locked(inode->Lock());
+	if (locked.IsLocked() < B_OK)
+		RETURN_ERROR(B_ERROR);
+
+	// Refuse an invalid request before anything is modified, so that a
+	// rejected call doesn't leave changed mode/uid/gid values in memory
+	if ((mask & WSTAT_SIZE) != 0 && inode->IsDirectory())
+		return B_IS_A_DIRECTORY;
+
+	Transaction transaction(volume,inode->BlockNumber());
+
+	bfs_inode *node = inode->Node();
+
+	if (mask & WSTAT_MODE) {
+		PRINT(("original mode = %ld, stat->st_mode = %ld\n",node->mode,stat->st_mode));
+		// only the permission bits may be changed, the file type is kept
+		node->mode = (node->mode & ~S_IUMSK) | (stat->st_mode & S_IUMSK);
+	}
+
+	if (mask & WSTAT_UID)
+		node->uid = stat->st_uid;
+	if (mask & WSTAT_GID)
+		node->gid = stat->st_gid;
+
+	if ((mask & WSTAT_SIZE) != 0 && inode->Size() != stat->st_size) {
+		status = inode->SetFileSize(&transaction,stat->st_size);
+		if (status < B_OK) {
+			// the size could not be changed - don't go on and report
+			// success for the rest of the changes
+			RETURN_ERROR(status);
+		}
+
+		// fill the new blocks (if any) with zeros
+		inode->FillGapWithZeros(inode->OldSize(),inode->Size());
+
+		Index index(volume);
+		index.UpdateSize(&transaction,inode);
+
+		// if the caller sets the modification time as well, the index
+		// is updated below with that value
+		if ((mask & WSTAT_MTIME) == 0)
+			index.UpdateLastModified(&transaction,inode);
+	}
+
+	if (mask & WSTAT_MTIME) {
+		// Index::UpdateLastModified() will set the new time in the inode
+		Index index(volume);
+		index.UpdateLastModified(&transaction,inode,(bigtime_t)stat->st_mtime << INODE_TIME_SHIFT);
+	}
+	if (mask & WSTAT_CRTIME) {
+		node->create_time = (bigtime_t)stat->st_crtime << INODE_TIME_SHIFT;
+	}
+
+	if ((status = inode->WriteBack(&transaction)) == B_OK)
+		transaction.Done();
+
+	notify_listener(B_STAT_CHANGED,volume->ID(),0,0,inode->ID(),NULL);
+
+	return status;
+}
+
+
+/**	Creates the file \a name in the directory \a _directory via
+ *	Inode::Create(). On success, the ID of the file is returned in
+ *	\a vnid, a file_cookie for it in \a _cookie, and listeners are told
+ *	about the new entry.
+ *	Fails with B_BAD_VALUE on invalid arguments, B_BAD_TYPE if
+ *	\a _directory is no directory, B_NO_MEMORY if there is no memory for
+ *	the cookie, or with the error of the permission check or
+ *	Inode::Create().
+ */
+
+static int
+bfs_create(void *_ns, void *_directory, const char *name, int omode, int mode, vnode_id *vnid, void **_cookie)
+{
+	FUNCTION_START(("name = \"%s\", perms = %ld, omode = %ld\n",name,mode,omode));
+
+	if (_ns == NULL || _directory == NULL || _cookie == NULL
+		|| name == NULL || *name == '\0')
+		RETURN_ERROR(B_BAD_VALUE);
+
+	Volume *volume = (Volume *)_ns;
+	Inode *directory = (Inode *)_directory;
+
+	if (!directory->IsDirectory())
+		RETURN_ERROR(B_BAD_TYPE);
+
+	status_t status = directory->CheckPermissions(W_OK);
+	if (status < B_OK)
+		RETURN_ERROR(status);
+
+	// set up the cookie first - the file is empty at this point
+	file_cookie *newCookie = (file_cookie *)malloc(sizeof(file_cookie));
+	if (newCookie == NULL)
+		RETURN_ERROR(B_NO_MEMORY); 
+
+	newCookie->open_mode = omode;
+	newCookie->last_size = 0;
+	newCookie->last_notification = system_time();
+
+	Transaction transaction(volume,directory->BlockNumber());
+
+	status = Inode::Create(&transaction,directory,name,S_FILE | (mode & S_IUMSK),omode,0,vnid);
+	if (status < B_OK) {
+		// nobody is going to free the cookie for us if we fail
+		free(newCookie);
+		return status;
+	}
+
+	if (status == B_OK) {
+		transaction.Done();
+
+		notify_listener(B_ENTRY_CREATED,volume->ID(),directory->ID(),0,*vnid,name);
+	}
+
+	*_cookie = newCookie;
+	return status;
+}
+
+
+/**	Creates the symbolic link \a name in the directory \a _directory,
+ *	pointing to \a path. Targets shorter than SHORT_SYMLINK_NAME_LENGTH
+ *	are stored in the inode itself, longer ones are written to the
+ *	link's data stream.
+ *	Fails with B_BAD_VALUE on invalid arguments, B_BAD_TYPE if
+ *	\a _directory is no directory, or with the error of the permission
+ *	check, of Inode::Create(), or of writing the link target.
+ */
+
+static int
+bfs_symlink(void *_ns, void *_directory, const char *name, const char *path)
+{
+	FUNCTION();
+
+	if (_ns == NULL || _directory == NULL || path == NULL
+		|| name == NULL || *name == '\0')
+		RETURN_ERROR(B_BAD_VALUE);
+
+	Volume *volume = (Volume *)_ns;
+	Inode *directory = (Inode *)_directory;
+
+	if (!directory->IsDirectory())
+		RETURN_ERROR(B_BAD_TYPE);
+
+	status_t status = directory->CheckPermissions(W_OK);
+	if (status < B_OK)
+		RETURN_ERROR(status);
+
+	Transaction transaction(volume,directory->BlockNumber());
+
+	Inode *newLink;
+	off_t linkID;
+	status = Inode::Create(&transaction,directory,name,S_SYMLINK | 0777,0,0,&linkID,&newLink);
+	if (status < B_OK)
+		RETURN_ERROR(status);
+
+	size_t length = strlen(path);
+	if (length >= SHORT_SYMLINK_NAME_LENGTH) {
+		// too long for the inode itself, the target goes into the stream
+		newLink->Node()->flags |= INODE_LONG_SYMLINK | INODE_LOGGED;
+		// The following call will have to write the inode back, so
+		// we don't have to do that here...
+		status = newLink->WriteAt(&transaction,0,(const uint8 *)path,&length);
+	} else {
+		strcpy(newLink->Node()->short_symlink,path);
+		status = newLink->WriteBack(&transaction);
+	}
+
+	// Inode::Create() left the inode locked
+	put_vnode(volume->ID(),linkID);
+
+	if (status != B_OK)
+		return status;
+
+	transaction.Done();
+
+	notify_listener(B_ENTRY_CREATED,volume->ID(),directory->ID(),0,linkID,name);
+	return B_OK;
+}
+
+
+/**	Is supposed to create a hard link. This is not implemented, so it
+ *	always fails with B_ERROR.
+ */
+
+static int
+bfs_link(void *ns, void *dir, const char *name, void *node)
+{
+	FUNCTION_START(("name = \"%s\"\n",name));
+
+	// ToDo: implement bfs_link()?!?
+	status_t status = B_ERROR;
+
+	return status;
+}
+
+
+/**	Removes the entry \a name from the directory \a _directory, and
+ *	tells listeners about it on success.
+ *	Fails with B_BAD_VALUE on invalid arguments, B_NOT_ALLOWED for the
+ *	"." and ".." entries, or with the error of the permission check or
+ *	Inode::Remove().
+ */
+
+static int
+bfs_unlink(void *_ns, void *_directory, const char *name)
+{
+	FUNCTION_START(("name = \"%s\"\n",name));
+
+	if (_ns == NULL || _directory == NULL || name == NULL || *name == '\0')
+		return B_BAD_VALUE;
+
+	// the special directory entries must stay where they are
+	if (strcmp(name,"..") == 0 || strcmp(name,".") == 0)
+		return B_NOT_ALLOWED;
+
+	Volume *volume = (Volume *)_ns;
+	Inode *directory = (Inode *)_directory;
+
+	status_t status = directory->CheckPermissions(W_OK);
+	if (status < B_OK)
+		return status;
+
+	Transaction transaction(volume,directory->BlockNumber());
+
+	off_t removedID;
+	status = directory->Remove(&transaction,name,&removedID);
+	if (status != B_OK)
+		return status;
+
+	transaction.Done();
+
+	notify_listener(B_ENTRY_REMOVED,volume->ID(),directory->ID(),0,removedID,NULL);
+	return B_OK;
+}
+
+
+/**	Moves the entry \a oldName of the directory \a _oldDir to the entry
+ *	\a newName of the directory \a _newDir (both may be the same
+ *	directory). An already existing target entry is removed first.
+ *	The new entry is inserted before the old one is removed; if a later
+ *	step fails, the function tries to undo its changes, and calls
+ *	Volume::Panic() if even that fails.
+ *	Returns B_BAD_VALUE on invalid arguments, when a directory would be
+ *	moved into one of its own children, or when source and target are
+ *	the same entry; otherwise the error of the failing step.
+ */
+
+int 
+bfs_rename(void *_ns, void *_oldDir, const char *oldName, void *_newDir, const char *newName)
+{
+	FUNCTION_START(("oldDir = %p, oldName = \"%s\", newDir = %p, newName = \"%s\"\n",_oldDir,oldName,_newDir,newName));
+
+	// there may be some more tests needed?!
+	if (_ns == NULL || _oldDir == NULL || _newDir == NULL
+		|| oldName == NULL || *oldName == '\0'
+		|| newName == NULL || *newName == '\0'
+		|| !strcmp(oldName,".") || !strcmp(oldName,"..")
+		|| !strcmp(newName,".") || !strcmp(newName,".."))
+		RETURN_ERROR(B_BAD_VALUE);
+
+	Volume *volume = (Volume *)_ns;
+	Inode *oldDirectory = (Inode *)_oldDir;
+	Inode *newDirectory = (Inode *)_newDir;
+
+	// get the directory's tree, and a pointer to the inode which should be changed
+	BPlusTree *tree;
+	status_t status = oldDirectory->GetTree(&tree);
+	if (status < B_OK)
+		RETURN_ERROR(status);
+
+	off_t id;
+	status = tree->Find((const uint8 *)oldName,strlen(oldName),&id);
+	if (status < B_OK)
+		RETURN_ERROR(status);
+
+	Vnode vnode(volume,id);
+	Inode *inode;
+	if (vnode.Get(&inode) < B_OK)
+		return B_IO_ERROR;
+
+	// Don't move a directory into one of its children - we soar up
+	// from the newDirectory to either the root node or the old
+	// directory, whichever comes first.
+	// If we meet our inode on that way, we have to bail out.
+
+	if (oldDirectory != newDirectory) {
+		vnode_id parent = volume->ToVnode(newDirectory->Parent());
+		vnode_id root = volume->RootNode()->ID();
+		while (true)
+		{
+			if (parent == id)
+				return B_BAD_VALUE;
+			else if (parent == root || parent == oldDirectory->ID())
+				break;
+
+			// this Vnode object is local to one loop iteration
+			Vnode vnode(volume,parent);
+			Inode *parentNode;
+			if (vnode.Get(&parentNode) < B_OK)
+				return B_ERROR;
+
+			parent = volume->ToVnode(parentNode->Parent());
+		}
+	}
+
+	// Everything okay? Then lets get to work...
+
+	// NOTE(review): several early returns below leave the function without
+	// calling transaction.Done(); what happens to the changes made so far
+	// depends on the Transaction destructor - confirm.
+	Transaction transaction(volume,oldDirectory->BlockNumber());
+
+	// First, try to make sure there is nothing that will stop us in
+	// the target directory - since this is the only non-critical
+	// failure, we will test this case first
+	BPlusTree *newTree = tree;
+	if (newDirectory != oldDirectory) {
+		status = newDirectory->GetTree(&newTree);
+		if (status < B_OK)
+			RETURN_ERROR(status);
+	}
+
+	status = newTree->Insert(&transaction,(const uint8 *)newName,strlen(newName),id);
+	if (status == B_NAME_IN_USE) {
+		// If there is already a file with that name, we have to remove
+		// it, as long it's not a directory with files in it
+		off_t clobber;
+		if (newTree->Find((const uint8 *)newName,strlen(newName),&clobber) < B_OK)
+			return B_NAME_IN_USE;
+		// source and target are the very same entry
+		if (clobber == id)
+			return B_BAD_VALUE;
+
+		Vnode vnode(volume,clobber);
+		Inode *other;
+		if (vnode.Get(&other) < B_OK)
+			return B_NAME_IN_USE;
+
+		status = newDirectory->Remove(&transaction,newName,NULL,other->IsDirectory());
+		if (status < B_OK)
+			return status;
+
+		// NOTE(review): listeners are told about the removal before the
+		// transaction is done, i.e. even if the rename fails later on
+		notify_listener(B_ENTRY_REMOVED,volume->ID(),newDirectory->ID(),0,clobber,NULL);
+
+		// the name is free now, so try again
+		status = newTree->Insert(&transaction,(const uint8 *)newName,strlen(newName),id);
+	}
+	if (status < B_OK)
+		return status;
+
+	// If anything fails now, we have to remove the inode from the
+	// new directory in any case to restore the previous state
+	status_t bailStatus = B_OK;
+	
+	// update the name only when they differ
+	bool nameUpdated = false;
+	if (strcmp(oldName,newName)) {
+		status = inode->SetName(&transaction,newName);
+		if (status == B_OK) {
+			Index index(volume);
+			index.UpdateName(&transaction,oldName,newName,inode);
+			nameUpdated = true;
+		}
+	}
+	
+	if (status == B_OK) {
+		status = tree->Remove(&transaction,(const uint8 *)oldName,strlen(oldName),id);
+		if (status == B_OK) {
+			inode->Node()->parent = newDirectory->BlockRun();
+			
+			// if it's a directory, update the parent directory pointer
+			// in its tree if necessary
+			BPlusTree *movedTree = NULL;
+			if (oldDirectory != newDirectory
+				&& inode->IsDirectory()
+				&& (status = inode->GetTree(&movedTree)) == B_OK)
+				status = movedTree->Replace(&transaction,(const uint8 *)"..",2,newDirectory->ID());
+
+			if (status == B_OK) {
+				status = inode->WriteBack(&transaction);
+				if (status == B_OK)	{
+					// this is the only successful way out of this function
+					transaction.Done();
+
+					notify_listener(B_ENTRY_MOVED,volume->ID(),oldDirectory->ID(),newDirectory->ID(),id,newName);
+					return B_OK;
+				}
+			}
+			// Those better don't fail, or we switch to a read-only
+			// device for safety reasons (Volume::Panic() does this
+			// for us)
+			// Anyway, if we overwrote a file in the target directory
+			// this is lost now (only in-memory, not on-disk)...
+			bailStatus = tree->Insert(&transaction,(const uint8 *)oldName,strlen(oldName),id);
+			if (movedTree != NULL)
+				movedTree->Replace(&transaction,(const uint8 *)"..",2,oldDirectory->ID());
+		}
+	}
+	// Undo the remaining changes in reverse order: first the name of the
+	// inode, then its entry in the target directory.
+	// NOTE(review): the Index::UpdateName() call from above is not
+	// reverted here.
+	if (bailStatus == B_OK && nameUpdated)
+		bailStatus = inode->SetName(&transaction,oldName);
+
+	if (bailStatus == B_OK)
+		bailStatus = newTree->Remove(&transaction,(const uint8 *)newName,strlen(newName),id);
+
+	if (bailStatus < B_OK)
+		volume->Panic();
+
+	// "status" still holds the error that made us bail out
+	return status;
+}
+
+
+/**	Opens the node \a _node with the mode \a omode, and returns a newly
+ *	allocated file_cookie in \a _cookie. If O_TRUNC is set, the file is
+ *	cut down to a size of zero before.
+ *	Fails with B_BAD_VALUE on NULL arguments, B_NO_MEMORY if the cookie
+ *	could not be allocated, or with the error of the permission check
+ *	or of truncating the file.
+ */
+
+static int
+bfs_open(void *_ns, void *_node, int omode, void **_cookie)
+{
+	FUNCTION();
+	if (_ns == NULL || _node == NULL || _cookie == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	Volume *volume = (Volume *)_ns;
+	Inode *inode = (Inode *)_node;
+
+	// opening a directory read-only is allowed, although you can't read
+	// any data from it.
+	if (inode->IsDirectory() && (omode & O_RWMASK) != 0) {
+		omode &= ~O_RWMASK;
+		// ToDo: for compatibility reasons, we don't return an error here...
+		// e.g. "copyattr" tries to do that
+		//return B_IS_A_DIRECTORY;
+	}
+
+	status_t status = inode->CheckPermissions(oModeToAccess(omode));
+	if (status < B_OK)
+		RETURN_ERROR(status);
+
+	// we could actually use the cookie to keep track of:
+	//	- the last block_run
+	//	- the location in the data_stream (indirect, double indirect,
+	//	  position in block_run array)
+	//
+	// This could greatly speed up continuous reads of big files, especially
+	// in the indirect block section.
+
+	file_cookie *newCookie = (file_cookie *)malloc(sizeof(file_cookie));
+	if (newCookie == NULL)
+		RETURN_ERROR(B_NO_MEMORY); 
+
+	// the open mode is needed by e.g. bfs_write() for O_APPEND; size and
+	// time are used there to decide when to send notifications
+	newCookie->open_mode = omode;
+	newCookie->last_size = inode->Size();
+	newCookie->last_notification = system_time();
+
+	// Should we truncate the file?
+	if ((omode & O_TRUNC) != 0) {
+		Transaction transaction(volume,inode->BlockNumber());
+		WriteLocked locked(inode->Lock());
+
+		status = inode->SetFileSize(&transaction,0);
+		if (status < B_OK) {
+			// bfs_free_cookie() is only called if this function is successful
+			free(newCookie);
+			return status;
+		}
+
+		transaction.Done();
+	}
+
+	*_cookie = newCookie;
+	return B_OK;
+}
+
+
+/**	Reads up to \a *_length bytes at offset \a pos from the node
+ *	\a _node into \a buffer, while holding the inode's read lock.
+ *	Nodes without a user accessable stream are rejected with
+ *	B_BAD_VALUE, and \a *_length is set to zero in that case.
+ */
+
+static int
+bfs_read(void *_ns, void *_node, void *_cookie, off_t pos, void *buffer, size_t *_length)
+{
+	//FUNCTION();
+	Inode *inode = (Inode *)_node;
+
+	if (inode->HasUserAccessableStream()) {
+		ReadLocked locked(inode->Lock());
+		return inode->ReadAt(pos,(uint8 *)buffer,_length);
+	}
+
+	// there is nothing a user could read from this node
+	*_length = 0;
+	RETURN_ERROR(B_BAD_VALUE);
+}
+
+
+/**	Writes \a *_length bytes from \a buffer to the node \a _node at
+ *	offset \a pos, or at the end of the file if it was opened with
+ *	O_APPEND. The inode's write lock is held while writing.
+ *	From time to time, listeners are notified about a changed file
+ *	size, and dirty cache blocks are flushed.
+ *	Returns B_BAD_VALUE for nodes without a user accessable stream,
+ *	B_ERROR if the lock could not be acquired (\a *_length is set to
+ *	zero in both cases), or the result of Inode::WriteAt().
+ */
+
+int 
+bfs_write(void *_ns, void *_node, void *_cookie, off_t pos, const void *buffer, size_t *_length)
+{
+	//FUNCTION();
+	// uncomment to be more robust against a buggy vnode layer ;-)
+	//if (_ns == NULL || _node == NULL || _cookie == NULL)
+	//	return B_BAD_VALUE;
+
+	Volume *volume = (Volume *)_ns;
+	Inode *inode = (Inode *)_node;
+
+	if (!inode->HasUserAccessableStream()) {
+		*_length = 0;
+		RETURN_ERROR(B_BAD_VALUE);
+	}
+
+	file_cookie *cookie = (file_cookie *)_cookie;
+
+	WriteLocked locked(inode->Lock());
+	if (locked.IsLocked() < B_OK) {
+		// nothing has been written
+		*_length = 0;
+		RETURN_ERROR(B_ERROR);
+	}
+
+	// The end of the file has to be determined while we hold the write
+	// lock - otherwise, another writer could grow the file in between,
+	// and we would overwrite its data instead of appending to it
+	if (cookie->open_mode & O_APPEND)
+		pos = inode->Size();
+
+	Transaction transaction;
+		// We are not starting the transaction here, since
+		// it might not be needed at all
+
+	status_t status = inode->WriteAt(&transaction,pos,(const uint8 *)buffer,_length);
+
+	if (status == B_OK)
+		transaction.Done();
+
+	// periodically notify if the file size has changed
+	if (cookie->last_size != inode->Size()
+		&& system_time() > cookie->last_notification + INODE_NOTIFICATION_INTERVAL) {
+		notify_listener(B_STAT_CHANGED,volume->ID(),0,0,inode->ID(),NULL);
+		cookie->last_size = inode->Size();
+		cookie->last_notification = system_time();
+	}
+
+	// This will flush the dirty blocks to disk from time to time.
+	// It's done here and not in Inode::WriteAt() so that it won't
+	// add to the duration of a transaction - it might even be a
+	// good idea to offload those calls to another thread
+	volume->WriteCachedBlocksIfNecessary();
+
+	return status;
+}
+
+
+/**	Finishes the use of an open file - the cookie itself is NOT freed
+ *	here, that is left to bfs_free_cookie().
+ *	If any of the O_RWMASK bits are set in the cookie's open mode, the
+ *	preallocated blocks are trimmed, the size and last_modified indices
+ *	are updated, and a B_STAT_CHANGED notification is sent.
+ */
+
+static int
+bfs_close(void *_ns, void *_node, void *_cookie)
+{
+	FUNCTION();
+	if (_ns == NULL || _node == NULL || _cookie == NULL)
+		return B_BAD_VALUE;
+
+	file_cookie *cookie = (file_cookie *)_cookie;
+
+	// nothing to clean up if none of the O_RWMASK bits are set
+	if ((cookie->open_mode & O_RWMASK) == 0)
+		return B_OK;
+
+	Volume *volume = (Volume *)_ns;
+	Inode *inode = (Inode *)_node;
+
+	Transaction transaction(volume,inode->BlockNumber());
+
+	// give back the blocks that were preallocated for this file
+	status_t trimStatus = inode->Trim(&transaction);
+	if (trimStatus < B_OK)
+		FATAL(("Could not trim preallocated blocks!"));
+
+	// the indices are updated regardless of the trim result, but the
+	// transaction is only completed if trimming succeeded
+	Index index(volume);
+	index.UpdateSize(&transaction,inode);
+	index.UpdateLastModified(&transaction,inode);
+
+	if (trimStatus == B_OK)
+		transaction.Done();
+
+	notify_listener(B_STAT_CHANGED,volume->ID(),0,0,inode->ID(),NULL);
+
+	return B_OK;
+}
+
+
+/**	Releases the file cookie that was allocated with malloc() when the
+ *	file was opened. A NULL cookie is accepted.
+ */
+
+static int
+bfs_free_cookie(void * /*ns*/, void * /*node*/, void *cookie)
+{
+	FUNCTION();
+
+	// free() ignores NULL pointers, so no extra check is needed
+	free(cookie);
+
+	return B_OK;
+}
+
+
+/**	Checks whether the requested access mode is permitted for the inode
+ *	by asking Inode::CheckPermissions(). Negative results of that check
+ *	are passed on to the caller, everything else is reported as B_OK.
+ */
+
+static int
+bfs_access(void *_ns, void *_node, int accessMode)
+{
+	FUNCTION();
+
+	if (_ns == NULL || _node == NULL)
+		return B_BAD_VALUE;
+
+	status_t result = ((Inode *)_node)->CheckPermissions(accessMode);
+	if (result >= B_OK)
+		return B_OK;
+
+	RETURN_ERROR(result);
+}
+
+
+/**	Copies the target path of a symbolic link into "buffer".
+ *	Links flagged with INODE_LONG_SYMLINK are read from the inode's data
+ *	stream, all others from the short_symlink field of the inode.
+ *	On success, *bufferSize is set to the length of the link (which may
+ *	be larger than what has been copied). Inodes that are not symbolic
+ *	links are rejected with B_BAD_VALUE.
+ */
+
+static int
+bfs_read_link(void *_ns, void *_node, char *buffer, size_t *bufferSize)
+{
+	FUNCTION();
+
+	Inode *link = (Inode *)_node;
+
+	if (!link->IsSymLink())
+		RETURN_ERROR(B_BAD_VALUE);
+
+	if ((link->Flags() & INODE_LONG_SYMLINK) == 0) {
+		// short symlink: the path is stored in the inode itself
+		size_t linkLength = strlen((char *)&link->Node()->short_symlink);
+
+		// never copy more than the caller's buffer can hold
+		uint32 copyLength = linkLength;
+		if (copyLength > *bufferSize)
+			copyLength = *bufferSize;
+
+		memcpy(buffer, link->Node()->short_symlink, copyLength);
+		*bufferSize = linkLength;
+
+		return B_OK;
+	}
+
+	// long symlink: the path is stored in the data stream
+	status_t status = link->ReadAt(0, (uint8 *)buffer, bufferSize);
+	if (status < B_OK)
+		RETURN_ERROR(status);
+
+	*bufferSize = link->Size();
+	return B_OK;
+}
+
+
+//	#pragma mark -
+//	Directory functions
+
+
+/**	Creates the directory "name" in the directory "_directory".
+ *	The parent must be a directory (B_BAD_TYPE otherwise) and must pass
+ *	the W_OK permission check. On success, a B_ENTRY_CREATED notification
+ *	is sent for the new directory.
+ */
+
+int 
+bfs_mkdir(void *_ns, void *_directory, const char *name, int mode)
+{
+	// "mode" is an int, so it has to be printed with %d, not %ld
+	FUNCTION_START(("name = \"%s\", perms = %d\n",name,mode));
+
+	if (_ns == NULL || _directory == NULL
+		|| name == NULL || *name == '\0')
+		RETURN_ERROR(B_BAD_VALUE);
+
+	Volume *volume = (Volume *)_ns;
+	Inode *directory = (Inode *)_directory;
+
+	if (!directory->IsDirectory())
+		RETURN_ERROR(B_BAD_TYPE);
+
+	status_t status = directory->CheckPermissions(W_OK);
+	if (status < B_OK)
+		RETURN_ERROR(status);
+
+	Transaction transaction(volume,directory->BlockNumber());
+
+	// Inode::Create() locks the inode if we pass the "id" parameter, but we
+	// need it anyway
+	off_t id;
+	status = Inode::Create(&transaction,directory,name,S_DIRECTORY | (mode & S_IUMSK),0,0,&id);
+	if (status == B_OK) {
+		// release the new inode again before completing the transaction
+		put_vnode(volume->ID(),id);
+		transaction.Done();
+
+		notify_listener(B_ENTRY_CREATED,volume->ID(),directory->ID(),0,id,name);
+	}
+
+	return status;
+}
+
+
+/**	Removes the directory "name" from the directory "_directory" and
+ *	sends a B_ENTRY_REMOVED notification if that worked.
+ */
+
+int 
+bfs_rmdir(void *_ns, void *_directory, const char *name)
+{
+	FUNCTION_START(("name = \"%s\"\n",name));
+
+	if (_ns == NULL || _directory == NULL || name == NULL || *name == '\0')
+		return B_BAD_VALUE;
+
+	Volume *volume = (Volume *)_ns;
+	Inode *parent = (Inode *)_directory;
+
+	Transaction transaction(volume,parent->BlockNumber());
+
+	off_t removedID;
+	status_t status = parent->Remove(&transaction,name,&removedID,true);
+	if (status != B_OK)
+		return status;
+
+	transaction.Done();
+	notify_listener(B_ENTRY_REMOVED,volume->ID(),parent->ID(),0,removedID,NULL);
+
+	return B_OK;
+}
+
+
+/**	Opens a directory for reading: a TreeIterator on the directory's
+ *	b+tree is created and handed back as the cookie, which is then used
+ *	to step through the directory entries.
+ *	Fails with B_BAD_VALUE if the inode is not a directory or if its
+ *	tree cannot be retrieved.
+ */
+
+static int
+bfs_open_dir(void *_ns, void *_node, void **_cookie)
+{
+	FUNCTION();
+
+	if (_ns == NULL || _node == NULL || _cookie == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	Inode *directory = (Inode *)_node;
+	if (!directory->IsDirectory())
+		RETURN_ERROR(B_BAD_VALUE);
+
+	BPlusTree *directoryTree;
+	if (directory->GetTree(&directoryTree) != B_OK)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	TreeIterator *entries = new TreeIterator(directoryTree);
+	if (entries == NULL)
+		RETURN_ERROR(B_NO_MEMORY);
+
+	*_cookie = entries;
+	return B_OK;
+}
+
+
+/**	Reads the next entry of an open directory into "dirent".
+ *	*num is set to 1 if an entry has been returned, and to 0 if the
+ *	iterator reported B_ENTRY_NOT_FOUND (i.e. there are no more entries).
+ *	d_reclen is set to the length reported by the iterator.
+ */
+
+static int
+bfs_read_dir(void *_ns, void *_node, void *_cookie, long *num, 
+			struct dirent *dirent, size_t bufferSize)
+{
+	FUNCTION();
+
+	TreeIterator *iterator = (TreeIterator *)_cookie;
+	if (iterator == NULL || dirent == NULL || num == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	// The name is stored behind the fixed fields of the dirent, so only
+	// the remainder of the buffer may be filled with it.
+	// NOTE(review): this assumes that "bufferSize" is the size of the whole
+	// dirent buffer (as in the BeOS file system API), and that
+	// GetNextEntry() treats its third argument as the capacity of the name
+	// buffer - confirm against TreeIterator::GetNextEntry().
+	size_t nameOffset = (size_t)((char *)dirent->d_name - (char *)dirent);
+	if (bufferSize <= nameOffset)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	uint16 length;
+	vnode_id id;
+	status_t status = iterator->GetNextEntry(dirent->d_name,&length,
+		bufferSize - nameOffset,&id);
+	if (status == B_ENTRY_NOT_FOUND) {
+		*num = 0;
+		return B_OK;
+	} else if (status != B_OK)
+		RETURN_ERROR(status);
+
+	Volume *volume = (Volume *)_ns;
+
+	dirent->d_dev = volume->ID();
+	dirent->d_ino = id;
+	dirent->d_reclen = length;
+
+	*num = 1;
+	return B_OK;
+}
+
+		
+/**	Rewinds the TreeIterator stored in the cookie, and returns the
+ *	result of TreeIterator::Rewind(). A NULL cookie is rejected with
+ *	B_BAD_VALUE.
+ */
+
+static int
+bfs_rewind_dir(void * /*ns*/, void * /*node*/, void *_cookie)
+{
+	FUNCTION();
+	TreeIterator *entries = (TreeIterator *)_cookie;
+
+	if (entries != NULL)
+		return entries->Rewind();
+
+	RETURN_ERROR(B_BAD_VALUE);
+}
+
+
+/**	Closes a directory. There is nothing to do here: none of the
+ *	arguments is used, and the function always returns B_OK.
+ */
+
+static int		
+bfs_close_dir(void * /*ns*/, void * /*node*/, void * /*_cookie*/)
+{
+	FUNCTION();
+	// Do whatever you need to to close a directory, but DON'T free the cookie!
+	// (the cookie is deleted in bfs_free_dir_cookie())
+	return B_OK;
+}
+
+
+/**	Deletes the TreeIterator that serves as the directory cookie.
+ *	A NULL cookie is rejected with B_BAD_VALUE.
+ */
+
+static int
+bfs_free_dir_cookie(void *ns, void *node, void *_cookie)
+{
+	if (_cookie == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	delete (TreeIterator *)_cookie;
+	return B_OK;
+}
+
+
+//	#pragma mark -
+//	Attribute functions
+
+
+/**	Opens the attribute directory of an inode: an AttributeIterator for
+ *	the inode is created and returned as the cookie.
+ */
+
+int 
+bfs_open_attrdir(void *_ns, void *_node, void **cookie)
+{
+	FUNCTION();
+
+	Inode *owner = (Inode *)_node;
+	if (owner == NULL || owner->Node() == NULL)
+		RETURN_ERROR(B_ERROR);
+
+	AttributeIterator *attributes = new AttributeIterator(owner);
+	if (attributes == NULL)
+		RETURN_ERROR(B_NO_MEMORY);
+
+	*cookie = attributes;
+	return B_OK;
+}
+
+
+/**	Closes an attribute directory. Nothing has to be done here: the
+ *	arguments are not used, and the function always returns B_OK.
+ *	The cookie is deleted in bfs_free_attrdir_cookie().
+ */
+
+int
+bfs_close_attrdir(void *ns, void *node, void *cookie)
+{
+	FUNCTION();
+	return B_OK;
+}
+
+
+/**	Deletes the AttributeIterator that serves as the cookie of an
+ *	attribute directory. A NULL cookie is rejected with B_BAD_VALUE.
+ */
+
+int
+bfs_free_attrdir_cookie(void *ns, void *node, void *_cookie)
+{
+	FUNCTION();
+
+	if (_cookie == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	delete (AttributeIterator *)_cookie;
+	return B_OK;
+}
+
+
+/**	Rewinds the AttributeIterator stored in the cookie, and passes on
+ *	the result of AttributeIterator::Rewind(). A NULL cookie is rejected
+ *	with B_BAD_VALUE.
+ */
+
+int
+bfs_rewind_attrdir(void *_ns, void *_node, void *_cookie)
+{
+	FUNCTION();
+
+	if (_cookie == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	RETURN_ERROR(((AttributeIterator *)_cookie)->Rewind());
+}
+
+
+/**	Reads the next entry of an attribute directory into "dirent".
+ *	*num is set to 1 if an entry has been returned, and to 0 if the
+ *	iterator reported B_ENTRY_NOT_FOUND. The name and d_ino are filled in
+ *	by AttributeIterator::GetNext(); "bufsize" is not used.
+ */
+
+int 
+bfs_read_attrdir(void *_ns, void *node, void *_cookie, long *num, struct dirent *dirent, size_t bufsize)
+{
+	FUNCTION();
+	AttributeIterator *attributes = (AttributeIterator *)_cookie;
+
+	if (attributes == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	uint32 attributeType;
+	size_t nameLength;
+	status_t status = attributes->GetNext(dirent->d_name,&nameLength,&attributeType,&dirent->d_ino);
+
+	if (status == B_OK) {
+		// complete the entry with the remaining dirent fields
+		dirent->d_dev = ((Volume *)_ns)->ID();
+		dirent->d_reclen = nameLength;
+
+		*num = 1;
+		return B_OK;
+	}
+
+	if (status == B_ENTRY_NOT_FOUND) {
+		// no more entries - that is not an error
+		*num = 0;
+		return B_OK;
+	}
+
+	RETURN_ERROR(status);
+}
+
+
+/**	Removes the attribute "name" from the inode, provided that the
+ *	W_OK permission check succeeds. A B_ATTR_CHANGED notification is sent
+ *	if the attribute could be removed.
+ */
+
+int
+bfs_remove_attr(void *_ns, void *_node, const char *name)
+{
+	FUNCTION_START(("name = \"%s\"\n",name));
+
+	if (_ns == NULL || _node == NULL || name == NULL)
+		return B_BAD_VALUE;
+
+	Inode *inode = (Inode *)_node;
+
+	status_t status = inode->CheckPermissions(W_OK);
+	if (status < B_OK)
+		return status;
+
+	Volume *volume = (Volume *)_ns;
+	Transaction transaction(volume,inode->BlockNumber());
+
+	status = inode->RemoveAttribute(&transaction,name);
+	if (status == B_OK) {
+		// only complete the transaction and notify when it worked
+		transaction.Done();
+		notify_listener(B_ATTR_CHANGED,volume->ID(),0,0,inode->ID(),name);
+	}
+
+	RETURN_ERROR(status);
+}
+
+
+/**	Renaming attributes is not implemented yet: this function only
+ *	prints its arguments (via FUNCTION_START()), and always fails with
+ *	B_ENTRY_NOT_FOUND.
+ */
+
+int
+bfs_rename_attr(void *ns, void *node, const char *oldname,const char *newname)
+{
+	FUNCTION_START(("name = \"%s\",to = \"%s\"\n",oldname,newname));
+
+	// ToDo: implement bfs_rename_attr()!
+	// Does anybody need this? :-)
+
+	RETURN_ERROR(B_ENTRY_NOT_FOUND);
+}
+
+
+/**	Retrieves type and size of the attribute "name" of an inode.
+ *	The attribute is first looked up in the inode's small data section
+ *	and, if it is not found there, in its attribute directory.
+ */
+
+int
+bfs_stat_attr(void *ns, void *_node, const char *name,struct attr_info *attrInfo)
+{
+	FUNCTION_START(("name = \"%s\"\n",name));
+
+	Inode *inode = (Inode *)_node;
+	if (inode == NULL || inode->Node() == NULL)
+		RETURN_ERROR(B_ERROR);
+
+	// first try: the small data section (only if we can get its lock)
+	bool foundInSmallData = false;
+	if (inode->SmallDataLock().Lock() == B_OK) {
+		small_data *item = inode->FindSmallData(name);
+		if (item != NULL) {
+			attrInfo->type = item->type;
+			attrInfo->size = item->data_size;
+			foundInSmallData = true;
+		}
+		inode->SmallDataLock().Unlock();
+	}
+	if (foundInSmallData)
+		return B_OK;
+
+	// second try: search in the attribute directory
+	Inode *attribute;
+	status_t status = inode->GetAttribute(name,&attribute);
+	if (status != B_OK)
+		RETURN_ERROR(status);
+
+	attrInfo->type = attribute->Node()->type;
+	attrInfo->size = attribute->Node()->data.size;
+
+	inode->ReleaseAttribute(attribute);
+	return B_OK;
+}
+
+
+/**	Writes *_length bytes from "buffer" at offset "pos" into the
+ *	attribute "name" of the inode, provided that the W_OK permission
+ *	check succeeds. The file name attribute as well as the names of the
+ *	reserved indices ("name", "last_modified", and "size") are rejected
+ *	with B_NOT_ALLOWED. On success, a B_ATTR_CHANGED notification is sent.
+ */
+
+int
+bfs_write_attr(void *_ns, void *_node, const char *name, int type,const void *buffer, size_t *_length, off_t pos)
+{
+	FUNCTION_START(("name = \"%s\"\n",name));
+
+	if (_ns == NULL || _node == NULL || name == NULL || *name == '\0')
+		RETURN_ERROR(B_BAD_VALUE);
+
+	// Writing the name attribute using this function is not allowed,
+	// also using the reserved indices name, last_modified, and size
+	// shouldn't be allowed.
+	const bool isFileName = (name[0] == FILE_NAME_NAME && name[1] == '\0');
+	if (isFileName
+		|| strcmp(name,"name") == 0
+		|| strcmp(name,"last_modified") == 0
+		|| strcmp(name,"size") == 0)
+		RETURN_ERROR(B_NOT_ALLOWED);
+
+	Inode *inode = (Inode *)_node;
+
+	status_t status = inode->CheckPermissions(W_OK);
+	if (status < B_OK)
+		return status;
+
+	Volume *volume = (Volume *)_ns;
+	Transaction transaction(volume,inode->BlockNumber());
+
+	status = inode->WriteAttribute(&transaction,name,type,pos,(const uint8 *)buffer,_length);
+	if (status != B_OK)
+		return status;
+
+	transaction.Done();
+	notify_listener(B_ATTR_CHANGED,volume->ID(),0,0,inode->ID(),name);
+
+	return B_OK;
+}
+
+
+/**	Reads from the attribute "name" of the inode at offset "pos" into
+ *	"buffer", provided that the R_OK permission check succeeds.
+ *	The actual work is done by Inode::ReadAttribute().
+ */
+
+int
+bfs_read_attr(void *_ns, void *_node, const char *name, int type,void *buffer, size_t *_length, off_t pos)
+{
+	FUNCTION();
+
+	Inode *inode = (Inode *)_node;
+	if (inode == NULL || name == NULL || *name == '\0' || buffer == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	status_t permission = inode->CheckPermissions(R_OK);
+	if (permission < B_OK)
+		return permission;
+
+	return inode->ReadAttribute(name,type,pos,(uint8 *)buffer,_length);
+}
+
+
+//	#pragma mark -
+//	Index functions
+
+
+/**	Opens the index directory of the volume.
+ *	Fails with B_ENTRY_NOT_FOUND if the volume has no indices node.
+ */
+
+int 
+bfs_open_indexdir(void *_ns, void **_cookie)
+{
+	FUNCTION();
+
+	if (_ns == NULL || _cookie == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	if (((Volume *)_ns)->IndicesNode() == NULL)
+		RETURN_ERROR(B_ENTRY_NOT_FOUND);
+
+	// The indices root node is just a directory, and the Volume object
+	// stores a pointer to it (for exactly that reason) - therefore, we
+	// can simply reuse the directory traversal functions.
+
+	RETURN_ERROR(bfs_open_dir(_ns,((Volume *)_ns)->IndicesNode(),_cookie));
+}
+
+
+/**	Closes the index directory by forwarding the call to bfs_close_dir()
+ *	with the volume's indices node.
+ */
+
+int 
+bfs_close_indexdir(void *_ns, void *_cookie)
+{
+	FUNCTION();
+
+	if (_ns == NULL || _cookie == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	RETURN_ERROR(bfs_close_dir(_ns,((Volume *)_ns)->IndicesNode(),_cookie));
+}
+
+
+/**	Frees the cookie of the index directory by forwarding the call to
+ *	bfs_free_dir_cookie() with the volume's indices node; "_node" is
+ *	not used.
+ */
+
+int 
+bfs_free_indexdir_cookie(void *_ns, void *_node, void *_cookie)
+{
+	FUNCTION();
+
+	if (_ns == NULL || _cookie == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	RETURN_ERROR(bfs_free_dir_cookie(_ns,((Volume *)_ns)->IndicesNode(),_cookie));
+}
+
+
+/**	Rewinds the index directory by forwarding the call to
+ *	bfs_rewind_dir() with the volume's indices node.
+ */
+
+int 
+bfs_rewind_indexdir(void *_ns, void *_cookie)
+{
+	FUNCTION();
+
+	if (_ns == NULL || _cookie == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	RETURN_ERROR(bfs_rewind_dir(_ns,((Volume *)_ns)->IndicesNode(),_cookie));
+}
+
+
+/**	Reads the next entry of the index directory by forwarding the call
+ *	to bfs_read_dir() with the volume's indices node.
+ */
+
+int 
+bfs_read_indexdir(void *_ns, void *_cookie, long *num, struct dirent *dirent, size_t bufferSize)
+{
+	FUNCTION();
+
+	if (_ns == NULL || _cookie == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	RETURN_ERROR(bfs_read_dir(_ns,((Volume *)_ns)->IndicesNode(),_cookie,
+		num,dirent,bufferSize));
+}
+
+
+/**	Creates the index "name" of the given type on the volume.
+ *	Fails with B_READ_ONLY_DEVICE on read-only volumes, and with
+ *	B_NOT_ALLOWED if the effective user ID is not 0. The "flags"
+ *	argument is only printed, it is not evaluated here.
+ */
+
+int 
+bfs_create_index(void *_ns, const char *name, int type, int flags)
+{
+	// "type" and "flags" are ints, so they have to be printed with %d
+	FUNCTION_START(("name = \"%s\", type = %d, flags = %d\n",name,type,flags));
+	if (_ns == NULL || name == NULL || *name == '\0')
+		return B_BAD_VALUE;
+
+	Volume *volume = (Volume *)_ns;
+
+	if (volume->IsReadOnly())
+		return B_READ_ONLY_DEVICE;
+
+	// only root users are allowed to create indices
+	if (geteuid() != 0)
+		return B_NOT_ALLOWED;
+
+	Transaction transaction(volume,volume->Indices());
+
+	Index index(volume);
+	status_t status = index.Create(&transaction,name,type);
+
+	// the transaction is only completed if the index could be created
+	if (status == B_OK)
+		transaction.Done();
+
+	RETURN_ERROR(status);
+}
+
+
+/**	Removes the index "name" from the volume's index directory.
+ *	Fails with B_READ_ONLY_DEVICE on read-only volumes, with
+ *	B_NOT_ALLOWED if the effective user ID is not 0, and with
+ *	B_ENTRY_NOT_FOUND if the volume has no indices node.
+ */
+
+int 
+bfs_remove_index(void *_ns, const char *name)
+{
+	FUNCTION();
+	if (_ns == NULL || name == NULL || *name == '\0')
+		return B_BAD_VALUE;
+
+	Volume *volume = (Volume *)_ns;
+
+	if (volume->IsReadOnly())
+		return B_READ_ONLY_DEVICE;
+
+	// only root users are allowed to remove indices
+	if (geteuid() != 0)
+		return B_NOT_ALLOWED;
+
+	Inode *indexDirectory = volume->IndicesNode();
+	if (indexDirectory == NULL)
+		return B_ENTRY_NOT_FOUND;
+
+	Transaction transaction(volume,volume->Indices());
+
+	status_t result = indexDirectory->Remove(&transaction,name);
+	if (result == B_OK)
+		transaction.Done();
+
+	RETURN_ERROR(result);
+}
+
+
+/**	Renaming indices is not implemented: this function only prints its
+ *	arguments (via FUNCTION_START()), and always fails with
+ *	B_ENTRY_NOT_FOUND.
+ */
+
+int 
+bfs_rename_index(void *ns, const char *oldname, const char *newname)
+{
+	FUNCTION_START(("from = %s, to = %s\n",oldname,newname));
+
+	// ToDo: implement bfs_rename_index()?!
+	// Well, renaming an index doesn't make that much sense, as you
+	// would also need to remove every file in it (or the index
+	// would contain wrong data)
+	// But in that case, you can better remove the old one, and
+	// create a new one...
+	// There is also no way to call this function from a userland
+	// application.
+
+	RETURN_ERROR(B_ENTRY_NOT_FOUND);
+}
+
+
+/**	Fills "indexInfo" with the type of the index "name", and with size,
+ *	modification/creation time, and owner taken from the index' inode.
+ *	The stored time values are shifted right by INODE_TIME_SHIFT before
+ *	they are returned as time_t.
+ */
+
+int 
+bfs_stat_index(void *_ns, const char *name, struct index_info *indexInfo)
+{
+	FUNCTION_START(("name = %s\n",name));
+	if (_ns == NULL || name == NULL || indexInfo == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	Index index((Volume *)_ns);
+	status_t status = index.SetTo(name);
+	if (status < B_OK)
+		RETURN_ERROR(status);
+
+	bfs_inode *indexNode = index.Node()->Node();
+
+	// type and size
+	indexInfo->type = index.Type();
+	indexInfo->size = indexNode->data.size;
+
+	// time stamps
+	indexInfo->modification_time = (time_t)(indexNode->last_modified_time >> INODE_TIME_SHIFT);
+	indexInfo->creation_time = (time_t)(indexNode->create_time >> INODE_TIME_SHIFT);
+
+	// ownership
+	indexInfo->uid = indexNode->uid;
+	indexInfo->gid = indexNode->gid;
+
+	return B_OK;
+}
+
+
+//	#pragma mark -
+//	Query functions
+
+
+/**	Opens a query: the query string is parsed into an Expression, and a
+ *	Query object for it is returned as the cookie. If B_LIVE_QUERY is set
+ *	in "flags", the query is put into live mode using "port" and "token".
+ *	Fails with B_BAD_VALUE if the query string could not be parsed.
+ */
+
+int 
+bfs_open_query(void *_ns,const char *queryString,ulong flags,port_id port,long token,void **cookie)
+{
+	FUNCTION();
+	if (_ns == NULL || queryString == NULL || cookie == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	PRINT(("query = \"%s\", flags = %lu, port_id = %ld, token = %ld\n",queryString,flags,port,token));
+
+	// parse the query string first
+	Expression *parsed = new Expression((char *)queryString);
+	if (parsed == NULL)
+		RETURN_ERROR(B_NO_MEMORY);
+
+	if (parsed->InitCheck() < B_OK) {
+		FATAL(("Could not parse query, stopped at: \"%s\"\n",parsed->Position()));
+		delete parsed;
+		RETURN_ERROR(B_BAD_VALUE);
+	}
+
+	// the query refers to the expression, it must be freed if we fail
+	Query *liveOrStaticQuery = new Query((Volume *)_ns,parsed);
+	if (liveOrStaticQuery == NULL) {
+		delete parsed;
+		RETURN_ERROR(B_NO_MEMORY);
+	}
+
+	if (flags & B_LIVE_QUERY)
+		liveOrStaticQuery->SetLiveMode(port,token);
+
+	*cookie = (void *)liveOrStaticQuery;
+
+	return B_OK;
+}
+
+
+/**	Closes a query. Nothing has to be done here: the arguments are not
+ *	used, and the function always returns B_OK.
+ *	The query is deleted in bfs_free_query_cookie().
+ */
+
+int
+bfs_close_query(void *ns, void *cookie)
+{
+	FUNCTION();
+	return B_OK;
+}
+
+
+/**	Deletes the Query object stored in the cookie together with the
+ *	Expression it was created with. A NULL cookie is rejected with
+ *	B_BAD_VALUE.
+ */
+
+int
+bfs_free_query_cookie(void *ns, void *node, void *cookie)
+{
+	FUNCTION();
+
+	Query *query = (Query *)cookie;
+	if (query == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	// get hold of the expression before the query is gone
+	Expression *parsed = query->GetExpression();
+
+	delete query;
+	delete parsed;
+
+	return B_OK;
+}
+
+
+/**	Reads the next entry matching the query into "dirent".
+ *	*num is set to 1 if an entry has been returned, and to 0 if
+ *	Query::GetNextEntry() reported B_ENTRY_NOT_FOUND. Any other
+ *	status is returned as is.
+ */
+
+int
+bfs_read_query(void */*ns*/,void *cookie,long *num,struct dirent *dirent,size_t bufferSize)
+{
+	FUNCTION();
+
+	if (cookie == NULL)
+		RETURN_ERROR(B_BAD_VALUE);
+
+	status_t status = ((Query *)cookie)->GetNextEntry(dirent,bufferSize);
+	switch (status) {
+		case B_OK:
+			*num = 1;
+			return B_OK;
+
+		case B_ENTRY_NOT_FOUND:
+			// no more matches - that is not an error
+			*num = 0;
+			return B_OK;
+
+		default:
+			return status;
+	}
+}
+
diff --git a/src/add-ons/kernel/file_systems/bfs/lock.h b/src/add-ons/kernel/file_systems/bfs/lock.h
new file mode 100644
index 0000000000..b05adaa21b
--- /dev/null
+++ b/src/add-ons/kernel/file_systems/bfs/lock.h
@@ -0,0 +1,47 @@
+/*
+	Copyright 1999-2001, Be Incorporated.   All Rights Reserved.
+	This file may be used under the terms of the Be Sample Code License.
+*/
+
+#ifndef _LOCK_H
+#define _LOCK_H
+
+#include <BeBuild.h>
+
+#include <OS.h>
+
+#ifdef __cplusplus
+	extern "C" {
+#else
+	typedef struct lock lock;
+	typedef struct mlock mlock;
+#endif
+
+
+/* Lock that combines a semaphore with a counter; see the LOCK() and
+ * UNLOCK() macros below for how the two fields are used together.
+ */
+struct lock {
+	sem_id		s;	/* semaphore acquired/released by LOCK()/UNLOCK() */
+	long		c;	/* counter, only changed via atomic_add() by the macros */
+};
+
+/* Lock consisting of a semaphore only; LOCKM() and UNLOCKM() below
+ * acquire and release it with a caller supplied count.
+ */
+struct mlock {
+	sem_id		s;	/* semaphore used by LOCKM()/UNLOCKM() */
+};
+
+extern _IMPEXP_KERNEL int	new_lock(lock *l, const char *name);
+extern _IMPEXP_KERNEL int	free_lock(lock *l);
+
+/* LOCK() and UNLOCK() take the struct lock itself (not a pointer to it).
+ * LOCK() atomically decrements the counter, and acquires the semaphore
+ * if the previous value was <= 0; UNLOCK() atomically increments it, and
+ * releases the semaphore if the previous value was < 0.
+ * The argument is parenthesized so that expressions like *ptr can be
+ * passed safely.
+ * NOTE: both macros expand to a complete "if" statement including the
+ * trailing semicolon - do not use them as the unbraced body of an
+ * if/else.
+ */
+#define	LOCK(l)		if (atomic_add(&(l).c, -1) <= 0) acquire_sem((l).s);
+#define	UNLOCK(l)	if (atomic_add(&(l).c, 1) < 0) release_sem((l).s);
+
+extern _IMPEXP_KERNEL int	new_mlock(mlock *l, long c, const char *name);
+extern _IMPEXP_KERNEL int	free_mlock(mlock *l);
+
+/* LOCKM() and UNLOCKM() take the struct mlock itself (not a pointer to
+ * it), and acquire or release its semaphore "cnt" times in one call.
+ * Both arguments are parenthesized so that arbitrary expressions can be
+ * passed safely.
+ */
+#define		LOCKM(l,cnt)	acquire_sem_etc((l).s, (cnt), 0, 0)
+#define		UNLOCKM(l,cnt)	release_sem_etc((l).s, (cnt), 0)
+
+
+#ifdef __cplusplus
+  } // extern "C"
+#endif
+
+#endif